diff --git a/common/src/lib.rs b/common/src/lib.rs index 5692e7636..e9cf27b36 100644 --- a/common/src/lib.rs +++ b/common/src/lib.rs @@ -109,6 +109,22 @@ pub fn u64_to_f64(val: u64) -> f64 { }) } +/// Replaces a given byte in the `bytes` slice of bytes. +/// +/// This function assumes that the needle is rarely contained in the bytes string +/// and offers a fast path if the needle is not present. +#[inline(always)] +pub fn replace_in_place(needle: u8, replacement: u8, bytes: &mut [u8]) { + if !bytes.contains(&needle) { + return; + } + for b in bytes { + if *b == needle { + *b = replacement; + } + } +} + #[cfg(test)] pub mod test { @@ -173,4 +189,20 @@ pub mod test { assert!(f64_to_u64(-2.0) < f64_to_u64(1.0)); assert!(f64_to_u64(-2.0) < f64_to_u64(-1.5)); } + + #[test] + fn test_replace_in_place() { + let test_aux = |before_replacement: &[u8], expected: &[u8]| { + let mut bytes: Vec = before_replacement.to_vec(); + super::replace_in_place(b'b', b'c', &mut bytes); + assert_eq!(&bytes[..], expected); + }; + test_aux(b"", b""); + test_aux(b"b", b"c"); + test_aux(b"baaa", b"caaa"); + test_aux(b"aaab", b"aaac"); + test_aux(b"aaabaa", b"aaacaa"); + test_aux(b"aaaaaa", b"aaaaaa"); + test_aux(b"bbbb", b"cccc"); + } } diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index f35498caf..78746ef67 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -108,7 +108,7 @@ mod tests { use std::ops::{Range, RangeInclusive}; use std::path::Path; - use columnar::{Column, MonotonicallyMappableToU64}; + use columnar::{Column, MonotonicallyMappableToU64, StrColumn}; use common::{HasLen, TerminatingWrite}; use once_cell::sync::Lazy; use rand::prelude::SliceRandom; @@ -119,7 +119,8 @@ mod tests { use crate::directory::{Directory, RamDirectory, WritePtr}; use crate::merge_policy::NoMergePolicy; use crate::schema::{ - Document, Facet, FacetOptions, Field, Schema, SchemaBuilder, FAST, INDEXED, STRING, TEXT, + Document, Facet, FacetOptions, Field, Schema, SchemaBuilder, FAST, INDEXED, STORED, STRING, + TEXT, }; use crate::time::OffsetDateTime; use crate::{DateOptions, DatePrecision, Index, SegmentId, SegmentReader}; @@ -1071,4 +1072,38 @@ mod tests { test_range_variant(1000, 1000); test_range_variant(1000, 1002); } + + #[test] + fn test_json_object_fast_field() { + let mut schema_builder = Schema::builder(); + let without_fast_field = schema_builder.add_json_field("without", STORED); + let with_fast_field = schema_builder.add_json_field("with", STORED | FAST); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut writer = index.writer_for_tests().unwrap(); + writer + .add_document(doc!(without_fast_field=>json!({"hello": "without"}))) + .unwrap(); + writer + .add_document(doc!(with_fast_field=>json!({"hello": "with"}))) + .unwrap(); + writer + .add_document(doc!(with_fast_field=>json!({"hello": "with2"}))) + .unwrap(); + writer + .add_document(doc!(with_fast_field=>json!({"hello": "with1"}))) + .unwrap(); + writer.commit().unwrap(); + let searcher = index.reader().unwrap().searcher(); + let segment_reader = searcher.segment_reader(0u32); + let fast_fields = segment_reader.fast_fields(); + let column_without_opt: Option = fast_fields.str("without\u{1}hello").unwrap(); + assert!(column_without_opt.is_none()); + let column_with_opt: Option = fast_fields.str("with\u{1}hello").unwrap(); + let column_with: StrColumn = column_with_opt.unwrap(); + assert!(column_with.term_ords(0).next().is_none()); + assert!(column_with.term_ords(1).eq([0])); + assert!(column_with.term_ords(2).eq([2])); + assert!(column_with.term_ords(3).eq([1])); + } } diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs index f6c450986..617f1c76d 100644 --- a/src/fastfield/readers.rs +++ b/src/fastfield/readers.rs @@ -92,7 +92,7 @@ impl FastFieldReaders { /// Returns a typed column associated to a given field name. /// /// Returns an error if no column associated with that field_name exists. - pub fn column(&self, field: &str) -> crate::Result> + fn column(&self, field: &str) -> crate::Result> where T: PartialOrd + Copy + HasAssociatedColumnType + Send + Sync + 'static, DynamicColumn: Into>>, diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index b773b90ea..ab00ebeca 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -1,34 +1,44 @@ use std::io; use columnar::{ColumnType, ColumnarWriter, NumericalValue}; +use common::replace_in_place; use crate::indexer::doc_id_mapping::DocIdMapping; +use crate::schema::term::{JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR}; use crate::schema::{Document, FieldType, Schema, Type, Value}; use crate::{DatePrecision, DocId}; +/// Only index JSON down to a depth of 20. +/// This is mostly to guard us from a stack overflow triggered by malicious input. +const JSON_DEPTH_LIMIT: usize = 20; + /// The `FastFieldsWriter` groups all of the fast field writers. pub struct FastFieldsWriter { columnar_writer: ColumnarWriter, fast_field_names: Vec>, //< TODO see if we can cash the field name hash too. date_precisions: Vec, + expand_dots: Vec, num_docs: DocId, + // Buffer that we recycle to avoid allocation. + json_path_buffer: String, } impl FastFieldsWriter { /// Create all `FastFieldWriter` required by the schema. pub fn from_schema(schema: &Schema) -> FastFieldsWriter { let mut columnar_writer = ColumnarWriter::default(); - let mut fast_fields: Vec> = vec![None; schema.num_fields()]; + let mut fast_field_names: Vec> = vec![None; schema.num_fields()]; let mut date_precisions: Vec = std::iter::repeat_with(DatePrecision::default) .take(schema.num_fields()) .collect(); + let mut expand_dots = vec![false; schema.num_fields()]; // TODO see other types for (field_id, field_entry) in schema.fields() { if !field_entry.field_type().is_fast() { continue; } - fast_fields[field_id.field_id() as usize] = Some(field_entry.name().to_string()); + fast_field_names[field_id.field_id() as usize] = Some(field_entry.name().to_string()); let value_type = field_entry.field_type().value_type(); let column_type = match value_type { Type::Str => ColumnType::Str, @@ -47,6 +57,10 @@ impl FastFieldsWriter { if let FieldType::Date(date_options) = field_entry.field_type() { date_precisions[field_id.field_id() as usize] = date_options.get_precision(); } + if let FieldType::JsonObject(json_object_options) = field_entry.field_type() { + expand_dots[field_id.field_id() as usize] = + json_object_options.is_expand_dots_enabled(); + } let sort_values_within_row = value_type == Type::Facet; columnar_writer.record_column_type( field_entry.name(), @@ -56,9 +70,11 @@ impl FastFieldsWriter { } FastFieldsWriter { columnar_writer, - fast_field_names: fast_fields, + fast_field_names, num_docs: 0u32, date_precisions, + expand_dots, + json_path_buffer: String::new(), } } @@ -82,7 +98,7 @@ impl FastFieldsWriter { let doc_id = self.num_docs; for field_value in doc.field_values() { if let Some(field_name) = - self.fast_field_names[field_value.field().field_id() as usize].as_ref() + &self.fast_field_names[field_value.field().field_id() as usize] { match &field_value.value { Value::U64(u64_val) => { @@ -136,7 +152,19 @@ impl FastFieldsWriter { facet.encoded_str(), ); } - Value::JsonObject(_) => todo!(), + Value::JsonObject(json_obj) => { + let expand_dots = self.expand_dots[field_value.field().field_id() as usize]; + self.json_path_buffer.clear(); + self.json_path_buffer.push_str(field_name); + record_json_obj_to_columnar_writer( + doc_id, + json_obj, + expand_dots, + JSON_DEPTH_LIMIT, + &mut self.json_path_buffer, + &mut self.columnar_writer, + ); + } Value::IpAddr(ip_addr) => { self.columnar_writer .record_ip_addr(doc_id, field_name.as_str(), *ip_addr); @@ -163,3 +191,239 @@ impl FastFieldsWriter { Ok(()) } } + +#[inline] +fn columnar_numerical_value(json_number: &serde_json::Number) -> Option { + if let Some(num_i64) = json_number.as_i64() { + return Some(num_i64.into()); + } + if let Some(num_u64) = json_number.as_u64() { + return Some(num_u64.into()); + } + if let Some(num_f64) = json_number.as_f64() { + return Some(num_f64.into()); + } + // This can happen with arbitrary precision.... but we do not handle it. + None +} + +fn record_json_obj_to_columnar_writer( + doc: DocId, + json_obj: &serde_json::Map, + expand_dots: bool, + remaining_depth_limit: usize, + json_path_buffer: &mut String, + columnar_writer: &mut columnar::ColumnarWriter, +) { + for (key, child) in json_obj { + let len_path = json_path_buffer.len(); + if !json_path_buffer.is_empty() { + json_path_buffer.push_str(JSON_PATH_SEGMENT_SEP_STR); + } + json_path_buffer.push_str(key); + if expand_dots { + // This might include the separation byte, which is ok because it is not a dot. + let appended_segment = &mut json_path_buffer[len_path..]; + // The unsafe below is safe as long as b'.' and JSON_PATH_SEGMENT_SEP are + // valid single byte ut8 strings. + // By utf-8 design, they cannot be part of another codepoint. + replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, unsafe { + appended_segment.as_bytes_mut() + }); + } + record_json_value_to_columnar_writer( + doc, + child, + expand_dots, + remaining_depth_limit, + json_path_buffer, + columnar_writer, + ); + // popping our sub path. + json_path_buffer.truncate(len_path); + } +} + +fn record_json_value_to_columnar_writer( + doc: DocId, + json_val: &serde_json::Value, + expand_dots: bool, + mut remaining_depth_limit: usize, + json_path_writer: &mut String, + columnar_writer: &mut columnar::ColumnarWriter, +) { + if remaining_depth_limit == 0 { + return; + } + remaining_depth_limit -= 1; + match json_val { + serde_json::Value::Null => { + // TODO handle null + } + serde_json::Value::Bool(bool_val) => { + columnar_writer.record_bool(doc, &json_path_writer, *bool_val); + } + serde_json::Value::Number(json_number) => { + if let Some(numerical_value) = columnar_numerical_value(&json_number) { + columnar_writer.record_numerical(doc, json_path_writer.as_str(), numerical_value); + } + } + serde_json::Value::String(text) => { + columnar_writer.record_str(doc, json_path_writer.as_str(), text); + } + serde_json::Value::Array(arr) => { + for el in arr { + record_json_value_to_columnar_writer( + doc, + el, + expand_dots, + remaining_depth_limit, + json_path_writer, + columnar_writer, + ); + } + } + serde_json::Value::Object(json_obj) => { + record_json_obj_to_columnar_writer( + doc, + json_obj, + expand_dots, + remaining_depth_limit, + json_path_writer, + columnar_writer, + ); + } + } +} + +#[cfg(test)] +mod tests { + use columnar::{Column, ColumnarReader, ColumnarWriter, StrColumn}; + + use super::record_json_value_to_columnar_writer; + use crate::fastfield::writer::JSON_DEPTH_LIMIT; + use crate::DocId; + + fn test_columnar_from_jsons_aux( + json_docs: &[serde_json::Value], + expand_dots: bool, + ) -> ColumnarReader { + let mut columnar_writer = ColumnarWriter::default(); + let mut json_path = String::new(); + for (doc, json_doc) in json_docs.iter().enumerate() { + record_json_value_to_columnar_writer( + doc as u32, + json_doc, + expand_dots, + JSON_DEPTH_LIMIT, + &mut json_path, + &mut columnar_writer, + ); + } + let mut buffer = Vec::new(); + columnar_writer + .serialize(json_docs.len() as DocId, None, &mut buffer) + .unwrap(); + ColumnarReader::open(buffer).unwrap() + } + + #[test] + fn test_json_fastfield_record_simple() { + let json_doc = serde_json::json!({ + "float": 1.02, + "text": "hello happy tax payer", + "nested": {"child": 3, "child2": 5}, + "arr": ["hello", "happy", "tax", "payer"] + }); + let columnar_reader = test_columnar_from_jsons_aux(&[json_doc], false); + let columns = columnar_reader.list_columns().unwrap(); + { + assert_eq!(columns[0].0, "arr"); + let column_arr_opt: Option = columns[0].1.open().unwrap().into(); + assert!(column_arr_opt + .unwrap() + .term_ords(0) + .eq([1, 0, 3, 2].into_iter())); + } + { + assert_eq!(columns[1].0, "float"); + let column_float_opt: Option> = columns[1].1.open().unwrap().into(); + assert!(column_float_opt + .unwrap() + .values(0) + .eq([1.02f64].into_iter())); + } + { + assert_eq!(columns[2].0, "nested\u{1}child"); + let column_nest_child_opt: Option> = columns[2].1.open().unwrap().into(); + assert!(column_nest_child_opt.unwrap().values(0).eq([3].into_iter())); + } + { + assert_eq!(columns[3].0, "nested\u{1}child2"); + let column_nest_child2_opt: Option> = columns[3].1.open().unwrap().into(); + assert!(column_nest_child2_opt + .unwrap() + .values(0) + .eq([5].into_iter())); + } + { + assert_eq!(columns[4].0, "text"); + let column_text_opt: Option = columns[4].1.open().unwrap().into(); + assert!(column_text_opt.unwrap().term_ords(0).eq([0].into_iter())); + } + } + + #[test] + fn test_json_fastfield_deep_obj() { + let json_doc = serde_json::json!( + {"a": {"a": {"a": {"a": {"a": + {"a": {"a": {"a": {"a": {"a": + {"a": {"a": {"a": {"a": {"a": + {"a": {"a": {"a": {"depth_accepted": 19, "a": { "depth_truncated": 20} + }}}}}}}}}}}}}}}}}}}); + let columnar_reader = test_columnar_from_jsons_aux(&[json_doc], false); + let columns = columnar_reader.list_columns().unwrap(); + assert_eq!(columns.len(), 1); + assert!(columns[0].0.ends_with("a\u{1}a\u{1}a\u{1}depth_accepted")); + } + + #[test] + fn test_json_fastfield_deep_arr() { + let json_doc = json!( + {"obj": + [[[[[, + [[[[[, + [[[[[, + [[18, [19, //< within limits + [20]]]]]]]]]]]]]]]]]]]}); + let columnar_reader = test_columnar_from_jsons_aux(&[json_doc], false); + let columns = columnar_reader.list_columns().unwrap(); + assert_eq!(columns.len(), 1); + assert_eq!(columns[0].0, "obj"); + let dynamic_column = columns[0].1.open().unwrap(); + let col: Option> = dynamic_column.into(); + let vals: Vec = col.unwrap().values(0).collect(); + assert_eq!(&vals, &[18, 19]) + } + + #[test] + fn test_json_fast_field_do_not_expand_dots() { + let json_doc = json!({"field.with.dots": {"child.with.dot": "hello"}}); + let columnar_reader = test_columnar_from_jsons_aux(&[json_doc], false); + let columns = columnar_reader.list_columns().unwrap(); + assert_eq!(columns.len(), 1); + assert_eq!(columns[0].0, "field.with.dots\u{1}child.with.dot"); + } + + #[test] + fn test_json_fast_field_expand_dots() { + let json_doc = json!({"field.with.dots": {"child.with.dot": "hello"}}); + let columnar_reader = test_columnar_from_jsons_aux(&[json_doc], true); + let columns = columnar_reader.list_columns().unwrap(); + assert_eq!(columns.len(), 1); + assert_eq!( + columns[0].0, + "field\u{1}with\u{1}dots\u{1}child\u{1}with\u{1}dot" + ); + } +} diff --git a/src/indexer/json_term_writer.rs b/src/indexer/json_term_writer.rs index 1dbb29f25..5be02d3a2 100644 --- a/src/indexer/json_term_writer.rs +++ b/src/indexer/json_term_writer.rs @@ -1,4 +1,5 @@ use columnar::MonotonicallyMappableToU64; +use common::replace_in_place; use murmurhash32::murmurhash2; use rustc_hash::FxHashMap; @@ -343,18 +344,10 @@ impl<'a> JsonTermWriter<'a> { if self.path_stack.len() > 1 { buffer[buffer_len - 1] = JSON_PATH_SEGMENT_SEP; } - if self.expand_dots_enabled && segment.as_bytes().contains(&b'.') { + let appended_segment = self.term_buffer.append_bytes(segment.as_bytes()); + if self.expand_dots_enabled { // We need to replace `.` by JSON_PATH_SEGMENT_SEP. - self.term_buffer - .append_bytes(segment.as_bytes()) - .iter_mut() - .for_each(|byte| { - if *byte == b'.' { - *byte = JSON_PATH_SEGMENT_SEP; - } - }); - } else { - self.term_buffer.append_bytes(segment.as_bytes()); + replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, appended_segment); } self.term_buffer.push_byte(JSON_PATH_SEGMENT_SEP); self.path_stack.push(self.term_buffer.len_bytes()); diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index c08d32a3e..7675096e9 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -237,7 +237,7 @@ impl FieldType { FieldType::Date(ref date_options) => date_options.is_fast(), FieldType::IpAddr(ref ip_addr_options) => ip_addr_options.is_fast(), FieldType::Facet(_) => true, - FieldType::JsonObject(_) => false, + FieldType::JsonObject(ref json_object_options) => json_object_options.is_fast(), } } diff --git a/src/schema/json_object_options.rs b/src/schema/json_object_options.rs index d6ea40b14..ea4f47c19 100644 --- a/src/schema/json_object_options.rs +++ b/src/schema/json_object_options.rs @@ -2,7 +2,7 @@ use std::ops::BitOr; use serde::{Deserialize, Serialize}; -use crate::schema::flags::{SchemaFlagList, StoredFlag}; +use crate::schema::flags::{FastFlag, SchemaFlagList, StoredFlag}; use crate::schema::{TextFieldIndexing, TextOptions}; /// The `JsonObjectOptions` make it possible to @@ -13,7 +13,32 @@ pub struct JsonObjectOptions { // If set to some, int, date, f64 and text will be indexed. // Text will use the TextFieldIndexing setting for indexing. indexing: Option, - + // Store all field as fast fields. + fast: bool, + /// tantivy will generate pathes to the different nodes of the json object + /// both in: + /// - the inverted index (for the terms) + /// - fast fields (for the column names). + /// + /// These json path are encoded by concatenating the list of object keys that + /// are visited from the root to the leaf. + /// + /// By default, if an object key contains a `.`, we keep it as a `.` it as is. + /// On the search side, users will then have to escape this `.` in the query parser + /// or when refering to a column name. + /// + /// For instance: + /// `{"root": {"child.with.dot": "hello"}}` + /// + /// Can be searched using the following query + /// `root.child\.with\.dot:hello` + /// + /// If `expand_dots_enabled` is set to true, we will treat this `.` in object keys + /// as json seperators. In other words, if set to true, our object will be + /// processed as if it was + /// `{"root": {"child": {"with": {"dot": "hello"}}}}` + /// and it can be search using the following query: + /// `root.child.with.dot:hello` expand_dots_enabled: bool, } @@ -28,6 +53,12 @@ impl JsonObjectOptions { self.indexing.is_some() } + /// Returns true if and only if the json object fields are + /// to be treated as fast fields. + pub fn is_fast(&self) -> bool { + self.fast + } + /// Returns `true` iff dots in json keys should be expanded. /// /// When expand_dots is enabled, json object like @@ -67,6 +98,13 @@ impl JsonObjectOptions { self } + /// Sets the field as a fast field + #[must_use] + pub fn set_fast(mut self) -> Self { + self.fast = true; + self + } + /// Sets the field as indexed, with the specific indexing options. #[must_use] pub fn set_indexing_options(mut self, indexing: TextFieldIndexing) -> Self { @@ -80,6 +118,18 @@ impl From for JsonObjectOptions { JsonObjectOptions { stored: true, indexing: None, + fast: false, + expand_dots_enabled: false, + } + } +} + +impl From for JsonObjectOptions { + fn from(_fast_flag: FastFlag) -> Self { + JsonObjectOptions { + stored: false, + indexing: None, + fast: true, expand_dots_enabled: false, } } @@ -99,6 +149,7 @@ impl> BitOr for JsonObjectOptions { JsonObjectOptions { indexing: self.indexing.or(other.indexing), stored: self.stored | other.stored, + fast: self.fast | other.fast, expand_dots_enabled: self.expand_dots_enabled | other.expand_dots_enabled, } } @@ -120,6 +171,7 @@ impl From for JsonObjectOptions { JsonObjectOptions { stored: text_options.is_stored(), indexing: text_options.get_indexing_options().cloned(), + fast: text_options.is_fast(), expand_dots_enabled: false, } } @@ -128,7 +180,7 @@ impl From for JsonObjectOptions { #[cfg(test)] mod tests { use super::*; - use crate::schema::{STORED, TEXT}; + use crate::schema::{FAST, STORED, TEXT}; #[test] fn test_json_options() { @@ -136,16 +188,31 @@ mod tests { let json_options: JsonObjectOptions = (STORED | TEXT).into(); assert!(json_options.is_stored()); assert!(json_options.is_indexed()); + assert!(!json_options.is_fast()); } { let json_options: JsonObjectOptions = TEXT.into(); assert!(!json_options.is_stored()); assert!(json_options.is_indexed()); + assert!(!json_options.is_fast()); } { let json_options: JsonObjectOptions = STORED.into(); assert!(json_options.is_stored()); assert!(!json_options.is_indexed()); + assert!(!json_options.is_fast()); + } + { + let json_options: JsonObjectOptions = FAST.into(); + assert!(!json_options.is_stored()); + assert!(!json_options.is_indexed()); + assert!(json_options.is_fast()); + } + { + let json_options: JsonObjectOptions = (FAST | STORED).into(); + assert!(json_options.is_stored()); + assert!(!json_options.is_indexed()); + assert!(json_options.is_fast()); } } } diff --git a/src/schema/term.rs b/src/schema/term.rs index 689000d01..747cb434e 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -10,8 +10,7 @@ use crate::fastfield::FastValue; use crate::schema::{Facet, Type}; use crate::{DatePrecision, DateTime}; -/// Separates the different segments of -/// the json path. +/// Separates the different segments of a json path. pub const JSON_PATH_SEGMENT_SEP: u8 = 1u8; pub const JSON_PATH_SEGMENT_SEP_STR: &str = unsafe { std::str::from_utf8_unchecked(&[JSON_PATH_SEGMENT_SEP]) };