Issue/columnar for json (#1876)

Adding support for JSON fast field.
This commit is contained in:
Paul Masurel
2023-02-16 20:38:32 +09:00
committed by GitHub
parent f2f38c43ce
commit 7423f99719
8 changed files with 415 additions and 25 deletions

View File

@@ -108,7 +108,7 @@ mod tests {
use std::ops::{Range, RangeInclusive};
use std::path::Path;
use columnar::{Column, MonotonicallyMappableToU64};
use columnar::{Column, MonotonicallyMappableToU64, StrColumn};
use common::{HasLen, TerminatingWrite};
use once_cell::sync::Lazy;
use rand::prelude::SliceRandom;
@@ -119,7 +119,8 @@ mod tests {
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::merge_policy::NoMergePolicy;
use crate::schema::{
Document, Facet, FacetOptions, Field, Schema, SchemaBuilder, FAST, INDEXED, STRING, TEXT,
Document, Facet, FacetOptions, Field, Schema, SchemaBuilder, FAST, INDEXED, STORED, STRING,
TEXT,
};
use crate::time::OffsetDateTime;
use crate::{DateOptions, DatePrecision, Index, SegmentId, SegmentReader};
@@ -1071,4 +1072,38 @@ mod tests {
test_range_variant(1000, 1000);
test_range_variant(1000, 1002);
}
#[test]
fn test_json_object_fast_field() {
    // Schema with two JSON fields: only `with` carries the FAST flag, so only
    // its subpaths should surface as fast-field columns.
    let mut schema_builder = Schema::builder();
    let without_fast_field = schema_builder.add_json_field("without", STORED);
    let with_fast_field = schema_builder.add_json_field("with", STORED | FAST);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer_for_tests().unwrap();
    writer
        .add_document(doc!(without_fast_field=>json!({"hello": "without"})))
        .unwrap();
    // Values inserted deliberately out of sorted order: term ordinals are
    // assigned from the sorted term dictionary ("with"=0, "with1"=1, "with2"=2).
    for hello_val in ["with", "with2", "with1"] {
        writer
            .add_document(doc!(with_fast_field=>json!({"hello": hello_val})))
            .unwrap();
    }
    writer.commit().unwrap();
    let searcher = index.reader().unwrap().searcher();
    let fast_fields = searcher.segment_reader(0u32).fast_fields();
    // The JSON field without FAST must not expose any column.
    let column_without_opt: Option<StrColumn> = fast_fields.str("without\u{1}hello").unwrap();
    assert!(column_without_opt.is_none());
    let column_with_opt: Option<StrColumn> = fast_fields.str("with\u{1}hello").unwrap();
    let column_with: StrColumn = column_with_opt.unwrap();
    // Doc 0 holds no value for this column; docs 1..=3 map to term ords 0, 2, 1.
    assert!(column_with.term_ords(0).next().is_none());
    assert!(column_with.term_ords(1).eq([0]));
    assert!(column_with.term_ords(2).eq([2]));
    assert!(column_with.term_ords(3).eq([1]));
}
}

View File

@@ -92,7 +92,7 @@ impl FastFieldReaders {
/// Returns a typed column associated to a given field name.
///
/// Returns an error if no column associated with that field_name exists.
pub fn column<T>(&self, field: &str) -> crate::Result<Column<T>>
fn column<T>(&self, field: &str) -> crate::Result<Column<T>>
where
T: PartialOrd + Copy + HasAssociatedColumnType + Send + Sync + 'static,
DynamicColumn: Into<Option<Column<T>>>,

View File

@@ -1,34 +1,44 @@
use std::io;
use columnar::{ColumnType, ColumnarWriter, NumericalValue};
use common::replace_in_place;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::schema::term::{JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR};
use crate::schema::{Document, FieldType, Schema, Type, Value};
use crate::{DatePrecision, DocId};
/// Only index JSON down to a depth of 20.
/// This is mostly to guard us from a stack overflow triggered by malicious input.
const JSON_DEPTH_LIMIT: usize = 20;
/// The `FastFieldsWriter` groups all of the fast field writers.
pub struct FastFieldsWriter {
    // Columnar writer all fast-field values are recorded into.
    columnar_writer: ColumnarWriter,
    // Indexed by `Field::field_id()`; `None` when the field is not fast.
    fast_field_names: Vec<Option<String>>, //< TODO see if we can cache the field name hash too.
    // Indexed by field id; only meaningful for date fields.
    date_precisions: Vec<DatePrecision>,
    // Indexed by field id; only meaningful for JSON fields.
    expand_dots: Vec<bool>,
    num_docs: DocId,
    // Buffer that we recycle to avoid allocation.
    json_path_buffer: String,
}
impl FastFieldsWriter {
/// Create all `FastFieldWriter` required by the schema.
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
let mut columnar_writer = ColumnarWriter::default();
let mut fast_fields: Vec<Option<String>> = vec![None; schema.num_fields()];
let mut fast_field_names: Vec<Option<String>> = vec![None; schema.num_fields()];
let mut date_precisions: Vec<DatePrecision> =
std::iter::repeat_with(DatePrecision::default)
.take(schema.num_fields())
.collect();
let mut expand_dots = vec![false; schema.num_fields()];
// TODO see other types
for (field_id, field_entry) in schema.fields() {
if !field_entry.field_type().is_fast() {
continue;
}
fast_fields[field_id.field_id() as usize] = Some(field_entry.name().to_string());
fast_field_names[field_id.field_id() as usize] = Some(field_entry.name().to_string());
let value_type = field_entry.field_type().value_type();
let column_type = match value_type {
Type::Str => ColumnType::Str,
@@ -47,6 +57,10 @@ impl FastFieldsWriter {
if let FieldType::Date(date_options) = field_entry.field_type() {
date_precisions[field_id.field_id() as usize] = date_options.get_precision();
}
if let FieldType::JsonObject(json_object_options) = field_entry.field_type() {
expand_dots[field_id.field_id() as usize] =
json_object_options.is_expand_dots_enabled();
}
let sort_values_within_row = value_type == Type::Facet;
columnar_writer.record_column_type(
field_entry.name(),
@@ -56,9 +70,11 @@ impl FastFieldsWriter {
}
FastFieldsWriter {
columnar_writer,
fast_field_names: fast_fields,
fast_field_names,
num_docs: 0u32,
date_precisions,
expand_dots,
json_path_buffer: String::new(),
}
}
@@ -82,7 +98,7 @@ impl FastFieldsWriter {
let doc_id = self.num_docs;
for field_value in doc.field_values() {
if let Some(field_name) =
self.fast_field_names[field_value.field().field_id() as usize].as_ref()
&self.fast_field_names[field_value.field().field_id() as usize]
{
match &field_value.value {
Value::U64(u64_val) => {
@@ -136,7 +152,19 @@ impl FastFieldsWriter {
facet.encoded_str(),
);
}
Value::JsonObject(_) => todo!(),
Value::JsonObject(json_obj) => {
let expand_dots = self.expand_dots[field_value.field().field_id() as usize];
self.json_path_buffer.clear();
self.json_path_buffer.push_str(field_name);
record_json_obj_to_columnar_writer(
doc_id,
json_obj,
expand_dots,
JSON_DEPTH_LIMIT,
&mut self.json_path_buffer,
&mut self.columnar_writer,
);
}
Value::IpAddr(ip_addr) => {
self.columnar_writer
.record_ip_addr(doc_id, field_name.as_str(), *ip_addr);
@@ -163,3 +191,239 @@ impl FastFieldsWriter {
Ok(())
}
}
#[inline]
fn columnar_numerical_value(json_number: &serde_json::Number) -> Option<NumericalValue> {
if let Some(num_i64) = json_number.as_i64() {
return Some(num_i64.into());
}
if let Some(num_u64) = json_number.as_u64() {
return Some(num_u64.into());
}
if let Some(num_f64) = json_number.as_f64() {
return Some(num_f64.into());
}
// This can happen with arbitrary precision.... but we do not handle it.
None
}
/// Records every leaf value of a JSON object into the columnar writer,
/// recursing through nested objects and arrays.
///
/// `json_path_buffer` holds the column-name prefix accumulated so far (field
/// name, then each visited object key, joined by `JSON_PATH_SEGMENT_SEP`).
/// Each iteration appends one key, recurses, then truncates the buffer back,
/// so the buffer is restored to its original content before returning.
fn record_json_obj_to_columnar_writer(
    doc: DocId,
    json_obj: &serde_json::Map<String, serde_json::Value>,
    expand_dots: bool,
    remaining_depth_limit: usize,
    json_path_buffer: &mut String,
    columnar_writer: &mut columnar::ColumnarWriter,
) {
    for (key, child) in json_obj {
        let len_path = json_path_buffer.len();
        if !json_path_buffer.is_empty() {
            json_path_buffer.push_str(JSON_PATH_SEGMENT_SEP_STR);
        }
        json_path_buffer.push_str(key);
        if expand_dots {
            // This might include the separation byte, which is ok because it is not a dot.
            let appended_segment = &mut json_path_buffer[len_path..];
            // SAFETY: the unsafe below is safe as long as b'.' and JSON_PATH_SEGMENT_SEP are
            // valid single-byte UTF-8 strings.
            // By UTF-8 design, they cannot be part of another codepoint.
            replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, unsafe {
                appended_segment.as_bytes_mut()
            });
        }
        record_json_value_to_columnar_writer(
            doc,
            child,
            expand_dots,
            remaining_depth_limit,
            json_path_buffer,
            columnar_writer,
        );
        // popping our sub path.
        json_path_buffer.truncate(len_path);
    }
}
/// Records a single JSON value (leaf or container) into the columnar writer,
/// under the column name currently held in `json_path_writer`.
///
/// Once `remaining_depth_limit` reaches zero the value is silently dropped,
/// guarding against stack overflow on deeply nested (malicious) input.
fn record_json_value_to_columnar_writer(
    doc: DocId,
    json_val: &serde_json::Value,
    expand_dots: bool,
    mut remaining_depth_limit: usize,
    json_path_writer: &mut String,
    columnar_writer: &mut columnar::ColumnarWriter,
) {
    if remaining_depth_limit == 0 {
        return;
    }
    remaining_depth_limit -= 1;
    match json_val {
        serde_json::Value::Null => {
            // TODO handle null
        }
        serde_json::Value::Bool(bool_val) => {
            // Pass `.as_str()` consistently in all arms (avoids the
            // `&&mut String` needless-borrow the original had here).
            columnar_writer.record_bool(doc, json_path_writer.as_str(), *bool_val);
        }
        serde_json::Value::Number(json_number) => {
            // Numbers outside of i64/u64/f64 (arbitrary precision) yield
            // `None` and are skipped.
            if let Some(numerical_value) = columnar_numerical_value(json_number) {
                columnar_writer.record_numerical(doc, json_path_writer.as_str(), numerical_value);
            }
        }
        serde_json::Value::String(text) => {
            columnar_writer.record_str(doc, json_path_writer.as_str(), text);
        }
        serde_json::Value::Array(arr) => {
            // Arrays do not add a path segment: every element is recorded as
            // an extra value of the same column.
            for el in arr {
                record_json_value_to_columnar_writer(
                    doc,
                    el,
                    expand_dots,
                    remaining_depth_limit,
                    json_path_writer,
                    columnar_writer,
                );
            }
        }
        serde_json::Value::Object(json_obj) => {
            record_json_obj_to_columnar_writer(
                doc,
                json_obj,
                expand_dots,
                remaining_depth_limit,
                json_path_writer,
                columnar_writer,
            );
        }
    }
}
#[cfg(test)]
mod tests {
use columnar::{Column, ColumnarReader, ColumnarWriter, StrColumn};
use super::record_json_value_to_columnar_writer;
use crate::fastfield::writer::JSON_DEPTH_LIMIT;
use crate::DocId;
/// Records each json document (doc ids assigned by position), then serializes
/// the columnar data and re-opens it as a `ColumnarReader` for inspection.
fn test_columnar_from_jsons_aux(
    json_docs: &[serde_json::Value],
    expand_dots: bool,
) -> ColumnarReader {
    let mut columnar_writer = ColumnarWriter::default();
    // Empty prefix: paths start directly at the json keys.
    let mut json_path = String::new();
    for (doc_id, json_doc) in json_docs.iter().enumerate() {
        record_json_value_to_columnar_writer(
            doc_id as u32,
            json_doc,
            expand_dots,
            JSON_DEPTH_LIMIT,
            &mut json_path,
            &mut columnar_writer,
        );
    }
    let mut buffer = Vec::new();
    columnar_writer
        .serialize(json_docs.len() as DocId, None, &mut buffer)
        .unwrap();
    ColumnarReader::open(buffer).unwrap()
}
#[test]
fn test_json_fastfield_record_simple() {
    let json_doc = serde_json::json!({
        "float": 1.02,
        "text": "hello happy tax payer",
        "nested": {"child": 3, "child2": 5},
        "arr": ["hello", "happy", "tax", "payer"]
    });
    let columnar_reader = test_columnar_from_jsons_aux(&[json_doc], false);
    // Columns come back in sorted order of their path:
    // arr, float, nested\u{1}child, nested\u{1}child2, text.
    let columns = columnar_reader.list_columns().unwrap();
    {
        assert_eq!(columns[0].0, "arr");
        let column_arr_opt: Option<StrColumn> = columns[0].1.open().unwrap().into();
        // Term ords follow the sorted term dictionary
        // ("happy"=0, "hello"=1, "payer"=2, "tax"=3), while values keep
        // insertion order — hence [1, 0, 3, 2].
        assert!(column_arr_opt
            .unwrap()
            .term_ords(0)
            .eq([1, 0, 3, 2].into_iter()));
    }
    {
        assert_eq!(columns[1].0, "float");
        let column_float_opt: Option<Column<f64>> = columns[1].1.open().unwrap().into();
        assert!(column_float_opt
            .unwrap()
            .values(0)
            .eq([1.02f64].into_iter()));
    }
    {
        assert_eq!(columns[2].0, "nested\u{1}child");
        let column_nest_child_opt: Option<Column<i64>> = columns[2].1.open().unwrap().into();
        assert!(column_nest_child_opt.unwrap().values(0).eq([3].into_iter()));
    }
    {
        assert_eq!(columns[3].0, "nested\u{1}child2");
        let column_nest_child2_opt: Option<Column<i64>> = columns[3].1.open().unwrap().into();
        assert!(column_nest_child2_opt
            .unwrap()
            .values(0)
            .eq([5].into_iter()));
    }
    {
        assert_eq!(columns[4].0, "text");
        let column_text_opt: Option<StrColumn> = columns[4].1.open().unwrap().into();
        assert!(column_text_opt.unwrap().term_ords(0).eq([0].into_iter()));
    }
}
#[test]
fn test_json_fastfield_deep_obj() {
    // The key names encode their depth: `depth_accepted` sits at depth 19
    // (within JSON_DEPTH_LIMIT = 20) and is recorded; `depth_truncated` sits
    // at depth 20 and must be silently dropped.
    let json_doc = serde_json::json!(
        {"a": {"a": {"a": {"a": {"a":
        {"a": {"a": {"a": {"a": {"a":
        {"a": {"a": {"a": {"a": {"a":
        {"a": {"a": {"a": {"depth_accepted": 19, "a": { "depth_truncated": 20}
        }}}}}}}}}}}}}}}}}}});
    let columnar_reader = test_columnar_from_jsons_aux(&[json_doc], false);
    let columns = columnar_reader.list_columns().unwrap();
    // Only the accepted leaf survives as a column.
    assert_eq!(columns.len(), 1);
    assert!(columns[0].0.ends_with("a\u{1}a\u{1}a\u{1}depth_accepted"));
}
#[test]
fn test_json_fastfield_deep_arr() {
    // Array counterpart of `test_json_fastfield_deep_obj`: the values 18 and
    // 19 are within JSON_DEPTH_LIMIT and get recorded; 20 sits one level too
    // deep and is dropped.
    // NOTE(review): the `[[[[[,` lines below are not valid `json!` syntax
    // (an array cannot start with a comma) — this looks garbled by the diff
    // rendering; confirm against the original source file.
    let json_doc = json!(
        {"obj":
        [[[[[,
        [[[[[,
        [[[[[,
        [[18, [19, //< within limits
        [20]]]]]]]]]]]]]]]]]]]});
    let columnar_reader = test_columnar_from_jsons_aux(&[json_doc], false);
    let columns = columnar_reader.list_columns().unwrap();
    // Arrays add no path segment, so everything lands in the single "obj"
    // column.
    assert_eq!(columns.len(), 1);
    assert_eq!(columns[0].0, "obj");
    let dynamic_column = columns[0].1.open().unwrap();
    let col: Option<Column<i64>> = dynamic_column.into();
    let vals: Vec<i64> = col.unwrap().values(0).collect();
    assert_eq!(&vals, &[18, 19])
}
#[test]
fn test_json_fast_field_do_not_expand_dots() {
    // With expand_dots disabled, dots inside keys are kept verbatim; only the
    // \u{1} separator delimits path segments.
    let json_doc = json!({"field.with.dots": {"child.with.dot": "hello"}});
    let reader = test_columnar_from_jsons_aux(&[json_doc], false);
    let column_list = reader.list_columns().unwrap();
    assert_eq!(column_list.len(), 1);
    assert_eq!(column_list[0].0, "field.with.dots\u{1}child.with.dot");
}
#[test]
fn test_json_fast_field_expand_dots() {
    // With expand_dots enabled, every dot inside a key is rewritten into the
    // \u{1} path separator, as if the keys had been nested objects.
    let json_doc = json!({"field.with.dots": {"child.with.dot": "hello"}});
    let reader = test_columnar_from_jsons_aux(&[json_doc], true);
    let column_list = reader.list_columns().unwrap();
    assert_eq!(column_list.len(), 1);
    assert_eq!(
        column_list[0].0,
        "field\u{1}with\u{1}dots\u{1}child\u{1}with\u{1}dot"
    );
}
}

View File

@@ -1,4 +1,5 @@
use columnar::MonotonicallyMappableToU64;
use common::replace_in_place;
use murmurhash32::murmurhash2;
use rustc_hash::FxHashMap;
@@ -343,18 +344,10 @@ impl<'a> JsonTermWriter<'a> {
if self.path_stack.len() > 1 {
buffer[buffer_len - 1] = JSON_PATH_SEGMENT_SEP;
}
if self.expand_dots_enabled && segment.as_bytes().contains(&b'.') {
let appended_segment = self.term_buffer.append_bytes(segment.as_bytes());
if self.expand_dots_enabled {
// We need to replace `.` by JSON_PATH_SEGMENT_SEP.
self.term_buffer
.append_bytes(segment.as_bytes())
.iter_mut()
.for_each(|byte| {
if *byte == b'.' {
*byte = JSON_PATH_SEGMENT_SEP;
}
});
} else {
self.term_buffer.append_bytes(segment.as_bytes());
replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, appended_segment);
}
self.term_buffer.push_byte(JSON_PATH_SEGMENT_SEP);
self.path_stack.push(self.term_buffer.len_bytes());

View File

@@ -237,7 +237,7 @@ impl FieldType {
FieldType::Date(ref date_options) => date_options.is_fast(),
FieldType::IpAddr(ref ip_addr_options) => ip_addr_options.is_fast(),
FieldType::Facet(_) => true,
FieldType::JsonObject(_) => false,
FieldType::JsonObject(ref json_object_options) => json_object_options.is_fast(),
}
}

View File

@@ -2,7 +2,7 @@ use std::ops::BitOr;
use serde::{Deserialize, Serialize};
use crate::schema::flags::{SchemaFlagList, StoredFlag};
use crate::schema::flags::{FastFlag, SchemaFlagList, StoredFlag};
use crate::schema::{TextFieldIndexing, TextOptions};
/// The `JsonObjectOptions` make it possible to
@@ -13,7 +13,32 @@ pub struct JsonObjectOptions {
// If set to some, int, date, f64 and text will be indexed.
// Text will use the TextFieldIndexing setting for indexing.
indexing: Option<TextFieldIndexing>,
// Store all field as fast fields.
fast: bool,
/// tantivy will generate paths to the different nodes of the json object
/// both in:
/// - the inverted index (for the terms)
/// - fast fields (for the column names).
///
/// These json path are encoded by concatenating the list of object keys that
/// are visited from the root to the leaf.
///
/// By default, if an object key contains a `.`, we keep it as a `.`, as is.
/// On the search side, users will then have to escape this `.` in the query parser
/// or when referring to a column name.
///
/// For instance:
/// `{"root": {"child.with.dot": "hello"}}`
///
/// Can be searched using the following query
/// `root.child\.with\.dot:hello`
///
/// If `expand_dots_enabled` is set to true, we will treat this `.` in object keys
/// as json separators. In other words, if set to true, our object will be
/// processed as if it was
/// `{"root": {"child": {"with": {"dot": "hello"}}}}`
/// and it can be searched using the following query:
/// `root.child.with.dot:hello`
expand_dots_enabled: bool,
}
@@ -28,6 +53,12 @@ impl JsonObjectOptions {
self.indexing.is_some()
}
/// Returns `true` if and only if the values of this json object should also
/// be recorded as columnar fast fields.
pub fn is_fast(&self) -> bool {
    self.fast
}
/// Returns `true` iff dots in json keys should be expanded.
///
/// When expand_dots is enabled, json object like
@@ -67,6 +98,13 @@ impl JsonObjectOptions {
self
}
/// Enables fast fields on this json field (builder-style: consumes and
/// returns `self`).
#[must_use]
pub fn set_fast(mut self) -> Self {
    self.fast = true;
    self
}
/// Sets the field as indexed, with the specific indexing options.
#[must_use]
pub fn set_indexing_options(mut self, indexing: TextFieldIndexing) -> Self {
@@ -80,6 +118,18 @@ impl From<StoredFlag> for JsonObjectOptions {
JsonObjectOptions {
stored: true,
indexing: None,
fast: false,
expand_dots_enabled: false,
}
}
}
impl From<FastFlag> for JsonObjectOptions {
fn from(_fast_flag: FastFlag) -> Self {
JsonObjectOptions {
stored: false,
indexing: None,
fast: true,
expand_dots_enabled: false,
}
}
@@ -99,6 +149,7 @@ impl<T: Into<JsonObjectOptions>> BitOr<T> for JsonObjectOptions {
JsonObjectOptions {
indexing: self.indexing.or(other.indexing),
stored: self.stored | other.stored,
fast: self.fast | other.fast,
expand_dots_enabled: self.expand_dots_enabled | other.expand_dots_enabled,
}
}
@@ -120,6 +171,7 @@ impl From<TextOptions> for JsonObjectOptions {
JsonObjectOptions {
stored: text_options.is_stored(),
indexing: text_options.get_indexing_options().cloned(),
fast: text_options.is_fast(),
expand_dots_enabled: false,
}
}
@@ -128,7 +180,7 @@ impl From<TextOptions> for JsonObjectOptions {
#[cfg(test)]
mod tests {
use super::*;
use crate::schema::{STORED, TEXT};
use crate::schema::{FAST, STORED, TEXT};
#[test]
fn test_json_options() {
@@ -136,16 +188,31 @@ mod tests {
let json_options: JsonObjectOptions = (STORED | TEXT).into();
assert!(json_options.is_stored());
assert!(json_options.is_indexed());
assert!(!json_options.is_fast());
}
{
let json_options: JsonObjectOptions = TEXT.into();
assert!(!json_options.is_stored());
assert!(json_options.is_indexed());
assert!(!json_options.is_fast());
}
{
let json_options: JsonObjectOptions = STORED.into();
assert!(json_options.is_stored());
assert!(!json_options.is_indexed());
assert!(!json_options.is_fast());
}
{
let json_options: JsonObjectOptions = FAST.into();
assert!(!json_options.is_stored());
assert!(!json_options.is_indexed());
assert!(json_options.is_fast());
}
{
let json_options: JsonObjectOptions = (FAST | STORED).into();
assert!(json_options.is_stored());
assert!(!json_options.is_indexed());
assert!(json_options.is_fast());
}
}
}

View File

@@ -10,8 +10,7 @@ use crate::fastfield::FastValue;
use crate::schema::{Facet, Type};
use crate::{DatePrecision, DateTime};
/// Separates the different segments of
/// the json path.
/// Separates the different segments of a json path.
pub const JSON_PATH_SEGMENT_SEP: u8 = 1u8;
pub const JSON_PATH_SEGMENT_SEP_STR: &str =
unsafe { std::str::from_utf8_unchecked(&[JSON_PATH_SEGMENT_SEP]) };