Added a expand_dots JsonObjectOptions. (#1687)

Related with quickwit#2345.
This commit is contained in:
Paul Masurel
2022-11-21 15:03:00 +01:00
committed by GitHub
parent e758080465
commit 0b40a7fe43
6 changed files with 165 additions and 53 deletions

View File

@@ -67,11 +67,12 @@ pub(crate) fn index_json_values<'a>(
doc: DocId,
json_values: impl Iterator<Item = crate::Result<&'a serde_json::Map<String, serde_json::Value>>>,
text_analyzer: &TextAnalyzer,
expand_dots_enabled: bool,
term_buffer: &mut Term,
postings_writer: &mut dyn PostingsWriter,
ctx: &mut IndexingContext,
) -> crate::Result<()> {
let mut json_term_writer = JsonTermWriter::wrap(term_buffer);
let mut json_term_writer = JsonTermWriter::wrap(term_buffer, expand_dots_enabled);
let mut positions_per_path: IndexingPositionsPerPath = Default::default();
for json_value_res in json_values {
let json_value = json_value_res?;
@@ -259,6 +260,7 @@ pub(crate) fn set_string_and_get_terms(
pub struct JsonTermWriter<'a> {
term_buffer: &'a mut Term,
path_stack: Vec<usize>,
expand_dots_enabled: bool,
}
/// Splits a json path supplied to the query parser in such a way that
@@ -298,23 +300,25 @@ impl<'a> JsonTermWriter<'a> {
pub fn from_field_and_json_path(
field: Field,
json_path: &str,
expand_dots_enabled: bool,
term_buffer: &'a mut Term,
) -> Self {
term_buffer.set_field_and_type(field, Type::Json);
let mut json_term_writer = Self::wrap(term_buffer);
let mut json_term_writer = Self::wrap(term_buffer, expand_dots_enabled);
for segment in split_json_path(json_path) {
json_term_writer.push_path_segment(&segment);
}
json_term_writer
}
pub fn wrap(term_buffer: &'a mut Term) -> Self {
pub fn wrap(term_buffer: &'a mut Term, expand_dots_enabled: bool) -> Self {
term_buffer.clear_with_type(Type::Json);
let mut path_stack = Vec::with_capacity(10);
path_stack.push(0);
Self {
term_buffer,
path_stack,
expand_dots_enabled,
}
}
@@ -336,11 +340,24 @@ impl<'a> JsonTermWriter<'a> {
self.trim_to_end_of_path();
let buffer = self.term_buffer.value_bytes_mut();
let buffer_len = buffer.len();
if self.path_stack.len() > 1 {
buffer[buffer_len - 1] = JSON_PATH_SEGMENT_SEP;
}
self.term_buffer.append_bytes(segment.as_bytes());
self.term_buffer.append_bytes(&[JSON_PATH_SEGMENT_SEP]);
if self.expand_dots_enabled && segment.as_bytes().contains(&b'.') {
// We need to replace `.` by JSON_PATH_SEGMENT_SEP.
self.term_buffer
.append_bytes(segment.as_bytes())
.iter_mut()
.for_each(|byte| {
if *byte == b'.' {
*byte = JSON_PATH_SEGMENT_SEP;
}
});
} else {
self.term_buffer.append_bytes(segment.as_bytes());
}
self.term_buffer.push_byte(JSON_PATH_SEGMENT_SEP);
self.path_stack.push(self.term_buffer.len_bytes());
}
@@ -391,7 +408,7 @@ mod tests {
fn test_json_writer() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
json_writer.push_path_segment("attributes");
json_writer.push_path_segment("color");
json_writer.set_str("red");
@@ -425,7 +442,7 @@ mod tests {
fn test_string_term() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
json_writer.push_path_segment("color");
json_writer.set_str("red");
assert_eq!(
@@ -438,7 +455,7 @@ mod tests {
fn test_i64_term() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
json_writer.push_path_segment("color");
json_writer.set_fast_value(-4i64);
assert_eq!(
@@ -451,7 +468,7 @@ mod tests {
fn test_u64_term() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
json_writer.push_path_segment("color");
json_writer.set_fast_value(4u64);
assert_eq!(
@@ -464,7 +481,7 @@ mod tests {
fn test_f64_term() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
json_writer.push_path_segment("color");
json_writer.set_fast_value(4.0f64);
assert_eq!(
@@ -477,7 +494,7 @@ mod tests {
fn test_bool_term() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
json_writer.push_path_segment("color");
json_writer.set_fast_value(true);
assert_eq!(
@@ -490,7 +507,7 @@ mod tests {
fn test_push_after_set_path_segment() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
json_writer.push_path_segment("attribute");
json_writer.set_str("something");
json_writer.push_path_segment("color");
@@ -505,7 +522,7 @@ mod tests {
fn test_pop_segment() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
json_writer.push_path_segment("color");
json_writer.push_path_segment("hue");
json_writer.pop_path_segment();
@@ -520,7 +537,7 @@ mod tests {
fn test_json_writer_path() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
json_writer.push_path_segment("color");
assert_eq!(json_writer.path(), b"color");
json_writer.push_path_segment("hue");
@@ -529,6 +546,37 @@ mod tests {
assert_eq!(json_writer.path(), b"color\x01hue");
}
#[test]
fn test_json_path_expand_dots_disabled() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term, false);
json_writer.push_path_segment("color.hue");
assert_eq!(json_writer.path(), b"color.hue");
}
#[test]
fn test_json_path_expand_dots_enabled() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term, true);
json_writer.push_path_segment("color.hue");
assert_eq!(json_writer.path(), b"color\x01hue");
}
#[test]
fn test_json_path_expand_dots_enabled_pop_segment() {
let field = Field::from_field_id(1);
let mut term = Term::with_type_and_field(Type::Json, field);
let mut json_writer = JsonTermWriter::wrap(&mut term, true);
json_writer.push_path_segment("hello");
assert_eq!(json_writer.path(), b"hello");
json_writer.push_path_segment("color.hue");
assert_eq!(json_writer.path(), b"hello\x01color\x01hue");
json_writer.pop_path_segment();
assert_eq!(json_writer.path(), b"hello");
}
#[test]
fn test_split_json_path_simple() {
let json_path = split_json_path("titi.toto");

View File

@@ -60,7 +60,7 @@ type AddBatchReceiver = channel::Receiver<AddBatch>;
mod tests_mmap {
use crate::collector::Count;
use crate::query::QueryParser;
use crate::schema::{Schema, STORED, TEXT};
use crate::schema::{JsonObjectOptions, Schema, TEXT};
use crate::{Index, Term};
#[test]
@@ -81,9 +81,9 @@ mod tests_mmap {
}
#[test]
fn test_json_field_espace() {
fn test_json_field_expand_dots_disabled_dot_escaped_required() {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT | STORED);
let json_field = schema_builder.add_json_field("json", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
@@ -99,4 +99,26 @@ mod tests_mmap {
let num_docs = searcher.search(&query, &Count).unwrap();
assert_eq!(num_docs, 1);
}
#[test]
fn test_json_field_expand_dots_enabled_dot_escape_not_required() {
let mut schema_builder = Schema::builder();
let json_options: JsonObjectOptions =
JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
let json_field = schema_builder.add_json_field("json", json_options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
index_writer.add_document(doc!(json_field=>json)).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 1);
let parse_query = QueryParser::for_index(&index, Vec::new());
let query = parse_query
.parse_query(r#"json.k8s.container.name:prometheus"#)
.unwrap();
let num_docs = searcher.search(&query, &Count).unwrap();
assert_eq!(num_docs, 1);
}
}

View File

@@ -180,7 +180,7 @@ impl SegmentWriter {
self.per_field_postings_writers.get_for_field_mut(field);
term_buffer.clear_with_field_and_type(field_entry.field_type().value_type(), field);
match *field_entry.field_type() {
match field_entry.field_type() {
FieldType::Facet(_) => {
for value in values {
let facet = value.as_facet().ok_or_else(make_schema_error)?;
@@ -307,7 +307,7 @@ impl SegmentWriter {
self.fieldnorms_writer.record(doc_id, field, num_vals);
}
}
FieldType::JsonObject(_) => {
FieldType::JsonObject(json_options) => {
let text_analyzer = &self.per_field_text_analyzers[field.field_id() as usize];
let json_values_it =
values.map(|value| value.as_json().ok_or_else(make_schema_error));
@@ -315,6 +315,7 @@ impl SegmentWriter {
doc_id,
json_values_it,
text_analyzer,
json_options.is_expand_dots_enabled(),
term_buffer,
postings_writer,
ctx,
@@ -557,7 +558,7 @@ mod tests {
let mut term = Term::with_type_and_field(Type::Json, json_field);
let mut term_stream = term_dict.stream().unwrap();
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);
json_term_writer.push_path_segment("bool");
json_term_writer.set_fast_value(true);
@@ -648,7 +649,7 @@ mod tests {
let segment_reader = searcher.segment_reader(0u32);
let inv_index = segment_reader.inverted_index(json_field).unwrap();
let mut term = Term::with_type_and_field(Type::Json, json_field);
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);
json_term_writer.push_path_segment("mykey");
json_term_writer.set_str("token");
let term_info = inv_index
@@ -692,7 +693,7 @@ mod tests {
let segment_reader = searcher.segment_reader(0u32);
let inv_index = segment_reader.inverted_index(json_field).unwrap();
let mut term = Term::with_type_and_field(Type::Json, json_field);
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);
json_term_writer.push_path_segment("mykey");
json_term_writer.set_str("two tokens");
let term_info = inv_index
@@ -737,7 +738,7 @@ mod tests {
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let mut term = Term::with_type_and_field(Type::Json, json_field);
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
let mut json_term_writer = JsonTermWriter::wrap(&mut term, false);
json_term_writer.push_path_segment("mykey");
json_term_writer.push_path_segment("field");
json_term_writer.set_str("hello");

View File

@@ -16,7 +16,8 @@ use crate::query::{
TermQuery, TermSetQuery,
};
use crate::schema::{
Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, Schema, Term, Type,
Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions,
Schema, Term, Type,
};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
@@ -482,28 +483,14 @@ impl QueryParser {
.into_iter()
.collect())
}
FieldType::JsonObject(ref json_options) => {
let option = json_options.get_text_indexing_options().ok_or_else(|| {
// This should have been seen earlier really.
QueryParserError::FieldNotIndexed(field_name.to_string())
})?;
let text_analyzer =
self.tokenizer_manager
.get(option.tokenizer())
.ok_or_else(|| QueryParserError::UnknownTokenizer {
field: field_name.to_string(),
tokenizer: option.tokenizer().to_string(),
})?;
let index_record_option = option.index_option();
generate_literals_for_json_object(
field_name,
field,
json_path,
phrase,
&text_analyzer,
index_record_option,
)
}
FieldType::JsonObject(ref json_options) => generate_literals_for_json_object(
field_name,
field,
json_path,
phrase,
&self.tokenizer_manager,
json_options,
),
FieldType::Facet(_) => match Facet::from_text(phrase) {
Ok(facet) => {
let facet_term = Term::from_facet(field, &facet);
@@ -767,17 +754,32 @@ fn generate_literals_for_json_object(
field: Field,
json_path: &str,
phrase: &str,
text_analyzer: &TextAnalyzer,
index_record_option: IndexRecordOption,
tokenizer_manager: &TokenizerManager,
json_options: &JsonObjectOptions,
) -> Result<Vec<LogicalLiteral>, QueryParserError> {
let text_options = json_options.get_text_indexing_options().ok_or_else(|| {
// This should have been seen earlier really.
QueryParserError::FieldNotIndexed(field_name.to_string())
})?;
let text_analyzer = tokenizer_manager
.get(text_options.tokenizer())
.ok_or_else(|| QueryParserError::UnknownTokenizer {
field: field_name.to_string(),
tokenizer: text_options.tokenizer().to_string(),
})?;
let index_record_option = text_options.index_option();
let mut logical_literals = Vec::new();
let mut term = Term::with_capacity(100);
let mut json_term_writer =
JsonTermWriter::from_field_and_json_path(field, json_path, &mut term);
let mut json_term_writer = JsonTermWriter::from_field_and_json_path(
field,
json_path,
json_options.is_expand_dots_enabled(),
&mut term,
);
if let Some(term) = convert_to_fast_value_and_get_term(&mut json_term_writer, phrase) {
logical_literals.push(LogicalLiteral::Term(term));
}
let terms = set_string_and_get_terms(&mut json_term_writer, phrase, text_analyzer);
let terms = set_string_and_get_terms(&mut json_term_writer, phrase, &text_analyzer);
drop(json_term_writer);
if terms.len() <= 1 {
for (_, term) in terms {

View File

@@ -13,6 +13,8 @@ pub struct JsonObjectOptions {
// If set to some, int, date, f64 and text will be indexed.
// Text will use the TextFieldIndexing setting for indexing.
indexing: Option<TextFieldIndexing>,
expand_dots_enabled: bool,
}
impl JsonObjectOptions {
@@ -26,6 +28,29 @@ impl JsonObjectOptions {
self.indexing.is_some()
}
/// Returns `true` iff dots in json keys should be expanded.
///
/// When expand_dots is enabled, json object like
/// `{"k8s.node.id": 5}` is processed as if it was
/// `{"k8s": {"node": {"id": 5}}}`.
/// It option has the merit of allowing users to
/// write queries like `k8s.node.id:5`.
/// On the other, enabling that feature can lead to
/// ambiguity.
///
/// If disabled, the "." need to be escaped:
/// `k8s\.node\.id:5`.
pub fn is_expand_dots_enabled(&self) -> bool {
self.expand_dots_enabled
}
/// Sets `expands_dots` to true.
/// See `is_expand_dots_enabled` for more information.
pub fn set_expand_dots_enabled(mut self) -> Self {
self.expand_dots_enabled = true;
self
}
/// Returns the text indexing options.
///
/// If set to `Some` then both int and str values will be indexed.
@@ -55,6 +80,7 @@ impl From<StoredFlag> for JsonObjectOptions {
JsonObjectOptions {
stored: true,
indexing: None,
expand_dots_enabled: false,
}
}
}
@@ -69,10 +95,11 @@ impl<T: Into<JsonObjectOptions>> BitOr<T> for JsonObjectOptions {
type Output = JsonObjectOptions;
fn bitor(self, other: T) -> Self {
let other = other.into();
let other: JsonObjectOptions = other.into();
JsonObjectOptions {
indexing: self.indexing.or(other.indexing),
stored: self.stored | other.stored,
expand_dots_enabled: self.expand_dots_enabled | other.expand_dots_enabled,
}
}
}
@@ -93,6 +120,7 @@ impl From<TextOptions> for JsonObjectOptions {
JsonObjectOptions {
stored: text_options.is_stored(),
indexing: text_options.get_indexing_options().cloned(),
expand_dots_enabled: false,
}
}
}

View File

@@ -197,8 +197,19 @@ impl Term {
}
/// Appends value bytes to the Term.
pub fn append_bytes(&mut self, bytes: &[u8]) {
///
/// This function returns the segment that has just been added.
#[inline]
pub fn append_bytes(&mut self, bytes: &[u8]) -> &mut [u8] {
let len_before = self.0.len();
self.0.extend_from_slice(bytes);
&mut self.0[len_before..]
}
/// Appends a single byte to the term.
#[inline]
pub fn push_byte(&mut self, byte: u8) {
self.0.push(byte);
}
}