allow more JSON values, fix i64 special case (#2383)

This changes three things:
- Reuse positions_per_path hashmap instead of allocating one per
  indexed JSON value
- Try to cast u64 values to i64 to streamline with search behaviour
- Allow top level json values to be of any type, instead of limiting it
  to JSON objects. Remove special JSON object handling method.

TODO: We probably should also try to check f64 to i64 and u64 when
indexing, as values may get converted to f64 by the JSON parser
This commit is contained in:
PSeitz
2024-05-01 12:08:12 +02:00
committed by GitHub
parent 99a59ad37e
commit 92b5526310
3 changed files with 77 additions and 58 deletions

View File

@@ -31,7 +31,7 @@ use crate::{DateTime, DocId, Term};
/// position 1.
/// As a result, with lemmatization, "The Smiths" will match our object.
///
/// Worse, if a same term is appears in the second object, a non increasing value would be pushed
/// Worse, if a same term appears in the second object, a non increasing value would be pushed
/// to the position recorder probably provoking a panic.
///
/// This problem is solved for regular multivalued object by offsetting the position
@@ -50,7 +50,7 @@ use crate::{DateTime, DocId, Term};
/// We can therefore afford working with a map that is not imperfect. It is fine if several
/// path map to the same index position as long as the probability is relatively low.
#[derive(Default)]
struct IndexingPositionsPerPath {
pub(crate) struct IndexingPositionsPerPath {
positions_per_path: FxHashMap<u32, IndexingPosition>,
}
@@ -58,6 +58,9 @@ impl IndexingPositionsPerPath {
fn get_position_from_id(&mut self, id: u32) -> &mut IndexingPosition {
self.positions_per_path.entry(id).or_default()
}
pub fn clear(&mut self) {
self.positions_per_path.clear();
}
}
/// Convert JSON_PATH_SEGMENT_SEP to a dot.
@@ -68,36 +71,6 @@ pub fn json_path_sep_to_dot(path: &mut str) {
}
}
#[allow(clippy::too_many_arguments)]
pub(crate) fn index_json_values<'a, V: Value<'a>>(
doc: DocId,
json_visitors: impl Iterator<Item = crate::Result<V::ObjectIter>>,
text_analyzer: &mut TextAnalyzer,
expand_dots_enabled: bool,
term_buffer: &mut Term,
postings_writer: &mut dyn PostingsWriter,
json_path_writer: &mut JsonPathWriter,
ctx: &mut IndexingContext,
) -> crate::Result<()> {
json_path_writer.clear();
json_path_writer.set_expand_dots(expand_dots_enabled);
let mut positions_per_path: IndexingPositionsPerPath = Default::default();
for json_visitor_res in json_visitors {
let json_visitor = json_visitor_res?;
index_json_object::<V>(
doc,
json_visitor,
text_analyzer,
term_buffer,
json_path_writer,
postings_writer,
ctx,
&mut positions_per_path,
);
}
Ok(())
}
#[allow(clippy::too_many_arguments)]
fn index_json_object<'a, V: Value<'a>>(
doc: DocId,
@@ -126,7 +99,7 @@ fn index_json_object<'a, V: Value<'a>>(
}
#[allow(clippy::too_many_arguments)]
fn index_json_value<'a, V: Value<'a>>(
pub(crate) fn index_json_value<'a, V: Value<'a>>(
doc: DocId,
json_value: V,
text_analyzer: &mut TextAnalyzer,
@@ -166,12 +139,18 @@ fn index_json_value<'a, V: Value<'a>>(
);
}
ReferenceValueLeaf::U64(val) => {
// try to parse to i64, since when querying we will apply the same logic and prefer
// i64 values
set_path_id(
term_buffer,
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
);
term_buffer.append_type_and_fast_value(val);
if let Ok(i64_val) = val.try_into() {
term_buffer.append_type_and_fast_value::<i64>(i64_val);
} else {
term_buffer.append_type_and_fast_value(val);
}
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
ReferenceValueLeaf::I64(val) => {

View File

@@ -417,7 +417,7 @@ fn test_non_text_json_term_freq() {
let inv_idx = segment_reader.inverted_index(field).unwrap();
let mut term = Term::from_field_json_path(field, "tenant_id", false);
term.append_type_and_fast_value(75u64);
term.append_type_and_fast_value(75i64);
let postings = inv_idx
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
@@ -451,7 +451,7 @@ fn test_non_text_json_term_freq_bitpacked() {
let inv_idx = segment_reader.inverted_index(field).unwrap();
let mut term = Term::from_field_json_path(field, "tenant_id", false);
term.append_type_and_fast_value(75u64);
term.append_type_and_fast_value(75i64);
let mut postings = inv_idx
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)

View File

@@ -5,16 +5,16 @@ use tokenizer_api::BoxTokenStream;
use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping};
use super::operation::AddOperation;
use crate::core::json_utils::index_json_values;
use crate::fastfield::FastFieldsWriter;
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
use crate::index::Segment;
use crate::indexer::segment_serializer::SegmentSerializer;
use crate::json_utils::{index_json_value, IndexingPositionsPerPath};
use crate::postings::{
compute_table_memory_size, serialize_postings, IndexingContext, IndexingPosition,
PerFieldPostingsWriter, PostingsWriter,
};
use crate::schema::document::{Document, ReferenceValue, Value};
use crate::schema::document::{Document, Value};
use crate::schema::{FieldEntry, FieldType, Schema, Term, DATE_TIME_PRECISION_INDEXED};
use crate::store::{StoreReader, StoreWriter};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
@@ -68,6 +68,7 @@ pub struct SegmentWriter {
pub(crate) fast_field_writers: FastFieldsWriter,
pub(crate) fieldnorms_writer: FieldNormsWriter,
pub(crate) json_path_writer: JsonPathWriter,
pub(crate) json_positions_per_path: IndexingPositionsPerPath,
pub(crate) doc_opstamps: Vec<Opstamp>,
per_field_text_analyzers: Vec<TextAnalyzer>,
term_buffer: Term,
@@ -119,6 +120,7 @@ impl SegmentWriter {
per_field_postings_writers,
fieldnorms_writer: FieldNormsWriter::for_schema(&schema),
json_path_writer: JsonPathWriter::default(),
json_positions_per_path: IndexingPositionsPerPath::default(),
segment_serializer,
fast_field_writers: FastFieldsWriter::from_schema_and_tokenizer_manager(
&schema,
@@ -342,26 +344,24 @@ impl SegmentWriter {
FieldType::JsonObject(json_options) => {
let text_analyzer =
&mut self.per_field_text_analyzers[field.field_id() as usize];
let json_values_it = values.map(|value_access| {
// Used to help with linting and type checking.
let value_access = value_access as D::Value<'_>;
let value = value_access.as_value();
match value {
ReferenceValue::Object(object_iter) => Ok(object_iter),
_ => Err(make_schema_error()),
}
});
index_json_values::<D::Value<'_>>(
doc_id,
json_values_it,
text_analyzer,
json_options.is_expand_dots_enabled(),
term_buffer,
postings_writer,
&mut self.json_path_writer,
ctx,
)?;
self.json_positions_per_path.clear();
self.json_path_writer
.set_expand_dots(json_options.is_expand_dots_enabled());
for json_value in values {
self.json_path_writer.clear();
index_json_value(
doc_id,
json_value,
text_analyzer,
term_buffer,
&mut self.json_path_writer,
postings_writer,
ctx,
&mut self.json_positions_per_path,
);
}
}
FieldType::IpAddr(_) => {
let mut num_vals = 0;
@@ -502,7 +502,8 @@ mod tests {
use crate::query::{PhraseQuery, QueryParser};
use crate::schema::document::Value;
use crate::schema::{
Document, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, STORED, STRING, TEXT,
Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, STORED,
STRING, TEXT,
};
use crate::store::{Compressor, StoreReader, StoreWriter};
use crate::time::format_description::well_known::Rfc3339;
@@ -597,6 +598,45 @@ mod tests {
assert_eq!(score_docs.len(), 2);
}
#[test]
fn test_flat_json_indexing() {
// A JSON Object that contains mixed values on the first level
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", STORED | STRING);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let mut writer = index.writer_for_tests().unwrap();
// Text, i64, u64
writer.add_document(doc!(json_field=>"b")).unwrap();
writer
.add_document(doc!(json_field=>OwnedValue::I64(10i64)))
.unwrap();
writer
.add_document(doc!(json_field=>OwnedValue::U64(55u64)))
.unwrap();
writer
.add_document(doc!(json_field=>json!({"my_field": "a"})))
.unwrap();
writer.commit().unwrap();
let search_and_expect = |query| {
let query_parser = QueryParser::for_index(&index, vec![json_field]);
let text_query = query_parser.parse_query(query).unwrap();
let score_docs: Vec<(_, DocAddress)> = index
.reader()
.unwrap()
.searcher()
.search(&text_query, &TopDocs::with_limit(4))
.unwrap();
assert_eq!(score_docs.len(), 1);
};
search_and_expect("my_field:a");
search_and_expect("b");
search_and_expect("10");
search_and_expect("55");
}
#[test]
fn test_json_indexing() {
let mut schema_builder = Schema::builder();