mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
allow more JSON values, fix i64 special case (#2383)
This changes three things: - Reuse positions_per_path hashmap instead of allocating one per indexed JSON value - Try to cast u64 values to i64 to streamline with search behaviour - Allow top level json values to be of any type, instead of limiting it to JSON objects. Remove special JSON object handling method. TODO: We probably should also try to check f64 to i64 and u64 when indexing, as values may get converted to f64 by the JSON parser
This commit is contained in:
@@ -31,7 +31,7 @@ use crate::{DateTime, DocId, Term};
|
||||
/// position 1.
|
||||
/// As a result, with lemmatization, "The Smiths" will match our object.
|
||||
///
|
||||
/// Worse, if a same term is appears in the second object, a non increasing value would be pushed
|
||||
/// Worse, if a same term appears in the second object, a non increasing value would be pushed
|
||||
/// to the position recorder probably provoking a panic.
|
||||
///
|
||||
/// This problem is solved for regular multivalued object by offsetting the position
|
||||
@@ -50,7 +50,7 @@ use crate::{DateTime, DocId, Term};
|
||||
/// We can therefore afford working with a map that is not imperfect. It is fine if several
|
||||
/// path map to the same index position as long as the probability is relatively low.
|
||||
#[derive(Default)]
|
||||
struct IndexingPositionsPerPath {
|
||||
pub(crate) struct IndexingPositionsPerPath {
|
||||
positions_per_path: FxHashMap<u32, IndexingPosition>,
|
||||
}
|
||||
|
||||
@@ -58,6 +58,9 @@ impl IndexingPositionsPerPath {
|
||||
fn get_position_from_id(&mut self, id: u32) -> &mut IndexingPosition {
|
||||
self.positions_per_path.entry(id).or_default()
|
||||
}
|
||||
pub fn clear(&mut self) {
|
||||
self.positions_per_path.clear();
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert JSON_PATH_SEGMENT_SEP to a dot.
|
||||
@@ -68,36 +71,6 @@ pub fn json_path_sep_to_dot(path: &mut str) {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) fn index_json_values<'a, V: Value<'a>>(
|
||||
doc: DocId,
|
||||
json_visitors: impl Iterator<Item = crate::Result<V::ObjectIter>>,
|
||||
text_analyzer: &mut TextAnalyzer,
|
||||
expand_dots_enabled: bool,
|
||||
term_buffer: &mut Term,
|
||||
postings_writer: &mut dyn PostingsWriter,
|
||||
json_path_writer: &mut JsonPathWriter,
|
||||
ctx: &mut IndexingContext,
|
||||
) -> crate::Result<()> {
|
||||
json_path_writer.clear();
|
||||
json_path_writer.set_expand_dots(expand_dots_enabled);
|
||||
let mut positions_per_path: IndexingPositionsPerPath = Default::default();
|
||||
for json_visitor_res in json_visitors {
|
||||
let json_visitor = json_visitor_res?;
|
||||
index_json_object::<V>(
|
||||
doc,
|
||||
json_visitor,
|
||||
text_analyzer,
|
||||
term_buffer,
|
||||
json_path_writer,
|
||||
postings_writer,
|
||||
ctx,
|
||||
&mut positions_per_path,
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn index_json_object<'a, V: Value<'a>>(
|
||||
doc: DocId,
|
||||
@@ -126,7 +99,7 @@ fn index_json_object<'a, V: Value<'a>>(
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn index_json_value<'a, V: Value<'a>>(
|
||||
pub(crate) fn index_json_value<'a, V: Value<'a>>(
|
||||
doc: DocId,
|
||||
json_value: V,
|
||||
text_analyzer: &mut TextAnalyzer,
|
||||
@@ -166,12 +139,18 @@ fn index_json_value<'a, V: Value<'a>>(
|
||||
);
|
||||
}
|
||||
ReferenceValueLeaf::U64(val) => {
|
||||
// try to parse to i64, since when querying we will apply the same logic and prefer
|
||||
// i64 values
|
||||
set_path_id(
|
||||
term_buffer,
|
||||
ctx.path_to_unordered_id
|
||||
.get_or_allocate_unordered_id(json_path_writer.as_str()),
|
||||
);
|
||||
term_buffer.append_type_and_fast_value(val);
|
||||
if let Ok(i64_val) = val.try_into() {
|
||||
term_buffer.append_type_and_fast_value::<i64>(i64_val);
|
||||
} else {
|
||||
term_buffer.append_type_and_fast_value(val);
|
||||
}
|
||||
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
|
||||
}
|
||||
ReferenceValueLeaf::I64(val) => {
|
||||
|
||||
@@ -417,7 +417,7 @@ fn test_non_text_json_term_freq() {
|
||||
let inv_idx = segment_reader.inverted_index(field).unwrap();
|
||||
|
||||
let mut term = Term::from_field_json_path(field, "tenant_id", false);
|
||||
term.append_type_and_fast_value(75u64);
|
||||
term.append_type_and_fast_value(75i64);
|
||||
|
||||
let postings = inv_idx
|
||||
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
|
||||
@@ -451,7 +451,7 @@ fn test_non_text_json_term_freq_bitpacked() {
|
||||
let inv_idx = segment_reader.inverted_index(field).unwrap();
|
||||
|
||||
let mut term = Term::from_field_json_path(field, "tenant_id", false);
|
||||
term.append_type_and_fast_value(75u64);
|
||||
term.append_type_and_fast_value(75i64);
|
||||
|
||||
let mut postings = inv_idx
|
||||
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
|
||||
|
||||
@@ -5,16 +5,16 @@ use tokenizer_api::BoxTokenStream;
|
||||
|
||||
use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping};
|
||||
use super::operation::AddOperation;
|
||||
use crate::core::json_utils::index_json_values;
|
||||
use crate::fastfield::FastFieldsWriter;
|
||||
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
|
||||
use crate::index::Segment;
|
||||
use crate::indexer::segment_serializer::SegmentSerializer;
|
||||
use crate::json_utils::{index_json_value, IndexingPositionsPerPath};
|
||||
use crate::postings::{
|
||||
compute_table_memory_size, serialize_postings, IndexingContext, IndexingPosition,
|
||||
PerFieldPostingsWriter, PostingsWriter,
|
||||
};
|
||||
use crate::schema::document::{Document, ReferenceValue, Value};
|
||||
use crate::schema::document::{Document, Value};
|
||||
use crate::schema::{FieldEntry, FieldType, Schema, Term, DATE_TIME_PRECISION_INDEXED};
|
||||
use crate::store::{StoreReader, StoreWriter};
|
||||
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
|
||||
@@ -68,6 +68,7 @@ pub struct SegmentWriter {
|
||||
pub(crate) fast_field_writers: FastFieldsWriter,
|
||||
pub(crate) fieldnorms_writer: FieldNormsWriter,
|
||||
pub(crate) json_path_writer: JsonPathWriter,
|
||||
pub(crate) json_positions_per_path: IndexingPositionsPerPath,
|
||||
pub(crate) doc_opstamps: Vec<Opstamp>,
|
||||
per_field_text_analyzers: Vec<TextAnalyzer>,
|
||||
term_buffer: Term,
|
||||
@@ -119,6 +120,7 @@ impl SegmentWriter {
|
||||
per_field_postings_writers,
|
||||
fieldnorms_writer: FieldNormsWriter::for_schema(&schema),
|
||||
json_path_writer: JsonPathWriter::default(),
|
||||
json_positions_per_path: IndexingPositionsPerPath::default(),
|
||||
segment_serializer,
|
||||
fast_field_writers: FastFieldsWriter::from_schema_and_tokenizer_manager(
|
||||
&schema,
|
||||
@@ -342,26 +344,24 @@ impl SegmentWriter {
|
||||
FieldType::JsonObject(json_options) => {
|
||||
let text_analyzer =
|
||||
&mut self.per_field_text_analyzers[field.field_id() as usize];
|
||||
let json_values_it = values.map(|value_access| {
|
||||
// Used to help with linting and type checking.
|
||||
let value_access = value_access as D::Value<'_>;
|
||||
let value = value_access.as_value();
|
||||
|
||||
match value {
|
||||
ReferenceValue::Object(object_iter) => Ok(object_iter),
|
||||
_ => Err(make_schema_error()),
|
||||
}
|
||||
});
|
||||
index_json_values::<D::Value<'_>>(
|
||||
doc_id,
|
||||
json_values_it,
|
||||
text_analyzer,
|
||||
json_options.is_expand_dots_enabled(),
|
||||
term_buffer,
|
||||
postings_writer,
|
||||
&mut self.json_path_writer,
|
||||
ctx,
|
||||
)?;
|
||||
self.json_positions_per_path.clear();
|
||||
self.json_path_writer
|
||||
.set_expand_dots(json_options.is_expand_dots_enabled());
|
||||
for json_value in values {
|
||||
self.json_path_writer.clear();
|
||||
|
||||
index_json_value(
|
||||
doc_id,
|
||||
json_value,
|
||||
text_analyzer,
|
||||
term_buffer,
|
||||
&mut self.json_path_writer,
|
||||
postings_writer,
|
||||
ctx,
|
||||
&mut self.json_positions_per_path,
|
||||
);
|
||||
}
|
||||
}
|
||||
FieldType::IpAddr(_) => {
|
||||
let mut num_vals = 0;
|
||||
@@ -502,7 +502,8 @@ mod tests {
|
||||
use crate::query::{PhraseQuery, QueryParser};
|
||||
use crate::schema::document::Value;
|
||||
use crate::schema::{
|
||||
Document, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, STORED, STRING, TEXT,
|
||||
Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, STORED,
|
||||
STRING, TEXT,
|
||||
};
|
||||
use crate::store::{Compressor, StoreReader, StoreWriter};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
@@ -597,6 +598,45 @@ mod tests {
|
||||
assert_eq!(score_docs.len(), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_flat_json_indexing() {
|
||||
// A JSON Object that contains mixed values on the first level
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_field = schema_builder.add_json_field("json", STORED | STRING);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let mut writer = index.writer_for_tests().unwrap();
|
||||
// Text, i64, u64
|
||||
writer.add_document(doc!(json_field=>"b")).unwrap();
|
||||
writer
|
||||
.add_document(doc!(json_field=>OwnedValue::I64(10i64)))
|
||||
.unwrap();
|
||||
writer
|
||||
.add_document(doc!(json_field=>OwnedValue::U64(55u64)))
|
||||
.unwrap();
|
||||
writer
|
||||
.add_document(doc!(json_field=>json!({"my_field": "a"})))
|
||||
.unwrap();
|
||||
writer.commit().unwrap();
|
||||
|
||||
let search_and_expect = |query| {
|
||||
let query_parser = QueryParser::for_index(&index, vec![json_field]);
|
||||
let text_query = query_parser.parse_query(query).unwrap();
|
||||
let score_docs: Vec<(_, DocAddress)> = index
|
||||
.reader()
|
||||
.unwrap()
|
||||
.searcher()
|
||||
.search(&text_query, &TopDocs::with_limit(4))
|
||||
.unwrap();
|
||||
assert_eq!(score_docs.len(), 1);
|
||||
};
|
||||
|
||||
search_and_expect("my_field:a");
|
||||
search_and_expect("b");
|
||||
search_and_expect("10");
|
||||
search_and_expect("55");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_json_indexing() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
Reference in New Issue
Block a user