use columnar::NumericalValue;
use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
use common::{replace_in_place, JsonPathWriter};
use rustc_hash::FxHashMap;

use crate::indexer::indexing_term::IndexingTerm;
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value};
use crate::schema::{Type, DATE_TIME_PRECISION_INDEXED};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::{OffsetDateTime, UtcOffset};
use crate::tokenizer::TextAnalyzer;
use crate::{DateTime, DocId, Term};

/// This object is a map storing the last position for a given path for the current document
/// being indexed.
///
/// It is key to solving the following problem:
/// if we index a JSON object emitting several terms with the same path,
/// we do not want to create false positives in phrase queries.
///
/// For instance:
///
/// ```json
/// {"bands": [
///     {"band_name": "Elliot Smith"},
///     {"band_name": "The Who"},
/// ]}
/// ```
///
/// If we are careless and index each band name independently,
/// `Elliot` and `The` will end up indexed at position 0, and `Smith` and `Who` will be indexed at
/// position 1.
/// As a result, with lemmatization, "The Smiths" would match our object.
///
/// Worse, if the same term appears in the second object, a non-increasing value would be pushed
/// to the position recorder, probably provoking a panic.
///
/// This problem is solved for regular multivalued fields by offsetting the positions
/// of values with a position gap. Here we would like `The` and `Who` to get indexed at
/// positions 2 and 3 respectively.
///
/// With regular fields, we sort the fields beforehand, so that all terms with the same
/// path are indexed consecutively.
///
/// In a JSON object, we do not have this comfort, so we need to record these position offsets in
/// a map.
///
/// Note that using a single position for the entire object would not hurt correctness.
/// It would however hurt compression.
///
/// We can therefore afford to work with an imperfect map. It is fine if several
/// paths map to the same indexing position, as long as the probability is relatively low.
#[derive(Default)]
pub(crate) struct IndexingPositionsPerPath {
    positions_per_path: FxHashMap<u32, IndexingPosition>,
}
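
// Worked example (illustrative, following the doc comment above): both
// "Elliot Smith" and "The Who" share the path `bands.band_name`. After the
// tokens of "Elliot Smith" are indexed at positions 0 and 1, the
// `IndexingPosition` recorded for that path's unordered id makes the tokens
// of "The Who" resume further along (positions 2 and 3, with a position gap)
// instead of colliding at positions 0 and 1.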

impl IndexingPositionsPerPath {
    fn get_position_from_id(&mut self, id: u32) -> &mut IndexingPosition {
        self.positions_per_path.entry(id).or_default()
    }

    pub fn clear(&mut self) {
        self.positions_per_path.clear();
    }
}

/// Converts `JSON_PATH_SEGMENT_SEP` to a dot.
pub fn json_path_sep_to_dot(path: &mut str) {
    // This is safe since we are replacing an ASCII character with another ASCII character.
    unsafe {
        replace_in_place(JSON_PATH_SEGMENT_SEP, b'.', path.as_bytes_mut());
    }
}

#[expect(clippy::too_many_arguments)]
fn index_json_object<'a, V: Value<'a>>(
    doc: DocId,
    json_visitor: V::ObjectIter,
    text_analyzer: &mut TextAnalyzer,
    term_buffer: &mut IndexingTerm,
    json_path_writer: &mut JsonPathWriter,
    postings_writer: &mut dyn PostingsWriter,
    ctx: &mut IndexingContext,
    positions_per_path: &mut IndexingPositionsPerPath,
) {
    for (json_path_segment, json_value_visitor) in json_visitor {
        if json_path_segment.as_bytes().contains(&JSON_END_OF_PATH) {
            // Keys containing the end-of-path byte cannot be represented in
            // the term encoding, so we skip them.
            continue;
        }
        json_path_writer.push(json_path_segment);
        index_json_value(
            doc,
            json_value_visitor,
            text_analyzer,
            term_buffer,
            json_path_writer,
            postings_writer,
            ctx,
            positions_per_path,
        );
        json_path_writer.pop();
    }
}

#[expect(clippy::too_many_arguments)]
pub(crate) fn index_json_value<'a, V: Value<'a>>(
    doc: DocId,
    json_value: V,
    text_analyzer: &mut TextAnalyzer,
    term_buffer: &mut IndexingTerm,
    json_path_writer: &mut JsonPathWriter,
    postings_writer: &mut dyn PostingsWriter,
    ctx: &mut IndexingContext,
    positions_per_path: &mut IndexingPositionsPerPath,
) {
    let set_path_id = |term_buffer: &mut IndexingTerm, unordered_id: u32| {
        term_buffer.truncate_value_bytes(0);
        term_buffer.append_bytes(&unordered_id.to_be_bytes());
    };
    let set_type = |term_buffer: &mut IndexingTerm, typ: Type| {
        term_buffer.append_bytes(&[typ.to_code()]);
    };

    match json_value.as_value() {
        ReferenceValue::Leaf(leaf) => match leaf {
            ReferenceValueLeaf::Null => {}
            ReferenceValueLeaf::Str(val) => {
                let mut token_stream = text_analyzer.token_stream(val);
                let unordered_id = ctx
                    .path_to_unordered_id
                    .get_or_allocate_unordered_id(json_path_writer.as_str());

                // TODO: make sure the chain position works out.
                set_path_id(term_buffer, unordered_id);
                set_type(term_buffer, Type::Str);
                let indexing_position = positions_per_path.get_position_from_id(unordered_id);
                postings_writer.index_text(
                    doc,
                    &mut *token_stream,
                    term_buffer,
                    ctx,
                    indexing_position,
                );
            }
            ReferenceValueLeaf::U64(val) => {
                // Try to index as an i64 when possible, since the query side applies the
                // same logic and prefers i64 values.
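                // For instance, `4u64` is indexed as `I64(4)`; only values greater than
                // `i64::MAX` keep the `U64` representation.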
                set_path_id(
                    term_buffer,
                    ctx.path_to_unordered_id
                        .get_or_allocate_unordered_id(json_path_writer.as_str()),
                );
                if let Ok(i64_val) = val.try_into() {
                    term_buffer.append_type_and_fast_value::<i64>(i64_val);
                } else {
                    term_buffer.append_type_and_fast_value::<u64>(val);
                }
                postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
            }
            ReferenceValueLeaf::I64(val) => {
                set_path_id(
                    term_buffer,
                    ctx.path_to_unordered_id
                        .get_or_allocate_unordered_id(json_path_writer.as_str()),
                );
                term_buffer.append_type_and_fast_value(val);
                postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
            }
            ReferenceValueLeaf::F64(val) => {
                if !val.is_finite() {
                    return;
                }
                set_path_id(
                    term_buffer,
                    ctx.path_to_unordered_id
                        .get_or_allocate_unordered_id(json_path_writer.as_str()),
                );
                // Normalizing here is important: in the inverted index, we coerce all
                // numerical values to their canonical representation.
                //
                // (We do the same thing on the query side.)
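                // For example, a JSON value of `4.0` would (assuming the columnar crate's
                // normalization coerces integral floats) be indexed as `I64(4)`, while
                // `0.5` keeps its `F64` representation.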
                match NumericalValue::F64(val).normalize() {
                    NumericalValue::I64(val_i64) => {
                        term_buffer.append_type_and_fast_value::<i64>(val_i64);
                    }
                    NumericalValue::U64(val_u64) => {
                        term_buffer.append_type_and_fast_value::<u64>(val_u64);
                    }
                    NumericalValue::F64(val_f64) => {
                        term_buffer.append_type_and_fast_value::<f64>(val_f64);
                    }
                }
                postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
            }
            ReferenceValueLeaf::Bool(val) => {
                set_path_id(
                    term_buffer,
                    ctx.path_to_unordered_id
                        .get_or_allocate_unordered_id(json_path_writer.as_str()),
                );
                term_buffer.append_type_and_fast_value(val);
                postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
            }
            ReferenceValueLeaf::Date(val) => {
                set_path_id(
                    term_buffer,
                    ctx.path_to_unordered_id
                        .get_or_allocate_unordered_id(json_path_writer.as_str()),
                );
                let val = val.truncate(DATE_TIME_PRECISION_INDEXED);
                term_buffer.append_type_and_fast_value(val);
                postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
            }
            ReferenceValueLeaf::PreTokStr(_) => {
                unimplemented!(
                    "Pre-tokenized string support in dynamic fields is not yet implemented"
                )
            }
            ReferenceValueLeaf::Bytes(_) => {
                unimplemented!("Bytes support in dynamic fields is not yet implemented")
            }
            ReferenceValueLeaf::Facet(_) => {
                unimplemented!("Facet support in dynamic fields is not yet implemented")
            }
            ReferenceValueLeaf::IpAddr(_) => {
                unimplemented!("IP address support in dynamic fields is not yet implemented")
            }
            ReferenceValueLeaf::Geometry(_) => {
                unimplemented!("Geometry support in dynamic fields is not yet implemented")
            }
        },
        ReferenceValue::Array(elements) => {
            for val in elements {
                index_json_value(
                    doc,
                    val,
                    text_analyzer,
                    term_buffer,
                    json_path_writer,
                    postings_writer,
                    ctx,
                    positions_per_path,
                );
            }
        }
        ReferenceValue::Object(object) => {
            index_json_object::<V>(
                doc,
                object,
                text_analyzer,
                term_buffer,
                json_path_writer,
                postings_writer,
                ctx,
                positions_per_path,
            );
        }
    }
}

/// Tries to infer a JSON type from a string and append it to the term.
///
/// The term must be a JSON term holding only the JSON path (no value bytes appended yet).
pub fn convert_to_fast_value_and_append_to_json_term(
    term: &Term,
    text: &str,
    truncate_date_for_search: bool,
) -> Option<Term> {
    assert_eq!(
        term.value()
            .as_json_value_bytes()
            .expect("expecting a Term with a json type and json path")
            .as_serialized()
            .len(),
        0,
        "JSON value bytes should be empty"
    );
    try_convert_to_datetime_and_append_to_json_term(term, text, truncate_date_for_search)
        .or_else(|| try_convert_to_number_and_append_to_json_term(term, text))
        .or_else(|| try_convert_to_bool_and_append_to_json_term_typed(term, text))
}

fn try_convert_to_datetime_and_append_to_json_term(
    term: &Term,
    text: &str,
    truncate_date_for_search: bool,
) -> Option<Term> {
    let dt = OffsetDateTime::parse(text, &Rfc3339).ok()?;
    let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
    if truncate_date_for_search {
        dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
    }
    let mut term_clone = term.clone();
    term_clone.append_type_and_fast_value(dt);
    Some(term_clone)
}

fn try_convert_to_number_and_append_to_json_term(term: &Term, text: &str) -> Option<Term> {
    let numerical_value: NumericalValue = str::parse::<NumericalValue>(text).ok()?;
    let mut term_clone = term.clone();
    // Parsing actually returns normalized values already today, but let's not
    // rely on that hidden contract.
    match numerical_value.normalize() {
        NumericalValue::I64(i64_value) => {
            term_clone.append_type_and_fast_value::<i64>(i64_value);
        }
        NumericalValue::U64(u64_value) => {
            term_clone.append_type_and_fast_value::<u64>(u64_value);
        }
        NumericalValue::F64(f64_value) => {
            term_clone.append_type_and_fast_value::<f64>(f64_value);
        }
    }
    Some(term_clone)
}

fn try_convert_to_bool_and_append_to_json_term_typed(term: &Term, text: &str) -> Option<Term> {
    let val = str::parse::<bool>(text).ok()?;
    let mut term_clone = term.clone();
    term_clone.append_type_and_fast_value(val);
    Some(term_clone)
}

/// Splits a json path supplied to the query parser in such a way that
/// `.` can be escaped.
///
/// In other words,
/// - `k8s.node` ends up as `["k8s", "node"]`.
/// - `k8s\.node` ends up as `["k8s.node"]`.
pub fn split_json_path(json_path: &str) -> Vec<String> {
    let mut escaped_state: bool = false;
    let mut json_path_segments = Vec::new();
    let mut buffer = String::new();
    for ch in json_path.chars() {
        if escaped_state {
            buffer.push(ch);
            escaped_state = false;
            continue;
        }
        match ch {
            '\\' => {
                escaped_state = true;
            }
            '.' => {
                let new_segment = std::mem::take(&mut buffer);
                json_path_segments.push(new_segment);
            }
            _ => {
                buffer.push(ch);
            }
        }
    }
    json_path_segments.push(buffer);
    json_path_segments
}

/// Takes a field name, a json path as supplied by a user, and whether we should expand dots,
/// and returns a column key, as expected by the columnar crate.
///
/// This function will detect unescaped dots in the path, and split over them.
/// If expand_dots is enabled, then even escaped dots will be split over.
///
/// The resulting list of segments then gets stitched together, joined by the \1 separator,
/// as defined in the columnar crate.
pub(crate) fn encode_column_name(
    field_name: &str,
    json_path: &str,
    expand_dots_enabled: bool,
) -> String {
    let mut path = JsonPathWriter::default();
    path.push(field_name);
    path.set_expand_dots(expand_dots_enabled);
    for segment in split_json_path(json_path) {
        path.push(&segment);
    }
    path.into()
}

#[cfg(test)]
mod tests {
    use super::split_json_path;
    use crate::schema::Field;
    use crate::Term;

    #[test]
    fn test_json_writer() {
        let field = Field::from_field_id(1);

        let mut term = Term::from_field_json_path(field, "attributes.color", false);
        term.append_type_and_str("red");
        assert_eq!(
            format!("{term:?}"),
            "Term(field=1, type=Json, path=attributes.color, type=Str, \"red\")"
        );

        let mut term = Term::from_field_json_path(field, "attributes.dimensions.width", false);
        term.append_type_and_fast_value(400i64);
        assert_eq!(
            format!("{term:?}"),
            "Term(field=1, type=Json, path=attributes.dimensions.width, type=I64, 400)"
        );
    }

    #[test]
    fn test_string_term() {
        let field = Field::from_field_id(1);
        let mut term = Term::from_field_json_path(field, "color", false);
        term.append_type_and_str("red");

        assert_eq!(term.serialized_term(), b"\x00\x00\x00\x01jcolor\x00sred")
    }

    #[test]
    fn test_i64_term() {
        let field = Field::from_field_id(1);
        let mut term = Term::from_field_json_path(field, "color", false);
        term.append_type_and_fast_value(-4i64);

        assert_eq!(
            term.serialized_term(),
            b"\x00\x00\x00\x01jcolor\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc"
        )
    }

    #[test]
    fn test_u64_term() {
        let field = Field::from_field_id(1);
        let mut term = Term::from_field_json_path(field, "color", false);
        term.append_type_and_fast_value(4u64);

        assert_eq!(
            term.serialized_term(),
            b"\x00\x00\x00\x01jcolor\x00u\x00\x00\x00\x00\x00\x00\x00\x04"
        )
    }

    #[test]
    fn test_f64_term() {
        let field = Field::from_field_id(1);
        let mut term = Term::from_field_json_path(field, "color", false);
        term.append_type_and_fast_value(4.0f64);
        assert_eq!(
            term.serialized_term(),
            b"\x00\x00\x00\x01jcolor\x00f\xc0\x10\x00\x00\x00\x00\x00\x00"
        )
    }

    #[test]
    fn test_bool_term() {
        let field = Field::from_field_id(1);
        let mut term = Term::from_field_json_path(field, "color", false);
        term.append_type_and_fast_value(true);
        assert_eq!(
            term.serialized_term(),
            b"\x00\x00\x00\x01jcolor\x00o\x00\x00\x00\x00\x00\x00\x00\x01"
        )
    }
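
    // A minimal sketch exercising the datetime -> number -> bool inference
    // cascade of `convert_to_fast_value_and_append_to_json_term` above. The
    // expected byte strings mirror `test_bool_term` and `test_i64_term`,
    // assuming "4" normalizes to an i64 term.
    #[test]
    fn test_convert_to_fast_value_and_append_to_json_term() {
        let field = Field::from_field_id(1);
        let term = Term::from_field_json_path(field, "color", false);
        let bool_term =
            super::convert_to_fast_value_and_append_to_json_term(&term, "true", true)
                .expect("'true' should be inferred as a bool");
        assert_eq!(
            bool_term.serialized_term(),
            b"\x00\x00\x00\x01jcolor\x00o\x00\x00\x00\x00\x00\x00\x00\x01"
        );
        let i64_term = super::convert_to_fast_value_and_append_to_json_term(&term, "4", true)
            .expect("'4' should be inferred as a number");
        assert_eq!(
            i64_term.serialized_term(),
            b"\x00\x00\x00\x01jcolor\x00i\x80\x00\x00\x00\x00\x00\x00\x04"
        );
    }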

    #[test]
    fn test_split_json_path_simple() {
        let json_path = split_json_path("titi.toto");
        assert_eq!(&json_path, &["titi", "toto"]);
    }

    #[test]
    fn test_split_json_path_single_segment() {
        let json_path = split_json_path("toto");
        assert_eq!(&json_path, &["toto"]);
    }

    #[test]
    fn test_split_json_path_trailing_dot() {
        let json_path = split_json_path("toto.");
        assert_eq!(&json_path, &["toto", ""]);
    }

    #[test]
    fn test_split_json_path_heading_dot() {
        let json_path = split_json_path(".toto");
        assert_eq!(&json_path, &["", "toto"]);
    }

    #[test]
    fn test_split_json_path_escaped_dot() {
        let json_path = split_json_path(r"toto\.titi");
        assert_eq!(&json_path, &["toto.titi"]);
        let json_path_2 = split_json_path(r"k8s\.container\.name");
        assert_eq!(&json_path_2, &["k8s.container.name"]);
    }
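
    // A minimal sketch of `encode_column_name`, which builds on
    // `split_json_path`. It assumes segments are joined by the `\u{1}`
    // separator byte, as stated in the doc comment of `encode_column_name`.
    #[test]
    fn test_encode_column_name() {
        assert_eq!(
            super::encode_column_name("attributes", "color.rgb", false),
            "attributes\u{1}color\u{1}rgb"
        );
        // With expand_dots disabled, an escaped dot stays inside its segment.
        assert_eq!(
            super::encode_column_name("attributes", r"color\.rgb", false),
            "attributes\u{1}color.rgb"
        );
    }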

    #[test]
    fn test_split_json_path_escaped_backslash() {
        let json_path = split_json_path(r"toto\\titi");
        assert_eq!(&json_path, &[r"toto\titi"]);
    }

    #[test]
    fn test_split_json_path_escaped_normal_letter() {
        let json_path = split_json_path(r"toto\titi");
        assert_eq!(&json_path, &["tototiti"]);
    }
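
    // A minimal sketch of `json_path_sep_to_dot`. It assumes
    // `JSON_PATH_SEGMENT_SEP` is the ASCII `\u{1}` byte, which is also what
    // makes the in-place byte replacement in that function sound.
    #[test]
    fn test_json_path_sep_to_dot() {
        let mut path = "attributes\u{1}color".to_string();
        super::json_path_sep_to_dot(&mut path);
        assert_eq!(path, "attributes.color");
    }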
}