From 9e383433520987ca12e52497460d7b1b91c2c41b Mon Sep 17 00:00:00 2001 From: Saroh <325288+saroh@users.noreply.github.com> Date: Wed, 4 May 2022 12:27:18 +0200 Subject: [PATCH] expose helpers for json field writer manipulation closes #1302 --- src/indexer/json_term_writer.rs | 41 +++++++++++++++++- src/indexer/mod.rs | 4 +- src/query/query_parser/query_parser.rs | 58 +++++++------------------- 3 files changed, 58 insertions(+), 45 deletions(-) diff --git a/src/indexer/json_term_writer.rs b/src/indexer/json_term_writer.rs index 5f1330730..8cd172cb1 100644 --- a/src/indexer/json_term_writer.rs +++ b/src/indexer/json_term_writer.rs @@ -4,7 +4,7 @@ use murmurhash32::murmurhash2; use crate::fastfield::FastValue; use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter}; use crate::schema::term::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP}; -use crate::schema::Type; +use crate::schema::{Field, Type}; use crate::time::format_description::well_known::Rfc3339; use crate::time::{OffsetDateTime, UtcOffset}; use crate::tokenizer::TextAnalyzer; @@ -199,12 +199,51 @@ fn infer_type_from_str(text: &str) -> TextOrDateTime { } } +// helper function to generate a Term from a json fastvalue +pub(crate) fn generate_term_from_json_writer( + json_term_writer: &mut JsonTermWriter, + value: T, +) -> Term { + json_term_writer.set_fast_value(value); + json_term_writer.term().clone() +} + +// helper function to generate a list of terms with their positions from a textual json value +pub(crate) fn generate_terms_from_json_writer( + json_term_writer: &mut JsonTermWriter, + value: &str, + text_analyzer: &TextAnalyzer, +) -> Vec<(usize, Term)> { + let mut positions_and_terms = Vec::<(usize, Term)>::new(); + json_term_writer.close_path_and_set_type(Type::Str); + let term_num_bytes = json_term_writer.term_buffer.as_slice().len(); + let mut token_stream = text_analyzer.token_stream(value); + token_stream.process(&mut |token| { + json_term_writer.term_buffer.truncate(term_num_bytes); + json_term_writer + .term_buffer + .append_bytes(token.text.as_bytes()); + positions_and_terms.push((token.position, json_term_writer.term().clone())); + }); + positions_and_terms +} + pub struct JsonTermWriter<'a> { term_buffer: &'a mut Term, path_stack: Vec, } impl<'a> JsonTermWriter<'a> { + // Prepares writing terms for a given field + pub fn initialize(field: Field, json_path: &str, term_buffer: &'a mut Term) -> Self { + term_buffer.set_field(Type::Json, field); + let mut json_term_writer = Self::wrap(term_buffer); + for segment in json_path.split('.') { + json_term_writer.push_path_segment(segment); + } + json_term_writer + } + pub fn wrap(term_buffer: &'a mut Term) -> Self { term_buffer.clear_with_type(Type::Json); let mut path_stack = Vec::with_capacity(10); diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index b0634d2ba..53b358b01 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -25,7 +25,9 @@ use crossbeam::channel; use smallvec::SmallVec; pub use self::index_writer::IndexWriter; -pub(crate) use self::json_term_writer::JsonTermWriter; +pub(crate) use self::json_term_writer::{ + generate_term_from_json_writer, generate_terms_from_json_writer, JsonTermWriter, +}; pub use self::log_merge_policy::LogMergePolicy; pub use self::merge_operation::MergeOperation; pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy}; diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 354638111..5e5ebd98c 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -7,7 +7,9 @@ use tantivy_query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInp use super::logical_ast::*; use crate::core::Index; -use crate::indexer::JsonTermWriter; +use crate::indexer::{ + generate_term_from_json_writer, generate_terms_from_json_writer, JsonTermWriter, +}; use crate::query::{ AllQuery, BooleanQuery, BoostQuery, EmptyQuery, Occur, PhraseQuery, Query, RangeQuery, TermQuery, @@ -660,26 +662,22 @@ fn generate_literals_for_str( Ok(Some(LogicalLiteral::Phrase(terms))) } -enum NumValue { - U64(u64), - I64(i64), - F64(f64), - DateTime(OffsetDateTime), -} - -fn infer_type_num(phrase: &str) -> Option { +fn infer_fast_value_term(json_term_writer: &mut JsonTermWriter, phrase: &str) -> Option { if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) { let dt_utc = dt.to_offset(UtcOffset::UTC); - return Some(NumValue::DateTime(dt_utc)); + return Some(generate_term_from_json_writer( + json_term_writer, + DateTime::from_utc(dt_utc), + )); } if let Ok(u64_val) = str::parse::(phrase) { - return Some(NumValue::U64(u64_val)); + return Some(generate_term_from_json_writer(json_term_writer, u64_val)); } if let Ok(i64_val) = str::parse::(phrase) { - return Some(NumValue::I64(i64_val)); + return Some(generate_term_from_json_writer(json_term_writer, i64_val)); } if let Ok(f64_val) = str::parse::(phrase) { - return Some(NumValue::F64(f64_val)); + return Some(generate_term_from_json_writer(json_term_writer, f64_val)); } None } @@ -694,38 +692,12 @@ fn generate_literals_for_json_object( ) -> Result, QueryParserError> { let mut logical_literals = Vec::new(); let mut term = Term::new(); - term.set_field(Type::Json, field); - let mut json_term_writer = JsonTermWriter::wrap(&mut term); - for segment in json_path.split('.') { - json_term_writer.push_path_segment(segment); + let mut json_term_writer = JsonTermWriter::initialize(field, json_path, &mut term); + if let Some(term) = infer_fast_value_term(&mut json_term_writer, phrase) { + logical_literals.push(LogicalLiteral::Term(term)); } - if let Some(num_value) = infer_type_num(phrase) { - match num_value { - NumValue::U64(u64_val) => { - json_term_writer.set_fast_value(u64_val); - } - NumValue::I64(i64_val) => { - json_term_writer.set_fast_value(i64_val); - } - NumValue::F64(f64_val) => { - json_term_writer.set_fast_value(f64_val); - } - NumValue::DateTime(dt_val) => { - json_term_writer.set_fast_value(DateTime::from_utc(dt_val)); - } - } - logical_literals.push(LogicalLiteral::Term(json_term_writer.term().clone())); - } - json_term_writer.close_path_and_set_type(Type::Str); + let terms = generate_terms_from_json_writer(&mut json_term_writer, phrase, text_analyzer); drop(json_term_writer); - let term_num_bytes = term.as_slice().len(); - let mut token_stream = text_analyzer.token_stream(phrase); - let mut terms: Vec<(usize, Term)> = Vec::new(); - token_stream.process(&mut |token| { - term.truncate(term_num_bytes); - term.append_bytes(token.text.as_bytes()); - terms.push((token.position, term.clone())); - }); if terms.len() <= 1 { for (_, term) in terms { logical_literals.push(LogicalLiteral::Term(term));