diff --git a/src/indexer/json_term_writer.rs b/src/indexer/json_term_writer.rs
index 5f1330730..503fc69a1 100644
--- a/src/indexer/json_term_writer.rs
+++ b/src/indexer/json_term_writer.rs
@@ -4,7 +4,7 @@ use murmurhash32::murmurhash2;
 use crate::fastfield::FastValue;
 use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
 use crate::schema::term::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
-use crate::schema::Type;
+use crate::schema::{Field, Type};
 use crate::time::format_description::well_known::Rfc3339;
 use crate::time::{OffsetDateTime, UtcOffset};
 use crate::tokenizer::TextAnalyzer;
@@ -199,12 +199,77 @@ fn infer_type_from_str(text: &str) -> TextOrDateTime {
     }
 }
 
+// Tries to convert the string to a fast value (date, u64, i64 or f64) and returns the matching term.
+pub(crate) fn convert_to_fast_value_and_get_term(
+    json_term_writer: &mut JsonTermWriter,
+    phrase: &str,
+) -> Option<Term> {
+    if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
+        let dt_utc = dt.to_offset(UtcOffset::UTC);
+        return Some(set_fastvalue_and_get_term(
+            json_term_writer,
+            DateTime::from_utc(dt_utc),
+        ));
+    }
+    if let Ok(u64_val) = str::parse::<u64>(phrase) {
+        return Some(set_fastvalue_and_get_term(json_term_writer, u64_val));
+    }
+    if let Ok(i64_val) = str::parse::<i64>(phrase) {
+        return Some(set_fastvalue_and_get_term(json_term_writer, i64_val));
+    }
+    if let Ok(f64_val) = str::parse::<f64>(phrase) {
+        return Some(set_fastvalue_and_get_term(json_term_writer, f64_val));
+    }
+    None
+}
+// Helper function to generate a Term from a JSON fast value.
+pub(crate) fn set_fastvalue_and_get_term<T: FastValue>(
+    json_term_writer: &mut JsonTermWriter,
+    value: T,
+) -> Term {
+    json_term_writer.set_fast_value(value);
+    json_term_writer.term().clone()
+}
+
+// Helper function to generate a list of terms with their positions from a textual JSON value.
+pub(crate) fn set_string_and_get_terms(
+    json_term_writer: &mut JsonTermWriter,
+    value: &str,
+    text_analyzer: &TextAnalyzer,
+) -> Vec<(usize, Term)> {
+    let mut positions_and_terms = Vec::<(usize, Term)>::new();
+    json_term_writer.close_path_and_set_type(Type::Str);
+    let term_num_bytes = json_term_writer.term_buffer.as_slice().len();
+    let mut token_stream = text_analyzer.token_stream(value);
+    token_stream.process(&mut |token| {
+        json_term_writer.term_buffer.truncate(term_num_bytes);
+        json_term_writer
+            .term_buffer
+            .append_bytes(token.text.as_bytes());
+        positions_and_terms.push((token.position, json_term_writer.term().clone()));
+    });
+    positions_and_terms
+}
+
 pub struct JsonTermWriter<'a> {
     term_buffer: &'a mut Term,
     path_stack: Vec<usize>,
 }
 
 impl<'a> JsonTermWriter<'a> {
+    pub fn from_field_and_json_path(
+        field: Field,
+        json_path: &str,
+        term_buffer: &'a mut Term,
+    ) -> Self {
+        term_buffer.set_field(Type::Json, field);
+        let mut json_term_writer = Self::wrap(term_buffer);
+        for segment in json_path.split('.') {
+            json_term_writer.push_path_segment(segment);
+        }
+        json_term_writer
+    }
+
     pub fn wrap(term_buffer: &'a mut Term) -> Self {
         term_buffer.clear_with_type(Type::Json);
         let mut path_stack = Vec::with_capacity(10);
diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs
index b0634d2ba..37bdb1ba1 100644
--- a/src/indexer/mod.rs
+++ b/src/indexer/mod.rs
@@ -25,7 +25,9 @@ use crossbeam::channel;
 use smallvec::SmallVec;
 
 pub use self::index_writer::IndexWriter;
-pub(crate) use self::json_term_writer::JsonTermWriter;
+pub(crate) use self::json_term_writer::{
+    convert_to_fast_value_and_get_term, set_string_and_get_terms, JsonTermWriter,
+};
 pub use self::log_merge_policy::LogMergePolicy;
 pub use self::merge_operation::MergeOperation;
 pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs
index 3066654c6..d0141833d 100644
--- a/src/query/query_parser/query_parser.rs
+++ b/src/query/query_parser/query_parser.rs
@@ -7,7 +7,9 @@ use tantivy_query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInp
 
 use super::logical_ast::*;
 use crate::core::Index;
-use crate::indexer::JsonTermWriter;
+use crate::indexer::{
+    convert_to_fast_value_and_get_term, set_string_and_get_terms, JsonTermWriter,
+};
 use crate::query::{
     AllQuery, BooleanQuery, BoostQuery, EmptyQuery, Occur, PhraseQuery, Query, RangeQuery,
     TermQuery,
@@ -16,7 +18,7 @@ use crate::schema::{
     Facet, FacetParseError, Field, FieldType, IndexRecordOption, Schema, Term, Type,
 };
 use crate::time::format_description::well_known::Rfc3339;
-use crate::time::{OffsetDateTime, UtcOffset};
+use crate::time::OffsetDateTime;
 use crate::tokenizer::{TextAnalyzer, TokenizerManager};
 use crate::{DateTime, Score};
 
@@ -698,30 +700,6 @@ fn generate_literals_for_str(
     Ok(Some(LogicalLiteral::Phrase(terms)))
 }
 
-enum NumValue {
-    U64(u64),
-    I64(i64),
-    F64(f64),
-    DateTime(OffsetDateTime),
-}
-
-fn infer_type_num(phrase: &str) -> Option<NumValue> {
-    if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
-        let dt_utc = dt.to_offset(UtcOffset::UTC);
-        return Some(NumValue::DateTime(dt_utc));
-    }
-    if let Ok(u64_val) = str::parse::<u64>(phrase) {
-        return Some(NumValue::U64(u64_val));
-    }
-    if let Ok(i64_val) = str::parse::<i64>(phrase) {
-        return Some(NumValue::I64(i64_val));
-    }
-    if let Ok(f64_val) = str::parse::<f64>(phrase) {
-        return Some(NumValue::F64(f64_val));
-    }
-    None
-}
-
 fn generate_literals_for_json_object(
     field_name: &str,
     field: Field,
@@ -732,38 +710,13 @@
 ) -> Result<Vec<LogicalLiteral>, QueryParserError> {
     let mut logical_literals = Vec::new();
     let mut term = Term::new();
-    term.set_field(Type::Json, field);
-    let mut json_term_writer = JsonTermWriter::wrap(&mut term);
-    for segment in json_path.split('.') {
-        json_term_writer.push_path_segment(segment);
+    let mut json_term_writer =
+        JsonTermWriter::from_field_and_json_path(field, json_path, &mut term);
+    if let Some(term) = convert_to_fast_value_and_get_term(&mut json_term_writer, phrase) {
+        logical_literals.push(LogicalLiteral::Term(term));
     }
-    if let Some(num_value) = infer_type_num(phrase) {
-        match num_value {
-            NumValue::U64(u64_val) => {
-                json_term_writer.set_fast_value(u64_val);
-            }
-            NumValue::I64(i64_val) => {
-                json_term_writer.set_fast_value(i64_val);
-            }
-            NumValue::F64(f64_val) => {
-                json_term_writer.set_fast_value(f64_val);
-            }
-            NumValue::DateTime(dt_val) => {
-                json_term_writer.set_fast_value(DateTime::from_utc(dt_val));
-            }
-        }
-        logical_literals.push(LogicalLiteral::Term(json_term_writer.term().clone()));
-    }
-    json_term_writer.close_path_and_set_type(Type::Str);
+    let terms = set_string_and_get_terms(&mut json_term_writer, phrase, text_analyzer);
     drop(json_term_writer);
-    let term_num_bytes = term.as_slice().len();
-    let mut token_stream = text_analyzer.token_stream(phrase);
-    let mut terms: Vec<(usize, Term)> = Vec::new();
-    token_stream.process(&mut |token| {
-        term.truncate(term_num_bytes);
-        term.append_bytes(token.text.as_bytes());
-        terms.push((token.position, term.clone()));
-    });
     if terms.len() <= 1 {
         for (_, term) in terms {
             logical_literals.push(LogicalLiteral::Term(term));