From 9e383433520987ca12e52497460d7b1b91c2c41b Mon Sep 17 00:00:00 2001 From: Saroh <325288+saroh@users.noreply.github.com> Date: Wed, 4 May 2022 12:27:18 +0200 Subject: [PATCH 1/4] expose helpers for json field writer manipulation closes #1302 --- src/indexer/json_term_writer.rs | 41 +++++++++++++++++- src/indexer/mod.rs | 4 +- src/query/query_parser/query_parser.rs | 58 +++++++------------------- 3 files changed, 58 insertions(+), 45 deletions(-) diff --git a/src/indexer/json_term_writer.rs b/src/indexer/json_term_writer.rs index 5f1330730..8cd172cb1 100644 --- a/src/indexer/json_term_writer.rs +++ b/src/indexer/json_term_writer.rs @@ -4,7 +4,7 @@ use murmurhash32::murmurhash2; use crate::fastfield::FastValue; use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter}; use crate::schema::term::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP}; -use crate::schema::Type; +use crate::schema::{Field, Type}; use crate::time::format_description::well_known::Rfc3339; use crate::time::{OffsetDateTime, UtcOffset}; use crate::tokenizer::TextAnalyzer; @@ -199,12 +199,51 @@ fn infer_type_from_str(text: &str) -> TextOrDateTime { } } +// helper function to generate a Term from a json fastvalue +pub(crate) fn generate_term_from_json_writer( + json_term_writer: &mut JsonTermWriter, + value: T, +) -> Term { + json_term_writer.set_fast_value(value); + json_term_writer.term().clone() +} + +// helper function to generate a list of terms with their positions from a textual json value +pub(crate) fn generate_terms_from_json_writer( + json_term_writer: &mut JsonTermWriter, + value: &str, + text_analyzer: &TextAnalyzer, +) -> Vec<(usize, Term)> { + let mut positions_and_terms = Vec::<(usize, Term)>::new(); + json_term_writer.close_path_and_set_type(Type::Str); + let term_num_bytes = json_term_writer.term_buffer.as_slice().len(); + let mut token_stream = text_analyzer.token_stream(value); + token_stream.process(&mut |token| { + json_term_writer.term_buffer.truncate(term_num_bytes); + json_term_writer + .term_buffer + .append_bytes(token.text.as_bytes()); + positions_and_terms.push((token.position, json_term_writer.term().clone())); + }); + positions_and_terms +} + pub struct JsonTermWriter<'a> { term_buffer: &'a mut Term, path_stack: Vec, } impl<'a> JsonTermWriter<'a> { + // Prepares writing terms for a given field + pub fn initialize(field: Field, json_path: &str, term_buffer: &'a mut Term) -> Self { + term_buffer.set_field(Type::Json, field); + let mut json_term_writer = Self::wrap(term_buffer); + for segment in json_path.split('.') { + json_term_writer.push_path_segment(segment); + } + json_term_writer + } + pub fn wrap(term_buffer: &'a mut Term) -> Self { term_buffer.clear_with_type(Type::Json); let mut path_stack = Vec::with_capacity(10); diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index b0634d2ba..53b358b01 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -25,7 +25,9 @@ use crossbeam::channel; use smallvec::SmallVec; pub use self::index_writer::IndexWriter; -pub(crate) use self::json_term_writer::JsonTermWriter; +pub(crate) use self::json_term_writer::{ + generate_term_from_json_writer, generate_terms_from_json_writer, JsonTermWriter, +}; pub use self::log_merge_policy::LogMergePolicy; pub use self::merge_operation::MergeOperation; pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy}; diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 354638111..5e5ebd98c 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -7,7 +7,9 @@ use tantivy_query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInp use super::logical_ast::*; use crate::core::Index; -use crate::indexer::JsonTermWriter; +use crate::indexer::{ + generate_term_from_json_writer, generate_terms_from_json_writer, JsonTermWriter, +}; use crate::query::{ AllQuery, BooleanQuery, BoostQuery, EmptyQuery, Occur, PhraseQuery, Query, RangeQuery, TermQuery, @@ -660,26 +662,22 @@ fn generate_literals_for_str( Ok(Some(LogicalLiteral::Phrase(terms))) } -enum NumValue { - U64(u64), - I64(i64), - F64(f64), - DateTime(OffsetDateTime), -} - -fn infer_type_num(phrase: &str) -> Option { +fn infer_fast_value_term(json_term_writer: &mut JsonTermWriter, phrase: &str) -> Option { if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) { let dt_utc = dt.to_offset(UtcOffset::UTC); - return Some(NumValue::DateTime(dt_utc)); + return Some(generate_term_from_json_writer( + json_term_writer, + DateTime::from_utc(dt_utc), + )); } if let Ok(u64_val) = str::parse::(phrase) { - return Some(NumValue::U64(u64_val)); + return Some(generate_term_from_json_writer(json_term_writer, u64_val)); } if let Ok(i64_val) = str::parse::(phrase) { - return Some(NumValue::I64(i64_val)); + return Some(generate_term_from_json_writer(json_term_writer, i64_val)); } if let Ok(f64_val) = str::parse::(phrase) { - return Some(NumValue::F64(f64_val)); + return Some(generate_term_from_json_writer(json_term_writer, f64_val)); } None } @@ -694,38 +692,12 @@ fn generate_literals_for_json_object( ) -> Result, QueryParserError> { let mut logical_literals = Vec::new(); let mut term = Term::new(); - term.set_field(Type::Json, field); - let mut json_term_writer = JsonTermWriter::wrap(&mut term); - for segment in json_path.split('.') { - json_term_writer.push_path_segment(segment); + let mut json_term_writer = JsonTermWriter::initialize(field, json_path, &mut term); + if let Some(term) = infer_fast_value_term(&mut json_term_writer, phrase) { + logical_literals.push(LogicalLiteral::Term(term)); } - if let Some(num_value) = infer_type_num(phrase) { - match num_value { - NumValue::U64(u64_val) => { - json_term_writer.set_fast_value(u64_val); - } - NumValue::I64(i64_val) => { - json_term_writer.set_fast_value(i64_val); - } - NumValue::F64(f64_val) => { - json_term_writer.set_fast_value(f64_val); - } - NumValue::DateTime(dt_val) => { - json_term_writer.set_fast_value(DateTime::from_utc(dt_val)); - } - } - logical_literals.push(LogicalLiteral::Term(json_term_writer.term().clone())); - } - json_term_writer.close_path_and_set_type(Type::Str); + let terms = generate_terms_from_json_writer(&mut json_term_writer, phrase, text_analyzer); drop(json_term_writer); - let term_num_bytes = term.as_slice().len(); - let mut token_stream = text_analyzer.token_stream(phrase); - let mut terms: Vec<(usize, Term)> = Vec::new(); - token_stream.process(&mut |token| { - term.truncate(term_num_bytes); - term.append_bytes(token.text.as_bytes()); - terms.push((token.position, term.clone())); - }); if terms.len() <= 1 { for (_, term) in terms { logical_literals.push(LogicalLiteral::Term(term)); From 14cb66ee0083e24f32306c4e934f4c206d6c0e00 Mon Sep 17 00:00:00 2001 From: Saroh <325288+saroh@users.noreply.github.com> Date: Wed, 4 May 2022 16:24:54 +0200 Subject: [PATCH 2/4] move helper to indexer module --- src/indexer/json_term_writer.rs | 23 +++++++++++++++++++++++ src/indexer/mod.rs | 2 +- src/query/query_parser/query_parser.rs | 26 ++------------------------ 3 files changed, 26 insertions(+), 25 deletions(-) diff --git a/src/indexer/json_term_writer.rs b/src/indexer/json_term_writer.rs index 8cd172cb1..cf8532ed3 100644 --- a/src/indexer/json_term_writer.rs +++ b/src/indexer/json_term_writer.rs @@ -199,6 +199,29 @@ fn infer_type_from_str(text: &str) -> TextOrDateTime { } } +// Tries to infer a JSON type from a string +pub(crate) fn infer_fast_value_term( + json_term_writer: &mut JsonTermWriter, + phrase: &str, +) -> Option { + if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) { + let dt_utc = dt.to_offset(UtcOffset::UTC); + return Some(generate_term_from_json_writer( + json_term_writer, + DateTime::from_utc(dt_utc), + )); + } + if let Ok(u64_val) = str::parse::(phrase) { + return Some(generate_term_from_json_writer(json_term_writer, u64_val)); + } + if let Ok(i64_val) = str::parse::(phrase) { + return Some(generate_term_from_json_writer(json_term_writer, i64_val)); + } + if let Ok(f64_val) = str::parse::(phrase) { + return Some(generate_term_from_json_writer(json_term_writer, f64_val)); + } + None +} // helper function to generate a Term from a json fastvalue pub(crate) fn generate_term_from_json_writer( json_term_writer: &mut JsonTermWriter, diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index 53b358b01..f0eb48457 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -26,7 +26,7 @@ use smallvec::SmallVec; pub use self::index_writer::IndexWriter; pub(crate) use self::json_term_writer::{ - generate_term_from_json_writer, generate_terms_from_json_writer, JsonTermWriter, + generate_terms_from_json_writer, infer_fast_value_term, JsonTermWriter, }; pub use self::log_merge_policy::LogMergePolicy; pub use self::merge_operation::MergeOperation; diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 5e5ebd98c..d7a839423 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -7,9 +7,7 @@ use tantivy_query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInp use super::logical_ast::*; use crate::core::Index; -use crate::indexer::{ - generate_term_from_json_writer, generate_terms_from_json_writer, JsonTermWriter, -}; +use crate::indexer::{generate_terms_from_json_writer, infer_fast_value_term, JsonTermWriter}; use crate::query::{ AllQuery, BooleanQuery, BoostQuery, EmptyQuery, Occur, PhraseQuery, Query, RangeQuery, TermQuery, @@ -18,7 +16,7 @@ use crate::schema::{ Facet, FacetParseError, Field, FieldType, IndexRecordOption, Schema, Term, Type, }; use crate::time::format_description::well_known::Rfc3339; -use crate::time::{OffsetDateTime, UtcOffset}; +use crate::time::OffsetDateTime; use crate::tokenizer::{TextAnalyzer, TokenizerManager}; use crate::{DateTime, Score}; @@ -662,26 +660,6 @@ fn generate_literals_for_str( Ok(Some(LogicalLiteral::Phrase(terms))) } -fn infer_fast_value_term(json_term_writer: &mut JsonTermWriter, phrase: &str) -> Option { - if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) { - let dt_utc = dt.to_offset(UtcOffset::UTC); - return Some(generate_term_from_json_writer( - json_term_writer, - DateTime::from_utc(dt_utc), - )); - } - if let Ok(u64_val) = str::parse::(phrase) { - return Some(generate_term_from_json_writer(json_term_writer, u64_val)); - } - if let Ok(i64_val) = str::parse::(phrase) { - return Some(generate_term_from_json_writer(json_term_writer, i64_val)); - } - if let Ok(f64_val) = str::parse::(phrase) { - return Some(generate_term_from_json_writer(json_term_writer, f64_val)); - } - None -} - fn generate_literals_for_json_object( field_name: &str, field: Field, From 65d129afbdf2e3bc0f51a90feb2d9d44e04588c9 Mon Sep 17 00:00:00 2001 From: Saroh <325288+saroh@users.noreply.github.com> Date: Thu, 5 May 2022 10:12:28 +0200 Subject: [PATCH 3/4] better function names --- src/indexer/json_term_writer.rs | 14 +++++++------- src/indexer/mod.rs | 2 +- src/query/query_parser/query_parser.rs | 8 +++++--- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/indexer/json_term_writer.rs b/src/indexer/json_term_writer.rs index cf8532ed3..6e54cb6c8 100644 --- a/src/indexer/json_term_writer.rs +++ b/src/indexer/json_term_writer.rs @@ -200,30 +200,30 @@ fn infer_type_from_str(text: &str) -> TextOrDateTime { } // Tries to infer a JSON type from a string -pub(crate) fn infer_fast_value_term( +pub(crate) fn convert_to_fast_value_and_get_term( json_term_writer: &mut JsonTermWriter, phrase: &str, ) -> Option { if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) { let dt_utc = dt.to_offset(UtcOffset::UTC); - return Some(generate_term_from_json_writer( + return Some(set_fastvalue_and_get_term( json_term_writer, DateTime::from_utc(dt_utc), )); } if let Ok(u64_val) = str::parse::(phrase) { - return Some(generate_term_from_json_writer(json_term_writer, u64_val)); + return Some(set_fastvalue_and_get_term(json_term_writer, u64_val)); } if let Ok(i64_val) = str::parse::(phrase) { - return Some(generate_term_from_json_writer(json_term_writer, i64_val)); + return Some(set_fastvalue_and_get_term(json_term_writer, i64_val)); } if let Ok(f64_val) = str::parse::(phrase) { - return Some(generate_term_from_json_writer(json_term_writer, f64_val)); + return Some(set_fastvalue_and_get_term(json_term_writer, f64_val)); } None } // helper function to generate a Term from a json fastvalue -pub(crate) fn generate_term_from_json_writer( +pub(crate) fn set_fastvalue_and_get_term( json_term_writer: &mut JsonTermWriter, value: T, ) -> Term { @@ -232,7 +232,7 @@ pub(crate) fn generate_term_from_json_writer( } // helper function to generate a list of terms with their positions from a textual json value -pub(crate) fn generate_terms_from_json_writer( +pub(crate) fn set_string_and_get_terms( json_term_writer: &mut JsonTermWriter, value: &str, text_analyzer: &TextAnalyzer, diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index f0eb48457..37bdb1ba1 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -26,7 +26,7 @@ use smallvec::SmallVec; pub use self::index_writer::IndexWriter; pub(crate) use self::json_term_writer::{ - generate_terms_from_json_writer, infer_fast_value_term, JsonTermWriter, + convert_to_fast_value_and_get_term, set_string_and_get_terms, JsonTermWriter, }; pub use self::log_merge_policy::LogMergePolicy; pub use self::merge_operation::MergeOperation; diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index d7a839423..2ce07983f 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -7,7 +7,9 @@ use tantivy_query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInp use super::logical_ast::*; use crate::core::Index; -use crate::indexer::{generate_terms_from_json_writer, infer_fast_value_term, JsonTermWriter}; +use crate::indexer::{ + convert_to_fast_value_and_get_term, set_string_and_get_terms, JsonTermWriter, +}; use crate::query::{ AllQuery, BooleanQuery, BoostQuery, EmptyQuery, Occur, PhraseQuery, Query, RangeQuery, TermQuery, @@ -671,10 +673,10 @@ fn generate_literals_for_json_object( let mut logical_literals = Vec::new(); let mut term = Term::new(); let mut json_term_writer = JsonTermWriter::initialize(field, json_path, &mut term); - if let Some(term) = infer_fast_value_term(&mut json_term_writer, phrase) { + if let Some(term) = convert_to_fast_value_and_get_term(&mut json_term_writer, phrase) { logical_literals.push(LogicalLiteral::Term(term)); } - let terms = generate_terms_from_json_writer(&mut json_term_writer, phrase, text_analyzer); + let terms = set_string_and_get_terms(&mut json_term_writer, phrase, text_analyzer); drop(json_term_writer); if terms.len() <= 1 { for (_, term) in terms { From 0ade8711268cfa49d24dd3bd99788bc9a01acf91 Mon Sep 17 00:00:00 2001 From: saroh <325288+saroh@users.noreply.github.com> Date: Fri, 6 May 2022 13:29:07 +0200 Subject: [PATCH 4/4] rename constructor to be more explicit --- src/indexer/json_term_writer.rs | 7 +++++-- src/query/query_parser/query_parser.rs | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/indexer/json_term_writer.rs b/src/indexer/json_term_writer.rs index 6e54cb6c8..503fc69a1 100644 --- a/src/indexer/json_term_writer.rs +++ b/src/indexer/json_term_writer.rs @@ -257,8 +257,11 @@ pub struct JsonTermWriter<'a> { } impl<'a> JsonTermWriter<'a> { - // Prepares writing terms for a given field - pub fn initialize(field: Field, json_path: &str, term_buffer: &'a mut Term) -> Self { + pub fn from_field_and_json_path( + field: Field, + json_path: &str, + term_buffer: &'a mut Term, + ) -> Self { term_buffer.set_field(Type::Json, field); let mut json_term_writer = Self::wrap(term_buffer); for segment in json_path.split('.') { diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 2ce07983f..1f2478af5 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -672,7 +672,8 @@ fn generate_literals_for_json_object( ) -> Result, QueryParserError> { let mut logical_literals = Vec::new(); let mut term = Term::new(); - let mut json_term_writer = JsonTermWriter::initialize(field, json_path, &mut term); + let mut json_term_writer = + JsonTermWriter::from_field_and_json_path(field, json_path, &mut term); if let Some(term) = convert_to_fast_value_and_get_term(&mut json_term_writer, phrase) { logical_literals.push(LogicalLiteral::Term(term)); }