From d7b46d2137e1fabfb528f8d78e2867ddb8779455 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 24 Feb 2022 16:25:22 +0900 Subject: [PATCH] Added JSON Type (#1270) - Removed useless copy when ingesting JSON. - Bugfix in phrase query with a missing field norms. - Disabled range query on default fields Closes #1251 --- CHANGELOG.md | 1 + benches/index-bench.rs | 39 +- bitpacker/benches/bench.rs | 2 + doc/src/json.md | 128 ++++ examples/json_field.rs | 80 +++ query-grammar/src/user_input_ast.rs | 4 +- src/aggregation/mod.rs | 4 +- src/core/segment_reader.rs | 5 +- src/fastfield/multivalued/mod.rs | 9 +- src/indexer/json_term_writer.rs | 415 ++++++++++++ src/indexer/merger.rs | 10 +- src/indexer/mod.rs | 2 + src/indexer/segment_writer.rs | 321 +++++++++- src/postings/json_postings_writer.rs | 94 +++ src/postings/mod.rs | 1 + src/postings/per_field_postings_writer.rs | 28 +- src/postings/postings_writer.rs | 59 +- src/postings/recorder.rs | 23 +- src/postings/serializer.rs | 26 +- src/query/boolean_query/boolean_query.rs | 42 +- src/query/phrase_query/mod.rs | 56 +- src/query/phrase_query/phrase_weight.rs | 7 +- src/query/query_parser/query_parser.rs | 744 +++++++++++++++------- src/query/term_query/mod.rs | 2 +- src/schema/document.rs | 11 +- src/schema/field_entry.rs | 78 +-- src/schema/field_type.rs | 83 ++- src/schema/json_object_options.rs | 109 ++++ src/schema/mod.rs | 10 +- src/schema/schema.rs | 56 +- src/schema/term.rs | 158 +++-- src/schema/text_options.rs | 2 +- src/schema/value.rs | 43 ++ src/tokenizer/empty_tokenizer.rs | 41 ++ src/tokenizer/mod.rs | 1 + src/tokenizer/tokenizer.rs | 8 + 36 files changed, 2242 insertions(+), 460 deletions(-) create mode 100644 doc/src/json.md create mode 100644 examples/json_field.rs create mode 100644 src/indexer/json_term_writer.rs create mode 100644 src/postings/json_postings_writer.rs create mode 100644 src/schema/json_object_options.rs create mode 100644 src/tokenizer/empty_tokenizer.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f59742c2..97c9e7e08 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Tantivy 0.17 - Reduce the number of fsync calls [#1225](https://github.com/quickwit-oss/tantivy/issues/1225) - Fix opening bytes index with dynamic codec (@PSeitz) [#1278](https://github.com/quickwit-oss/tantivy/issues/1278) - Added an aggregation collector compatible with Elasticsearch (@PSeitz) +- Added a JSON schema type @fulmicoton [#1251](https://github.com/quickwit-oss/tantivy/issues/1251) Tantivy 0.16.2 ================================ diff --git a/benches/index-bench.rs b/benches/index-bench.rs index 350305b1b..c6cf31b62 100644 --- a/benches/index-bench.rs +++ b/benches/index-bench.rs @@ -4,7 +4,7 @@ use tantivy::schema::{INDEXED, STORED, STRING, TEXT}; use tantivy::Index; const HDFS_LOGS: &str = include_str!("hdfs.json"); -const NUM_REPEATS: usize = 10; +const NUM_REPEATS: usize = 2; pub fn hdfs_index_benchmark(c: &mut Criterion) { let schema = { @@ -21,6 +21,11 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) { schema_builder.add_text_field("severity", STRING | STORED); schema_builder.build() }; + let dynamic_schema = { + let mut schema_builder = tantivy::schema::SchemaBuilder::new(); + schema_builder.add_json_field("json", TEXT); + schema_builder.build() + }; let mut group = c.benchmark_group("index-hdfs"); group.sample_size(20); @@ -74,6 +79,38 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) { index_writer.commit().unwrap(); }) }); + group.bench_function("index-hdfs-no-commit-json-without-docstore", |b| { + 
b.iter(|| {
+ let index = Index::create_in_ram(dynamic_schema.clone());
+ let json_field = dynamic_schema.get_field("json").unwrap();
+ let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
+ for _ in 0..NUM_REPEATS {
+ for doc_json in HDFS_LOGS.trim().split("\n") {
+ let json_val: serde_json::Map<String, serde_json::Value> =
+ serde_json::from_str(doc_json).unwrap();
+ let doc = tantivy::doc!(json_field=>json_val);
+ index_writer.add_document(doc).unwrap();
+ }
+ }
+ index_writer.commit().unwrap();
+ })
+ });
+ group.bench_function("index-hdfs-with-commit-json-without-docstore", |b| {
+ b.iter(|| {
+ let index = Index::create_in_ram(dynamic_schema.clone());
+ let json_field = dynamic_schema.get_field("json").unwrap();
+ let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
+ for _ in 0..NUM_REPEATS {
+ for doc_json in HDFS_LOGS.trim().split("\n") {
+ let json_val: serde_json::Map<String, serde_json::Value> =
+ serde_json::from_str(doc_json).unwrap();
+ let doc = tantivy::doc!(json_field=>json_val);
+ index_writer.add_document(doc).unwrap();
+ }
+ }
+ index_writer.commit().unwrap();
+ })
+ });
 }

 criterion_group! {
diff --git a/bitpacker/benches/bench.rs b/bitpacker/benches/bench.rs
index c0f8e44df..8053c63b4 100644
--- a/bitpacker/benches/bench.rs
+++ b/bitpacker/benches/bench.rs
@@ -6,6 +6,7 @@ extern crate test;
 mod tests {
 use tantivy_bitpacker::BlockedBitpacker;
 use test::Bencher;
+
 #[bench]
 fn bench_blockedbitp_read(b: &mut Bencher) {
 let mut blocked_bitpacker = BlockedBitpacker::new();
@@ -20,6 +21,7 @@ mod tests {
 out
 });
 }
+
 #[bench]
 fn bench_blockedbitp_create(b: &mut Bencher) {
 b.iter(|| {
diff --git a/doc/src/json.md b/doc/src/json.md
new file mode 100644
index 000000000..810ea610a
--- /dev/null
+++ b/doc/src/json.md
@@ -0,0 +1,128 @@
+# Json
+
+As of tantivy 0.17, tantivy supports a json object type.
+This type can be used to allow for a schema-less search index.
+
+When indexing a json object, we "flatten" the JSON. This operation emits terms that represent a triplet `(json_path, value_type, value)`.
+
+For instance, if `user` is a json field, the following document:
+
+```json
+{
+ "user": {
+ "name": "Paul Masurel",
+ "address": {
+ "city": "Tokyo",
+ "country": "Japan"
+ },
+ "created_at": "2018-11-12T23:20:50.52Z"
+ }
+}
+```
+
+emits the following tokens:
+- ("name", Text, "Paul")
+- ("name", Text, "Masurel")
+- ("address.city", Text, "Tokyo")
+- ("address.country", Text, "Japan")
+- ("created_at", Date, 1542064850)
+
+
+# Bytes-encoding and lexicographical sort.
+
+Like all other terms, these triplets are encoded into a binary format as follows.
+- `json_path`: the json path is a sequence of "segments". In the example above, `address.city`
+is just a debug representation of the json path `["address", "city"]`.
+It is represented by separating segments with the byte `\x01`, and ending the path with `\x00`.
+- `value type`: one byte represents the `Value` type.
+- `value`: the value is encoded using the regular `Value` representation.
+
+This representation is designed to align the natural sort of Terms with the lexicographical sort
+of their binary representation (Tantivy's dictionary (whether fst or sstable) is sorted and does prefix encoding).
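+
+To make the byte layout concrete, here is a small self-contained sketch of the
+encoding just described. It is illustrative only: the `\x01`/`\x00` separators,
+the `j` field-type code, the 4-byte big-endian field id prefix and the one-byte
+value types (e.g. `s` for Str) match the unit tests in this patch
+(`test_string_term` in `src/indexer/json_term_writer.rs`), but the helper
+function itself is not part of tantivy's API.
+
+```rust
+/// Hypothetical helper: encodes a (json_path, value_type, value) triplet.
+fn encode_json_term(field_id: u32, path: &[&str], type_code: u8, value: &[u8]) -> Vec<u8> {
+    let mut bytes = Vec::new();
+    bytes.extend_from_slice(&field_id.to_be_bytes()); // 4-byte field id prefix
+    bytes.push(b'j'); // field type code: Json
+    for (i, segment) in path.iter().enumerate() {
+        if i > 0 {
+            bytes.push(0x01); // JSON_PATH_SEGMENT_SEP
+        }
+        bytes.extend_from_slice(segment.as_bytes());
+    }
+    bytes.push(0x00); // JSON_END_OF_PATH
+    bytes.push(type_code); // value type, e.g. b's' for Str
+    bytes.extend_from_slice(value);
+    bytes
+}
+
+fn main() {
+    // Mirrors `test_string_term`: field 1, path ["color"], Str value "red".
+    let term = encode_json_term(1, &["color"], b's', b"red");
+    assert_eq!(term, b"\x00\x00\x00\x01jcolor\x00sred");
+}
+```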
+
+Coming back to the `user` document above, the terms will be sorted as
+- ("address.city", Text, "Tokyo")
+- ("address.country", Text, "Japan")
+- ("created_at", Date, 1542064850)
+- ("name", Text, "Masurel")
+- ("name", Text, "Paul")
+
+As seen in "pitfalls", we may end up having to search for a value for the same path under several different types. Putting the type code after the path maximizes compression opportunities, and also increases the chances for the two terms to end up in the very same term dictionary block.
+
+
+# Pitfalls, limitations and corner cases.
+
+Json gives very little information about the type of the literals it stores.
+All numeric types end up mapped to a "Number" and there are no types for dates.
+
+At indexing time, tantivy will try to interpret numbers and strings as different types,
+following a priority order.
+
+Numbers will be interpreted as u64, i64 and f64, in that order.
+Strings will be interpreted as rfc3339 dates or simple strings.
+
+The first working type is picked and is the only term that is emitted for indexing.
+Note that this interpretation happens on a per-document basis, and there is no effort to try to sniff
+a consistent field type at the scale of a segment.
+
+On the query parser side, on the other hand, we may end up emitting more than one type.
+For instance, we do not even know whether the queried value is meant as a number or as a string.
+
+So the query
+
+```
+my_path.my_segment:233
+```
+
+will be interpreted as
+`(my_path.my_segment, String, 233) or (my_path.my_segment, u64, 233)`
+
+Likewise, we need to emit two tokens if the query contains an rfc3339 date.
+Indeed, the date could have been a plain token inside the text of a document at ingestion time. Generally speaking, we will always emit at least a string token in query parsing, and sometimes more.
+
+If more than one json field is defined, things get even more complicated.
+
+
+## Default json field
+
+If the schema contains a text field called "text" and a json field that is set as a default field,
+`text:hello` could be reasonably interpreted as targeting the text field, or as targeting the json field called `json_dynamic` with the json_path "text".
+
+If there is such an ambiguity, we decide to only search in the "text" field: `text:hello`.
+
+In other words, the parser will not search in default json fields if there is a schema hit.
+This is a product decision.
+
+The user can still target the JSON field by specifying its name explicitly:
+`json_dynamic.text:hello`.
+
+## Range queries are not supported.
+
+Json fields do not support range queries.
+
+## Arrays do not work like nested objects.
+
+If a json object contains an array, a search query might return more documents
+than expected.
+
+Let's take an example.
+
+```json
+{
+ "cart_id": 3234234,
+ "cart": [
+ {"product_type": "sneakers", "attributes": {"color": "white"}},
+ {"product_type": "t-shirt", "attributes": {"color": "red"}}
+ ]
+}
+```
+
+Despite the array structure, a document in tantivy is a bag of terms.
+The query:
+
+```
+cart.product_type:sneakers AND cart.attributes.color:red
+```
+
+actually matches the document above.
+
diff --git a/examples/json_field.rs b/examples/json_field.rs
new file mode 100644
index 000000000..e61f8a934
--- /dev/null
+++ b/examples/json_field.rs
@@ -0,0 +1,80 @@
+// # Json field example
+//
+// This example shows how the json field can be used
+// to make tantivy partially schemaless.
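+//
+// Below, `timestamp` and `event_type` are regular, strictly-typed fields,
+// while the `attributes` json field accepts arbitrary nested objects.
+// The queries at the end target both a regular field (`event_type`) and
+// dynamic json paths such as `cart.product_id`.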
+ +use tantivy::collector::{Count, TopDocs}; +use tantivy::query::QueryParser; +use tantivy::schema::{Schema, FAST, STORED, STRING, TEXT}; +use tantivy::Index; + +fn main() -> tantivy::Result<()> { + // # Defining the schema + // + // We need two fields: + // - a timestamp + // - a json object field + let mut schema_builder = Schema::builder(); + schema_builder.add_date_field("timestamp", FAST | STORED); + let event_type = schema_builder.add_text_field("event_type", STRING | STORED); + let attributes = schema_builder.add_json_field("attributes", STORED | TEXT); + let schema = schema_builder.build(); + + // # Indexing documents + let index = Index::create_in_ram(schema.clone()); + + let mut index_writer = index.writer(50_000_000)?; + let doc = schema.parse_document( + r#"{ + "timestamp": "2022-02-22T23:20:50.53Z", + "event_type": "click", + "attributes": { + "target": "submit-button", + "cart": {"product_id": 103}, + "description": "the best vacuum cleaner ever" + } + }"#, + )?; + index_writer.add_document(doc)?; + let doc = schema.parse_document( + r#"{ + "timestamp": "2022-02-22T23:20:51.53Z", + "event_type": "click", + "attributes": { + "target": "submit-button", + "cart": {"product_id": 133}, + "description": "das keyboard" + } + }"#, + )?; + index_writer.add_document(doc)?; + index_writer.commit()?; + + let reader = index.reader()?; + let searcher = reader.searcher(); + + let query_parser = QueryParser::for_index(&index, vec![event_type, attributes]); + { + let query = query_parser.parse_query("target:submit-button")?; + let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?; + assert_eq!(count_docs.len(), 2); + } + { + let query = query_parser.parse_query("target:submit")?; + let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?; + assert_eq!(count_docs.len(), 2); + } + { + let query = query_parser.parse_query("cart.product_id:103")?; + let count_docs = searcher.search(&*query, &Count)?; + assert_eq!(count_docs, 1); + } + { + let query = query_parser + .parse_query("event_type:click AND cart.product_id:133") + .unwrap(); + let hits = searcher.search(&*query, &TopDocs::with_limit(2)).unwrap(); + assert_eq!(hits.len(), 1); + } + Ok(()) +} diff --git a/query-grammar/src/user_input_ast.rs b/query-grammar/src/user_input_ast.rs index 7b32525d2..359900bab 100644 --- a/query-grammar/src/user_input_ast.rs +++ b/query-grammar/src/user_input_ast.rs @@ -59,7 +59,7 @@ pub enum UserInputBound { } impl UserInputBound { - fn display_lower(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { + fn display_lower(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { match *self { UserInputBound::Inclusive(ref word) => write!(formatter, "[\"{}\"", word), UserInputBound::Exclusive(ref word) => write!(formatter, "{{\"{}\"", word), @@ -67,7 +67,7 @@ impl UserInputBound { } } - fn display_upper(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { + fn display_upper(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { match *self { UserInputBound::Inclusive(ref word) => write!(formatter, "\"{}\"]", word), UserInputBound::Exclusive(ref word) => write!(formatter, "\"{}\"}}", word), diff --git a/src/aggregation/mod.rs b/src/aggregation/mod.rs index cb5185239..8644abd6f 100644 --- a/src/aggregation/mod.rs +++ b/src/aggregation/mod.rs @@ -256,7 +256,7 @@ pub(crate) fn f64_from_fastfield_u64(val: u64, field_type: &Type) -> f64 { Type::U64 => val as f64, Type::I64 => i64::from_u64(val) as f64, Type::F64 => f64::from_u64(val), - 
Type::Date | Type::Str | Type::Facet | Type::Bytes => unimplemented!(),
+ Type::Date | Type::Str | Type::Facet | Type::Bytes | Type::Json => unimplemented!(),
 }
 }

@@ -275,7 +275,7 @@ pub(crate) fn f64_to_fastfield_u64(val: f64, field_type: &Type) -> u64 {
 Type::U64 => val as u64,
 Type::I64 => (val as i64).to_u64(),
 Type::F64 => val.to_u64(),
- Type::Date | Type::Str | Type::Facet | Type::Bytes => unimplemented!(),
+ Type::Date | Type::Str | Type::Facet | Type::Bytes | Type::Json => unimplemented!(),
 }
 }

diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs
index 63e99d84f..c6288170f 100644
--- a/src/core/segment_reader.rs
+++ b/src/core/segment_reader.rs
@@ -121,9 +121,8 @@ impl SegmentReader {
 self.fieldnorm_readers.get_field(field)?.ok_or_else(|| {
 let field_name = self.schema.get_field_name(field);
 let err_msg = format!(
- "Field norm not found for field {:?}. Was the field set to record norm during \
- indexing?",
- field_name
+ "Field norm not found for field {field_name:?}. Was the field set to record norm \
+ during indexing?"
 );
 crate::TantivyError::SchemaError(err_msg)
 })
diff --git a/src/fastfield/multivalued/mod.rs b/src/fastfield/multivalued/mod.rs
index f2058e056..9d28beb5d 100644
--- a/src/fastfield/multivalued/mod.rs
+++ b/src/fastfield/multivalued/mod.rs
@@ -94,8 +94,11 @@ mod tests {
 assert_eq!(reader.num_docs(), 5);
 {
- let parser = QueryParser::for_index(&index, vec![date_field]);
- let query = parser.parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()))?;
+ let parser = QueryParser::for_index(&index, vec![]);
+ let query = parser.parse_query(&format!(
+ "multi_date_field:\"{}\"",
+ first_time_stamp.to_rfc3339()
+ ))?;
 let results = searcher.search(&query, &TopDocs::with_limit(5))?;
 assert_eq!(results.len(), 1);
 for (_score, doc_address) in results {
@@ -150,7 +153,7 @@ mod tests {
 {
 let parser = QueryParser::for_index(&index, vec![date_field]);
 let range_q = format!(
- "[{} TO {}}}",
+ "multi_date_field:[{} TO {}}}",
 (first_time_stamp + Duration::seconds(1)).to_rfc3339(),
 (first_time_stamp + Duration::seconds(3)).to_rfc3339()
 );
diff --git a/src/indexer/json_term_writer.rs b/src/indexer/json_term_writer.rs
new file mode 100644
index 000000000..118eb3a76
--- /dev/null
+++ b/src/indexer/json_term_writer.rs
@@ -0,0 +1,415 @@
+use chrono::Utc;
+use fnv::FnvHashMap;
+use murmurhash32::murmurhash2;
+
+use crate::fastfield::FastValue;
+use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
+use crate::schema::term::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
+use crate::schema::Type;
+use crate::tokenizer::TextAnalyzer;
+use crate::{DocId, Term};
+
+/// This object is a map storing the last position for a given path for the current document
+/// being indexed.
+///
+/// It is key to solving the following problem:
+/// if we index a JsonObject emitting several terms with the same path,
+/// we do not want to create false positives in phrase queries.
+///
+/// For instance:
+///
+/// ```json
+/// {"bands": [
+/// {"band_name": "Elliot Smith"},
+/// {"band_name": "The Who"},
+/// ]}
+/// ```
+///
+/// If we are careless and index each band name independently,
+/// `Elliot` and `The` will end up indexed at position 0, and `Smith` and `Who` will be indexed at
+/// position 1.
+/// As a result, with lemmatization, "The Smiths" will match our object.
+///
+/// Worse, if the same term appears in the second object, a non-increasing position would be pushed
+/// to the position recorder, probably provoking a panic.
+///
+/// This problem is solved for regular multivalued objects by offsetting the position
+/// of values with a position gap. Here we would like `The` and `Who` to get indexed at
+/// positions 2 and 3 respectively.
+///
+/// With regular fields, we sort the fields beforehand, so that all terms with the same
+/// path are indexed consecutively.
+///
+/// In a JSON object, we do not have this comfort, so we need to record these position offsets in
+/// a map.
+///
+/// Note that using a single position for the entire object would not hurt correctness.
+/// It would however hurt compression.
+///
+/// We can therefore afford to work with a map that is not perfect. It is fine if several
+/// paths map to the same index position, as long as the probability is relatively low.
+#[derive(Default)]
+struct IndexingPositionsPerPath {
+ positions_per_path: FnvHashMap<u32, IndexingPosition>,
+}
+
+impl IndexingPositionsPerPath {
+ fn get_position(&mut self, term: &Term) -> &mut IndexingPosition {
+ self.positions_per_path
+ .entry(murmurhash2(term.as_slice()))
+ .or_insert_with(Default::default)
+ }
+}
+
+pub(crate) fn index_json_values<'a>(
+ doc: DocId,
+ json_values: impl Iterator<Item = crate::Result<&'a serde_json::Map<String, serde_json::Value>>>,
+ text_analyzer: &TextAnalyzer,
+ term_buffer: &mut Term,
+ postings_writer: &mut dyn PostingsWriter,
+ ctx: &mut IndexingContext,
+) -> crate::Result<()> {
+ let mut json_term_writer = JsonTermWriter::wrap(term_buffer);
+ let mut positions_per_path: IndexingPositionsPerPath = Default::default();
+ for json_value_res in json_values {
+ let json_value = json_value_res?;
+ index_json_object(
+ doc,
+ json_value,
+ text_analyzer,
+ &mut json_term_writer,
+ postings_writer,
+ ctx,
+ &mut positions_per_path,
+ );
+ }
+ Ok(())
+}
+
+fn index_json_object<'a>(
+ doc: DocId,
+ json_value: &serde_json::Map<String, serde_json::Value>,
+ text_analyzer: &TextAnalyzer,
+ json_term_writer: &mut JsonTermWriter<'a>,
+ postings_writer: &mut dyn PostingsWriter,
+ ctx: &mut IndexingContext,
+ positions_per_path: &mut IndexingPositionsPerPath,
+) {
+ for (json_path_segment, json_value) in json_value {
+ json_term_writer.push_path_segment(json_path_segment);
+ index_json_value(
+ doc,
+ json_value,
+ text_analyzer,
+ json_term_writer,
+ postings_writer,
+ ctx,
+ positions_per_path,
+ );
+ json_term_writer.pop_path_segment();
+ }
+}
+
+fn index_json_value<'a>(
+ doc: DocId,
+ json_value: &serde_json::Value,
+ text_analyzer: &TextAnalyzer,
+ json_term_writer: &mut JsonTermWriter<'a>,
+ postings_writer: &mut dyn PostingsWriter,
+ ctx: &mut IndexingContext,
+ positions_per_path: &mut IndexingPositionsPerPath,
+) {
+ match json_value {
+ serde_json::Value::Null => {}
+ serde_json::Value::Bool(val_bool) => {
+ let bool_u64 = if *val_bool { 1u64 } else { 0u64 };
+ json_term_writer.set_fast_value(bool_u64);
+ postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
+ }
+ serde_json::Value::Number(number) => {
+ if let Some(number_u64) = number.as_u64() {
+ json_term_writer.set_fast_value(number_u64);
+ } else if let Some(number_i64) = number.as_i64() {
+ json_term_writer.set_fast_value(number_i64);
+ } else if let Some(number_f64) = number.as_f64() {
+ json_term_writer.set_fast_value(number_f64);
+ }
+ postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
+ }
+ serde_json::Value::String(text) => match infer_type_from_str(text) {
+ TextOrDateTime::Text(text) => {
+ let mut token_stream = text_analyzer.token_stream(text);
+ // TODO make sure the chain position works out.
json_term_writer.close_path_and_set_type(Type::Str);
+ let indexing_position = positions_per_path.get_position(json_term_writer.term());
+ postings_writer.index_text(
+ doc,
+ &mut *token_stream,
+ json_term_writer.term_buffer,
+ ctx,
+ indexing_position,
+ );
+ }
+ TextOrDateTime::DateTime(dt) => {
+ json_term_writer.set_fast_value(dt);
+ postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
+ }
+ },
+ serde_json::Value::Array(arr) => {
+ for val in arr {
+ index_json_value(
+ doc,
+ val,
+ text_analyzer,
+ json_term_writer,
+ postings_writer,
+ ctx,
+ positions_per_path,
+ );
+ }
+ }
+ serde_json::Value::Object(map) => {
+ index_json_object(
+ doc,
+ map,
+ text_analyzer,
+ json_term_writer,
+ postings_writer,
+ ctx,
+ positions_per_path,
+ );
+ }
+ }
+}
+
+enum TextOrDateTime<'a> {
+ Text(&'a str),
+ DateTime(crate::DateTime),
+}
+
+fn infer_type_from_str(text: &str) -> TextOrDateTime {
+ match chrono::DateTime::parse_from_rfc3339(text) {
+ Ok(dt) => {
+ let dt_utc = dt.with_timezone(&Utc);
+ TextOrDateTime::DateTime(dt_utc)
+ }
+ Err(_) => TextOrDateTime::Text(text),
+ }
+}
+
+pub struct JsonTermWriter<'a> {
+ term_buffer: &'a mut Term,
+ path_stack: Vec<usize>,
+}
+
+impl<'a> JsonTermWriter<'a> {
+ pub fn wrap(term_buffer: &'a mut Term) -> Self {
+ term_buffer.clear_with_type(Type::Json);
+ let mut path_stack = Vec::with_capacity(10);
+ path_stack.push(5);
+ Self {
+ term_buffer,
+ path_stack,
+ }
+ }
+
+ fn trim_to_end_of_path(&mut self) {
+ let end_of_path = *self.path_stack.last().unwrap();
+ self.term_buffer.truncate(end_of_path);
+ }
+
+ pub fn close_path_and_set_type(&mut self, typ: Type) {
+ self.trim_to_end_of_path();
+ let buffer = self.term_buffer.as_mut();
+ let buffer_len = buffer.len();
+ buffer[buffer_len - 1] = JSON_END_OF_PATH;
+ buffer.push(typ.to_code());
+ }
+
+ pub fn push_path_segment(&mut self, segment: &str) {
+ // the path stack should never be empty.
+ self.trim_to_end_of_path();
+ let buffer = self.term_buffer.as_mut();
+ let buffer_len = buffer.len();
+ if self.path_stack.len() > 1 {
+ buffer[buffer_len - 1] = JSON_PATH_SEGMENT_SEP;
+ }
+ buffer.extend(segment.as_bytes());
+ buffer.push(JSON_PATH_SEGMENT_SEP);
+ self.path_stack.push(buffer.len());
+ }
+
+ pub fn pop_path_segment(&mut self) {
+ self.path_stack.pop();
+ assert!(!self.path_stack.is_empty());
+ self.trim_to_end_of_path();
+ }
+
+ /// Returns the json path of the term being currently built.
+
#[cfg(test)]
+ pub(crate) fn path(&self) -> &[u8] {
+ let end_of_path = self.path_stack.last().cloned().unwrap_or(6);
+ &self.term().as_slice()[5..end_of_path - 1]
+ }
+
+ pub fn set_fast_value<T: FastValue>(&mut self, val: T) {
+ self.close_path_and_set_type(T::to_type());
+ self.term_buffer
+ .as_mut()
+ .extend_from_slice(val.to_u64().to_be_bytes().as_slice());
+ }
+
+ #[cfg(test)]
+ pub(crate) fn set_str(&mut self, text: &str) {
+ self.close_path_and_set_type(Type::Str);
+ self.term_buffer.as_mut().extend_from_slice(text.as_bytes());
+ }
+
+ pub fn term(&self) -> &Term {
+ self.term_buffer
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::JsonTermWriter;
+ use crate::schema::{Field, Type};
+ use crate::Term;
+
+ #[test]
+ fn test_json_writer() {
+ let field = Field::from_field_id(1);
+ let mut term = Term::new();
+ term.set_field(Type::Json, field);
+ let mut json_writer = JsonTermWriter::wrap(&mut term);
+ json_writer.push_path_segment("attributes");
+ json_writer.push_path_segment("color");
+ json_writer.set_str("red");
+ assert_eq!(
+ format!("{:?}", json_writer.term()),
+ "Term(type=Json, field=1, path=attributes.color, vtype=Str, \"red\")"
+ );
+ json_writer.set_str("blue");
+ assert_eq!(
+ format!("{:?}", json_writer.term()),
+ "Term(type=Json, field=1, path=attributes.color, vtype=Str, \"blue\")"
+ );
+ json_writer.pop_path_segment();
+ json_writer.push_path_segment("dimensions");
+ json_writer.push_path_segment("width");
+ json_writer.set_fast_value(400i64);
+ assert_eq!(
+ format!("{:?}", json_writer.term()),
+ "Term(type=Json, field=1, path=attributes.dimensions.width, vtype=I64, 400)"
+ );
+ json_writer.pop_path_segment();
+ json_writer.push_path_segment("height");
+ json_writer.set_fast_value(300i64);
+ assert_eq!(
+ format!("{:?}", json_writer.term()),
+ "Term(type=Json, field=1, path=attributes.dimensions.height, vtype=I64, 300)"
+ );
+ }
+
+ #[test]
+ fn test_string_term() {
+ let field = Field::from_field_id(1);
+ let mut term = Term::new();
+ term.set_field(Type::Json, field);
+ let mut json_writer = JsonTermWriter::wrap(&mut term);
+ json_writer.push_path_segment("color");
+ json_writer.set_str("red");
+ assert_eq!(
+ json_writer.term().as_slice(),
+ b"\x00\x00\x00\x01jcolor\x00sred"
+ )
+ }
+
+ #[test]
+ fn test_i64_term() {
+ let field = Field::from_field_id(1);
+ let mut term = Term::new();
+ term.set_field(Type::Json, field);
+ let mut json_writer = JsonTermWriter::wrap(&mut term);
+ json_writer.push_path_segment("color");
+ json_writer.set_fast_value(-4i64);
+ assert_eq!(
+ json_writer.term().as_slice(),
+ b"\x00\x00\x00\x01jcolor\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc"
+ )
+ }
+
+ #[test]
+ fn test_u64_term() {
+ let field = Field::from_field_id(1);
+ let mut term = Term::new();
+ term.set_field(Type::Json, field);
+ let mut json_writer = JsonTermWriter::wrap(&mut term);
+ json_writer.push_path_segment("color");
+ json_writer.set_fast_value(4u64);
+ assert_eq!(
+ json_writer.term().as_slice(),
+ b"\x00\x00\x00\x01jcolor\x00u\x00\x00\x00\x00\x00\x00\x00\x04"
+ )
+ }
+
+ #[test]
+ fn test_f64_term() {
+ let field = Field::from_field_id(1);
+ let mut term = Term::new();
+ term.set_field(Type::Json, field);
+ let mut json_writer = JsonTermWriter::wrap(&mut term);
+ json_writer.push_path_segment("color");
+ json_writer.set_fast_value(4.0f64);
+ assert_eq!(
+ json_writer.term().as_slice(),
+ b"\x00\x00\x00\x01jcolor\x00f\xc0\x10\x00\x00\x00\x00\x00\x00"
+ )
+ }
+
+ #[test]
+ fn test_push_after_set_path_segment() {
+ let field = Field::from_field_id(1);
+ let mut term = Term::new();
+ term.set_field(Type::Json, field); + let mut json_writer = JsonTermWriter::wrap(&mut term); + json_writer.push_path_segment("attribute"); + json_writer.set_str("something"); + json_writer.push_path_segment("color"); + json_writer.set_str("red"); + assert_eq!( + json_writer.term().as_slice(), + b"\x00\x00\x00\x01jattribute\x01color\x00sred" + ) + } + + #[test] + fn test_pop_segment() { + let field = Field::from_field_id(1); + let mut term = Term::new(); + term.set_field(Type::Json, field); + let mut json_writer = JsonTermWriter::wrap(&mut term); + json_writer.push_path_segment("color"); + json_writer.push_path_segment("hue"); + json_writer.pop_path_segment(); + json_writer.set_str("red"); + assert_eq!( + json_writer.term().as_slice(), + b"\x00\x00\x00\x01jcolor\x00sred" + ) + } + + #[test] + fn test_json_writer_path() { + let field = Field::from_field_id(1); + let mut term = Term::new(); + term.set_field(Type::Json, field); + let mut json_writer = JsonTermWriter::wrap(&mut term); + json_writer.push_path_segment("color"); + assert_eq!(json_writer.path(), b"color"); + json_writer.push_path_segment("hue"); + assert_eq!(json_writer.path(), b"color\x01hue"); + json_writer.set_str("pink"); + assert_eq!(json_writer.path(), b"color\x01hue"); + } +} diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index f73307927..92f724a78 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -307,16 +307,16 @@ impl IndexMerger { } None => {} }, - FieldType::Str(_) => { - // We don't handle str fast field for the moment - // They can be implemented using what is done - // for facets in the future. - } FieldType::Bytes(byte_options) => { if byte_options.is_fast() { self.write_bytes_fast_field(field, fast_field_serializer, doc_id_mapping)?; } } + FieldType::Str(_) | FieldType::JsonObject(_) => { + // We don't handle json / string fast field for the moment + // They can be implemented using what is done + // for facets in the future + } } } Ok(()) diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index d8f9b0568..b0634d2ba 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -5,6 +5,7 @@ pub mod doc_id_mapping; mod doc_opstamp_mapping; pub mod index_writer; mod index_writer_status; +mod json_term_writer; mod log_merge_policy; mod merge_operation; pub mod merge_policy; @@ -24,6 +25,7 @@ use crossbeam::channel; use smallvec::SmallVec; pub use self::index_writer::IndexWriter; +pub(crate) use self::json_term_writer::JsonTermWriter; pub use self::log_merge_policy::LogMergePolicy; pub use self::merge_operation::MergeOperation; pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy}; diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index cbd2241fb..76fdfaf95 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -3,12 +3,13 @@ use super::operation::AddOperation; use crate::core::Segment; use crate::fastfield::FastFieldsWriter; use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter}; +use crate::indexer::json_term_writer::index_json_values; use crate::indexer::segment_serializer::SegmentSerializer; use crate::postings::{ compute_table_size, serialize_postings, IndexingContext, IndexingPosition, PerFieldPostingsWriter, PostingsWriter, }; -use crate::schema::{Field, FieldEntry, FieldType, FieldValue, Schema, Term, Type, Value}; +use crate::schema::{FieldEntry, FieldType, FieldValue, Schema, Term, Value}; use crate::store::{StoreReader, StoreWriter}; use crate::tokenizer::{ BoxTokenStream, FacetTokenizer, 
PreTokenizedStream, TextAnalyzer, Tokenizer,
@@ -61,7 +62,7 @@ pub struct SegmentWriter {
 pub(crate) fast_field_writers: FastFieldsWriter,
 pub(crate) fieldnorms_writer: FieldNormsWriter,
 pub(crate) doc_opstamps: Vec<Opstamp>,
- tokenizers: Vec<Option<TextAnalyzer>>,
+ per_field_text_analyzers: Vec<TextAnalyzer>,
 term_buffer: Term,
 schema: Schema,
 }
@@ -85,19 +86,23 @@ impl SegmentWriter {
 let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
 let segment_serializer = SegmentSerializer::for_segment(segment, false)?;
 let per_field_postings_writers = PerFieldPostingsWriter::for_schema(&schema);
- let tokenizers = schema
+ let per_field_text_analyzers = schema
 .fields()
- .map(
- |(_, field_entry): (Field, &FieldEntry)| match field_entry.field_type() {
- FieldType::Str(ref text_options) => text_options
- .get_indexing_options()
- .and_then(|text_index_option| {
- let tokenizer_name = &text_index_option.tokenizer();
- tokenizer_manager.get(tokenizer_name)
- }),
+ .map(|(_, field_entry): (_, &FieldEntry)| {
+ let text_options = match field_entry.field_type() {
+ FieldType::Str(ref text_options) => text_options.get_indexing_options(),
+ FieldType::JsonObject(ref json_object_options) => {
+ json_object_options.get_text_indexing_options()
+ }
 _ => None,
- },
- )
+ };
+ text_options
+ .and_then(|text_index_option| {
+ let tokenizer_name = &text_index_option.tokenizer();
+ tokenizer_manager.get(tokenizer_name)
+ })
+ .unwrap_or_default()
+ })
 .collect();
 Ok(SegmentWriter {
 max_doc: 0,
@@ -107,7 +112,7 @@ impl SegmentWriter {
 segment_serializer,
 fast_field_writers: FastFieldsWriter::from_schema(&schema),
 doc_opstamps: Vec::with_capacity(1_000),
- tokenizers,
+ per_field_text_analyzers,
 term_buffer: Term::new(),
 schema,
 })
@@ -165,9 +170,9 @@
 let (term_buffer, ctx) = (&mut self.term_buffer, &mut self.ctx);
 let postings_writer: &mut dyn PostingsWriter =
 self.per_field_postings_writers.get_for_field_mut(field);
+ term_buffer.set_field(field_entry.field_type().value_type(), field);
 match *field_entry.field_type() {
 FieldType::Facet(_) => {
- term_buffer.set_field(Type::Facet, field);
 for value in values {
 let facet = value.as_facet().ok_or_else(make_schema_error)?;
 let facet_str = facet.encoded_str();
@@ -205,13 +210,11 @@
 .push(PreTokenizedStream::from(tok_str.clone()).into());
 }
 Value::Str(ref text) => {
- if let Some(ref mut tokenizer) =
- self.tokenizers[field.field_id() as usize]
- {
- offsets.push(total_offset);
- total_offset += text.len();
- token_streams.push(tokenizer.token_stream(text));
- }
+ let text_analyzer =
+ &self.per_field_text_analyzers[field.field_id() as usize];
+ offsets.push(total_offset);
+ total_offset += text.len();
+ token_streams.push(text_analyzer.token_stream(text));
 }
 _ => (),
 }
@@ -219,9 +222,9 @@
 let mut indexing_position = IndexingPosition::default();
 for mut token_stream in token_streams {
+ assert_eq!(term_buffer.as_slice().len(), 5);
 postings_writer.index_text(
 doc_id,
- field,
 &mut *token_stream,
 term_buffer,
 ctx,
@@ -233,7 +236,6 @@
 }
 FieldType::U64(_) => {
 for value in values {
- term_buffer.set_field(Type::U64, field);
 let u64_val = value.as_u64().ok_or_else(make_schema_error)?;
 term_buffer.set_u64(u64_val);
 postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
 }
 }
 FieldType::Date(_) => {
 for value in values {
- term_buffer.set_field(Type::Date, field);
 let date_val = value.as_date().ok_or_else(make_schema_error)?;
 term_buffer.set_i64(date_val.timestamp());
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
 }
 }
@@ -249,7 +250,6 @@
 FieldType::I64(_) => {
 for value in values {
- term_buffer.set_field(Type::I64, field);
 let i64_val = value.as_i64().ok_or_else(make_schema_error)?;
 term_buffer.set_i64(i64_val);
 postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
 }
 }
 FieldType::F64(_) => {
 for value in values {
- term_buffer.set_field(Type::F64, field);
 let f64_val = value.as_f64().ok_or_else(make_schema_error)?;
 term_buffer.set_f64(f64_val);
 postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
 }
 }
 FieldType::Bytes(_) => {
 for value in values {
- term_buffer.set_field(Type::Bytes, field);
 let bytes = value.as_bytes().ok_or_else(make_schema_error)?;
 term_buffer.set_bytes(bytes);
 postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
 }
 }
+ FieldType::JsonObject(_) => {
+ let text_analyzer = &self.per_field_text_analyzers[field.field_id() as usize];
+ let json_values_it = values
+ .iter()
+ .map(|value| value.as_json().ok_or_else(make_schema_error));
+ index_json_values(
+ doc_id,
+ json_values_it,
+ text_analyzer,
+ term_buffer,
+ postings_writer,
+ ctx,
+ )?;
+ }
 }
 }
 Ok(())
@@ -398,10 +410,16 @@ pub fn prepare_doc_for_store(doc: Document, schema: &Schema) -> Document {

 #[cfg(test)]
 mod tests {
+ use chrono::Utc;
+
 use super::compute_initial_table_size;
- use crate::schema::{Schema, STORED, TEXT};
+ use crate::collector::Count;
+ use crate::indexer::json_term_writer::JsonTermWriter;
+ use crate::postings::TermInfo;
+ use crate::query::PhraseQuery;
+ use crate::schema::{IndexRecordOption, Schema, Type, STORED, STRING, TEXT};
 use crate::tokenizer::{PreTokenizedString, Token};
- use crate::Document;
+ use crate::{DocAddress, DocSet, Document, Index, Postings, Term, TERMINATED};

 #[test]
 fn test_hashmap_size() {
@@ -440,4 +458,247 @@ mod tests {
 Some("title")
 );
 }
+
+ #[test]
+ fn test_json_indexing() {
+ let mut schema_builder = Schema::builder();
+ let json_field = schema_builder.add_json_field("json", STORED | TEXT);
+ let schema = schema_builder.build();
+ let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
+ r#"{
+ "toto": "titi",
+ "float": -0.2,
+ "unsigned": 1,
+ "signed": -2,
+ "complexobject": {
+ "field.with.dot": 1
+ },
+ "date": "1985-04-12T23:20:50.52Z",
+ "my_arr": [2, 3, {"my_key": "two tokens"}, 4]
+ }"#,
+ )
+ .unwrap();
+ let doc = doc!(json_field=>json_val.clone());
+ let index = Index::create_in_ram(schema.clone());
+ let mut writer = index.writer_for_tests().unwrap();
+ writer.add_document(doc).unwrap();
+ writer.commit().unwrap();
+ let reader = index.reader().unwrap();
+ let searcher = reader.searcher();
+ let doc = searcher
+ .doc(DocAddress {
+ segment_ord: 0u32,
+ doc_id: 0u32,
+ })
+ .unwrap();
+ let serdeser_json_val = serde_json::from_str::<serde_json::Map<String, serde_json::Value>>(
+ &schema.to_json(&doc),
+ )
+ .unwrap()
+ .get("json")
+ .unwrap()[0]
+ .as_object()
+ .unwrap()
+ .clone();
+ assert_eq!(json_val, serdeser_json_val);
+ let segment_reader = searcher.segment_reader(0u32);
+ let inv_idx = segment_reader.inverted_index(json_field).unwrap();
+ let term_dict = inv_idx.terms();
+
+ let mut term = Term::new();
+ term.set_field(Type::Json, json_field);
+ let mut term_stream = term_dict.stream().unwrap();
+
+ let mut json_term_writer = JsonTermWriter::wrap(&mut term);
+ json_term_writer.push_path_segment("complexobject");
+ json_term_writer.push_path_segment("field.with.dot");
+ json_term_writer.set_fast_value(1u64);
assert!(term_stream.advance());
+ assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
+
+ json_term_writer.pop_path_segment();
+ json_term_writer.pop_path_segment();
+ json_term_writer.push_path_segment("date");
+ json_term_writer.set_fast_value(
+ chrono::DateTime::parse_from_rfc3339("1985-04-12T23:20:50.52Z")
+ .unwrap()
+ .with_timezone(&Utc),
+ );
+ assert!(term_stream.advance());
+ assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
+
+ json_term_writer.pop_path_segment();
+ json_term_writer.push_path_segment("float");
+ json_term_writer.set_fast_value(-0.2f64);
+ assert!(term_stream.advance());
+ assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
+
+ json_term_writer.pop_path_segment();
+ json_term_writer.push_path_segment("my_arr");
+ json_term_writer.set_fast_value(2u64);
+ assert!(term_stream.advance());
+ assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
+
+ json_term_writer.set_fast_value(3u64);
+ assert!(term_stream.advance());
+ assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
+
+ json_term_writer.set_fast_value(4u64);
+ assert!(term_stream.advance());
+ assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
+
+ json_term_writer.push_path_segment("my_key");
+ json_term_writer.set_str("tokens");
+ assert!(term_stream.advance());
+ assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
+
+ json_term_writer.set_str("two");
+ assert!(term_stream.advance());
+ assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
+
+ json_term_writer.pop_path_segment();
+ json_term_writer.pop_path_segment();
+ json_term_writer.push_path_segment("signed");
+ json_term_writer.set_fast_value(-2i64);
+ assert!(term_stream.advance());
+ assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
+
+ json_term_writer.pop_path_segment();
+ json_term_writer.push_path_segment("toto");
+ json_term_writer.set_str("titi");
+ assert!(term_stream.advance());
+ assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
+
+ json_term_writer.pop_path_segment();
+ json_term_writer.push_path_segment("unsigned");
+ json_term_writer.set_fast_value(1u64);
+ assert!(term_stream.advance());
+ assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
+ assert!(!term_stream.advance());
+ }
+
+ #[test]
+ fn test_json_tokenized_with_position() {
+ let mut schema_builder = Schema::builder();
+ let json_field = schema_builder.add_json_field("json", STORED | TEXT);
+ let schema = schema_builder.build();
+ let mut doc = Document::default();
+ let json_val: serde_json::Map<String, serde_json::Value> =
+ serde_json::from_str(r#"{"mykey": "repeated token token"}"#).unwrap();
+ doc.add_json_object(json_field, json_val.clone());
+ let index = Index::create_in_ram(schema.clone());
+ let mut writer = index.writer_for_tests().unwrap();
+ writer.add_document(doc).unwrap();
+ writer.commit().unwrap();
+ let reader = index.reader().unwrap();
+ let searcher = reader.searcher();
+ let segment_reader = searcher.segment_reader(0u32);
+ let inv_index = segment_reader.inverted_index(json_field).unwrap();
+ let mut term = Term::new();
+ term.set_field(Type::Json, json_field);
+ let mut json_term_writer = JsonTermWriter::wrap(&mut term);
+ json_term_writer.push_path_segment("mykey");
+ json_term_writer.set_str("token");
+ let term_info = inv_index
+ .get_term_info(json_term_writer.term())
+ .unwrap()
+ .unwrap();
+ assert_eq!(
+ term_info,
+ TermInfo {
+ doc_freq: 1,
+ postings_range: 2..4,
positions_range: 2..5
+ }
+ );
+ let mut postings = inv_index
+ .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
+ .unwrap()
+ .unwrap();
+ assert_eq!(postings.doc(), 0);
+ assert_eq!(postings.term_freq(), 2);
+ let mut positions = Vec::new();
+ postings.positions(&mut positions);
+ assert_eq!(&positions[..], &[1, 2]);
+ assert_eq!(postings.advance(), TERMINATED);
+ }
+
+ #[test]
+ fn test_json_raw_no_position() {
+ let mut schema_builder = Schema::builder();
+ let json_field = schema_builder.add_json_field("json", STRING);
+ let schema = schema_builder.build();
+ let json_val: serde_json::Map<String, serde_json::Value> =
+ serde_json::from_str(r#"{"mykey": "two tokens"}"#).unwrap();
+ let doc = doc!(json_field=>json_val);
+ let index = Index::create_in_ram(schema.clone());
+ let mut writer = index.writer_for_tests().unwrap();
+ writer.add_document(doc).unwrap();
+ writer.commit().unwrap();
+ let reader = index.reader().unwrap();
+ let searcher = reader.searcher();
+ let segment_reader = searcher.segment_reader(0u32);
+ let inv_index = segment_reader.inverted_index(json_field).unwrap();
+ let mut term = Term::new();
+ term.set_field(Type::Json, json_field);
+ let mut json_term_writer = JsonTermWriter::wrap(&mut term);
+ json_term_writer.push_path_segment("mykey");
+ json_term_writer.set_str("two tokens");
+ let term_info = inv_index
+ .get_term_info(json_term_writer.term())
+ .unwrap()
+ .unwrap();
+ assert_eq!(
+ term_info,
+ TermInfo {
+ doc_freq: 1,
+ postings_range: 0..1,
+ positions_range: 0..0
+ }
+ );
+ let mut postings = inv_index
+ .read_postings(&term, IndexRecordOption::WithFreqs)
+ .unwrap()
+ .unwrap();
+ assert_eq!(postings.doc(), 0);
+ assert_eq!(postings.term_freq(), 1);
+ let mut positions = Vec::new();
+ postings.positions(&mut positions);
+ assert_eq!(postings.advance(), TERMINATED);
+ }
+
+ #[test]
+ fn test_position_overlapping_path() {
+ // This test checks that we do not end up detecting a phrase query due
+ // to several string literals in the same json object overlapping.
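+ //
+ // Concretely, in the fixture below, "hello happy tax payer" and "nothello"
+ // are both indexed under the path `mykey.field`. Without a per-path
+ // position offset, "nothello" (position 0 of the second value) would land
+ // right before "happy" (position 1 of the first value), and the phrase
+ // query "nothello happy" would wrongly match.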
+ let mut schema_builder = Schema::builder();
+ let json_field = schema_builder.add_json_field("json", TEXT);
+ let schema = schema_builder.build();
+ let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
+ r#"{"mykey": [{"field": "hello happy tax payer"}, {"field": "nothello"}]}"#,
+ )
+ .unwrap();
+ let doc = doc!(json_field=>json_val);
+ let index = Index::create_in_ram(schema.clone());
+ let mut writer = index.writer_for_tests().unwrap();
+ writer.add_document(doc).unwrap();
+ writer.commit().unwrap();
+ let reader = index.reader().unwrap();
+ let searcher = reader.searcher();
+ let mut term = Term::new();
+ term.set_field(Type::Json, json_field);
+ let mut json_term_writer = JsonTermWriter::wrap(&mut term);
+ json_term_writer.push_path_segment("mykey");
+ json_term_writer.push_path_segment("field");
+ json_term_writer.set_str("hello");
+ let hello_term = json_term_writer.term().clone();
+ json_term_writer.set_str("nothello");
+ let nothello_term = json_term_writer.term().clone();
+ json_term_writer.set_str("happy");
+ let happy_term = json_term_writer.term().clone();
+ let phrase_query = PhraseQuery::new(vec![hello_term, happy_term.clone()]);
+ assert_eq!(searcher.search(&phrase_query, &Count).unwrap(), 1);
+ let phrase_query = PhraseQuery::new(vec![nothello_term, happy_term]);
+ assert_eq!(searcher.search(&phrase_query, &Count).unwrap(), 0);
+ }
 }
diff --git a/src/postings/json_postings_writer.rs b/src/postings/json_postings_writer.rs
new file mode 100644
index 000000000..7d4dd93f8
--- /dev/null
+++ b/src/postings/json_postings_writer.rs
@@ -0,0 +1,94 @@
+use std::io;
+
+use crate::indexer::doc_id_mapping::DocIdMapping;
+use crate::postings::postings_writer::SpecializedPostingsWriter;
+use crate::postings::recorder::{BufferLender, NothingRecorder, Recorder};
+use crate::postings::stacker::Addr;
+use crate::postings::{
+ FieldSerializer, IndexingContext, IndexingPosition, PostingsWriter, UnorderedTermId,
+};
+use crate::schema::term::as_json_path_type_value_bytes;
+use crate::schema::Type;
+use crate::tokenizer::TokenStream;
+use crate::{DocId, Term};
+
+#[derive(Default)]
+pub(crate) struct JsonPostingsWriter<Rec: Recorder> {
+ str_posting_writer: SpecializedPostingsWriter<Rec>,
+ non_str_posting_writer: SpecializedPostingsWriter<NothingRecorder>,
+}
+
+impl<Rec: Recorder> From<JsonPostingsWriter<Rec>> for Box<dyn PostingsWriter> {
+ fn from(json_postings_writer: JsonPostingsWriter<Rec>) -> Box<dyn PostingsWriter> {
+ Box::new(json_postings_writer)
+ }
+}
+
+impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
+ fn subscribe(
+ &mut self,
+ doc: crate::DocId,
+ pos: u32,
+ term: &crate::Term,
+ ctx: &mut IndexingContext,
+ ) -> UnorderedTermId {
+ self.non_str_posting_writer.subscribe(doc, pos, term, ctx)
+ }
+
+ fn index_text(
+ &mut self,
+ doc_id: DocId,
+ token_stream: &mut dyn TokenStream,
+ term_buffer: &mut Term,
+ ctx: &mut IndexingContext,
+ indexing_position: &mut IndexingPosition,
+ ) {
+ self.str_posting_writer.index_text(
+ doc_id,
+ token_stream,
+ term_buffer,
+ ctx,
+ indexing_position,
+ );
+ }
+
+ /// The actual serialization format is handled by the `PostingsSerializer`.
+ fn serialize(
+ &self,
+ term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
+ doc_id_map: Option<&DocIdMapping>,
+ ctx: &IndexingContext,
+ serializer: &mut FieldSerializer,
+ ) -> io::Result<()> {
+ let mut buffer_lender = BufferLender::default();
+ for (term, addr, _) in term_addrs {
+ // TODO optimization opportunity here.
+ if let Some((_, typ, _)) = as_json_path_type_value_bytes(term.value_bytes()) {
+ if typ == Type::Str {
+ SpecializedPostingsWriter::<Rec>::serialize_one_term(
+ term,
+ *addr,
+ doc_id_map,
+ &mut buffer_lender,
+ ctx,
+ serializer,
+ )?;
+ } else {
+ SpecializedPostingsWriter::<NothingRecorder>::serialize_one_term(
+ term,
+ *addr,
+ doc_id_map,
+ &mut buffer_lender,
+ ctx,
+ serializer,
+ )?;
+ }
+ }
+ }
+ Ok(())
+ }
+
+ fn total_num_tokens(&self) -> u64 {
+ self.str_posting_writer.total_num_tokens() + self.non_str_posting_writer.total_num_tokens()
+ }
+}
diff --git a/src/postings/mod.rs b/src/postings/mod.rs
index 4df8beee7..8f4e3078a 100644
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -7,6 +7,7 @@ pub(crate) use self::block_search::branchless_binary_search;
 mod block_segment_postings;
 pub(crate) mod compression;
 mod indexing_context;
+mod json_postings_writer;
 mod per_field_postings_writer;
 mod postings;
 mod postings_writer;
diff --git a/src/postings/per_field_postings_writer.rs b/src/postings/per_field_postings_writer.rs
index 4c32cb51e..04966ab42 100644
--- a/src/postings/per_field_postings_writer.rs
+++ b/src/postings/per_field_postings_writer.rs
@@ -1,3 +1,4 @@
+use crate::postings::json_postings_writer::JsonPostingsWriter;
 use crate::postings::postings_writer::SpecializedPostingsWriter;
 use crate::postings::recorder::{NothingRecorder, TermFrequencyRecorder, TfAndPositionRecorder};
 use crate::postings::PostingsWriter;
@@ -33,21 +34,38 @@ fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box<dyn PostingsWriter> {
 IndexRecordOption::Basic => {
- SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
+ SpecializedPostingsWriter::<NothingRecorder>::default().into()
 }
 IndexRecordOption::WithFreqs => {
- SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed()
+ SpecializedPostingsWriter::<TermFrequencyRecorder>::default().into()
 }
 IndexRecordOption::WithFreqsAndPositions => {
- SpecializedPostingsWriter::<TfAndPositionRecorder>::new_boxed()
+ SpecializedPostingsWriter::<TfAndPositionRecorder>::default().into()
 }
 })
- .unwrap_or_else(SpecializedPostingsWriter::<NothingRecorder>::new_boxed),
+ .unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::default().into()),
 FieldType::U64(_)
 | FieldType::I64(_)
 | FieldType::F64(_)
 | FieldType::Date(_)
 | FieldType::Bytes(_)
- | FieldType::Facet(_) => SpecializedPostingsWriter::<NothingRecorder>::new_boxed(),
+ | FieldType::Facet(_) => Box::new(SpecializedPostingsWriter::<NothingRecorder>::default()),
+ FieldType::JsonObject(ref json_object_options) => {
+ if let Some(text_indexing_option) = json_object_options.get_text_indexing_options() {
+ match text_indexing_option.index_option() {
+ IndexRecordOption::Basic => {
+ JsonPostingsWriter::<NothingRecorder>::default().into()
+ }
+ IndexRecordOption::WithFreqs => {
+ JsonPostingsWriter::<TermFrequencyRecorder>::default().into()
+ }
+ IndexRecordOption::WithFreqsAndPositions => {
+ JsonPostingsWriter::<TfAndPositionRecorder>::default().into()
+ }
+ }
+ } else {
+ JsonPostingsWriter::<NothingRecorder>::default().into()
+ }
+ }
 }
}
diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs
index f4ca6394b..020931c39 100644
--- a/src/postings/postings_writer.rs
+++ b/src/postings/postings_writer.rs
@@ -13,7 +13,7 @@ use crate::postings::{
 FieldSerializer, IndexingContext, InvertedIndexSerializer, PerFieldPostingsWriter,
 UnorderedTermId,
 };
-use crate::schema::{Field, FieldType, Schema, Term, Type};
+use crate::schema::{Field, FieldType, Schema, Term};
 use crate::termdict::TermOrdinal;
 use crate::tokenizer::{Token, TokenStream, MAX_TOKEN_LEN};
 use crate::DocId;
@@ -85,6 +85,7 @@ pub(crate) fn serialize_postings(
 }
 FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) | FieldType::Date(_) => {}
 FieldType::Bytes(_) => {}
+ FieldType::JsonObject(_) => {}
 }
 let postings_writer =
per_field_postings_writers.get_for_field(field);
@@ -142,13 +143,12 @@ pub(crate) trait PostingsWriter {
 fn index_text(
 &mut self,
 doc_id: DocId,
- field: Field,
 token_stream: &mut dyn TokenStream,
 term_buffer: &mut Term,
 ctx: &mut IndexingContext,
 indexing_position: &mut IndexingPosition,
 ) {
- term_buffer.set_field(Type::Str, field);
+ let end_of_path_idx = term_buffer.as_slice().len();
 let mut num_tokens = 0;
 let mut end_position = 0;
 token_stream.process(&mut |token: &Token| {
@@ -162,7 +162,8 @@ pub(crate) trait PostingsWriter {
 );
 return;
 }
- term_buffer.set_text(token.text.as_str());
+ term_buffer.truncate(end_of_path_idx);
+ term_buffer.append_bytes(token.text.as_bytes());
 let start_position = indexing_position.end_position + token.position as u32;
 end_position = start_position + token.position_length as u32;
 self.subscribe(doc_id, start_position, term_buffer, ctx);
@@ -170,6 +171,7 @@ pub(crate) trait PostingsWriter {
 });
 indexing_position.end_position = end_position + POSITION_GAP;
 indexing_position.num_tokens += num_tokens;
+ term_buffer.truncate(end_of_path_idx);
 }

 fn total_num_tokens(&self) -> u64;
@@ -177,27 +179,40 @@
 /// The `SpecializedPostingsWriter` is just here to remove dynamic
 /// dispatch to the recorder information.
-pub(crate) struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
+#[derive(Default)]
+pub(crate) struct SpecializedPostingsWriter<Rec: Recorder> {
 total_num_tokens: u64,
 _recorder_type: PhantomData<Rec>,
 }

-impl<Rec: Recorder + 'static> SpecializedPostingsWriter<Rec> {
- /// constructor
- pub fn new() -> SpecializedPostingsWriter<Rec> {
- SpecializedPostingsWriter {
- total_num_tokens: 0u64,
- _recorder_type: PhantomData,
- }
- }
-
- /// Builds a `SpecializedPostingsWriter` storing its data in a memory arena.
- pub fn new_boxed() -> Box<dyn PostingsWriter> {
- Box::new(SpecializedPostingsWriter::<Rec>::new())
+impl<Rec: Recorder> From<SpecializedPostingsWriter<Rec>> for Box<dyn PostingsWriter> {
+ fn from(
+ specialized_postings_writer: SpecializedPostingsWriter<Rec>,
+ ) -> Box<dyn PostingsWriter> {
+ Box::new(specialized_postings_writer)
 }
 }

-impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec> {
+impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
+ #[inline]
+ pub(crate) fn serialize_one_term(
+ term: &Term<&[u8]>,
+ addr: Addr,
+ doc_id_map: Option<&DocIdMapping>,
+ buffer_lender: &mut BufferLender,
+ ctx: &IndexingContext,
+ serializer: &mut FieldSerializer,
+ ) -> io::Result<()> {
+ let recorder: Rec = ctx.term_index.read(addr);
+ let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
+ serializer.new_term(term.value_bytes(), term_doc_freq)?;
+ recorder.serialize(&ctx.arena, doc_id_map, serializer, buffer_lender);
+ serializer.close_term()?;
+ Ok(())
+ }
+}
+
+impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
 fn subscribe(
 &mut self,
 doc: DocId,
@@ -218,7 +233,7 @@ impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec>
 recorder.record_position(position, arena);
 recorder
 } else {
- let mut recorder = Rec::new();
+ let mut recorder = Rec::default();
 recorder.new_doc(doc, arena);
 recorder.record_position(position, arena);
 recorder
@@ -235,11 +250,7 @@
 ) -> io::Result<()> {
 let mut buffer_lender = BufferLender::default();
 for (term, addr, _) in term_addrs {
- let recorder: Rec = ctx.term_index.read(*addr);
- let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
- serializer.new_term(term.value_bytes(), term_doc_freq)?;
- recorder.serialize(&ctx.arena, doc_id_map, serializer, &mut buffer_lender);
- serializer.close_term()?;
+ Self::serialize_one_term(term, *addr, doc_id_map, &mut buffer_lender, ctx, serializer)?;
 }
 Ok(())
 }
diff --git a/src/postings/recorder.rs
b/src/postings/recorder.rs
index ff0b62215..8ceca540b 100644
--- a/src/postings/recorder.rs
+++ b/src/postings/recorder.rs
@@ -56,9 +56,7 @@ impl<'a> Iterator for VInt32Reader<'a> {
 /// * the document id
 /// * the term frequency
 /// * the term positions
-pub(crate) trait Recorder: Copy + 'static {
- ///
- fn new() -> Self;
+pub(crate) trait Recorder: Copy + Default + 'static {
 /// Returns the current document
 fn current_doc(&self) -> u32;
 /// Starts recording information about a new document
@@ -90,14 +88,16 @@ pub struct NothingRecorder {
 current_doc: DocId,
 }

-impl Recorder for NothingRecorder {
- fn new() -> Self {
+impl Default for NothingRecorder {
+ fn default() -> Self {
 NothingRecorder {
 stack: ExpUnrolledLinkedList::new(),
 current_doc: u32::max_value(),
 }
 }
+}

+impl Recorder for NothingRecorder {
 fn current_doc(&self) -> DocId {
 self.current_doc
 }
@@ -152,8 +152,8 @@ pub struct TermFrequencyRecorder {
 term_doc_freq: u32,
 }

-impl Recorder for TermFrequencyRecorder {
- fn new() -> Self {
+impl Default for TermFrequencyRecorder {
+ fn default() -> Self {
 TermFrequencyRecorder {
 stack: ExpUnrolledLinkedList::new(),
 current_doc: 0,
@@ -161,7 +161,9 @@ impl Recorder for TermFrequencyRecorder {
 term_doc_freq: 0u32,
 }
 }
+}

+impl Recorder for TermFrequencyRecorder {
 fn current_doc(&self) -> DocId {
 self.current_doc
 }
@@ -223,15 +225,18 @@ pub struct TfAndPositionRecorder {
 current_doc: DocId,
 term_doc_freq: u32,
 }
-impl Recorder for TfAndPositionRecorder {
- fn new() -> Self {
+
+impl Default for TfAndPositionRecorder {
+ fn default() -> Self {
 TfAndPositionRecorder {
 stack: ExpUnrolledLinkedList::new(),
 current_doc: u32::max_value(),
 term_doc_freq: 0u32,
 }
 }
+}

+impl Recorder for TfAndPositionRecorder {
 fn current_doc(&self) -> DocId {
 self.current_doc
 }
diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs
index 90c37b654..fbf0be4ec 100644
--- a/src/postings/serializer.rs
+++ b/src/postings/serializer.rs
@@ -76,7 +76,7 @@ impl InvertedIndexSerializer {
 field: Field,
 total_num_tokens: u64,
 fieldnorm_reader: Option<FieldNormReader>,
- ) -> io::Result<FieldSerializer<'_>> {
+ ) -> io::Result<FieldSerializer> {
 let field_entry: &FieldEntry = self.schema.get_field_entry(field);
 let term_dictionary_write = self.terms_write.for_field(field);
 let postings_write = self.postings_write.for_field(field);
@@ -122,24 +122,21 @@ impl<'a> FieldSerializer<'a> {
 fieldnorm_reader: Option<FieldNormReader>,
 ) -> io::Result<FieldSerializer<'a>> {
 total_num_tokens.serialize(postings_write)?;
- let mode = match field_type {
- FieldType::Str(ref text_options) => {
- if let Some(text_indexing_options) = text_options.get_indexing_options() {
- text_indexing_options.index_option()
- } else {
- IndexRecordOption::Basic
- }
- }
- _ => IndexRecordOption::Basic,
- };
+ let index_record_option = field_type
+ .index_record_option()
+ .unwrap_or(IndexRecordOption::Basic);
 let term_dictionary_builder = TermDictionaryBuilder::create(term_dictionary_write)?;
 let average_fieldnorm = fieldnorm_reader
 .as_ref()
 .map(|ff_reader| (total_num_tokens as Score / ff_reader.num_docs() as Score))
 .unwrap_or(0.0);
- let postings_serializer =
- PostingsSerializer::new(postings_write, average_fieldnorm, mode, fieldnorm_reader);
- let positions_serializer_opt = if mode.has_positions() {
+ let postings_serializer = PostingsSerializer::new(
+ postings_write,
+ average_fieldnorm,
+ index_record_option,
+ fieldnorm_reader,
+ );
+ let positions_serializer_opt = if index_record_option.has_positions() {
 Some(PositionSerializer::new(positions_write))
 } else {
 None
@@ -203,6 +200,7 @@ impl<'a> FieldSerializer<'a> {
        self.current_term_info.doc_freq += 1;
        self.postings_serializer.write_doc(doc_id, term_freq);
        if let Some(ref mut positions_serializer) = self.positions_serializer_opt.as_mut() {
+            assert_eq!(term_freq as usize, position_deltas.len());
            positions_serializer.write_positions_delta(position_deltas);
        }
    }
diff --git a/src/query/boolean_query/boolean_query.rs b/src/query/boolean_query/boolean_query.rs
index 2719d3371..576d3a6de 100644
--- a/src/query/boolean_query/boolean_query.rs
+++ b/src/query/boolean_query/boolean_query.rs
@@ -204,8 +204,8 @@ impl BooleanQuery {
 #[cfg(test)]
 mod tests {
     use super::BooleanQuery;
-    use crate::collector::DocSetCollector;
-    use crate::query::{QueryClone, TermQuery};
+    use crate::collector::{Count, DocSetCollector};
+    use crate::query::{QueryClone, QueryParser, TermQuery};
     use crate::schema::{IndexRecordOption, Schema, TEXT};
     use crate::{DocAddress, Index, Term};
@@ -282,4 +282,42 @@
         }
         Ok(())
     }
+
+    #[test]
+    pub fn test_json_array_pitfall_bag_of_terms() -> crate::Result<()> {
+        let mut schema_builder = Schema::builder();
+        let json_field = schema_builder.add_json_field("json", TEXT);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        {
+            let mut index_writer = index.writer_for_tests()?;
+            index_writer.add_document(doc!(json_field=>json!({
+                "cart": [
+                    {"product_type": "sneakers", "attributes": {"color": "white"}},
+                    {"product_type": "t-shirt", "attributes": {"color": "red"}},
+                    {"product_type": "cd", "attributes": {"genre": "blues"}},
+                ]
+            })))?;
+            index_writer.commit()?;
+        }
+        let searcher = index.reader()?.searcher();
+        let doc_matches = |query: &str| {
+            let query_parser = QueryParser::for_index(&index, vec![json_field]);
+            let query = query_parser.parse_query(query).unwrap();
+            searcher.search(&query, &Count).unwrap() == 1
+        };
+        // As expected
+        assert!(doc_matches(
+            r#"cart.product_type:sneakers AND cart.attributes.color:white"#
+        ));
+        // Unexpected match: arrays do not act as nested docs.
+        assert!(doc_matches(
+            r#"cart.product_type:sneakers AND cart.attributes.color:red"#
+        ));
+        // However, this obviously works...
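+        // ("blues" was only indexed under the path cart.attributes.genre, so a
+        // cart.attributes.color:blues term does not exist in the index at all.)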
+ assert!(!doc_matches( + r#"cart.product_type:sneakers AND cart.attributes.color:blues"# + )); + Ok(()) + } } diff --git a/src/query/phrase_query/mod.rs b/src/query/phrase_query/mod.rs index 767c4e1fa..edec22fec 100644 --- a/src/query/phrase_query/mod.rs +++ b/src/query/phrase_query/mod.rs @@ -9,10 +9,12 @@ pub use self::phrase_weight::PhraseWeight; #[cfg(test)] pub mod tests { + use serde_json::json; + use super::*; use crate::collector::tests::{TEST_COLLECTOR_WITHOUT_SCORE, TEST_COLLECTOR_WITH_SCORE}; use crate::core::Index; - use crate::query::Weight; + use crate::query::{QueryParser, Weight}; use crate::schema::{Schema, Term, TEXT}; use crate::{assert_nearly_equals, DocAddress, DocId, TERMINATED}; @@ -248,4 +250,56 @@ pub mod tests { assert_eq!(test_query(vec![(1, "a"), (3, "c")]), vec![0]); Ok(()) } + + #[test] + pub fn test_phrase_query_on_json() -> crate::Result<()> { + let mut schema_builder = Schema::builder(); + let json_field = schema_builder.add_json_field("json", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + { + let mut index_writer = index.writer_for_tests()?; + index_writer.add_document(doc!(json_field=>json!({ + "text": "elliot smith the happy who" + })))?; + index_writer.add_document(doc!(json_field=>json!({ + "text": "the who elliot smith" + })))?; + index_writer.add_document(doc!(json_field=>json!({ + "arr": [{"text":"the who"}, {"text":"elliot smith"}] + })))?; + index_writer.add_document(doc!(json_field=>json!({ + "text2": "the smith" + })))?; + index_writer.commit()?; + } + let searcher = index.reader()?.searcher(); + let matching_docs = |query: &str| { + let query_parser = QueryParser::for_index(&index, vec![json_field]); + let phrase_query = query_parser.parse_query(query).unwrap(); + let phrase_weight = phrase_query.weight(&*searcher, false).unwrap(); + let mut phrase_scorer = phrase_weight + .scorer(searcher.segment_reader(0), 1.0f32) + .unwrap(); + let mut docs = Vec::new(); + loop { + let doc = phrase_scorer.doc(); + if doc == TERMINATED { + break; + } + docs.push(doc); + phrase_scorer.advance(); + } + docs + }; + assert!(matching_docs(r#"text:"the smith""#).is_empty()); + assert_eq!(&matching_docs(r#"text:the"#), &[0u32, 1u32]); + assert_eq!(&matching_docs(r#"text:"the""#), &[0u32, 1u32]); + assert_eq!(&matching_docs(r#"text:"smith""#), &[0u32, 1u32]); + assert_eq!(&matching_docs(r#"text:"elliot smith""#), &[0u32, 1u32]); + assert_eq!(&matching_docs(r#"text2:"the smith""#), &[3u32]); + assert!(&matching_docs(r#"arr.text:"the smith""#).is_empty()); + assert_eq!(&matching_docs(r#"arr.text:"elliot smith""#), &[2]); + Ok(()) + } } diff --git a/src/query/phrase_query/phrase_weight.rs b/src/query/phrase_query/phrase_weight.rs index 9c08c3557..dcbe60c0b 100644 --- a/src/query/phrase_query/phrase_weight.rs +++ b/src/query/phrase_query/phrase_weight.rs @@ -31,10 +31,11 @@ impl PhraseWeight { fn fieldnorm_reader(&self, reader: &SegmentReader) -> crate::Result { let field = self.phrase_terms[0].1.field(); if self.scoring_enabled { - reader.get_fieldnorms_reader(field) - } else { - Ok(FieldNormReader::constant(reader.max_doc(), 1)) + if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(field)? 
{
+                return Ok(fieldnorm_reader);
+            }
        }
+        Ok(FieldNormReader::constant(reader.max_doc(), 1))
    }

    fn phrase_scorer(
diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs
index 2cfcec7f5..228ec6ad4 100644
--- a/src/query/query_parser/query_parser.rs
+++ b/src/query/query_parser/query_parser.rs
@@ -1,19 +1,21 @@
-use std::borrow::Cow;
-use std::collections::HashMap;
+use std::collections::{BTreeSet, HashMap};
 use std::num::{ParseFloatError, ParseIntError};
 use std::ops::Bound;
 use std::str::FromStr;

-use tantivy_query_grammar::{UserInputAst, UserInputBound, UserInputLeaf};
+use tantivy_query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};

 use super::logical_ast::*;
 use crate::core::Index;
+use crate::indexer::JsonTermWriter;
 use crate::query::{
     AllQuery, BooleanQuery, BoostQuery, EmptyQuery, Occur, PhraseQuery, Query, RangeQuery,
     TermQuery,
 };
-use crate::schema::{Facet, FacetParseError, Field, FieldType, IndexRecordOption, Schema, Term};
-use crate::tokenizer::TokenizerManager;
+use crate::schema::{
+    Facet, FacetParseError, Field, FieldType, IndexRecordOption, Schema, Term, Type,
+};
+use crate::tokenizer::{TextAnalyzer, TokenizerManager};
 use crate::Score;

 /// Possible error that may happen when parsing a query.
@@ -22,22 +24,24 @@ pub enum QueryParserError {
     /// Error in the query syntax
     #[error("Syntax Error")]
     SyntaxError,
-    /// `FieldDoesNotExist(field_name: String)`
+    /// This query is unsupported.
+    #[error("Unsupported query: {0}")]
+    UnsupportedQuery(String),
     /// The query references a field that is not in the schema
     #[error("Field does not exist: '{0:?}'")]
     FieldDoesNotExist(String),
     /// The query contains a term for a `u64` or `i64`-field, but the value
     /// is neither.
     #[error("Expected a valid integer: '{0:?}'")]
-    ExpectedInt(ParseIntError),
+    ExpectedInt(#[from] ParseIntError),
     /// The query contains a term for a bytes field, but the value is not valid
     /// base64.
     #[error("Expected base64: '{0:?}'")]
-    ExpectedBase64(base64::DecodeError),
+    ExpectedBase64(#[from] base64::DecodeError),
     /// The query contains a term for a `f64`-field, but the value
     /// is not a f64.
     #[error("Expected a valid float: '{0:?}'")]
-    ExpectedFloat(ParseFloatError),
+    ExpectedFloat(#[from] ParseFloatError),
     /// Queries that are only "excluding" (e.g. -title:pop) are forbidden.
     #[error("Invalid query: Only excluding terms given")]
     AllButQueryForbidden,
@@ -63,34 +67,10 @@ pub enum QueryParserError {
     RangeMustNotHavePhrase,
     /// The format for the date field is not RFC 3339 compliant.
     #[error("The date field has an invalid format")]
-    DateFormatError(chrono::ParseError),
+    DateFormatError(#[from] chrono::ParseError),
     /// The format for the facet field is invalid.
#[error("The facet field is malformed: {0}")] - FacetFormatError(FacetParseError), -} - -impl From for QueryParserError { - fn from(err: ParseIntError) -> QueryParserError { - QueryParserError::ExpectedInt(err) - } -} - -impl From for QueryParserError { - fn from(err: ParseFloatError) -> QueryParserError { - QueryParserError::ExpectedFloat(err) - } -} - -impl From for QueryParserError { - fn from(err: chrono::ParseError) -> QueryParserError { - QueryParserError::DateFormatError(err) - } -} - -impl From for QueryParserError { - fn from(err: FacetParseError) -> QueryParserError { - QueryParserError::FacetFormatError(err) - } + FacetFormatError(#[from] FacetParseError), } /// Recursively remove empty clause from the AST @@ -182,6 +162,7 @@ pub struct QueryParser { conjunction_by_default: bool, tokenizer_manager: TokenizerManager, boost: HashMap, + field_names: BTreeSet, } fn all_negative(ast: &LogicalAst) -> bool { @@ -203,15 +184,43 @@ impl QueryParser { default_fields: Vec, tokenizer_manager: TokenizerManager, ) -> QueryParser { + let field_names = schema + .fields() + .map(|(_, field_entry)| field_entry.name().to_string()) + .collect(); QueryParser { schema, default_fields, tokenizer_manager, conjunction_by_default: false, boost: Default::default(), + field_names, } } + // Splits a full_path as written in a query, into a field name and a + // json path. + pub(crate) fn split_full_path<'a>(&self, full_path: &'a str) -> (&'a str, &'a str) { + if full_path.is_empty() { + return ("", ""); + } + if self.field_names.contains(full_path) { + return (full_path, ""); + } + let mut result = ("", full_path); + let mut cursor = 0; + while let Some(pos) = full_path[cursor..].find('.') { + cursor += pos; + let prefix = &full_path[..cursor]; + let suffix = &full_path[cursor + 1..]; + if self.field_names.contains(prefix) { + result = (prefix, suffix); + } + cursor += 1; + } + result + } + /// Creates a `QueryParser`, given /// * an index /// * a set of default - fields used to search if no field is specifically defined @@ -284,93 +293,85 @@ impl QueryParser { Ok(ast) } - fn compute_terms_for_string( + fn compute_boundary_term( &self, field: Field, + json_path: &str, phrase: &str, - ) -> Result, QueryParserError> { + ) -> Result { let field_entry = self.schema.get_field_entry(field); let field_type = field_entry.field_type(); if !field_type.is_indexed() { - let field_name = field_entry.name().to_string(); - return Err(QueryParserError::FieldNotIndexed(field_name)); + return Err(QueryParserError::FieldNotIndexed( + field_entry.name().to_string(), + )); + } + if !json_path.is_empty() && field_type.value_type() != Type::Json { + return Err(QueryParserError::UnsupportedQuery(format!( + "Json path is not supported for field {:?}", + field_entry.name() + ))); } match *field_type { + FieldType::U64(_) => { + let val: u64 = u64::from_str(phrase)?; + Ok(Term::from_field_u64(field, val)) + } FieldType::I64(_) => { let val: i64 = i64::from_str(phrase)?; - let term = Term::from_field_i64(field, val); - Ok(vec![(0, term)]) + Ok(Term::from_field_i64(field, val)) } FieldType::F64(_) => { let val: f64 = f64::from_str(phrase)?; - let term = Term::from_field_f64(field, val); - Ok(vec![(0, term)]) + Ok(Term::from_field_f64(field, val)) } - FieldType::Date(_) => match chrono::DateTime::parse_from_rfc3339(phrase) { - Ok(x) => Ok(vec![( - 0, - Term::from_field_date(field, &x.with_timezone(&chrono::Utc)), - )]), - Err(e) => Err(QueryParserError::DateFormatError(e)), - }, - FieldType::U64(_) => { - let val: u64 = 
u64::from_str(phrase)?;
-                let term = Term::from_field_u64(field, val);
-                Ok(vec![(0, term)])
+            FieldType::Date(_) => {
+                let dt = chrono::DateTime::parse_from_rfc3339(phrase)?;
+                Ok(Term::from_field_date(
+                    field,
+                    &dt.with_timezone(&chrono::Utc),
+                ))
             }
             FieldType::Str(ref str_options) => {
-                if let Some(option) = str_options.get_indexing_options() {
-                    let tokenizer =
-                        self.tokenizer_manager
-                            .get(option.tokenizer())
-                            .ok_or_else(|| {
-                                QueryParserError::UnknownTokenizer(
-                                    field_entry.name().to_string(),
-                                    option.tokenizer().to_string(),
-                                )
-                            })?;
-                    let mut terms: Vec<(usize, Term)> = Vec::new();
-                    let mut token_stream = tokenizer.token_stream(phrase);
-                    token_stream.process(&mut |token| {
-                        let term = Term::from_field_text(field, &token.text);
-                        terms.push((token.position, term));
-                    });
-                    if terms.is_empty() {
-                        Ok(vec![])
-                    } else if terms.len() == 1 {
-                        Ok(terms)
-                    } else {
-                        let field_entry = self.schema.get_field_entry(field);
-                        let field_type = field_entry.field_type();
-                        if let Some(index_record_option) = field_type.get_index_record_option() {
-                            if index_record_option.has_positions() {
-                                Ok(terms)
-                            } else {
-                                let fieldname = self.schema.get_field_name(field).to_string();
-                                Err(QueryParserError::FieldDoesNotHavePositionsIndexed(
-                                    fieldname,
-                                ))
-                            }
-                        } else {
-                            let fieldname = self.schema.get_field_name(field).to_string();
-                            Err(QueryParserError::FieldNotIndexed(fieldname))
-                        }
-                    }
-                } else {
+                let option = str_options.get_indexing_options().ok_or_else(|| {
                     // This should have been seen earlier really.
-                    Err(QueryParserError::FieldNotIndexed(
-                        field_entry.name().to_string(),
-                    ))
+                    QueryParserError::FieldNotIndexed(field_entry.name().to_string())
+                })?;
+                let text_analyzer =
+                    self.tokenizer_manager
+                        .get(option.tokenizer())
+                        .ok_or_else(|| {
+                            QueryParserError::UnknownTokenizer(
+                                field_entry.name().to_string(),
+                                option.tokenizer().to_string(),
+                            )
+                        })?;
+                let mut terms: Vec<Term> = Vec::new();
+                let mut token_stream = text_analyzer.token_stream(phrase);
+                token_stream.process(&mut |token| {
+                    let term = Term::from_field_text(field, &token.text);
+                    terms.push(term);
+                });
+                if terms.len() != 1 {
+                    return Err(QueryParserError::UnsupportedQuery(format!(
+                        "Range query boundary cannot have multiple tokens: {phrase:?}."
+                    )));
                 }
+                Ok(terms.into_iter().next().unwrap())
+            }
+            FieldType::JsonObject(_) => {
+                // Json ranges are not supported.
+                Err(QueryParserError::UnsupportedQuery(
+                    "Range queries are not supported on json fields.".to_string(),
+                ))
+            }
             FieldType::Facet(_) => match Facet::from_text(phrase) {
-                Ok(facet) => Ok(vec![(0, Term::from_facet(field, &facet))]),
+                Ok(facet) => Ok(Term::from_facet(field, &facet)),
                 Err(e) => Err(QueryParserError::from(e)),
             },
             FieldType::Bytes(_) => {
                 let bytes = base64::decode(phrase).map_err(QueryParserError::ExpectedBase64)?;
-                let term = Term::from_field_bytes(field, &bytes);
-                Ok(vec![(0, term)])
+                Ok(Term::from_field_bytes(field, &bytes))
             }
         }
     }
@@ -378,13 +379,97 @@ impl QueryParser {
     fn compute_logical_ast_for_leaf(
         &self,
         field: Field,
+        json_path: &str,
         phrase: &str,
-    ) -> Result<Option<LogicalLiteral>, QueryParserError> {
-        let terms = self.compute_terms_for_string(field, phrase)?;
-        match &terms[..]
{ - [] => Ok(None), - [(_, term)] => Ok(Some(LogicalLiteral::Term(term.clone()))), - _ => Ok(Some(LogicalLiteral::Phrase(terms.clone()))), + ) -> Result, QueryParserError> { + let field_entry = self.schema.get_field_entry(field); + let field_type = field_entry.field_type(); + let field_name = field_entry.name(); + if !field_type.is_indexed() { + return Err(QueryParserError::FieldNotIndexed(field_name.to_string())); + } + match *field_type { + FieldType::U64(_) => { + let val: u64 = u64::from_str(phrase)?; + let i64_term = Term::from_field_u64(field, val); + Ok(vec![LogicalLiteral::Term(i64_term)]) + } + FieldType::I64(_) => { + let val: i64 = i64::from_str(phrase)?; + let i64_term = Term::from_field_i64(field, val); + Ok(vec![LogicalLiteral::Term(i64_term)]) + } + FieldType::F64(_) => { + let val: f64 = f64::from_str(phrase)?; + let f64_term = Term::from_field_f64(field, val); + Ok(vec![LogicalLiteral::Term(f64_term)]) + } + FieldType::Date(_) => { + let dt = chrono::DateTime::parse_from_rfc3339(phrase)?; + let dt_term = Term::from_field_date(field, &dt.with_timezone(&chrono::Utc)); + Ok(vec![LogicalLiteral::Term(dt_term)]) + } + FieldType::Str(ref str_options) => { + let option = str_options.get_indexing_options().ok_or_else(|| { + // This should have been seen earlier really. + QueryParserError::FieldNotIndexed(field_name.to_string()) + })?; + let text_analyzer = + self.tokenizer_manager + .get(option.tokenizer()) + .ok_or_else(|| { + QueryParserError::UnknownTokenizer( + field_name.to_string(), + option.tokenizer().to_string(), + ) + })?; + let index_record_option = option.index_option(); + Ok(generate_literals_for_str( + field_name, + field, + phrase, + &text_analyzer, + index_record_option, + )? + .into_iter() + .collect()) + } + FieldType::JsonObject(ref json_options) => { + let option = json_options.get_text_indexing_options().ok_or_else(|| { + // This should have been seen earlier really. 
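+                    // (i.e. a json field without indexing options should have been
+                    // rejected before reaching this point.)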
+ QueryParserError::FieldNotIndexed(field_name.to_string()) + })?; + let text_analyzer = + self.tokenizer_manager + .get(option.tokenizer()) + .ok_or_else(|| { + QueryParserError::UnknownTokenizer( + field_name.to_string(), + option.tokenizer().to_string(), + ) + })?; + let index_record_option = option.index_option(); + generate_literals_for_json_object( + field_name, + field, + json_path, + phrase, + &text_analyzer, + index_record_option, + ) + } + FieldType::Facet(_) => match Facet::from_text(phrase) { + Ok(facet) => { + let facet_term = Term::from_facet(field, &facet); + Ok(vec![LogicalLiteral::Term(facet_term)]) + } + Err(e) => Err(QueryParserError::from(e)), + }, + FieldType::Bytes(_) => { + let bytes = base64::decode(phrase).map_err(QueryParserError::ExpectedBase64)?; + let bytes_term = Term::from_field_bytes(field, &bytes); + Ok(vec![LogicalLiteral::Term(bytes_term)]) + } } } @@ -399,16 +484,13 @@ impl QueryParser { fn resolve_bound( &self, field: Field, + json_path: &str, bound: &UserInputBound, ) -> Result, QueryParserError> { if bound.term_str() == "*" { return Ok(Bound::Unbounded); } - let terms = self.compute_terms_for_string(field, bound.term_str())?; - if terms.len() != 1 { - return Err(QueryParserError::RangeMustNotHavePhrase); - } - let (_, term) = terms.into_iter().next().unwrap(); + let term = self.compute_boundary_term(field, json_path, bound.term_str())?; match *bound { UserInputBound::Inclusive(_) => Ok(Bound::Included(term)), UserInputBound::Exclusive(_) => Ok(Bound::Excluded(term)), @@ -416,22 +498,6 @@ impl QueryParser { } } - fn resolved_fields( - &self, - given_field: &Option, - ) -> Result, QueryParserError> { - match *given_field { - None => { - if self.default_fields.is_empty() { - Err(QueryParserError::NoDefaultFieldDeclared) - } else { - Ok(Cow::from(&self.default_fields[..])) - } - } - Some(ref field) => Ok(Cow::from(vec![self.resolve_field_name(&*field)?])), - } - } - fn compute_logical_ast_with_occur( &self, user_input_ast: UserInputAst, @@ -459,31 +525,58 @@ impl QueryParser { self.boost.get(&field).cloned().unwrap_or(1.0) } + fn default_indexed_json_fields(&self) -> impl Iterator + '_ { + let schema = self.schema.clone(); + self.default_fields.iter().cloned().filter(move |field| { + let field_type = schema.get_field_entry(*field).field_type(); + field_type.value_type() == Type::Json && field_type.is_indexed() + }) + } + + fn compute_path_triplet_for_literal<'a>( + &self, + literal: &'a UserInputLiteral, + ) -> Result, QueryParserError> { + match &literal.field_name { + Some(ref full_path) => { + // We need to add terms associated to json default fields. 
+ let (field_name, path) = self.split_full_path(full_path); + if let Ok(field) = self.resolve_field_name(field_name) { + return Ok(vec![(field, path, literal.phrase.as_str())]); + } + let triplets: Vec<(Field, &str, &str)> = self + .default_indexed_json_fields() + .map(|json_field| (json_field, full_path.as_str(), literal.phrase.as_str())) + .collect(); + if triplets.is_empty() { + return Err(QueryParserError::FieldDoesNotExist(field_name.to_string())); + } + Ok(triplets) + } + None => { + if self.default_fields.is_empty() { + return Err(QueryParserError::NoDefaultFieldDeclared); + } + Ok(self + .default_fields + .iter() + .map(|default_field| (*default_field, "", literal.phrase.as_str())) + .collect::>()) + } + } + } + fn compute_logical_ast_from_leaf( &self, leaf: UserInputLeaf, ) -> Result { match leaf { UserInputLeaf::Literal(literal) => { - let term_phrases: Vec<(Field, String)> = match literal.field_name { - Some(ref field_name) => { - let field = self.resolve_field_name(field_name)?; - vec![(field, literal.phrase.clone())] - } - None => { - if self.default_fields.is_empty() { - return Err(QueryParserError::NoDefaultFieldDeclared); - } else { - self.default_fields - .iter() - .map(|default_field| (*default_field, literal.phrase.clone())) - .collect::>() - } - } - }; + let term_phrases: Vec<(Field, &str, &str)> = + self.compute_path_triplet_for_literal(&literal)?; let mut asts: Vec = Vec::new(); - for (field, phrase) in term_phrases { - if let Some(ast) = self.compute_logical_ast_for_leaf(field, &phrase)? { + for (field, json_path, phrase) in term_phrases { + for ast in self.compute_logical_ast_for_leaf(field, json_path, phrase)? { // Apply some field specific boost defined at the query parser level. let boost = self.field_boost(field); asts.push(LogicalAst::Leaf(Box::new(ast)).boost(boost)); @@ -498,37 +591,26 @@ impl QueryParser { } UserInputLeaf::All => Ok(LogicalAst::Leaf(Box::new(LogicalLiteral::All))), UserInputLeaf::Range { - field, + field: full_field_opt, lower, upper, } => { - let fields = self.resolved_fields(&field)?; - let mut clauses = fields - .iter() - .map(|&field| { - let boost = self.field_boost(field); - let field_entry = self.schema.get_field_entry(field); - let value_type = field_entry.field_type().value_type(); - let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Range { - field, - value_type, - lower: self.resolve_bound(field, &lower)?, - upper: self.resolve_bound(field, &upper)?, - })); - Ok(logical_ast.boost(boost)) - }) - .collect::, QueryParserError>>()?; - let result_ast = if clauses.len() == 1 { - clauses.pop().unwrap() - } else { - LogicalAst::Clause( - clauses - .into_iter() - .map(|clause| (Occur::Should, clause)) - .collect(), + let full_path = full_field_opt.ok_or_else(|| { + QueryParserError::UnsupportedQuery( + "Range query need to target a specific field.".to_string(), ) - }; - Ok(result_ast) + })?; + let (field_name, json_path) = self.split_full_path(&full_path); + let field = self.resolve_field_name(field_name)?; + let field_entry = self.schema.get_field_entry(field); + let value_type = field_entry.field_type().value_type(); + let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Range { + field, + value_type, + lower: self.resolve_bound(field, json_path, &lower)?, + upper: self.resolve_bound(field, json_path, &upper)?, + })); + Ok(logical_ast) } } } @@ -552,6 +634,115 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box { } } +fn generate_literals_for_str( + field_name: &str, + field: Field, + phrase: &str, + 
text_analyzer: &TextAnalyzer, + index_record_option: IndexRecordOption, +) -> Result, QueryParserError> { + let mut terms: Vec<(usize, Term)> = Vec::new(); + let mut token_stream = text_analyzer.token_stream(phrase); + token_stream.process(&mut |token| { + let term = Term::from_field_text(field, &token.text); + terms.push((token.position, term)); + }); + if terms.len() <= 1 { + let term_literal_opt = terms + .into_iter() + .next() + .map(|(_, term)| LogicalLiteral::Term(term)); + return Ok(term_literal_opt); + } + if !index_record_option.has_positions() { + return Err(QueryParserError::FieldDoesNotHavePositionsIndexed( + field_name.to_string(), + )); + } + Ok(Some(LogicalLiteral::Phrase(terms))) +} + +enum NumValue { + U64(u64), + I64(i64), + F64(f64), + DateTime(crate::DateTime), +} + +fn infer_type_num(phrase: &str) -> Option { + if let Ok(dt) = chrono::DateTime::parse_from_rfc3339(phrase) { + let dt_utc = dt.with_timezone(&chrono::Utc); + return Some(NumValue::DateTime(dt_utc)); + } + if let Ok(u64_val) = str::parse::(phrase) { + return Some(NumValue::U64(u64_val)); + } + if let Ok(i64_val) = str::parse::(phrase) { + return Some(NumValue::I64(i64_val)); + } + if let Ok(f64_val) = str::parse::(phrase) { + return Some(NumValue::F64(f64_val)); + } + None +} + +fn generate_literals_for_json_object( + field_name: &str, + field: Field, + json_path: &str, + phrase: &str, + text_analyzer: &TextAnalyzer, + index_record_option: IndexRecordOption, +) -> Result, QueryParserError> { + let mut logical_literals = Vec::new(); + let mut term = Term::new(); + term.set_field(Type::Json, field); + let mut json_term_writer = JsonTermWriter::wrap(&mut term); + for segment in json_path.split('.') { + json_term_writer.push_path_segment(segment); + } + if let Some(num_value) = infer_type_num(phrase) { + match num_value { + NumValue::U64(u64_val) => { + json_term_writer.set_fast_value(u64_val); + } + NumValue::I64(i64_val) => { + json_term_writer.set_fast_value(i64_val); + } + NumValue::F64(f64_val) => { + json_term_writer.set_fast_value(f64_val); + } + NumValue::DateTime(dt_val) => { + json_term_writer.set_fast_value(dt_val); + } + } + logical_literals.push(LogicalLiteral::Term(json_term_writer.term().clone())); + } + json_term_writer.close_path_and_set_type(Type::Str); + drop(json_term_writer); + let term_num_bytes = term.as_slice().len(); + let mut token_stream = text_analyzer.token_stream(phrase); + let mut terms: Vec<(usize, Term)> = Vec::new(); + token_stream.process(&mut |token| { + term.truncate(term_num_bytes); + term.append_bytes(token.text.as_bytes()); + terms.push((token.position, term.clone())); + }); + if terms.len() <= 1 { + for (_, term) in terms { + logical_literals.push(LogicalLiteral::Term(term)); + } + return Ok(logical_literals); + } + if !index_record_option.has_positions() { + return Err(QueryParserError::FieldDoesNotHavePositionsIndexed( + field_name.to_string(), + )); + } + logical_literals.push(LogicalLiteral::Phrase(terms)); + Ok(logical_literals) +} + fn convert_to_query(logical_ast: LogicalAst) -> Box { match trim_ast(logical_ast) { Some(LogicalAst::Clause(trimmed_clause)) => { @@ -615,13 +806,15 @@ mod test { schema_builder.add_facet_field("facet", FacetOptions::default()); schema_builder.add_bytes_field("bytes", INDEXED); schema_builder.add_bytes_field("bytes_not_indexed", STORED); + schema_builder.add_json_field("json", TEXT); + schema_builder.add_json_field("json_not_indexed", STORED); schema_builder.build() } - fn make_query_parser() -> QueryParser { + fn 
make_query_parser_with_default_fields(default_fields: &[&'static str]) -> QueryParser { let schema = make_schema(); - let default_fields: Vec = vec!["title", "text"] - .into_iter() + let default_fields: Vec = default_fields + .iter() .flat_map(|field_name| schema.get_field(field_name)) .collect(); let tokenizer_manager = TokenizerManager::default(); @@ -634,6 +827,10 @@ mod test { QueryParser::new(schema, default_fields, tokenizer_manager) } + fn make_query_parser() -> QueryParser { + make_query_parser_with_default_fields(&["title", "text"]) + } + fn parse_query_to_logical_ast( query: &str, default_conjunction: bool, @@ -661,7 +858,7 @@ mod test { let query = query_parser.parse_query("facet:/root/branch/leaf").unwrap(); assert_eq!( format!("{:?}", query), - r#"TermQuery(Term(type=Facet, field=11, val="/root/branch/leaf"))"# + r#"TermQuery(Term(type=Facet, field=11, "/root/branch/leaf"))"# ); } @@ -674,21 +871,17 @@ mod test { let query = query_parser.parse_query("text:hello").unwrap(); assert_eq!( format!("{:?}", query), - r#"Boost(query=TermQuery(Term(type=Str, field=1, val="hello")), boost=2)"# + r#"Boost(query=TermQuery(Term(type=Str, field=1, "hello")), boost=2)"# ); } #[test] pub fn test_parse_query_range_with_boost() { - let mut query_parser = make_query_parser(); - let schema = make_schema(); - let title_field = schema.get_field("title").unwrap(); - query_parser.set_field_boost(title_field, 2.0); - let query = query_parser.parse_query("title:[A TO B]").unwrap(); + let query = make_query_parser().parse_query("title:[A TO B]").unwrap(); assert_eq!( format!("{:?}", query), - "Boost(query=RangeQuery { field: Field(0), value_type: Str, left_bound: \ - Included([97]), right_bound: Included([98]) }, boost=2)" + "RangeQuery { field: Field(0), value_type: Str, left_bound: Included([97]), \ + right_bound: Included([98]) }" ); } @@ -701,7 +894,7 @@ mod test { let query = query_parser.parse_query("text:hello^2").unwrap(); assert_eq!( format!("{:?}", query), - r#"Boost(query=Boost(query=TermQuery(Term(type=Str, field=1, val="hello")), boost=2), boost=2)"# + r#"Boost(query=Boost(query=TermQuery(Term(type=Str, field=1, "hello")), boost=2), boost=2)"# ); } @@ -736,7 +929,7 @@ mod test { pub fn test_parse_query_untokenized() { test_parse_query_to_logical_ast_helper( "nottokenized:\"wordone wordtwo\"", - r#"Term(type=Str, field=7, val="wordone wordtwo")"#, + r#"Term(type=Str, field=7, "wordone wordtwo")"#, false, ); } @@ -779,7 +972,7 @@ mod test { .is_ok()); test_parse_query_to_logical_ast_helper( "unsigned:2324", - "Term(type=U64, field=3, val=2324)", + "Term(type=U64, field=3, 2324)", false, ); @@ -806,7 +999,7 @@ mod test { fn test_parse_bytes() { test_parse_query_to_logical_ast_helper( "bytes:YnVidQ==", - "Term(type=Bytes, field=12, val=[98, 117, 98, 117])", + "Term(type=Bytes, field=12, [98, 117, 98, 117])", false, ); } @@ -817,11 +1010,100 @@ mod test { assert!(matches!(error, QueryParserError::FieldNotIndexed(_))); } + #[test] + fn test_json_field() { + test_parse_query_to_logical_ast_helper( + "json.titi:hello", + "Term(type=Json, field=14, path=titi, vtype=Str, \"hello\")", + false, + ); + } + + #[test] + fn test_json_field_possibly_a_number() { + test_parse_query_to_logical_ast_helper( + "json.titi:5", + r#"(Term(type=Json, field=14, path=titi, vtype=U64, 5) Term(type=Json, field=14, path=titi, vtype=Str, "5"))"#, + true, + ); + test_parse_query_to_logical_ast_helper( + "json.titi:-5", + r#"(Term(type=Json, field=14, path=titi, vtype=I64, -5) Term(type=Json, field=14, path=titi, vtype=Str, 
"5"))"#, //< Yes this is a bit weird after going through the tokenizer we lose the "-". + true, + ); + test_parse_query_to_logical_ast_helper( + "json.titi:-5.2", + r#"(Term(type=Json, field=14, path=titi, vtype=F64, -5.2) "[(0, Term(type=Json, field=14, path=titi, vtype=Str, "5")), (1, Term(type=Json, field=14, path=titi, vtype=Str, "2"))]")"#, + true, + ); + } + + #[test] + fn test_json_field_possibly_a_date() { + test_parse_query_to_logical_ast_helper( + r#"json.date:"2019-10-12T07:20:50.52Z""#, + r#"(Term(type=Json, field=14, path=date, vtype=Date, 2019-10-12T07:20:50Z) "[(0, Term(type=Json, field=14, path=date, vtype=Str, "2019")), (1, Term(type=Json, field=14, path=date, vtype=Str, "10")), (2, Term(type=Json, field=14, path=date, vtype=Str, "12t07")), (3, Term(type=Json, field=14, path=date, vtype=Str, "20")), (4, Term(type=Json, field=14, path=date, vtype=Str, "50")), (5, Term(type=Json, field=14, path=date, vtype=Str, "52z"))]")"#, + true, + ); + } + + #[test] + fn test_json_field_not_indexed() { + let error = parse_query_to_logical_ast("json_not_indexed.titi:hello", false).unwrap_err(); + assert!(matches!(error, QueryParserError::FieldNotIndexed(_))); + } + + fn test_query_to_logical_ast_with_default_json( + query: &str, + expected: &str, + default_conjunction: bool, + ) { + let mut query_parser = make_query_parser_with_default_fields(&["json"]); + if default_conjunction { + query_parser.set_conjunction_by_default(); + } + let ast = query_parser.parse_query_to_logical_ast(query).unwrap(); + let ast_str = format!("{ast:?}"); + assert_eq!(ast_str, expected); + } + + #[test] + fn test_json_default() { + test_query_to_logical_ast_with_default_json( + "titi:4", + "(Term(type=Json, field=14, path=titi, vtype=U64, 4) Term(type=Json, field=14, \ + path=titi, vtype=Str, \"4\"))", + false, + ); + } + + #[test] + fn test_json_default_with_different_field() { + for conjunction in [false, true] { + test_query_to_logical_ast_with_default_json( + "text:4", + r#"Term(type=Str, field=1, "4")"#, + conjunction, + ); + } + } + + #[test] + fn test_json_default_with_same_field() { + for conjunction in [false, true] { + test_query_to_logical_ast_with_default_json( + "json:4", + r#"(Term(type=Json, field=14, path=, vtype=U64, 4) Term(type=Json, field=14, path=, vtype=Str, "4"))"#, + conjunction, + ); + } + } + #[test] fn test_parse_bytes_phrase() { test_parse_query_to_logical_ast_helper( "bytes:\"YnVidQ==\"", - "Term(type=Bytes, field=12, val=[98, 117, 98, 117])", + "Term(type=Bytes, field=12, [98, 117, 98, 117])", false, ); } @@ -837,12 +1119,12 @@ mod test { fn test_parse_query_to_ast_ab_c() { test_parse_query_to_logical_ast_helper( "(+title:a +title:b) title:c", - r#"((+Term(type=Str, field=0, val="a") +Term(type=Str, field=0, val="b")) Term(type=Str, field=0, val="c"))"#, + r#"((+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b")) Term(type=Str, field=0, "c"))"#, false, ); test_parse_query_to_logical_ast_helper( "(+title:a +title:b) title:c", - r#"(+(+Term(type=Str, field=0, val="a") +Term(type=Str, field=0, val="b")) +Term(type=Str, field=0, val="c"))"#, + r#"(+(+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b")) +Term(type=Str, field=0, "c"))"#, true, ); } @@ -851,17 +1133,17 @@ mod test { pub fn test_parse_query_to_ast_single_term() { test_parse_query_to_logical_ast_helper( "title:toto", - r#"Term(type=Str, field=0, val="toto")"#, + r#"Term(type=Str, field=0, "toto")"#, false, ); test_parse_query_to_logical_ast_helper( "+title:toto", - r#"Term(type=Str, field=0, val="toto")"#, + 
r#"Term(type=Str, field=0, "toto")"#, false, ); test_parse_query_to_logical_ast_helper( "+title:toto -titi", - r#"(+Term(type=Str, field=0, val="toto") -(Term(type=Str, field=0, val="titi") Term(type=Str, field=1, val="titi")))"#, + r#"(+Term(type=Str, field=0, "toto") -(Term(type=Str, field=0, "titi") Term(type=Str, field=1, "titi")))"#, false, ); } @@ -878,54 +1160,62 @@ mod test { pub fn test_parse_query_to_ast_two_terms() { test_parse_query_to_logical_ast_helper( "title:a b", - r#"(Term(type=Str, field=0, val="a") (Term(type=Str, field=0, val="b") Term(type=Str, field=1, val="b")))"#, + r#"(Term(type=Str, field=0, "a") (Term(type=Str, field=0, "b") Term(type=Str, field=1, "b")))"#, false, ); test_parse_query_to_logical_ast_helper( r#"title:"a b""#, - r#""[(0, Term(type=Str, field=0, val="a")), (1, Term(type=Str, field=0, val="b"))]""#, + r#""[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]""#, false, ); } + #[test] + pub fn test_parse_query_all_query() { + let logical_ast = parse_query_to_logical_ast("*", false).unwrap(); + assert_eq!(format!("{logical_ast:?}"), "*"); + } + + #[test] + pub fn test_parse_query_range_require_a_target_field() { + let query_parser_error = parse_query_to_logical_ast("[A TO B]", false).err().unwrap(); + assert_eq!( + query_parser_error.to_string(), + "Unsupported query: Range query need to target a specific field." + ); + } #[test] pub fn test_parse_query_to_ast_ranges() { test_parse_query_to_logical_ast_helper( "title:[a TO b]", - r#"(Included(Term(type=Str, field=0, val="a")) TO Included(Term(type=Str, field=0, val="b")))"#, - false, - ); - test_parse_query_to_logical_ast_helper( - "[a TO b]", - r#"((Included(Term(type=Str, field=0, val="a")) TO Included(Term(type=Str, field=0, val="b"))) (Included(Term(type=Str, field=1, val="a")) TO Included(Term(type=Str, field=1, val="b"))))"#, + r#"(Included(Term(type=Str, field=0, "a")) TO Included(Term(type=Str, field=0, "b")))"#, false, ); test_parse_query_to_logical_ast_helper( "title:{titi TO toto}", - r#"(Excluded(Term(type=Str, field=0, val="titi")) TO Excluded(Term(type=Str, field=0, val="toto")))"#, + r#"(Excluded(Term(type=Str, field=0, "titi")) TO Excluded(Term(type=Str, field=0, "toto")))"#, false, ); test_parse_query_to_logical_ast_helper( "title:{* TO toto}", - r#"(Unbounded TO Excluded(Term(type=Str, field=0, val="toto")))"#, + r#"(Unbounded TO Excluded(Term(type=Str, field=0, "toto")))"#, false, ); test_parse_query_to_logical_ast_helper( "title:{titi TO *}", - r#"(Excluded(Term(type=Str, field=0, val="titi")) TO Unbounded)"#, + r#"(Excluded(Term(type=Str, field=0, "titi")) TO Unbounded)"#, false, ); test_parse_query_to_logical_ast_helper( "signed:{-5 TO 3}", - r#"(Excluded(Term(type=I64, field=2, val=-5)) TO Excluded(Term(type=I64, field=2, val=3)))"#, + r#"(Excluded(Term(type=I64, field=2, -5)) TO Excluded(Term(type=I64, field=2, 3)))"#, false, ); test_parse_query_to_logical_ast_helper( "float:{-1.5 TO 1.5}", - r#"(Excluded(Term(type=F64, field=10, val=-1.5)) TO Excluded(Term(type=F64, field=10, val=1.5)))"#, + r#"(Excluded(Term(type=F64, field=10, -1.5)) TO Excluded(Term(type=F64, field=10, 1.5)))"#, false, ); - test_parse_query_to_logical_ast_helper("*", "*", false); } #[test] @@ -1051,27 +1341,27 @@ mod test { pub fn test_parse_query_to_ast_conjunction() { test_parse_query_to_logical_ast_helper( "title:toto", - r#"Term(type=Str, field=0, val="toto")"#, + r#"Term(type=Str, field=0, "toto")"#, true, ); test_parse_query_to_logical_ast_helper( "+title:toto", - r#"Term(type=Str, 
field=0, val="toto")"#, + r#"Term(type=Str, field=0, "toto")"#, true, ); test_parse_query_to_logical_ast_helper( "+title:toto -titi", - r#"(+Term(type=Str, field=0, val="toto") -(Term(type=Str, field=0, val="titi") Term(type=Str, field=1, val="titi")))"#, + r#"(+Term(type=Str, field=0, "toto") -(Term(type=Str, field=0, "titi") Term(type=Str, field=1, "titi")))"#, true, ); test_parse_query_to_logical_ast_helper( "title:a b", - r#"(+Term(type=Str, field=0, val="a") +(Term(type=Str, field=0, val="b") Term(type=Str, field=1, val="b")))"#, + r#"(+Term(type=Str, field=0, "a") +(Term(type=Str, field=0, "b") Term(type=Str, field=1, "b")))"#, true, ); test_parse_query_to_logical_ast_helper( "title:\"a b\"", - r#""[(0, Term(type=Str, field=0, val="a")), (1, Term(type=Str, field=0, val="b"))]""#, + r#""[(0, Term(type=Str, field=0, "a")), (1, Term(type=Str, field=0, "b"))]""#, true, ); } @@ -1080,7 +1370,7 @@ mod test { pub fn test_query_parser_hyphen() { test_parse_query_to_logical_ast_helper( "title:www-form-encoded", - r#""[(0, Term(type=Str, field=0, val="www")), (1, Term(type=Str, field=0, val="form")), (2, Term(type=Str, field=0, val="encoded"))]""#, + r#""[(0, Term(type=Str, field=0, "www")), (1, Term(type=Str, field=0, "form")), (2, Term(type=Str, field=0, "encoded"))]""#, false, ); } @@ -1090,7 +1380,7 @@ mod test { for &default_conjunction in &[false, true] { test_parse_query_to_logical_ast_helper( "title:a AND title:b", - r#"(+Term(type=Str, field=0, val="a") +Term(type=Str, field=0, val="b"))"#, + r#"(+Term(type=Str, field=0, "a") +Term(type=Str, field=0, "b"))"#, default_conjunction, ); } @@ -1101,9 +1391,35 @@ mod test { for &default_conjunction in &[false, true] { test_parse_query_to_logical_ast_helper( "title:a OR title:b", - r#"(Term(type=Str, field=0, val="a") Term(type=Str, field=0, val="b"))"#, + r#"(Term(type=Str, field=0, "a") Term(type=Str, field=0, "b"))"#, default_conjunction, ); } } + + #[test] + fn test_split_full_path() { + let mut schema_builder = Schema::builder(); + schema_builder.add_text_field("second", STRING); + schema_builder.add_text_field("first", STRING); + schema_builder.add_text_field("first.toto", STRING); + schema_builder.add_text_field("third.a.b.c", STRING); + let schema = schema_builder.build(); + let query_parser = QueryParser::new(schema, Vec::new(), TokenizerManager::default()); + assert_eq!( + query_parser.split_full_path("first.toto"), + ("first.toto", "") + ); + assert_eq!( + query_parser.split_full_path("first.titi"), + ("first", "titi") + ); + assert_eq!(query_parser.split_full_path("third"), ("", "third")); + assert_eq!( + query_parser.split_full_path("hello.toto"), + ("", "hello.toto") + ); + assert_eq!(query_parser.split_full_path(""), ("", "")); + assert_eq!(query_parser.split_full_path("firsty"), ("", "firsty")); + } } diff --git a/src/query/term_query/mod.rs b/src/query/term_query/mod.rs index f997cdc51..b6c106721 100644 --- a/src/query/term_query/mod.rs +++ b/src/query/term_query/mod.rs @@ -174,7 +174,7 @@ mod tests { ); assert_eq!( format!("{:?}", term_query), - r#"TermQuery(Term(type=Str, field=1, val="hello"))"# + r#"TermQuery(Term(type=Str, field=1, "hello"))"# ); } diff --git a/src/schema/document.rs b/src/schema/document.rs index 2833e27d7..c9da05b53 100644 --- a/src/schema/document.rs +++ b/src/schema/document.rs @@ -117,7 +117,16 @@ impl Document { /// Add a bytes field pub fn add_bytes>>(&mut self, field: Field, value: T) { - self.add_field_value(field, value.into()) + self.add_field_value(field, value.into()); + } + + /// Add a 
json object field
+    pub fn add_json_object(
+        &mut self,
+        field: Field,
+        json_object: serde_json::Map<String, serde_json::Value>,
+    ) {
+        self.add_field_value(field, json_object);
+    }

     /// Add a (field, value) to the document.
diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs
index 1e05b3b4f..81a4aa917 100644
--- a/src/schema/field_entry.rs
+++ b/src/schema/field_entry.rs
@@ -1,7 +1,9 @@
 use serde::{Deserialize, Serialize};

 use crate::schema::bytes_options::BytesOptions;
-use crate::schema::{is_valid_field_name, FacetOptions, FieldType, NumericOptions, TextOptions};
+use crate::schema::{
+    is_valid_field_name, FacetOptions, FieldType, JsonObjectOptions, NumericOptions, TextOptions,
+};

 /// A `FieldEntry` represents a field and its configuration.
 /// A `Schema` is a collection of `FieldEntry`.
@@ -27,71 +29,44 @@ impl FieldEntry {
         }
     }

-    /// Creates a new u64 field entry in the schema, given
-    /// a name, and some options.
+    /// Creates a new text field entry.
     pub fn new_text(field_name: String, text_options: TextOptions) -> FieldEntry {
-        assert!(is_valid_field_name(&field_name));
-        FieldEntry {
-            name: field_name,
-            field_type: FieldType::Str(text_options),
-        }
+        Self::new(field_name, FieldType::Str(text_options))
     }

-    /// Creates a new u64 field entry in the schema, given
-    /// a name, and some options.
-    pub fn new_u64(field_name: String, field_type: NumericOptions) -> FieldEntry {
-        assert!(is_valid_field_name(&field_name));
-        FieldEntry {
-            name: field_name,
-            field_type: FieldType::U64(field_type),
-        }
+    /// Creates a new u64 field entry.
+    pub fn new_u64(field_name: String, int_options: NumericOptions) -> FieldEntry {
+        Self::new(field_name, FieldType::U64(int_options))
     }

-    /// Creates a new i64 field entry in the schema, given
-    /// a name, and some options.
-    pub fn new_i64(field_name: String, field_type: NumericOptions) -> FieldEntry {
-        assert!(is_valid_field_name(&field_name));
-        FieldEntry {
-            name: field_name,
-            field_type: FieldType::I64(field_type),
-        }
+    /// Creates a new i64 field entry.
+    pub fn new_i64(field_name: String, int_options: NumericOptions) -> FieldEntry {
+        Self::new(field_name, FieldType::I64(int_options))
     }

-    /// Creates a new f64 field entry in the schema, given
-    /// a name, and some options.
-    pub fn new_f64(field_name: String, field_type: NumericOptions) -> FieldEntry {
-        assert!(is_valid_field_name(&field_name));
-        FieldEntry {
-            name: field_name,
-            field_type: FieldType::F64(field_type),
-        }
+    /// Creates a new f64 field entry.
+    pub fn new_f64(field_name: String, f64_options: NumericOptions) -> FieldEntry {
+        Self::new(field_name, FieldType::F64(f64_options))
     }

-    /// Creates a new date field entry in the schema, given
-    /// a name, and some options.
-    pub fn new_date(field_name: String, field_type: NumericOptions) -> FieldEntry {
-        assert!(is_valid_field_name(&field_name));
-        FieldEntry {
-            name: field_name,
-            field_type: FieldType::Date(field_type),
-        }
+    /// Creates a new date field entry.
+    pub fn new_date(field_name: String, date_options: NumericOptions) -> FieldEntry {
+        Self::new(field_name, FieldType::Date(date_options))
     }

     /// Creates a field entry for a facet.
- pub fn new_facet(field_name: String, field_type: FacetOptions) -> FieldEntry { - assert!(is_valid_field_name(&field_name)); - FieldEntry { - name: field_name, - field_type: FieldType::Facet(field_type), - } + pub fn new_facet(field_name: String, facet_options: FacetOptions) -> FieldEntry { + Self::new(field_name, FieldType::Facet(facet_options)) } /// Creates a field entry for a bytes field - pub fn new_bytes(field_name: String, bytes_type: BytesOptions) -> FieldEntry { - FieldEntry { - name: field_name, - field_type: FieldType::Bytes(bytes_type), - } + pub fn new_bytes(field_name: String, bytes_options: BytesOptions) -> FieldEntry { + Self::new(field_name, FieldType::Bytes(bytes_options)) + } + + /// Creates a field entry for a json field + pub fn new_json(field_name: String, json_object_options: JsonObjectOptions) -> FieldEntry { + Self::new(field_name, FieldType::JsonObject(json_object_options)) } /// Returns the name of the field @@ -137,6 +112,7 @@ impl FieldEntry { FieldType::Str(ref options) => options.is_stored(), FieldType::Facet(ref options) => options.is_stored(), FieldType::Bytes(ref options) => options.is_stored(), + FieldType::JsonObject(ref options) => options.is_stored(), } } } diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index dd962b48e..22ae8180f 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -6,7 +6,8 @@ use thiserror::Error; use crate::schema::bytes_options::BytesOptions; use crate::schema::facet_options::FacetOptions; use crate::schema::{ - Facet, IndexRecordOption, NumericOptions, TextFieldIndexing, TextOptions, Value, + Facet, IndexRecordOption, JsonObjectOptions, NumericOptions, TextFieldIndexing, TextOptions, + Value, }; use crate::tokenizer::PreTokenizedString; @@ -49,9 +50,11 @@ pub enum Type { Facet = b'h', /// `Vec` Bytes = b'b', + /// Leaf in a Json object. 
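+    /// (the type code records the type of the json leaf value; the path leading to
+    /// that leaf is stored in the term's value bytes.)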
+ Json = b'j', } -const ALL_TYPES: [Type; 7] = [ +const ALL_TYPES: [Type; 8] = [ Type::Str, Type::U64, Type::I64, @@ -59,6 +62,7 @@ const ALL_TYPES: [Type; 7] = [ Type::Date, Type::Facet, Type::Bytes, + Type::Json, ]; impl Type { @@ -83,6 +87,7 @@ impl Type { Type::Date => "Date", Type::Facet => "Facet", Type::Bytes => "Bytes", + Type::Json => "Json", } } @@ -97,6 +102,7 @@ impl Type { b'd' => Some(Type::Date), b'h' => Some(Type::Facet), b'b' => Some(Type::Bytes), + b'j' => Some(Type::Json), _ => None, } } @@ -123,6 +129,8 @@ pub enum FieldType { Facet(FacetOptions), /// Bytes (one per document) Bytes(BytesOptions), + /// Json object + JsonObject(JsonObjectOptions), } impl FieldType { @@ -136,6 +144,7 @@ impl FieldType { FieldType::Date(_) => Type::Date, FieldType::Facet(_) => Type::Facet, FieldType::Bytes(_) => Type::Bytes, + FieldType::JsonObject(_) => Type::Json, } } @@ -149,6 +158,7 @@ impl FieldType { FieldType::Date(ref date_options) => date_options.is_indexed(), FieldType::Facet(ref _facet_options) => true, FieldType::Bytes(ref bytes_options) => bytes_options.is_indexed(), + FieldType::JsonObject(ref json_object_options) => json_object_options.is_indexed(), } } @@ -160,6 +170,9 @@ impl FieldType { FieldType::Str(text_options) => text_options .get_indexing_options() .map(|text_indexing| text_indexing.index_option()), + FieldType::JsonObject(json_object_options) => json_object_options + .get_text_indexing_options() + .map(|text_indexing| text_indexing.index_option()), field_type => { if field_type.is_indexed() { Some(IndexRecordOption::Basic) @@ -183,6 +196,7 @@ impl FieldType { | FieldType::Date(ref int_options) => int_options.fieldnorms(), FieldType::Facet(_) => false, FieldType::Bytes(ref bytes_options) => bytes_options.fieldnorms(), + FieldType::JsonObject(ref _json_object_options) => false, } } @@ -217,6 +231,9 @@ impl FieldType { None } } + FieldType::JsonObject(ref json_obj_options) => json_obj_options + .get_text_indexing_options() + .map(TextFieldIndexing::index_option), } } @@ -225,41 +242,43 @@ impl FieldType { /// Tantivy will not try to cast values. /// For instance, If the json value is the integer `3` and the /// target field is a `Str`, this method will return an Error. 
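 /// A json object value, conversely, is accepted by a json object field (or by a
 /// text field when it parses as a pre-tokenized string).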
- pub fn value_from_json(&self, json: &JsonValue) -> Result { - match *json { - JsonValue::String(ref field_text) => match *self { + pub fn value_from_json(&self, json: JsonValue) -> Result { + match json { + JsonValue::String(field_text) => match *self { FieldType::Date(_) => { let dt_with_fixed_tz: chrono::DateTime = - chrono::DateTime::parse_from_rfc3339(field_text).map_err(|_err| { + chrono::DateTime::parse_from_rfc3339(&field_text).map_err(|_err| { ValueParsingError::TypeError { expected: "rfc3339 format", - json: json.clone(), + json: JsonValue::String(field_text), } })?; Ok(Value::Date(dt_with_fixed_tz.with_timezone(&Utc))) } - FieldType::Str(_) => Ok(Value::Str(field_text.clone())), + FieldType::Str(_) => Ok(Value::Str(field_text)), FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => { Err(ValueParsingError::TypeError { expected: "an integer", - json: json.clone(), + json: JsonValue::String(field_text), }) } - FieldType::Facet(_) => Ok(Value::Facet(Facet::from(field_text))), - FieldType::Bytes(_) => base64::decode(field_text).map(Value::Bytes).map_err(|_| { - ValueParsingError::InvalidBase64 { - base64: field_text.clone(), - } + FieldType::Facet(_) => Ok(Value::Facet(Facet::from(&field_text))), + FieldType::Bytes(_) => base64::decode(&field_text) + .map(Value::Bytes) + .map_err(|_| ValueParsingError::InvalidBase64 { base64: field_text }), + FieldType::JsonObject(_) => Err(ValueParsingError::TypeError { + expected: "a json object", + json: JsonValue::String(field_text), }), }, - JsonValue::Number(ref field_val_num) => match *self { + JsonValue::Number(field_val_num) => match self { FieldType::I64(_) | FieldType::Date(_) => { if let Some(field_val_i64) = field_val_num.as_i64() { Ok(Value::I64(field_val_i64)) } else { Err(ValueParsingError::OverflowError { expected: "an i64 int", - json: json.clone(), + json: JsonValue::Number(field_val_num), }) } } @@ -269,7 +288,7 @@ impl FieldType { } else { Err(ValueParsingError::OverflowError { expected: "u64", - json: json.clone(), + json: JsonValue::Number(field_val_num), }) } } @@ -279,33 +298,38 @@ impl FieldType { } else { Err(ValueParsingError::OverflowError { expected: "a f64", - json: json.clone(), + json: JsonValue::Number(field_val_num), }) } } FieldType::Str(_) | FieldType::Facet(_) | FieldType::Bytes(_) => { Err(ValueParsingError::TypeError { expected: "a string", - json: json.clone(), + json: JsonValue::Number(field_val_num), }) } + FieldType::JsonObject(_) => Err(ValueParsingError::TypeError { + expected: "a json object", + json: JsonValue::Number(field_val_num), + }), }, - JsonValue::Object(_) => match *self { + JsonValue::Object(json_map) => match self { FieldType::Str(_) => { - if let Ok(tok_str_val) = - serde_json::from_value::(json.clone()) - { + if let Ok(tok_str_val) = serde_json::from_value::( + serde_json::Value::Object(json_map.clone()), + ) { Ok(Value::PreTokStr(tok_str_val)) } else { Err(ValueParsingError::TypeError { expected: "a string or an pretokenized string", - json: json.clone(), + json: JsonValue::Object(json_map), }) } } + FieldType::JsonObject(_) => Ok(Value::JsonObject(json_map)), _ => Err(ValueParsingError::TypeError { expected: self.value_type().name(), - json: json.clone(), + json: JsonValue::Object(json_map), }), }, _ => Err(ValueParsingError::TypeError { @@ -319,6 +343,7 @@ impl FieldType { #[cfg(test)] mod tests { use chrono::{NaiveDate, NaiveDateTime, NaiveTime, Utc}; + use serde_json::json; use super::FieldType; use crate::schema::field_type::ValueParsingError; @@ -354,17 +379,17 @@ mod tests 
{ #[test] fn test_bytes_value_from_json() { let result = FieldType::Bytes(Default::default()) - .value_from_json(&json!("dGhpcyBpcyBhIHRlc3Q=")) + .value_from_json(json!("dGhpcyBpcyBhIHRlc3Q=")) .unwrap(); assert_eq!(result, Value::Bytes("this is a test".as_bytes().to_vec())); - let result = FieldType::Bytes(Default::default()).value_from_json(&json!(521)); + let result = FieldType::Bytes(Default::default()).value_from_json(json!(521)); match result { Err(ValueParsingError::TypeError { .. }) => {} _ => panic!("Expected parse failure for wrong type"), } - let result = FieldType::Bytes(Default::default()).value_from_json(&json!("-")); + let result = FieldType::Bytes(Default::default()).value_from_json(json!("-")); match result { Err(ValueParsingError::InvalidBase64 { .. }) => {} _ => panic!("Expected parse failure for invalid base64"), @@ -428,7 +453,7 @@ mod tests { }); let deserialized_value = FieldType::Str(TextOptions::default()) - .value_from_json(&serde_json::from_str(pre_tokenized_string_json).unwrap()) + .value_from_json(serde_json::from_str(pre_tokenized_string_json).unwrap()) .unwrap(); assert_eq!(deserialized_value, expected_value); diff --git a/src/schema/json_object_options.rs b/src/schema/json_object_options.rs new file mode 100644 index 000000000..2a0795c12 --- /dev/null +++ b/src/schema/json_object_options.rs @@ -0,0 +1,109 @@ +use std::ops::BitOr; + +use serde::{Deserialize, Serialize}; + +use crate::schema::flags::{SchemaFlagList, StoredFlag}; +use crate::schema::{TextFieldIndexing, TextOptions}; + +/// The `JsonObjectOptions` make it possible to +/// configure how a json object field should be indexed and stored. +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct JsonObjectOptions { + stored: bool, + // If set to some, int, date, f64 and text will be indexed. + // Text will use the TextFieldIndexing setting for indexing. + indexing: Option, +} + +impl JsonObjectOptions { + /// Returns `true` iff the json object should be stored. + pub fn is_stored(&self) -> bool { + self.stored + } + + /// Returns `true` iff the json object should be indexed. + pub fn is_indexed(&self) -> bool { + self.indexing.is_some() + } + + /// Returns the text indexing options. + /// + /// If set to `Some` then both int and str values will be indexed. + /// The inner `TextFieldIndexing` will however, only apply to the str values + /// in the json object. 
+ pub fn get_text_indexing_options(&self) -> Option<&TextFieldIndexing> { + self.indexing.as_ref() + } +} + +impl From for JsonObjectOptions { + fn from(_stored_flag: StoredFlag) -> Self { + JsonObjectOptions { + stored: true, + indexing: None, + } + } +} + +impl From<()> for JsonObjectOptions { + fn from(_: ()) -> Self { + Self::default() + } +} + +impl> BitOr for JsonObjectOptions { + type Output = JsonObjectOptions; + + fn bitor(self, other: T) -> Self { + let other = other.into(); + JsonObjectOptions { + indexing: self.indexing.or(other.indexing), + stored: self.stored | other.stored, + } + } +} + +impl From> for JsonObjectOptions +where + Head: Clone, + Tail: Clone, + Self: BitOr + From + From, +{ + fn from(head_tail: SchemaFlagList) -> Self { + Self::from(head_tail.head) | Self::from(head_tail.tail) + } +} + +impl From for JsonObjectOptions { + fn from(text_options: TextOptions) -> Self { + JsonObjectOptions { + stored: text_options.is_stored(), + indexing: text_options.get_indexing_options().cloned(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::schema::{STORED, TEXT}; + + #[test] + fn test_json_options() { + { + let json_options: JsonObjectOptions = (STORED | TEXT).into(); + assert!(json_options.is_stored()); + assert!(json_options.is_indexed()); + } + { + let json_options: JsonObjectOptions = TEXT.into(); + assert!(!json_options.is_stored()); + assert!(json_options.is_indexed()); + } + { + let json_options: JsonObjectOptions = STORED.into(); + assert!(json_options.is_stored()); + assert!(!json_options.is_indexed()); + } + } +} diff --git a/src/schema/mod.rs b/src/schema/mod.rs index 92c0b6bc6..f141d32b8 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -104,7 +104,7 @@ mod document; mod facet; mod facet_options; mod schema; -mod term; +pub(crate) mod term; mod field_entry; mod field_type; @@ -112,14 +112,14 @@ mod field_value; mod bytes_options; mod field; +mod flags; mod index_record_option; +mod json_object_options; mod named_field_document; mod numeric_options; mod text_options; mod value; -mod flags; - pub use self::bytes_options::BytesOptions; pub use self::document::Document; pub(crate) use self::facet::FACET_SEP_BYTE; @@ -131,9 +131,11 @@ pub use self::field_type::{FieldType, Type}; pub use self::field_value::FieldValue; pub use self::flags::{FAST, INDEXED, STORED}; pub use self::index_record_option::IndexRecordOption; +pub use self::json_object_options::JsonObjectOptions; pub use self::named_field_document::NamedFieldDocument; +pub use self::numeric_options::NumericOptions; #[allow(deprecated)] -pub use self::numeric_options::{Cardinality, IntOptions, NumericOptions}; +pub use self::numeric_options::{Cardinality, IntOptions}; pub use self::schema::{DocParsingError, Schema, SchemaBuilder}; pub use self::term::Term; pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT}; diff --git a/src/schema/schema.rs b/src/schema/schema.rs index ef9d978f8..ff38a6b67 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -5,7 +5,7 @@ use std::sync::Arc; use serde::de::{SeqAccess, Visitor}; use serde::ser::SerializeSeq; use serde::{Deserialize, Deserializer, Serialize, Serializer}; -use serde_json::{self, Map as JsonObject, Value as JsonValue}; +use serde_json::{self, Value as JsonValue}; use super::*; use crate::schema::bytes_options::BytesOptions; @@ -173,6 +173,16 @@ impl SchemaBuilder { self.add_field(field_entry) } + /// Adds a json object field to the schema. 
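+    ///
+    /// e.g. `let attrs = schema_builder.add_json_field("attributes", STORED | TEXT);`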
diff --git a/src/schema/schema.rs b/src/schema/schema.rs
index ef9d978f8..ff38a6b67 100644
--- a/src/schema/schema.rs
+++ b/src/schema/schema.rs
@@ -5,7 +5,7 @@ use std::sync::Arc;
 use serde::de::{SeqAccess, Visitor};
 use serde::ser::SerializeSeq;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
-use serde_json::{self, Map as JsonObject, Value as JsonValue};
+use serde_json::{self, Value as JsonValue};
 
 use super::*;
 use crate::schema::bytes_options::BytesOptions;
@@ -173,6 +183,16 @@ impl SchemaBuilder {
         self.add_field(field_entry)
     }
 
+    /// Adds a json object field to the schema.
+    pub fn add_json_field<T: Into<JsonObjectOptions>>(
+        &mut self,
+        field_name: &str,
+        field_options: T,
+    ) -> Field {
+        let field_entry = FieldEntry::new_json(field_name.to_string(), field_options.into());
+        self.add_field(field_entry)
+    }
+
     /// Adds a field entry to the schema in build.
     pub fn add_field(&mut self, field_entry: FieldEntry) -> Field {
         let field = Field::from_field_id(self.fields.len() as u32);
@@ -298,23 +308,23 @@ impl Schema {
 
     /// Build a document object from a json-object.
     pub fn parse_document(&self, doc_json: &str) -> Result<Document, DocParsingError> {
-        let json_obj: JsonObject =
-            serde_json::from_str(doc_json).map_err(|_| {
-                let doc_json_sample: String = if doc_json.len() < 20 {
-                    String::from(doc_json)
-                } else {
-                    format!("{:?}...", &doc_json[0..20])
-                };
-                DocParsingError::NotJson(doc_json_sample)
-            })?;
+        let json_obj: serde_json::Map<String, JsonValue> =
+            serde_json::from_str(doc_json).map_err(|_| DocParsingError::invalid_json(doc_json))?;
+        self.json_object_to_doc(json_obj)
+    }
 
+    /// Build a document object from a json-object.
+    pub fn json_object_to_doc(
+        &self,
+        json_obj: serde_json::Map<String, JsonValue>,
+    ) -> Result<Document, DocParsingError> {
         let mut doc = Document::default();
-        for (field_name, json_value) in json_obj.iter() {
-            if let Some(field) = self.get_field(field_name) {
+        for (field_name, json_value) in json_obj {
+            if let Some(field) = self.get_field(&field_name) {
                 let field_entry = self.get_field_entry(field);
                 let field_type = field_entry.field_type();
-                match *json_value {
-                    JsonValue::Array(ref json_items) => {
+                match json_value {
+                    JsonValue::Array(json_items) => {
                         for json_item in json_items {
                             let value = field_type
                                 .value_from_json(json_item)
@@ -383,12 +393,24 @@ impl<'de> Deserialize<'de> for Schema {
 pub enum DocParsingError {
     /// The payload given is not valid JSON.
     #[error("The provided string is not valid JSON")]
-    NotJson(String),
+    InvalidJson(String),
     /// One of the value nodes could not be parsed.
     #[error("The field '{0:?}' could not be parsed: {1:?}")]
     ValueError(String, ValueParsingError),
 }
 
+impl DocParsingError {
+    /// Builds an InvalidJson DocParsingError, keeping only a short
+    /// sample of the offending payload.
+    fn invalid_json(invalid_json: &str) -> Self {
+        let sample_json: String = if invalid_json.len() < 20 {
+            invalid_json.to_string()
+        } else {
+            format!("{:?}...", &invalid_json[0..20])
+        };
+        DocParsingError::InvalidJson(sample_json)
+    }
+}
+
 #[cfg(test)]
 mod tests {
 
@@ -399,7 +421,7 @@ mod tests {
 
     use crate::schema::field_type::ValueParsingError;
     use crate::schema::numeric_options::Cardinality::SingleValue;
-    use crate::schema::schema::DocParsingError::NotJson;
+    use crate::schema::schema::DocParsingError::InvalidJson;
     use crate::schema::*;
 
     #[test]
@@ -732,7 +754,7 @@ mod tests {
             "count": 50,
         }"#,
         );
-        assert_matches!(json_err, Err(NotJson(_)));
+        assert_matches!(json_err, Err(InvalidJson(_)));
     }
 }
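The hunks above split the old `parse_document` in two: it now only deserializes the payload and hands the map to `json_object_to_doc`. A hedged sketch of the resulting behavior with a json field (field name and values are illustrative):

```rust
use tantivy::schema::{SchemaBuilder, STORED, TEXT};

fn main() {
    let mut schema_builder = SchemaBuilder::new();
    schema_builder.add_json_field("attributes", STORED | TEXT);
    let schema = schema_builder.build();

    // Keys under "attributes" need no declaration: the whole object
    // is parsed into a single JsonObject value on the field.
    let doc = schema
        .parse_document(r#"{"attributes": {"color": "red", "width": 3}}"#)
        .unwrap();
    assert_eq!(doc.field_values().len(), 1);

    // A payload that is not a json object now surfaces as InvalidJson.
    assert!(schema.parse_document("not json").is_err());
}
```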
diff --git a/src/schema/term.rs b/src/schema/term.rs
index c6e3cfbb0..d9e36f59a 100644
--- a/src/schema/term.rs
+++ b/src/schema/term.rs
@@ -1,3 +1,4 @@
+use std::convert::TryInto;
 use std::hash::{Hash, Hasher};
 use std::{fmt, str};
@@ -8,8 +9,26 @@ use crate::DateTime;
 /// Size (in bytes) of the buffer of a fast value (u64, i64, f64, or date) term.
 ///
+/// <field> <type_code> <value>
+///
+/// - <field> is a big endian encoded u32 field id
+/// - <type_code>'s most significant bit expresses whether the term is a json term or not.
+/// The remaining 7 bits are used to encode the type of the value.
+/// If this is a JSON term, the type is the type of the leaf of the json.
+///
+/// - <value> is, if this is not a json term, a binary representation specific to the type.
+/// If it is a JSON term, then it is prepended with the path that leads to this leaf value.
 const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8;
 
+/// Separates the different segments of
+/// the json path.
+pub const JSON_PATH_SEGMENT_SEP: u8 = 1u8;
+pub const JSON_PATH_SEGMENT_SEP_STR: &str =
+    unsafe { std::str::from_utf8_unchecked(&[JSON_PATH_SEGMENT_SEP]) };
+
+/// Separates the json path and the value in
+/// a JSON term binary representation.
+pub const JSON_END_OF_PATH: u8 = 0u8;
+
 /// Term represents the value that the token can take.
 ///
 /// It actually wraps a `Vec<u8>`.
@@ -17,6 +36,12 @@ const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8;
 pub struct Term<B = Vec<u8>>(B)
 where B: AsRef<[u8]>;
 
+impl AsMut<Vec<u8>> for Term {
+    fn as_mut(&mut self) -> &mut Vec<u8> {
+        &mut self.0
+    }
+}
+
 impl Term {
     pub(crate) fn new() -> Term {
         Term(Vec::with_capacity(100))
@@ -120,6 +145,22 @@ impl Term {
     pub fn set_text(&mut self, text: &str) {
         self.set_bytes(text.as_bytes());
     }
+
+    /// Removes the value_bytes and sets the type code.
+    pub fn clear_with_type(&mut self, typ: Type) {
+        self.truncate(5);
+        self.0[4] = typ.to_code();
+    }
+
+    /// Truncates the term to the given length.
+    pub fn truncate(&mut self, len: usize) {
+        self.0.truncate(len);
+    }
+
+    /// Appends bytes to the end of the term.
+    pub fn append_bytes(&mut self, bytes: &[u8]) {
+        self.0.extend_from_slice(bytes);
+    }
 }
 
 impl Ord for Term
@@ -164,13 +205,16 @@ where B: AsRef<[u8]>
         Term(data)
     }
 
+    fn typ_code(&self) -> u8 {
+        *self
+            .as_slice()
+            .get(4)
+            .expect("the byte representation is too short")
+    }
+
     /// Return the type of the term.
     pub fn typ(&self) -> Type {
-        assert!(
-            self.as_slice().len() >= 5,
-            "the byte representation is too short"
-        );
-        Type::from_code(self.as_slice()[4]).expect("The term has an invalid type code")
+        Type::from_code(self.typ_code()).expect("The term has an invalid type code")
     }
 
     /// Returns the field.
@@ -189,10 +233,14 @@ where B: AsRef<[u8]>
     }
 
     fn get_fast_type<T: FastValue>(&self) -> Option<T> {
-        if self.typ() != T::to_type() || self.as_slice().len() != FAST_VALUE_TERM_LEN {
+        if self.typ() != T::to_type() {
             return None;
         }
         let mut value_bytes = [0u8; 8];
+        let bytes = self.value_bytes();
+        if bytes.len() != 8 {
+            return None;
+        }
         value_bytes.copy_from_slice(self.value_bytes());
         let value_u64 = u64::from_be_bytes(value_bytes);
         Some(FastValue::from_u64(value_u64))
     }
@@ -290,40 +338,74 @@ fn write_opt<V: fmt::Debug>(f: &mut fmt::Formatter, val_opt: Option<V>) -> fmt::Result {
     Ok(())
 }
 
-impl fmt::Debug for Term {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        let field_id = self.field().field_id();
-        let typ = self.typ();
-        write!(f, "Term(type={:?}, field={}, val=", typ, field_id,)?;
-        match typ {
-            Type::Str => {
-                let s = str::from_utf8(self.value_bytes()).ok();
-                write_opt(f, s)?;
-            }
-            Type::U64 => {
-                write_opt(f, self.as_u64())?;
-            }
-            Type::I64 => {
-                let val_i64 = self.as_i64();
-                write_opt(f, val_i64)?;
-            }
-            Type::F64 => {
-                let val_f64 = self.as_f64();
-                write_opt(f, val_f64)?;
-            }
-            // TODO pretty print these types too.
-            Type::Date => {
-                let val_date = self.as_date();
-                write_opt(f, val_date)?;
-            }
-            Type::Facet => {
-                let facet = self.as_facet().map(|facet| facet.to_path_string());
-                write_opt(f, facet)?;
-            }
-            Type::Bytes => {
-                write_opt(f, self.as_bytes())?;
+fn as_str(value_bytes: &[u8]) -> Option<&str> {
+    std::str::from_utf8(value_bytes).ok()
+}
+
+fn get_fast_type<T: FastValue>(bytes: &[u8]) -> Option<T> {
+    let value_u64 = u64::from_be_bytes(bytes.try_into().ok()?);
+    Some(FastValue::from_u64(value_u64))
+}
+
+/// Returns the json path (without the non-human-friendly separators),
+/// the type of the value, and the value bytes.
+/// Returns `None` if the value is not JSON or is not valid.
+pub(crate) fn as_json_path_type_value_bytes(bytes: &[u8]) -> Option<(&str, Type, &[u8])> {
+    let pos = bytes.iter().cloned().position(|b| b == JSON_END_OF_PATH)?;
+    let json_path = str::from_utf8(&bytes[..pos]).ok()?;
+    let type_code = *bytes.get(pos + 1)?;
+    let typ = Type::from_code(type_code)?;
+    Some((json_path, typ, &bytes[pos + 2..]))
+}
+
+fn debug_value_bytes(typ: Type, bytes: &[u8], f: &mut fmt::Formatter) -> fmt::Result {
+    match typ {
+        Type::Str => {
+            let s = as_str(bytes);
+            write_opt(f, s)?;
+        }
+        Type::U64 => {
+            write_opt(f, get_fast_type::<u64>(bytes))?;
+        }
+        Type::I64 => {
+            write_opt(f, get_fast_type::<i64>(bytes))?;
+        }
+        Type::F64 => {
+            write_opt(f, get_fast_type::<f64>(bytes))?;
+        }
+        // TODO pretty print these types too.
+        Type::Date => {
+            write_opt(f, get_fast_type::<DateTime>(bytes))?;
+        }
+        Type::Facet => {
+            let facet_str = str::from_utf8(bytes)
+                .ok()
+                .map(ToString::to_string)
+                .map(Facet::from_encoded_string)
+                .map(|facet| facet.to_path_string());
+            write_opt(f, facet_str)?;
+        }
+        Type::Bytes => {
+            write_opt(f, Some(bytes))?;
+        }
+        Type::Json => {
+            if let Some((path, typ, bytes)) = as_json_path_type_value_bytes(bytes) {
+                let path_pretty = path.replace(JSON_PATH_SEGMENT_SEP_STR, ".");
+                write!(f, "path={path_pretty}, vtype={typ:?}, ")?;
+                debug_value_bytes(typ, bytes, f)?;
             }
         }
+    }
+    Ok(())
+}
+
+impl<B> fmt::Debug for Term<B>
+where B: AsRef<[u8]>
+{
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let field_id = self.field().field_id();
+        let typ = self.typ();
+        write!(f, "Term(type={typ:?}, field={field_id}, ")?;
+        debug_value_bytes(typ, self.value_bytes(), f)?;
         write!(f, ")",)?;
         Ok(())
     }
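To make the term layout documented above concrete, here is a sketch that assembles a json term's value bytes by hand, outside of the `Term` API. The helper `json_term_bytes` and the two type-code constants are hypothetical; real code would obtain the codes from `Type::to_code()`.

```rust
// Placeholder type codes; the real values come from Type::to_code().
const JSON_TYPE_CODE: u8 = b'j'; // assumed code for Type::Json
const STR_TYPE_CODE: u8 = b's'; // assumed code for Type::Str

// Hypothetical helper following the documented layout:
// <field> (big endian u32) + <type_code> + path + \0 + leaf type + value.
fn json_term_bytes(field_id: u32, path: &[&str], text: &str) -> Vec<u8> {
    let mut term = Vec::new();
    term.extend_from_slice(&field_id.to_be_bytes()); // <field>
    term.push(JSON_TYPE_CODE); // <type_code>: marks the term as a json term
    term.extend_from_slice(path.join("\u{1}").as_bytes()); // JSON_PATH_SEGMENT_SEP
    term.push(0u8); // JSON_END_OF_PATH
    term.push(STR_TYPE_CODE); // type of the json leaf value
    term.extend_from_slice(text.as_bytes()); // leaf value bytes
    term
}

fn main() {
    // The triplet ("attributes.color", Text, "red") on field 2.
    let bytes = json_term_bytes(2, &["attributes", "color"], "red");
    // 4 (field) + 1 (type) + 10 + 1 (sep) + 5 (path) + 1 (end) + 1 (leaf type) + 3
    assert_eq!(bytes.len(), 26);
}
```

This is exactly the shape `as_json_path_type_value_bytes` above takes apart again: it scans for `JSON_END_OF_PATH`, reads the leaf type byte, and returns the remainder as the value bytes.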
diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs
index 83b2dd674..45fa3b488 100644
--- a/src/schema/text_options.rs
+++ b/src/schema/text_options.rs
@@ -46,7 +46,7 @@ impl TextOptions {
 /// Essentially, should we store the term frequency and/or the positions (See
 /// [`IndexRecordOption`](./enum.IndexRecordOption.html)).
 /// - the name of the `Tokenizer` that should be used to process the field.
-#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
+#[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)]
 pub struct TextFieldIndexing {
     record: IndexRecordOption,
     fieldnorms: bool,
diff --git a/src/schema/value.rs b/src/schema/value.rs
index df83930d6..5c7d25cfe 100644
--- a/src/schema/value.rs
+++ b/src/schema/value.rs
@@ -2,6 +2,7 @@ use std::fmt;
 
 use serde::de::Visitor;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
+use serde_json::Map;
 
 use crate::schema::Facet;
 use crate::tokenizer::PreTokenizedString;
@@ -27,6 +28,8 @@ pub enum Value {
     Facet(Facet),
     /// Arbitrarily sized byte array
     Bytes(Vec<u8>),
+    /// Json object value.
+    JsonObject(serde_json::Map<String, serde_json::Value>),
 }
 
 impl Eq for Value {}
@@ -43,6 +46,7 @@ impl Serialize for Value {
             Value::Date(ref date) => serializer.serialize_str(&date.to_rfc3339()),
             Value::Facet(ref facet) => facet.serialize(serializer),
             Value::Bytes(ref bytes) => serializer.serialize_bytes(bytes),
+            Value::JsonObject(ref obj) => obj.serialize(serializer),
         }
     }
 }
@@ -168,6 +172,17 @@ impl Value {
             None
         }
     }
+
+    /// Returns the json object, provided the value is of the JsonObject type.
+    ///
+    /// Returns None if the value is not of type JsonObject.
+    pub fn as_json(&self) -> Option<&Map<String, serde_json::Value>> {
+        if let Value::JsonObject(json) = self {
+            Some(json)
+        } else {
+            None
+        }
+    }
 }
 
 impl From<String> for Value {
@@ -230,6 +245,23 @@ impl From<PreTokenizedString> for Value {
     }
 }
 
+impl From<serde_json::Map<String, serde_json::Value>> for Value {
+    fn from(json_object: serde_json::Map<String, serde_json::Value>) -> Value {
+        Value::JsonObject(json_object)
+    }
+}
+
+impl From<serde_json::Value> for Value {
+    fn from(json_value: serde_json::Value) -> Value {
+        match json_value {
+            serde_json::Value::Object(json_object) => Value::JsonObject(json_object),
+            _ => {
+                panic!("Expected a json object.");
+            }
+        }
+    }
+}
+
 mod binary_serialize {
     use std::io::{self, Read, Write};
 
@@ -248,6 +280,7 @@ mod binary_serialize {
     const DATE_CODE: u8 = 5;
     const F64_CODE: u8 = 6;
     const EXT_CODE: u8 = 7;
+    const JSON_OBJ_CODE: u8 = 8;
 
     // extended types
 
@@ -296,8 +329,14 @@ mod binary_serialize {
                 BYTES_CODE.serialize(writer)?;
                 bytes.serialize(writer)
             }
+            Value::JsonObject(ref map) => {
+                JSON_OBJ_CODE.serialize(writer)?;
+                serde_json::to_writer(writer, &map)?;
+                Ok(())
+            }
         }
     }
+
     fn deserialize<R: Read>(reader: &mut R) -> io::Result<Value> {
         let type_code = u8::deserialize(reader)?;
         match type_code {
@@ -347,6 +386,10 @@ mod binary_serialize {
                     )),
                 }
             }
+            JSON_OBJ_CODE => {
+                let map = serde_json::from_reader(reader)?;
+                Ok(Value::JsonObject(map))
+            }
             _ => Err(io::Error::new(
                 io::ErrorKind::InvalidData,
                 format!("No field type is associated with code {:?}", type_code),
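From the caller's side, the new variant looks like this. A small sketch exercising the `From` conversions and the `as_json` accessor added above (the payload is illustrative):

```rust
use tantivy::schema::Value;

fn main() {
    let json = serde_json::json!({"color": "red", "width": 3});

    // From<serde_json::Value> panics on anything that is not an object,
    // so it should only be used on values known to be json objects.
    let value = Value::from(json);

    let obj = value.as_json().expect("built from a json object");
    assert_eq!(obj.get("color"), Some(&serde_json::json!("red")));
}
```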
diff --git a/src/tokenizer/empty_tokenizer.rs b/src/tokenizer/empty_tokenizer.rs
new file mode 100644
index 000000000..1dca0006d
--- /dev/null
+++ b/src/tokenizer/empty_tokenizer.rs
@@ -0,0 +1,41 @@
+use crate::tokenizer::{BoxTokenStream, Token, TokenStream, Tokenizer};
+
+#[derive(Clone)]
+pub(crate) struct EmptyTokenizer;
+
+impl Tokenizer for EmptyTokenizer {
+    fn token_stream<'a>(&self, _text: &'a str) -> BoxTokenStream<'a> {
+        EmptyTokenStream::default().into()
+    }
+}
+
+#[derive(Default)]
+struct EmptyTokenStream {
+    token: Token,
+}
+
+impl TokenStream for EmptyTokenStream {
+    fn advance(&mut self) -> bool {
+        false
+    }
+
+    fn token(&self) -> &super::Token {
+        &self.token
+    }
+
+    fn token_mut(&mut self) -> &mut super::Token {
+        &mut self.token
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::tokenizer::Tokenizer;
+
+    #[test]
+    fn test_empty_tokenizer() {
+        let tokenizer = super::EmptyTokenizer;
+        let mut empty = tokenizer.token_stream("whatever string");
+        assert!(!empty.advance());
+    }
+}
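This tokenizer never emits a token. Its purpose is the `Default` impl for `TextAnalyzer` in the next file: an analyzer that nobody configured silently yields zero tokens instead of failing. A sketch of the observable behavior, mirroring the unit test above:

```rust
use tantivy::tokenizer::TextAnalyzer;

fn main() {
    // Backed by the crate-private EmptyTokenizer: a stream over
    // any input never advances.
    let analyzer = TextAnalyzer::default();
    let mut stream = analyzer.token_stream("this text produces no tokens");
    assert!(!stream.advance());
}
```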
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 506b4d60a..9549fa182 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -119,6 +119,7 @@
 //! ```
 mod alphanum_only;
 mod ascii_folding_filter;
+mod empty_tokenizer;
 mod facet_tokenizer;
 mod lower_caser;
 mod ngram_tokenizer;
diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs
index e895ad0f1..12a0dfab2 100644
--- a/src/tokenizer/tokenizer.rs
+++ b/src/tokenizer/tokenizer.rs
@@ -5,6 +5,8 @@ use std::ops::{Deref, DerefMut};
 
 use serde::{Deserialize, Serialize};
 
+use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
+
 /// Token
 #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
 pub struct Token {
@@ -43,6 +45,12 @@ pub struct TextAnalyzer {
     token_filters: Vec<BoxTokenFilter>,
 }
 
+impl Default for TextAnalyzer {
+    fn default() -> TextAnalyzer {
+        TextAnalyzer::from(EmptyTokenizer)
+    }
+}
+
 impl<T: Tokenizer> From<T> for TextAnalyzer {
     fn from(tokenizer: T) -> Self {
         TextAnalyzer::new(tokenizer, Vec::new())