Added JSON Type (#1270)
- Removed useless copy when ingesting JSON.
- Bugfix in phrase queries with missing field norms.
- Disabled range queries on default fields.

Closes #1251
@@ -9,6 +9,7 @@ Tantivy 0.17
- Reduce the number of fsync calls [#1225](https://github.com/quickwit-oss/tantivy/issues/1225)
- Fix opening bytes index with dynamic codec (@PSeitz) [#1278](https://github.com/quickwit-oss/tantivy/issues/1278)
- Added an aggregation collector compatible with Elasticsearch (@PSeitz)
- Added a JSON schema type @fulmicoton [#1251](https://github.com/quickwit-oss/tantivy/issues/1251)

Tantivy 0.16.2
================================
@@ -4,7 +4,7 @@ use tantivy::schema::{INDEXED, STORED, STRING, TEXT};
|
||||
use tantivy::Index;
|
||||
|
||||
const HDFS_LOGS: &str = include_str!("hdfs.json");
|
||||
const NUM_REPEATS: usize = 10;
|
||||
const NUM_REPEATS: usize = 2;
|
||||
|
||||
pub fn hdfs_index_benchmark(c: &mut Criterion) {
|
||||
let schema = {
|
||||
@@ -21,6 +21,11 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
|
||||
schema_builder.add_text_field("severity", STRING | STORED);
|
||||
schema_builder.build()
|
||||
};
|
||||
let dynamic_schema = {
|
||||
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
|
||||
schema_builder.add_json_field("json", TEXT);
|
||||
schema_builder.build()
|
||||
};
|
||||
|
||||
let mut group = c.benchmark_group("index-hdfs");
|
||||
group.sample_size(20);
|
||||
@@ -74,6 +79,38 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
|
||||
index_writer.commit().unwrap();
|
||||
})
|
||||
});
|
||||
group.bench_function("index-hdfs-no-commit-json-without-docstore", |b| {
|
||||
b.iter(|| {
|
||||
let index = Index::create_in_ram(dynamic_schema.clone());
|
||||
let json_field = dynamic_schema.get_field("json").unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
|
||||
for _ in 0..NUM_REPEATS {
|
||||
for doc_json in HDFS_LOGS.trim().split("\n") {
|
||||
let json_val: serde_json::Map<String, serde_json::Value> =
|
||||
serde_json::from_str(doc_json).unwrap();
|
||||
let doc = tantivy::doc!(json_field=>json_val);
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
})
|
||||
});
|
||||
group.bench_function("index-hdfs-with-commit-json-without-docstore", |b| {
|
||||
b.iter(|| {
|
||||
let index = Index::create_in_ram(dynamic_schema.clone());
|
||||
let json_field = dynamic_schema.get_field("json").unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
|
||||
for _ in 0..NUM_REPEATS {
|
||||
for doc_json in HDFS_LOGS.trim().split("\n") {
|
||||
let json_val: serde_json::Map<String, serde_json::Value> =
|
||||
serde_json::from_str(doc_json).unwrap();
|
||||
let doc = tantivy::doc!(json_field=>json_val);
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
criterion_group! {
|
||||
|
||||
@@ -6,6 +6,7 @@ extern crate test;
|
||||
mod tests {
|
||||
use tantivy_bitpacker::BlockedBitpacker;
|
||||
use test::Bencher;
|
||||
|
||||
#[bench]
|
||||
fn bench_blockedbitp_read(b: &mut Bencher) {
|
||||
let mut blocked_bitpacker = BlockedBitpacker::new();
|
||||
@@ -20,6 +21,7 @@ mod tests {
|
||||
out
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_blockedbitp_create(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
|
||||
doc/src/json.md (new file)
@@ -0,0 +1,128 @@
# Json

As of tantivy 0.17, tantivy supports a json object type.
This type can be used to allow for a schema-less search index.

When indexing a json object, we "flatten" the JSON. This operation emits terms that represent a triplet `(json_path, value_type, value)`.

For instance, if `user` is a json field, the following document:

```json
{
    "user": {
        "name": "Paul Masurel",
        "address": {
            "city": "Tokyo",
            "country": "Japan"
        },
        "created_at": "2018-11-12T23:20:50.52Z"
    }
}
```

emits the following tokens:
- ("name", Text, "Paul")
- ("name", Text, "Masurel")
- ("address.city", Text, "Tokyo")
- ("address.country", Text, "Japan")
- ("created_at", Date, 15420648505)

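To see this end to end, here is a minimal indexing sketch. The field name `user` and the writer budget are illustrative assumptions; a complete example along the same lines is added in `examples/json_field.rs` by this commit.

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    // A single json field; nested paths such as `address.city` never need
    // to be declared in the schema.
    let mut schema_builder = Schema::builder();
    let user = schema_builder.add_json_field("user", TEXT);
    let schema = schema_builder.build();

    let index = Index::create_in_ram(schema);
    let mut index_writer = index.writer(50_000_000)?;

    // The flattening into (json_path, value_type, value) triplets happens
    // inside the writer when the document is added.
    let user_json: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
        r#"{
            "name": "Paul Masurel",
            "address": {"city": "Tokyo", "country": "Japan"},
            "created_at": "2018-11-12T23:20:50.52Z"
        }"#,
    )
    .expect("valid json");
    index_writer.add_document(doc!(user => user_json))?;
    index_writer.commit()?;
    Ok(())
}
```
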
# Bytes-encoding and lexicographical sort.

Like any other term, these triplets are encoded into a binary format as follows.
- `json_path`: the json path is a sequence of "segments". In the example above, `address.city`
  is just a debug representation of the json path `["address", "city"]`.
  The path is encoded by separating segments with the `\x01` byte and ending the path with a `\x00` byte.
- `value_type`: one byte represents the `Value` type.
- `value`: the value representation is just the regular `Value` representation.

This representation is designed to align the natural sort of `Term`s with the lexicographical sort
of their binary representation (tantivy's term dictionary, whether fst or sstable, is sorted and does prefix encoding).

In the example above, the terms will be sorted as
- ("address.city", Text, "Tokyo")
- ("address.country", Text, "Japan")
- ("created_at", Date, 15420648505)
- ("name", Text, "Masurel")
- ("name", Text, "Paul")

As seen in "pitfalls", we may end up having to search for a value under the same path with several different value types. Putting the type code after the path maximizes compression opportunities and also increases the chances for the two terms to end up in the same term dictionary block.

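As a concrete illustration (field id and values chosen arbitrarily; the real assertions are in the unit tests of `src/indexer/json_term_writer.rs` further down in this commit), the term `("attributes.color", Str, "red")` on a json field with field id 1 is laid out as:

```rust
// Field id (big-endian u32):   \x00\x00\x00\x01
// Field type code (Json):      j
// First path segment:          attributes
// JSON_PATH_SEGMENT_SEP:       \x01
// Second path segment:         color
// JSON_END_OF_PATH:            \x00
// Value type code (Str):       s
// Value bytes:                 red
const EXPECTED_TERM_BYTES: &[u8] = b"\x00\x00\x00\x01jattributes\x01color\x00sred";
```
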
# Pitfalls, limitations and corner cases.

Json gives very little information about the type of the literals it stores.
All numeric types end up mapped as a "Number" and there is no type for dates.

At indexing time, tantivy will try to interpret numbers and strings as different types, with a
priority order.

Numbers will be interpreted as u64, i64 and f64, in that order.
Strings will be interpreted as rfc3339 dates or simple strings.

The first working type is picked and is the only term that is emitted for indexing.
Note that this interpretation happens on a per-document basis, and there is no effort to try to sniff
a consistent field type at the scale of a segment.

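A minimal sketch of that priority order (illustrative names; the actual logic lives in `index_json_value` and `infer_type_from_str` in `src/indexer/json_term_writer.rs` below):

```rust
use chrono::Utc;

/// The single value type a JSON literal gets mapped to at indexing time.
enum InferredValue {
    U64(u64),
    I64(i64),
    F64(f64),
    Date(chrono::DateTime<Utc>),
    Text(String),
}

/// Numbers: u64, then i64, then f64. Strings: rfc3339 date, else plain text.
fn infer_value(value: &serde_json::Value) -> Option<InferredValue> {
    match value {
        serde_json::Value::Number(num) => num
            .as_u64()
            .map(InferredValue::U64)
            .or_else(|| num.as_i64().map(InferredValue::I64))
            .or_else(|| num.as_f64().map(InferredValue::F64)),
        serde_json::Value::String(text) => match chrono::DateTime::parse_from_rfc3339(text) {
            Ok(dt) => Some(InferredValue::Date(dt.with_timezone(&Utc))),
            Err(_) => Some(InferredValue::Text(text.clone())),
        },
        // null, bools, arrays and objects are handled separately.
        _ => None,
    }
}
```
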
On the query parser side, on the other hand, we may end up emitting more than one type.
For instance, we do not even know whether the value is a number or a string.

So the query

```
my_path.my_segment:233
```

will be interpreted as
`(my_path.my_segment, String, 233) or (my_path.my_segment, u64, 233)`.

Likewise, we need to emit two tokens if the query contains an rfc3339 date:
the date could actually have been a single token inside the text of a document at ingestion time. Generally speaking, we will always emit at least a string token in query parsing, and sometimes more.

If more than one json field is defined, things get even more complicated.

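With a default json field, this ambiguity can be observed directly on the parsed query. The sketch below assumes a made-up json field `attributes` and path `count`; inspecting the query's debug output shows both the string and the u64 interpretation under the same path.

```rust
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    // Hypothetical schema with a single default json field.
    let mut schema_builder = Schema::builder();
    let attributes = schema_builder.add_json_field("attributes", TEXT);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);

    let query_parser = QueryParser::for_index(&index, vec![attributes]);
    // The literal 233 is ambiguous: the parsed query combines a string term
    // and a u64 term under the json path "count".
    let query = query_parser.parse_query("count:233")?;
    println!("{:?}", query);
    Ok(())
}
```
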
## Default json field

If the schema contains a text field called "text" and a json field called `json_dynamic` that is set as a default field,
`text:hello` could reasonably be interpreted as targeting the text field, or as targeting the json field `json_dynamic` with the json_path "text".

If there is such an ambiguity, we decide to only search in the "text" field: `text:hello`.

In other words, the parser will not search in default json fields if there is a schema hit.
This is a product decision.

The user can still target the JSON field by specifying its name explicitly:
`json_dynamic.text:hello`.

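A short sketch of this resolution rule, reusing the hypothetical `text` and `json_dynamic` field names from above:

```rust
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let json_dynamic = schema_builder.add_json_field("json_dynamic", TEXT);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);

    // Both fields are default fields.
    let query_parser = QueryParser::for_index(&index, vec![text, json_dynamic]);

    // "text" is a schema hit, so this only searches the `text` field.
    let _only_text = query_parser.parse_query("text:hello")?;

    // The json field can still be targeted explicitly through its own name.
    let _json_text = query_parser.parse_query("json_dynamic.text:hello")?;
    Ok(())
}
```
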
## Range queries are not supported.

Json fields do not support range queries.

## Arrays do not work like nested objects.

If a json object contains an array, a search query might return more documents
than expected.

Let's take an example.

```json
{
    "cart_id": 3234234,
    "cart": [
        {"product_type": "sneakers", "attributes": {"color": "white"}},
        {"product_type": "t-shirt", "attributes": {"color": "red"}}
    ]
}
```

Despite the array structure, a document in tantivy is a bag of terms.
The query

```
cart.product_type:sneakers AND cart.attributes.color:red
```

actually matches the document above.
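Here is a sketch of that behaviour (the field name `cart_json` is an assumption): both terms are found in the same bag, so the conjunction matches even though "sneakers" and "red" belong to different array entries.

```rust
use tantivy::collector::Count;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let cart_json = schema_builder.add_json_field("cart_json", TEXT);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);

    let mut index_writer = index.writer(50_000_000)?;
    let cart: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
        r#"{
            "cart_id": 3234234,
            "cart": [
                {"product_type": "sneakers", "attributes": {"color": "white"}},
                {"product_type": "t-shirt", "attributes": {"color": "red"}}
            ]
        }"#,
    )
    .expect("valid json");
    index_writer.add_document(doc!(cart_json => cart))?;
    index_writer.commit()?;

    let reader = index.reader()?;
    let searcher = reader.searcher();
    let query_parser = QueryParser::for_index(&index, vec![cart_json]);
    // The two predicates come from different array entries, yet the document matches.
    let query =
        query_parser.parse_query("cart.product_type:sneakers AND cart.attributes.color:red")?;
    assert_eq!(searcher.search(&*query, &Count)?, 1);
    Ok(())
}
```
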
examples/json_field.rs (new file)
@@ -0,0 +1,80 @@
// # Json field example
|
||||
//
|
||||
// This example shows how the json field can be used
|
||||
// to make tantivy partially schemaless.
|
||||
|
||||
use tantivy::collector::{Count, TopDocs};
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::{Schema, FAST, STORED, STRING, TEXT};
|
||||
use tantivy::Index;
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// # Defining the schema
|
||||
//
|
||||
// We need two fields:
|
||||
// - a timestamp
|
||||
// - a json object field
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_date_field("timestamp", FAST | STORED);
|
||||
let event_type = schema_builder.add_text_field("event_type", STRING | STORED);
|
||||
let attributes = schema_builder.add_json_field("attributes", STORED | TEXT);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
// # Indexing documents
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
let doc = schema.parse_document(
|
||||
r#"{
|
||||
"timestamp": "2022-02-22T23:20:50.53Z",
|
||||
"event_type": "click",
|
||||
"attributes": {
|
||||
"target": "submit-button",
|
||||
"cart": {"product_id": 103},
|
||||
"description": "the best vacuum cleaner ever"
|
||||
}
|
||||
}"#,
|
||||
)?;
|
||||
index_writer.add_document(doc)?;
|
||||
let doc = schema.parse_document(
|
||||
r#"{
|
||||
"timestamp": "2022-02-22T23:20:51.53Z",
|
||||
"event_type": "click",
|
||||
"attributes": {
|
||||
"target": "submit-button",
|
||||
"cart": {"product_id": 133},
|
||||
"description": "das keyboard"
|
||||
}
|
||||
}"#,
|
||||
)?;
|
||||
index_writer.add_document(doc)?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let query_parser = QueryParser::for_index(&index, vec![event_type, attributes]);
|
||||
{
|
||||
let query = query_parser.parse_query("target:submit-button")?;
|
||||
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||
assert_eq!(count_docs.len(), 2);
|
||||
}
|
||||
{
|
||||
let query = query_parser.parse_query("target:submit")?;
|
||||
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||
assert_eq!(count_docs.len(), 2);
|
||||
}
|
||||
{
|
||||
let query = query_parser.parse_query("cart.product_id:103")?;
|
||||
let count_docs = searcher.search(&*query, &Count)?;
|
||||
assert_eq!(count_docs, 1);
|
||||
}
|
||||
{
|
||||
let query = query_parser
|
||||
.parse_query("event_type:click AND cart.product_id:133")
|
||||
.unwrap();
|
||||
let hits = searcher.search(&*query, &TopDocs::with_limit(2)).unwrap();
|
||||
assert_eq!(hits.len(), 1);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -59,7 +59,7 @@ pub enum UserInputBound {
|
||||
}
|
||||
|
||||
impl UserInputBound {
|
||||
fn display_lower(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
fn display_lower(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
||||
match *self {
|
||||
UserInputBound::Inclusive(ref word) => write!(formatter, "[\"{}\"", word),
|
||||
UserInputBound::Exclusive(ref word) => write!(formatter, "{{\"{}\"", word),
|
||||
@@ -67,7 +67,7 @@ impl UserInputBound {
|
||||
}
|
||||
}
|
||||
|
||||
fn display_upper(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
fn display_upper(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
||||
match *self {
|
||||
UserInputBound::Inclusive(ref word) => write!(formatter, "\"{}\"]", word),
|
||||
UserInputBound::Exclusive(ref word) => write!(formatter, "\"{}\"}}", word),
|
||||
|
||||
@@ -256,7 +256,7 @@ pub(crate) fn f64_from_fastfield_u64(val: u64, field_type: &Type) -> f64 {
|
||||
Type::U64 => val as f64,
|
||||
Type::I64 => i64::from_u64(val) as f64,
|
||||
Type::F64 => f64::from_u64(val),
|
||||
Type::Date | Type::Str | Type::Facet | Type::Bytes => unimplemented!(),
|
||||
Type::Date | Type::Str | Type::Facet | Type::Bytes | Type::Json => unimplemented!(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -275,7 +275,7 @@ pub(crate) fn f64_to_fastfield_u64(val: f64, field_type: &Type) -> u64 {
|
||||
Type::U64 => val as u64,
|
||||
Type::I64 => (val as i64).to_u64(),
|
||||
Type::F64 => val.to_u64(),
|
||||
Type::Date | Type::Str | Type::Facet | Type::Bytes => unimplemented!(),
|
||||
Type::Date | Type::Str | Type::Facet | Type::Bytes | Type::Json => unimplemented!(),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -121,9 +121,8 @@ impl SegmentReader {
|
||||
self.fieldnorm_readers.get_field(field)?.ok_or_else(|| {
|
||||
let field_name = self.schema.get_field_name(field);
|
||||
let err_msg = format!(
|
||||
"Field norm not found for field {:?}. Was the field set to record norm during \
|
||||
indexing?",
|
||||
field_name
|
||||
"Field norm not found for field {field_name:?}. Was the field set to record norm \
|
||||
during indexing?"
|
||||
);
|
||||
crate::TantivyError::SchemaError(err_msg)
|
||||
})
|
||||
|
||||
@@ -94,8 +94,11 @@ mod tests {
|
||||
assert_eq!(reader.num_docs(), 5);
|
||||
|
||||
{
|
||||
let parser = QueryParser::for_index(&index, vec![date_field]);
|
||||
let query = parser.parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()))?;
|
||||
let parser = QueryParser::for_index(&index, vec![]);
|
||||
let query = parser.parse_query(&format!(
|
||||
"multi_date_field:\"{}\"",
|
||||
first_time_stamp.to_rfc3339()
|
||||
))?;
|
||||
let results = searcher.search(&query, &TopDocs::with_limit(5))?;
|
||||
assert_eq!(results.len(), 1);
|
||||
for (_score, doc_address) in results {
|
||||
@@ -150,7 +153,7 @@ mod tests {
|
||||
{
|
||||
let parser = QueryParser::for_index(&index, vec![date_field]);
|
||||
let range_q = format!(
|
||||
"[{} TO {}}}",
|
||||
"multi_date_field:[{} TO {}}}",
|
||||
(first_time_stamp + Duration::seconds(1)).to_rfc3339(),
|
||||
(first_time_stamp + Duration::seconds(3)).to_rfc3339()
|
||||
);
|
||||
src/indexer/json_term_writer.rs (new file)
@@ -0,0 +1,415 @@
use chrono::Utc;
|
||||
use fnv::FnvHashMap;
|
||||
use murmurhash32::murmurhash2;
|
||||
|
||||
use crate::fastfield::FastValue;
|
||||
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
|
||||
use crate::schema::term::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
|
||||
use crate::schema::Type;
|
||||
use crate::tokenizer::TextAnalyzer;
|
||||
use crate::{DocId, Term};
|
||||
|
||||
/// This object is a map storing the last position for a given path for the current document
|
||||
/// being indexed.
|
||||
///
|
||||
/// It is key to solve the following problem:
|
||||
/// If we index a JsonObject emitting several terms with the same path
|
||||
/// we do not want to create false positive in phrase queries.
|
||||
///
|
||||
/// For instance:
|
||||
///
|
||||
/// ```json
|
||||
/// {"bands": [
|
||||
/// {"band_name": "Elliot Smith"},
|
||||
/// {"band_name": "The Who"},
|
||||
/// ]}
|
||||
/// ```
|
||||
///
|
||||
/// If we are careless and index each band names independently,
|
||||
/// `Elliot` and `The` will end up indexed at position 0, and `Smith` and `Who` will be indexed at
|
||||
/// position 1.
|
||||
/// As a result, with lemmatization, "The Smiths" will match our object.
|
||||
///
|
||||
/// Worse, if the same term appears in the second object, a non-increasing value would be pushed
|
||||
/// to the position recorder probably provoking a panic.
|
||||
///
|
||||
/// This problem is solved for regular multivalued object by offsetting the position
|
||||
/// of values, with a position gap. Here we would like `The` and `Who` to get indexed at
|
||||
/// position 2 and 3 respectively.
|
||||
///
|
||||
/// With regular fields, we sort the fields beforehand, so that all terms with the same
|
||||
/// path are indexed consecutively.
|
||||
///
|
||||
/// In a JSON object, we do not have this comfort, so we need to record these position offsets in
|
||||
/// a map.
|
||||
///
|
||||
/// Note that using a single position for the entire object would not hurt correctness.
|
||||
/// It would however hurt compression.
|
||||
///
|
||||
/// We can therefore afford to work with a map that is imperfect. It is fine if several
/// paths map to the same indexing position, as long as the probability of a collision is relatively low.
|
||||
#[derive(Default)]
|
||||
struct IndexingPositionsPerPath {
|
||||
positions_per_path: FnvHashMap<u32, IndexingPosition>,
|
||||
}
|
||||
|
||||
impl IndexingPositionsPerPath {
|
||||
fn get_position(&mut self, term: &Term) -> &mut IndexingPosition {
|
||||
self.positions_per_path
|
||||
.entry(murmurhash2(term.as_slice()))
|
||||
.or_insert_with(Default::default)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn index_json_values<'a>(
|
||||
doc: DocId,
|
||||
json_values: impl Iterator<Item = crate::Result<&'a serde_json::Map<String, serde_json::Value>>>,
|
||||
text_analyzer: &TextAnalyzer,
|
||||
term_buffer: &mut Term,
|
||||
postings_writer: &mut dyn PostingsWriter,
|
||||
ctx: &mut IndexingContext,
|
||||
) -> crate::Result<()> {
|
||||
let mut json_term_writer = JsonTermWriter::wrap(term_buffer);
|
||||
let mut positions_per_path: IndexingPositionsPerPath = Default::default();
|
||||
for json_value_res in json_values {
|
||||
let json_value = json_value_res?;
|
||||
index_json_object(
|
||||
doc,
|
||||
json_value,
|
||||
text_analyzer,
|
||||
&mut json_term_writer,
|
||||
postings_writer,
|
||||
ctx,
|
||||
&mut positions_per_path,
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn index_json_object<'a>(
|
||||
doc: DocId,
|
||||
json_value: &serde_json::Map<String, serde_json::Value>,
|
||||
text_analyzer: &TextAnalyzer,
|
||||
json_term_writer: &mut JsonTermWriter<'a>,
|
||||
postings_writer: &mut dyn PostingsWriter,
|
||||
ctx: &mut IndexingContext,
|
||||
positions_per_path: &mut IndexingPositionsPerPath,
|
||||
) {
|
||||
for (json_path_segment, json_value) in json_value {
|
||||
json_term_writer.push_path_segment(json_path_segment);
|
||||
index_json_value(
|
||||
doc,
|
||||
json_value,
|
||||
text_analyzer,
|
||||
json_term_writer,
|
||||
postings_writer,
|
||||
ctx,
|
||||
positions_per_path,
|
||||
);
|
||||
json_term_writer.pop_path_segment();
|
||||
}
|
||||
}
|
||||
|
||||
fn index_json_value<'a>(
|
||||
doc: DocId,
|
||||
json_value: &serde_json::Value,
|
||||
text_analyzer: &TextAnalyzer,
|
||||
json_term_writer: &mut JsonTermWriter<'a>,
|
||||
postings_writer: &mut dyn PostingsWriter,
|
||||
ctx: &mut IndexingContext,
|
||||
positions_per_path: &mut IndexingPositionsPerPath,
|
||||
) {
|
||||
match json_value {
|
||||
serde_json::Value::Null => {}
|
||||
serde_json::Value::Bool(val_bool) => {
|
||||
let bool_u64 = if *val_bool { 1u64 } else { 0u64 };
|
||||
json_term_writer.set_fast_value(bool_u64);
|
||||
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
|
||||
}
|
||||
serde_json::Value::Number(number) => {
|
||||
if let Some(number_u64) = number.as_u64() {
|
||||
json_term_writer.set_fast_value(number_u64);
|
||||
} else if let Some(number_i64) = number.as_i64() {
|
||||
json_term_writer.set_fast_value(number_i64);
|
||||
} else if let Some(number_f64) = number.as_f64() {
|
||||
json_term_writer.set_fast_value(number_f64);
|
||||
}
|
||||
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
|
||||
}
|
||||
serde_json::Value::String(text) => match infer_type_from_str(text) {
|
||||
TextOrDateTime::Text(text) => {
|
||||
let mut token_stream = text_analyzer.token_stream(text);
|
||||
// TODO make sure the chain position works out.
|
||||
json_term_writer.close_path_and_set_type(Type::Str);
|
||||
let indexing_position = positions_per_path.get_position(json_term_writer.term());
|
||||
postings_writer.index_text(
|
||||
doc,
|
||||
&mut *token_stream,
|
||||
json_term_writer.term_buffer,
|
||||
ctx,
|
||||
indexing_position,
|
||||
);
|
||||
}
|
||||
TextOrDateTime::DateTime(dt) => {
|
||||
json_term_writer.set_fast_value(dt);
|
||||
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
|
||||
}
|
||||
},
|
||||
serde_json::Value::Array(arr) => {
|
||||
for val in arr {
|
||||
index_json_value(
|
||||
doc,
|
||||
val,
|
||||
text_analyzer,
|
||||
json_term_writer,
|
||||
postings_writer,
|
||||
ctx,
|
||||
positions_per_path,
|
||||
);
|
||||
}
|
||||
}
|
||||
serde_json::Value::Object(map) => {
|
||||
index_json_object(
|
||||
doc,
|
||||
map,
|
||||
text_analyzer,
|
||||
json_term_writer,
|
||||
postings_writer,
|
||||
ctx,
|
||||
positions_per_path,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
enum TextOrDateTime<'a> {
|
||||
Text(&'a str),
|
||||
DateTime(crate::DateTime),
|
||||
}
|
||||
|
||||
fn infer_type_from_str(text: &str) -> TextOrDateTime {
|
||||
match chrono::DateTime::parse_from_rfc3339(text) {
|
||||
Ok(dt) => {
|
||||
let dt_utc = dt.with_timezone(&Utc);
|
||||
TextOrDateTime::DateTime(dt_utc)
|
||||
}
|
||||
Err(_) => TextOrDateTime::Text(text),
|
||||
}
|
||||
}
|
||||
|
||||
pub struct JsonTermWriter<'a> {
|
||||
term_buffer: &'a mut Term,
|
||||
path_stack: Vec<usize>,
|
||||
}
|
||||
|
||||
impl<'a> JsonTermWriter<'a> {
|
||||
pub fn wrap(term_buffer: &'a mut Term) -> Self {
|
||||
term_buffer.clear_with_type(Type::Json);
|
||||
let mut path_stack = Vec::with_capacity(10);
|
||||
path_stack.push(5);
|
||||
Self {
|
||||
term_buffer,
|
||||
path_stack,
|
||||
}
|
||||
}
|
||||
|
||||
fn trim_to_end_of_path(&mut self) {
|
||||
let end_of_path = *self.path_stack.last().unwrap();
|
||||
self.term_buffer.truncate(end_of_path);
|
||||
}
|
||||
|
||||
pub fn close_path_and_set_type(&mut self, typ: Type) {
|
||||
self.trim_to_end_of_path();
|
||||
let buffer = self.term_buffer.as_mut();
|
||||
let buffer_len = buffer.len();
|
||||
buffer[buffer_len - 1] = JSON_END_OF_PATH;
|
||||
buffer.push(typ.to_code());
|
||||
}
|
||||
|
||||
pub fn push_path_segment(&mut self, segment: &str) {
|
||||
// the path stack should never be empty.
|
||||
self.trim_to_end_of_path();
|
||||
let buffer = self.term_buffer.as_mut();
|
||||
let buffer_len = buffer.len();
|
||||
if self.path_stack.len() > 1 {
|
||||
buffer[buffer_len - 1] = JSON_PATH_SEGMENT_SEP;
|
||||
}
|
||||
buffer.extend(segment.as_bytes());
|
||||
buffer.push(JSON_PATH_SEGMENT_SEP);
|
||||
self.path_stack.push(buffer.len());
|
||||
}
|
||||
|
||||
pub fn pop_path_segment(&mut self) {
|
||||
self.path_stack.pop();
|
||||
assert!(!self.path_stack.is_empty());
|
||||
self.trim_to_end_of_path();
|
||||
}
|
||||
|
||||
/// Returns the json path of the term being currently built.
|
||||
#[cfg(test)]
|
||||
pub(crate) fn path(&self) -> &[u8] {
|
||||
let end_of_path = self.path_stack.last().cloned().unwrap_or(6);
|
||||
&self.term().as_slice()[5..end_of_path - 1]
|
||||
}
|
||||
|
||||
pub fn set_fast_value<T: FastValue>(&mut self, val: T) {
|
||||
self.close_path_and_set_type(T::to_type());
|
||||
self.term_buffer
|
||||
.as_mut()
|
||||
.extend_from_slice(val.to_u64().to_be_bytes().as_slice());
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn set_str(&mut self, text: &str) {
|
||||
self.close_path_and_set_type(Type::Str);
|
||||
self.term_buffer.as_mut().extend_from_slice(text.as_bytes());
|
||||
}
|
||||
|
||||
pub fn term(&self) -> &Term {
|
||||
self.term_buffer
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::JsonTermWriter;
|
||||
use crate::schema::{Field, Type};
|
||||
use crate::Term;
|
||||
|
||||
#[test]
|
||||
fn test_json_writer() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::new();
|
||||
term.set_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_writer.push_path_segment("attributes");
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.set_str("red");
|
||||
assert_eq!(
|
||||
format!("{:?}", json_writer.term()),
|
||||
"Term(type=Json, field=1, path=attributes.color, vtype=Str, \"red\")"
|
||||
);
|
||||
json_writer.set_str("blue");
|
||||
assert_eq!(
|
||||
format!("{:?}", json_writer.term()),
|
||||
"Term(type=Json, field=1, path=attributes.color, vtype=Str, \"blue\")"
|
||||
);
|
||||
json_writer.pop_path_segment();
|
||||
json_writer.push_path_segment("dimensions");
|
||||
json_writer.push_path_segment("width");
|
||||
json_writer.set_fast_value(400i64);
|
||||
assert_eq!(
|
||||
format!("{:?}", json_writer.term()),
|
||||
"Term(type=Json, field=1, path=attributes.dimensions.width, vtype=I64, 400)"
|
||||
);
|
||||
json_writer.pop_path_segment();
|
||||
json_writer.push_path_segment("height");
|
||||
json_writer.set_fast_value(300i64);
|
||||
assert_eq!(
|
||||
format!("{:?}", json_writer.term()),
|
||||
"Term(type=Json, field=1, path=attributes.dimensions.height, vtype=I64, 300)"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_string_term() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::new();
|
||||
term.set_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.set_str("red");
|
||||
assert_eq!(
|
||||
json_writer.term().as_slice(),
|
||||
b"\x00\x00\x00\x01jcolor\x00sred"
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_i64_term() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::new();
|
||||
term.set_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.set_fast_value(-4i64);
|
||||
assert_eq!(
|
||||
json_writer.term().as_slice(),
|
||||
b"\x00\x00\x00\x01jcolor\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc"
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_u64_term() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::new();
|
||||
term.set_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.set_fast_value(4u64);
|
||||
assert_eq!(
|
||||
json_writer.term().as_slice(),
|
||||
b"\x00\x00\x00\x01jcolor\x00u\x00\x00\x00\x00\x00\x00\x00\x04"
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_f64_term() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::new();
|
||||
term.set_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.set_fast_value(4.0f64);
|
||||
assert_eq!(
|
||||
json_writer.term().as_slice(),
|
||||
b"\x00\x00\x00\x01jcolor\x00f\xc0\x10\x00\x00\x00\x00\x00\x00"
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_push_after_set_path_segment() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::new();
|
||||
term.set_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_writer.push_path_segment("attribute");
|
||||
json_writer.set_str("something");
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.set_str("red");
|
||||
assert_eq!(
|
||||
json_writer.term().as_slice(),
|
||||
b"\x00\x00\x00\x01jattribute\x01color\x00sred"
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_pop_segment() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::new();
|
||||
term.set_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.push_path_segment("hue");
|
||||
json_writer.pop_path_segment();
|
||||
json_writer.set_str("red");
|
||||
assert_eq!(
|
||||
json_writer.term().as_slice(),
|
||||
b"\x00\x00\x00\x01jcolor\x00sred"
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_json_writer_path() {
|
||||
let field = Field::from_field_id(1);
|
||||
let mut term = Term::new();
|
||||
term.set_field(Type::Json, field);
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_writer.push_path_segment("color");
|
||||
assert_eq!(json_writer.path(), b"color");
|
||||
json_writer.push_path_segment("hue");
|
||||
assert_eq!(json_writer.path(), b"color\x01hue");
|
||||
json_writer.set_str("pink");
|
||||
assert_eq!(json_writer.path(), b"color\x01hue");
|
||||
}
|
||||
}
|
||||
@@ -307,16 +307,16 @@ impl IndexMerger {
|
||||
}
|
||||
None => {}
|
||||
},
|
||||
FieldType::Str(_) => {
|
||||
// We don't handle str fast field for the moment
|
||||
// They can be implemented using what is done
|
||||
// for facets in the future.
|
||||
}
|
||||
FieldType::Bytes(byte_options) => {
|
||||
if byte_options.is_fast() {
|
||||
self.write_bytes_fast_field(field, fast_field_serializer, doc_id_mapping)?;
|
||||
}
|
||||
}
|
||||
FieldType::Str(_) | FieldType::JsonObject(_) => {
|
||||
// We don't handle json / string fast field for the moment
|
||||
// They can be implemented using what is done
|
||||
// for facets in the future
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
|
||||
@@ -5,6 +5,7 @@ pub mod doc_id_mapping;
|
||||
mod doc_opstamp_mapping;
|
||||
pub mod index_writer;
|
||||
mod index_writer_status;
|
||||
mod json_term_writer;
|
||||
mod log_merge_policy;
|
||||
mod merge_operation;
|
||||
pub mod merge_policy;
|
||||
@@ -24,6 +25,7 @@ use crossbeam::channel;
|
||||
use smallvec::SmallVec;
|
||||
|
||||
pub use self::index_writer::IndexWriter;
|
||||
pub(crate) use self::json_term_writer::JsonTermWriter;
|
||||
pub use self::log_merge_policy::LogMergePolicy;
|
||||
pub use self::merge_operation::MergeOperation;
|
||||
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
|
||||
|
||||
@@ -3,12 +3,13 @@ use super::operation::AddOperation;
|
||||
use crate::core::Segment;
|
||||
use crate::fastfield::FastFieldsWriter;
|
||||
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
|
||||
use crate::indexer::json_term_writer::index_json_values;
|
||||
use crate::indexer::segment_serializer::SegmentSerializer;
|
||||
use crate::postings::{
|
||||
compute_table_size, serialize_postings, IndexingContext, IndexingPosition,
|
||||
PerFieldPostingsWriter, PostingsWriter,
|
||||
};
|
||||
use crate::schema::{Field, FieldEntry, FieldType, FieldValue, Schema, Term, Type, Value};
|
||||
use crate::schema::{FieldEntry, FieldType, FieldValue, Schema, Term, Value};
|
||||
use crate::store::{StoreReader, StoreWriter};
|
||||
use crate::tokenizer::{
|
||||
BoxTokenStream, FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer,
|
||||
@@ -61,7 +62,7 @@ pub struct SegmentWriter {
|
||||
pub(crate) fast_field_writers: FastFieldsWriter,
|
||||
pub(crate) fieldnorms_writer: FieldNormsWriter,
|
||||
pub(crate) doc_opstamps: Vec<Opstamp>,
|
||||
tokenizers: Vec<Option<TextAnalyzer>>,
|
||||
per_field_text_analyzers: Vec<TextAnalyzer>,
|
||||
term_buffer: Term,
|
||||
schema: Schema,
|
||||
}
|
||||
@@ -85,19 +86,23 @@ impl SegmentWriter {
|
||||
let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
|
||||
let segment_serializer = SegmentSerializer::for_segment(segment, false)?;
|
||||
let per_field_postings_writers = PerFieldPostingsWriter::for_schema(&schema);
|
||||
let tokenizers = schema
|
||||
let per_field_text_analyzers = schema
|
||||
.fields()
|
||||
.map(
|
||||
|(_, field_entry): (Field, &FieldEntry)| match field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => text_options
|
||||
.get_indexing_options()
|
||||
.and_then(|text_index_option| {
|
||||
let tokenizer_name = &text_index_option.tokenizer();
|
||||
tokenizer_manager.get(tokenizer_name)
|
||||
}),
|
||||
.map(|(_, field_entry): (_, &FieldEntry)| {
|
||||
let text_options = match field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => text_options.get_indexing_options(),
|
||||
FieldType::JsonObject(ref json_object_options) => {
|
||||
json_object_options.get_text_indexing_options()
|
||||
}
|
||||
_ => None,
|
||||
},
|
||||
)
|
||||
};
|
||||
text_options
|
||||
.and_then(|text_index_option| {
|
||||
let tokenizer_name = &text_index_option.tokenizer();
|
||||
tokenizer_manager.get(tokenizer_name)
|
||||
})
|
||||
.unwrap_or_default()
|
||||
})
|
||||
.collect();
|
||||
Ok(SegmentWriter {
|
||||
max_doc: 0,
|
||||
@@ -107,7 +112,7 @@ impl SegmentWriter {
|
||||
segment_serializer,
|
||||
fast_field_writers: FastFieldsWriter::from_schema(&schema),
|
||||
doc_opstamps: Vec::with_capacity(1_000),
|
||||
tokenizers,
|
||||
per_field_text_analyzers,
|
||||
term_buffer: Term::new(),
|
||||
schema,
|
||||
})
|
||||
@@ -165,9 +170,9 @@ impl SegmentWriter {
|
||||
let (term_buffer, ctx) = (&mut self.term_buffer, &mut self.ctx);
|
||||
let postings_writer: &mut dyn PostingsWriter =
|
||||
self.per_field_postings_writers.get_for_field_mut(field);
|
||||
term_buffer.set_field(field_entry.field_type().value_type(), field);
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Facet(_) => {
|
||||
term_buffer.set_field(Type::Facet, field);
|
||||
for value in values {
|
||||
let facet = value.as_facet().ok_or_else(make_schema_error)?;
|
||||
let facet_str = facet.encoded_str();
|
||||
@@ -205,13 +210,11 @@ impl SegmentWriter {
|
||||
.push(PreTokenizedStream::from(tok_str.clone()).into());
|
||||
}
|
||||
Value::Str(ref text) => {
|
||||
if let Some(ref mut tokenizer) =
|
||||
self.tokenizers[field.field_id() as usize]
|
||||
{
|
||||
offsets.push(total_offset);
|
||||
total_offset += text.len();
|
||||
token_streams.push(tokenizer.token_stream(text));
|
||||
}
|
||||
let text_analyzer =
|
||||
&self.per_field_text_analyzers[field.field_id() as usize];
|
||||
offsets.push(total_offset);
|
||||
total_offset += text.len();
|
||||
token_streams.push(text_analyzer.token_stream(text));
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
@@ -219,9 +222,9 @@ impl SegmentWriter {
|
||||
|
||||
let mut indexing_position = IndexingPosition::default();
|
||||
for mut token_stream in token_streams {
|
||||
assert_eq!(term_buffer.as_slice().len(), 5);
|
||||
postings_writer.index_text(
|
||||
doc_id,
|
||||
field,
|
||||
&mut *token_stream,
|
||||
term_buffer,
|
||||
ctx,
|
||||
@@ -233,7 +236,6 @@ impl SegmentWriter {
|
||||
}
|
||||
FieldType::U64(_) => {
|
||||
for value in values {
|
||||
term_buffer.set_field(Type::U64, field);
|
||||
let u64_val = value.as_u64().ok_or_else(make_schema_error)?;
|
||||
term_buffer.set_u64(u64_val);
|
||||
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
|
||||
@@ -241,7 +243,6 @@ impl SegmentWriter {
|
||||
}
|
||||
FieldType::Date(_) => {
|
||||
for value in values {
|
||||
term_buffer.set_field(Type::Date, field);
|
||||
let date_val = value.as_date().ok_or_else(make_schema_error)?;
|
||||
term_buffer.set_i64(date_val.timestamp());
|
||||
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
|
||||
@@ -249,7 +250,6 @@ impl SegmentWriter {
|
||||
}
|
||||
FieldType::I64(_) => {
|
||||
for value in values {
|
||||
term_buffer.set_field(Type::I64, field);
|
||||
let i64_val = value.as_i64().ok_or_else(make_schema_error)?;
|
||||
term_buffer.set_i64(i64_val);
|
||||
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
|
||||
@@ -257,7 +257,6 @@ impl SegmentWriter {
|
||||
}
|
||||
FieldType::F64(_) => {
|
||||
for value in values {
|
||||
term_buffer.set_field(Type::F64, field);
|
||||
let f64_val = value.as_f64().ok_or_else(make_schema_error)?;
|
||||
term_buffer.set_f64(f64_val);
|
||||
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
|
||||
@@ -265,12 +264,25 @@ impl SegmentWriter {
|
||||
}
|
||||
FieldType::Bytes(_) => {
|
||||
for value in values {
|
||||
term_buffer.set_field(Type::Bytes, field);
|
||||
let bytes = value.as_bytes().ok_or_else(make_schema_error)?;
|
||||
term_buffer.set_bytes(bytes);
|
||||
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
|
||||
}
|
||||
}
|
||||
FieldType::JsonObject(_) => {
|
||||
let text_analyzer = &self.per_field_text_analyzers[field.field_id() as usize];
|
||||
let json_values_it = values
|
||||
.iter()
|
||||
.map(|value| value.as_json().ok_or_else(make_schema_error));
|
||||
index_json_values(
|
||||
doc_id,
|
||||
json_values_it,
|
||||
text_analyzer,
|
||||
term_buffer,
|
||||
postings_writer,
|
||||
ctx,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
@@ -398,10 +410,16 @@ pub fn prepare_doc_for_store(doc: Document, schema: &Schema) -> Document {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use chrono::Utc;
|
||||
|
||||
use super::compute_initial_table_size;
|
||||
use crate::schema::{Schema, STORED, TEXT};
|
||||
use crate::collector::Count;
|
||||
use crate::indexer::json_term_writer::JsonTermWriter;
|
||||
use crate::postings::TermInfo;
|
||||
use crate::query::PhraseQuery;
|
||||
use crate::schema::{IndexRecordOption, Schema, Type, STORED, STRING, TEXT};
|
||||
use crate::tokenizer::{PreTokenizedString, Token};
|
||||
use crate::Document;
|
||||
use crate::{DocAddress, DocSet, Document, Index, Postings, Term, TERMINATED};
|
||||
|
||||
#[test]
|
||||
fn test_hashmap_size() {
|
||||
@@ -440,4 +458,247 @@ mod tests {
|
||||
Some("title")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_json_indexing() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_field = schema_builder.add_json_field("json", STORED | TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
|
||||
r#"{
|
||||
"toto": "titi",
|
||||
"float": -0.2,
|
||||
"unsigned": 1,
|
||||
"signed": -2,
|
||||
"complexobject": {
|
||||
"field.with.dot": 1
|
||||
},
|
||||
"date": "1985-04-12T23:20:50.52Z",
|
||||
"my_arr": [2, 3, {"my_key": "two tokens"}, 4]
|
||||
}"#,
|
||||
)
|
||||
.unwrap();
|
||||
let doc = doc!(json_field=>json_val.clone());
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let mut writer = index.writer_for_tests().unwrap();
|
||||
writer.add_document(doc).unwrap();
|
||||
writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let doc = searcher
|
||||
.doc(DocAddress {
|
||||
segment_ord: 0u32,
|
||||
doc_id: 0u32,
|
||||
})
|
||||
.unwrap();
|
||||
let serdeser_json_val = serde_json::from_str::<serde_json::Map<String, serde_json::Value>>(
|
||||
&schema.to_json(&doc),
|
||||
)
|
||||
.unwrap()
|
||||
.get("json")
|
||||
.unwrap()[0]
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone();
|
||||
assert_eq!(json_val, serdeser_json_val);
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
let inv_idx = segment_reader.inverted_index(json_field).unwrap();
|
||||
let term_dict = inv_idx.terms();
|
||||
|
||||
let mut term = Term::new();
|
||||
term.set_field(Type::Json, json_field);
|
||||
let mut term_stream = term_dict.stream().unwrap();
|
||||
|
||||
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_term_writer.push_path_segment("complexobject");
|
||||
json_term_writer.push_path_segment("field.with.dot");
|
||||
json_term_writer.set_fast_value(1u64);
|
||||
assert!(term_stream.advance());
|
||||
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
|
||||
|
||||
json_term_writer.pop_path_segment();
|
||||
json_term_writer.pop_path_segment();
|
||||
json_term_writer.push_path_segment("date");
|
||||
json_term_writer.set_fast_value(
|
||||
chrono::DateTime::parse_from_rfc3339("1985-04-12T23:20:50.52Z")
|
||||
.unwrap()
|
||||
.with_timezone(&Utc),
|
||||
);
|
||||
assert!(term_stream.advance());
|
||||
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
|
||||
|
||||
json_term_writer.pop_path_segment();
|
||||
json_term_writer.push_path_segment("float");
|
||||
json_term_writer.set_fast_value(-0.2f64);
|
||||
assert!(term_stream.advance());
|
||||
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
|
||||
|
||||
json_term_writer.pop_path_segment();
|
||||
json_term_writer.push_path_segment("my_arr");
|
||||
json_term_writer.set_fast_value(2u64);
|
||||
assert!(term_stream.advance());
|
||||
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
|
||||
|
||||
json_term_writer.set_fast_value(3u64);
|
||||
assert!(term_stream.advance());
|
||||
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
|
||||
|
||||
json_term_writer.set_fast_value(4u64);
|
||||
assert!(term_stream.advance());
|
||||
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
|
||||
|
||||
json_term_writer.push_path_segment("my_key");
|
||||
json_term_writer.set_str("tokens");
|
||||
assert!(term_stream.advance());
|
||||
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
|
||||
|
||||
json_term_writer.set_str("two");
|
||||
assert!(term_stream.advance());
|
||||
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
|
||||
|
||||
json_term_writer.pop_path_segment();
|
||||
json_term_writer.pop_path_segment();
|
||||
json_term_writer.push_path_segment("signed");
|
||||
json_term_writer.set_fast_value(-2i64);
|
||||
assert!(term_stream.advance());
|
||||
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
|
||||
|
||||
json_term_writer.pop_path_segment();
|
||||
json_term_writer.push_path_segment("toto");
|
||||
json_term_writer.set_str("titi");
|
||||
assert!(term_stream.advance());
|
||||
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
|
||||
|
||||
json_term_writer.pop_path_segment();
|
||||
json_term_writer.push_path_segment("unsigned");
|
||||
json_term_writer.set_fast_value(1u64);
|
||||
assert!(term_stream.advance());
|
||||
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
|
||||
assert!(!term_stream.advance());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_json_tokenized_with_position() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_field = schema_builder.add_json_field("json", STORED | TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let mut doc = Document::default();
|
||||
let json_val: serde_json::Map<String, serde_json::Value> =
|
||||
serde_json::from_str(r#"{"mykey": "repeated token token"}"#).unwrap();
|
||||
doc.add_json_object(json_field, json_val.clone());
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let mut writer = index.writer_for_tests().unwrap();
|
||||
writer.add_document(doc).unwrap();
|
||||
writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
let inv_index = segment_reader.inverted_index(json_field).unwrap();
|
||||
let mut term = Term::new();
|
||||
term.set_field(Type::Json, json_field);
|
||||
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_term_writer.push_path_segment("mykey");
|
||||
json_term_writer.set_str("token");
|
||||
let term_info = inv_index
|
||||
.get_term_info(json_term_writer.term())
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
term_info,
|
||||
TermInfo {
|
||||
doc_freq: 1,
|
||||
postings_range: 2..4,
|
||||
positions_range: 2..5
|
||||
}
|
||||
);
|
||||
let mut postings = inv_index
|
||||
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert_eq!(postings.doc(), 0);
|
||||
assert_eq!(postings.term_freq(), 2);
|
||||
let mut positions = Vec::new();
|
||||
postings.positions(&mut positions);
|
||||
assert_eq!(&positions[..], &[1, 2]);
|
||||
assert_eq!(postings.advance(), TERMINATED);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_json_raw_no_position() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_field = schema_builder.add_json_field("json", STRING);
|
||||
let schema = schema_builder.build();
|
||||
let json_val: serde_json::Map<String, serde_json::Value> =
|
||||
serde_json::from_str(r#"{"mykey": "two tokens"}"#).unwrap();
|
||||
let doc = doc!(json_field=>json_val);
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let mut writer = index.writer_for_tests().unwrap();
|
||||
writer.add_document(doc).unwrap();
|
||||
writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
let inv_index = segment_reader.inverted_index(json_field).unwrap();
|
||||
let mut term = Term::new();
|
||||
term.set_field(Type::Json, json_field);
|
||||
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_term_writer.push_path_segment("mykey");
|
||||
json_term_writer.set_str("two tokens");
|
||||
let term_info = inv_index
|
||||
.get_term_info(json_term_writer.term())
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
term_info,
|
||||
TermInfo {
|
||||
doc_freq: 1,
|
||||
postings_range: 0..1,
|
||||
positions_range: 0..0
|
||||
}
|
||||
);
|
||||
let mut postings = inv_index
|
||||
.read_postings(&term, IndexRecordOption::WithFreqs)
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert_eq!(postings.doc(), 0);
|
||||
assert_eq!(postings.term_freq(), 1);
|
||||
let mut positions = Vec::new();
|
||||
postings.positions(&mut positions);
|
||||
assert_eq!(postings.advance(), TERMINATED);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_position_overlapping_path() {
|
||||
// This test checks that we do not end up detecting phrase query due
|
||||
// to several string literal in the same json object being overlapping.
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_field = schema_builder.add_json_field("json", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
|
||||
r#"{"mykey": [{"field": "hello happy tax payer"}, {"field": "nothello"}]}"#,
|
||||
)
|
||||
.unwrap();
|
||||
let doc = doc!(json_field=>json_val);
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let mut writer = index.writer_for_tests().unwrap();
|
||||
writer.add_document(doc).unwrap();
|
||||
writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let mut term = Term::new();
|
||||
term.set_field(Type::Json, json_field);
|
||||
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_term_writer.push_path_segment("mykey");
|
||||
json_term_writer.push_path_segment("field");
|
||||
json_term_writer.set_str("hello");
|
||||
let hello_term = json_term_writer.term().clone();
|
||||
json_term_writer.set_str("nothello");
|
||||
let nothello_term = json_term_writer.term().clone();
|
||||
json_term_writer.set_str("happy");
|
||||
let happy_term = json_term_writer.term().clone();
|
||||
let phrase_query = PhraseQuery::new(vec![hello_term, happy_term.clone()]);
|
||||
assert_eq!(searcher.search(&phrase_query, &Count).unwrap(), 1);
|
||||
let phrase_query = PhraseQuery::new(vec![nothello_term, happy_term]);
|
||||
assert_eq!(searcher.search(&phrase_query, &Count).unwrap(), 0);
|
||||
}
|
||||
}
|
||||
src/postings/json_postings_writer.rs (new file)
@@ -0,0 +1,94 @@
use std::io;
|
||||
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::postings_writer::SpecializedPostingsWriter;
|
||||
use crate::postings::recorder::{BufferLender, NothingRecorder, Recorder};
|
||||
use crate::postings::stacker::Addr;
|
||||
use crate::postings::{
|
||||
FieldSerializer, IndexingContext, IndexingPosition, PostingsWriter, UnorderedTermId,
|
||||
};
|
||||
use crate::schema::term::as_json_path_type_value_bytes;
|
||||
use crate::schema::Type;
|
||||
use crate::tokenizer::TokenStream;
|
||||
use crate::{DocId, Term};
|
||||
|
||||
#[derive(Default)]
|
||||
pub(crate) struct JsonPostingsWriter<Rec: Recorder> {
|
||||
str_posting_writer: SpecializedPostingsWriter<Rec>,
|
||||
non_str_posting_writer: SpecializedPostingsWriter<NothingRecorder>,
|
||||
}
|
||||
|
||||
impl<Rec: Recorder> From<JsonPostingsWriter<Rec>> for Box<dyn PostingsWriter> {
|
||||
fn from(json_postings_writer: JsonPostingsWriter<Rec>) -> Box<dyn PostingsWriter> {
|
||||
Box::new(json_postings_writer)
|
||||
}
|
||||
}
|
||||
|
||||
impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
|
||||
fn subscribe(
|
||||
&mut self,
|
||||
doc: crate::DocId,
|
||||
pos: u32,
|
||||
term: &crate::Term,
|
||||
ctx: &mut IndexingContext,
|
||||
) -> UnorderedTermId {
|
||||
self.non_str_posting_writer.subscribe(doc, pos, term, ctx)
|
||||
}
|
||||
|
||||
fn index_text(
|
||||
&mut self,
|
||||
doc_id: DocId,
|
||||
token_stream: &mut dyn TokenStream,
|
||||
term_buffer: &mut Term,
|
||||
ctx: &mut IndexingContext,
|
||||
indexing_position: &mut IndexingPosition,
|
||||
) {
|
||||
self.str_posting_writer.index_text(
|
||||
doc_id,
|
||||
token_stream,
|
||||
term_buffer,
|
||||
ctx,
|
||||
indexing_position,
|
||||
);
|
||||
}
|
||||
|
||||
/// The actual serialization format is handled by the `PostingsSerializer`.
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
ctx: &IndexingContext,
|
||||
serializer: &mut FieldSerializer,
|
||||
) -> io::Result<()> {
|
||||
let mut buffer_lender = BufferLender::default();
|
||||
for (term, addr, _) in term_addrs {
|
||||
// TODO optimization opportunity here.
|
||||
if let Some((_, typ, _)) = as_json_path_type_value_bytes(term.value_bytes()) {
|
||||
if typ == Type::Str {
|
||||
SpecializedPostingsWriter::<Rec>::serialize_one_term(
|
||||
term,
|
||||
*addr,
|
||||
doc_id_map,
|
||||
&mut buffer_lender,
|
||||
ctx,
|
||||
serializer,
|
||||
)?;
|
||||
} else {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::serialize_one_term(
|
||||
term,
|
||||
*addr,
|
||||
doc_id_map,
|
||||
&mut buffer_lender,
|
||||
ctx,
|
||||
serializer,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn total_num_tokens(&self) -> u64 {
|
||||
self.str_posting_writer.total_num_tokens() + self.non_str_posting_writer.total_num_tokens()
|
||||
}
|
||||
}
|
||||
@@ -7,6 +7,7 @@ pub(crate) use self::block_search::branchless_binary_search;
|
||||
mod block_segment_postings;
|
||||
pub(crate) mod compression;
|
||||
mod indexing_context;
|
||||
mod json_postings_writer;
|
||||
mod per_field_postings_writer;
|
||||
mod postings;
|
||||
mod postings_writer;
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use crate::postings::json_postings_writer::JsonPostingsWriter;
|
||||
use crate::postings::postings_writer::SpecializedPostingsWriter;
|
||||
use crate::postings::recorder::{NothingRecorder, TermFrequencyRecorder, TfAndPositionRecorder};
|
||||
use crate::postings::PostingsWriter;
|
||||
@@ -33,21 +34,38 @@ fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box<dyn Postings
|
||||
.get_indexing_options()
|
||||
.map(|indexing_options| match indexing_options.index_option() {
|
||||
IndexRecordOption::Basic => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
|
||||
SpecializedPostingsWriter::<NothingRecorder>::default().into()
|
||||
}
|
||||
IndexRecordOption::WithFreqs => {
|
||||
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed()
|
||||
SpecializedPostingsWriter::<TermFrequencyRecorder>::default().into()
|
||||
}
|
||||
IndexRecordOption::WithFreqsAndPositions => {
|
||||
SpecializedPostingsWriter::<TfAndPositionRecorder>::new_boxed()
|
||||
SpecializedPostingsWriter::<TfAndPositionRecorder>::default().into()
|
||||
}
|
||||
})
|
||||
.unwrap_or_else(SpecializedPostingsWriter::<NothingRecorder>::new_boxed),
|
||||
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::default().into()),
|
||||
FieldType::U64(_)
|
||||
| FieldType::I64(_)
|
||||
| FieldType::F64(_)
|
||||
| FieldType::Date(_)
|
||||
| FieldType::Bytes(_)
|
||||
| FieldType::Facet(_) => SpecializedPostingsWriter::<NothingRecorder>::new_boxed(),
|
||||
| FieldType::Facet(_) => Box::new(SpecializedPostingsWriter::<NothingRecorder>::default()),
|
||||
FieldType::JsonObject(ref json_object_options) => {
|
||||
if let Some(text_indexing_option) = json_object_options.get_text_indexing_options() {
|
||||
match text_indexing_option.index_option() {
|
||||
IndexRecordOption::Basic => {
|
||||
JsonPostingsWriter::<NothingRecorder>::default().into()
|
||||
}
|
||||
IndexRecordOption::WithFreqs => {
|
||||
JsonPostingsWriter::<TermFrequencyRecorder>::default().into()
|
||||
}
|
||||
IndexRecordOption::WithFreqsAndPositions => {
|
||||
JsonPostingsWriter::<TfAndPositionRecorder>::default().into()
|
||||
}
|
||||
}
|
||||
} else {
|
||||
JsonPostingsWriter::<NothingRecorder>::default().into()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,7 +13,7 @@ use crate::postings::{
|
||||
FieldSerializer, IndexingContext, InvertedIndexSerializer, PerFieldPostingsWriter,
|
||||
UnorderedTermId,
|
||||
};
|
||||
use crate::schema::{Field, FieldType, Schema, Term, Type};
|
||||
use crate::schema::{Field, FieldType, Schema, Term};
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::tokenizer::{Token, TokenStream, MAX_TOKEN_LEN};
|
||||
use crate::DocId;
|
||||
@@ -85,6 +85,7 @@ pub(crate) fn serialize_postings(
|
||||
}
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) | FieldType::Date(_) => {}
|
||||
FieldType::Bytes(_) => {}
|
||||
FieldType::JsonObject(_) => {}
|
||||
}
|
||||
|
||||
let postings_writer = per_field_postings_writers.get_for_field(field);
|
||||
@@ -142,13 +143,12 @@ pub(crate) trait PostingsWriter {
|
||||
fn index_text(
|
||||
&mut self,
|
||||
doc_id: DocId,
|
||||
field: Field,
|
||||
token_stream: &mut dyn TokenStream,
|
||||
term_buffer: &mut Term,
|
||||
ctx: &mut IndexingContext,
|
||||
indexing_position: &mut IndexingPosition,
|
||||
) {
|
||||
term_buffer.set_field(Type::Str, field);
|
||||
let end_of_path_idx = term_buffer.as_slice().len();
|
||||
let mut num_tokens = 0;
|
||||
let mut end_position = 0;
|
||||
token_stream.process(&mut |token: &Token| {
|
||||
@@ -162,7 +162,8 @@ pub(crate) trait PostingsWriter {
|
||||
);
|
||||
return;
|
||||
}
|
||||
term_buffer.set_text(token.text.as_str());
|
||||
term_buffer.truncate(end_of_path_idx);
|
||||
term_buffer.append_bytes(token.text.as_bytes());
|
||||
let start_position = indexing_position.end_position + token.position as u32;
|
||||
end_position = start_position + token.position_length as u32;
|
||||
self.subscribe(doc_id, start_position, term_buffer, ctx);
|
||||
@@ -170,6 +171,7 @@ pub(crate) trait PostingsWriter {
|
||||
});
|
||||
indexing_position.end_position = end_position + POSITION_GAP;
|
||||
indexing_position.num_tokens += num_tokens;
|
||||
term_buffer.truncate(end_of_path_idx);
|
||||
}
|
||||
|
||||
fn total_num_tokens(&self) -> u64;
|
||||
@@ -177,27 +179,40 @@ pub(crate) trait PostingsWriter {
|
||||
|
||||
/// The `SpecializedPostingsWriter` is just here to remove dynamic
|
||||
/// dispatch to the recorder information.
|
||||
pub(crate) struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
|
||||
#[derive(Default)]
|
||||
pub(crate) struct SpecializedPostingsWriter<Rec: Recorder> {
|
||||
total_num_tokens: u64,
|
||||
_recorder_type: PhantomData<Rec>,
|
||||
}
|
||||
|
||||
impl<Rec: Recorder + 'static> SpecializedPostingsWriter<Rec> {
|
||||
/// constructor
|
||||
pub fn new() -> SpecializedPostingsWriter<Rec> {
|
||||
SpecializedPostingsWriter {
|
||||
total_num_tokens: 0u64,
|
||||
_recorder_type: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds a `SpecializedPostingsWriter` storing its data in a memory arena.
|
||||
pub fn new_boxed() -> Box<dyn PostingsWriter> {
|
||||
Box::new(SpecializedPostingsWriter::<Rec>::new())
|
||||
impl<Rec: Recorder> From<SpecializedPostingsWriter<Rec>> for Box<dyn PostingsWriter> {
|
||||
fn from(
|
||||
specialized_postings_writer: SpecializedPostingsWriter<Rec>,
|
||||
) -> Box<dyn PostingsWriter> {
|
||||
Box::new(specialized_postings_writer)
|
||||
}
|
||||
}
|
||||
|
||||
impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec> {
|
||||
impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
|
||||
#[inline]
|
||||
pub(crate) fn serialize_one_term(
|
||||
term: &Term<&[u8]>,
|
||||
addr: Addr,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
buffer_lender: &mut BufferLender,
|
||||
ctx: &IndexingContext,
|
||||
serializer: &mut FieldSerializer,
|
||||
) -> io::Result<()> {
|
||||
let recorder: Rec = ctx.term_index.read(addr);
|
||||
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
|
||||
serializer.new_term(term.value_bytes(), term_doc_freq)?;
|
||||
recorder.serialize(&ctx.arena, doc_id_map, serializer, buffer_lender);
|
||||
serializer.close_term()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
|
||||
fn subscribe(
|
||||
&mut self,
|
||||
doc: DocId,
|
||||
@@ -218,7 +233,7 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
|
||||
recorder.record_position(position, arena);
|
||||
recorder
|
||||
} else {
|
||||
let mut recorder = Rec::new();
|
||||
let mut recorder = Rec::default();
|
||||
recorder.new_doc(doc, arena);
|
||||
recorder.record_position(position, arena);
|
||||
recorder
|
||||
@@ -235,11 +250,7 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
|
||||
) -> io::Result<()> {
|
||||
let mut buffer_lender = BufferLender::default();
|
||||
for (term, addr, _) in term_addrs {
|
||||
let recorder: Rec = ctx.term_index.read(*addr);
|
||||
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
|
||||
serializer.new_term(term.value_bytes(), term_doc_freq)?;
|
||||
recorder.serialize(&ctx.arena, doc_id_map, serializer, &mut buffer_lender);
|
||||
serializer.close_term()?;
|
||||
Self::serialize_one_term(term, *addr, doc_id_map, &mut buffer_lender, ctx, serializer)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -56,9 +56,7 @@ impl<'a> Iterator for VInt32Reader<'a> {
|
||||
/// * the document id
|
||||
/// * the term frequency
|
||||
/// * the term positions
|
||||
pub(crate) trait Recorder: Copy + 'static {
|
||||
///
|
||||
fn new() -> Self;
|
||||
pub(crate) trait Recorder: Copy + Default + 'static {
|
||||
/// Returns the current document
|
||||
fn current_doc(&self) -> u32;
|
||||
/// Starts recording information about a new document
|
||||
@@ -90,14 +88,16 @@ pub struct NothingRecorder {
|
||||
current_doc: DocId,
|
||||
}
|
||||
|
||||
impl Recorder for NothingRecorder {
|
||||
fn new() -> Self {
|
||||
impl Default for NothingRecorder {
|
||||
fn default() -> Self {
|
||||
NothingRecorder {
|
||||
stack: ExpUnrolledLinkedList::new(),
|
||||
current_doc: u32::max_value(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Recorder for NothingRecorder {
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
@@ -152,8 +152,8 @@ pub struct TermFrequencyRecorder {
|
||||
term_doc_freq: u32,
|
||||
}
|
||||
|
||||
impl Recorder for TermFrequencyRecorder {
|
||||
fn new() -> Self {
|
||||
impl Default for TermFrequencyRecorder {
|
||||
fn default() -> Self {
|
||||
TermFrequencyRecorder {
|
||||
stack: ExpUnrolledLinkedList::new(),
|
||||
current_doc: 0,
|
||||
@@ -161,7 +161,9 @@ impl Recorder for TermFrequencyRecorder {
|
||||
term_doc_freq: 0u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Recorder for TermFrequencyRecorder {
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
@@ -223,15 +225,18 @@ pub struct TfAndPositionRecorder {
|
||||
current_doc: DocId,
|
||||
term_doc_freq: u32,
|
||||
}
|
||||
impl Recorder for TfAndPositionRecorder {
|
||||
fn new() -> Self {
|
||||
|
||||
impl Default for TfAndPositionRecorder {
|
||||
fn default() -> Self {
|
||||
TfAndPositionRecorder {
|
||||
stack: ExpUnrolledLinkedList::new(),
|
||||
current_doc: u32::max_value(),
|
||||
term_doc_freq: 0u32,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Recorder for TfAndPositionRecorder {
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
|
||||
@@ -76,7 +76,7 @@ impl InvertedIndexSerializer {
|
||||
field: Field,
|
||||
total_num_tokens: u64,
|
||||
fieldnorm_reader: Option<FieldNormReader>,
|
||||
) -> io::Result<FieldSerializer<'_>> {
|
||||
) -> io::Result<FieldSerializer> {
|
||||
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
|
||||
let term_dictionary_write = self.terms_write.for_field(field);
|
||||
let postings_write = self.postings_write.for_field(field);
|
||||
@@ -122,24 +122,21 @@ impl<'a> FieldSerializer<'a> {
|
||||
fieldnorm_reader: Option<FieldNormReader>,
|
||||
) -> io::Result<FieldSerializer<'a>> {
|
||||
total_num_tokens.serialize(postings_write)?;
|
||||
let mode = match field_type {
|
||||
FieldType::Str(ref text_options) => {
|
||||
if let Some(text_indexing_options) = text_options.get_indexing_options() {
|
||||
text_indexing_options.index_option()
|
||||
} else {
|
||||
IndexRecordOption::Basic
|
||||
}
|
||||
}
|
||||
_ => IndexRecordOption::Basic,
|
||||
};
|
||||
let index_record_option = field_type
|
||||
.index_record_option()
|
||||
.unwrap_or(IndexRecordOption::Basic);
|
||||
let term_dictionary_builder = TermDictionaryBuilder::create(term_dictionary_write)?;
|
||||
let average_fieldnorm = fieldnorm_reader
|
||||
.as_ref()
|
||||
.map(|ff_reader| (total_num_tokens as Score / ff_reader.num_docs() as Score))
|
||||
.unwrap_or(0.0);
|
||||
let postings_serializer =
|
||||
PostingsSerializer::new(postings_write, average_fieldnorm, mode, fieldnorm_reader);
|
||||
let positions_serializer_opt = if mode.has_positions() {
|
||||
let postings_serializer = PostingsSerializer::new(
|
||||
postings_write,
|
||||
average_fieldnorm,
|
||||
index_record_option,
|
||||
fieldnorm_reader,
|
||||
);
|
||||
let positions_serializer_opt = if index_record_option.has_positions() {
|
||||
Some(PositionSerializer::new(positions_write))
|
||||
} else {
|
||||
None
|
||||
@@ -203,6 +200,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
self.current_term_info.doc_freq += 1;
|
||||
self.postings_serializer.write_doc(doc_id, term_freq);
|
||||
if let Some(ref mut positions_serializer) = self.positions_serializer_opt.as_mut() {
|
||||
assert_eq!(term_freq as usize, position_deltas.len());
|
||||
positions_serializer.write_positions_delta(position_deltas);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -204,8 +204,8 @@ impl BooleanQuery {
#[cfg(test)]
mod tests {
use super::BooleanQuery;
use crate::collector::DocSetCollector;
use crate::query::{QueryClone, TermQuery};
use crate::collector::{Count, DocSetCollector};
use crate::query::{QueryClone, QueryParser, TermQuery};
use crate::schema::{IndexRecordOption, Schema, TEXT};
use crate::{DocAddress, Index, Term};

@@ -282,4 +282,42 @@ mod tests {
}
Ok(())
}

#[test]
pub fn test_json_array_pitfall_bag_of_terms() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(json_field=>json!({
"cart": [
{"product_type": "sneakers", "attributes": {"color": "white"}},
{"product_type": "t-shirt", "attributes": {"color": "red"}},
{"product_type": "cd", "attributes": {"genre": "blues"}},
]
})))?;
index_writer.commit()?;
}
let searcher = index.reader()?.searcher();
let doc_matches = |query: &str| {
let query_parser = QueryParser::for_index(&index, vec![json_field]);
let query = query_parser.parse_query(query).unwrap();
searcher.search(&query, &Count).unwrap() == 1
};
// As expected
assert!(doc_matches(
r#"cart.product_type:sneakers AND cart.attributes.color:white"#
));
// Unexpected match, due to the fact that arrays do not act as nested docs.
assert!(doc_matches(
r#"cart.product_type:sneakers AND cart.attributes.color:red"#
));
// However, obviously this works...
assert!(!doc_matches(
r#"cart.product_type:sneakers AND cart.attributes.color:blues"#
));
Ok(())
}
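// Editorial note (not part of the commit): the three array items above are flattened into a
// single bag of terms on the same document, roughly
//   cart.product_type     -> {sneakers, t-shirt, cd}
//   cart.attributes.color -> {white, red}
//   cart.attributes.genre -> {blues}
// so any cross-item combination of these terms matches. That is why the second query above
// still finds the document, while the third one does not: "blues" never occurs under
// cart.attributes.color.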
}

@@ -9,10 +9,12 @@ pub use self::phrase_weight::PhraseWeight;
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
use serde_json::json;
|
||||
|
||||
use super::*;
|
||||
use crate::collector::tests::{TEST_COLLECTOR_WITHOUT_SCORE, TEST_COLLECTOR_WITH_SCORE};
|
||||
use crate::core::Index;
|
||||
use crate::query::Weight;
|
||||
use crate::query::{QueryParser, Weight};
|
||||
use crate::schema::{Schema, Term, TEXT};
|
||||
use crate::{assert_nearly_equals, DocAddress, DocId, TERMINATED};
|
||||
|
||||
@@ -248,4 +250,56 @@ pub mod tests {
|
||||
assert_eq!(test_query(vec![(1, "a"), (3, "c")]), vec![0]);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_query_on_json() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json_field = schema_builder.add_json_field("json", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(json_field=>json!({
|
||||
"text": "elliot smith the happy who"
|
||||
})))?;
|
||||
index_writer.add_document(doc!(json_field=>json!({
|
||||
"text": "the who elliot smith"
|
||||
})))?;
|
||||
index_writer.add_document(doc!(json_field=>json!({
|
||||
"arr": [{"text":"the who"}, {"text":"elliot smith"}]
|
||||
})))?;
|
||||
index_writer.add_document(doc!(json_field=>json!({
|
||||
"text2": "the smith"
|
||||
})))?;
|
||||
index_writer.commit()?;
|
||||
}
|
||||
let searcher = index.reader()?.searcher();
|
||||
let matching_docs = |query: &str| {
|
||||
let query_parser = QueryParser::for_index(&index, vec![json_field]);
|
||||
let phrase_query = query_parser.parse_query(query).unwrap();
|
||||
let phrase_weight = phrase_query.weight(&*searcher, false).unwrap();
|
||||
let mut phrase_scorer = phrase_weight
|
||||
.scorer(searcher.segment_reader(0), 1.0f32)
|
||||
.unwrap();
|
||||
let mut docs = Vec::new();
|
||||
loop {
|
||||
let doc = phrase_scorer.doc();
|
||||
if doc == TERMINATED {
|
||||
break;
|
||||
}
|
||||
docs.push(doc);
|
||||
phrase_scorer.advance();
|
||||
}
|
||||
docs
|
||||
};
|
||||
assert!(matching_docs(r#"text:"the smith""#).is_empty());
|
||||
assert_eq!(&matching_docs(r#"text:the"#), &[0u32, 1u32]);
|
||||
assert_eq!(&matching_docs(r#"text:"the""#), &[0u32, 1u32]);
|
||||
assert_eq!(&matching_docs(r#"text:"smith""#), &[0u32, 1u32]);
|
||||
assert_eq!(&matching_docs(r#"text:"elliot smith""#), &[0u32, 1u32]);
|
||||
assert_eq!(&matching_docs(r#"text2:"the smith""#), &[3u32]);
|
||||
assert!(&matching_docs(r#"arr.text:"the smith""#).is_empty());
|
||||
assert_eq!(&matching_docs(r#"arr.text:"elliot smith""#), &[2]);
|
||||
Ok(())
|
||||
}
|
||||
}

@@ -31,10 +31,11 @@ impl PhraseWeight {
fn fieldnorm_reader(&self, reader: &SegmentReader) -> crate::Result<FieldNormReader> {
let field = self.phrase_terms[0].1.field();
if self.scoring_enabled {
reader.get_fieldnorms_reader(field)
} else {
Ok(FieldNormReader::constant(reader.max_doc(), 1))
if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(field)? {
return Ok(fieldnorm_reader);
}
}
Ok(FieldNormReader::constant(reader.max_doc(), 1))
}
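// Editorial note (not part of the commit): the rewritten method only uses a per-field fieldnorm
// reader when one actually exists and scoring is enabled; otherwise it falls back to a constant
// fieldnorm of 1 instead of requesting a reader unconditionally. This matters for fields that do
// not record fieldnorms, such as the new json fields (their FieldType reports fieldnorms() ==
// false later in this diff).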

fn phrase_scorer(

File diff suppressed because it is too large
@@ -174,7 +174,7 @@ mod tests {
|
||||
);
|
||||
assert_eq!(
|
||||
format!("{:?}", term_query),
|
||||
r#"TermQuery(Term(type=Str, field=1, val="hello"))"#
|
||||
r#"TermQuery(Term(type=Str, field=1, "hello"))"#
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -117,7 +117,16 @@ impl Document {

/// Add a bytes field
pub fn add_bytes<T: Into<Vec<u8>>>(&mut self, field: Field, value: T) {
self.add_field_value(field, value.into())
self.add_field_value(field, value.into());
}

/// Add a json object field
pub fn add_json_object(
&mut self,
field: Field,
json_object: serde_json::Map<String, serde_json::Value>,
) {
self.add_field_value(field, json_object);
}

/// Add a (field, value) to the document.

@@ -1,7 +1,9 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::schema::bytes_options::BytesOptions;
|
||||
use crate::schema::{is_valid_field_name, FacetOptions, FieldType, NumericOptions, TextOptions};
|
||||
use crate::schema::{
|
||||
is_valid_field_name, FacetOptions, FieldType, JsonObjectOptions, NumericOptions, TextOptions,
|
||||
};
|
||||
|
||||
/// A `FieldEntry` represents a field and its configuration.
|
||||
/// `Schema` are a collection of `FieldEntry`
|
||||
@@ -27,71 +29,44 @@ impl FieldEntry {
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new u64 field entry in the schema, given
|
||||
/// a name, and some options.
|
||||
/// Creates a new text field entry.
|
||||
pub fn new_text(field_name: String, text_options: TextOptions) -> FieldEntry {
|
||||
assert!(is_valid_field_name(&field_name));
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::Str(text_options),
|
||||
}
|
||||
Self::new(field_name, FieldType::Str(text_options))
|
||||
}
|
||||
|
||||
/// Creates a new u64 field entry in the schema, given
|
||||
/// a name, and some options.
|
||||
pub fn new_u64(field_name: String, field_type: NumericOptions) -> FieldEntry {
|
||||
assert!(is_valid_field_name(&field_name));
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::U64(field_type),
|
||||
}
|
||||
/// Creates a new u64 field entry.
|
||||
pub fn new_u64(field_name: String, int_options: NumericOptions) -> FieldEntry {
|
||||
Self::new(field_name, FieldType::U64(int_options))
|
||||
}
|
||||
|
||||
/// Creates a new i64 field entry in the schema, given
|
||||
/// a name, and some options.
|
||||
pub fn new_i64(field_name: String, field_type: NumericOptions) -> FieldEntry {
|
||||
assert!(is_valid_field_name(&field_name));
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::I64(field_type),
|
||||
}
|
||||
/// Creates a new i64 field entry.
|
||||
pub fn new_i64(field_name: String, int_options: NumericOptions) -> FieldEntry {
|
||||
Self::new(field_name, FieldType::I64(int_options))
|
||||
}
|
||||
|
||||
/// Creates a new f64 field entry in the schema, given
|
||||
/// a name, and some options.
|
||||
pub fn new_f64(field_name: String, field_type: NumericOptions) -> FieldEntry {
|
||||
assert!(is_valid_field_name(&field_name));
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::F64(field_type),
|
||||
}
|
||||
/// Creates a new f64 field entry.
|
||||
pub fn new_f64(field_name: String, f64_options: NumericOptions) -> FieldEntry {
|
||||
Self::new(field_name, FieldType::F64(f64_options))
|
||||
}
|
||||
|
||||
/// Creates a new date field entry in the schema, given
|
||||
/// a name, and some options.
|
||||
pub fn new_date(field_name: String, field_type: NumericOptions) -> FieldEntry {
|
||||
assert!(is_valid_field_name(&field_name));
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::Date(field_type),
|
||||
}
|
||||
/// Creates a new date field entry.
|
||||
pub fn new_date(field_name: String, date_options: NumericOptions) -> FieldEntry {
|
||||
Self::new(field_name, FieldType::Date(date_options))
|
||||
}
|
||||
|
||||
/// Creates a field entry for a facet.
|
||||
pub fn new_facet(field_name: String, field_type: FacetOptions) -> FieldEntry {
|
||||
assert!(is_valid_field_name(&field_name));
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::Facet(field_type),
|
||||
}
|
||||
pub fn new_facet(field_name: String, facet_options: FacetOptions) -> FieldEntry {
|
||||
Self::new(field_name, FieldType::Facet(facet_options))
|
||||
}
|
||||
|
||||
/// Creates a field entry for a bytes field
|
||||
pub fn new_bytes(field_name: String, bytes_type: BytesOptions) -> FieldEntry {
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::Bytes(bytes_type),
|
||||
}
|
||||
pub fn new_bytes(field_name: String, bytes_options: BytesOptions) -> FieldEntry {
|
||||
Self::new(field_name, FieldType::Bytes(bytes_options))
|
||||
}
|
||||
|
||||
/// Creates a field entry for a json field
|
||||
pub fn new_json(field_name: String, json_object_options: JsonObjectOptions) -> FieldEntry {
|
||||
Self::new(field_name, FieldType::JsonObject(json_object_options))
|
||||
}
|
||||
|
||||
/// Returns the name of the field
|
||||
@@ -137,6 +112,7 @@ impl FieldEntry {
|
||||
FieldType::Str(ref options) => options.is_stored(),
|
||||
FieldType::Facet(ref options) => options.is_stored(),
|
||||
FieldType::Bytes(ref options) => options.is_stored(),
|
||||
FieldType::JsonObject(ref options) => options.is_stored(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,8 @@ use thiserror::Error;
|
||||
use crate::schema::bytes_options::BytesOptions;
|
||||
use crate::schema::facet_options::FacetOptions;
|
||||
use crate::schema::{
|
||||
Facet, IndexRecordOption, NumericOptions, TextFieldIndexing, TextOptions, Value,
|
||||
Facet, IndexRecordOption, JsonObjectOptions, NumericOptions, TextFieldIndexing, TextOptions,
|
||||
Value,
|
||||
};
|
||||
use crate::tokenizer::PreTokenizedString;
|
||||
|
||||
@@ -49,9 +50,11 @@ pub enum Type {
|
||||
Facet = b'h',
|
||||
/// `Vec<u8>`
|
||||
Bytes = b'b',
|
||||
/// Leaf in a Json object.
|
||||
Json = b'j',
|
||||
}
|
||||
|
||||
const ALL_TYPES: [Type; 7] = [
|
||||
const ALL_TYPES: [Type; 8] = [
|
||||
Type::Str,
|
||||
Type::U64,
|
||||
Type::I64,
|
||||
@@ -59,6 +62,7 @@ const ALL_TYPES: [Type; 7] = [
|
||||
Type::Date,
|
||||
Type::Facet,
|
||||
Type::Bytes,
|
||||
Type::Json,
|
||||
];
|
||||
|
||||
impl Type {
|
||||
@@ -83,6 +87,7 @@ impl Type {
|
||||
Type::Date => "Date",
|
||||
Type::Facet => "Facet",
|
||||
Type::Bytes => "Bytes",
|
||||
Type::Json => "Json",
|
||||
}
|
||||
}
|
||||
|
||||
@@ -97,6 +102,7 @@ impl Type {
|
||||
b'd' => Some(Type::Date),
|
||||
b'h' => Some(Type::Facet),
|
||||
b'b' => Some(Type::Bytes),
|
||||
b'j' => Some(Type::Json),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
@@ -123,6 +129,8 @@ pub enum FieldType {
|
||||
Facet(FacetOptions),
|
||||
/// Bytes (one per document)
|
||||
Bytes(BytesOptions),
|
||||
/// Json object
|
||||
JsonObject(JsonObjectOptions),
|
||||
}
|
||||
|
||||
impl FieldType {
|
||||
@@ -136,6 +144,7 @@ impl FieldType {
|
||||
FieldType::Date(_) => Type::Date,
|
||||
FieldType::Facet(_) => Type::Facet,
|
||||
FieldType::Bytes(_) => Type::Bytes,
|
||||
FieldType::JsonObject(_) => Type::Json,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -149,6 +158,7 @@ impl FieldType {
|
||||
FieldType::Date(ref date_options) => date_options.is_indexed(),
|
||||
FieldType::Facet(ref _facet_options) => true,
|
||||
FieldType::Bytes(ref bytes_options) => bytes_options.is_indexed(),
|
||||
FieldType::JsonObject(ref json_object_options) => json_object_options.is_indexed(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -160,6 +170,9 @@ impl FieldType {
|
||||
FieldType::Str(text_options) => text_options
|
||||
.get_indexing_options()
|
||||
.map(|text_indexing| text_indexing.index_option()),
|
||||
FieldType::JsonObject(json_object_options) => json_object_options
|
||||
.get_text_indexing_options()
|
||||
.map(|text_indexing| text_indexing.index_option()),
|
||||
field_type => {
|
||||
if field_type.is_indexed() {
|
||||
Some(IndexRecordOption::Basic)
|
||||
@@ -183,6 +196,7 @@ impl FieldType {
|
||||
| FieldType::Date(ref int_options) => int_options.fieldnorms(),
|
||||
FieldType::Facet(_) => false,
|
||||
FieldType::Bytes(ref bytes_options) => bytes_options.fieldnorms(),
|
||||
FieldType::JsonObject(ref _json_object_options) => false,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -217,6 +231,9 @@ impl FieldType {
|
||||
None
|
||||
}
|
||||
}
|
||||
FieldType::JsonObject(ref json_obj_options) => json_obj_options
|
||||
.get_text_indexing_options()
|
||||
.map(TextFieldIndexing::index_option),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -225,41 +242,43 @@ impl FieldType {
|
||||
/// Tantivy will not try to cast values.
|
||||
/// For instance, If the json value is the integer `3` and the
|
||||
/// target field is a `Str`, this method will return an Error.
|
||||
pub fn value_from_json(&self, json: &JsonValue) -> Result<Value, ValueParsingError> {
|
||||
match *json {
|
||||
JsonValue::String(ref field_text) => match *self {
|
||||
pub fn value_from_json(&self, json: JsonValue) -> Result<Value, ValueParsingError> {
|
||||
match json {
|
||||
JsonValue::String(field_text) => match *self {
|
||||
FieldType::Date(_) => {
|
||||
let dt_with_fixed_tz: chrono::DateTime<FixedOffset> =
|
||||
chrono::DateTime::parse_from_rfc3339(field_text).map_err(|_err| {
|
||||
chrono::DateTime::parse_from_rfc3339(&field_text).map_err(|_err| {
|
||||
ValueParsingError::TypeError {
|
||||
expected: "rfc3339 format",
|
||||
json: json.clone(),
|
||||
json: JsonValue::String(field_text),
|
||||
}
|
||||
})?;
|
||||
Ok(Value::Date(dt_with_fixed_tz.with_timezone(&Utc)))
|
||||
}
|
||||
FieldType::Str(_) => Ok(Value::Str(field_text.clone())),
|
||||
FieldType::Str(_) => Ok(Value::Str(field_text)),
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => {
|
||||
Err(ValueParsingError::TypeError {
|
||||
expected: "an integer",
|
||||
json: json.clone(),
|
||||
json: JsonValue::String(field_text),
|
||||
})
|
||||
}
|
||||
FieldType::Facet(_) => Ok(Value::Facet(Facet::from(field_text))),
|
||||
FieldType::Bytes(_) => base64::decode(field_text).map(Value::Bytes).map_err(|_| {
|
||||
ValueParsingError::InvalidBase64 {
|
||||
base64: field_text.clone(),
|
||||
}
|
||||
FieldType::Facet(_) => Ok(Value::Facet(Facet::from(&field_text))),
|
||||
FieldType::Bytes(_) => base64::decode(&field_text)
|
||||
.map(Value::Bytes)
|
||||
.map_err(|_| ValueParsingError::InvalidBase64 { base64: field_text }),
|
||||
FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
|
||||
expected: "a json object",
|
||||
json: JsonValue::String(field_text),
|
||||
}),
|
||||
},
|
||||
JsonValue::Number(ref field_val_num) => match *self {
|
||||
JsonValue::Number(field_val_num) => match self {
|
||||
FieldType::I64(_) | FieldType::Date(_) => {
|
||||
if let Some(field_val_i64) = field_val_num.as_i64() {
|
||||
Ok(Value::I64(field_val_i64))
|
||||
} else {
|
||||
Err(ValueParsingError::OverflowError {
|
||||
expected: "an i64 int",
|
||||
json: json.clone(),
|
||||
json: JsonValue::Number(field_val_num),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -269,7 +288,7 @@ impl FieldType {
|
||||
} else {
|
||||
Err(ValueParsingError::OverflowError {
|
||||
expected: "u64",
|
||||
json: json.clone(),
|
||||
json: JsonValue::Number(field_val_num),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -279,33 +298,38 @@ impl FieldType {
|
||||
} else {
|
||||
Err(ValueParsingError::OverflowError {
|
||||
expected: "a f64",
|
||||
json: json.clone(),
|
||||
json: JsonValue::Number(field_val_num),
|
||||
})
|
||||
}
|
||||
}
|
||||
FieldType::Str(_) | FieldType::Facet(_) | FieldType::Bytes(_) => {
|
||||
Err(ValueParsingError::TypeError {
|
||||
expected: "a string",
|
||||
json: json.clone(),
|
||||
json: JsonValue::Number(field_val_num),
|
||||
})
|
||||
}
|
||||
FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
|
||||
expected: "a json object",
|
||||
json: JsonValue::Number(field_val_num),
|
||||
}),
|
||||
},
|
||||
JsonValue::Object(_) => match *self {
|
||||
JsonValue::Object(json_map) => match self {
|
||||
FieldType::Str(_) => {
|
||||
if let Ok(tok_str_val) =
|
||||
serde_json::from_value::<PreTokenizedString>(json.clone())
|
||||
{
|
||||
if let Ok(tok_str_val) = serde_json::from_value::<PreTokenizedString>(
|
||||
serde_json::Value::Object(json_map.clone()),
|
||||
) {
|
||||
Ok(Value::PreTokStr(tok_str_val))
|
||||
} else {
|
||||
Err(ValueParsingError::TypeError {
|
||||
expected: "a string or an pretokenized string",
|
||||
json: json.clone(),
|
||||
json: JsonValue::Object(json_map),
|
||||
})
|
||||
}
|
||||
}
|
||||
FieldType::JsonObject(_) => Ok(Value::JsonObject(json_map)),
|
||||
_ => Err(ValueParsingError::TypeError {
|
||||
expected: self.value_type().name(),
|
||||
json: json.clone(),
|
||||
json: JsonValue::Object(json_map),
|
||||
}),
|
||||
},
|
||||
_ => Err(ValueParsingError::TypeError {
|
||||
@@ -319,6 +343,7 @@ impl FieldType {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use chrono::{NaiveDate, NaiveDateTime, NaiveTime, Utc};
|
||||
use serde_json::json;
|
||||
|
||||
use super::FieldType;
|
||||
use crate::schema::field_type::ValueParsingError;
|
||||
@@ -354,17 +379,17 @@ mod tests {
|
||||
#[test]
|
||||
fn test_bytes_value_from_json() {
|
||||
let result = FieldType::Bytes(Default::default())
|
||||
.value_from_json(&json!("dGhpcyBpcyBhIHRlc3Q="))
|
||||
.value_from_json(json!("dGhpcyBpcyBhIHRlc3Q="))
|
||||
.unwrap();
|
||||
assert_eq!(result, Value::Bytes("this is a test".as_bytes().to_vec()));
|
||||
|
||||
let result = FieldType::Bytes(Default::default()).value_from_json(&json!(521));
|
||||
let result = FieldType::Bytes(Default::default()).value_from_json(json!(521));
|
||||
match result {
|
||||
Err(ValueParsingError::TypeError { .. }) => {}
|
||||
_ => panic!("Expected parse failure for wrong type"),
|
||||
}
|
||||
|
||||
let result = FieldType::Bytes(Default::default()).value_from_json(&json!("-"));
|
||||
let result = FieldType::Bytes(Default::default()).value_from_json(json!("-"));
|
||||
match result {
|
||||
Err(ValueParsingError::InvalidBase64 { .. }) => {}
|
||||
_ => panic!("Expected parse failure for invalid base64"),
|
||||
@@ -428,7 +453,7 @@ mod tests {
|
||||
});
|
||||
|
||||
let deserialized_value = FieldType::Str(TextOptions::default())
|
||||
.value_from_json(&serde_json::from_str(pre_tokenized_string_json).unwrap())
|
||||
.value_from_json(serde_json::from_str(pre_tokenized_string_json).unwrap())
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(deserialized_value, expected_value);
|
||||
|
||||
src/schema/json_object_options.rs (new file)
@@ -0,0 +1,109 @@
use std::ops::BitOr;

use serde::{Deserialize, Serialize};

use crate::schema::flags::{SchemaFlagList, StoredFlag};
use crate::schema::{TextFieldIndexing, TextOptions};

/// The `JsonObjectOptions` makes it possible to
/// configure how a json object field should be indexed and stored.
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct JsonObjectOptions {
stored: bool,
// If set to `Some`, int, date, f64 and text will be indexed.
// Text will use the TextFieldIndexing setting for indexing.
indexing: Option<TextFieldIndexing>,
}

impl JsonObjectOptions {
/// Returns `true` iff the json object should be stored.
pub fn is_stored(&self) -> bool {
self.stored
}

/// Returns `true` iff the json object should be indexed.
pub fn is_indexed(&self) -> bool {
self.indexing.is_some()
}

/// Returns the text indexing options.
///
/// If set to `Some`, then both int and str values will be indexed.
/// The inner `TextFieldIndexing` will, however, only apply to the str values
/// in the json object.
pub fn get_text_indexing_options(&self) -> Option<&TextFieldIndexing> {
self.indexing.as_ref()
}
}

impl From<StoredFlag> for JsonObjectOptions {
fn from(_stored_flag: StoredFlag) -> Self {
JsonObjectOptions {
stored: true,
indexing: None,
}
}
}

impl From<()> for JsonObjectOptions {
fn from(_: ()) -> Self {
Self::default()
}
}

impl<T: Into<JsonObjectOptions>> BitOr<T> for JsonObjectOptions {
type Output = JsonObjectOptions;

fn bitor(self, other: T) -> Self {
let other = other.into();
JsonObjectOptions {
indexing: self.indexing.or(other.indexing),
stored: self.stored | other.stored,
}
}
}

impl<Head, Tail> From<SchemaFlagList<Head, Tail>> for JsonObjectOptions
where
Head: Clone,
Tail: Clone,
Self: BitOr<Output = Self> + From<Head> + From<Tail>,
{
fn from(head_tail: SchemaFlagList<Head, Tail>) -> Self {
Self::from(head_tail.head) | Self::from(head_tail.tail)
}
}

impl From<TextOptions> for JsonObjectOptions {
fn from(text_options: TextOptions) -> Self {
JsonObjectOptions {
stored: text_options.is_stored(),
indexing: text_options.get_indexing_options().cloned(),
}
}
}
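// Editorial sketch (not part of the commit): the conversions above are what make the usual flag
// syntax work when declaring a json field; `STORED | TEXT` folds into a JsonObjectOptions that
// stores the object and tokenizes its str leaves, while int/f64/date leaves are indexed as plain
// values (see the struct comment above). Assumes the crate is used as a dependency named `tantivy`.
use tantivy::schema::{Schema, STORED, TEXT};

fn main() {
    let mut schema_builder = Schema::builder();
    // `STORED | TEXT` converts into JsonObjectOptions via the From/BitOr impls above.
    let _attributes = schema_builder.add_json_field("attributes", STORED | TEXT);
    let _schema = schema_builder.build();
}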

#[cfg(test)]
mod tests {
use super::*;
use crate::schema::{STORED, TEXT};

#[test]
fn test_json_options() {
{
let json_options: JsonObjectOptions = (STORED | TEXT).into();
assert!(json_options.is_stored());
assert!(json_options.is_indexed());
}
{
let json_options: JsonObjectOptions = TEXT.into();
assert!(!json_options.is_stored());
assert!(json_options.is_indexed());
}
{
let json_options: JsonObjectOptions = STORED.into();
assert!(json_options.is_stored());
assert!(!json_options.is_indexed());
}
}
}

@@ -104,7 +104,7 @@ mod document;
|
||||
mod facet;
|
||||
mod facet_options;
|
||||
mod schema;
|
||||
mod term;
|
||||
pub(crate) mod term;
|
||||
|
||||
mod field_entry;
|
||||
mod field_type;
|
||||
@@ -112,14 +112,14 @@ mod field_value;
|
||||
|
||||
mod bytes_options;
|
||||
mod field;
|
||||
mod flags;
|
||||
mod index_record_option;
|
||||
mod json_object_options;
|
||||
mod named_field_document;
|
||||
mod numeric_options;
|
||||
mod text_options;
|
||||
mod value;
|
||||
|
||||
mod flags;
|
||||
|
||||
pub use self::bytes_options::BytesOptions;
|
||||
pub use self::document::Document;
|
||||
pub(crate) use self::facet::FACET_SEP_BYTE;
|
||||
@@ -131,9 +131,11 @@ pub use self::field_type::{FieldType, Type};
|
||||
pub use self::field_value::FieldValue;
|
||||
pub use self::flags::{FAST, INDEXED, STORED};
|
||||
pub use self::index_record_option::IndexRecordOption;
|
||||
pub use self::json_object_options::JsonObjectOptions;
|
||||
pub use self::named_field_document::NamedFieldDocument;
|
||||
pub use self::numeric_options::NumericOptions;
|
||||
#[allow(deprecated)]
|
||||
pub use self::numeric_options::{Cardinality, IntOptions, NumericOptions};
|
||||
pub use self::numeric_options::{Cardinality, IntOptions};
|
||||
pub use self::schema::{DocParsingError, Schema, SchemaBuilder};
|
||||
pub use self::term::Term;
|
||||
pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT};
|
||||
|
||||
@@ -5,7 +5,7 @@ use std::sync::Arc;
|
||||
use serde::de::{SeqAccess, Visitor};
|
||||
use serde::ser::SerializeSeq;
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
use serde_json::{self, Map as JsonObject, Value as JsonValue};
|
||||
use serde_json::{self, Value as JsonValue};
|
||||
|
||||
use super::*;
|
||||
use crate::schema::bytes_options::BytesOptions;
|
||||
@@ -173,6 +173,16 @@ impl SchemaBuilder {
|
||||
self.add_field(field_entry)
|
||||
}
|
||||
|
||||
/// Adds a json object field to the schema.
|
||||
pub fn add_json_field<T: Into<JsonObjectOptions>>(
|
||||
&mut self,
|
||||
field_name: &str,
|
||||
field_options: T,
|
||||
) -> Field {
|
||||
let field_entry = FieldEntry::new_json(field_name.to_string(), field_options.into());
|
||||
self.add_field(field_entry)
|
||||
}
|
||||
|
||||
/// Adds a field entry to the schema in build.
|
||||
pub fn add_field(&mut self, field_entry: FieldEntry) -> Field {
|
||||
let field = Field::from_field_id(self.fields.len() as u32);
|
||||
@@ -298,23 +308,23 @@ impl Schema {

/// Build a document object from a json-object.
pub fn parse_document(&self, doc_json: &str) -> Result<Document, DocParsingError> {
let json_obj: JsonObject<String, JsonValue> =
serde_json::from_str(doc_json).map_err(|_| {
let doc_json_sample: String = if doc_json.len() < 20 {
String::from(doc_json)
} else {
format!("{:?}...", &doc_json[0..20])
};
DocParsingError::NotJson(doc_json_sample)
})?;
let json_obj: serde_json::Map<String, JsonValue> =
serde_json::from_str(doc_json).map_err(|_| DocParsingError::invalid_json(doc_json))?;
self.json_object_to_doc(json_obj)
}
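// Editorial sketch (not part of the commit), assuming the crate is used as a dependency named
// `tantivy` and a made-up json field called "attributes": with a json field in the schema, the
// whole nested object is accepted as that field's value when parsing a document.
use tantivy::schema::{Schema, TEXT};

fn main() {
    let mut schema_builder = Schema::builder();
    schema_builder.add_json_field("attributes", TEXT);
    let schema = schema_builder.build();
    // The nested object becomes a Value::JsonObject on the "attributes" field.
    let _doc = schema
        .parse_document(r#"{"attributes": {"color": "white", "stock": 5}}"#)
        .expect("valid document");
}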
/// Build a document object from a json-object.
|
||||
pub fn json_object_to_doc(
|
||||
&self,
|
||||
json_obj: serde_json::Map<String, JsonValue>,
|
||||
) -> Result<Document, DocParsingError> {
|
||||
let mut doc = Document::default();
|
||||
for (field_name, json_value) in json_obj.iter() {
|
||||
if let Some(field) = self.get_field(field_name) {
|
||||
for (field_name, json_value) in json_obj {
|
||||
if let Some(field) = self.get_field(&field_name) {
|
||||
let field_entry = self.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
match *json_value {
|
||||
JsonValue::Array(ref json_items) => {
|
||||
match json_value {
|
||||
JsonValue::Array(json_items) => {
|
||||
for json_item in json_items {
|
||||
let value = field_type
|
||||
.value_from_json(json_item)
|
||||
@@ -383,12 +393,24 @@ impl<'de> Deserialize<'de> for Schema {
|
||||
pub enum DocParsingError {
|
||||
/// The payload given is not valid JSON.
|
||||
#[error("The provided string is not valid JSON")]
|
||||
NotJson(String),
|
||||
InvalidJson(String),
|
||||
/// One of the value node could not be parsed.
|
||||
#[error("The field '{0:?}' could not be parsed: {1:?}")]
|
||||
ValueError(String, ValueParsingError),
|
||||
}
|
||||
|
||||
impl DocParsingError {
|
||||
/// Builds a NotJson DocParsingError
|
||||
fn invalid_json(invalid_json: &str) -> Self {
|
||||
let sample_json: String = if invalid_json.len() < 20 {
|
||||
invalid_json.to_string()
|
||||
} else {
|
||||
format!("{:?}...", &invalid_json[0..20])
|
||||
};
|
||||
DocParsingError::InvalidJson(sample_json)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
@@ -399,7 +421,7 @@ mod tests {
|
||||
|
||||
use crate::schema::field_type::ValueParsingError;
|
||||
use crate::schema::numeric_options::Cardinality::SingleValue;
|
||||
use crate::schema::schema::DocParsingError::NotJson;
|
||||
use crate::schema::schema::DocParsingError::InvalidJson;
|
||||
use crate::schema::*;
|
||||
|
||||
#[test]
|
||||
@@ -732,7 +754,7 @@ mod tests {
|
||||
"count": 50,
|
||||
}"#,
|
||||
);
|
||||
assert_matches!(json_err, Err(NotJson(_)));
|
||||
assert_matches!(json_err, Err(InvalidJson(_)));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use std::convert::TryInto;
|
||||
use std::hash::{Hash, Hasher};
|
||||
use std::{fmt, str};
|
||||
|
||||
@@ -8,8 +9,26 @@ use crate::DateTime;
|
||||
|
||||
/// Size (in bytes) of the buffer of a fast value (u64, i64, f64, or date) term.
|
||||
/// <field> + <type byte> + <value len>
|
||||
///
|
||||
/// - <field> is a big endian encoded u32 field id
|
||||
/// - <type_byte>'s most significant bit expresses whether the term is a json term or not
|
||||
/// The remaining 7 bits are used to encode the type of the value.
|
||||
/// If this is a JSON term, the type is the type of the leaf of the json.
|
||||
///
|
||||
/// - <value> is, if this is not the json term, a binary representation specific to the type.
|
||||
/// If it is a JSON Term, then it is preprended with the path that leads to this leaf value.
|
||||
const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8;
|
||||
|
||||
/// Separates the different segments of
|
||||
/// the json path.
|
||||
pub const JSON_PATH_SEGMENT_SEP: u8 = 1u8;
|
||||
pub const JSON_PATH_SEGMENT_SEP_STR: &str =
|
||||
unsafe { std::str::from_utf8_unchecked(&[JSON_PATH_SEGMENT_SEP]) };
|
||||
|
||||
/// Separates the json path and the value in
|
||||
/// a JSON term binary representation.
|
||||
pub const JSON_END_OF_PATH: u8 = 0u8;
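// Editorial sketch (not part of the commit): putting the constants above together, a term for the
// json path "attributes.color" carrying the str value "red" would be laid out roughly as
//
//   [field id: 4 bytes, big endian][b'j' = Type::Json]
//   [b"attributes"][0x01][b"color"][0x00][leaf type code, presumably b's' for Str][b"red"]
//
// i.e. the path segments separated by JSON_PATH_SEGMENT_SEP and terminated by JSON_END_OF_PATH,
// followed by the leaf value's own type code and encoding. "attributes.color" is a made-up path
// used only for illustration.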

/// Term represents the value that the token can take.
///
/// It actually wraps a `Vec<u8>`.
@@ -17,6 +36,12 @@ const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8;
pub struct Term<B = Vec<u8>>(B)
where B: AsRef<[u8]>;

impl AsMut<Vec<u8>> for Term {
fn as_mut(&mut self) -> &mut Vec<u8> {
&mut self.0
}
}

impl Term {
pub(crate) fn new() -> Term {
Term(Vec::with_capacity(100))
@@ -120,6 +145,22 @@ impl Term {
pub fn set_text(&mut self, text: &str) {
self.set_bytes(text.as_bytes());
}

/// Removes the value_bytes and sets the type code.
pub fn clear_with_type(&mut self, typ: Type) {
self.truncate(5);
self.0[4] = typ.to_code();
}

/// Truncate the term right after the field and the type code.
pub fn truncate(&mut self, len: usize) {
self.0.truncate(len);
}

/// Appends bytes to the end of the term.
pub fn append_bytes(&mut self, bytes: &[u8]) {
self.0.extend_from_slice(bytes);
}
}

impl<B> Ord for Term<B>
|
||||
@@ -164,13 +205,16 @@ where B: AsRef<[u8]>
|
||||
Term(data)
|
||||
}
|
||||
|
||||
fn typ_code(&self) -> u8 {
|
||||
*self
|
||||
.as_slice()
|
||||
.get(4)
|
||||
.expect("the byte representation is too short")
|
||||
}
|
||||
|
||||
/// Return the type of the term.
|
||||
pub fn typ(&self) -> Type {
|
||||
assert!(
|
||||
self.as_slice().len() >= 5,
|
||||
"the type does byte representation is too short"
|
||||
);
|
||||
Type::from_code(self.as_slice()[4]).expect("The term has an invalid type code")
|
||||
Type::from_code(self.typ_code()).expect("The term has an invalid type code")
|
||||
}
|
||||
|
||||
/// Returns the field.
|
||||
@@ -189,10 +233,14 @@ where B: AsRef<[u8]>
|
||||
}
|
||||
|
||||
fn get_fast_type<T: FastValue>(&self) -> Option<T> {
|
||||
if self.typ() != T::to_type() || self.as_slice().len() != FAST_VALUE_TERM_LEN {
|
||||
if self.typ() != T::to_type() {
|
||||
return None;
|
||||
}
|
||||
let mut value_bytes = [0u8; 8];
|
||||
let bytes = self.value_bytes();
|
||||
if bytes.len() != 8 {
|
||||
return None;
|
||||
}
|
||||
value_bytes.copy_from_slice(self.value_bytes());
|
||||
let value_u64 = u64::from_be_bytes(value_bytes);
|
||||
Some(FastValue::from_u64(value_u64))
|
||||
@@ -290,40 +338,74 @@ fn write_opt<T: std::fmt::Debug>(f: &mut fmt::Formatter, val_opt: Option<T>) ->
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl fmt::Debug for Term {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let field_id = self.field().field_id();
|
||||
let typ = self.typ();
|
||||
write!(f, "Term(type={:?}, field={}, val=", typ, field_id,)?;
|
||||
match typ {
|
||||
Type::Str => {
|
||||
let s = str::from_utf8(self.value_bytes()).ok();
|
||||
write_opt(f, s)?;
|
||||
}
|
||||
Type::U64 => {
|
||||
write_opt(f, self.as_u64())?;
|
||||
}
|
||||
Type::I64 => {
|
||||
let val_i64 = self.as_i64();
|
||||
write_opt(f, val_i64)?;
|
||||
}
|
||||
Type::F64 => {
|
||||
let val_f64 = self.as_f64();
|
||||
write_opt(f, val_f64)?;
|
||||
}
|
||||
// TODO pretty print these types too.
|
||||
Type::Date => {
|
||||
let val_date = self.as_date();
|
||||
write_opt(f, val_date)?;
|
||||
}
|
||||
Type::Facet => {
|
||||
let facet = self.as_facet().map(|facet| facet.to_path_string());
|
||||
write_opt(f, facet)?;
|
||||
}
|
||||
Type::Bytes => {
|
||||
write_opt(f, self.as_bytes())?;
|
||||
fn as_str(value_bytes: &[u8]) -> Option<&str> {
|
||||
std::str::from_utf8(value_bytes).ok()
|
||||
}
|
||||
|
||||
fn get_fast_type<T: FastValue>(bytes: &[u8]) -> Option<T> {
|
||||
let value_u64 = u64::from_be_bytes(bytes.try_into().ok()?);
|
||||
Some(FastValue::from_u64(value_u64))
|
||||
}

/// Returns the json path (with its segment separators still in place), the type of the value, and
/// the value bytes. Returns None if this is not a valid JSON term.
pub(crate) fn as_json_path_type_value_bytes(bytes: &[u8]) -> Option<(&str, Type, &[u8])> {
let pos = bytes.iter().cloned().position(|b| b == JSON_END_OF_PATH)?;
let json_path = str::from_utf8(&bytes[..pos]).ok()?;
let type_code = *bytes.get(pos + 1)?;
let typ = Type::from_code(type_code)?;
Some((json_path, typ, &bytes[pos + 2..]))
}
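// Editorial sketch (not part of the commit), assuming b's' is the code byte of Type::Str:
// given value bytes such as b"attributes\x01color\x00sred", this function returns
// Some(("attributes\u{1}color", Type::Str, b"red")); the Debug impl below then swaps the
// \x01 segment separator for "." when printing the path.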
fn debug_value_bytes(typ: Type, bytes: &[u8], f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match typ {
|
||||
Type::Str => {
|
||||
let s = as_str(bytes);
|
||||
write_opt(f, s)?;
|
||||
}
|
||||
Type::U64 => {
|
||||
write_opt(f, get_fast_type::<u64>(bytes))?;
|
||||
}
|
||||
Type::I64 => {
|
||||
write_opt(f, get_fast_type::<i64>(bytes))?;
|
||||
}
|
||||
Type::F64 => {
|
||||
write_opt(f, get_fast_type::<f64>(bytes))?;
|
||||
}
|
||||
// TODO pretty print these types too.
|
||||
Type::Date => {
|
||||
write_opt(f, get_fast_type::<crate::DateTime>(bytes))?;
|
||||
}
|
||||
Type::Facet => {
|
||||
let facet_str = str::from_utf8(bytes)
|
||||
.ok()
|
||||
.map(ToString::to_string)
|
||||
.map(Facet::from_encoded_string)
|
||||
.map(|facet| facet.to_path_string());
|
||||
write_opt(f, facet_str)?;
|
||||
}
|
||||
Type::Bytes => {
|
||||
write_opt(f, Some(bytes))?;
|
||||
}
|
||||
Type::Json => {
|
||||
if let Some((path, typ, bytes)) = as_json_path_type_value_bytes(bytes) {
|
||||
let path_pretty = path.replace(JSON_PATH_SEGMENT_SEP_STR, ".");
|
||||
write!(f, "path={path_pretty}, vtype={typ:?}, ")?;
|
||||
debug_value_bytes(typ, bytes, f)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl<B> fmt::Debug for Term<B>
|
||||
where B: AsRef<[u8]>
|
||||
{
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let field_id = self.field().field_id();
|
||||
let typ = self.typ();
|
||||
write!(f, "Term(type={typ:?}, field={field_id}, ")?;
|
||||
debug_value_bytes(typ, self.value_bytes(), f)?;
|
||||
write!(f, ")",)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -46,7 +46,7 @@ impl TextOptions {
|
||||
/// Essentially, should we store the term frequency and/or the positions (See
|
||||
/// [`IndexRecordOption`](./enum.IndexRecordOption.html)).
|
||||
/// - the name of the `Tokenizer` that should be used to process the field.
|
||||
#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
|
||||
#[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)]
|
||||
pub struct TextFieldIndexing {
|
||||
record: IndexRecordOption,
|
||||
fieldnorms: bool,
|
||||
|
||||
@@ -2,6 +2,7 @@ use std::fmt;
|
||||
|
||||
use serde::de::Visitor;
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
use serde_json::Map;
|
||||
|
||||
use crate::schema::Facet;
|
||||
use crate::tokenizer::PreTokenizedString;
|
||||
@@ -27,6 +28,8 @@ pub enum Value {
|
||||
Facet(Facet),
|
||||
/// Arbitrarily sized byte array
|
||||
Bytes(Vec<u8>),
|
||||
/// Json object value.
|
||||
JsonObject(serde_json::Map<String, serde_json::Value>),
|
||||
}
|
||||
|
||||
impl Eq for Value {}
|
||||
@@ -43,6 +46,7 @@ impl Serialize for Value {
|
||||
Value::Date(ref date) => serializer.serialize_str(&date.to_rfc3339()),
|
||||
Value::Facet(ref facet) => facet.serialize(serializer),
|
||||
Value::Bytes(ref bytes) => serializer.serialize_bytes(bytes),
|
||||
Value::JsonObject(ref obj) => obj.serialize(serializer),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -168,6 +172,17 @@ impl Value {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the json object, provided the value is of the JsonObject type.
|
||||
///
|
||||
/// Returns None if the value is not of type JsonObject.
|
||||
pub fn as_json(&self) -> Option<&Map<String, serde_json::Value>> {
|
||||
if let Value::JsonObject(json) = self {
|
||||
Some(json)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<String> for Value {
@@ -230,6 +245,23 @@ impl From<PreTokenizedString> for Value {
}
}

impl From<serde_json::Map<String, serde_json::Value>> for Value {
fn from(json_object: serde_json::Map<String, serde_json::Value>) -> Value {
Value::JsonObject(json_object)
}
}

impl From<serde_json::Value> for Value {
fn from(json_value: serde_json::Value) -> Value {
match json_value {
serde_json::Value::Object(json_object) => Value::JsonObject(json_object),
_ => {
panic!("Expected a json object.");
}
}
}
}
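// Editorial sketch (not part of the commit): the From<serde_json::Value> impl above panics on
// anything that is not an object, so callers typically deserialize into a serde_json::Map first
// and rely on the Map impl. Assumes `tantivy` and `serde_json` as external dependencies.
use tantivy::schema::Value;

fn main() {
    let obj: serde_json::Map<String, serde_json::Value> =
        serde_json::from_str(r#"{"color": "white", "stock": 5}"#).unwrap();
    let value = Value::from(obj);
    // The object is wrapped as Value::JsonObject and can be read back with as_json().
    assert!(value.as_json().is_some());
}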
mod binary_serialize {
|
||||
use std::io::{self, Read, Write};
|
||||
|
||||
@@ -248,6 +280,7 @@ mod binary_serialize {
|
||||
const DATE_CODE: u8 = 5;
|
||||
const F64_CODE: u8 = 6;
|
||||
const EXT_CODE: u8 = 7;
|
||||
const JSON_OBJ_CODE: u8 = 8;
|
||||
|
||||
// extended types
|
||||
|
||||
@@ -296,8 +329,14 @@ mod binary_serialize {
|
||||
BYTES_CODE.serialize(writer)?;
|
||||
bytes.serialize(writer)
|
||||
}
|
||||
Value::JsonObject(ref map) => {
|
||||
JSON_OBJ_CODE.serialize(writer)?;
|
||||
serde_json::to_writer(writer, &map)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let type_code = u8::deserialize(reader)?;
|
||||
match type_code {
|
||||
@@ -347,6 +386,10 @@ mod binary_serialize {
|
||||
)),
|
||||
}
|
||||
}
|
||||
JSON_OBJ_CODE => {
|
||||
let map = serde_json::from_reader(reader)?;
|
||||
Ok(Value::JsonObject(map))
|
||||
}
|
||||
_ => Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
format!("No field type is associated with code {:?}", type_code),
|
||||
|
||||
src/tokenizer/empty_tokenizer.rs (new file)
@@ -0,0 +1,41 @@
use crate::tokenizer::{BoxTokenStream, Token, TokenStream, Tokenizer};

#[derive(Clone)]
pub(crate) struct EmptyTokenizer;

impl Tokenizer for EmptyTokenizer {
fn token_stream<'a>(&self, _text: &'a str) -> BoxTokenStream<'a> {
EmptyTokenStream::default().into()
}
}

#[derive(Default)]
struct EmptyTokenStream {
token: Token,
}

impl TokenStream for EmptyTokenStream {
fn advance(&mut self) -> bool {
false
}

fn token(&self) -> &super::Token {
&self.token
}

fn token_mut(&mut self) -> &mut super::Token {
&mut self.token
}
}

#[cfg(test)]
mod tests {
use crate::tokenizer::Tokenizer;

#[test]
fn test_empty_tokenizer() {
let tokenizer = super::EmptyTokenizer;
let mut empty = tokenizer.token_stream("whatever string");
assert!(!empty.advance());
}
}

@@ -119,6 +119,7 @@
//! ```
mod alphanum_only;
mod ascii_folding_filter;
mod empty_tokenizer;
mod facet_tokenizer;
mod lower_caser;
mod ngram_tokenizer;

@@ -5,6 +5,8 @@ use std::ops::{Deref, DerefMut};

use serde::{Deserialize, Serialize};

use crate::tokenizer::empty_tokenizer::EmptyTokenizer;

/// Token
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
pub struct Token {
@@ -43,6 +45,12 @@ pub struct TextAnalyzer {
token_filters: Vec<BoxTokenFilter>,
}

impl Default for TextAnalyzer {
fn default() -> TextAnalyzer {
TextAnalyzer::from(EmptyTokenizer)
}
}
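// Editorial sketch (not part of the commit): with the Default impl above, a default TextAnalyzer
// is backed by the EmptyTokenizer and never yields a token. Assumes the crate is used as a
// dependency named `tantivy`.
use tantivy::tokenizer::TextAnalyzer;

fn main() {
    let analyzer = TextAnalyzer::default();
    let mut stream = analyzer.token_stream("any text at all");
    // The empty token stream terminates immediately.
    assert!(!stream.advance());
}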

impl<T: Tokenizer> From<T> for TextAnalyzer {
fn from(tokenizer: T) -> Self {
TextAnalyzer::new(tokenizer, Vec::new())