diff --git a/CHANGELOG.md b/CHANGELOG.md index 95318f007..f29e328f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ Tantivy 0.14.0 ========================= - Remove dependency to atomicwrites #833 .Implemented by @pmasurel upon suggestion and research from @asafigan). - Migrated tantivy error from the now deprecated `failure` crate to `thiserror` #760. (@hirevo) +- API Change. Accessing the typed value off a `Schema::Value` now returns an Option instead of panicking if the type does not match. Tantivy 0.13.1 =================== diff --git a/src/collector/tests.rs b/src/collector/tests.rs index e61b2b5cb..144261be8 100644 --- a/src/collector/tests.rs +++ b/src/collector/tests.rs @@ -185,12 +185,15 @@ impl Collector for BytesFastFieldTestCollector { _segment_local_id: u32, segment_reader: &SegmentReader, ) -> crate::Result { + let reader = segment_reader + .fast_fields() + .bytes(self.field) + .ok_or_else(|| { + crate::TantivyError::InvalidArgument("Field is not a bytes fast field.".to_string()) + })?; Ok(BytesFastFieldSegmentCollector { vals: Vec::new(), - reader: segment_reader - .fast_fields() - .bytes(self.field) - .expect("Field is not a bytes fast field."), + reader, }) } diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index b941d44d3..6ab50492f 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -129,7 +129,7 @@ impl SegmentReader { self.fieldnorm_readers.get_field(field).ok_or_else(|| { let field_name = self.schema.get_field_name(field); let err_msg = format!( - "Field norm not found for field {:?}. Was it market as indexed during indexing.", + "Field norm not found for field {:?}. 
Was it marked as indexed during indexing?", field_name ); crate::TantivyError::SchemaError(err_msg) diff --git a/src/error.rs b/src/error.rs index 9502acc44..c3a842a77 100644 --- a/src/error.rs +++ b/src/error.rs @@ -131,8 +131,7 @@ impl From for TantivyError { impl From for TantivyError { fn from(error: serde_json::Error) -> TantivyError { - let io_err = io::Error::from(error); - TantivyError::IOError(io_err.into()) + TantivyError::IOError(error.into()) } } diff --git a/src/fastfield/bytes/mod.rs b/src/fastfield/bytes/mod.rs index 43985db9d..1ce4fe416 100644 --- a/src/fastfield/bytes/mod.rs +++ b/src/fastfield/bytes/mod.rs @@ -6,31 +6,115 @@ pub use self::writer::BytesFastFieldWriter; #[cfg(test)] mod tests { - use crate::schema::Schema; - use crate::Index; + use crate::schema::{BytesOptions, IndexRecordOption, Schema, Value}; + use crate::{query::TermQuery, schema::FAST, schema::INDEXED, schema::STORED}; + use crate::{DocAddress, DocSet, Index, Searcher, Term}; + use std::ops::Deref; #[test] - fn test_bytes() { + fn test_bytes() -> crate::Result<()> { let mut schema_builder = Schema::builder(); - let field = schema_builder.add_bytes_field("bytesfield"); + let bytes_field = schema_builder.add_bytes_field("bytesfield", FAST); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); - let mut index_writer = index.writer_for_tests().unwrap(); - index_writer.add_document(doc!(field=>vec![0u8, 1, 2, 3])); - index_writer.add_document(doc!(field=>vec![])); - index_writer.add_document(doc!(field=>vec![255u8])); - index_writer.add_document(doc!(field=>vec![1u8, 3, 5, 7, 9])); - index_writer.add_document(doc!(field=>vec![0u8; 1000])); - assert!(index_writer.commit().is_ok()); - let searcher = index.reader().unwrap().searcher(); + let mut index_writer = index.writer_for_tests()?; + index_writer.add_document(doc!(bytes_field=>vec![0u8, 1, 2, 3])); + index_writer.add_document(doc!(bytes_field=>vec![])); + 
index_writer.add_document(doc!(bytes_field=>vec![255u8])); + index_writer.add_document(doc!(bytes_field=>vec![1u8, 3, 5, 7, 9])); + index_writer.add_document(doc!(bytes_field=>vec![0u8; 1000])); + index_writer.commit()?; + let searcher = index.reader()?.searcher(); let segment_reader = searcher.segment_reader(0); - let bytes_reader = segment_reader.fast_fields().bytes(field).unwrap(); - + let bytes_reader = segment_reader.fast_fields().bytes(bytes_field).unwrap(); assert_eq!(bytes_reader.get_bytes(0), &[0u8, 1, 2, 3]); assert!(bytes_reader.get_bytes(1).is_empty()); assert_eq!(bytes_reader.get_bytes(2), &[255u8]); assert_eq!(bytes_reader.get_bytes(3), &[1u8, 3, 5, 7, 9]); let long = vec![0u8; 1000]; assert_eq!(bytes_reader.get_bytes(4), long.as_slice()); + Ok(()) + } + + fn create_index_for_test>( + byte_options: T, + ) -> crate::Result> { + let mut schema_builder = Schema::builder(); + let field = schema_builder.add_bytes_field("string_bytes", byte_options.into()); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_for_tests()?; + index_writer.add_document(doc!( + field => b"tantivy".as_ref(), + field => b"lucene".as_ref() + )); + index_writer.commit()?; + Ok(index.reader()?.searcher()) + } + + #[test] + fn test_stored_bytes() -> crate::Result<()> { + let searcher = create_index_for_test(STORED)?; + assert_eq!(searcher.num_docs(), 1); + let retrieved_doc = searcher.doc(DocAddress(0u32, 0u32))?; + let field = searcher.schema().get_field("string_bytes").unwrap(); + let values: Vec<&Value> = retrieved_doc.get_all(field).collect(); + assert_eq!(values.len(), 2); + let values_bytes: Vec<&[u8]> = values + .into_iter() + .flat_map(|value| value.bytes_value()) + .collect(); + assert_eq!(values_bytes, &[&b"tantivy"[..], &b"lucene"[..]]); + Ok(()) + } + + #[test] + fn test_non_stored_bytes() -> crate::Result<()> { + let searcher = create_index_for_test(INDEXED)?; + assert_eq!(searcher.num_docs(), 1); + let 
retrieved_doc = searcher.doc(DocAddress(0u32, 0u32))?; + let field = searcher.schema().get_field("string_bytes").unwrap(); + assert!(retrieved_doc.get_first(field).is_none()); + Ok(()) + } + + #[test] + fn test_index_bytes() -> crate::Result<()> { + let searcher = create_index_for_test(INDEXED)?; + assert_eq!(searcher.num_docs(), 1); + let field = searcher.schema().get_field("string_bytes").unwrap(); + let term = Term::from_field_bytes(field, b"lucene".as_ref()); + let term_query = TermQuery::new(term, IndexRecordOption::Basic); + let term_weight = term_query.specialized_weight(&searcher, true); + let term_scorer = term_weight.specialized_scorer(searcher.segment_reader(0), 1.0f32)?; + assert_eq!(term_scorer.doc(), 0u32); + Ok(()) + } + + #[test] + fn test_non_index_bytes() -> crate::Result<()> { + let searcher = create_index_for_test(STORED)?; + assert_eq!(searcher.num_docs(), 1); + let field = searcher.schema().get_field("string_bytes").unwrap(); + let term = Term::from_field_bytes(field, b"lucene".as_ref()); + let term_query = TermQuery::new(term, IndexRecordOption::Basic); + let term_weight = term_query.specialized_weight(&searcher, false); + let term_scorer_err = term_weight.specialized_scorer(searcher.segment_reader(0), 1.0f32); + assert!(matches!( + term_scorer_err, + Err(crate::TantivyError::SchemaError(_)) + )); + Ok(()) + } + + #[test] + fn test_fast_bytes_multivalue_value() -> crate::Result<()> { + let searcher = create_index_for_test(FAST)?; + assert_eq!(searcher.num_docs(), 1); + let fast_fields = searcher.segment_reader(0u32).fast_fields(); + let field = searcher.schema().get_field("string_bytes").unwrap(); + let fast_field_reader = fast_fields.bytes(field).unwrap(); + assert_eq!(fast_field_reader.get_bytes(0u32), b"tantivy"); + Ok(()) } } diff --git a/src/fastfield/bytes/writer.rs b/src/fastfield/bytes/writer.rs index 8b64cbb47..454078215 100644 --- a/src/fastfield/bytes/writer.rs +++ b/src/fastfield/bytes/writer.rs @@ -49,16 +49,10 @@ impl 
BytesFastFieldWriter { /// matching field values present in the document. pub fn add_document(&mut self, doc: &Document) { self.next_doc(); - for field_value in doc.field_values() { - if field_value.field() == self.field { - if let Value::Bytes(ref bytes) = *field_value.value() { - self.vals.extend_from_slice(bytes); - } else { - panic!( - "Bytes field contained non-Bytes Value!. Field {:?} = {:?}", - self.field, field_value - ); - } + for field_value in doc.get_all(self.field) { + if let Value::Bytes(ref bytes) = field_value { + self.vals.extend_from_slice(bytes); + return; } } } @@ -76,21 +70,18 @@ impl BytesFastFieldWriter { /// Serializes the fast field values by pushing them to the `FastFieldSerializer`. pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> { - { - // writing the offset index - let mut doc_index_serializer = - serializer.new_u64_fast_field_with_idx(self.field, 0, self.vals.len() as u64, 0)?; - for &offset in &self.doc_index { - doc_index_serializer.add_val(offset)?; - } - doc_index_serializer.add_val(self.vals.len() as u64)?; - doc_index_serializer.close_field()?; - } - { - // writing the values themselves - let mut value_serializer = serializer.new_bytes_fast_field_with_idx(self.field, 1)?; - value_serializer.write_all(&self.vals)?; + // writing the offset index + let mut doc_index_serializer = + serializer.new_u64_fast_field_with_idx(self.field, 0, self.vals.len() as u64, 0)?; + for &offset in &self.doc_index { + doc_index_serializer.add_val(offset)?; } + doc_index_serializer.add_val(self.vals.len() as u64)?; + doc_index_serializer.close_field()?; + // writing the values themselves + serializer + .new_bytes_fast_field_with_idx(self.field, 1)? 
+ .write_all(&self.vals)?; Ok(()) } } diff --git a/src/fastfield/multivalued/mod.rs b/src/fastfield/multivalued/mod.rs index db9981a04..9594f53f3 100644 --- a/src/fastfield/multivalued/mod.rs +++ b/src/fastfield/multivalued/mod.rs @@ -100,6 +100,7 @@ mod tests { .get_first(date_field) .expect("cannot find value") .date_value() + .unwrap() .timestamp(), first_time_stamp.timestamp() ); @@ -108,7 +109,7 @@ mod tests { .get_first(time_i) .expect("cannot find value") .i64_value(), - 1i64 + Some(1i64) ); } } @@ -131,6 +132,7 @@ mod tests { .get_first(date_field) .expect("cannot find value") .date_value() + .unwrap() .timestamp(), two_secs_ahead.timestamp() ); @@ -139,7 +141,7 @@ mod tests { .get_first(time_i) .expect("cannot find value") .i64_value(), - 3i64 + Some(3i64) ); } } @@ -197,22 +199,14 @@ mod tests { let segment_reader = searcher.segment_reader(0); let mut vals = Vec::new(); let multi_value_reader = segment_reader.fast_fields().i64s(field).unwrap(); - { - multi_value_reader.get_vals(2, &mut vals); - assert_eq!(&vals, &[-4i64]); - } - { - multi_value_reader.get_vals(0, &mut vals); - assert_eq!(&vals, &[1i64, 3i64]); - } - { - multi_value_reader.get_vals(1, &mut vals); - assert!(vals.is_empty()); - } - { - multi_value_reader.get_vals(3, &mut vals); - assert_eq!(&vals, &[-5i64, -20i64, 1i64]); - } + multi_value_reader.get_vals(2, &mut vals); + assert_eq!(&vals, &[-4i64]); + multi_value_reader.get_vals(0, &mut vals); + assert_eq!(&vals, &[1i64, 3i64]); + multi_value_reader.get_vals(1, &mut vals); + assert!(vals.is_empty()); + multi_value_reader.get_vals(3, &mut vals); + assert_eq!(&vals, &[-5i64, -20i64, 1i64]); } #[test] #[ignore] diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs index d6b39523c..fe7dc5a22 100644 --- a/src/fastfield/readers.rs +++ b/src/fastfield/readers.rs @@ -68,7 +68,10 @@ impl FastFieldReaders { }; for (field, field_entry) in schema.fields() { let field_type = field_entry.field_type(); - if field_type == &FieldType::Bytes { + 
if let FieldType::Bytes(bytes_option) = field_type { + if !bytes_option.is_fast() { + continue; + } let idx_reader = fast_fields_composite .open_read_with_idx(field, 0) .ok_or_else(|| FastFieldNotAvailableError::new(field_entry)) diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index c841faa46..375b9349d 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -33,7 +33,7 @@ impl FastFieldsWriter { let mut bytes_value_writers = Vec::new(); for (field, field_entry) in schema.fields() { - match *field_entry.field_type() { + match field_entry.field_type() { FieldType::I64(ref int_options) | FieldType::U64(ref int_options) | FieldType::F64(ref int_options) @@ -56,9 +56,11 @@ impl FastFieldsWriter { let fast_field_writer = MultiValueIntFastFieldWriter::new(field, true); multi_values_writers.push(fast_field_writer); } - FieldType::Bytes => { - let fast_field_writer = BytesFastFieldWriter::new(field); - bytes_value_writers.push(fast_field_writer); + FieldType::Bytes(bytes_option) => { + if bytes_option.is_fast() { + let fast_field_writer = BytesFastFieldWriter::new(field); + bytes_value_writers.push(fast_field_writer); + } } _ => {} } diff --git a/src/fieldnorm/writer.rs b/src/fieldnorm/writer.rs index 5c72a1362..061522e5c 100644 --- a/src/fieldnorm/writer.rs +++ b/src/fieldnorm/writer.rs @@ -4,7 +4,7 @@ use super::fieldnorm_to_id; use super::FieldNormsSerializer; use crate::schema::Field; use crate::schema::Schema; -use std::io; +use std::{io, iter}; /// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte /// of each document for each field with field norms. 
@@ -44,7 +44,9 @@ impl FieldNormsWriter { .unwrap_or(0); FieldNormsWriter { fields, - fieldnorms_buffer: (0..max_field).map(|_| Vec::new()).collect::>(), + fieldnorms_buffer: iter::repeat_with(Vec::new) + .take(max_field) + .collect::>(), } } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index b8de841f0..9acf7f18c 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -194,7 +194,7 @@ impl IndexMerger { ) -> crate::Result<()> { for (field, field_entry) in self.schema.fields() { let field_type = field_entry.field_type(); - match *field_type { + match field_type { FieldType::HierarchicalFacet => { let term_ordinal_mapping = term_ord_mappings .remove(&field) @@ -223,8 +223,10 @@ impl IndexMerger { // They can be implemented using what is done // for facets in the future. } - FieldType::Bytes => { - self.write_bytes_fast_field(field, fast_field_serializer)?; + FieldType::Bytes(byte_options) => { + if byte_options.is_fast() { + self.write_bytes_fast_field(field, fast_field_serializer)?; + } } } } @@ -443,9 +445,11 @@ impl IndexMerger { let mut bytes_readers: Vec = Vec::new(); for reader in &self.readers { - let bytes_reader = reader.fast_fields().bytes(field).expect( - "Failed to find bytes fast field reader. 
This is a bug in tantivy, please report.", - ); + let bytes_reader = reader.fast_fields().bytes(field).ok_or_else(|| { + crate::TantivyError::InvalidArgument( + format!("Bytes fast field {:?} not found in segment.", field), + ) + })?; if let Some(delete_bitset) = reader.delete_bitset() { for doc in 0u32..reader.max_doc() { if delete_bitset.is_alive(doc) { @@ -498,14 +502,15 @@ ) -> crate::Result> { let mut positions_buffer: Vec = Vec::with_capacity(1_000); let mut delta_computer = DeltaComputer::new(); + + let mut field_term_streams = Vec::new(); + let mut max_term_ords: Vec = Vec::new(); + let field_readers: Vec> = self .readers .iter() .map(|reader| reader.inverted_index(indexed_field)) - .collect::>(); - - let mut field_term_streams = Vec::new(); - let mut max_term_ords: Vec = Vec::new(); + .collect(); for field_reader in &field_readers { let terms = field_reader.terms(); @@ -720,12 +725,12 @@ mod tests { use crate::IndexWriter; use crate::Searcher; use crate::{schema, DocSet, SegmentId}; - use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; + use byteorder::{BigEndian, ReadBytesExt}; use futures::executor::block_on; - use std::io::Cursor; + use schema::FAST; #[test] - fn test_index_merger_no_deletes() { + fn test_index_merger_no_deletes() -> crate::Result<()> { let mut schema_builder = schema::Schema::builder(); let text_fieldtype = schema::TextOptions::default() .set_indexing_options( @@ -738,98 +743,77 @@ let date_field = schema_builder.add_date_field("date", INDEXED); let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue); let score_field = schema_builder.add_u64_field("score", score_fieldtype); - let bytes_score_field = schema_builder.add_bytes_field("score_bytes"); + let bytes_score_field = schema_builder.add_bytes_field("score_bytes", FAST); let index = Index::create_in_ram(schema_builder.build()); - let reader = index.reader().unwrap(); + let reader = index.reader()?; let curr_time = 
chrono::Utc::now(); - let add_score_bytes = |doc: &mut Document, score: u32| { - let mut bytes = Vec::new(); - bytes - .write_u32::(score) - .expect("failed to write u32 bytes to Vec..."); - doc.add_bytes(bytes_score_field, bytes); - }; - { - let mut index_writer = index.writer_for_tests().unwrap(); - { - // writing the segment - { - let mut doc = Document::default(); - doc.add_text(text_field, "af b"); - doc.add_u64(score_field, 3); - doc.add_date(date_field, &curr_time); - add_score_bytes(&mut doc, 3); - index_writer.add_document(doc); - } - { - let mut doc = Document::default(); - doc.add_text(text_field, "a b c"); - doc.add_u64(score_field, 5); - add_score_bytes(&mut doc, 5); - index_writer.add_document(doc); - } - { - let mut doc = Document::default(); - doc.add_text(text_field, "a b c d"); - doc.add_u64(score_field, 7); - add_score_bytes(&mut doc, 7); - index_writer.add_document(doc); - } - index_writer.commit().expect("committed"); - } + let mut index_writer = index.writer_for_tests()?; + // writing the segment + index_writer.add_document(doc!( + text_field => "af b", + score_field => 3u64, + date_field => curr_time, + bytes_score_field => 3u32.to_be_bytes().as_ref() + )); - { - // writing the segment - { - let mut doc = Document::default(); - doc.add_text(text_field, "af b"); - doc.add_date(date_field, &curr_time); - doc.add_u64(score_field, 11); - add_score_bytes(&mut doc, 11); - index_writer.add_document(doc); - } - { - let mut doc = Document::default(); - doc.add_text(text_field, "a b c g"); - doc.add_u64(score_field, 13); - add_score_bytes(&mut doc, 13); - index_writer.add_document(doc); - } - index_writer.commit().expect("Commit failed"); - } + index_writer.add_document(doc!( + text_field => "a b c", + score_field => 5u64, + bytes_score_field => 5u32.to_be_bytes().as_ref() + )); + index_writer.add_document(doc!( + text_field => "a b c d", + score_field => 7u64, + bytes_score_field => 7u32.to_be_bytes().as_ref() + )); + index_writer.commit()?; + // 
writing the segment + index_writer.add_document(doc!( + text_field => "af b", + date_field => curr_time, + score_field => 11u64, + bytes_score_field => 11u32.to_be_bytes().as_ref() + )); + index_writer.add_document(doc!( + text_field => "a b c g", + score_field => 13u64, + bytes_score_field => 13u32.to_be_bytes().as_ref() + )); + index_writer.commit()?; } { let segment_ids = index .searchable_segment_ids() .expect("Searchable segments failed."); - let mut index_writer = index.writer_for_tests().unwrap(); - block_on(index_writer.merge(&segment_ids)).expect("Merging failed"); - index_writer.wait_merging_threads().unwrap(); + let mut index_writer = index.writer_for_tests()?; + block_on(index_writer.merge(&segment_ids))?; + index_writer.wait_merging_threads()?; } { - reader.reload().unwrap(); + reader.reload()?; let searcher = reader.searcher(); let get_doc_ids = |terms: Vec| { let query = BooleanQuery::new_multiterms_query(terms); - let top_docs = searcher.search(&query, &TEST_COLLECTOR_WITH_SCORE).unwrap(); - top_docs.docs().to_vec() + searcher + .search(&query, &TEST_COLLECTOR_WITH_SCORE) + .map(|top_docs| top_docs.docs().to_vec()) }; { assert_eq!( - get_doc_ids(vec![Term::from_field_text(text_field, "a")]), + get_doc_ids(vec![Term::from_field_text(text_field, "a")])?, vec![DocAddress(0, 1), DocAddress(0, 2), DocAddress(0, 4)] ); assert_eq!( - get_doc_ids(vec![Term::from_field_text(text_field, "af")]), + get_doc_ids(vec![Term::from_field_text(text_field, "af")])?, vec![DocAddress(0, 0), DocAddress(0, 3)] ); assert_eq!( - get_doc_ids(vec![Term::from_field_text(text_field, "g")]), + get_doc_ids(vec![Term::from_field_text(text_field, "g")])?, vec![DocAddress(0, 4)] ); assert_eq!( - get_doc_ids(vec![Term::from_field_text(text_field, "b")]), + get_doc_ids(vec![Term::from_field_text(text_field, "b")])?, vec![ DocAddress(0, 0), DocAddress(0, 1), @@ -839,60 +823,57 @@ mod tests { ] ); assert_eq!( - get_doc_ids(vec![Term::from_field_date(date_field, &curr_time)]), + 
get_doc_ids(vec![Term::from_field_date(date_field, &curr_time)])?, vec![DocAddress(0, 0), DocAddress(0, 3)] ); } { - let doc = searcher.doc(DocAddress(0, 0)).unwrap(); + let doc = searcher.doc(DocAddress(0, 0))?; assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b")); } { - let doc = searcher.doc(DocAddress(0, 1)).unwrap(); + let doc = searcher.doc(DocAddress(0, 1))?; assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c")); } { - let doc = searcher.doc(DocAddress(0, 2)).unwrap(); + let doc = searcher.doc(DocAddress(0, 2))?; assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c d")); } { - let doc = searcher.doc(DocAddress(0, 3)).unwrap(); + let doc = searcher.doc(DocAddress(0, 3))?; assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b")); } { - let doc = searcher.doc(DocAddress(0, 4)).unwrap(); + let doc = searcher.doc(DocAddress(0, 4))?; assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c g")); } { let get_fast_vals = |terms: Vec| { let query = BooleanQuery::new_multiterms_query(terms); - searcher - .search(&query, &FastFieldTestCollector::for_field(score_field)) - .unwrap() + searcher.search(&query, &FastFieldTestCollector::for_field(score_field)) }; let get_fast_vals_bytes = |terms: Vec| { let query = BooleanQuery::new_multiterms_query(terms); - searcher - .search( - &query, - &BytesFastFieldTestCollector::for_field(bytes_score_field), - ) - .expect("failed to search") + searcher.search( + &query, + &BytesFastFieldTestCollector::for_field(bytes_score_field), + ) }; assert_eq!( - get_fast_vals(vec![Term::from_field_text(text_field, "a")]), + get_fast_vals(vec![Term::from_field_text(text_field, "a")])?, vec![5, 7, 13] ); assert_eq!( - get_fast_vals_bytes(vec![Term::from_field_text(text_field, "a")]), + get_fast_vals_bytes(vec![Term::from_field_text(text_field, "a")])?, vec![0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0, 13] ); } } + Ok(()) } #[test] - fn test_index_merger_with_deletes() { + fn 
test_index_merger_with_deletes() -> crate::Result<()> { let mut schema_builder = schema::Schema::builder(); let text_fieldtype = schema::TextOptions::default() .set_indexing_options( @@ -902,27 +883,26 @@ mod tests { let text_field = schema_builder.add_text_field("text", text_fieldtype); let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue); let score_field = schema_builder.add_u64_field("score", score_fieldtype); - let bytes_score_field = schema_builder.add_bytes_field("score_bytes"); + let bytes_score_field = schema_builder.add_bytes_field("score_bytes", FAST); let index = Index::create_in_ram(schema_builder.build()); - let mut index_writer = index.writer_for_tests().unwrap(); + let mut index_writer = index.writer_for_tests()?; let reader = index.reader().unwrap(); let search_term = |searcher: &Searcher, term: Term| { let collector = FastFieldTestCollector::for_field(score_field); let bytes_collector = BytesFastFieldTestCollector::for_field(bytes_score_field); let term_query = TermQuery::new(term, IndexRecordOption::Basic); - let (scores, bytes) = searcher + searcher .search(&term_query, &(collector, bytes_collector)) - .unwrap(); - let mut score_bytes = Cursor::new(bytes); - for &score in &scores { - assert_eq!(score as u32, score_bytes.read_u32::().unwrap()); - } - - scores + .map(|(scores, bytes)| { + let mut score_bytes = &bytes[..]; + for &score in &scores { + assert_eq!(score as u32, score_bytes.read_u32::().unwrap()); + } + scores + }) }; let empty_vec = Vec::::new(); - { // a first commit index_writer.add_document(doc!( @@ -941,26 +921,26 @@ mod tests { score_field => 3u64, bytes_score_field => vec![0u8, 0, 0, 3], )); - index_writer.commit().expect("committed"); - reader.reload().unwrap(); + index_writer.commit()?; + reader.reload()?; let searcher = reader.searcher(); assert_eq!(searcher.num_docs(), 2); assert_eq!(searcher.segment_readers()[0].num_docs(), 2); assert_eq!(searcher.segment_readers()[0].max_doc(), 3); 
assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "a")), + search_term(&searcher, Term::from_field_text(text_field, "a"))?, vec![1] ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "b")), + search_term(&searcher, Term::from_field_text(text_field, "b"))?, vec![1] ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "c")), + search_term(&searcher, Term::from_field_text(text_field, "c"))?, vec![3] ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "d")), + search_term(&searcher, Term::from_field_text(text_field, "d"))?, vec![1, 3] ); } @@ -988,8 +968,8 @@ mod tests { score_field => 7_000u64, bytes_score_field => vec![0u8, 0, 27, 88], )); - index_writer.commit().expect("committed"); - reader.reload().unwrap(); + index_writer.commit()?; + reader.reload()?; let searcher = reader.searcher(); assert_eq!(searcher.segment_readers().len(), 2); @@ -999,31 +979,31 @@ mod tests { assert_eq!(searcher.segment_readers()[1].num_docs(), 1); assert_eq!(searcher.segment_readers()[1].max_doc(), 3); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "a")), + search_term(&searcher, Term::from_field_text(text_field, "a"))?, empty_vec ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "b")), + search_term(&searcher, Term::from_field_text(text_field, "b"))?, empty_vec ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "c")), + search_term(&searcher, Term::from_field_text(text_field, "c"))?, vec![3] ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "d")), + search_term(&searcher, Term::from_field_text(text_field, "d"))?, vec![3] ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "e")), + search_term(&searcher, Term::from_field_text(text_field, "e"))?, empty_vec ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "f")), + search_term(&searcher, 
Term::from_field_text(text_field, "f"))?, vec![6_000] ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "g")), + search_term(&searcher, Term::from_field_text(text_field, "g"))?, vec![6_000, 7_000] ); @@ -1045,42 +1025,40 @@ mod tests { } { // merging the segments - let segment_ids = index - .searchable_segment_ids() - .expect("Searchable segments failed."); - block_on(index_writer.merge(&segment_ids)).expect("Merging failed"); - reader.reload().unwrap(); + let segment_ids = index.searchable_segment_ids()?; + block_on(index_writer.merge(&segment_ids))?; + reader.reload()?; let searcher = reader.searcher(); assert_eq!(searcher.segment_readers().len(), 1); assert_eq!(searcher.num_docs(), 3); assert_eq!(searcher.segment_readers()[0].num_docs(), 3); assert_eq!(searcher.segment_readers()[0].max_doc(), 3); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "a")), + search_term(&searcher, Term::from_field_text(text_field, "a"))?, empty_vec ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "b")), + search_term(&searcher, Term::from_field_text(text_field, "b"))?, empty_vec ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "c")), + search_term(&searcher, Term::from_field_text(text_field, "c"))?, vec![3] ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "d")), + search_term(&searcher, Term::from_field_text(text_field, "d"))?, vec![3] ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "e")), + search_term(&searcher, Term::from_field_text(text_field, "e"))?, empty_vec ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "f")), + search_term(&searcher, Term::from_field_text(text_field, "f"))?, vec![6_000] ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "g")), + search_term(&searcher, Term::from_field_text(text_field, "g"))?, vec![6_000, 7_000] ); let score_field_reader = searcher @@ -1094,40 
+1072,40 @@ mod tests { { // test a commit with only deletes index_writer.delete_term(Term::from_field_text(text_field, "c")); - index_writer.commit().unwrap(); + index_writer.commit()?; - reader.reload().unwrap(); + reader.reload()?; let searcher = reader.searcher(); assert_eq!(searcher.segment_readers().len(), 1); assert_eq!(searcher.num_docs(), 2); assert_eq!(searcher.segment_readers()[0].num_docs(), 2); assert_eq!(searcher.segment_readers()[0].max_doc(), 3); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "a")), + search_term(&searcher, Term::from_field_text(text_field, "a"))?, empty_vec ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "b")), + search_term(&searcher, Term::from_field_text(text_field, "b"))?, empty_vec ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "c")), + search_term(&searcher, Term::from_field_text(text_field, "c"))?, empty_vec ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "d")), + search_term(&searcher, Term::from_field_text(text_field, "d"))?, empty_vec ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "e")), + search_term(&searcher, Term::from_field_text(text_field, "e"))?, empty_vec ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "f")), + search_term(&searcher, Term::from_field_text(text_field, "f"))?, vec![6_000] ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "g")), + search_term(&searcher, Term::from_field_text(text_field, "g"))?, vec![6_000, 7_000] ); let score_field_reader = searcher @@ -1140,11 +1118,9 @@ mod tests { } { // Test merging a single segment in order to remove deletes. 
- let segment_ids = index - .searchable_segment_ids() - .expect("Searchable segments failed."); - block_on(index_writer.merge(&segment_ids)).expect("Merging failed"); - reader.reload().unwrap(); + let segment_ids = index.searchable_segment_ids()?; + block_on(index_writer.merge(&segment_ids))?; + reader.reload()?; let searcher = reader.searcher(); assert_eq!(searcher.segment_readers().len(), 1); @@ -1152,31 +1128,31 @@ mod tests { assert_eq!(searcher.segment_readers()[0].num_docs(), 2); assert_eq!(searcher.segment_readers()[0].max_doc(), 2); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "a")), + search_term(&searcher, Term::from_field_text(text_field, "a"))?, empty_vec ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "b")), + search_term(&searcher, Term::from_field_text(text_field, "b"))?, empty_vec ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "c")), + search_term(&searcher, Term::from_field_text(text_field, "c"))?, empty_vec ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "d")), + search_term(&searcher, Term::from_field_text(text_field, "d"))?, empty_vec ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "e")), + search_term(&searcher, Term::from_field_text(text_field, "e"))?, empty_vec ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "f")), + search_term(&searcher, Term::from_field_text(text_field, "f"))?, vec![6_000] ); assert_eq!( - search_term(&searcher, Term::from_field_text(text_field, "g")), + search_term(&searcher, Term::from_field_text(text_field, "g"))?, vec![6_000, 7_000] ); let score_field_reader = searcher @@ -1191,17 +1167,16 @@ mod tests { { // Test removing all docs index_writer.delete_term(Term::from_field_text(text_field, "g")); - index_writer.commit().unwrap(); - let segment_ids = index - .searchable_segment_ids() - .expect("Searchable segments failed."); - reader.reload().unwrap(); + 
index_writer.commit()?; + let segment_ids = index.searchable_segment_ids()?; + reader.reload()?; let searcher = reader.searcher(); assert!(segment_ids.is_empty()); assert!(searcher.segment_readers().is_empty()); assert_eq!(searcher.num_docs(), 0); } + Ok(()) } #[test] diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 5bb979702..1606b9d75 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -16,7 +16,6 @@ use crate::tokenizer::{FacetTokenizer, TextAnalyzer}; use crate::tokenizer::{TokenStreamChain, Tokenizer}; use crate::Opstamp; use crate::{DocId, SegmentComponent}; -use std::io; /// Computes the initial size of the hash table. /// @@ -117,7 +116,11 @@ impl SegmentWriter { /// Indexes a new document /// /// As a user, you should rather use `IndexWriter`'s add_document. - pub fn add_document(&mut self, add_operation: AddOperation, schema: &Schema) -> io::Result<()> { + pub fn add_document( + &mut self, + add_operation: AddOperation, + schema: &Schema, + ) -> crate::Result<()> { let doc_id = self.max_doc; let mut doc = add_operation.document; self.doc_opstamps.push(add_operation.opstamp); @@ -125,13 +128,20 @@ impl SegmentWriter { self.fast_field_writers.add_document(&doc); for (field, field_values) in doc.get_sorted_field_values() { - let field_options = schema.get_field_entry(field); - if !field_options.is_indexed() { + let field_entry = schema.get_field_entry(field); + let make_schema_error = || { + crate::TantivyError::SchemaError(format!( + "Expected a {:?} for field {:?}", + field_entry.field_type().value_type(), + field_entry.name() + )) + }; + if !field_entry.is_indexed() { continue; } let (term_buffer, multifield_postings) = (&mut self.term_buffer, &mut self.multifield_postings); - match *field_options.field_type() { + match *field_entry.field_type() { FieldType::HierarchicalFacet => { term_buffer.set_field(field); let facets = @@ -143,14 +153,16 @@ impl SegmentWriter { panic!("Expected hierarchical 
facet"); } }); - for fake_str in facets { + for facet_str in facets { let mut unordered_term_id_opt = None; - FacetTokenizer.token_stream(fake_str).process(&mut |token| { - term_buffer.set_text(&token.text); - let unordered_term_id = - multifield_postings.subscribe(doc_id, &term_buffer); - unordered_term_id_opt = Some(unordered_term_id); - }); + FacetTokenizer + .token_stream(facet_str) + .process(&mut |token| { + term_buffer.set_text(&token.text); + let unordered_term_id = + multifield_postings.subscribe(doc_id, &term_buffer); + unordered_term_id_opt = Some(unordered_term_id); + }); if let Some(unordered_term_id) = unordered_term_id_opt { self.fast_field_writers .get_multivalue_writer(field) @@ -205,7 +217,11 @@ impl SegmentWriter { if int_option.is_indexed() { for field_value in field_values { term_buffer.set_field(field_value.field()); - term_buffer.set_u64(field_value.value().u64_value()); + let u64_val = field_value + .value() + .u64_value() + .ok_or_else(make_schema_error)?; + term_buffer.set_u64(u64_val); multifield_postings.subscribe(doc_id, &term_buffer); } } @@ -214,7 +230,11 @@ impl SegmentWriter { if int_option.is_indexed() { for field_value in field_values { term_buffer.set_field(field_value.field()); - term_buffer.set_i64(field_value.value().date_value().timestamp()); + let date_val = field_value + .value() + .date_value() + .ok_or_else(make_schema_error)?; + term_buffer.set_i64(date_val.timestamp()); multifield_postings.subscribe(doc_id, &term_buffer); } } @@ -223,7 +243,11 @@ impl SegmentWriter { if int_option.is_indexed() { for field_value in field_values { term_buffer.set_field(field_value.field()); - term_buffer.set_i64(field_value.value().i64_value()); + let i64_val = field_value + .value() + .i64_value() + .ok_or_else(make_schema_error)?; + term_buffer.set_i64(i64_val); multifield_postings.subscribe(doc_id, &term_buffer); } } @@ -232,13 +256,27 @@ impl SegmentWriter { if int_option.is_indexed() { for field_value in field_values { 
term_buffer.set_field(field_value.field()); - term_buffer.set_f64(field_value.value().f64_value()); + let f64_val = field_value + .value() + .f64_value() + .ok_or_else(make_schema_error)?; + term_buffer.set_f64(f64_val); multifield_postings.subscribe(doc_id, &term_buffer); } } } - FieldType::Bytes => { - // Do nothing. Bytes only supports fast fields. + FieldType::Bytes(ref option) => { + if option.is_indexed() { + for field_value in field_values { + term_buffer.set_field(field_value.field()); + let bytes = field_value + .value() + .bytes_value() + .ok_or_else(make_schema_error)?; + term_buffer.set_bytes(bytes); + self.multifield_postings.subscribe(doc_id, &term_buffer); + } + } } } } diff --git a/src/lib.rs b/src/lib.rs index 7cbed553f..6fd4ce2ad 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -842,11 +842,11 @@ mod tests { text_field => "some other value", other_text_field => "short"); assert_eq!(document.len(), 3); - let values = document.get_all(text_field); + let values: Vec<&Value> = document.get_all(text_field).collect(); assert_eq!(values.len(), 2); assert_eq!(values[0].text(), Some("tantivy")); assert_eq!(values[1].text(), Some("some other value")); - let values = document.get_all(other_text_field); + let values: Vec<&Value> = document.get_all(other_text_field).collect(); assert_eq!(values.len(), 1); assert_eq!(values[0].text(), Some("short")); } diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 3fc0b8291..401ba0df1 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -38,12 +38,8 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box | FieldType::I64(_) | FieldType::F64(_) | FieldType::Date(_) + | FieldType::Bytes(_) | FieldType::HierarchicalFacet => SpecializedPostingsWriter::::new_boxed(), - FieldType::Bytes => { - // FieldType::Bytes cannot actually be indexed. 
- // TODO fix during the indexer refactoring described in #276 - SpecializedPostingsWriter::::new_boxed() - } } } @@ -161,7 +157,7 @@ impl MultiFieldPostingsWriter { unordered_term_mappings.insert(field, mapping); } FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) | FieldType::Date(_) => {} - FieldType::Bytes => {} + FieldType::Bytes(_) => {} } let postings_writer = &self.per_field_postings_writers[field.field_id() as usize]; diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 02ef50765..3907914b6 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -34,6 +34,10 @@ pub enum QueryParserError { /// is neither. #[error("Expected a valid integer: '{0:?}'")] ExpectedInt(ParseIntError), + /// The query contains a term for a bytes field, but the value is not valid + /// base64. + #[error("Expected base64: '{0:?}'")] + ExpectedBase64(base64::DecodeError), /// The query contains a term for a `f64`-field, but the value /// is not a f64. 
#[error("Invalid query: Only excluding terms given")] @@ -357,9 +361,10 @@ impl QueryParser { let facet = Facet::from_text(phrase); Ok(vec![(0, Term::from_field_text(field, facet.encoded_str()))]) } - FieldType::Bytes => { - let field_name = self.schema.get_field_name(field).to_string(); - Err(QueryParserError::FieldNotIndexed(field_name)) + FieldType::Bytes(_) => { + let bytes = base64::decode(phrase).map_err(QueryParserError::ExpectedBase64)?; + let term = Term::from_field_bytes(field, &bytes); + Ok(vec![(0, term)]) } } } @@ -601,6 +606,8 @@ mod test { schema_builder.add_date_field("date", INDEXED); schema_builder.add_f64_field("float", INDEXED); schema_builder.add_facet_field("facet"); + schema_builder.add_bytes_field("bytes", INDEXED); + schema_builder.add_bytes_field("bytes_not_indexed", STORED); schema_builder.build() } @@ -788,6 +795,37 @@ mod test { ); } + #[test] + fn test_parse_bytes() { + test_parse_query_to_logical_ast_helper( + "bytes:YnVidQ==", + "Term(field=12,bytes=[98, 117, 98, 117])", + false, + ); + } + + #[test] + fn test_parse_bytes_not_indexed() { + let error = parse_query_to_logical_ast("bytes_not_indexed:aaa", false).unwrap_err(); + assert!(matches!(error, QueryParserError::FieldNotIndexed(_))); + } + + #[test] + fn test_parse_bytes_phrase() { + test_parse_query_to_logical_ast_helper( + "bytes:\"YnVidQ==\"", + "Term(field=12,bytes=[98, 117, 98, 117])", + false, + ); + } + + #[test] + fn test_parse_bytes_invalid_base64() { + let base64_err: QueryParserError = + parse_query_to_logical_ast("bytes:aa", false).unwrap_err(); + assert!(matches!(base64_err, QueryParserError::ExpectedBase64(_))); + } + #[test] fn test_parse_query_to_ast_ab_c() { test_parse_query_to_logical_ast_helper( diff --git a/src/schema/bytes_options.rs b/src/schema/bytes_options.rs new file mode 100644 index 000000000..5ab750174 --- /dev/null +++ b/src/schema/bytes_options.rs @@ -0,0 +1,164 @@ +use serde::{Deserialize, Serialize}; +use std::ops::BitOr; + +use 
super::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag}; +/// Define how a bytes field should be handled by tantivy. +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct BytesOptions { + indexed: bool, + fast: bool, + stored: bool, +} + +impl BytesOptions { + /// Returns true iff the value is indexed. + pub fn is_indexed(&self) -> bool { + self.indexed + } + + /// Returns true iff the value is a fast field. + pub fn is_fast(&self) -> bool { + self.fast + } + + /// Returns true iff the value is stored. + pub fn is_stored(&self) -> bool { + self.stored + } + + /// Set the field as indexed. + /// + /// Setting an integer as indexed will generate + /// a posting list for each value taken by the integer. + pub fn set_indexed(mut self) -> BytesOptions { + self.indexed = true; + self + } + + /// Set the field as a single-valued fast field. + /// + /// Fast fields are designed for random access. + /// Access times are similar to a random lookup in an array. + /// If more than one value is associated with a fast field, only the last one is + /// kept. + pub fn set_fast(mut self) -> BytesOptions { + self.fast = true; + self + } + + /// Set the field as stored. + /// + /// Only the fields that are set as *stored* are + /// persisted into the Tantivy's store. 
+ pub fn set_stored(mut self) -> BytesOptions { + self.stored = true; + self + } +} + +impl Default for BytesOptions { + fn default() -> BytesOptions { + BytesOptions { + indexed: false, + fast: false, + stored: false, + } + } +} + +impl> BitOr for BytesOptions { + type Output = BytesOptions; + + fn bitor(self, other: T) -> BytesOptions { + let other = other.into(); + BytesOptions { + indexed: self.indexed | other.indexed, + stored: self.stored | other.stored, + fast: self.fast | other.fast, + } + } +} + +impl From<()> for BytesOptions { + fn from(_: ()) -> Self { + Self::default() + } +} + +impl From for BytesOptions { + fn from(_: FastFlag) -> Self { + BytesOptions { + indexed: false, + stored: false, + fast: true, + } + } +} + +impl From for BytesOptions { + fn from(_: StoredFlag) -> Self { + BytesOptions { + indexed: false, + stored: true, + fast: false, + } + } +} + +impl From for BytesOptions { + fn from(_: IndexedFlag) -> Self { + BytesOptions { + indexed: true, + stored: false, + fast: false, + } + } +} + +impl From> for BytesOptions +where + Head: Clone, + Tail: Clone, + Self: BitOr + From + From, +{ + fn from(head_tail: SchemaFlagList) -> Self { + Self::from(head_tail.head) | Self::from(head_tail.tail) + } +} + +#[cfg(test)] +mod tests { + use crate::schema::{BytesOptions, FAST, INDEXED, STORED}; + + #[test] + fn test_bytes_option_fast_flag() { + assert_eq!(BytesOptions::default().set_fast(), FAST.into()); + assert_eq!(BytesOptions::default().set_indexed(), INDEXED.into()); + assert_eq!(BytesOptions::default().set_stored(), STORED.into()); + } + #[test] + fn test_bytes_option_fast_flag_composition() { + assert_eq!( + BytesOptions::default().set_fast().set_stored(), + (FAST | STORED).into() + ); + assert_eq!( + BytesOptions::default().set_indexed().set_fast(), + (INDEXED | FAST).into() + ); + assert_eq!( + BytesOptions::default().set_stored().set_indexed(), + (STORED | INDEXED).into() + ); + } + + #[test] + fn test_bytes_option_fast_() { + 
assert!(!BytesOptions::default().is_stored()); + assert!(!BytesOptions::default().is_fast()); + assert!(!BytesOptions::default().is_indexed()); + assert!(BytesOptions::default().set_stored().is_stored()); + assert!(BytesOptions::default().set_fast().is_fast()); + assert!(BytesOptions::default().set_indexed().is_indexed()); + } +} diff --git a/src/schema/document.rs b/src/schema/document.rs index f6f297539..1887821f2 100644 --- a/src/schema/document.rs +++ b/src/schema/document.rs @@ -161,20 +161,16 @@ impl Document { } /// Returns all of the `FieldValue`s associated the given field - pub fn get_all(&self, field: Field) -> Vec<&Value> { + pub fn get_all(&self, field: Field) -> impl Iterator { self.field_values .iter() - .filter(|field_value| field_value.field() == field) + .filter(move |field_value| field_value.field() == field) .map(FieldValue::value) - .collect() } /// Returns the first `FieldValue` associated the given field pub fn get_first(&self, field: Field) -> Option<&Value> { - self.field_values - .iter() - .find(|field_value| field_value.field() == field) - .map(FieldValue::value) + self.get_all(field).next() } /// Prepares Document for being stored in the document store diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index 05d184cda..b4552d228 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -1,6 +1,7 @@ use crate::schema::TextOptions; use crate::schema::{is_valid_field_name, IntOptions}; +use crate::schema::bytes_options::BytesOptions; use crate::schema::FieldType; use serde::de::{self, MapAccess, Visitor}; use serde::ser::SerializeStruct; @@ -81,11 +82,10 @@ impl FieldEntry { } /// Creates a field entry for a bytes field - pub fn new_bytes(field_name: String) -> FieldEntry { - assert!(is_valid_field_name(&field_name)); + pub fn new_bytes(field_name: String, bytes_type: BytesOptions) -> FieldEntry { FieldEntry { name: field_name, - field_type: FieldType::Bytes, + field_type: FieldType::Bytes(bytes_type), } } @@ 
-108,7 +108,7 @@ impl FieldEntry { | FieldType::F64(ref options) | FieldType::Date(ref options) => options.is_indexed(), FieldType::HierarchicalFacet => true, - FieldType::Bytes => false, + FieldType::Bytes(ref options) => options.is_indexed(), } } @@ -133,7 +133,7 @@ impl FieldEntry { FieldType::Str(ref options) => options.is_stored(), // TODO make stored hierarchical facet optional FieldType::HierarchicalFacet => true, - FieldType::Bytes => false, + FieldType::Bytes(ref options) => options.is_stored(), } } } @@ -170,8 +170,9 @@ impl Serialize for FieldEntry { FieldType::HierarchicalFacet => { s.serialize_field("type", "hierarchical_facet")?; } - FieldType::Bytes => { + FieldType::Bytes(ref options) => { s.serialize_field("type", "bytes")?; + s.serialize_field("options", options)?; } } @@ -227,10 +228,7 @@ impl<'de> Deserialize<'de> for FieldEntry { "hierarchical_facet" => { field_type = Some(FieldType::HierarchicalFacet); } - "bytes" => { - field_type = Some(FieldType::Bytes); - } - "text" | "u64" | "i64" | "f64" | "date" => { + "text" | "u64" | "i64" | "f64" | "date" | "bytes" => { // These types require additional options to create a field_type } _ => panic!("unhandled type"), @@ -249,6 +247,7 @@ impl<'de> Deserialize<'de> for FieldEntry { "i64" => field_type = Some(FieldType::I64(map.next_value()?)), "f64" => field_type = Some(FieldType::F64(map.next_value()?)), "date" => field_type = Some(FieldType::Date(map.next_value()?)), + "bytes" => field_type = Some(FieldType::Bytes(map.next_value()?)), _ => { let msg = format!("Unrecognised type {}", ty); return Err(de::Error::custom(msg)); diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 42b005c5b..9843a5df3 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -1,5 +1,4 @@ -use base64::decode; - +use crate::schema::bytes_options::BytesOptions; use crate::schema::Facet; use crate::schema::IndexRecordOption; use crate::schema::TextFieldIndexing; @@ -63,7 +62,7 @@ pub enum 
FieldType { /// Hierachical Facet HierarchicalFacet, /// Bytes (one per document) - Bytes, + Bytes(BytesOptions), } impl FieldType { @@ -76,7 +75,7 @@ impl FieldType { FieldType::F64(_) => Type::F64, FieldType::Date(_) => Type::Date, FieldType::HierarchicalFacet => Type::HierarchicalFacet, - FieldType::Bytes => Type::Bytes, + FieldType::Bytes(_) => Type::Bytes, } } @@ -89,7 +88,7 @@ impl FieldType { | FieldType::F64(ref int_options) => int_options.is_indexed(), FieldType::Date(ref date_options) => date_options.is_indexed(), FieldType::HierarchicalFacet => true, - FieldType::Bytes => false, + FieldType::Bytes(ref bytes_options) => bytes_options.is_indexed(), } } @@ -113,7 +112,13 @@ impl FieldType { } } FieldType::HierarchicalFacet => Some(IndexRecordOption::Basic), - FieldType::Bytes => None, + FieldType::Bytes(ref bytes_options) => { + if bytes_options.is_indexed() { + Some(IndexRecordOption::Basic) + } else { + None + } + } } } @@ -140,7 +145,7 @@ impl FieldType { ValueParsingError::TypeError(format!("Expected an integer, got {:?}", json)), ), FieldType::HierarchicalFacet => Ok(Value::Facet(Facet::from(field_text))), - FieldType::Bytes => decode(field_text).map(Value::Bytes).map_err(|_| { + FieldType::Bytes(_) => base64::decode(field_text).map(Value::Bytes).map_err(|_| { ValueParsingError::InvalidBase64(format!( "Expected base64 string, got {:?}", field_text @@ -172,7 +177,7 @@ impl FieldType { Err(ValueParsingError::OverflowError(msg)) } } - FieldType::Str(_) | FieldType::HierarchicalFacet | FieldType::Bytes => { + FieldType::Str(_) | FieldType::HierarchicalFacet | FieldType::Bytes(_) => { let msg = format!("Expected a string, got {:?}", json); Err(ValueParsingError::TypeError(msg)) } @@ -248,18 +253,18 @@ mod tests { #[test] fn test_bytes_value_from_json() { - let result = FieldType::Bytes + let result = FieldType::Bytes(Default::default()) .value_from_json(&json!("dGhpcyBpcyBhIHRlc3Q=")) .unwrap(); assert_eq!(result, Value::Bytes("this is a 
test".as_bytes().to_vec())); - let result = FieldType::Bytes.value_from_json(&json!(521)); + let result = FieldType::Bytes(Default::default()).value_from_json(&json!(521)); match result { Err(ValueParsingError::TypeError(_)) => {} _ => panic!("Expected parse failure for wrong type"), } - let result = FieldType::Bytes.value_from_json(&json!("-")); + let result = FieldType::Bytes(Default::default()).value_from_json(&json!("-")); match result { Err(ValueParsingError::InvalidBase64(_)) => {} _ => panic!("Expected parse failure for invalid base64"), diff --git a/src/schema/int_options.rs b/src/schema/int_options.rs index 565672645..a5e3b86f2 100644 --- a/src/schema/int_options.rs +++ b/src/schema/int_options.rs @@ -14,7 +14,7 @@ pub enum Cardinality { MultiValues, } -/// Define how an int field should be handled by tantivy. +/// Define how a u64, i64, or f64 field should be handled by tantivy. #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct IntOptions { indexed: bool, @@ -39,7 +39,7 @@ impl IntOptions { self.fast.is_some() } - /// Set the u64 options as stored. + /// Set the field as stored. /// /// Only the fields that are set as *stored* are /// persisted into the Tantivy's store. @@ -48,7 +48,7 @@ impl IntOptions { self } - /// Set the u64 options as indexed. + /// Set the field as indexed. /// /// Setting an integer as indexed will generate /// a posting list for each value taken by the integer. @@ -57,7 +57,7 @@ impl IntOptions { self } - /// Set the u64 options as a single-valued fast field. + /// Set the field as a single-valued fast field. /// /// Fast fields are designed for random access. /// Access time are similar to a random lookup in an array. 
@@ -127,12 +127,12 @@ impl> BitOr for IntOptions { type Output = IntOptions; fn bitor(self, other: T) -> IntOptions { - let mut res = IntOptions::default(); let other = other.into(); - res.indexed = self.indexed | other.indexed; - res.stored = self.stored | other.stored; - res.fast = self.fast.or(other.fast); - res + IntOptions { + indexed: self.indexed | other.indexed, + stored: self.stored | other.stored, + fast: self.fast.or(other.fast), + } } } diff --git a/src/schema/mod.rs b/src/schema/mod.rs index 36abed17f..169fd96b3 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -111,6 +111,7 @@ mod field_entry; mod field_type; mod field_value; +mod bytes_options; mod field; mod index_record_option; mod int_options; @@ -142,6 +143,7 @@ pub use self::text_options::TextOptions; pub use self::text_options::STRING; pub use self::text_options::TEXT; +pub use self::bytes_options::BytesOptions; pub use self::flags::{FAST, INDEXED, STORED}; pub use self::int_options::Cardinality; pub use self::int_options::IntOptions; diff --git a/src/schema/schema.rs b/src/schema/schema.rs index c4b97c0e7..8bdb7c140 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -4,6 +4,7 @@ use std::collections::HashMap; use std::sync::Arc; use super::*; +use crate::schema::bytes_options::BytesOptions; use serde::de::{SeqAccess, Visitor}; use serde::ser::SerializeSeq; use serde::{Deserialize, Deserializer, Serialize, Serializer}; @@ -160,8 +161,12 @@ impl SchemaBuilder { /// some document features at scoring time. /// These can be serializing and stored as a bytes field to /// get access rapidly when scoring each document. 
- pub fn add_bytes_field(&mut self, field_name: &str) -> Field { - let field_entry = FieldEntry::new_bytes(field_name.to_string()); + pub fn add_bytes_field>( + &mut self, + field_name: &str, + field_options: T, + ) -> Field { + let field_entry = FieldEntry::new_bytes(field_name.to_string(), field_options.into()); self.add_field(field_entry) } @@ -556,14 +561,14 @@ mod tests { .convert_named_doc(NamedFieldDocument(named_doc_map)) .unwrap(); assert_eq!( - doc.get_all(title), + doc.get_all(title).collect::>(), vec![ &Value::from("title1".to_string()), &Value::from("title2".to_string()) ] ); assert_eq!( - doc.get_all(val), + doc.get_all(val).collect::>(), vec![&Value::from(14u64), &Value::from(-1i64)] ); } @@ -624,9 +629,15 @@ mod tests { doc.get_first(author_field).unwrap().text(), Some("fulmicoton") ); - assert_eq!(doc.get_first(count_field).unwrap().u64_value(), 4); - assert_eq!(doc.get_first(popularity_field).unwrap().i64_value(), 10); - assert_eq!(doc.get_first(score_field).unwrap().f64_value(), 80.5); + assert_eq!(doc.get_first(count_field).unwrap().u64_value(), Some(4)); + assert_eq!( + doc.get_first(popularity_field).unwrap().i64_value(), + Some(10) + ); + assert_eq!( + doc.get_first(score_field).unwrap().f64_value(), + Some(80.5f64) + ); } { let json_err = schema.parse_document( diff --git a/src/schema/term.rs b/src/schema/term.rs index 2b696e880..266c7b9fb 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -112,7 +112,7 @@ impl Term { pub(crate) fn set_field(&mut self, field: Field) { self.0.clear(); self.0 - .extend_from_slice(&field.field_id().to_be_bytes()[..]); + .extend_from_slice(field.field_id().to_be_bytes().as_ref()); } /// Sets a u64 value in the term. @@ -123,7 +123,7 @@ impl Term { /// the natural order of the values. pub fn set_u64(&mut self, val: u64) { self.0.resize(INT_TERM_LEN, 0u8); - self.0[4..12].copy_from_slice(val.to_be_bytes().as_ref()); + self.set_bytes(val.to_be_bytes().as_ref()); } /// Sets a `i64` value in the term. 
@@ -136,7 +136,8 @@ impl Term { self.set_u64(common::f64_to_u64(val)); } - fn set_bytes(&mut self, bytes: &[u8]) { + /// Sets the value of a `Bytes` field. + pub fn set_bytes(&mut self, bytes: &[u8]) { self.0.resize(4, 0u8); self.0.extend(bytes); } diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs index 16ffe3d21..abb65e6dc 100644 --- a/src/schema/text_options.rs +++ b/src/schema/text_options.rs @@ -117,10 +117,10 @@ impl> BitOr for TextOptions { fn bitor(self, other: T) -> TextOptions { let other = other.into(); - let mut res = TextOptions::default(); - res.indexing = self.indexing.or(other.indexing); - res.stored = self.stored | other.stored; - res + TextOptions { + indexing: self.indexing.or(other.indexing), + stored: self.stored | other.stored, + } } } diff --git a/src/schema/value.rs b/src/schema/value.rs index 53f75aefd..f695d0554 100644 --- a/src/schema/value.rs +++ b/src/schema/value.rs @@ -125,62 +125,76 @@ impl Value { /// Returns the text value, provided the value is of the `Str` type. /// (Returns None if the value is not of the `Str` type). pub fn text(&self) -> Option<&str> { - match *self { - Value::Str(ref text) => Some(text), - _ => None, + if let Value::Str(text) = self { + Some(text) + } else { + None } } /// Returns the tokenized text, provided the value is of the `PreTokStr` type. - /// (Returns None if the value is not of the `PreTokStr` type). + /// + /// Returns None if the value is not of the `PreTokStr` type. pub fn tokenized_text(&self) -> Option<&PreTokenizedString> { - match *self { - Value::PreTokStr(ref tok_text) => Some(tok_text), - _ => None, + if let Value::PreTokStr(tokenized_text) = self { + Some(tokenized_text) + } else { + None } } /// Returns the u64-value, provided the value is of the `U64` type. 
/// - /// # Panics - /// If the value is not of type `U64` - pub fn u64_value(&self) -> u64 { - match *self { - Value::U64(ref value) => *value, - _ => panic!("This is not a u64 field."), + /// Returns None if the value is not of the `U64` type. + pub fn u64_value(&self) -> Option { + if let Value::U64(val) = self { + Some(*val) + } else { + None } } /// Returns the i64-value, provided the value is of the `I64` type. /// - /// # Panics - /// If the value is not of type `I64` - pub fn i64_value(&self) -> i64 { - match *self { - Value::I64(ref value) => *value, - _ => panic!("This is not a i64 field."), + /// Return None if the value is not of type `I64`. + pub fn i64_value(&self) -> Option { + if let Value::I64(val) = self { + Some(*val) + } else { + None } } /// Returns the f64-value, provided the value is of the `F64` type. /// - /// # Panics - /// If the value is not of type `F64` - pub fn f64_value(&self) -> f64 { - match *self { - Value::F64(ref value) => *value, - _ => panic!("This is not a f64 field."), + /// Return None if the value is not of type `F64`. + pub fn f64_value(&self) -> Option { + if let Value::F64(value) = self { + Some(*value) + } else { + None } } /// Returns the Date-value, provided the value is of the `Date` type. /// - /// # Panics - /// If the value is not of type `Date` - pub fn date_value(&self) -> &DateTime { - match *self { - Value::Date(ref value) => value, - _ => panic!("This is not a date field."), + /// Returns None if the value is not of type `Date`. + pub fn date_value(&self) -> Option<&DateTime> { + if let Value::Date(date) = self { + Some(date) + } else { + None + } + } + + /// Returns the Bytes-value, provided the value is of the `Bytes` type. + /// + /// Returns None if the value is not of type `Bytes`. 
+ pub fn bytes_value(&self) -> Option<&[u8]> { + if let Value::Bytes(bytes) = self { + Some(bytes) + } else { + None } } } diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 103d2b2ba..59485ef8b 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -302,7 +302,6 @@ impl SnippetGenerator { pub fn snippet_from_doc(&self, doc: &Document) -> Snippet { let text: String = doc .get_all(self.field) - .into_iter() .flat_map(Value::text) .collect::>() .join(" ");