diff --git a/CHANGELOG.md b/CHANGELOG.md index 8522c75ff..22aaa4c8f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,9 @@ Tantivy 0.6 - BM25 - Approximate field norms encoded over 1 byte. - Compiles on stable rust - +- Add &[u8] fastfield for associating arbitrary bytes to each document (@jason-wolfe) (#270) + - Completely uncompressed + - Internally: One u64 fast field for indexes, one fast field for the bytes themselves. Tantivy 0.5.2 =========================== diff --git a/Cargo.toml b/Cargo.toml index 34af5ddce..aa91d08ef 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,7 @@ readme = "README.md" keywords = ["search", "information", "retrieval"] [dependencies] +base64 = "0.9.1" byteorder = "1.0" lazy_static = "0.2.1" tinysegmenter = "0.1.0" diff --git a/src/collector/mod.rs b/src/collector/mod.rs index 918ee757e..9c0d3ac14 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -95,6 +95,7 @@ pub mod tests { use DocId; use Score; use SegmentLocalId; + use fastfield::BytesFastFieldReader; /// Stores all of the doc ids. /// This collector is only used for tests. @@ -185,6 +186,45 @@ pub mod tests { } } + /// Collects in order all of the fast field bytes for all of the + /// docs in the `DocSet` + /// + /// This collector is mainly useful for tests. + pub struct BytesFastFieldTestCollector { + vals: Vec, + field: Field, + ff_reader: Option, + } + + impl BytesFastFieldTestCollector { + pub fn for_field(field: Field) -> BytesFastFieldTestCollector { + BytesFastFieldTestCollector { + vals: Vec::new(), + field, + ff_reader: None, + } + } + + pub fn vals(self) -> Vec { + self.vals + } + } + + impl Collector for BytesFastFieldTestCollector { + fn set_segment(&mut self, _segment_local_id: u32, segment: &SegmentReader) -> Result<()> { + self.ff_reader = Some(segment.bytes_fast_field_reader(self.field)?); + Ok(()) + } + + fn collect(&mut self, doc: u32, _score: f32) { + let val = self.ff_reader.as_ref().unwrap().get_val(doc); + self.vals.extend(val); + } + + fn requires_scoring(&self) -> bool { + false + } + } } #[cfg(all(test, feature = "unstable"))] diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 7f7f032a3..6c9d331d6 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -10,7 +10,7 @@ use fastfield::DeleteBitSet; use fastfield::FacetReader; use fastfield::FastFieldReader; use fastfield::{self, FastFieldNotAvailableError}; -use fastfield::{FastValue, MultiValueIntFastFieldReader}; +use fastfield::{BytesFastFieldReader, FastValue, MultiValueIntFastFieldReader}; use fieldnorm::FieldNormReader; use schema::Cardinality; use schema::Document; @@ -149,6 +149,23 @@ impl SegmentReader { } } + /// Accessor to the `BytesFastFieldReader` associated to a given `Field`. + pub fn bytes_fast_field_reader(&self, field: Field) -> fastfield::Result { + let field_entry = self.schema.get_field_entry(field); + match field_entry.field_type() { + &FieldType::Bytes => {}, + _ => return Err(FastFieldNotAvailableError::new(field_entry)), + } + let idx_reader = self.fast_fields_composite + .open_read_with_idx(field, 0) + .ok_or_else(|| FastFieldNotAvailableError::new(field_entry)) + .map(FastFieldReader::open)?; + let values = self.fast_fields_composite + .open_read_with_idx(field, 1) + .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?; + Ok(BytesFastFieldReader::open(idx_reader, values)) + } + /// Accessor to the `FacetReader` associated to a given `Field`. pub fn facet_reader(&self, field: Field) -> Result { let field_entry = self.schema.get_field_entry(field); diff --git a/src/fastfield/bytes/mod.rs b/src/fastfield/bytes/mod.rs new file mode 100644 index 000000000..add4e6b7c --- /dev/null +++ b/src/fastfield/bytes/mod.rs @@ -0,0 +1,38 @@ +mod reader; +mod writer; + +pub use self::reader::BytesFastFieldReader; +pub use self::writer::BytesFastFieldWriter; + +#[cfg(test)] +mod tests { + use schema::SchemaBuilder; + use Index; + + #[test] + fn test_bytes() { + let mut schema_builder = SchemaBuilder::default(); + let field = schema_builder.add_bytes_field("bytesfield"); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + index_writer.add_document(doc!(field=>vec![0u8, 1, 2, 3])); + index_writer.add_document(doc!(field=>vec![])); + index_writer.add_document(doc!(field=>vec![255u8])); + index_writer.add_document(doc!(field=>vec![1u8, 3, 5, 7, 9])); + index_writer.add_document(doc!(field=>vec![0u8; 1000])); + assert!(index_writer.commit().is_ok()); + + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let reader = searcher.segment_reader(0); + let bytes_reader = reader.bytes_fast_field_reader(field).unwrap(); + + assert_eq!(bytes_reader.get_val(0), &[0u8, 1, 2, 3]); + assert!(bytes_reader.get_val(1).is_empty()); + assert_eq!(bytes_reader.get_val(2), &[255u8]); + assert_eq!(bytes_reader.get_val(3), &[1u8, 3, 5, 7, 9]); + let long = vec![0u8; 1000]; + assert_eq!(bytes_reader.get_val(4), long.as_slice()); + } +} \ No newline at end of file diff --git a/src/fastfield/bytes/reader.rs b/src/fastfield/bytes/reader.rs new file mode 100644 index 000000000..8db65e7cd --- /dev/null +++ b/src/fastfield/bytes/reader.rs @@ -0,0 +1,40 @@ +use owning_ref::OwningRef; + +use directory::ReadOnlySource; +use fastfield::FastFieldReader; +use DocId; + +/// Reader for byte array fast fields +/// +/// The reader is implemented as a `u64` fast field and a separate collection of bytes. +/// +/// The `vals_reader` will access the concatenated list of all values for all documents. +/// +/// The `idx_reader` associates, for each document, the index of its first value. +/// +/// Reading the value for a document is done by reading the start index for it, +/// and the start index for the next document, and keeping the bytes in between. +pub struct BytesFastFieldReader { + idx_reader: FastFieldReader, + values: OwningRef, +} + +impl BytesFastFieldReader { + pub(crate) fn open( + idx_reader: FastFieldReader, + values_source: ReadOnlySource, + ) -> BytesFastFieldReader { + let values = OwningRef::new(values_source).map(|source| &source[..]); + BytesFastFieldReader { + idx_reader, + values, + } + } + + /// Returns the bytes associated to the given `doc` + pub fn get_val(&self, doc: DocId) -> &[u8] { + let start = self.idx_reader.get(doc) as usize; + let stop = self.idx_reader.get(doc + 1) as usize; + &self.values[start..stop] + } +} \ No newline at end of file diff --git a/src/fastfield/bytes/writer.rs b/src/fastfield/bytes/writer.rs new file mode 100644 index 000000000..16aa330cd --- /dev/null +++ b/src/fastfield/bytes/writer.rs @@ -0,0 +1,97 @@ + +use std::io; + +use fastfield::serializer::FastFieldSerializer; +use schema::{Document, Field, Value}; +use DocId; + +/// Writer for byte array (as in, any number of bytes per document) fast fields +/// +/// This `BytesFastFieldWriter` is only useful for advanced user. +/// The normal way to get your associated bytes in your index +/// is to +/// - declare your field with fast set to `Cardinality::SingleValue` +/// in your schema +/// - add your document simply by calling `.add_document(...)` with associating bytes to the field. +/// +/// The `BytesFastFieldWriter` can be acquired from the +/// fast field writer by calling +/// [`.get_bytes_writer(...)`](./struct.FastFieldsWriter.html#method.get_bytes_writer). +/// +/// Once acquired, writing is done by calling `.add_document_val(&[u8])` +/// once per document, even if there are no bytes associated to it. +pub struct BytesFastFieldWriter { + field: Field, + vals: Vec, + doc_index: Vec, +} + +impl BytesFastFieldWriter { + /// Creates a new `BytesFastFieldWriter` + pub fn new(field: Field) -> Self { + BytesFastFieldWriter { + field, + vals: Vec::new(), + doc_index: Vec::new(), + } + } + + /// Access the field associated to the `BytesFastFieldWriter` + pub fn field(&self) -> Field { + self.field + } + + /// Finalize the current document. + pub(crate) fn next_doc(&mut self) { + self.doc_index.push(self.vals.len() as u64); + } + + /// Shift to the next document and add all of the + /// matching field values present in the document. + pub fn add_document(&mut self, doc: &Document) { + self.next_doc(); + for field_value in doc.field_values() { + if field_value.field() == self.field { + if let &Value::Bytes(ref bytes) = field_value.value() { + self.vals.extend_from_slice(bytes); + } else { + panic!("Bytes field contained non-Bytes Value!. Field {:?} = {:?}", self.field, field_value); + } + } + } + } + + /// Register the bytes associated to a document. + /// + /// The method returns the `DocId` of the document that was + /// just written. + pub fn add_document_val(&mut self, val: &[u8]) -> DocId { + let doc = self.doc_index.len() as DocId; + self.next_doc(); + self.vals.extend_from_slice(val); + doc + } + + /// Serializes the fast field values by pushing them to the `FastFieldSerializer`. + pub fn serialize( + &self, + serializer: &mut FastFieldSerializer + ) -> io::Result<()> { + { + // writing the offset index + let mut doc_index_serializer = + serializer.new_u64_fast_field_with_idx(self.field, 0, self.vals.len() as u64, 0)?; + for &offset in &self.doc_index { + doc_index_serializer.add_val(offset)?; + } + doc_index_serializer.add_val(self.vals.len() as u64)?; + doc_index_serializer.close_field()?; + } + { + // writing the values themselves + let mut value_serializer = serializer.new_bytes_fast_field_with_idx(self.field, 1)?; + value_serializer.write_all(&self.vals)?; + } + Ok(()) + } +} \ No newline at end of file diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 6b1044ffe..ba14c5236 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -31,11 +31,13 @@ pub use self::multivalued::{MultiValueIntFastFieldReader, MultiValueIntFastField pub use self::reader::FastFieldReader; pub use self::serializer::FastFieldSerializer; pub use self::writer::{FastFieldsWriter, IntFastFieldWriter}; +pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter}; use common; use schema::Cardinality; use schema::FieldType; use schema::Value; +mod bytes; mod delete; mod error; mod facet_reader; diff --git a/src/fastfield/multivalued/reader.rs b/src/fastfield/multivalued/reader.rs index f8ff9e276..79e169344 100644 --- a/src/fastfield/multivalued/reader.rs +++ b/src/fastfield/multivalued/reader.rs @@ -34,17 +34,6 @@ impl MultiValueIntFastFieldReader { (start, stop) } - /// Returns the number of values associated to a given document. - pub fn num_vals(&self, doc: DocId) -> usize { - let (start, stop) = self.range(doc); - (stop - start) as usize - } - - /// Returns the overall number of values associated to documents. - pub(crate) fn total_num_vals(&self) -> u64 { - self.idx_reader.max_value() - } - /// Returns the array of values associated to the given `doc`. pub fn get_vals(&self, doc: DocId, vals: &mut Vec) { let (start, stop) = self.range(doc); diff --git a/src/fastfield/serializer.rs b/src/fastfield/serializer.rs index 3c6dcf401..287fd76b7 100644 --- a/src/fastfield/serializer.rs +++ b/src/fastfield/serializer.rs @@ -61,6 +61,16 @@ impl FastFieldSerializer { FastSingleFieldSerializer::open(field_write, min_value, max_value) } + /// Start serializing a new [u8] fast field + pub fn new_bytes_fast_field_with_idx( + &mut self, + field: Field, + idx: usize + ) -> io::Result>> { + let field_write = self.composite_write.for_field_with_idx(field, idx); + FastBytesFieldSerializer::open(field_write) + } + /// Closes the serializer /// /// After this call the data must be persistently save on disk. @@ -117,3 +127,21 @@ impl<'a, W: Write> FastSingleFieldSerializer<'a, W> { self.bit_packer.close(&mut self.write) } } + +pub struct FastBytesFieldSerializer<'a, W: Write + 'a> { + write: &'a mut W, +} + +impl<'a, W: Write> FastBytesFieldSerializer<'a, W> { + fn open(write: &'a mut W) -> io::Result> { + Ok(FastBytesFieldSerializer { write }) + } + + pub fn write_all(&mut self, vals: &[u8]) -> io::Result<()> { + self.write.write_all(vals) + } + + pub fn flush(&mut self) -> io::Result<()> { + self.write.flush() + } +} \ No newline at end of file diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 6b8ae3099..c7aa0816c 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -1,5 +1,7 @@ -use common::{self, VInt, BinarySerializable}; -use fastfield::FastFieldSerializer; +use common; +use common::BinarySerializable; +use common::VInt; +use fastfield::{BytesFastFieldWriter, FastFieldSerializer}; use postings::UnorderedTermId; use schema::{FieldType, Cardinality, Document, Field, Schema}; use std::collections::HashMap; @@ -11,6 +13,7 @@ use termdict::TermOrdinal; pub struct FastFieldsWriter { single_value_writers: Vec, multi_values_writers: Vec, + bytes_value_writers: Vec, } impl FastFieldsWriter { @@ -18,6 +21,7 @@ impl FastFieldsWriter { pub fn from_schema(schema: &Schema) -> FastFieldsWriter { let mut single_value_writers = Vec::new(); let mut multi_values_writers = Vec::new(); + let mut bytes_value_writers = Vec::new(); for (field_id, field_entry) in schema.fields().iter().enumerate() { let field = Field(field_id as u32); @@ -45,12 +49,17 @@ impl FastFieldsWriter { let fast_field_writer = MultiValueIntFastFieldWriter::new(field, true); multi_values_writers.push(fast_field_writer); } + FieldType::Bytes => { + let fast_field_writer = BytesFastFieldWriter::new(field); + bytes_value_writers.push(fast_field_writer); + } _ => {} } } FastFieldsWriter { single_value_writers, multi_values_writers, + bytes_value_writers, } } @@ -76,6 +85,20 @@ impl FastFieldsWriter { .find(|multivalue_writer| multivalue_writer.field() == field) } + /// Returns the bytes fast field writer for the given field. + /// + /// Returns None if the field does not exist, or is not + /// configured as a bytes fastfield in the schema. + pub fn get_bytes_writer( + &mut self, + field: Field, + ) -> Option<&mut BytesFastFieldWriter> { + // TODO optimize + self.bytes_value_writers + .iter_mut() + .find(|field_writer| field_writer.field() == field) + } + /// Indexes all of the fastfields of a new document. pub fn add_document(&mut self, doc: &Document) { for field_writer in &mut self.single_value_writers { @@ -84,6 +107,9 @@ impl FastFieldsWriter { for field_writer in &mut self.multi_values_writers { field_writer.add_document(doc); } + for field_writer in &mut self.bytes_value_writers { + field_writer.add_document(doc); + } } /// Serializes all of the `FastFieldWriter`s by pushing them in @@ -100,6 +126,9 @@ impl FastFieldsWriter { let field = field_writer.field(); field_writer.serialize(serializer, mapping.get(&field))?; } + for field_writer in &self.bytes_value_writers { + field_writer.serialize(serializer)?; + } Ok(()) } } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 1a01c1a62..5d58e5024 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -221,6 +221,9 @@ impl IndexMerger { // They can be implemented using what is done // for facets in the future. } + FieldType::Bytes => { + self.write_bytes_fast_field(field, fast_field_serializer)?; + } } } Ok(()) @@ -284,9 +287,9 @@ impl IndexMerger { } - fn write_multi_fast_field_idx(&self, - field: Field, - fast_field_serializer: &mut FastFieldSerializer) -> Result<()> { + fn write_fast_field_idx(&self, + field: Field, + fast_field_serializer: &mut FastFieldSerializer) -> Result<()> { let mut total_num_vals = 0u64; // In the first pass, we compute the total number of vals. @@ -294,15 +297,17 @@ impl IndexMerger { // This is required by the bitpacker, as it needs to know // what should be the bit length use for bitpacking. for reader in &self.readers { - let multi_ff_reader = reader.multi_fast_field_reader::(field)?; + let idx_reader = reader.fast_field_reader_with_idx::(field, 0)?; if let Some(delete_bitset) = reader.delete_bitset() { for doc in 0u32..reader.max_doc() { if !delete_bitset.is_deleted(doc) { - total_num_vals += multi_ff_reader.num_vals(doc) as u64; + let start = idx_reader.get(doc); + let end = idx_reader.get(doc + 1); + total_num_vals += end - start; } } } else { - total_num_vals += multi_ff_reader.total_num_vals(); + total_num_vals += idx_reader.max_value(); } } @@ -311,11 +316,13 @@ impl IndexMerger { let mut serialize_idx = fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?; let mut idx = 0; for reader in &self.readers { - let multi_ff_reader = reader.multi_fast_field_reader::(field)?; + let idx_reader = reader.fast_field_reader_with_idx::(field, 0)?; for doc in 0u32..reader.max_doc() { if !reader.is_deleted(doc) { serialize_idx.add_val(idx)?; - idx += multi_ff_reader.num_vals(doc) as u64; + let start = idx_reader.get(doc); + let end = idx_reader.get(doc + 1); + idx += end - start; } } } @@ -334,7 +341,7 @@ impl IndexMerger { // The second contains the actual values. // First we merge the idx fast field. - self.write_multi_fast_field_idx(field, fast_field_serializer)?; + self.write_fast_field_idx(field, fast_field_serializer)?; // We can now write the actual fast field values. // In the case of hierarchical facets, they are actually term ordinals. @@ -368,7 +375,7 @@ impl IndexMerger { // The second contains the actual values. // First we merge the idx fast field. - self.write_multi_fast_field_idx(field, fast_field_serializer)?; + self.write_fast_field_idx(field, fast_field_serializer)?; let mut min_value = u64::max_value(); let mut max_value = u64::min_value(); @@ -421,6 +428,27 @@ impl IndexMerger { Ok(()) } + fn write_bytes_fast_field(&self, + field: Field, + fast_field_serializer: &mut FastFieldSerializer) -> Result<()> { + self.write_fast_field_idx(field, fast_field_serializer)?; + + let mut serialize_vals = fast_field_serializer.new_bytes_fast_field_with_idx(field, 1)?; + for reader in &self.readers { + let bytes_reader = reader.bytes_fast_field_reader(field)?; + // TODO: optimize if no deletes + for doc in 0..reader.max_doc() { + if !reader.is_deleted(doc) { + let val = bytes_reader.get_val(doc); + serialize_vals.write_all(val)?; + } + } + } + serialize_vals.flush()?; + + Ok(()) + } + fn write_postings_for_field(&self, indexed_field: Field, field_type: &FieldType, @@ -629,19 +657,21 @@ mod tests { use query::TermQuery; use schema; use schema::Document; - use schema::Field; use schema::IndexRecordOption; use schema::Term; use schema::TextFieldIndexing; use DocAddress; use Searcher; - use collector::tests::FastFieldTestCollector; + use collector::tests::{BytesFastFieldTestCollector, FastFieldTestCollector}; + use collector::chain; use schema::Cardinality; use futures::Future; use IndexWriter; use query::AllQuery; use collector::FacetCollector; use schema::IntOptions; + use byteorder::{ReadBytesExt, WriteBytesExt, BigEndian}; + use std::io::Cursor; #[test] fn test_index_merger_no_deletes() { @@ -656,8 +686,15 @@ mod tests { let text_field = schema_builder.add_text_field("text", text_fieldtype); let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue); let score_field = schema_builder.add_u64_field("score", score_fieldtype); + let bytes_score_field = schema_builder.add_bytes_field("score_bytes"); let index = Index::create_in_ram(schema_builder.build()); + let add_score_bytes = |doc: &mut Document, score: u32| { + let mut bytes = Vec::new(); + bytes.write_u32::(score).expect("failed to write u32 bytes to Vec..."); + doc.add_bytes(bytes_score_field, bytes); + }; + { let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); { @@ -666,18 +703,21 @@ mod tests { let mut doc = Document::default(); doc.add_text(text_field, "af b"); doc.add_u64(score_field, 3); + add_score_bytes(&mut doc, 3); index_writer.add_document(doc); } { let mut doc = Document::default(); doc.add_text(text_field, "a b c"); doc.add_u64(score_field, 5); + add_score_bytes(&mut doc, 5); index_writer.add_document(doc); } { let mut doc = Document::default(); doc.add_text(text_field, "a b c d"); doc.add_u64(score_field, 7); + add_score_bytes(&mut doc, 7); index_writer.add_document(doc); } index_writer.commit().expect("committed"); @@ -689,12 +729,14 @@ mod tests { let mut doc = Document::default(); doc.add_text(text_field, "af b"); doc.add_u64(score_field, 11); + add_score_bytes(&mut doc, 11); index_writer.add_document(doc); } { let mut doc = Document::default(); doc.add_text(text_field, "a b c g"); doc.add_u64(score_field, 13); + add_score_bytes(&mut doc, 13); index_writer.add_document(doc); } index_writer.commit().expect("Commit failed"); @@ -765,21 +807,24 @@ mod tests { assert!(searcher.search(&query, &mut collector).is_ok()); collector.vals() }; + let get_fast_vals_bytes = |terms: Vec| { + let query = BooleanQuery::new_multiterms_query(terms); + let mut collector = BytesFastFieldTestCollector::for_field(bytes_score_field); + searcher.search(&query, &mut collector).expect("failed to search"); + collector.vals() + }; assert_eq!( get_fast_vals(vec![Term::from_field_text(text_field, "a")]), vec![5, 7, 13] ); + assert_eq!( + get_fast_vals_bytes(vec![Term::from_field_text(text_field, "a")]), + vec![0,0,0,5, 0,0,0,7, 0,0,0,13] + ); } } } - fn search_term(searcher: &Searcher, term: Term) -> Vec { - let mut collector = FastFieldTestCollector::for_field(Field(1)); - let term_query = TermQuery::new(term, IndexRecordOption::Basic); - searcher.search(&term_query, &mut collector).unwrap(); - collector.vals() - } - #[test] fn test_index_merger_with_deletes() { let mut schema_builder = schema::SchemaBuilder::default(); @@ -791,25 +836,51 @@ mod tests { let text_field = schema_builder.add_text_field("text", text_fieldtype); let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue); let score_field = schema_builder.add_u64_field("score", score_fieldtype); + let bytes_score_field = schema_builder.add_bytes_field("score_bytes"); let index = Index::create_in_ram(schema_builder.build()); let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + let search_term = |searcher: &Searcher, term: Term| { + let mut collector = FastFieldTestCollector::for_field(score_field); + let mut bytes_collector = BytesFastFieldTestCollector::for_field(bytes_score_field); + let term_query = TermQuery::new(term, IndexRecordOption::Basic); + + { + let mut combined_collector = chain() + .push(&mut collector) + .push(&mut bytes_collector); + searcher.search(&term_query, &mut combined_collector).unwrap(); + } + + let scores = collector.vals(); + + let mut score_bytes = Cursor::new(bytes_collector.vals()); + for &score in &scores { + assert_eq!(score as u32, score_bytes.read_u32::().unwrap()); + } + + scores + }; + let empty_vec = Vec::::new(); { // a first commit index_writer.add_document(doc!( text_field => "a b d", - score_field => 1u64 + score_field => 1u64, + bytes_score_field => vec![0u8, 0, 0, 1], )); index_writer.add_document(doc!( text_field => "b c", - score_field => 2u64 + score_field => 2u64, + bytes_score_field => vec![0u8, 0, 0, 2], )); index_writer.delete_term(Term::from_field_text(text_field, "c")); index_writer.add_document(doc!( text_field => "c d", - score_field => 3u64 + score_field => 3u64, + bytes_score_field => vec![0u8, 0, 0, 3], )); index_writer.commit().expect("committed"); index.load_searchers().unwrap(); @@ -838,21 +909,25 @@ mod tests { // a second commit index_writer.add_document(doc!( text_field => "a d e", - score_field => 4_000u64 + score_field => 4_000u64, + bytes_score_field => vec![0u8, 0, 0, 4], )); index_writer.add_document(doc!( text_field => "e f", - score_field => 5_000u64 + score_field => 5_000u64, + bytes_score_field => vec![0u8, 0, 0, 5], )); index_writer.delete_term(Term::from_field_text(text_field, "a")); index_writer.delete_term(Term::from_field_text(text_field, "f")); index_writer.add_document(doc!( text_field => "f g", - score_field => 6_000u64 + score_field => 6_000u64, + bytes_score_field => vec![0u8, 0, 23, 112], )); index_writer.add_document(doc!( text_field => "g h", - score_field => 7_000u64 + score_field => 7_000u64, + bytes_score_field => vec![0u8, 0, 27, 88], )); index_writer.commit().expect("committed"); index.load_searchers().unwrap(); diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 717610b10..bfd3dd72a 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -202,6 +202,9 @@ impl<'a> SegmentWriter<'a> { } } } + FieldType::Bytes => { + // Do nothing. Bytes only supports fast fields. + } } } doc.filter_fields(|field| schema.get_field_entry(field).is_stored()); diff --git a/src/lib.rs b/src/lib.rs index 3e57b9ead..f1e59f2c9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -116,6 +116,9 @@ extern crate lazy_static; #[macro_use] extern crate serde_derive; +#[cfg_attr(test, macro_use)] +extern crate serde_json; + #[macro_use] extern crate log; @@ -124,6 +127,7 @@ extern crate error_chain; #[cfg(feature = "mmap")] extern crate atomicwrites; +extern crate base64; extern crate bit_set; extern crate bitpacking; extern crate byteorder; @@ -140,7 +144,6 @@ extern crate owning_ref; extern crate regex; extern crate rust_stemmers; extern crate serde; -extern crate serde_json; extern crate stable_deref_trait; extern crate tempdir; extern crate tempfile; diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 472e9e2e0..34773cbee 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -14,7 +14,6 @@ mod serializer; mod term_info; pub(crate) use self::postings_writer::MultiFieldPostingsWriter; -use self::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder}; pub use self::serializer::{FieldSerializer, InvertedIndexSerializer}; pub use self::postings::Postings; diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 37ba69ecd..d05b14271 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -1,7 +1,6 @@ use datastruct::stacker::{Heap, TermHashMap}; -use postings::Recorder; use postings::{FieldSerializer, InvertedIndexSerializer}; -use postings::{NothingRecorder, TFAndPositionRecorder, TermFrequencyRecorder}; +use postings::recorder::{Recorder, NothingRecorder, TFAndPositionRecorder, TermFrequencyRecorder}; use schema::{FieldEntry, FieldType, Term, Field, Schema}; use std::collections::HashMap; use std::io; @@ -37,6 +36,11 @@ fn posting_from_field_entry<'a>( FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => { SpecializedPostingsWriter::::new_boxed(heap) } + FieldType::Bytes => { + // FieldType::Bytes cannot actually be indexed. + // TODO fix during the indexer refactoring described in #276 + SpecializedPostingsWriter::::new_boxed(heap) + } } } @@ -112,7 +116,7 @@ impl<'a> MultiFieldPostingsWriter<'a> { let field_entry = self.schema.get_field_entry(field); match field_entry.field_type() { - FieldType::Str(_) | FieldType::HierarchicalFacet => { + &FieldType::Str(_) | &FieldType::HierarchicalFacet => { // populating the (unordered term ord) -> (ordered term ord) mapping // for the field. let mut unordered_term_ids = term_offsets[start..stop] @@ -124,7 +128,8 @@ impl<'a> MultiFieldPostingsWriter<'a> { .collect(); unordered_term_mappings.insert(field, mapping); } - FieldType::U64(_) | FieldType::I64(_) => {} + &FieldType::U64(_) | &FieldType::I64(_) => {} + &FieldType::Bytes => {} } let postings_writer = &self.per_field_postings_writers[field.0 as usize]; diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index beef7fcea..e7c461c92 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -227,6 +227,10 @@ impl QueryParser { let term = Term::from_field_text(field, phrase); Ok(Some(LogicalLiteral::Term(term))) } + FieldType::Bytes => { + let field_name = self.schema.get_field_name(field).to_string(); + Err(QueryParserError::FieldNotIndexed(field_name)) + } } } diff --git a/src/schema/document.rs b/src/schema/document.rs index 4f32c7d97..c6a508a94 100644 --- a/src/schema/document.rs +++ b/src/schema/document.rs @@ -87,6 +87,11 @@ impl Document { self.add(FieldValue::new(field, Value::I64(value))); } + /// Add a bytes field + pub fn add_bytes(&mut self, field: Field, value: Vec) { + self.add(FieldValue::new(field, Value::Bytes(value))) + } + /// Add a field value pub fn add(&mut self, field_value: FieldValue) { self.field_values.push(field_value); diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index 4e6cf43f9..fcce0aab9 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -56,6 +56,14 @@ impl FieldEntry { } } + /// Creates a field entry for a bytes field + pub fn new_bytes(field_name: String) -> FieldEntry { + FieldEntry { + name: field_name, + field_type: FieldType::Bytes, + } + } + /// Returns the name of the field pub fn name(&self) -> &str { &self.name @@ -72,6 +80,7 @@ impl FieldEntry { FieldType::Str(ref options) => options.get_indexing_options().is_some(), FieldType::U64(ref options) | FieldType::I64(ref options) => options.is_indexed(), FieldType::HierarchicalFacet => true, + FieldType::Bytes => false, } } @@ -88,8 +97,9 @@ impl FieldEntry { match self.field_type { FieldType::U64(ref options) | FieldType::I64(ref options) => options.is_stored(), FieldType::Str(ref options) => options.is_stored(), + // TODO make stored hierarchical facet optional FieldType::HierarchicalFacet => true, - // TODO make stored hierachical facet optional + FieldType::Bytes => false, } } } @@ -118,6 +128,9 @@ impl Serialize for FieldEntry { FieldType::HierarchicalFacet => { s.serialize_field("type", "hierarchical_facet")?; } + FieldType::Bytes => { + s.serialize_field("type", "bytes")?; + } } s.end() @@ -167,10 +180,20 @@ impl<'de> Deserialize<'de> for FieldEntry { if ty.is_some() { return Err(de::Error::duplicate_field("type")); } - ty = Some(map.next_value()?); - if ty == Some("hierarchical_facet") { - field_type = Some(FieldType::HierarchicalFacet); + let type_string = map.next_value()?; + match type_string { + "hierarchical_facet" => { + field_type = Some(FieldType::HierarchicalFacet); + } + "bytes" => { + field_type = Some(FieldType::Bytes); + } + "text" | "u64" | "i64" => { + // These types require additional options to create a field_type + } + _ => panic!("unhandled type") } + ty = Some(type_string); } Field::Options => match ty { None => { @@ -205,7 +228,6 @@ impl<'de> Deserialize<'de> for FieldEntry { #[cfg(test)] mod tests { - use super::*; use schema::TEXT; use serde_json; diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 692e3b895..209d49930 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -1,3 +1,5 @@ +use base64::decode; + use schema::{IntOptions, TextOptions}; use schema::Facet; @@ -15,6 +17,9 @@ pub enum ValueParsingError { /// (e.g. 3 for a `Str` type or `"abc"` for a u64 type) /// Tantivy will try to autocast values. TypeError(String), + /// The json node is a string but contains json that is + /// not valid base64. + InvalidBase64(String), } /// Type of the value that a field can take. @@ -31,6 +36,8 @@ pub enum Type { I64, /// `tantivy::schema::Facet`. Passed as a string in JSON. HierarchicalFacet, + /// `Vec` + Bytes, } /// A `FieldType` describes the type (text, u64) of a field as well as @@ -45,6 +52,8 @@ pub enum FieldType { I64(IntOptions), /// Hierachical Facet HierarchicalFacet, + /// Bytes (one per document) + Bytes, } impl FieldType { @@ -55,6 +64,7 @@ impl FieldType { FieldType::U64(_) => Type::U64, FieldType::I64(_) => Type::I64, FieldType::HierarchicalFacet => Type::HierarchicalFacet, + FieldType::Bytes => Type::Bytes, } } @@ -66,6 +76,7 @@ impl FieldType { int_options.is_indexed() } FieldType::HierarchicalFacet => true, + FieldType::Bytes => false, } } @@ -86,6 +97,7 @@ impl FieldType { } } FieldType::HierarchicalFacet => Some(IndexRecordOption::Basic), + FieldType::Bytes => None, } } @@ -102,6 +114,13 @@ impl FieldType { format!("Expected an integer, got {:?}", json), )), FieldType::HierarchicalFacet => Ok(Value::Facet(Facet::from(field_text))), + FieldType::Bytes => { + decode(field_text) + .map(Value::Bytes) + .map_err(|_| ValueParsingError::InvalidBase64( + format!("Expected base64 string, got {:?}", field_text) + )) + } }, JsonValue::Number(ref field_val_num) => match *self { FieldType::I64(_) => { @@ -120,7 +139,7 @@ impl FieldType { Err(ValueParsingError::OverflowError(msg)) } } - FieldType::Str(_) | FieldType::HierarchicalFacet => { + FieldType::Str(_) | FieldType::HierarchicalFacet | FieldType::Bytes => { let msg = format!("Expected a string, got {:?}", json); Err(ValueParsingError::TypeError(msg)) } @@ -135,3 +154,32 @@ impl FieldType { } } } + +#[cfg(test)] +mod tests { + use super::FieldType; + use schema::Value; + use schema::field_type::ValueParsingError; + + #[test] + fn test_bytes_value_from_json() { + let result = FieldType::Bytes + .value_from_json(&json!("dGhpcyBpcyBhIHRlc3Q=")) + .unwrap(); + assert_eq!(result, Value::Bytes("this is a test".as_bytes().to_vec())); + + let result = FieldType::Bytes + .value_from_json(&json!(521)); + match result { + Err(ValueParsingError::TypeError(_)) => {} + _ => panic!("Expected parse failure for wrong type") + } + + let result = FieldType::Bytes + .value_from_json(&json!("-")); + match result { + Err(ValueParsingError::InvalidBase64(_)) => {} + _ => panic!("Expected parse failure for invalid base64") + } + } +} diff --git a/src/schema/schema.rs b/src/schema/schema.rs index c358912a6..6d4f6c949 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -95,6 +95,12 @@ impl SchemaBuilder { self.add_field(field_entry) } + /// Adds a fast bytes field to the schema + pub fn add_bytes_field(&mut self, field_name: &str) -> Field { + let field_entry = FieldEntry::new_bytes(field_name.to_string()); + self.add_field(field_entry) + } + /// Adds a field entry to the schema in build. fn add_field(&mut self, field_entry: FieldEntry) -> Field { let field = Field(self.fields.len() as u32); diff --git a/src/schema/value.rs b/src/schema/value.rs index bae96d867..f5ce151f1 100644 --- a/src/schema/value.rs +++ b/src/schema/value.rs @@ -15,6 +15,8 @@ pub enum Value { I64(i64), /// Hierarchical Facet Facet(Facet), + /// Arbitrarily sized byte array + Bytes(Vec), } impl Serialize for Value { @@ -27,6 +29,7 @@ impl Serialize for Value { Value::U64(u) => serializer.serialize_u64(u), Value::I64(u) => serializer.serialize_i64(u), Value::Facet(ref facet) => facet.serialize(serializer), + Value::Bytes(ref bytes) => serializer.serialize_bytes(bytes), } } } @@ -131,6 +134,12 @@ impl<'a> From for Value { } } +impl From> for Value { + fn from(bytes: Vec) -> Value { + Value::Bytes(bytes) + } +} + mod binary_serialize { use super::Value; use common::BinarySerializable; @@ -141,6 +150,7 @@ mod binary_serialize { const U64_CODE: u8 = 1; const I64_CODE: u8 = 2; const HIERARCHICAL_FACET_CODE: u8 = 3; + const BYTES_CODE: u8 = 4; impl BinarySerializable for Value { fn serialize(&self, writer: &mut W) -> io::Result<()> { @@ -161,6 +171,10 @@ mod binary_serialize { HIERARCHICAL_FACET_CODE.serialize(writer)?; facet.serialize(writer) } + Value::Bytes(ref bytes) => { + BYTES_CODE.serialize(writer)?; + bytes.serialize(writer) + } } } fn deserialize(reader: &mut R) -> io::Result { @@ -179,6 +193,7 @@ mod binary_serialize { Ok(Value::I64(value)) } HIERARCHICAL_FACET_CODE => Ok(Value::Facet(Facet::deserialize(reader)?)), + BYTES_CODE => Ok(Value::Bytes(Vec::::deserialize(reader)?)), _ => Err(io::Error::new( io::ErrorKind::InvalidData, format!("No field type is associated with code {:?}", type_code),