diff --git a/.travis.yml b/.travis.yml
index 0c7ec3d43..cbfbc222b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -28,6 +28,7 @@ script:
       travis-cargo test &&
       travis-cargo bench &&
       travis-cargo doc
+  - cargo run --example simple_search
 after_success:
   - bash ./script/build-doc.sh
   - travis-cargo doc-upload
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3478e57ec..9c45eb005 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,6 @@
 Tantivy 0.4.0
 ==========================
-
+- Raise the limit of number of fields (previously 256 fields)
 - Removed u32 fields. They are replaced by u64 and i64 fields (#65)
 - Replacing rustc_serialize by serde. Kudos to @KodrAus and @lnicola
 - QueryParser:
@@ -13,7 +13,6 @@
 Tantivy 0.3.1
 ==========================
 
 - Expose a method to trigger files garbage collection
-- Raise the limit of number of fields (previously 256 fields)
 
 Tantivy 0.3
diff --git a/appveyor.yml b/appveyor.yml
index 789e24400..4e016911e 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -21,4 +21,5 @@ install:
 build: false
 
 test_script:
-  - REM SET RUST_LOG=tantivy,test & cargo test --verbose
\ No newline at end of file
+  - REM SET RUST_LOG=tantivy,test & cargo test --verbose
+  - REM SET RUST_LOG=tantivy,test & cargo run --example simple_search
\ No newline at end of file
diff --git a/src/common/mod.rs b/src/common/mod.rs
index da4a169e9..ae9f56794 100644
--- a/src/common/mod.rs
+++ b/src/common/mod.rs
@@ -27,7 +27,6 @@ pub trait HasLen {
     }
 }
 
-
 const HIGHEST_BIT: u64 = 1 << 63;
diff --git a/src/core/term_iterator.rs b/src/core/term_iterator.rs
index b54377fad..63bb53c9a 100644
--- a/src/core/term_iterator.rs
+++ b/src/core/term_iterator.rs
@@ -175,9 +175,7 @@ mod tests {
         let mut term_it = searcher.terms();
         let mut terms = String::new();
         while let Some(term) = term_it.next() {
-            unsafe {
-                terms.push_str(term.text());
-            }
+            terms.push_str(term.text());
         }
         assert_eq!(terms, "abcdef");
     }
diff --git a/src/datastruct/stacker/heap.rs b/src/datastruct/stacker/heap.rs
index 235c594d5..8ba3739f5 100644
--- a/src/datastruct/stacker/heap.rs
+++ b/src/datastruct/stacker/heap.rs
@@ -45,11 +45,6 @@ impl Heap {
     pub fn capacity(&self,) -> u32 {
         self.inner().capacity()
     }
-
-    /// Return the amount of memory that has been allocated so far.
-    pub fn len(&self,) -> u32 {
-        self.inner().len()
-    }
 
     /// Return amount of free space, in bytes.
     pub fn num_free_bytes(&self,) -> u32 {
@@ -90,10 +85,6 @@ impl Heap {
     pub fn get_mut_ref<Item>(&self, addr: u32) -> &mut Item {
         self.inner().get_mut_ref(addr)
     }
-
-    pub fn get_ref<Item>(&self, addr: u32) -> &Item {
-        self.inner().get_mut_ref(addr)
-    }
 }
 
@@ -108,8 +99,9 @@ struct InnerHeap {
 
 impl InnerHeap {
     pub fn with_capacity(num_bytes: usize) -> InnerHeap {
+        let buffer: Vec<u8> = vec![0u8; num_bytes];
         InnerHeap {
-            buffer: vec![0u8; num_bytes],
+            buffer: buffer,
             buffer_len: num_bytes as u32,
             next_heap: None,
             used: 0u32,
@@ -124,10 +116,6 @@ impl InnerHeap {
     pub fn capacity(&self,) -> u32 {
         self.buffer.len() as u32
     }
-
-    pub fn len(&self,) -> u32 {
-        self.used
-    }
 
     // Returns the number of free bytes. If the buffer
     // has reached it's capacity and overflowed to another buffer, return 0.
@@ -195,8 +183,6 @@ impl InnerHeap {
         }
     }
 
-
-
     fn get_mut_ref<Item>(&mut self, addr: u32) -> &mut Item {
         if addr >= self.buffer_len {
             self.next_heap.as_mut().unwrap().get_mut_ref(addr - self.buffer_len)
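Note on the `Heap` cleanup above: the stacker heap is a chained bump allocator, and `InnerHeap::get_mut_ref` resolves an address by falling through to the overflow buffer once the address exceeds `buffer_len`. A minimal sketch of that addressing scheme; the names (`ChainedBuf`, `byte_at`) are illustrative and not part of tantivy's API:

```rust
// Illustrative sketch of chained-buffer addressing: an address past the
// first buffer's length is resolved (recursively) in the overflow buffer.
struct ChainedBuf {
    buffer: Vec<u8>,
    next: Option<Box<ChainedBuf>>,
}

impl ChainedBuf {
    fn byte_at(&self, addr: u32) -> u8 {
        let len = self.buffer.len() as u32;
        if addr >= len {
            // Fall through to the overflow buffer, shifting the address.
            self.next.as_ref().unwrap().byte_at(addr - len)
        } else {
            self.buffer[addr as usize]
        }
    }
}

fn main() {
    let heap = ChainedBuf {
        buffer: vec![10, 11],
        next: Some(Box::new(ChainedBuf { buffer: vec![12, 13], next: None })),
    };
    // Address 3 is past the first buffer (len 2), so it lands at
    // offset 1 of the overflow buffer.
    assert_eq!(heap.byte_at(3), 13);
}
```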
diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs
index fa4bd42de..7e41f5582 100644
--- a/src/indexer/index_writer.rs
+++ b/src/indexer/index_writer.rs
@@ -120,7 +120,9 @@ pub fn open_index_writer(
 
     let delete_queue = DeleteQueue::new();
 
-    let stamper = Stamper::new(index.opstamp());
+    let current_opstamp = index.opstamp();
+
+    let stamper = Stamper::new(current_opstamp);
 
     let segment_updater = SegmentUpdater::new(index.clone(),
                                               stamper.clone(),
@@ -143,7 +145,7 @@ pub fn open_index_writer(
 
         delete_queue: delete_queue,
 
-        committed_opstamp: index.opstamp(),
+        committed_opstamp: current_opstamp,
         stamper: stamper,
 
         generation: 0,
diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
index 38e5271b4..d63333344 100644
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -32,7 +32,7 @@ struct DeltaPositionComputer {
 impl DeltaPositionComputer {
     fn new() -> DeltaPositionComputer {
         DeltaPositionComputer {
-            buffer: vec![0u32, 512]
+            buffer: vec![0u32; 512]
         }
     }
@@ -201,6 +201,8 @@ impl IndexMerger {
             }
             merged_doc_id_map.push(segment_local_map);
         }
+
+        let mut field = Field(u32::max_value());
 
         while merged_terms.advance() {
             // Create the total list of doc ids
@@ -231,15 +233,19 @@ impl IndexMerger {
 
             // We can now serialize this postings, by pushing each document to the
             // postings serializer.
-
             for (segment_ord, mut segment_postings) in segment_postings {
                 let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
                 while segment_postings.advance() {
                     if let Some(remapped_doc_id) = old_to_new_doc_id[segment_postings.doc() as usize] {
                         if !term_written {
+                            let current_field = term.field();
+                            if current_field != field {
+                                postings_serializer.new_field(current_field);
+                                field = current_field;
+                            }
                             // we make sure to only write the term iff
                             // there is at least one document.
-                            postings_serializer.new_term(&term)?;
+                            postings_serializer.new_term(term.as_slice())?;
                             term_written = true;
                         }
                         let delta_positions: &[u32] =
diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs
index 3ec06ff3f..7b55597c3 100644
--- a/src/indexer/segment_updater.rs
+++ b/src/indexer/segment_updater.rs
@@ -77,7 +77,7 @@ pub fn save_metas(segment_metas: Vec<SegmentMeta>,
         schema: schema,
         opstamp: opstamp,
     };
-    let mut w = try!(serde_json::to_vec(&metas));
+    let mut w = try!(serde_json::to_vec_pretty(&metas));
     try!(write!(&mut w, "\n"));
     let res = directory.atomic_write(&META_FILEPATH, &w[..])?;
     debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
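One detail of the merger change above deserves a callout: `vec![0u32, 512]` and `vec![0u32; 512]` differ by a single character but not by a single element. The comma form builds a two-element vector, while the semicolon form builds 512 zeroed elements, which is what the position buffer needs:

```rust
fn main() {
    let two_elements = vec![0u32, 512]; // the old, buggy form: [0, 512]
    let zeroed_512 = vec![0u32; 512];   // the fix: 512 zeros
    assert_eq!(two_elements.len(), 2);
    assert_eq!(zeroed_512.len(), 512);
    assert!(zeroed_512.iter().all(|&v| v == 0));
}
```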
diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs
index ad51315a2..8878fd519 100644
--- a/src/indexer/segment_writer.rs
+++ b/src/indexer/segment_writer.rs
@@ -76,9 +76,6 @@ impl<'a> SegmentWriter<'a> {
     /// Finalize consumes the `SegmentWriter`, so that it cannot
     /// be used afterwards.
     pub fn finalize(self) -> Result> {
-        // for per_field_postings_writer in &mut self.per_field_postings_writers {
-        //     per_field_postings_writer.close(self.heap);
-        // }
         write(&self.multifield_postings,
               &self.fast_field_writers,
               &self.fieldnorms_writer,
@@ -149,7 +146,6 @@ impl<'a> SegmentWriter<'a> {
                 for field_value in field_values {
                     let term = Term::from_field_i64(field_value.field(), field_value.value().i64_value());
                     self.multifield_postings.suscribe(doc_id, &term);
-                    // field_posting_writer.suscribe(term_index, doc_id, 0, &term, self.heap);
                 }
             }
         }
diff --git a/src/lib.rs b/src/lib.rs
index 7f889691d..b4c1068ed 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -126,7 +126,6 @@ pub use schema::{Term, Document};
 pub use core::SegmentReader;
 pub use self::common::TimerTree;
 
-
 pub use postings::DocSet;
 pub use postings::Postings;
 pub use postings::SegmentPostingsOption;
diff --git a/src/postings/mod.rs b/src/postings/mod.rs
index 4133a57a2..b244e9b8f 100644
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -17,16 +17,15 @@ mod docset;
 mod segment_postings_option;
 
 pub use self::docset::{SkipResult, DocSet};
-pub use self::recorder::{Recorder, NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
+use self::recorder::{Recorder, NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
 pub use self::serializer::PostingsSerializer;
-pub use self::postings_writer::PostingsWriter;
-pub use self::postings_writer::SpecializedPostingsWriter;
-pub use self::postings_writer::MultiFieldPostingsWriter;
+pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
 pub use self::term_info::TermInfo;
 pub use self::postings::Postings;
 
 #[cfg(test)]
 pub use self::vec_postings::VecPostings;
+
 pub use self::segment_postings::SegmentPostings;
 pub use self::intersection::IntersectionDocSet;
 pub use self::freq_handler::FreqHandler;
@@ -61,8 +60,8 @@ mod tests {
         let index = Index::create_in_ram(schema);
         let mut segment = index.new_segment();
         let mut posting_serializer = PostingsSerializer::open(&mut segment).unwrap();
-        let term = Term::from_field_text(text_field, "abc");
-        posting_serializer.new_term(&term).unwrap();
+        posting_serializer.new_field(text_field);
+        posting_serializer.new_term("abc".as_bytes()).unwrap();
         for doc_id in 0u32..3u32 {
             let positions = vec!(1,2,3,2);
             posting_serializer.write_doc(doc_id, 2, &positions).unwrap();
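The `postings/mod.rs` change narrows the crate's public surface: `pub(crate) use` keeps a re-export importable from anywhere inside the crate while removing it from the public API. A self-contained sketch of the mechanism; the module layout here is illustrative, only the type name is borrowed from the diff:

```rust
mod postings {
    // Only visible within the enclosing crate.
    pub(crate) struct MultiFieldPostingsWriter;
}

// Crate-private re-export at the root: internal modules can `use` it,
// but external users of the crate cannot.
pub(crate) use self::postings::MultiFieldPostingsWriter;

fn main() {
    let _writer = MultiFieldPostingsWriter;
}
```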
diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs
index 867bfa00d..b317c22f1 100644
--- a/src/postings/postings_writer.rs
+++ b/src/postings/postings_writer.rs
@@ -114,6 +114,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
             let (_, stop) = offsets[i+1];
             let postings_writer = &self.per_field_postings_writers[field.0 as usize];
             postings_writer.serialize(
+                field,
                 &term_offsets[start..stop],
                 serializer,
                 self.heap)?;
@@ -144,7 +145,7 @@ pub trait PostingsWriter {
 
     /// Serializes the postings on disk.
     /// The actual serialization format is handled by the `PostingsSerializer`.
-    fn serialize(&self, term_addrs: &[(&[u8], u32)], serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
+    fn serialize(&self, field: Field, term_addrs: &[(&[u8], u32)], serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
 
     /// Tokenize a text and suscribe all of its token.
     fn index_text<'a>(&mut self,
@@ -156,7 +157,8 @@ pub trait PostingsWriter {
                       -> u32 {
         let mut pos = 0u32;
         let mut num_tokens: u32 = 0u32;
-        let mut term = Term::allocate(field, 100);
+        let mut term = unsafe { Term::with_capacity(100) };
+        term.set_field(field);
         for field_value in field_values {
             let mut tokens = SimpleTokenizer.tokenize(field_value.value().text());
             // right now num_tokens and pos are redundant, but it should
@@ -226,17 +228,19 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {
     }
 
     fn serialize(&self,
+                 field: Field,
                  term_addrs: &[(&[u8], u32)],
                  serializer: &mut PostingsSerializer,
                  heap: &Heap) -> io::Result<()> {
-        let mut term = Term::allocate(Field(0), 100);
+
+        serializer.new_field(field);
         for &(term_bytes, addr) in term_addrs {
             let recorder: &mut Rec = self.heap.get_mut_ref(addr);
-            term.set_content(term_bytes);
-            try!(serializer.new_term(&term));
+            try!(serializer.new_term(&term_bytes));
             try!(recorder.serialize(addr, serializer, heap));
             try!(serializer.close_term());
         }
+
        Ok(())
    }
}
diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs
index 107d5a881..f8c38ebc2 100644
--- a/src/postings/serializer.rs
+++ b/src/postings/serializer.rs
@@ -1,7 +1,6 @@
 use Result;
 use datastruct::FstMapBuilder;
 use super::TermInfo;
-use schema::Term;
 use schema::Field;
 use schema::FieldEntry;
 use schema::FieldType;
@@ -30,7 +29,7 @@ use common::BinarySerializable;
 ///
 /// The serializer expects to receive the following calls
 /// in this order :
-///
+/// * `new_field(...)`
 /// * `new_term(...)`
 /// * `write_doc(...)`
 /// * `write_doc(...)`
@@ -41,6 +40,8 @@ use common::BinarySerializable;
 /// * `write_doc(...)`
 /// * ...
 /// * `close_term()`
+/// * `new_field(...)`
+/// * ...
 /// * `close()`
 ///
 /// Terms have to be pushed in a lexicographically-sorted order.
@@ -105,7 +106,11 @@ impl PostingsSerializer {
                               segment.schema())
     }
 
-    fn load_indexing_options(&mut self, field: Field) {
+    /// Must be called before pushing the terms of a given field.
+    ///
+    /// Loads the indexing options for the given field.
+    pub fn new_field(&mut self, field: Field) {
         let field_entry: &FieldEntry = self.schema.get_field_entry(field);
         self.text_indexing_options = match *field_entry.field_type() {
             FieldType::Str(ref text_options) => text_options.get_indexing_options(),
@@ -130,13 +135,11 @@ impl PostingsSerializer {
     /// * term - the term. It needs to come after the previous term according
     ///   to the lexicographical order.
     /// * doc_freq - return the number of document containing the term.
-    pub fn new_term(&mut self, term: &Term) -> io::Result<()> {
+    pub fn new_term(&mut self, term: &[u8]) -> io::Result<()> {
         if self.term_open {
             panic!("Called new_term, while the previous term was not closed.");
         }
         self.term_open = true;
-        // TODO avoid load indexing options all the time.
-        self.load_indexing_options(term.field());
         self.doc_ids.clear();
         self.last_doc_id_encoded = 0;
         self.term_freqs.clear();
@@ -146,7 +149,7 @@
             postings_offset: self.written_bytes_postings as u32,
             positions_offset: self.written_bytes_positions as u32,
         };
-        self.terms_fst_builder.insert_key(term.as_slice())
+        self.terms_fst_builder.insert_key(term)
     }
 
     /// Finish the serialization for this term postings.
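To make the serializer's new calling convention concrete: `new_field` must now be called once per field, before any `new_term` for that field, and terms must still arrive in lexicographic order. A hedged sketch of a driver loop, using a stand-in call logger rather than tantivy's actual `PostingsSerializer`:

```rust
use std::io;

// Stand-in that only records the call sequence; not tantivy's type.
struct CallLogger {
    calls: Vec<String>,
}

impl CallLogger {
    fn new_field(&mut self, field: u32) {
        self.calls.push(format!("new_field({})", field));
    }
    fn new_term(&mut self, term: &[u8]) -> io::Result<()> {
        self.calls.push(format!("new_term({:?})", term));
        Ok(())
    }
    fn write_doc(&mut self, doc: u32) -> io::Result<()> {
        self.calls.push(format!("write_doc({})", doc));
        Ok(())
    }
    fn close_term(&mut self) -> io::Result<()> {
        self.calls.push("close_term()".to_string());
        Ok(())
    }
}

fn main() -> io::Result<()> {
    let mut serializer = CallLogger { calls: Vec::new() };
    // One `new_field` per field, then each term in lexicographic order.
    serializer.new_field(0);
    let terms: [&[u8]; 2] = [b"abc", b"abd"];
    for term in terms {
        serializer.new_term(term)?;
        serializer.write_doc(0)?;
        serializer.write_doc(2)?;
        serializer.close_term()?;
    }
    println!("{:#?}", serializer.calls);
    Ok(())
}
```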
diff --git a/src/schema/mod.rs b/src/schema/mod.rs
index 27b658837..6e6328000 100644
--- a/src/schema/mod.rs
+++ b/src/schema/mod.rs
@@ -109,7 +109,7 @@ mod field;
 mod value;
 mod named_field_document;
 
-pub use self::term::extract_field_from_term_bytes;
+pub(crate) use self::term::extract_field_from_term_bytes;
 pub use self::named_field_document::NamedFieldDocument;
 pub use self::schema::{Schema, SchemaBuilder};
 pub use self::value::Value;
diff --git a/src/schema/term.rs b/src/schema/term.rs
index 50cff08fb..a539d6e2b 100644
--- a/src/schema/term.rs
+++ b/src/schema/term.rs
@@ -1,11 +1,14 @@
 use std::fmt;
 use common;
-use byteorder::{BigEndian, WriteBytesExt, ByteOrder};
+use byteorder::{BigEndian, ByteOrder};
 use super::Field;
 use std::str;
 
+/// Size (in bytes) of the buffer of an int field.
+const INT_TERM_LEN: usize = 4 + 8;
+
 /// Term represents the value that the token can take.
 ///
 /// It actually wraps a `Vec<u8>`.
@@ -14,18 +17,11 @@
 pub struct Term(Vec<u8>);
 
 /// Extract `field` from Term.
 #[doc(hidden)]
-pub fn extract_field_from_term_bytes(term_bytes: &[u8]) -> Field {
+pub(crate) fn extract_field_from_term_bytes(term_bytes: &[u8]) -> Field {
     Field(BigEndian::read_u32(&term_bytes[..4]))
 }
 
 impl Term {
-
-    /// Pre-allocate a term buffer.
-    pub fn allocate(field: Field, num_bytes: usize) -> Term {
-        let mut term = Term(Vec::with_capacity(num_bytes));
-        term.0.write_u32::<BigEndian>(field.0).expect("serializing u32 to Vec");
-        term
-    }
 
     /// Set the content of the term.
     pub fn set_content(&mut self, content: &[u8]) {
@@ -39,6 +35,14 @@ impl Term {
         extract_field_from_term_bytes(&self.0)
     }
 
+    /// Sets the field of the term.
+    pub fn set_field(&mut self, field: Field) {
+        if self.0.len() < 4 {
+            self.0.resize(4, 0u8);
+        }
+        BigEndian::write_u32(&mut self.0[0..4], field.0);
+    }
+
     /// Builds a term given a field, and a u64-value
     ///
     /// Assuming the term has a field id of 1, and a u64 value of 3234,
@@ -47,13 +51,21 @@ impl Term {
     /// The first four byte are dedicated to storing the field id as a u64.
     /// The 4 following bytes are encoding the u64 value.
     pub fn from_field_u64(field: Field, val: u64) -> Term {
-        const U64_TERM_LEN: usize = 4 + 8;
-        let mut buffer = vec![0u8; U64_TERM_LEN];
-        // we want BigEndian here to have lexicographic order
-        // match the natural order of `(field, val)`
-        BigEndian::write_u32(&mut buffer[0..4], field.0);
-        BigEndian::write_u64(&mut buffer[4..], val);
-        Term(buffer)
+        let mut term = Term(vec![0u8; INT_TERM_LEN]);
+        term.set_field(field);
+        term.set_u64(val);
+        term
+    }
+
+    /// Sets a u64 value in the term.
+    ///
+    /// U64 are serialized using (8-byte) BigEndian
+    /// representation.
+    /// The use of BigEndian has the benefit of preserving
+    /// the natural order of the values.
+    pub fn set_u64(&mut self, val: u64) {
+        self.0.resize(INT_TERM_LEN, 0u8);
+        BigEndian::write_u64(&mut self.0[4..], val);
     }
 
     /// Builds a term given a field, and a u64-value
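The layout behind `set_field` and `set_u64`: a term buffer is the 4-byte BigEndian field id followed by the 8-byte BigEndian value, so byte-wise comparison of two buffers matches the natural order of `(field, value)`. A small demonstration using only the standard library (no `byteorder` crate):

```rust
// Build a 12-byte term buffer: 4-byte BE field id + 8-byte BE value.
fn term_bytes(field: u32, val: u64) -> Vec<u8> {
    let mut buf = Vec::with_capacity(12);
    buf.extend_from_slice(&field.to_be_bytes());
    buf.extend_from_slice(&val.to_be_bytes());
    buf
}

fn main() {
    let a = term_bytes(1, 3_234);
    let b = term_bytes(1, 100_000);
    let c = term_bytes(2, 0);
    // Lexicographic byte order agrees with (field, value) order,
    // which is why BigEndian is used here.
    assert!(a < b && b < c);
}
```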
@@ -75,10 +87,21 @@ impl Term {
     /// The first byte is 2, and the three following bytes are the utf-8
     /// representation of "abc".
     pub fn from_field_text(field: Field, text: &str) -> Term {
-        let mut buffer = vec![0u8; 4 + text.len()];
-        BigEndian::write_u32(&mut buffer[0..4], field.0);
-        buffer[4..].clone_from_slice(text.as_bytes());
-        Term(buffer)
+        let buffer = Vec::with_capacity(4 + text.len());
+        let mut term = Term(buffer);
+        term.set_field(field);
+        term.set_text(text);
+        term
+    }
+
+    /// Creates a new Term with an empty buffer,
+    /// but with a given capacity.
+    ///
+    /// It is declared unsafe, as the term content
+    /// is not initialized, and a call to `.field()`
+    /// would panic.
+    pub(crate) unsafe fn with_capacity(num_bytes: usize) -> Term {
+        Term(Vec::with_capacity(num_bytes))
     }
 
     /// Assume the term is a u64 field.
@@ -112,8 +135,8 @@ impl Term {
     /// If the value is not valid utf-8. This may happen
     /// if the index is corrupted or if you try to
     /// call this method on a non-string type.
-    pub unsafe fn text(&self) -> &str {
-        str::from_utf8_unchecked(self.value())
+    pub fn text(&self) -> &str {
+        str::from_utf8(self.value()).expect("Term does not contain valid utf-8.")
     }
 
     /// Set the texts only, keeping the field untouched.
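Finally, the `text()` change swaps `str::from_utf8_unchecked`, which is undefined behavior on invalid bytes and hence forced the method to be `unsafe`, for the checked `str::from_utf8`, which fails loudly instead. A minimal sketch of the new behavior:

```rust
use std::str;

// Checked conversion, mirroring the new `Term::text()`: invalid
// utf-8 now panics with a clear message instead of being UB.
fn text(value: &[u8]) -> &str {
    str::from_utf8(value).expect("Term does not contain valid utf-8.")
}

fn main() {
    assert_eq!(text(b"abc"), "abc");
    // `text(&[0xff, 0xfe])` would panic rather than yield an invalid &str.
}
```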