Code cleaning

2026-06-05 01:50:42 +00:00 · 2017-05-11 20:47:30 +09:00
parent 54ab897755
commit 6fd17e0ead
16 changed files with 91 additions and 75 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -28,6 +28,7 @@ script:
    travis-cargo test &&
    travis-cargo bench &&
    travis-cargo doc
+  - cargo run --example simple_search
 after_success:
  - bash ./script/build-doc.sh
  - travis-cargo doc-upload
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,6 @@
 Tantivy 0.4.0
 ==========================
-
+- Raise the limit of number of fields (previously 256 fields)
 - Removed u32 fields. They are replaced by u64 and i64 fields (#65)
 - Replacing rustc_serialize by serde. Kudos to @KodrAus and @lnicola
 - QueryParser:
@@ -13,7 +13,6 @@ Tantivy 0.3.1
 ==========================

 - Expose a method to trigger files garbage collection
- Raise the limit of number of fields (previously 256 fields)


 Tantivy 0.3
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -21,4 +21,5 @@ install:
 build: false

 test_script:
-  - REM SET RUST_LOG=tantivy,test & cargo test --verbose
+  - REM SET RUST_LOG=tantivy,test & cargo test --verbose
+  - REM SET RUST_LOG=tantivy,test & cargo run --example simple_search
--- a/src/common/mod.rs
+++ b/src/common/mod.rs
@@ -27,7 +27,6 @@ pub trait HasLen {
    }
 }

-
 const HIGHEST_BIT: u64 = 1 << 63;


--- a/src/core/term_iterator.rs
+++ b/src/core/term_iterator.rs
@@ -175,9 +175,7 @@ mod tests {
        let mut term_it = searcher.terms();
        let mut terms = String::new();
        while let Some(term) = term_it.next() {
-            unsafe {
-                terms.push_str(term.text());
-            }
+            terms.push_str(term.text());
        }
        assert_eq!(terms, "abcdef");
    }
--- a/src/datastruct/stacker/heap.rs
+++ b/src/datastruct/stacker/heap.rs
@@ -45,11 +45,6 @@ impl Heap {
    pub fn capacity(&self,) -> u32 {
        self.inner().capacity()
    }
-    
-    /// Return the amount of memory that has been allocated so far. 
-    pub fn len(&self,) -> u32 {
-        self.inner().len()
-    }
        
    /// Return amount of free space, in bytes.
    pub fn num_free_bytes(&self,) -> u32 {
@@ -90,10 +85,6 @@ impl Heap {
    pub fn get_mut_ref<Item>(&self, addr: u32) -> &mut Item {
        self.inner().get_mut_ref(addr)
    }
-
-    pub fn get_ref<Item>(&self, addr: u32) -> &Item {
-        self.inner().get_mut_ref(addr)
-    }
 }


@@ -108,8 +99,9 @@ struct InnerHeap {
 impl InnerHeap {

    pub fn with_capacity(num_bytes: usize) -> InnerHeap {
+        let buffer: Vec<u8> = vec![0u8; num_bytes];
        InnerHeap {
-            buffer: vec![0u8; num_bytes],
+            buffer: buffer,
            buffer_len: num_bytes as u32,
            next_heap: None,
            used: 0u32,
@@ -124,10 +116,6 @@ impl InnerHeap {
    pub fn capacity(&self,) -> u32 {
        self.buffer.len() as u32
    }
-
-    pub fn len(&self,) -> u32 {
-        self.used
-    }
    
    // Returns the number of free bytes. If the buffer
    // has reached it's capacity and overflowed to another buffer, return 0.
@@ -195,8 +183,6 @@ impl InnerHeap {
        }
    }

-
-
    fn get_mut_ref<Item>(&mut self, addr: u32) -> &mut Item {
        if addr >= self.buffer_len {
            self.next_heap.as_mut().unwrap().get_mut_ref(addr - self.buffer_len)
--- a/src/indexer/index_writer.rs
+++ b/src/indexer/index_writer.rs
@@ -120,7 +120,9 @@ pub fn open_index_writer(

    let delete_queue = DeleteQueue::new();
    
-    let stamper = Stamper::new(index.opstamp());
+    let current_opstamp = index.opstamp();
+
+    let stamper = Stamper::new(current_opstamp);

    let segment_updater = SegmentUpdater::new(index.clone(), 
                                              stamper.clone(),
@@ -143,7 +145,7 @@ pub fn open_index_writer(

        delete_queue: delete_queue,

-        committed_opstamp: index.opstamp(),
+        committed_opstamp: current_opstamp,
        stamper: stamper,

        generation: 0,
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -32,7 +32,7 @@ struct DeltaPositionComputer {
 impl DeltaPositionComputer {
    fn new() -> DeltaPositionComputer {
        DeltaPositionComputer { 
-            buffer: vec![0u32, 512]
+            buffer: vec![0u32; 512]
        }
    }

@@ -201,6 +201,8 @@ impl IndexMerger {
            }
            merged_doc_id_map.push(segment_local_map);
        }
+
+        let mut field = Field(u32::max_value());
        
        while merged_terms.advance() {
            // Create the total list of doc ids
@@ -231,15 +233,19 @@ impl IndexMerger {
                
                // We can now serialize this postings, by pushing each document to the
                // postings serializer.                
-                
                for (segment_ord, mut segment_postings) in segment_postings {
                    let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
                    while segment_postings.advance() {
                        if let Some(remapped_doc_id) = old_to_new_doc_id[segment_postings.doc() as usize] {
                            if !term_written {
+                                let current_field = term.field();
+                                if current_field != field {
+                                    postings_serializer.new_field(current_field);
+                                    field = current_field;
+                                }
                                // we make sure to only write the term iff
                                // there is at least one document.
-                                postings_serializer.new_term(&term)?;
+                                postings_serializer.new_term(term.as_slice())?;
                                term_written = true;
                            }
                            let delta_positions: &[u32] =
--- a/src/indexer/segment_updater.rs
+++ b/src/indexer/segment_updater.rs
@@ -77,7 +77,7 @@ pub fn save_metas(segment_metas: Vec<SegmentMeta>,
        schema: schema,
        opstamp: opstamp,
    };
-    let mut w = try!(serde_json::to_vec(&metas));
+    let mut w = try!(serde_json::to_vec_pretty(&metas));
    try!(write!(&mut w, "\n"));
    let res = directory.atomic_write(&META_FILEPATH, &w[..])?;
    debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
--- a/src/indexer/segment_writer.rs
+++ b/src/indexer/segment_writer.rs
@@ -76,9 +76,6 @@ impl<'a> SegmentWriter<'a> {
 	/// Finalize consumes the `SegmentWriter`, so that it cannot 
 	/// be used afterwards.
 	pub fn finalize(self) -> Result<Vec<u64>> {
-		// for per_field_postings_writer in &mut self.per_field_postings_writers {
-		// 	per_field_postings_writer.close(self.heap);
-		// }
 		write(&self.multifield_postings,
 			  &self.fast_field_writers,
 			  &self.fieldnorms_writer,
@@ -149,7 +146,6 @@ impl<'a> SegmentWriter<'a> {
 						for field_value in field_values {
 							let term = Term::from_field_i64(field_value.field(), field_value.value().i64_value());
 							self.multifield_postings.suscribe(doc_id, &term);
-							// field_posting_writer.suscribe(term_index, doc_id, 0, &term, self.heap);
 						}
 					}
 				}
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -126,7 +126,6 @@ pub use schema::{Term, Document};
 pub use core::SegmentReader;
 pub use self::common::TimerTree;

-
 pub use postings::DocSet;
 pub use postings::Postings;
 pub use postings::SegmentPostingsOption;
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -17,16 +17,15 @@ mod docset;
 mod segment_postings_option;

 pub use self::docset::{SkipResult, DocSet};
-pub use self::recorder::{Recorder, NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
+use self::recorder::{Recorder, NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
 pub use self::serializer::PostingsSerializer;
-pub use self::postings_writer::PostingsWriter;
-pub use self::postings_writer::SpecializedPostingsWriter;
-pub use self::postings_writer::MultiFieldPostingsWriter;
+pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
 pub use self::term_info::TermInfo;
 pub use self::postings::Postings;

 #[cfg(test)]
 pub use self::vec_postings::VecPostings;
+
 pub use self::segment_postings::SegmentPostings;
 pub use self::intersection::IntersectionDocSet;
 pub use self::freq_handler::FreqHandler;
@@ -61,8 +60,8 @@ mod tests {
        let index = Index::create_in_ram(schema);
        let mut segment = index.new_segment();
        let mut posting_serializer = PostingsSerializer::open(&mut segment).unwrap();
-        let term = Term::from_field_text(text_field, "abc");
-        posting_serializer.new_term(&term).unwrap();
+        posting_serializer.new_field(text_field);
+        posting_serializer.new_term("abc".as_bytes()).unwrap();
        for doc_id in 0u32..3u32 {
            let positions = vec!(1,2,3,2);
            posting_serializer.write_doc(doc_id, 2, &positions).unwrap();
--- a/src/postings/postings_writer.rs
+++ b/src/postings/postings_writer.rs
@@ -114,6 +114,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
            let (_, stop) = offsets[i+1];
            let postings_writer = &self.per_field_postings_writers[field.0 as usize];
            postings_writer.serialize(
+                field,
                &term_offsets[start..stop],
                serializer,
                self.heap)?;
@@ -144,7 +145,7 @@ pub trait PostingsWriter {
    
    /// Serializes the postings on disk.
    /// The actual serialization format is handled by the `PostingsSerializer`.
-    fn serialize(&self, term_addrs: &[(&[u8], u32)], serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
+    fn serialize(&self, field: Field, term_addrs: &[(&[u8], u32)], serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
    
    /// Tokenize a text and suscribe all of its token.
    fn index_text<'a>(&mut self,
@@ -156,7 +157,8 @@ pub trait PostingsWriter {
                      -> u32 {
        let mut pos = 0u32;
        let mut num_tokens: u32 = 0u32;
-        let mut term = Term::allocate(field, 100);
+        let mut term = unsafe { Term::with_capacity(100) };
+        term.set_field(field);
        for field_value in field_values {
            let mut tokens = SimpleTokenizer.tokenize(field_value.value().text());
            // right now num_tokens and pos are redundant, but it should
@@ -226,17 +228,19 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
    }

    fn serialize(&self,
+        field: Field,
        term_addrs: &[(&[u8], u32)],
        serializer: &mut PostingsSerializer, 
        heap: &Heap) -> io::Result<()> {
-        let mut term = Term::allocate(Field(0), 100);
+        
+        serializer.new_field(field);
        for &(term_bytes, addr) in term_addrs {
            let recorder: &mut Rec = self.heap.get_mut_ref(addr);
-            term.set_content(term_bytes);
-            try!(serializer.new_term(&term));
+            try!(serializer.new_term(&term_bytes));
            try!(recorder.serialize(addr, serializer, heap));
            try!(serializer.close_term());
        }
+        
        Ok(())
    }
 }
--- a/src/postings/serializer.rs
+++ b/src/postings/serializer.rs
@@ -1,7 +1,6 @@
 use Result;
 use datastruct::FstMapBuilder;
 use super::TermInfo;
-use schema::Term;
 use schema::Field;
 use schema::FieldEntry;
 use schema::FieldType;
@@ -30,7 +29,7 @@ use common::BinarySerializable;
 ///
 /// The serializer expects to receive the following calls
 /// in this order :
-///
+/// * `new_field(...)`
 /// * `new_term(...)`
 /// * `write_doc(...)`
 /// * `write_doc(...)`
@@ -41,6 +40,8 @@ use common::BinarySerializable;
 /// * `write_doc(...)`
 /// * ...
 /// * `close_term()`
+/// * `new_field(...)`
+/// * ...
 /// * `close()`
 ///
 /// Terms have to be pushed in a lexicographically-sorted order.
@@ -105,7 +106,11 @@ impl PostingsSerializer {
                                segment.schema())
    }

-    fn load_indexing_options(&mut self, field: Field) {
+    /// Must be called before pushing the terms of
+    /// a given field.
+    ///
+    /// Loads the indexing options for the given field.
+    pub fn new_field(&mut self, field: Field) {
        let field_entry: &FieldEntry = self.schema.get_field_entry(field);
        self.text_indexing_options = match *field_entry.field_type() {
            FieldType::Str(ref text_options) => text_options.get_indexing_options(),
@@ -130,13 +135,11 @@ impl PostingsSerializer {
    /// * term - the term. It needs to come after the previous term according
    ///   to the lexicographical order.
    /// * doc_freq - return the number of document containing the term.
-    pub fn new_term(&mut self, term: &Term) -> io::Result<()> {
+    pub fn new_term(&mut self, term: &[u8]) -> io::Result<()> {
        if self.term_open {
            panic!("Called new_term, while the previous term was not closed.");
        }
        self.term_open = true;
-        // TODO avoid load indexing options all the time.
-        self.load_indexing_options(term.field());
        self.doc_ids.clear();
        self.last_doc_id_encoded = 0;
        self.term_freqs.clear();
@@ -146,7 +149,7 @@ impl PostingsSerializer {
            postings_offset: self.written_bytes_postings as u32,
            positions_offset: self.written_bytes_positions as u32,
        };
-        self.terms_fst_builder.insert_key(term.as_slice())
+        self.terms_fst_builder.insert_key(term)
    }

    /// Finish the serialization for this term postings.
--- a/src/schema/mod.rs
+++ b/src/schema/mod.rs
@@ -109,7 +109,7 @@ mod field;
 mod value;
 mod named_field_document;

-pub use self::term::extract_field_from_term_bytes;
+pub(crate) use self::term::extract_field_from_term_bytes;
 pub use self::named_field_document::NamedFieldDocument;
 pub use self::schema::{Schema, SchemaBuilder};
 pub use self::value::Value;
--- a/src/schema/term.rs
+++ b/src/schema/term.rs
@@ -1,11 +1,14 @@
 use std::fmt;

 use common;
-use byteorder::{BigEndian, WriteBytesExt, ByteOrder};
+use byteorder::{BigEndian, ByteOrder};
 use super::Field;
 use std::str;


+/// Size (in bytes) of the buffer of a int field.
+const INT_TERM_LEN: usize = 4 + 8;
+
 /// Term represents the value that the token can take.
 ///
 /// It actually wraps a `Vec<u8>`.
@@ -14,18 +17,11 @@ pub struct Term(Vec<u8>);

 /// Extract `field` from Term.
 #[doc(hidden)]
-pub fn extract_field_from_term_bytes(term_bytes: &[u8]) -> Field {
+pub(crate) fn extract_field_from_term_bytes(term_bytes: &[u8]) -> Field {
    Field(BigEndian::read_u32(&term_bytes[..4]))
 }

 impl Term {
-    
-    /// Pre-allocate a term buffer. 
-    pub fn allocate(field: Field, num_bytes: usize) -> Term {
-        let mut term = Term(Vec::with_capacity(num_bytes));
-        term.0.write_u32::<BigEndian>(field.0).expect("serializing u32 to Vec<u8 should never fail>");
-        term
-    }

    /// Set the content of the term.
    pub fn set_content(&mut self, content: &[u8]) {
@@ -39,6 +35,14 @@ impl Term {
        extract_field_from_term_bytes(&self.0)
    }

+    /// Sets the field, growing the buffer to hold the 4-byte field id if needed.
+    pub fn set_field(&mut self, field: Field) {
+        if self.0.len() < 4 {
+            self.0.resize(4, 0u8);
+        }
+        BigEndian::write_u32(&mut self.0[0..4], field.0);
+    }
+
    /// Builds a term given a field, and a u64-value
    ///
    /// Assuming the term has a field id of 1, and a u64 value of 3234,
@@ -47,13 +51,21 @@ impl Term {
    /// The first four byte are dedicated to storing the field id as a u64.
    /// The 4 following bytes are encoding the u64 value.
    pub fn from_field_u64(field: Field, val: u64) -> Term {
-        const U64_TERM_LEN: usize = 4 + 8;
-        let mut buffer = vec![0u8; U64_TERM_LEN];
-        // we want BigEndian here to have lexicographic order
-        // match the natural order of `(field, val)`
-        BigEndian::write_u32(&mut buffer[0..4], field.0);
-        BigEndian::write_u64(&mut buffer[4..], val);
-        Term(buffer)
+        let mut term = Term(vec![0u8; INT_TERM_LEN]);
+        term.set_field(field);
+        term.set_u64(val);
+        term
+    }
+
+    /// Sets a u64 value in the term.
+    /// 
+    /// The u64 value is serialized using its 8-byte BigEndian
+    /// representation.
+    /// The use of BigEndian has the benefit of preserving
+    /// the natural order of the values.    
+    pub fn set_u64(&mut self, val: u64) {
+        self.0.resize(INT_TERM_LEN, 0u8);
+        BigEndian::write_u64(&mut self.0[4..], val);
    }
    
    /// Builds a term given a field, and a u64-value
@@ -75,10 +87,21 @@ impl Term {
    /// The first byte is 2, and the three following bytes are the utf-8 
    /// representation of "abc".
    pub fn from_field_text(field: Field, text: &str) -> Term {
-        let mut buffer = vec![0u8; 4 + text.len()];
-        BigEndian::write_u32(&mut buffer[0..4], field.0);
-        buffer[4..].clone_from_slice(text.as_bytes());
-        Term(buffer)
+        let buffer = Vec::with_capacity(4 + text.len());
+        let mut term = Term(buffer);
+        term.set_field(field);
+        term.set_text(text);
+        term
+    }
+
+    /// Creates a new Term with an empty buffer, 
+    /// but with a given capacity.
+    ///
+    /// It is declared unsafe, as the term content
+    /// is not initialized, and a call to `.field()`
+    /// would panic.
+    pub(crate) unsafe fn with_capacity(num_bytes: usize) -> Term {
+        Term(Vec::with_capacity(num_bytes))
    }

    /// Assume the term is a u64 field.
@@ -112,8 +135,8 @@ impl Term {
    /// If the value is not valid utf-8. This may happen
    /// if the index is corrupted or if you try to 
    /// call this method on a non-string type.
-    pub unsafe fn text(&self) -> &str {
-        str::from_utf8_unchecked(self.value())
+    pub fn text(&self) -> &str {
+        str::from_utf8(self.value()).expect("Term does not contain valid utf-8.")
    }

    /// Set the texts only, keeping the field untouched.