Bugfix in merger

2026-05-27 05:30:45 +00:00 · 2016-08-13 20:14:00 +09:00
parent a3ab3940c3
commit f66a48af42
8 changed files with 61 additions and 32 deletions
--- a/src/cli/commands/merge.rs
+++ b/src/cli/commands/merge.rs
@@ -0,0 +1,18 @@
+extern crate tantivy;
+
+use tantivy::Index;
+use std::path::PathBuf;
+use clap::ArgMatches;
+/// Runs the `merge` subcommand on the directory named by the `index` argument; panics if it is absent.
+pub fn run_merge_cli(argmatch: &ArgMatches) -> Result<(), String> {
+    let index_directory = PathBuf::from(argmatch.value_of("index").unwrap());
+    run_merge(index_directory).map_err(|e| format!("Merge failed : {:?}", e))
+}
+
+/// Opens the index at `path` and hands every segment it reports to the writer's `merge`.
+fn run_merge(path: PathBuf) -> tantivy::Result<()> {
+    let index = try!(Index::open(&path));
+    let all_segments = index.segments();
+    let mut writer = try!(index.writer());
+    writer.merge(&all_segments)
+}
--- a/src/cli/commands/mod.rs
+++ b/src/cli/commands/mod.rs
@@ -1,25 +1,11 @@
 mod index;
 mod serve;
 mod new;
-mod merge;
 mod bench;
+mod merge;

 pub use self::new::run_new_cli;
 pub use self::index::run_index_cli;
 pub use self::serve::run_serve_cli;
 pub use self::bench::run_bench_cli;
-
-// pub mod writer;
-// pub mod searcher;
-// pub mod index;
-// pub mod merger;
-
-// mod segment_serializer;
-// mod segment_writer;
-// mod segment_reader;
-// mod segment_id;
-// mod segment_component;
-
-// pub use self::segment_component::SegmentComponent;
-// pub use self::segment_id::SegmentId;
-// pub use self::segment_reader::SegmentReader;
+pub use self::merge::run_merge_cli;
--- a/src/cli/main.rs
+++ b/src/cli/main.rs
@@ -85,6 +85,11 @@ fn main() {
                    .help("Number of time to repeat the benchmark.")
                    .default_value("1"))
        )
+        .subcommand(
+            SubCommand::with_name("merge")
+                .about("Merge all the segments of an index")
+                .arg(index_arg.clone())
+        )
        .get_matches();
    
    let (subcommand, some_options) = cli_options.subcommand();
@@ -95,6 +100,7 @@ fn main() {
        "new" => run_new_cli(options).unwrap(),
        "index" => run_index_cli(options).unwrap(),
        "serve" => run_serve_cli(options).unwrap(),
+        "merge" => run_merge_cli(options).unwrap(),
        "bench" => {
            let res = run_bench_cli(options);
            match res {
--- a/src/common/serialize.rs
+++ b/src/common/serialize.rs
@@ -123,8 +123,14 @@ mod test {

    fn serialize_test<T: BinarySerializable + Eq>(v: T, num_bytes: usize) {
        let mut buffer: Vec<u8> = Vec::new();
-        assert_eq!(v.serialize(&mut buffer).unwrap(), num_bytes);
-        assert_eq!(buffer.len(), num_bytes);
+        
+        if num_bytes != 0 {
+            assert_eq!(v.serialize(&mut buffer).unwrap(), num_bytes);
+            assert_eq!(buffer.len(), num_bytes);
+        }
+        else {
+            v.serialize(&mut buffer).unwrap();
+        }
        let mut cursor = Cursor::new(&buffer[..]);
        let deser = T::deserialize(&mut cursor).unwrap();
        assert_eq!(deser, v);
@@ -159,9 +165,13 @@ mod test {

    #[test]
    fn test_serialize_vint() {
+        for i in 0..10_000 {
+            serialize_test(VInt(i as u64), 0);
+        }
        serialize_test(VInt(7u64), 1);
        serialize_test(VInt(127u64), 1);
        serialize_test(VInt(128u64), 2);
+        serialize_test(VInt(129u64), 2);
        serialize_test(VInt(1234u64), 2);
        serialize_test(VInt(16_383), 2);
        serialize_test(VInt(16_384), 3);
--- a/src/core/merger.rs
+++ b/src/core/merger.rs
@@ -18,8 +18,8 @@ use postings::ChainedPostings;
 use postings::HasLen;
 use postings::OffsetPostings;
 use core::index::SegmentInfo;
-use compression::NUM_DOCS_PER_BLOCK;
 use std::cmp::{min, max, Ordering};
+use std::iter;


 struct PostingsMerger<'a> {
@@ -130,17 +130,20 @@ pub struct IndexMerger {


 struct DeltaPositionComputer {
-    buffer: [u32; NUM_DOCS_PER_BLOCK]
+    buffer: Vec<u32>
 }

 impl DeltaPositionComputer {
    fn new() -> DeltaPositionComputer {
        DeltaPositionComputer {
-            buffer: [0u32; NUM_DOCS_PER_BLOCK]
+            buffer: iter::repeat(0u32).take(512).collect::<Vec<u32>>(),
        }
    }
    
    fn compute_delta_positions(&mut self, positions: &[u32],) -> &[u32] {
+        if positions.len() > self.buffer.len() {
+            self.buffer.resize(positions.len(), 0u32);
+        }
        let mut last_pos = 0u32;
        let num_positions = positions.len();
        for i in 0..num_positions {
--- a/src/core/segment_reader.rs
+++ b/src/core/segment_reader.rs
@@ -165,9 +165,13 @@ impl SegmentReader {
                            let offseted_position_data = &self.positions_data[term_info.positions_offset as usize ..];
                            FreqHandler::new_with_freq_and_position(offseted_position_data)
                        }
-                        else {
+                        else if indexing_options.is_termfreq_enabled() 
+                        {
                            FreqHandler::new_with_freq()
                        }
+                        else {
+                            FreqHandler::new()
+                        }
                    }
                }
            }
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -55,7 +55,6 @@ pub mod directory;
 pub mod collector;
 pub mod schema;

-
 pub use directory::Directory;
 pub use core::searcher::Searcher;
 pub use core::index::Index;
@@ -64,6 +63,11 @@ pub use schema::Document;
 pub use core::SegmentReader;
 pub use self::common::TimerTree;

+pub use postings::DocSet;
+pub use postings::Postings;
+pub use postings::SegmentPostingsOption;
+
+
 /// u32 identifying a document within a segment.
 /// Document gets their doc id assigned incrementally,
 /// as they are added in the segment.
@@ -110,8 +114,6 @@ mod tests {
    use super::*;
    use collector::TestCollector;
    use query::MultiTermQuery;
-    use postings::Postings;
-    use postings::DocSet;

    #[test]
    fn test_indexing() {
--- a/src/postings/serializer.rs
+++ b/src/postings/serializer.rs
@@ -100,16 +100,16 @@ impl PostingsSerializer {
                    }
                    self.term_freqs.clear();
                }
-                if self.text_indexing_options.is_position_enabled() {
-                    self.written_bytes_positions += try!(VInt(self.position_deltas.len() as u64).serialize(&mut self.positions_write));
-                    let positions_encoded: &[u8] = self.positions_encoder.compress_unsorted(&self.position_deltas[..]);
-                    try!(self.positions_write.write_all(positions_encoded));
-                    self.written_bytes_positions += positions_encoded.len();
-                    self.position_deltas.clear();
-                }
            }
            self.doc_ids.clear();
        }
+        if self.text_indexing_options.is_position_enabled() {
+            self.written_bytes_positions += try!(VInt(self.position_deltas.len() as u64).serialize(&mut self.positions_write));
+            let positions_encoded: &[u8] = self.positions_encoder.compress_unsorted(&self.position_deltas[..]);
+            try!(self.positions_write.write_all(positions_encoded));
+            self.written_bytes_positions += positions_encoded.len();
+            self.position_deltas.clear();
+        }
        Ok(())
    }