From f66a48af42e5c8601ca2e4c938fa9f34f88c3395 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 13 Aug 2016 20:14:00 +0900 Subject: [PATCH] Bugfix in merger --- src/cli/commands/merge.rs | 18 ++++++++++++++++++ src/cli/commands/mod.rs | 18 ++---------------- src/cli/main.rs | 6 ++++++ src/common/serialize.rs | 14 ++++++++++++-- src/core/merger.rs | 9 ++++++--- src/core/segment_reader.rs | 6 +++++- src/lib.rs | 8 +++++--- src/postings/serializer.rs | 14 +++++++------- 8 files changed, 61 insertions(+), 32 deletions(-) diff --git a/src/cli/commands/merge.rs b/src/cli/commands/merge.rs index e69de29bb..db61e4acf 100644 --- a/src/cli/commands/merge.rs +++ b/src/cli/commands/merge.rs @@ -0,0 +1,18 @@ +extern crate tantivy; + +use tantivy::Index; +use std::path::PathBuf; +use clap::ArgMatches; + +pub fn run_merge_cli(argmatch: &ArgMatches) -> Result<(), String> { + let index_directory = PathBuf::from(argmatch.value_of("index").unwrap()); + run_merge(index_directory).map_err(|e| format!("Indexing failed : {:?}", e)) +} + + +fn run_merge(path: PathBuf) -> tantivy::Result<()> { + let index = try!(Index::open(&path)); + let segments = index.segments(); + let mut index_writer = try!(index.writer()); + index_writer.merge(&segments) +} diff --git a/src/cli/commands/mod.rs b/src/cli/commands/mod.rs index 916e410ee..fc300536a 100644 --- a/src/cli/commands/mod.rs +++ b/src/cli/commands/mod.rs @@ -1,25 +1,11 @@ mod index; mod serve; mod new; -mod merge; mod bench; +mod merge; pub use self::new::run_new_cli; pub use self::index::run_index_cli; pub use self::serve::run_serve_cli; pub use self::bench::run_bench_cli; - -// pub mod writer; -// pub mod searcher; -// pub mod index; -// pub mod merger; - -// mod segment_serializer; -// mod segment_writer; -// mod segment_reader; -// mod segment_id; -// mod segment_component; - -// pub use self::segment_component::SegmentComponent; -// pub use self::segment_id::SegmentId; -// pub use self::segment_reader::SegmentReader; \ No newline at end of file +pub use self::merge::run_merge_cli; diff --git a/src/cli/main.rs b/src/cli/main.rs index 1ea253384..6d02e795a 100644 --- a/src/cli/main.rs +++ b/src/cli/main.rs @@ -85,6 +85,11 @@ fn main() { .help("Number of time to repeat the benchmark.") .default_value("1")) ) + .subcommand( + SubCommand::with_name("merge") + .about("Merge all the segments of an index") + .arg(index_arg.clone()) + ) .get_matches(); let (subcommand, some_options) = cli_options.subcommand(); @@ -95,6 +100,7 @@ fn main() { "new" => run_new_cli(options).unwrap(), "index" => run_index_cli(options).unwrap(), "serve" => run_serve_cli(options).unwrap(), + "merge" => run_merge_cli(options).unwrap(), "bench" => { let res = run_bench_cli(options); match res { diff --git a/src/common/serialize.rs b/src/common/serialize.rs index fb37bd552..be0b59437 100644 --- a/src/common/serialize.rs +++ b/src/common/serialize.rs @@ -123,8 +123,14 @@ mod test { fn serialize_test(v: T, num_bytes: usize) { let mut buffer: Vec = Vec::new(); - assert_eq!(v.serialize(&mut buffer).unwrap(), num_bytes); - assert_eq!(buffer.len(), num_bytes); + + if num_bytes != 0 { + assert_eq!(v.serialize(&mut buffer).unwrap(), num_bytes); + assert_eq!(buffer.len(), num_bytes); + } + else { + v.serialize(&mut buffer).unwrap(); + } let mut cursor = Cursor::new(&buffer[..]); let deser = T::deserialize(&mut cursor).unwrap(); assert_eq!(deser, v); @@ -159,9 +165,13 @@ mod test { #[test] fn test_serialize_vint() { + for i in 0..10_000 { + serialize_test(VInt(i as u64), 0); + } serialize_test(VInt(7u64), 1); serialize_test(VInt(127u64), 1); serialize_test(VInt(128u64), 2); + serialize_test(VInt(129u64), 2); serialize_test(VInt(1234u64), 2); serialize_test(VInt(16_383), 2); serialize_test(VInt(16_384), 3); diff --git a/src/core/merger.rs b/src/core/merger.rs index 69bdc3243..34800b7d9 100644 --- a/src/core/merger.rs +++ b/src/core/merger.rs @@ -18,8 +18,8 @@ use postings::ChainedPostings; use postings::HasLen; use postings::OffsetPostings; use core::index::SegmentInfo; -use compression::NUM_DOCS_PER_BLOCK; use std::cmp::{min, max, Ordering}; +use std::iter; struct PostingsMerger<'a> { @@ -130,17 +130,20 @@ pub struct IndexMerger { struct DeltaPositionComputer { - buffer: [u32; NUM_DOCS_PER_BLOCK] + buffer: Vec } impl DeltaPositionComputer { fn new() -> DeltaPositionComputer { DeltaPositionComputer { - buffer: [0u32; NUM_DOCS_PER_BLOCK] + buffer: iter::repeat(0u32).take(512).collect::>(), } } fn compute_delta_positions(&mut self, positions: &[u32],) -> &[u32] { + if positions.len() > self.buffer.len() { + self.buffer.resize(positions.len(), 0u32); + } let mut last_pos = 0u32; let num_positions = positions.len(); for i in 0..num_positions { diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 68494f3ea..771172f01 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -165,9 +165,13 @@ impl SegmentReader { let offseted_position_data = &self.positions_data[term_info.positions_offset as usize ..]; FreqHandler::new_with_freq_and_position(offseted_position_data) } - else { + else if indexing_options.is_termfreq_enabled() + { FreqHandler::new_with_freq() } + else { + FreqHandler::new() + } } } } diff --git a/src/lib.rs b/src/lib.rs index a212ac9e1..4258b2466 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -55,7 +55,6 @@ pub mod directory; pub mod collector; pub mod schema; - pub use directory::Directory; pub use core::searcher::Searcher; pub use core::index::Index; @@ -64,6 +63,11 @@ pub use schema::Document; pub use core::SegmentReader; pub use self::common::TimerTree; +pub use postings::DocSet; +pub use postings::Postings; +pub use postings::SegmentPostingsOption; + + /// u32 identifying a document within a segment. /// Document gets their doc id assigned incrementally, /// as they are added in the segment. @@ -110,8 +114,6 @@ mod tests { use super::*; use collector::TestCollector; use query::MultiTermQuery; - use postings::Postings; - use postings::DocSet; #[test] fn test_indexing() { diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index a23bafe55..4538867a1 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -100,16 +100,16 @@ impl PostingsSerializer { } self.term_freqs.clear(); } - if self.text_indexing_options.is_position_enabled() { - self.written_bytes_positions += try!(VInt(self.position_deltas.len() as u64).serialize(&mut self.positions_write)); - let positions_encoded: &[u8] = self.positions_encoder.compress_unsorted(&self.position_deltas[..]); - try!(self.positions_write.write_all(positions_encoded)); - self.written_bytes_positions += positions_encoded.len(); - self.position_deltas.clear(); - } } self.doc_ids.clear(); } + if self.text_indexing_options.is_position_enabled() { + self.written_bytes_positions += try!(VInt(self.position_deltas.len() as u64).serialize(&mut self.positions_write)); + let positions_encoded: &[u8] = self.positions_encoder.compress_unsorted(&self.position_deltas[..]); + try!(self.positions_write.write_all(positions_encoded)); + self.written_bytes_positions += positions_encoded.len(); + self.position_deltas.clear(); + } Ok(()) }