Bugfix in merger

This commit is contained in:
Paul Masurel
2016-08-13 20:14:00 +09:00
parent a3ab3940c3
commit f66a48af42
8 changed files with 61 additions and 32 deletions

View File

@@ -0,0 +1,18 @@
extern crate tantivy;
use tantivy::Index;
use std::path::PathBuf;
use clap::ArgMatches;
/// Entry point of the `merge` subcommand.
///
/// Reads the index directory from the `index` command-line argument and
/// merges the segments of the index stored there.
///
/// # Errors
///
/// Returns a human-readable message if the `index` argument is absent or
/// if the merge itself fails.
pub fn run_merge_cli(argmatch: &ArgMatches) -> Result<(), String> {
    // Report a missing argument through the `Result` this function already
    // returns rather than panicking on `unwrap()`.
    let index_directory = match argmatch.value_of("index") {
        Some(path) => PathBuf::from(path),
        None => return Err(String::from("Missing required argument : index")),
    };
    // The message names the operation that actually failed (a merge, not an indexing run).
    run_merge(index_directory).map_err(|e| format!("Merge failed : {:?}", e))
}
/// Opens the index located at `path` and asks an index writer to merge
/// the segments reported by `Index::segments`.
///
/// # Errors
///
/// Propagates any error raised while opening the index, creating the
/// writer, or performing the merge.
fn run_merge(path: PathBuf) -> tantivy::Result<()> {
    let opened_index = try!(Index::open(&path));
    // The segment list is captured before the writer is created, as in the
    // original ordering.
    let segments_to_merge = opened_index.segments();
    let mut writer = try!(opened_index.writer());
    writer.merge(&segments_to_merge)
}

View File

@@ -1,25 +1,11 @@
mod index;
mod serve;
mod new;
mod merge;
mod bench;
mod merge;
pub use self::new::run_new_cli;
pub use self::index::run_index_cli;
pub use self::serve::run_serve_cli;
pub use self::bench::run_bench_cli;
// pub mod writer;
// pub mod searcher;
// pub mod index;
// pub mod merger;
// mod segment_serializer;
// mod segment_writer;
// mod segment_reader;
// mod segment_id;
// mod segment_component;
// pub use self::segment_component::SegmentComponent;
// pub use self::segment_id::SegmentId;
// pub use self::segment_reader::SegmentReader;
pub use self::merge::run_merge_cli;

View File

@@ -85,6 +85,11 @@ fn main() {
.help("Number of time to repeat the benchmark.")
.default_value("1"))
)
.subcommand(
SubCommand::with_name("merge")
.about("Merge all the segments of an index")
.arg(index_arg.clone())
)
.get_matches();
let (subcommand, some_options) = cli_options.subcommand();
@@ -95,6 +100,7 @@ fn main() {
"new" => run_new_cli(options).unwrap(),
"index" => run_index_cli(options).unwrap(),
"serve" => run_serve_cli(options).unwrap(),
"merge" => run_merge_cli(options).unwrap(),
"bench" => {
let res = run_bench_cli(options);
match res {

View File

@@ -123,8 +123,14 @@ mod test {
fn serialize_test<T: BinarySerializable + Eq>(v: T, num_bytes: usize) {
let mut buffer: Vec<u8> = Vec::new();
assert_eq!(v.serialize(&mut buffer).unwrap(), num_bytes);
assert_eq!(buffer.len(), num_bytes);
if num_bytes != 0 {
assert_eq!(v.serialize(&mut buffer).unwrap(), num_bytes);
assert_eq!(buffer.len(), num_bytes);
}
else {
v.serialize(&mut buffer).unwrap();
}
let mut cursor = Cursor::new(&buffer[..]);
let deser = T::deserialize(&mut cursor).unwrap();
assert_eq!(deser, v);
@@ -159,9 +165,13 @@ mod test {
#[test]
fn test_serialize_vint() {
for i in 0..10_000 {
serialize_test(VInt(i as u64), 0);
}
serialize_test(VInt(7u64), 1);
serialize_test(VInt(127u64), 1);
serialize_test(VInt(128u64), 2);
serialize_test(VInt(129u64), 2);
serialize_test(VInt(1234u64), 2);
serialize_test(VInt(16_383), 2);
serialize_test(VInt(16_384), 3);

View File

@@ -18,8 +18,8 @@ use postings::ChainedPostings;
use postings::HasLen;
use postings::OffsetPostings;
use core::index::SegmentInfo;
use compression::NUM_DOCS_PER_BLOCK;
use std::cmp::{min, max, Ordering};
use std::iter;
struct PostingsMerger<'a> {
@@ -130,17 +130,20 @@ pub struct IndexMerger {
struct DeltaPositionComputer {
buffer: [u32; NUM_DOCS_PER_BLOCK]
buffer: Vec<u32>
}
impl DeltaPositionComputer {
fn new() -> DeltaPositionComputer {
DeltaPositionComputer {
buffer: [0u32; NUM_DOCS_PER_BLOCK]
buffer: iter::repeat(0u32).take(512).collect::<Vec<u32>>(),
}
}
fn compute_delta_positions(&mut self, positions: &[u32],) -> &[u32] {
if positions.len() > self.buffer.len() {
self.buffer.resize(positions.len(), 0u32);
}
let mut last_pos = 0u32;
let num_positions = positions.len();
for i in 0..num_positions {

View File

@@ -165,9 +165,13 @@ impl SegmentReader {
let offseted_position_data = &self.positions_data[term_info.positions_offset as usize ..];
FreqHandler::new_with_freq_and_position(offseted_position_data)
}
else {
else if indexing_options.is_termfreq_enabled()
{
FreqHandler::new_with_freq()
}
else {
FreqHandler::new()
}
}
}
}

View File

@@ -55,7 +55,6 @@ pub mod directory;
pub mod collector;
pub mod schema;
pub use directory::Directory;
pub use core::searcher::Searcher;
pub use core::index::Index;
@@ -64,6 +63,11 @@ pub use schema::Document;
pub use core::SegmentReader;
pub use self::common::TimerTree;
pub use postings::DocSet;
pub use postings::Postings;
pub use postings::SegmentPostingsOption;
/// u32 identifying a document within a segment.
/// Documents get their doc ids assigned incrementally,
/// as they are added to the segment.
@@ -110,8 +114,6 @@ mod tests {
use super::*;
use collector::TestCollector;
use query::MultiTermQuery;
use postings::Postings;
use postings::DocSet;
#[test]
fn test_indexing() {

View File

@@ -100,16 +100,16 @@ impl PostingsSerializer {
}
self.term_freqs.clear();
}
if self.text_indexing_options.is_position_enabled() {
self.written_bytes_positions += try!(VInt(self.position_deltas.len() as u64).serialize(&mut self.positions_write));
let positions_encoded: &[u8] = self.positions_encoder.compress_unsorted(&self.position_deltas[..]);
try!(self.positions_write.write_all(positions_encoded));
self.written_bytes_positions += positions_encoded.len();
self.position_deltas.clear();
}
}
self.doc_ids.clear();
}
if self.text_indexing_options.is_position_enabled() {
self.written_bytes_positions += try!(VInt(self.position_deltas.len() as u64).serialize(&mut self.positions_write));
let positions_encoded: &[u8] = self.positions_encoder.compress_unsorted(&self.position_deltas[..]);
try!(self.positions_write.write_all(positions_encoded));
self.written_bytes_positions += positions_encoded.len();
self.position_deltas.clear();
}
Ok(())
}