mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-27 05:30:45 +00:00
Bugfix in merger
This commit is contained in:
@@ -0,0 +1,18 @@
|
||||
extern crate tantivy;
|
||||
|
||||
use tantivy::Index;
|
||||
use std::path::PathBuf;
|
||||
use clap::ArgMatches;
|
||||
|
||||
pub fn run_merge_cli(argmatch: &ArgMatches) -> Result<(), String> {
|
||||
let index_directory = PathBuf::from(argmatch.value_of("index").unwrap());
|
||||
run_merge(index_directory).map_err(|e| format!("Indexing failed : {:?}", e))
|
||||
}
|
||||
|
||||
|
||||
fn run_merge(path: PathBuf) -> tantivy::Result<()> {
|
||||
let index = try!(Index::open(&path));
|
||||
let segments = index.segments();
|
||||
let mut index_writer = try!(index.writer());
|
||||
index_writer.merge(&segments)
|
||||
}
|
||||
|
||||
@@ -1,25 +1,11 @@
|
||||
mod index;
|
||||
mod serve;
|
||||
mod new;
|
||||
mod merge;
|
||||
mod bench;
|
||||
mod merge;
|
||||
|
||||
pub use self::new::run_new_cli;
|
||||
pub use self::index::run_index_cli;
|
||||
pub use self::serve::run_serve_cli;
|
||||
pub use self::bench::run_bench_cli;
|
||||
|
||||
// pub mod writer;
|
||||
// pub mod searcher;
|
||||
// pub mod index;
|
||||
// pub mod merger;
|
||||
|
||||
// mod segment_serializer;
|
||||
// mod segment_writer;
|
||||
// mod segment_reader;
|
||||
// mod segment_id;
|
||||
// mod segment_component;
|
||||
|
||||
// pub use self::segment_component::SegmentComponent;
|
||||
// pub use self::segment_id::SegmentId;
|
||||
// pub use self::segment_reader::SegmentReader;
|
||||
pub use self::merge::run_merge_cli;
|
||||
|
||||
@@ -85,6 +85,11 @@ fn main() {
|
||||
.help("Number of time to repeat the benchmark.")
|
||||
.default_value("1"))
|
||||
)
|
||||
.subcommand(
|
||||
SubCommand::with_name("merge")
|
||||
.about("Merge all the segments of an index")
|
||||
.arg(index_arg.clone())
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let (subcommand, some_options) = cli_options.subcommand();
|
||||
@@ -95,6 +100,7 @@ fn main() {
|
||||
"new" => run_new_cli(options).unwrap(),
|
||||
"index" => run_index_cli(options).unwrap(),
|
||||
"serve" => run_serve_cli(options).unwrap(),
|
||||
"merge" => run_merge_cli(options).unwrap(),
|
||||
"bench" => {
|
||||
let res = run_bench_cli(options);
|
||||
match res {
|
||||
|
||||
@@ -123,8 +123,14 @@ mod test {
|
||||
|
||||
fn serialize_test<T: BinarySerializable + Eq>(v: T, num_bytes: usize) {
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
assert_eq!(v.serialize(&mut buffer).unwrap(), num_bytes);
|
||||
assert_eq!(buffer.len(), num_bytes);
|
||||
|
||||
if num_bytes != 0 {
|
||||
assert_eq!(v.serialize(&mut buffer).unwrap(), num_bytes);
|
||||
assert_eq!(buffer.len(), num_bytes);
|
||||
}
|
||||
else {
|
||||
v.serialize(&mut buffer).unwrap();
|
||||
}
|
||||
let mut cursor = Cursor::new(&buffer[..]);
|
||||
let deser = T::deserialize(&mut cursor).unwrap();
|
||||
assert_eq!(deser, v);
|
||||
@@ -159,9 +165,13 @@ mod test {
|
||||
|
||||
#[test]
|
||||
fn test_serialize_vint() {
|
||||
for i in 0..10_000 {
|
||||
serialize_test(VInt(i as u64), 0);
|
||||
}
|
||||
serialize_test(VInt(7u64), 1);
|
||||
serialize_test(VInt(127u64), 1);
|
||||
serialize_test(VInt(128u64), 2);
|
||||
serialize_test(VInt(129u64), 2);
|
||||
serialize_test(VInt(1234u64), 2);
|
||||
serialize_test(VInt(16_383), 2);
|
||||
serialize_test(VInt(16_384), 3);
|
||||
|
||||
@@ -18,8 +18,8 @@ use postings::ChainedPostings;
|
||||
use postings::HasLen;
|
||||
use postings::OffsetPostings;
|
||||
use core::index::SegmentInfo;
|
||||
use compression::NUM_DOCS_PER_BLOCK;
|
||||
use std::cmp::{min, max, Ordering};
|
||||
use std::iter;
|
||||
|
||||
|
||||
struct PostingsMerger<'a> {
|
||||
@@ -130,17 +130,20 @@ pub struct IndexMerger {
|
||||
|
||||
|
||||
struct DeltaPositionComputer {
|
||||
buffer: [u32; NUM_DOCS_PER_BLOCK]
|
||||
buffer: Vec<u32>
|
||||
}
|
||||
|
||||
impl DeltaPositionComputer {
|
||||
fn new() -> DeltaPositionComputer {
|
||||
DeltaPositionComputer {
|
||||
buffer: [0u32; NUM_DOCS_PER_BLOCK]
|
||||
buffer: iter::repeat(0u32).take(512).collect::<Vec<u32>>(),
|
||||
}
|
||||
}
|
||||
|
||||
fn compute_delta_positions(&mut self, positions: &[u32],) -> &[u32] {
|
||||
if positions.len() > self.buffer.len() {
|
||||
self.buffer.resize(positions.len(), 0u32);
|
||||
}
|
||||
let mut last_pos = 0u32;
|
||||
let num_positions = positions.len();
|
||||
for i in 0..num_positions {
|
||||
|
||||
@@ -165,9 +165,13 @@ impl SegmentReader {
|
||||
let offseted_position_data = &self.positions_data[term_info.positions_offset as usize ..];
|
||||
FreqHandler::new_with_freq_and_position(offseted_position_data)
|
||||
}
|
||||
else {
|
||||
else if indexing_options.is_termfreq_enabled()
|
||||
{
|
||||
FreqHandler::new_with_freq()
|
||||
}
|
||||
else {
|
||||
FreqHandler::new()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -55,7 +55,6 @@ pub mod directory;
|
||||
pub mod collector;
|
||||
pub mod schema;
|
||||
|
||||
|
||||
pub use directory::Directory;
|
||||
pub use core::searcher::Searcher;
|
||||
pub use core::index::Index;
|
||||
@@ -64,6 +63,11 @@ pub use schema::Document;
|
||||
pub use core::SegmentReader;
|
||||
pub use self::common::TimerTree;
|
||||
|
||||
pub use postings::DocSet;
|
||||
pub use postings::Postings;
|
||||
pub use postings::SegmentPostingsOption;
|
||||
|
||||
|
||||
/// u32 identifying a document within a segment.
|
||||
/// Documents get their doc id assigned incrementally,
|
||||
/// as they are added in the segment.
|
||||
@@ -110,8 +114,6 @@ mod tests {
|
||||
use super::*;
|
||||
use collector::TestCollector;
|
||||
use query::MultiTermQuery;
|
||||
use postings::Postings;
|
||||
use postings::DocSet;
|
||||
|
||||
#[test]
|
||||
fn test_indexing() {
|
||||
|
||||
@@ -100,16 +100,16 @@ impl PostingsSerializer {
|
||||
}
|
||||
self.term_freqs.clear();
|
||||
}
|
||||
if self.text_indexing_options.is_position_enabled() {
|
||||
self.written_bytes_positions += try!(VInt(self.position_deltas.len() as u64).serialize(&mut self.positions_write));
|
||||
let positions_encoded: &[u8] = self.positions_encoder.compress_unsorted(&self.position_deltas[..]);
|
||||
try!(self.positions_write.write_all(positions_encoded));
|
||||
self.written_bytes_positions += positions_encoded.len();
|
||||
self.position_deltas.clear();
|
||||
}
|
||||
}
|
||||
self.doc_ids.clear();
|
||||
}
|
||||
if self.text_indexing_options.is_position_enabled() {
|
||||
self.written_bytes_positions += try!(VInt(self.position_deltas.len() as u64).serialize(&mut self.positions_write));
|
||||
let positions_encoded: &[u8] = self.positions_encoder.compress_unsorted(&self.position_deltas[..]);
|
||||
try!(self.positions_write.write_all(positions_encoded));
|
||||
self.written_bytes_positions += positions_encoded.len();
|
||||
self.position_deltas.clear();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user