Integrated with the merge branch

Paul Masurel
2017-09-09 15:27:19 +09:00
108 changed files with 4572 additions and 3023 deletions

4
.gitignore vendored
View File

@@ -6,4 +6,6 @@ Cargo.lock
benchmark
.DS_Store
cpp/simdcomp/bitpackingbenchmark
*.bk
.idea
trace.dat

View File

@@ -22,9 +22,7 @@ before_script:
- |
pip install 'travis-cargo<0.2' --user &&
export PATH=$HOME/.local/bin:$PATH
- (cargo install rustfmt || true)
script:
- cargo fmt -- --write-mode=diff
- |
travis-cargo build &&
travis-cargo test &&

View File

@@ -1,14 +1,47 @@
Tantivy 0.4.3
==========================
- Bugfix race condition when deleting files. (#198)
Tantivy 0.4.2
==========================
- Prevent usage of AVX2 instructions (#201)
Tantivy 0.4.1
==========================
- Bugfix for non-indexed fields. (#199)
Tantivy 0.4.0
==========================
- Raise the limit of number of fields (previously 256 fields)
- Removed u32 fields. They are replaced by u64 and i64 fields (#65)
- Raise the limit of number of fields (previously 256 fields) (@fulmicoton)
- Removed u32 fields. They are replaced by u64 and i64 fields (#65) (@fulmicoton)
- Optimized skip in SegmentPostings (#130) (@lnicola)
- Replacing rustc_serialize by serde. Kudos to @KodrAus and @lnicola
- QueryParser:
- Using error-chain (@KodrAus)
- QueryParser: (@fulmicoton)
- Explicit error returned when searching for a term that is not indexed
- Searching for an int term via the query parser was broken `(age:1)`
- Searching for a non-indexed field returns an explicit Error
- Phrase queries for non-tokenized fields are not tokenized by the query parser.
- Faster/Better indexing (@fulmicoton)
- using murmurhash2
- faster merging
- more memory-efficient fast field writer (@lnicola)
- better handling of collisions
- lower memory usage
- Added API, most notably to iterate over ranges of terms (@fulmicoton)
- Bugfix: segment files were not unmapped on index drop (@fulmicoton)
- Made the doc! macro public (@fulmicoton)
- Added an alternative implementation of the streaming dictionary (@fulmicoton)
Tantivy 0.3.1
==========================
@@ -16,6 +49,7 @@ Tantivy 0.3.1
- Expose a method to trigger files garbage collection
Tantivy 0.3
==========================
@@ -37,6 +71,7 @@ You should not expect backward compatibility before
tantivy 1.0.
New Features
------------

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.4.0-alpha"
version = "0.5.0-dev"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
build = "build.rs"
license = "MIT"
@@ -27,7 +27,7 @@ tempdir = "0.3"
serde = "1.0"
serde_derive = "1.0"
serde_json = "1.0"
bincode = "0.7.0-alpha7"
bincode = "0.8"
libc = {version = "0.2.20", optional=true}
num_cpus = "1.2"
itertools = "0.5.9"
@@ -37,7 +37,7 @@ time = "0.1"
uuid = { version = "0.5", features = ["v4", "serde"] }
chan = "0.1"
version = "2"
crossbeam = "0.2"
crossbeam = "0.3"
futures = "0.1"
futures-cpupool = "0.1"
error-chain = "0.8"

View File

@@ -19,10 +19,10 @@ It is strongly inspired by Lucene's design.
- Basic query language
- Phrase queries
- Incremental indexing
- Multithreaded indexing (indexing English Wikipedia takes 4 minutes on my desktop)
- Multithreaded indexing (indexing English Wikipedia takes < 3 minutes on my desktop)
- mmap based
- optional SIMD integer compression
- u32 fast fields (equivalent of doc values in Lucene)
- u64 and i64 fast fields (equivalent of doc values in Lucene)
- LZ4 compressed document store
- Cheesy logo with a horse
@@ -38,12 +38,10 @@ It will walk you through getting a wikipedia search engine up and running in a f
- [For the last released version](https://docs.rs/tantivy/)
- [For the last master branch](https://tantivy-search.github.io/tantivy/tantivy/index.html)
# Compiling
Tantivy requires Rust Nightly because it uses the features [`box_syntax`](https://doc.rust-lang.org/stable/book/box-syntax-and-patterns.html), [`optin_builtin_traits`](https://github.com/rust-lang/rfcs/blob/master/text/0019-opt-in-builtin-traits.md), and [`conservative_impl_trait`](https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md).
By default, `tantivy` uses a git submodule called `simdcomp`.
After cloning the repository, you will need to initialize and update
the submodules. The project can then be built using `cargo`.
The project can then be built using `cargo`.
git clone git@github.com:tantivy-search/tantivy.git
cd tantivy
@@ -54,9 +52,9 @@ Alternatively, if you are trying to compile `tantivy` without simd compression,
you can disable this functionality. In this case, this submodule is not required
and you can compile tantivy by using the `--no-default-features` flag.
cargo build --no-default-features
# Contribute
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.

View File

@@ -91,9 +91,11 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
let mut old_man_doc = Document::default();
old_man_doc.add_text(title, "The Old Man and the Sea");
old_man_doc.add_text(body,
"He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish.");
old_man_doc.add_text(
body,
"He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish.",
);
// ... and add it to the `IndexWriter`.
index_writer.add_document(old_man_doc);

View File

@@ -1,5 +1,3 @@
extern crate regex;
mod analyzer;
mod simple_tokenizer;
mod lower_caser;
@@ -9,6 +7,7 @@ mod analyzer_manager;
mod japanese_tokenizer;
mod token_stream_chain;
pub use self::analyzer::{box_analyzer, Analyzer, Token, TokenFilterFactory, TokenStream};
pub use self::analyzer::BoxedAnalyzer;
pub use self::analyzer_manager::AnalyzerManager;

View File

@@ -38,10 +38,11 @@ impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
}
impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Right> {
fn set_segment(&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader)
-> Result<()> {
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
try!(self.left.set_segment(segment_local_id, segment));
try!(self.right.set_segment(segment_local_id, segment));
Ok(())

View File

@@ -45,11 +45,11 @@ mod tests {
#[bench]
fn build_collector(b: &mut Bencher) {
b.iter(|| {
let mut count_collector = CountCollector::default();
for doc in 0..1_000_000 {
count_collector.collect(doc, 1f32);
}
count_collector.count()
});
let mut count_collector = CountCollector::default();
for doc in 0..1_000_000 {
count_collector.collect(doc, 1f32);
}
count_collector.count()
});
}
}

View File

@@ -15,8 +15,9 @@ use SegmentLocalId;
/// Facet collector for i64/u64 fast field
pub struct FacetCollector<T>
where T: FastFieldReader,
T::ValueType: Eq + Hash
where
T: FastFieldReader,
T::ValueType: Eq + Hash,
{
counters: HashMap<T::ValueType, u64>,
field: Field,
@@ -25,8 +26,9 @@ pub struct FacetCollector<T>
impl<T> FacetCollector<T>
where T: FastFieldReader,
T::ValueType: Eq + Hash
where
T: FastFieldReader,
T::ValueType: Eq + Hash,
{
/// Creates a new facet collector for aggregating a given field.
pub fn new(field: Field) -> FacetCollector<T> {
@@ -40,8 +42,9 @@ impl<T> FacetCollector<T>
impl<T> Collector for FacetCollector<T>
where T: FastFieldReader,
T::ValueType: Eq + Hash
where
T: FastFieldReader,
T::ValueType: Eq + Hash,
{
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.ff_reader = Some(reader.get_fast_field_reader(self.field)?);
@@ -51,7 +54,9 @@ impl<T> Collector for FacetCollector<T>
fn collect(&mut self, doc: DocId, _: Score) {
let val = self.ff_reader
.as_ref()
.expect("collect() was called before set_segment. This should never happen.")
.expect(
"collect() was called before set_segment. This should never happen.",
)
.get(doc);
*(self.counters.entry(val).or_insert(0)) += 1;
}
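
The `entry(val).or_insert(0)` line above is the standard `HashMap` counting idiom: insert a zero the first time a value is seen, then bump. A minimal, self-contained sketch of the same aggregation (the fast-field values below are made up):

```rust
use std::collections::HashMap;

fn main() {
    // Hypothetical fast-field values, one per collected document.
    let vals = [3u64, 7, 3, 3, 7];
    let mut counters: HashMap<u64, u64> = HashMap::new();
    for &val in vals.iter() {
        // Insert 0 for a value seen for the first time, then increment.
        *counters.entry(val).or_insert(0) += 1;
    }
    assert_eq!(counters[&3], 3);
    assert_eq!(counters[&7], 2);
}
```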

View File

@@ -1,3 +1,6 @@
/*!
Collector module
*/
use SegmentReader;
use SegmentLocalId;
use DocId;
@@ -51,20 +54,22 @@ pub use self::chained_collector::chain;
pub trait Collector {
/// `set_segment` is called before beginning to enumerate
/// on this segment.
fn set_segment(&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader)
-> Result<()>;
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()>;
/// The query pushes the scored document to the collector via this method.
fn collect(&mut self, doc: DocId, score: Score);
}
impl<'a, C: Collector> Collector for &'a mut C {
fn set_segment(&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader)
-> Result<()> {
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
(*self).set_segment(segment_local_id, segment)
}
/// The query pushes the scored document to the collector via this method.
@@ -169,12 +174,12 @@ pub mod tests {
#[bench]
fn build_collector(b: &mut Bencher) {
b.iter(|| {
let mut count_collector = CountCollector::default();
let docs: Vec<u32> = (0..1_000_000).collect();
for doc in docs {
count_collector.collect(doc, 1f32);
}
count_collector.count()
});
let mut count_collector = CountCollector::default();
let docs: Vec<u32> = (0..1_000_000).collect();
for doc in docs {
count_collector.collect(doc, 1f32);
}
count_collector.count()
});
}
}
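
The trait above is the whole collection protocol: `set_segment` is called once per segment before enumeration starts, then `collect` once per matching document. A simplified, self-contained rendition of a counting collector, with the segment reader and error types stripped down (this mirrors the shape of the trait above but is not tantivy's actual API):

```rust
type DocId = u32;
type Score = f32;

trait Collector {
    fn set_segment(&mut self, segment_local_id: u32) -> Result<(), String>;
    fn collect(&mut self, doc: DocId, score: Score);
}

#[derive(Default)]
struct CountCollector {
    count: usize,
}

impl Collector for CountCollector {
    fn set_segment(&mut self, _segment_local_id: u32) -> Result<(), String> {
        Ok(()) // nothing to prepare for a plain count
    }
    fn collect(&mut self, _doc: DocId, _score: Score) {
        self.count += 1;
    }
}

fn main() -> Result<(), String> {
    let mut collector = CountCollector::default();
    collector.set_segment(0)?;
    for doc in 0..1_000u32 {
        collector.collect(doc, 1f32);
    }
    assert_eq!(collector.count, 1_000);
    Ok(())
}
```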

View File

@@ -23,10 +23,11 @@ impl<'a> MultiCollector<'a> {
impl<'a> Collector for MultiCollector<'a> {
fn set_segment(&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader)
-> Result<()> {
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
for collector in &mut self.collectors {
try!(collector.set_segment(segment_local_id, segment));
}
@@ -53,8 +54,8 @@ mod tests {
let mut top_collector = TopCollector::with_limit(2);
let mut count_collector = CountCollector::default();
{
let mut collectors = MultiCollector::from(vec![&mut top_collector,
&mut count_collector]);
let mut collectors =
MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
collectors.collect(1, 0.2);
collectors.collect(2, 0.1);
collectors.collect(3, 0.5);

View File

@@ -24,10 +24,9 @@ impl PartialOrd for GlobalScoredDoc {
impl Ord for GlobalScoredDoc {
#[inline]
fn cmp(&self, other: &GlobalScoredDoc) -> Ordering {
other
.score
.partial_cmp(&self.score)
.unwrap_or_else(|| other.doc_address.cmp(&self.doc_address))
other.score.partial_cmp(&self.score).unwrap_or_else(|| {
other.doc_address.cmp(&self.doc_address)
})
}
}
@@ -87,7 +86,9 @@ impl TopCollector {
scored_docs.sort();
scored_docs
.into_iter()
.map(|GlobalScoredDoc { score, doc_address }| (score, doc_address))
.map(|GlobalScoredDoc { score, doc_address }| {
(score, doc_address)
})
.collect()
}
@@ -108,14 +109,13 @@ impl Collector for TopCollector {
fn collect(&mut self, doc: DocId, score: Score) {
if self.at_capacity() {
// It's ok to unwrap as long as a limit of 0 is forbidden.
let limit_doc: GlobalScoredDoc =
*self.heap
.peek()
.expect("Top collector with size 0 is forbidden");
let limit_doc: GlobalScoredDoc = *self.heap.peek().expect(
"Top collector with size 0 is forbidden",
);
if limit_doc.score < score {
let mut mut_head = self.heap
.peek_mut()
.expect("Top collector with size 0 is forbidden");
let mut mut_head = self.heap.peek_mut().expect(
"Top collector with size 0 is forbidden",
);
mut_head.score = score;
mut_head.doc_address = DocAddress(self.segment_id, doc);
}
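
The reversed comparison in `Ord` above is what turns Rust's max-heap `BinaryHeap` into a min-heap over scores: the root is always the weakest of the current top-K hits, so a better hit can replace it in place through `peek_mut` in O(log K). A self-contained sketch of the technique (the f32 score with doc-id tie-break mirrors the code above; `top_k` is a hypothetical helper):

```rust
use std::cmp::Ordering;
use std::collections::BinaryHeap;

#[derive(PartialEq)]
struct ScoredDoc {
    score: f32,
    doc: u32,
}

impl Eq for ScoredDoc {}

impl PartialOrd for ScoredDoc {
    fn partial_cmp(&self, other: &ScoredDoc) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for ScoredDoc {
    // Reversed on score, so the heap root is the *lowest* scoring hit.
    fn cmp(&self, other: &ScoredDoc) -> Ordering {
        other.score.partial_cmp(&self.score).unwrap_or_else(|| {
            other.doc.cmp(&self.doc)
        })
    }
}

fn top_k(hits: Vec<(f32, u32)>, k: usize) -> Vec<(f32, u32)> {
    let mut heap: BinaryHeap<ScoredDoc> = BinaryHeap::with_capacity(k);
    for (score, doc) in hits {
        if heap.len() < k {
            heap.push(ScoredDoc { score, doc });
        } else if heap.peek().map(|w| w.score < score).unwrap_or(false) {
            // Overwrite the weakest hit in place; the sift-down happens
            // when the `PeekMut` guard is dropped.
            let mut weakest = heap.peek_mut().unwrap();
            weakest.score = score;
            weakest.doc = doc;
        }
    }
    let mut result: Vec<(f32, u32)> = heap.into_iter().map(|s| (s.score, s.doc)).collect();
    result.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap());
    result
}

fn main() {
    let hits = vec![(0.2f32, 1u32), (0.1, 2), (0.5, 3), (0.9, 4)];
    assert_eq!(top_k(hits, 2), vec![(0.9, 4), (0.5, 3)]);
}
```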

View File

@@ -15,7 +15,7 @@ use std::ops::Deref;
/// reasons, we want to ensure that a value spans at most 8
/// aligned bytes.
///
/// Spawning over 9 bytes is possible for instance, if we do
/// Spanning over 9 bytes is possible for instance, if we do
/// bitpacking with an amplitude of 63 bits.
/// In this case, the second int will start on bit
/// 63 (which belongs to byte 7) and ends at byte 15;
@@ -67,7 +67,7 @@ impl BitPacker {
Ok(())
}
fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
pub(crate) fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
if self.mini_buffer_written > 0 {
let num_bytes = (self.mini_buffer_written + 7) / 8;
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer) };
@@ -88,7 +88,8 @@ impl BitPacker {
pub struct BitUnpacker<Data>
where Data: Deref<Target = [u8]>
where
Data: Deref<Target = [u8]>,
{
num_bits: usize,
mask: u64,
@@ -96,7 +97,8 @@ pub struct BitUnpacker<Data>
}
impl<Data> BitUnpacker<Data>
where Data: Deref<Target = [u8]>
where
Data: Deref<Target = [u8]>,
{
pub fn new(data: Data, num_bits: usize) -> BitUnpacker<Data> {
let mask: u64 = if num_bits == 64 {
@@ -121,11 +123,53 @@ impl<Data> BitUnpacker<Data>
let addr_in_bits = idx * num_bits;
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
debug_assert!(addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes.");
let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
(val_shifted & mask)
if cfg!(feature = "simdcompression") {
// for simdcompression,
// the bitpacker is only used for fastfields,
// and we expect them to be always padded.
debug_assert!(
addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes."
);
let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
(val_shifted & mask)
} else {
let val_unshifted_unmasked: u64;
if addr + 8 <= data.len() {
val_unshifted_unmasked = unsafe { *(data[addr..].as_ptr() as *const u64) };
} else {
let mut buffer = [0u8; 8];
for i in addr..data.len() {
buffer[i - addr] += data[i];
}
val_unshifted_unmasked = unsafe { *(buffer[..].as_ptr() as *const u64) };
}
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
(val_shifted & mask)
}
}
pub fn get_range(&self, start: u32, output: &mut [u64]) {
if self.num_bits == 0 {
for val in output.iter_mut() {
*val = 0;
}
} else {
let data: &[u8] = &*self.data;
let num_bits = self.num_bits;
let mask = self.mask;
let mut addr_in_bits = (start as usize) * num_bits;
for output_val in output.iter_mut() {
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
*output_val = val_shifted & mask;
addr_in_bits += num_bits;
}
}
}
}
@@ -148,7 +192,7 @@ mod test {
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
}
fn test_bitpacker_util(len: usize, num_bits: usize) {
fn create_fastfield_bitpacker(len: usize, num_bits: usize) -> (BitUnpacker<Vec<u8>>, Vec<u64>) {
let mut data = Vec::new();
let mut bitpacker = BitPacker::new(num_bits);
let max_val: u64 = (1 << num_bits) - 1;
@@ -161,6 +205,11 @@ mod test {
bitpacker.close(&mut data).unwrap();
assert_eq!(data.len(), (num_bits * len + 7) / 8 + 7);
let bitunpacker = BitUnpacker::new(data, num_bits);
(bitunpacker, vals)
}
fn test_bitpacker_util(len: usize, num_bits: usize) {
let (bitunpacker, vals) = create_fastfield_bitpacker(len, num_bits);
for (i, val) in vals.iter().enumerate() {
assert_eq!(bitunpacker.get(i), *val);
}
@@ -174,4 +223,17 @@ mod test {
test_bitpacker_util(6, 14);
test_bitpacker_util(1000, 14);
}
#[test]
fn test_bitpacker_range() {
let (bitunpacker, vals) = create_fastfield_bitpacker(100_000, 12);
let buffer_len = 100;
let mut buffer = vec![0u64; buffer_len];
for start in vec![0, 10, 20, 100, 1_000] {
bitunpacker.get_range(start as u32, &mut buffer[..]);
for i in 0..buffer_len {
assert_eq!(buffer[i], vals[start + i]);
}
}
}
}
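
All of the bitpacking arithmetic is in `get` above: value `i` starts at bit `i * num_bits`, which lands in byte `addr = bits >> 3` with an intra-byte shift of `bits & 7`; an 8-byte window read from `addr`, shifted and masked, recovers the value, and the 7 trailing padding bytes guarantee the window never runs past the buffer. A safe sketch of the same layout using `from_le_bytes` in place of the unaligned pointer read (it assumes `num_bits <= 56`, so a value plus its shift always fits the window):

```rust
use std::convert::TryInto;

/// Packs `vals` at `num_bits` bits each, padding the buffer with 7 bytes
/// so that any 8-byte window read stays in bounds. Assumes num_bits <= 56.
fn pack(vals: &[u64], num_bits: usize) -> Vec<u8> {
    let mut bytes = vec![0u8; (vals.len() * num_bits + 7) / 8 + 7];
    for (i, &val) in vals.iter().enumerate() {
        let addr_in_bits = i * num_bits;
        let addr = addr_in_bits >> 3; // byte offset
        let bit_shift = addr_in_bits & 7; // bit offset inside that byte
        let mut window = u64::from_le_bytes(bytes[addr..addr + 8].try_into().unwrap());
        window |= val << bit_shift;
        bytes[addr..addr + 8].copy_from_slice(&window.to_le_bytes());
    }
    bytes
}

fn unpack(bytes: &[u8], num_bits: usize, idx: usize) -> u64 {
    let mask = (1u64 << num_bits) - 1;
    let addr_in_bits = idx * num_bits;
    let addr = addr_in_bits >> 3;
    let bit_shift = addr_in_bits & 7;
    // Safe equivalent of the `*const u64` read: the padding makes
    // `addr + 8` always valid.
    let window = u64::from_le_bytes(bytes[addr..addr + 8].try_into().unwrap());
    (window >> bit_shift) & mask
}

fn main() {
    let vals: Vec<u64> = (0..100u64).map(|i| (i * 3) % 8_192).collect();
    let bytes = pack(&vals, 13);
    for (i, &val) in vals.iter().enumerate() {
        assert_eq!(unpack(&bytes, 13, i), val);
    }
}
```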

View File

@@ -0,0 +1,191 @@
use std::io::Write;
use common::CountingWriter;
use std::collections::HashMap;
use schema::Field;
use common::VInt;
use directory::WritePtr;
use std::io;
use directory::ReadOnlySource;
use common::BinarySerializable;
/// A `CompositeWrite` is used to write a `CompositeFile`.
pub struct CompositeWrite<W = WritePtr> {
write: CountingWriter<W>,
offsets: HashMap<Field, usize>,
}
impl<W: Write> CompositeWrite<W> {
/// Creates a new writer that writes a composite file
/// into the given write.
pub fn wrap(w: W) -> CompositeWrite<W> {
CompositeWrite {
write: CountingWriter::wrap(w),
offsets: HashMap::new(),
}
}
/// Start writing a new field.
pub fn for_field(&mut self, field: Field) -> &mut CountingWriter<W> {
let offset = self.write.written_bytes();
assert!(!self.offsets.contains_key(&field));
self.offsets.insert(field, offset);
&mut self.write
}
/// Close the composite file.
///
/// An index of the different field offsets
/// will be written as a footer.
pub fn close(mut self) -> io::Result<()> {
let footer_offset = self.write.written_bytes();
VInt(self.offsets.len() as u64).serialize(&mut self.write)?;
let mut offset_fields: Vec<_> = self.offsets
.iter()
.map(|(field, offset)| (offset, field))
.collect();
offset_fields.sort();
let mut prev_offset = 0;
for (offset, field) in offset_fields {
VInt((offset - prev_offset) as u64).serialize(
&mut self.write,
)?;
field.serialize(&mut self.write)?;
prev_offset = *offset;
}
let footer_len = (self.write.written_bytes() - footer_offset) as u32;
footer_len.serialize(&mut self.write)?;
self.write.flush()?;
Ok(())
}
}
/// A composite file is an abstraction to store a
/// file partitioned by field.
///
/// The file needs to be written field by field.
/// A footer describes the start and stop offsets
/// for each field.
#[derive(Clone)]
pub struct CompositeFile {
data: ReadOnlySource,
offsets_index: HashMap<Field, (usize, usize)>,
}
impl CompositeFile {
/// Opens a composite file stored in a given
/// `ReadOnlySource`.
pub fn open(data: ReadOnlySource) -> io::Result<CompositeFile> {
let end = data.len();
let footer_len_data = data.slice_from(end - 4);
let footer_len = u32::deserialize(&mut footer_len_data.as_slice())? as usize;
let footer_start = end - 4 - footer_len;
let footer_data = data.slice(footer_start, footer_start + footer_len);
let mut footer_buffer = footer_data.as_slice();
let num_fields = VInt::deserialize(&mut footer_buffer)?.0 as usize;
let mut fields = vec![];
let mut offsets = vec![];
let mut field_index = HashMap::new();
let mut offset = 0;
for _ in 0..num_fields {
offset += VInt::deserialize(&mut footer_buffer)?.0 as usize;
let field = Field::deserialize(&mut footer_buffer)?;
offsets.push(offset);
fields.push(field);
}
offsets.push(footer_start);
for i in 0..num_fields {
let field = fields[i];
let start_offset = offsets[i];
let end_offset = offsets[i + 1];
field_index.insert(field, (start_offset, end_offset));
}
Ok(CompositeFile {
data: data.slice_to(footer_start),
offsets_index: field_index,
})
}
/// Returns a composite file that stores
/// no fields.
pub fn empty() -> CompositeFile {
CompositeFile {
offsets_index: HashMap::new(),
data: ReadOnlySource::empty(),
}
}
/// Returns the `ReadOnlySource` associated
/// to a given `Field` and stored in a `CompositeFile`.
pub fn open_read(&self, field: Field) -> Option<ReadOnlySource> {
self.offsets_index.get(&field).map(|&(from, to)| {
self.data.slice(from, to)
})
}
}
#[cfg(test)]
mod test {
use std::io::Write;
use super::{CompositeWrite, CompositeFile};
use directory::{RAMDirectory, Directory};
use schema::Field;
use common::VInt;
use common::BinarySerializable;
use std::path::Path;
#[test]
fn test_composite_file() {
let path = Path::new("test_path");
let mut directory = RAMDirectory::create();
{
let w = directory.open_write(path).unwrap();
let mut composite_write = CompositeWrite::wrap(w);
{
let mut write_0 = composite_write.for_field(Field(0u32));
VInt(32431123u64).serialize(&mut write_0).unwrap();
write_0.flush().unwrap();
}
{
let mut write_4 = composite_write.for_field(Field(4u32));
VInt(2).serialize(&mut write_4).unwrap();
write_4.flush().unwrap();
}
composite_write.close().unwrap();
}
{
let r = directory.open_read(path).unwrap();
let composite_file = CompositeFile::open(r).unwrap();
{
let file0 = composite_file.open_read(Field(0u32)).unwrap();
let mut file0_buf = file0.as_slice();
let payload_0 = VInt::deserialize(&mut file0_buf).unwrap().0;
assert_eq!(file0_buf.len(), 0);
assert_eq!(payload_0, 32431123u64);
}
{
let file4 = composite_file.open_read(Field(4u32)).unwrap();
let mut file4_buf = file4.as_slice();
let payload_4 = VInt::deserialize(&mut file4_buf).unwrap().0;
assert_eq!(file4_buf.len(), 0);
assert_eq!(payload_4, 2u64);
}
}
}
}
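
`close` above sorts the (offset, field) pairs and writes each offset as a delta from the previous one, so the VInts in the footer stay small; `open` reverses this with a running sum. A sketch of just that delta round-trip (plain `u64` vectors stand in for the VInt and `Field` serialization):

```rust
fn encode_offsets(offsets: &[u64]) -> Vec<u64> {
    // Store each offset as a delta from the previous one: small numbers
    // take fewer bytes as VInts.
    let mut prev = 0;
    offsets
        .iter()
        .map(|&off| {
            let delta = off - prev;
            prev = off;
            delta
        })
        .collect()
}

fn decode_offsets(deltas: &[u64]) -> Vec<u64> {
    // Running sum restores the absolute offsets.
    let mut acc = 0;
    deltas
        .iter()
        .map(|&delta| {
            acc += delta;
            acc
        })
        .collect()
}

fn main() {
    let offsets = vec![0u64, 118, 4_096];
    let deltas = encode_offsets(&offsets);
    assert_eq!(deltas, vec![0, 118, 3_978]);
    assert_eq!(decode_offsets(&deltas), offsets);
}
```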

View File

@@ -2,7 +2,7 @@ use std::io::Write;
use std::io;
pub struct CountingWriter<W: Write> {
pub struct CountingWriter<W> {
underlying: W,
written_bytes: usize,
}
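
`CountingWriter` just forwards writes while tallying how many bytes went through, which is what lets the serializers above compute offsets and footer lengths without seeking. A minimal sketch of such a wrapper:

```rust
use std::io::{self, Write};

struct CountingWriter<W> {
    underlying: W,
    written_bytes: usize,
}

impl<W: Write> CountingWriter<W> {
    fn wrap(underlying: W) -> CountingWriter<W> {
        CountingWriter { underlying, written_bytes: 0 }
    }
    fn written_bytes(&self) -> usize {
        self.written_bytes
    }
}

impl<W: Write> Write for CountingWriter<W> {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        // Only count what the inner writer actually accepted.
        let written = self.underlying.write(buf)?;
        self.written_bytes += written;
        Ok(written)
    }
    fn flush(&mut self) -> io::Result<()> {
        self.underlying.flush()
    }
}

fn main() -> io::Result<()> {
    let mut writer = CountingWriter::wrap(Vec::new());
    writer.write_all(b"hello")?;
    assert_eq!(writer.written_bytes(), 5);
    Ok(())
}
```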

View File

@@ -1,9 +1,13 @@
mod serialize;
mod timer;
mod vint;
mod counting_writer;
mod composite_file;
pub mod bitpacker;
pub(crate) use self::composite_file::{CompositeWrite, CompositeFile};
pub use self::serialize::BinarySerializable;
pub use self::timer::Timing;
pub use self::timer::TimerTree;

View File

@@ -101,9 +101,9 @@ impl BinarySerializable for String {
fn deserialize<R: Read>(reader: &mut R) -> io::Result<String> {
let string_length = VInt::deserialize(reader)?.val() as usize;
let mut result = String::with_capacity(string_length);
reader
.take(string_length as u64)
.read_to_string(&mut result)?;
reader.take(string_length as u64).read_to_string(
&mut result,
)?;
Ok(result)
}
}

View File

@@ -24,16 +24,14 @@ impl<'a> OpenTimer<'a> {
impl<'a> Drop for OpenTimer<'a> {
fn drop(&mut self) {
self.timer_tree
.timings
.push(Timing {
name: self.name,
duration: self.start
.to(PreciseTime::now())
.num_microseconds()
.unwrap(),
depth: self.depth,
});
self.timer_tree.timings.push(Timing {
name: self.name,
duration: self.start
.to(PreciseTime::now())
.num_microseconds()
.unwrap(),
depth: self.depth,
});
}
}
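
`OpenTimer` records its duration in `Drop`, the usual RAII trick for scope timing. A stripped-down sketch of the same pattern using `std::time::Instant` in place of the `time` crate's `PreciseTime` (`ScopedTimer` is a hypothetical name):

```rust
use std::time::Instant;

struct ScopedTimer {
    name: &'static str,
    start: Instant,
}

impl ScopedTimer {
    fn open(name: &'static str) -> ScopedTimer {
        ScopedTimer { name, start: Instant::now() }
    }
}

impl Drop for ScopedTimer {
    // Runs when the timer goes out of scope, closing the measurement.
    fn drop(&mut self) {
        println!("{}: {} µs", self.name, self.start.elapsed().as_micros());
    }
}

fn main() {
    let _timer = ScopedTimer::open("indexing");
    // ... timed work goes here ...
} // `_timer` is dropped here and the elapsed time is printed
```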

View File

@@ -47,7 +47,12 @@ impl BinarySerializable for VInt {
}
shift += 7;
}
_ => return Err(io::Error::new(io::ErrorKind::InvalidData, "Reach end of buffer")),
_ => {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"Reach end of buffer",
))
}
}
}
Ok(VInt(result))
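
The loop above is the read side of variable-byte (LEB128-style) encoding: seven payload bits per byte, with the high bit flagging that more bytes follow. A self-contained round-trip sketch of that scheme (the exact bit convention of tantivy's `VInt` may differ in detail):

```rust
fn vint_encode(mut val: u64, out: &mut Vec<u8>) {
    loop {
        let byte = (val & 0x7F) as u8;
        val >>= 7;
        if val == 0 {
            out.push(byte); // last byte: continuation bit left clear
            return;
        }
        out.push(byte | 0x80); // more bytes follow
    }
}

fn vint_decode(data: &[u8]) -> Option<u64> {
    let mut result = 0u64;
    let mut shift = 0;
    for &byte in data {
        result |= ((byte & 0x7F) as u64) << shift;
        if byte & 0x80 == 0 {
            return Some(result);
        }
        shift += 7;
    }
    None // reached the end of the buffer mid-value
}

fn main() {
    for &val in [0u64, 127, 128, 32_431_123, u64::MAX].iter() {
        let mut buf = Vec::new();
        vint_encode(val, &mut buf);
        assert_eq!(vint_decode(&buf), Some(val));
    }
}
```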

View File

@@ -1,170 +0,0 @@
use super::{BlockEncoder, BlockDecoder};
use super::NUM_DOCS_PER_BLOCK;
use compression::{VIntEncoder, VIntDecoder};
pub struct CompositeEncoder {
block_encoder: BlockEncoder,
output: Vec<u8>,
}
impl CompositeEncoder {
pub fn new() -> CompositeEncoder {
CompositeEncoder {
block_encoder: BlockEncoder::new(),
output: Vec::with_capacity(500_000),
}
}
pub fn compress_sorted(&mut self, vals: &[u32]) -> &[u8] {
self.output.clear();
let num_blocks = vals.len() / NUM_DOCS_PER_BLOCK;
let mut offset = 0u32;
for i in 0..num_blocks {
let vals_slice = &vals[i * NUM_DOCS_PER_BLOCK..(i + 1) * NUM_DOCS_PER_BLOCK];
let block_compressed = self.block_encoder.compress_block_sorted(vals_slice, offset);
offset = vals_slice[NUM_DOCS_PER_BLOCK - 1];
self.output.extend_from_slice(block_compressed);
}
let vint_compressed =
self.block_encoder
.compress_vint_sorted(&vals[num_blocks * NUM_DOCS_PER_BLOCK..], offset);
self.output.extend_from_slice(vint_compressed);
&self.output
}
pub fn compress_unsorted(&mut self, vals: &[u32]) -> &[u8] {
self.output.clear();
let num_blocks = vals.len() / NUM_DOCS_PER_BLOCK;
for i in 0..num_blocks {
let vals_slice = &vals[i * NUM_DOCS_PER_BLOCK..(i + 1) * NUM_DOCS_PER_BLOCK];
let block_compressed = self.block_encoder.compress_block_unsorted(vals_slice);
self.output.extend_from_slice(block_compressed);
}
let vint_compressed = self.block_encoder
.compress_vint_unsorted(&vals[num_blocks * NUM_DOCS_PER_BLOCK..]);
self.output.extend_from_slice(vint_compressed);
&self.output
}
}
pub struct CompositeDecoder {
block_decoder: BlockDecoder,
vals: Vec<u32>,
}
impl CompositeDecoder {
pub fn new() -> CompositeDecoder {
CompositeDecoder {
block_decoder: BlockDecoder::new(),
vals: Vec::with_capacity(500_000),
}
}
pub fn uncompress_sorted(&mut self,
mut compressed_data: &[u8],
uncompressed_len: usize)
-> &[u32] {
if uncompressed_len > self.vals.capacity() {
let extra_capacity = uncompressed_len - self.vals.capacity();
self.vals.reserve(extra_capacity);
}
let mut offset = 0u32;
self.vals.clear();
let num_blocks = uncompressed_len / NUM_DOCS_PER_BLOCK;
for _ in 0..num_blocks {
compressed_data = self.block_decoder
.uncompress_block_sorted(compressed_data, offset);
offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1);
self.vals
.extend_from_slice(self.block_decoder.output_array());
}
self.block_decoder
.uncompress_vint_sorted(compressed_data,
offset,
uncompressed_len % NUM_DOCS_PER_BLOCK);
self.vals
.extend_from_slice(self.block_decoder.output_array());
&self.vals
}
pub fn uncompress_unsorted(&mut self,
mut compressed_data: &[u8],
uncompressed_len: usize)
-> &[u32] {
self.vals.clear();
let num_blocks = uncompressed_len / NUM_DOCS_PER_BLOCK;
for _ in 0..num_blocks {
compressed_data = self.block_decoder
.uncompress_block_unsorted(compressed_data);
self.vals
.extend_from_slice(self.block_decoder.output_array());
}
self.block_decoder
.uncompress_vint_unsorted(compressed_data, uncompressed_len % NUM_DOCS_PER_BLOCK);
self.vals
.extend_from_slice(self.block_decoder.output_array());
&self.vals
}
}
impl Into<Vec<u32>> for CompositeDecoder {
fn into(self) -> Vec<u32> {
self.vals
}
}
#[cfg(test)]
pub mod tests {
use test::Bencher;
use super::*;
use tests;
#[test]
fn test_composite_unsorted() {
let data = tests::generate_array(10_000, 0.1);
let mut encoder = CompositeEncoder::new();
let compressed = encoder.compress_unsorted(&data);
assert!(compressed.len() <= 19_794);
let mut decoder = CompositeDecoder::new();
let result = decoder.uncompress_unsorted(&compressed, data.len());
for i in 0..data.len() {
assert_eq!(data[i], result[i]);
}
}
#[test]
fn test_composite_sorted() {
let data = tests::generate_array(10_000, 0.1);
let mut encoder = CompositeEncoder::new();
let compressed = encoder.compress_sorted(&data);
assert!(compressed.len() <= 7_826);
let mut decoder = CompositeDecoder::new();
let result = decoder.uncompress_sorted(&compressed, data.len());
for i in 0..data.len() {
assert_eq!(data[i], result[i]);
}
}
const BENCH_NUM_INTS: usize = 99_968;
#[bench]
fn bench_compress(b: &mut Bencher) {
let mut encoder = CompositeEncoder::new();
let data = tests::generate_array(BENCH_NUM_INTS, 0.1);
b.iter(|| { encoder.compress_sorted(&data); });
}
#[bench]
fn bench_uncompress(b: &mut Bencher) {
let mut encoder = CompositeEncoder::new();
let data = tests::generate_array(BENCH_NUM_INTS, 0.1);
let compressed = encoder.compress_sorted(&data);
let mut decoder = CompositeDecoder::new();
b.iter(|| { decoder.uncompress_sorted(compressed, BENCH_NUM_INTS); });
}
}

View File

@@ -1,52 +1,88 @@
#![allow(dead_code)]
mod composite;
pub use self::composite::{CompositeEncoder, CompositeDecoder};
mod stream;
pub use self::stream::CompressedIntStream;
#[cfg(not(feature="simdcompression"))]
#[cfg(not(feature = "simdcompression"))]
mod pack {
mod compression_pack_nosimd;
pub use self::compression_pack_nosimd::*;
pub use self::compression_pack_nosimd::{BlockEncoder, BlockDecoder};
}
#[cfg(feature="simdcompression")]
#[cfg(feature = "simdcompression")]
mod pack {
mod compression_pack_simd;
pub use self::compression_pack_simd::*;
pub use self::compression_pack_simd::{BlockEncoder, BlockDecoder};
}
pub use self::pack::{BlockEncoder, BlockDecoder};
#[cfg( any(not(feature="simdcompression"), target_env="msvc") )]
#[cfg(any(not(feature = "simdcompression"), target_env = "msvc"))]
mod vint {
mod compression_vint_nosimd;
pub use self::compression_vint_nosimd::*;
pub(crate) use self::compression_vint_nosimd::*;
}
#[cfg( all(feature="simdcompression", not(target_env="msvc")) )]
#[cfg(all(feature = "simdcompression", not(target_env = "msvc")))]
mod vint {
mod compression_vint_simd;
pub use self::compression_vint_simd::*;
pub(crate) use self::compression_vint_simd::*;
}
/// Returns the size in bytes of a compressed block, given num_bits.
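/// A block holds `COMPRESSION_BLOCK_SIZE` (128) integers at `num_bits` bits
/// each, i.e. `128 * num_bits / 8 = 16 * num_bits` payload bytes, after one
/// leading byte that stores `num_bits` itself.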
pub fn compressed_block_size(num_bits: u8) -> usize {
1 + (num_bits as usize) * 16
}
pub trait VIntEncoder {
/// Compresses an array of `u32` integers,
/// using [delta-encoding](https://en.wikipedia.org/wiki/Delta_encoding)
/// and variable bytes encoding.
///
/// The method takes an array of ints to compress, and returns
/// a `&[u8]` representing the compressed data.
///
/// The method also takes an offset to give the value of the
/// hypothetical previous element in the delta-encoding.
fn compress_vint_sorted(&mut self, input: &[u32], offset: u32) -> &[u8];
/// Compresses an array of `u32` integers,
/// using variable bytes encoding.
///
/// The method takes an array of ints to compress, and returns
/// a `&[u8]` representing the compressed data.
fn compress_vint_unsorted(&mut self, input: &[u32]) -> &[u8];
}
pub trait VIntDecoder {
fn uncompress_vint_sorted<'a>(&mut self,
compressed_data: &'a [u8],
offset: u32,
num_els: usize)
-> &'a [u8];
fn uncompress_vint_unsorted<'a>(&mut self,
compressed_data: &'a [u8],
num_els: usize)
-> &'a [u8];
/// Uncompress an array of `u32` integers,
/// that were compressed using [delta-encoding](https://en.wikipedia.org/wiki/Delta_encoding)
/// and variable bytes encoding.
///
/// The method takes the number of ints to decompress, and returns
/// the number of bytes that were read to decompress them.
///
/// The method also takes an offset to give the value of the
/// hypothetical previous element in the delta-encoding.
///
/// For instance, if the encoded deltas are `1, 3, 9` and the
/// `offset` is 5, then the output will be:
/// `5 + 1 = 6, 6 + 3 = 9, 9 + 9 = 18`
fn uncompress_vint_sorted<'a>(
&mut self,
compressed_data: &'a [u8],
offset: u32,
num_els: usize,
) -> usize;
/// Uncompress an array of `u32s`, compressed using variable
/// byte encoding.
///
/// The method takes the number of ints to decompress, and returns
/// the number of bytes that were read to decompress them.
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize;
}
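
The worked example in the doc comment above (`1, 3, 9` with offset 5) is just a running prefix sum over the deltas. A tiny sketch of that decode step on its own (`delta_decode` is a hypothetical helper, separate from the block decoders):

```rust
fn delta_decode(deltas: &[u32], offset: u32) -> Vec<u32> {
    // Each output value is the previous value plus its delta.
    deltas
        .iter()
        .scan(offset, |acc, &delta| {
            *acc += delta;
            Some(*acc)
        })
        .collect()
}

fn main() {
    assert_eq!(delta_decode(&[1, 3, 9], 5), vec![6, 9, 18]);
}
```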
impl VIntEncoder for BlockEncoder {
@@ -60,26 +96,24 @@ impl VIntEncoder for BlockEncoder {
}
impl VIntDecoder for BlockDecoder {
fn uncompress_vint_sorted<'a>(&mut self,
compressed_data: &'a [u8],
offset: u32,
num_els: usize)
-> &'a [u8] {
fn uncompress_vint_sorted<'a>(
&mut self,
compressed_data: &'a [u8],
offset: u32,
num_els: usize,
) -> usize {
self.output_len = num_els;
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
}
fn uncompress_vint_unsorted<'a>(&mut self,
compressed_data: &'a [u8],
num_els: usize)
-> &'a [u8] {
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize {
self.output_len = num_els;
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
}
}
pub const NUM_DOCS_PER_BLOCK: usize = 128; //< should be a power of 2 to let the compiler optimize.
pub const COMPRESSION_BLOCK_SIZE: usize = 128;
#[cfg(test)]
pub mod tests {
@@ -95,8 +129,8 @@ pub mod tests {
let compressed_data = encoder.compress_block_sorted(&vals, 0);
let mut decoder = BlockDecoder::new();
{
let remaining_data = decoder.uncompress_block_sorted(compressed_data, 0);
assert_eq!(remaining_data.len(), 0);
let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 0);
assert_eq!(consumed_num_bytes, compressed_data.len());
}
for i in 0..128 {
assert_eq!(vals[i], decoder.output(i));
@@ -110,8 +144,8 @@ pub mod tests {
let compressed_data = encoder.compress_block_sorted(&vals, 10);
let mut decoder = BlockDecoder::new();
{
let remaining_data = decoder.uncompress_block_sorted(compressed_data, 10);
assert_eq!(remaining_data.len(), 0);
let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 10);
assert_eq!(consumed_num_bytes, compressed_data.len());
}
for i in 0..128 {
assert_eq!(vals[i], decoder.output(i));
@@ -129,9 +163,9 @@ pub mod tests {
compressed.push(173u8);
let mut decoder = BlockDecoder::new();
{
let remaining_data = decoder.uncompress_block_sorted(&compressed, 10);
assert_eq!(remaining_data.len(), 1);
assert_eq!(remaining_data[0], 173u8);
let consumed_num_bytes = decoder.uncompress_block_sorted(&compressed, 10);
assert_eq!(consumed_num_bytes, compressed.len() - 1);
assert_eq!(compressed[consumed_num_bytes], 173u8);
}
for i in 0..n {
assert_eq!(vals[i], decoder.output(i));
@@ -149,9 +183,9 @@ pub mod tests {
compressed.push(173u8);
let mut decoder = BlockDecoder::new();
{
let remaining_data = decoder.uncompress_block_unsorted(&compressed);
assert_eq!(remaining_data.len(), 1);
assert_eq!(remaining_data[0], 173u8);
let consumed_num_bytes = decoder.uncompress_block_unsorted(&compressed);
assert_eq!(consumed_num_bytes + 1, compressed.len());
assert_eq!(compressed[consumed_num_bytes], 173u8);
}
for i in 0..n {
assert_eq!(vals[i], decoder.output(i));
@@ -169,9 +203,9 @@ pub mod tests {
let encoded_data = encoder.compress_vint_sorted(&input, *offset);
assert!(encoded_data.len() <= expected_length);
let mut decoder = BlockDecoder::new();
let remaining_data =
let consumed_num_bytes =
decoder.uncompress_vint_sorted(&encoded_data, *offset, input.len());
assert_eq!(0, remaining_data.len());
assert_eq!(consumed_num_bytes, encoded_data.len());
assert_eq!(input, decoder.output_array());
}
}
@@ -181,19 +215,32 @@ pub mod tests {
#[bench]
fn bench_compress(b: &mut Bencher) {
let mut encoder = BlockEncoder::new();
let data = tests::generate_array(NUM_DOCS_PER_BLOCK, 0.1);
let data = tests::generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
b.iter(|| { encoder.compress_block_sorted(&data, 0u32); });
}
#[bench]
fn bench_uncompress(b: &mut Bencher) {
let mut encoder = BlockEncoder::new();
let data = tests::generate_array(NUM_DOCS_PER_BLOCK, 0.1);
let data = tests::generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
let compressed = encoder.compress_block_sorted(&data, 0u32);
let mut decoder = BlockDecoder::new();
b.iter(|| { decoder.uncompress_block_sorted(compressed, 0u32); });
}
#[test]
fn test_all_docs_compression_numbits() {
for num_bits in 0..33 {
let mut data = [0u32; 128];
if num_bits > 0 {
data[0] = 1 << (num_bits - 1);
}
let mut encoder = BlockEncoder::new();
let compressed = encoder.compress_block_unsorted(&data);
assert_eq!(compressed[0] as usize, num_bits);
assert_eq!(compressed.len(), compressed_block_size(compressed[0]));
}
}
const NUM_INTS_BENCH_VINT: usize = 10;
@@ -210,7 +257,9 @@ pub mod tests {
let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001);
let compressed = encoder.compress_vint_sorted(&data, 0u32);
let mut decoder = BlockDecoder::new();
b.iter(|| { decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT); });
b.iter(|| {
decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT);
});
}
}

View File

@@ -1,16 +1,17 @@
use common::bitpacker::compute_num_bits;
use common::bitpacker::{BitPacker, BitUnpacker};
use common::CountingWriter;
use std::cmp;
use std::io::Write;
use super::super::NUM_DOCS_PER_BLOCK;
use super::super::COMPRESSION_BLOCK_SIZE;
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
pub fn compress_sorted(vals: &mut [u32], mut output: &mut [u8], offset: u32) -> usize {
pub fn compress_sorted(vals: &mut [u32], output: &mut [u8], offset: u32) -> usize {
let mut max_delta = 0;
{
let mut local_offset = offset;
for i in 0..NUM_DOCS_PER_BLOCK {
for i in 0..COMPRESSION_BLOCK_SIZE {
let val = vals[i];
let delta = val - local_offset;
max_delta = cmp::max(max_delta, delta);
@@ -18,16 +19,15 @@ pub fn compress_sorted(vals: &mut [u32], mut output: &mut [u8], offset: u32) ->
local_offset = val;
}
}
let num_bits = compute_num_bits(max_delta);
output.write_all(&[num_bits]).unwrap();
let mut counting_writer = CountingWriter::wrap(output);
let num_bits = compute_num_bits(max_delta as u64);
counting_writer.write_all(&[num_bits]).unwrap();
let mut bit_packer = BitPacker::new(num_bits as usize);
for val in vals {
bit_packer.write(*val, &mut output).unwrap();
bit_packer.write(*val as u64, &mut counting_writer).unwrap();
}
1 +
bit_packer
.close(&mut output)
.expect("packing in memory should never fail")
counting_writer.written_bytes()
}
@@ -35,7 +35,7 @@ pub fn compress_sorted(vals: &mut [u32], mut output: &mut [u8], offset: u32) ->
pub struct BlockEncoder {
pub output: [u8; COMPRESSED_BLOCK_MAX_SIZE],
pub output_len: usize,
input_buffer: [u32; NUM_DOCS_PER_BLOCK],
input_buffer: [u32; COMPRESSION_BLOCK_SIZE],
}
impl BlockEncoder {
@@ -43,7 +43,7 @@ impl BlockEncoder {
BlockEncoder {
output: [0u8; COMPRESSED_BLOCK_MAX_SIZE],
output_len: 0,
input_buffer: [0u32; NUM_DOCS_PER_BLOCK],
input_buffer: [0u32; COMPRESSION_BLOCK_SIZE],
}
}
@@ -54,22 +54,30 @@ impl BlockEncoder {
}
pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] {
let compressed_size: usize = {
let mut output: &mut [u8] = &mut self.output;
let max = vals.iter()
.cloned()
.max()
.expect("compress unsorted called with an empty array");
let num_bits = compute_num_bits(max);
output.write_all(&[num_bits]).unwrap();
let compressed_size = {
let output: &mut [u8] = &mut self.output;
let max = vals.iter().cloned().max().expect(
"compress unsorted called with an empty array",
);
let num_bits = compute_num_bits(max as u64);
let mut counting_writer = CountingWriter::wrap(output);
counting_writer.write_all(&[num_bits]).unwrap();
let mut bit_packer = BitPacker::new(num_bits as usize);
for val in vals {
bit_packer.write(*val, &mut output).unwrap();
bit_packer.write(*val as u64, &mut counting_writer).unwrap();
}
1 +
bit_packer
.close(&mut output)
.expect("packing in memory should never fail")
for _ in vals.len()..COMPRESSION_BLOCK_SIZE {
bit_packer
.write(vals[0] as u64, &mut counting_writer)
.unwrap();
}
bit_packer.flush(&mut counting_writer).expect(
"Flushing the bitpacking \
in an in-RAM buffer should never fail",
);
// we avoid writing "closing", because we
// do not want 7 bytes of padding here.
counting_writer.written_bytes()
};
&self.output[..compressed_size]
}
@@ -93,34 +101,35 @@ impl BlockDecoder {
}
}
pub fn uncompress_block_sorted<'a>(&mut self,
compressed_data: &'a [u8],
mut offset: u32)
-> &'a [u8] {
pub fn uncompress_block_sorted<'a>(
&mut self,
compressed_data: &'a [u8],
mut offset: u32,
) -> usize {
let consumed_size = {
let num_bits = compressed_data[0];
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize);
for i in 0..NUM_DOCS_PER_BLOCK {
for i in 0..COMPRESSION_BLOCK_SIZE {
let delta = bit_unpacker.get(i);
let val = offset + delta;
let val = offset + delta as u32;
self.output[i] = val;
offset = val;
}
1 + (num_bits as usize * NUM_DOCS_PER_BLOCK + 7) / 8
1 + (num_bits as usize * COMPRESSION_BLOCK_SIZE + 7) / 8
};
self.output_len = NUM_DOCS_PER_BLOCK;
&compressed_data[consumed_size..]
self.output_len = COMPRESSION_BLOCK_SIZE;
consumed_size
}
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> &'a [u8] {
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
let num_bits = compressed_data[0];
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize);
for i in 0..NUM_DOCS_PER_BLOCK {
self.output[i] = bit_unpacker.get(i);
for i in 0..COMPRESSION_BLOCK_SIZE {
self.output[i] = bit_unpacker.get(i) as u32;
}
let consumed_size = 1 + (num_bits as usize * NUM_DOCS_PER_BLOCK + 7) / 8;
self.output_len = NUM_DOCS_PER_BLOCK;
&compressed_data[consumed_size..]
let consumed_size = 1 + (num_bits as usize * COMPRESSION_BLOCK_SIZE + 7) / 8;
self.output_len = COMPRESSION_BLOCK_SIZE;
consumed_size
}
#[inline]

View File

@@ -1,6 +1,6 @@
use super::super::NUM_DOCS_PER_BLOCK;
use super::super::COMPRESSION_BLOCK_SIZE;
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
mod simdcomp {
use libc::size_t;
@@ -8,10 +8,11 @@ mod simdcomp {
extern "C" {
pub fn compress_sorted(data: *const u32, output: *mut u8, offset: u32) -> size_t;
pub fn uncompress_sorted(compressed_data: *const u8,
output: *mut u32,
offset: u32)
-> size_t;
pub fn uncompress_sorted(
compressed_data: *const u8,
output: *mut u32,
offset: u32,
) -> size_t;
pub fn compress_unsorted(data: *const u32, output: *mut u8) -> size_t;
@@ -78,19 +79,16 @@ impl BlockDecoder {
}
}
pub fn uncompress_block_sorted<'a>(&mut self,
compressed_data: &'a [u8],
offset: u32)
-> &'a [u8] {
pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32) -> usize {
let consumed_size = uncompress_sorted(compressed_data, &mut self.output, offset);
self.output_len = NUM_DOCS_PER_BLOCK;
&compressed_data[consumed_size..]
self.output_len = COMPRESSION_BLOCK_SIZE;
consumed_size
}
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> &'a [u8] {
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
let consumed_size = uncompress_unsorted(compressed_data, &mut self.output);
self.output_len = NUM_DOCS_PER_BLOCK;
&compressed_data[consumed_size..]
self.output_len = COMPRESSION_BLOCK_SIZE;
consumed_size
}
#[inline]
@@ -117,4 +115,5 @@ mod tests {
let compressed = encoder.compress_block_sorted(&data, 0u32);
assert_eq!(compressed.len(), 17);
}
}

135
src/compression/stream.rs Normal file
View File

@@ -0,0 +1,135 @@
use compression::BlockDecoder;
use compression::COMPRESSION_BLOCK_SIZE;
use compression::compressed_block_size;
use directory::{ReadOnlySource, SourceRead};
/// Reads a stream of compressed ints.
///
/// Tantivy uses `CompressedIntStream` to read
/// the position file.
/// The `.skip(...)` makes it possible to avoid
/// decompressing blocks that are not required.
pub struct CompressedIntStream {
buffer: SourceRead,
block_decoder: BlockDecoder,
inner_offset: usize,
}
impl CompressedIntStream {
/// Opens a compressed int stream.
pub(crate) fn wrap(source: ReadOnlySource) -> CompressedIntStream {
CompressedIntStream {
buffer: SourceRead::from(source),
block_decoder: BlockDecoder::new(),
inner_offset: COMPRESSION_BLOCK_SIZE,
}
}
/// Fills a buffer with the next `output.len()` integers,
/// and advances the stream by that many elements.
pub fn read(&mut self, output: &mut [u32]) {
let mut num_els: usize = output.len();
let mut start: usize = 0;
loop {
let available = COMPRESSION_BLOCK_SIZE - self.inner_offset;
if num_els >= available {
if available > 0 {
let uncompressed_block = &self.block_decoder.output_array()
[self.inner_offset..];
&mut output[start..start + available].clone_from_slice(uncompressed_block);
}
num_els -= available;
start += available;
let num_consumed_bytes = self.block_decoder.uncompress_block_unsorted(
self.buffer.as_ref(),
);
self.buffer.advance(num_consumed_bytes);
self.inner_offset = 0;
} else {
let uncompressed_block = &self.block_decoder.output_array()[self.inner_offset..
self.inner_offset +
num_els];
&output[start..start + num_els].clone_from_slice(uncompressed_block);
self.inner_offset += num_els;
break;
}
}
}
/// Skips the next `skip_len` integers.
///
/// If a full block is skipped, calling
/// `.skip(...)` will avoid decompressing it.
pub fn skip(&mut self, mut skip_len: usize) {
let available = COMPRESSION_BLOCK_SIZE - self.inner_offset;
if available >= skip_len {
self.inner_offset += skip_len;
} else {
skip_len -= available;
// entirely skip decompressing some blocks.
while skip_len >= COMPRESSION_BLOCK_SIZE {
skip_len -= COMPRESSION_BLOCK_SIZE;
let num_bits: u8 = self.buffer.as_ref()[0];
let block_len = compressed_block_size(num_bits);
self.buffer.advance(block_len);
}
let num_consumed_bytes = self.block_decoder.uncompress_block_unsorted(
self.buffer.as_ref(),
);
self.buffer.advance(num_consumed_bytes);
self.inner_offset = skip_len;
}
}
}
#[cfg(test)]
pub mod tests {
use super::CompressedIntStream;
use compression::compressed_block_size;
use compression::COMPRESSION_BLOCK_SIZE;
use compression::BlockEncoder;
use directory::ReadOnlySource;
fn create_stream_buffer() -> ReadOnlySource {
let mut buffer: Vec<u8> = vec![];
let mut encoder = BlockEncoder::new();
let vals: Vec<u32> = (0u32..1_025u32).collect();
for chunk in vals.chunks(COMPRESSION_BLOCK_SIZE) {
let compressed_block = encoder.compress_block_unsorted(chunk);
let num_bits = compressed_block[0];
assert_eq!(compressed_block_size(num_bits), compressed_block.len());
buffer.extend_from_slice(compressed_block);
}
if cfg!(simd) {
buffer.extend_from_slice(&[0u8; 7]);
}
ReadOnlySource::from(buffer)
}
#[test]
fn test_compressed_int_stream() {
let buffer = create_stream_buffer();
let mut stream = CompressedIntStream::wrap(buffer);
let mut block: [u32; COMPRESSION_BLOCK_SIZE] = [0u32; COMPRESSION_BLOCK_SIZE];
stream.read(&mut block[0..2]);
assert_eq!(block[0], 0);
assert_eq!(block[1], 1);
stream.skip(5);
stream.read(&mut block[0..3]);
assert_eq!(block[0], 7);
assert_eq!(block[1], 8);
assert_eq!(block[2], 9);
stream.skip(500);
stream.read(&mut block[0..3]);
assert_eq!(block[0], 510);
assert_eq!(block[1], 511);
assert_eq!(block[2], 512);
stream.skip(511);
stream.read(&mut block[..1]);
assert_eq!(block[0], 1024);
}
}

View File

@@ -1,6 +1,10 @@
#[inline(always)]
pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32) -> &'a [u8] {
pub(crate) fn compress_sorted<'a>(
input: &[u32],
output: &'a mut [u8],
mut offset: u32,
) -> &'a [u8] {
let mut byte_written = 0;
for &v in input {
let mut to_encode: u32 = v - offset;
@@ -22,7 +26,7 @@ pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32)
}
#[inline(always)]
pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
let mut byte_written = 0;
for &v in input {
let mut to_encode: u32 = v;
@@ -43,10 +47,11 @@ pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
}
#[inline(always)]
pub fn uncompress_sorted<'a>(compressed_data: &'a [u8],
output: &mut [u32],
offset: u32)
-> &'a [u8] {
pub(crate) fn uncompress_sorted<'a>(
compressed_data: &'a [u8],
output: &mut [u32],
offset: u32,
) -> usize {
let mut read_byte = 0;
let mut result = offset;
let num_els = output.len();
@@ -63,11 +68,11 @@ pub fn uncompress_sorted<'a>(compressed_data: &'a [u8],
}
output[i] = result;
}
&compressed_data[read_byte..]
read_byte
}
#[inline(always)]
pub fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> &'a [u8] {
pub(crate) fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> usize {
let mut read_byte = 0;
let num_els = output.len();
for i in 0..num_els {
@@ -84,5 +89,5 @@ pub fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) ->
}
output[i] = result;
}
&compressed_data[read_byte..]
read_byte
}

View File

@@ -4,41 +4,47 @@ mod streamvbyte {
use libc::size_t;
extern "C" {
pub fn streamvbyte_delta_encode(data: *const u32,
num_els: u32,
output: *mut u8,
offset: u32)
-> size_t;
pub fn streamvbyte_delta_encode(
data: *const u32,
num_els: u32,
output: *mut u8,
offset: u32,
) -> size_t;
pub fn streamvbyte_delta_decode(compressed_data: *const u8,
output: *mut u32,
num_els: u32,
offset: u32)
-> size_t;
pub fn streamvbyte_delta_decode(
compressed_data: *const u8,
output: *mut u32,
num_els: u32,
offset: u32,
) -> size_t;
pub fn streamvbyte_encode(data: *const u32, num_els: u32, output: *mut u8) -> size_t;
pub fn streamvbyte_decode(compressed_data: *const u8,
output: *mut u32,
num_els: usize)
-> size_t;
pub fn streamvbyte_decode(
compressed_data: *const u8,
output: *mut u32,
num_els: usize,
) -> size_t;
}
}
#[inline(always)]
pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], offset: u32) -> &'a [u8] {
pub(crate) fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], offset: u32) -> &'a [u8] {
let compress_length = unsafe {
streamvbyte::streamvbyte_delta_encode(input.as_ptr(),
input.len() as u32,
output.as_mut_ptr(),
offset)
streamvbyte::streamvbyte_delta_encode(
input.as_ptr(),
input.len() as u32,
output.as_mut_ptr(),
offset,
)
};
&output[..compress_length]
}
#[inline(always)]
pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
let compress_length = unsafe {
streamvbyte::streamvbyte_encode(input.as_ptr(), input.len() as u32, output.as_mut_ptr())
};
@@ -46,23 +52,24 @@ pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
}
#[inline(always)]
pub fn uncompress_sorted<'a>(compressed_data: &'a [u8],
output: &mut [u32],
offset: u32)
-> &'a [u8] {
let consumed_bytes = unsafe {
streamvbyte::streamvbyte_delta_decode(compressed_data.as_ptr(),
output.as_mut_ptr(),
output.len() as u32,
offset)
};
&compressed_data[consumed_bytes..]
pub(crate) fn uncompress_sorted<'a>(
compressed_data: &'a [u8],
output: &mut [u32],
offset: u32,
) -> usize {
unsafe {
streamvbyte::streamvbyte_delta_decode(
compressed_data.as_ptr(),
output.as_mut_ptr(),
output.len() as u32,
offset,
)
}
}
#[inline(always)]
pub fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> &'a [u8] {
let consumed_bytes = unsafe {
pub(crate) fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> usize {
unsafe {
streamvbyte::streamvbyte_decode(compressed_data.as_ptr(), output.as_mut_ptr(), output.len())
};
&compressed_data[consumed_bytes..]
}
}

View File

@@ -50,9 +50,10 @@ impl Index {
pub fn create_in_ram(schema: Schema) -> Index {
let ram_directory = RAMDirectory::create();
// unwrap is ok here
let directory = ManagedDirectory::new(ram_directory)
.expect("Creating a managed directory from a brand new RAM directory \
should never fail.");
let directory = ManagedDirectory::new(ram_directory).expect(
"Creating a managed directory from a brand new RAM directory \
should never fail.",
);
Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail")
}
@@ -134,10 +135,11 @@ impl Index {
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// # Panics
/// If the heap size per thread is too small, panics.
pub fn writer_with_num_threads(&self,
num_threads: usize,
heap_size_in_bytes: usize)
-> Result<IndexWriter> {
pub fn writer_with_num_threads(
&self,
num_threads: usize,
heap_size_in_bytes: usize,
) -> Result<IndexWriter> {
open_index_writer(self, num_threads, heap_size_in_bytes)
}
@@ -162,10 +164,12 @@ impl Index {
/// Returns the list of segments that are searchable
pub fn searchable_segments(&self) -> Result<Vec<Segment>> {
Ok(self.searchable_segment_metas()?
.into_iter()
.map(|segment_meta| self.segment(segment_meta))
.collect())
Ok(
self.searchable_segment_metas()?
.into_iter()
.map(|segment_meta| self.segment(segment_meta))
.collect(),
)
}
#[doc(hidden)]
@@ -197,10 +201,12 @@ impl Index {
/// Returns the list of segment ids that are searchable.
pub fn searchable_segment_ids(&self) -> Result<Vec<SegmentId>> {
Ok(self.searchable_segment_metas()?
.iter()
.map(|segment_meta| segment_meta.id())
.collect())
Ok(
self.searchable_segment_metas()?
.iter()
.map(|segment_meta| segment_meta.id())
.collect(),
)
}
/// Creates a new generation of searchers after
@@ -210,10 +216,12 @@ impl Index {
/// published or after a merge.
pub fn load_searchers(&self) -> Result<()> {
let searchable_segments = self.searchable_segments()?;
let segment_readers: Vec<SegmentReader> = try!(searchable_segments
.into_iter()
.map(SegmentReader::open)
.collect());
let segment_readers: Vec<SegmentReader> = try!(
searchable_segments
.into_iter()
.map(SegmentReader::open)
.collect()
);
let searchers = (0..NUM_SEARCHERS)
.map(|_| Searcher::from(segment_readers.clone()))
.collect();

View File

@@ -9,7 +9,7 @@ use core::SegmentMeta;
/// * the index docstamp
/// * the schema
///
#[derive(Clone,Debug,Serialize, Deserialize)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct IndexMeta {
pub segments: Vec<SegmentMeta>,
pub schema: Schema,

View File

@@ -0,0 +1,164 @@
use directory::{SourceRead, ReadOnlySource};
use termdict::{TermDictionary, TermDictionaryImpl};
use postings::{SegmentPostings, BlockSegmentPostings};
use postings::TermInfo;
use postings::SegmentPostingsOption;
use schema::Term;
use std::cmp;
use fastfield::DeleteBitSet;
use schema::Schema;
use compression::CompressedIntStream;
/// The inverted index reader is in charge of accessing
/// the inverted index associated with a specific field.
///
/// # Note
///
/// It is safe to delete the segment associated with
/// an `InvertedIndexReader`: as long as the reader is open,
/// the `ReadOnlySource` it relies on stays available.
///
/// `InvertedIndexReader`s are created by calling
/// the `SegmentReader`'s [`.inverted_index(...)`] method.
pub struct InvertedIndexReader {
termdict: TermDictionaryImpl,
postings_source: ReadOnlySource,
positions_source: ReadOnlySource,
delete_bitset: DeleteBitSet,
schema: Schema,
}
impl InvertedIndexReader {
pub(crate) fn new(
termdict_source: ReadOnlySource,
postings_source: ReadOnlySource,
positions_source: ReadOnlySource,
delete_bitset: DeleteBitSet,
schema: Schema,
) -> InvertedIndexReader {
InvertedIndexReader {
termdict: TermDictionaryImpl::from_source(termdict_source),
postings_source: postings_source,
positions_source: positions_source,
delete_bitset: delete_bitset,
schema: schema,
}
}
/// Returns the term info associated with the term.
pub fn get_term_info(&self, term: &Term) -> Option<TermInfo> {
self.termdict.get(term.as_slice())
}
/// Returns the term dictionary data structure.
pub fn terms(&self) -> &TermDictionaryImpl {
&self.termdict
}
/// Resets the given `BlockSegmentPostings` to another position of the postings
/// file.
///
/// This is useful for enumerating through a list of terms,
/// and consuming the associated posting lists while avoiding
/// reallocating a `BlockSegmentPostings`.
///
/// # Warning
///
/// This does not reset the positions list.
pub fn reset_block_postings_from_terminfo(
&self,
term_info: &TermInfo,
block_postings: &mut BlockSegmentPostings,
) {
let offset = term_info.postings_offset as usize;
let end_source = self.postings_source.len();
let postings_slice = self.postings_source.slice(offset, end_source);
let postings_reader = SourceRead::from(postings_slice);
block_postings.reset(term_info.doc_freq as usize, postings_reader);
}
/// Returns a block postings given a `term_info`.
/// This method is for advanced usage only.
///
/// Most users should prefer using `read_postings` instead.
pub fn read_block_postings_from_terminfo(
&self,
term_info: &TermInfo,
option: SegmentPostingsOption,
) -> BlockSegmentPostings {
let offset = term_info.postings_offset as usize;
let postings_data = self.postings_source.slice_from(offset);
let has_freq = option.has_freq();
BlockSegmentPostings::from_data(
term_info.doc_freq as usize,
SourceRead::from(postings_data),
has_freq,
)
}
/// Returns a posting object given a `term_info`.
/// This method is for advanced usage only.
///
/// Most users should prefer using `read_postings` instead.
pub fn read_postings_from_terminfo(
&self,
term_info: &TermInfo,
option: SegmentPostingsOption,
) -> SegmentPostings {
let block_postings = self.read_block_postings_from_terminfo(term_info, option);
let delete_bitset = self.delete_bitset.clone();
let position_stream = {
if option.has_positions() {
let position_offset = term_info.positions_offset;
let positions_source = self.positions_source.slice_from(position_offset as usize);
let mut stream = CompressedIntStream::wrap(positions_source);
stream.skip(term_info.positions_inner_offset as usize);
Some(stream)
} else {
None
}
};
SegmentPostings::from_block_postings(block_postings, delete_bitset, position_stream)
}
/// Returns the segment postings associated with the term, and with the given option,
/// or `None` if the term has never been encountered and indexed.
///
/// If the field was not indexed with indexing options that cover
/// the requested options, the method does not fail: it returns a
/// `SegmentPostings` with as much information as possible.
///
/// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a
/// `TextIndexingOptions` that does not index positions will return a `SegmentPostings`
/// with `DocId`s and frequencies.
pub fn read_postings(
&self,
term: &Term,
option: SegmentPostingsOption,
) -> Option<SegmentPostings> {
let field = term.field();
let field_entry = self.schema.get_field_entry(field);
let term_info = get!(self.get_term_info(term));
let maximum_option = get!(field_entry.field_type().get_segment_postings_option());
let best_effort_option = cmp::min(maximum_option, option);
Some(self.read_postings_from_terminfo(
&term_info,
best_effort_option,
))
}
/// Returns the number of documents containing the term.
pub fn doc_freq(&self, term: &Term) -> u32 {
match self.get_term_info(term) {
Some(term_info) => term_info.doc_freq,
None => 0,
}
}
}
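
`read_postings` above clamps the requested option against what the field actually indexed with `cmp::min`, which works because the option enum orders its variants from least to most information. A self-contained sketch of that best-effort rule (the variant names follow this file, but the enum definition and its ordering here are assumptions):

```rust
use std::cmp;

// Ordered from least to most information, so `min` picks the weaker one.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
enum SegmentPostingsOption {
    NoFreq,
    Freq,
    FreqAndPositions,
}

fn best_effort(
    requested: SegmentPostingsOption,
    indexed: SegmentPostingsOption,
) -> SegmentPostingsOption {
    cmp::min(requested, indexed)
}

fn main() {
    use SegmentPostingsOption::*;
    // Asking for positions on a freq-only field degrades to freqs.
    assert_eq!(best_effort(FreqAndPositions, Freq), Freq);
    // Asking for less than what was indexed returns what was asked.
    assert_eq!(best_effort(Freq, FreqAndPositions), Freq);
}
```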

View File

@@ -7,7 +7,9 @@ mod segment;
mod index_meta;
mod pool;
mod segment_meta;
mod inverted_index_reader;
pub use self::inverted_index_reader::InvertedIndexReader;
pub use self::searcher::Searcher;
pub use self::segment_component::SegmentComponent;
pub use self::segment_id::SegmentId;
@@ -18,7 +20,6 @@ pub use self::index::Index;
pub use self::segment_meta::SegmentMeta;
pub use self::index_meta::IndexMeta;
use std::path::PathBuf;
lazy_static! {

View File

@@ -11,43 +11,19 @@ pub struct GenerationItem<T> {
}
// See https://github.com/crossbeam-rs/crossbeam/issues/91
struct NonLeakingMsQueue<T> {
underlying_queue: MsQueue<T>,
}
impl<T> Default for NonLeakingMsQueue<T> {
fn default() -> NonLeakingMsQueue<T> {
NonLeakingMsQueue { underlying_queue: MsQueue::new() }
}
}
impl<T> NonLeakingMsQueue<T> {
fn pop(&self) -> T {
self.underlying_queue.pop()
}
fn push(&self, el: T) {
self.underlying_queue.push(el);
}
}
impl<T> Drop for NonLeakingMsQueue<T> {
fn drop(&mut self) {
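// Drain the queue so that any items still enqueued are actually dropped:
// the `MsQueue` of this `crossbeam` version could leak nodes left in the
// queue when it is dropped (see the issue linked above).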
while let Some(_popped_item_to_be_dropped) = self.underlying_queue.try_pop() {}
}
}
pub struct Pool<T> {
queue: Arc<NonLeakingMsQueue<GenerationItem<T>>>,
queue: Arc<MsQueue<GenerationItem<T>>>,
freshest_generation: AtomicUsize,
next_generation: AtomicUsize,
}
impl<T> Pool<T> {
pub fn new() -> Pool<T> {
let queue = Arc::new(MsQueue::new());
Pool {
queue: Arc::default(),
queue: queue,
freshest_generation: AtomicUsize::default(),
next_generation: AtomicUsize::default(),
}
@@ -76,8 +52,11 @@ impl<T> Pool<T> {
if former_generation >= generation {
break;
}
self.freshest_generation
.compare_and_swap(former_generation, generation, Ordering::SeqCst);
self.freshest_generation.compare_and_swap(
former_generation,
generation,
Ordering::SeqCst,
);
}
}
@@ -91,9 +70,9 @@ impl<T> Pool<T> {
let gen_item = self.queue.pop();
if gen_item.generation >= generation {
return LeasedItem {
gen_item: Some(gen_item),
recycle_queue: self.queue.clone(),
};
gen_item: Some(gen_item),
recycle_queue: self.queue.clone(),
};
} else {
// this searcher is obsolete,
// removing it from the pool.
@@ -105,7 +84,7 @@ impl<T> Pool<T> {
pub struct LeasedItem<T> {
gen_item: Option<GenerationItem<T>>,
recycle_queue: Arc<NonLeakingMsQueue<GenerationItem<T>>>,
recycle_queue: Arc<MsQueue<GenerationItem<T>>>,
}
impl<T> Deref for LeasedItem<T> {
@@ -113,25 +92,26 @@ impl<T> Deref for LeasedItem<T> {
fn deref(&self) -> &T {
&self.gen_item
.as_ref()
.expect("Unwrapping a leased item should never fail")
.item // unwrap is safe here
.as_ref()
.expect("Unwrapping a leased item should never fail")
.item // unwrap is safe here
}
}
impl<T> DerefMut for LeasedItem<T> {
fn deref_mut(&mut self) -> &mut T {
&mut self.gen_item
.as_mut()
.expect("Unwrapping a mut leased item should never fail")
.item // unwrap is safe here
.as_mut()
.expect("Unwrapping a mut leased item should never fail")
.item // unwrap is safe here
}
}
impl<T> Drop for LeasedItem<T> {
fn drop(&mut self) {
let gen_item: GenerationItem<T> = mem::replace(&mut self.gen_item, None)
.expect("Unwrapping a leased item should never fail");
let gen_item: GenerationItem<T> = mem::replace(&mut self.gen_item, None).expect(
"Unwrapping a leased item should never fail",
);
self.recycle_queue.push(gen_item);
}
}

View File

@@ -6,10 +6,11 @@ use common::TimerTree;
use query::Query;
use DocId;
use DocAddress;
use schema::Term;
use termdict::TermMerger;
use schema::{Term, Field};
use termdict::{TermMerger, TermDictionary};
use std::sync::Arc;
use std::fmt;
use postings::TermInfo;
use core::InvertedIndexReader;
/// Holds a list of `SegmentReader`s ready for search.
@@ -21,7 +22,6 @@ pub struct Searcher {
segment_readers: Vec<SegmentReader>,
}
impl Searcher {
/// Fetches a document from tantivy's store given a `DocAddress`.
///
@@ -46,7 +46,9 @@ impl Searcher {
pub fn doc_freq(&self, term: &Term) -> u32 {
self.segment_readers
.iter()
.map(|segment_reader| segment_reader.doc_freq(term))
.map(|segment_reader| {
segment_reader.inverted_index(term.field()).doc_freq(term)
})
.fold(0u32, |acc, val| acc + val)
}
@@ -65,20 +67,41 @@ impl Searcher {
query.search(self, collector)
}
/// Returns a Stream over all of the sorted unique terms of
/// the searcher.
///
/// This includes all of the fields from all of the segment_readers.
/// See [`TermIterator`](struct.TermIterator.html).
///
/// # Warning
/// This API is very likely to change in the future.
pub fn terms(&self) -> TermMerger<TermInfo> {
TermMerger::from(self.segment_readers())
/// Returns the `FieldSearcher` associated with a `Field`.
pub fn field(&self, field: Field) -> FieldSearcher {
let inv_index_readers = self.segment_readers
.iter()
.map(|segment_reader| segment_reader.inverted_index(field))
.collect::<Vec<_>>();
FieldSearcher::new(inv_index_readers)
}
}
pub struct FieldSearcher {
inv_index_readers: Vec<Arc<InvertedIndexReader>>,
}
impl FieldSearcher {
fn new(inv_index_readers: Vec<Arc<InvertedIndexReader>>) -> FieldSearcher {
FieldSearcher { inv_index_readers: inv_index_readers }
}
/// Returns a stream over all of the sorted unique terms
/// of the given field.
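///
/// A hedged sketch (assuming a `searcher: &Searcher` and a `field: Field`,
/// and that `TermMerger` exposes `advance()`/`key()`):
///
/// ```ignore
/// let field_searcher = searcher.field(field);
/// let mut terms = field_searcher.terms();
/// while terms.advance() {
///     // Each key is a sorted, deduplicated term across all segments.
///     let _term_bytes: &[u8] = terms.key();
/// }
/// ```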
pub fn terms(&self) -> TermMerger {
let term_streamers: Vec<_> = self.inv_index_readers
.iter()
.map(|inverted_index| inverted_index.terms().stream())
.collect();
TermMerger::new(term_streamers)
}
}
impl From<Vec<SegmentReader>> for Searcher {
fn from(segment_readers: Vec<SegmentReader>) -> Searcher {
Searcher { segment_readers: segment_readers }

View File

@@ -82,18 +82,20 @@ impl Segment {
}
/// Opens one of the component files for a *regular* read.
pub fn open_read(&self,
component: SegmentComponent)
-> result::Result<ReadOnlySource, OpenReadError> {
pub fn open_read(
&self,
component: SegmentComponent,
) -> result::Result<ReadOnlySource, OpenReadError> {
let path = self.relative_path(component);
let source = try!(self.index.directory().open_read(&path));
Ok(source)
}
/// Opens one of the component files for a *regular* write.
pub fn open_write(&mut self,
component: SegmentComponent)
-> result::Result<WritePtr, OpenWriteError> {
pub fn open_write(
&mut self,
component: SegmentComponent,
) -> result::Result<WritePtr, OpenWriteError> {
let path = self.relative_path(component);
let write = try!(self.index.directory_mut().open_write(&path));
Ok(write)
@@ -131,11 +133,11 @@ mod tests {
{
let _file_protection = segment.protect_from_delete(SegmentComponent::POSTINGS);
assert!(directory.exists(&*path));
directory.garbage_collect(living_files.clone());
directory.garbage_collect(|| living_files.clone());
assert!(directory.exists(&*path));
}
directory.garbage_collect(living_files);
directory.garbage_collect(|| living_files);
assert!(!directory.exists(&*path));
}

View File

@@ -28,13 +28,15 @@ pub enum SegmentComponent {
impl SegmentComponent {
/// Iterates through the components.
pub fn iterator() -> impl Iterator<Item = &'static SegmentComponent> {
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [SegmentComponent::POSTINGS,
SegmentComponent::POSITIONS,
SegmentComponent::FASTFIELDS,
SegmentComponent::FIELDNORMS,
SegmentComponent::TERMS,
SegmentComponent::STORE,
SegmentComponent::DELETE];
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [
SegmentComponent::POSTINGS,
SegmentComponent::POSITIONS,
SegmentComponent::FASTFIELDS,
SegmentComponent::FIELDNORMS,
SegmentComponent::TERMS,
SegmentComponent::STORE,
SegmentComponent::DELETE,
];
SEGMENT_COMPONENTS.into_iter()
}
}

View File

@@ -64,16 +64,14 @@ impl SegmentMeta {
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
let mut path = self.id().uuid_string();
path.push_str(&*match component {
SegmentComponent::POSITIONS => ".pos".to_string(),
SegmentComponent::POSTINGS => ".idx".to_string(),
SegmentComponent::TERMS => ".term".to_string(),
SegmentComponent::STORE => ".store".to_string(),
SegmentComponent::FASTFIELDS => ".fast".to_string(),
SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
SegmentComponent::DELETE => {
format!(".{}.del", self.delete_opstamp().unwrap_or(0))
}
});
SegmentComponent::POSITIONS => ".pos".to_string(),
SegmentComponent::POSTINGS => ".idx".to_string(),
SegmentComponent::TERMS => ".term".to_string(),
SegmentComponent::STORE => ".store".to_string(),
SegmentComponent::FASTFIELDS => ".fast".to_string(),
SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
SegmentComponent::DELETE => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
});
PathBuf::from(path)
}
@@ -111,8 +109,8 @@ impl SegmentMeta {
#[doc(hidden)]
pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) {
self.deletes = Some(DeleteMeta {
num_deleted_docs: num_deleted_docs,
opstamp: opstamp,
});
num_deleted_docs: num_deleted_docs,
opstamp: opstamp,
});
}
}

View File

@@ -2,28 +2,24 @@ use Result;
use core::Segment;
use core::SegmentId;
use core::SegmentComponent;
use schema::Term;
use std::sync::RwLock;
use common::HasLen;
use core::SegmentMeta;
use fastfield::{self, FastFieldNotAvailableError};
use fastfield::DeleteBitSet;
use store::StoreReader;
use schema::Document;
use directory::ReadOnlySource;
use schema::Document;
use DocId;
use std::str;
use termdict::TermDictionary;
use std::cmp;
use postings::TermInfo;
use termdict::TermDictionaryImpl;
use std::sync::Arc;
use std::collections::HashMap;
use common::CompositeFile;
use std::fmt;
use core::InvertedIndexReader;
use schema::Field;
use postings::SegmentPostingsOption;
use postings::{SegmentPostings, BlockSegmentPostings};
use fastfield::{FastFieldsReader, FastFieldReader, U64FastFieldReader};
use fastfield::{FastFieldReader, U64FastFieldReader};
use schema::Schema;
use postings::FreqHandler;
@@ -40,15 +36,19 @@ use postings::FreqHandler;
///
#[derive(Clone)]
pub struct SegmentReader {
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,
segment_id: SegmentId,
segment_meta: SegmentMeta,
terms: Arc<TermDictionaryImpl>,
postings_data: ReadOnlySource,
termdict_composite: CompositeFile,
postings_composite: CompositeFile,
positions_composite: CompositeFile,
fast_fields_composite: CompositeFile,
fieldnorms_composite: CompositeFile,
store_reader: StoreReader,
fast_fields_reader: Arc<FastFieldsReader>,
fieldnorms_reader: Arc<FastFieldsReader>,
delete_bitset: DeleteBitSet,
positions_data: ReadOnlySource,
schema: Schema,
}
@@ -76,11 +76,6 @@ impl SegmentReader {
self.delete_bitset.len() as DocId
}
#[doc(hidden)]
pub fn fast_fields_reader(&self) -> &FastFieldsReader {
&*self.fast_fields_reader
}
/// Accessor to a segment's fast field reader given a field.
///
/// Returns the u64 fast value reader if the field
@@ -91,17 +86,18 @@ impl SegmentReader {
///
/// # Panics
/// May panic if the index is corrupted.
pub fn get_fast_field_reader<TFastFieldReader: FastFieldReader>
(&self,
field: Field)
-> fastfield::Result<TFastFieldReader> {
pub fn get_fast_field_reader<TFastFieldReader: FastFieldReader>(
&self,
field: Field,
) -> fastfield::Result<TFastFieldReader> {
let field_entry = self.schema.get_field_entry(field);
if !TFastFieldReader::is_enabled(field_entry.field_type()) {
Err(FastFieldNotAvailableError::new(field_entry))
} else {
Ok(self.fast_fields_reader
.open_reader(field)
.expect("Fast field file corrupted."))
self.fast_fields_composite
.open_read(field)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
.map(TFastFieldReader::open)
}
}
@@ -114,15 +110,9 @@ impl SegmentReader {
/// They are simply stored as a fast field, serialized in
/// the `.fieldnorm` file of the segment.
pub fn get_fieldnorms_reader(&self, field: Field) -> Option<U64FastFieldReader> {
self.fieldnorms_reader.open_reader(field)
}
/// Returns the number of documents containing the term.
pub fn doc_freq(&self, term: &Term) -> u32 {
match self.get_term_info(term) {
Some(term_info) => term_info.doc_freq,
None => 0,
}
self.fieldnorms_composite.open_read(field).map(
U64FastFieldReader::open,
)
}
/// Accessor to the segment's `StoreReader`.
@@ -133,23 +123,30 @@ impl SegmentReader {
/// Open a new segment for reading.
pub fn open(segment: Segment) -> Result<SegmentReader> {
let source = segment.open_read(SegmentComponent::TERMS)?;
let terms = TermDictionaryImpl::from_source(source)?;
let termdict_source = segment.open_read(SegmentComponent::TERMS)?;
let termdict_composite = CompositeFile::open(termdict_source)?;
let store_source = segment.open_read(SegmentComponent::STORE)?;
let store_reader = StoreReader::from_source(store_source);
let postings_shared_mmap = segment.open_read(SegmentComponent::POSTINGS)?;
let postings_source = segment.open_read(SegmentComponent::POSTINGS)?;
let postings_composite = CompositeFile::open(postings_source)?;
let fast_field_data = segment.open_read(SegmentComponent::FASTFIELDS)?;
let fast_fields_reader = FastFieldsReader::from_source(fast_field_data)?;
let positions_composite = {
if let Ok(source) = segment.open_read(SegmentComponent::POSITIONS) {
CompositeFile::open(source)?
} else {
CompositeFile::empty()
}
};
let fast_fields_data = segment.open_read(SegmentComponent::FASTFIELDS)?;
let fast_fields_composite = CompositeFile::open(fast_fields_data)?;
let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
let fieldnorms_reader = FastFieldsReader::from_source(fieldnorms_data)?;
let fieldnorms_composite = CompositeFile::open(fieldnorms_data)?;
let positions_data = segment
.open_read(SegmentComponent::POSITIONS)
.unwrap_or_else(|_| ReadOnlySource::empty());
let delete_bitset = if segment.meta().has_deletes() {
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
@@ -160,22 +157,66 @@ impl SegmentReader {
let schema = segment.schema();
Ok(SegmentReader {
segment_meta: segment.meta().clone(),
postings_data: postings_shared_mmap,
terms: Arc::new(terms),
segment_id: segment.id(),
store_reader: store_reader,
fast_fields_reader: Arc::new(fast_fields_reader),
fieldnorms_reader: Arc::new(fieldnorms_reader),
delete_bitset: delete_bitset,
positions_data: positions_data,
schema: schema,
})
inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
segment_meta: segment.meta().clone(),
termdict_composite: termdict_composite,
postings_composite: postings_composite,
fast_fields_composite: fast_fields_composite,
fieldnorms_composite: fieldnorms_composite,
segment_id: segment.id(),
store_reader: store_reader,
delete_bitset: delete_bitset,
positions_composite: positions_composite,
schema: schema,
})
}
/// Returns the term dictionary data structure.
pub fn terms(&self) -> &TermDictionaryImpl {
&self.terms
/// Returns the inverted index reader associated with the field
/// given as argument.
///
/// The inverted index reader is in charge of iterating through the
/// term dictionary associated with a specific field,
/// and of opening the posting list associated with any term.
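///
/// A minimal sketch (assuming a `segment_reader: &SegmentReader` and an
/// indexed `term: Term`, as in `Searcher::doc_freq` above):
///
/// ```ignore
/// let inv_index = segment_reader.inverted_index(term.field());
/// let doc_freq = inv_index.doc_freq(&term);
/// ```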
pub fn inverted_index(&self, field: Field) -> Arc<InvertedIndexReader> {
if let Some(inv_idx_reader) =
self.inv_idx_reader_cache
.read()
.expect("Lock poisoned. This should never happen")
.get(&field)
{
// Cache hit: return the shared reader instead of re-opening it.
return inv_idx_reader.clone();
}
let termdict_source: ReadOnlySource = self.termdict_composite.open_read(field).expect(
"Index corrupted. Failed to open field term dictionary in composite file.",
);
let postings_source = self.postings_composite.open_read(field).expect(
"Index corrupted. Failed to open field postings in composite file.",
);
let positions_source = self.positions_composite.open_read(field).expect(
"Index corrupted. Failed to open field positions in composite file.",
);
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
termdict_source,
postings_source,
positions_source,
self.delete_bitset.clone(),
self.schema.clone(),
));
// Because the lock is released in between, we may end up opening the
// inverted index twice, but this is fine.
self.inv_idx_reader_cache
.write()
.expect(
"Field reader cache lock poisoned. This should never happen.",
)
.insert(field, inv_idx_reader.clone());
inv_idx_reader
}
/// Returns the document (or to be accurate, its stored field)
@@ -187,89 +228,6 @@ impl SegmentReader {
}
/// Returns the segment postings associated with the term, and with the given option,
/// or `None` if the term has never been encountered and indexed.
///
/// If the field was not indexed with indexing options that cover
/// the requested options, the method does not fail: it returns a
/// `SegmentPostings` with as much information as available.
///
/// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a
/// `TextIndexingOptions` that does not index position will return a `SegmentPostings`
/// with `DocId`s and frequencies.
pub fn read_postings(&self,
term: &Term,
option: SegmentPostingsOption)
-> Option<SegmentPostings> {
let field = term.field();
let field_entry = self.schema.get_field_entry(field);
let term_info = get!(self.get_term_info(term));
let maximum_option = get!(field_entry.field_type().get_segment_postings_option());
let best_effort_option = cmp::min(maximum_option, option);
Some(self.read_postings_from_terminfo(&term_info, best_effort_option))
}
/// Returns a posting object given a `term_info`.
/// This method is for advanced usage only.
///
/// Most users should prefer using `read_postings` instead.
pub fn read_postings_from_terminfo(&self,
term_info: &TermInfo,
option: SegmentPostingsOption)
-> SegmentPostings {
let block_postings = self.read_block_postings_from_terminfo(term_info, option);
let delete_bitset = self.delete_bitset.clone();
SegmentPostings::from_block_postings(block_postings, delete_bitset)
}
/// Returns a block postings given a `term_info`.
/// This method is for advanced usage only.
///
/// Most users should prefer using `read_postings` instead.
pub fn read_block_postings_from_terminfo(&self,
term_info: &TermInfo,
option: SegmentPostingsOption)
-> BlockSegmentPostings {
let offset = term_info.postings_offset as usize;
let postings_data = &self.postings_data[offset..];
let freq_handler = match option {
SegmentPostingsOption::NoFreq => FreqHandler::new_without_freq(),
SegmentPostingsOption::Freq => FreqHandler::new_with_freq(),
SegmentPostingsOption::FreqAndPositions => {
let offset = term_info.positions_offset as usize;
let offseted_position_data = &self.positions_data[offset..];
FreqHandler::new_with_freq_and_position(offseted_position_data)
}
};
BlockSegmentPostings::from_data(term_info.doc_freq as usize, postings_data, freq_handler)
}
/// Resets the block segment to another position of the postings
/// file.
///
/// This is useful for enumerating through a list of terms,
/// and consuming the associated posting lists while avoiding
/// reallocating a `BlockSegmentPostings`.
///
/// # Warning
///
/// This does not reset the positions list.
pub fn reset_block_postings_from_terminfo<'a>(&'a self,
term_info: &TermInfo,
block_postings: &mut BlockSegmentPostings<'a>) {
let offset = term_info.postings_offset as usize;
let postings_data: &'a [u8] = &self.postings_data[offset..];
block_postings.reset(term_info.doc_freq as usize, postings_data);
}
/// Returns the term info associated with the term.
pub fn get_term_info(&self, term: &Term) -> Option<TermInfo> {
self.terms.get(term.as_slice())
}
/// Returns the segment id
pub fn segment_id(&self) -> SegmentId {
self.segment_id

View File

@@ -39,11 +39,11 @@ impl<T: BinarySerializable> LayerBuilder<T> {
doc_id.serialize(&mut self.buffer)?;
value.serialize(&mut self.buffer)?;
Ok(if self.remaining == 0 {
self.remaining = self.period;
Some((doc_id, offset))
} else {
None
})
self.remaining = self.period;
Some((doc_id, offset))
} else {
None
})
}
}
@@ -78,8 +78,10 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
loop {
skip_pointer = match skip_pointer {
Some((skip_doc_id, skip_offset)) => {
try!(self.get_skip_layer(layer_id)
.insert(skip_doc_id, &skip_offset))
try!(self.get_skip_layer(layer_id).insert(
skip_doc_id,
&skip_offset,
))
}
None => {
return Ok(());

View File

@@ -1,8 +1,7 @@
use std::iter;
use std::mem;
use super::heap::{Heap, HeapAllocable, BytesRef};
mod murmurhash2 {
const SEED: u32 = 3_242_157_231u32;
@@ -53,15 +52,36 @@ mod murmurhash2 {
}
}
impl Default for BytesRef {
fn default() -> BytesRef {
BytesRef {
start: 0u32,
stop: 0u32,
}
}
/// Splits the per-thread memory budget between
/// - the heap, and
/// - the hash table itself.
///
/// Returns `(heap size in bytes, hash table size in number of bits)`.
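///
/// For instance, per the unit test below, a 10 MB budget yields
/// roughly a 7.9 MB heap and a table of 2^15 buckets:
///
/// ```ignore
/// assert_eq!(split_memory(10_000_000), (7_902_848, 15));
/// ```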
pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
let table_size_limit: usize = per_thread_memory_budget / 3;
let compute_table_size = |num_bits: usize| {
let table_size: usize = (1 << num_bits) * mem::size_of::<KeyValue>();
table_size * mem::size_of::<KeyValue>()
};
let table_num_bits: usize = (1..)
.into_iter()
.take_while(|num_bits: &usize| {
compute_table_size(*num_bits) < table_size_limit
})
.last()
.expect(&format!(
"Per thread memory is too small: {}",
per_thread_memory_budget
));
let table_size = compute_table_size(table_num_bits);
let heap_size = per_thread_memory_budget - table_size;
(heap_size, table_num_bits)
}
/// `KeyValue` is the item stored in the hash table.
/// The key is actually a `BytesRef` object stored in an external heap.
/// The `value_addr` also points to an address in the heap.
@@ -70,22 +90,18 @@ impl Default for BytesRef {
/// For this reason, the (start, stop) information is actually redundant
/// and can be simplified in the future
#[derive(Copy, Clone, Default)]
#[repr(packed)]
struct KeyValue {
key: BytesRef,
value_addr: u32,
key_value_addr: BytesRef,
hash: u32,
}
impl KeyValue {
fn is_empty(&self) -> bool {
self.key.stop == 0u32
self.key_value_addr.is_null()
}
}
pub enum Entry {
Vacant(usize),
Occupied(u32),
}
/// Customized `HashMap` with string keys
///
@@ -111,8 +127,7 @@ struct QuadraticProbing {
}
impl QuadraticProbing {
fn compute(key: &[u8], mask: usize) -> QuadraticProbing {
let hash = murmurhash2::murmurhash2(key) as usize;
fn compute(hash: usize, mask: usize) -> QuadraticProbing {
QuadraticProbing {
hash: hash,
i: 0,
@@ -140,63 +155,55 @@ impl<'a> HashMap<'a> {
}
}
fn probe(&self, key: &[u8]) -> QuadraticProbing {
QuadraticProbing::compute(key, self.mask)
fn probe(&self, hash: u32) -> QuadraticProbing {
QuadraticProbing::compute(hash as usize, self.mask)
}
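// The table is considered saturated once more than a third of its
// buckets are occupied.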
pub fn is_saturated(&self) -> bool {
self.table.len() < self.occupied.len() * 10
self.table.len() < self.occupied.len() * 3
}
fn get_key(&self, bytes_ref: BytesRef) -> &[u8] {
self.heap.get_slice(bytes_ref)
#[inline(never)]
fn get_key_value(&self, bytes_ref: BytesRef) -> (&[u8], u32) {
let key_bytes: &[u8] = self.heap.get_slice(bytes_ref);
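// The value is stored right after the 2-byte length prefix and the
// key bytes; `get_or_create` asserts this layout when allocating.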
let expull_addr: u32 = bytes_ref.addr() + 2 + key_bytes.len() as u32;
(key_bytes, expull_addr)
}
pub fn set_bucket(&mut self, key_bytes: &[u8], bucket: usize, addr: u32) -> u32 {
pub fn set_bucket(&mut self, hash: u32, key_bytes_ref: BytesRef, bucket: usize) {
self.occupied.push(bucket);
self.table[bucket] = KeyValue {
key: self.heap.allocate_and_set(key_bytes),
value_addr: addr,
key_value_addr: key_bytes_ref,
hash: hash,
};
addr
}
pub fn iter<'b: 'a>(&'b self) -> impl Iterator<Item = (&'a [u8], u32)> + 'b {
let heap: &'a Heap = self.heap;
let table: &'b [KeyValue] = &self.table;
self.occupied
.iter()
.cloned()
.map(move |bucket: usize| {
let kv = table[bucket];
let addr = kv.value_addr;
(heap.get_slice(kv.key), addr)
})
self.occupied.iter().cloned().map(move |bucket: usize| {
let kv = self.table[bucket];
self.get_key_value(kv.key_value_addr)
})
}
pub fn get_or_create<S: AsRef<[u8]>, V: HeapAllocable>(&mut self, key: S) -> &mut V {
let entry = self.lookup(key.as_ref());
match entry {
Entry::Occupied(addr) => self.heap.get_mut_ref(addr),
Entry::Vacant(bucket) => {
let (addr, val): (u32, &mut V) = self.heap.allocate_object();
self.set_bucket(key.as_ref(), bucket, addr);
val
}
}
}
pub fn lookup<S: AsRef<[u8]>>(&self, key: S) -> Entry {
let key_bytes: &[u8] = key.as_ref();
let mut probe = self.probe(key_bytes);
let hash = murmurhash2::murmurhash2(key.as_ref());
let mut probe = self.probe(hash);
loop {
let bucket = probe.next_probe();
let kv: KeyValue = self.table[bucket];
if kv.is_empty() {
return Entry::Vacant(bucket);
}
if self.get_key(kv.key) == key_bytes {
return Entry::Occupied(kv.value_addr);
let key_bytes_ref = self.heap.allocate_and_set(key_bytes);
let (addr, val): (u32, &mut V) = self.heap.allocate_object();
assert_eq!(addr, key_bytes_ref.addr() + 2 + key_bytes.len() as u32);
self.set_bucket(hash, key_bytes_ref, bucket);
return val;
} else if kv.hash == hash {
let (stored_key, expull_addr): (&[u8], u32) = self.get_key_value(kv.key_value_addr);
if stored_key == key_bytes {
return self.heap.get_mut_ref(expull_addr);
}
}
}
}
@@ -211,8 +218,8 @@ mod tests {
use super::murmurhash2::murmurhash2;
use test::Bencher;
use std::collections::HashSet;
use std::collections::hash_map::DefaultHasher;
use std::hash::Hasher;
use super::split_memory;
struct TestValue {
val: u32,
@@ -228,6 +235,14 @@ mod tests {
}
}
#[test]
fn test_hashmap_size() {
assert_eq!(split_memory(100_000), (67232, 9));
assert_eq!(split_memory(1_000_000), (737856, 12));
assert_eq!(split_memory(10_000_000), (7902848, 15));
}
#[test]
fn test_hash_map() {
let heap = Heap::with_capacity(2_000_000);
@@ -269,8 +284,10 @@ mod tests {
let s1 = "abcdef";
let s2 = "abcdeg";
for i in 0..5 {
assert_eq!(murmurhash2(&s1[i..5].as_bytes()),
murmurhash2(&s2[i..5].as_bytes()));
assert_eq!(
murmurhash2(&s1[i..5].as_bytes()),
murmurhash2(&s2[i..5].as_bytes())
);
}
}
@@ -290,23 +307,14 @@ mod tests {
let keys: Vec<&'static str> =
vec!["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
b.iter(|| {
keys.iter()
.map(|&s| s.as_bytes())
.map(murmurhash2::murmurhash2)
.map(|h| h as u64)
.last()
.unwrap()
});
keys.iter()
.map(|&s| s.as_bytes())
.map(murmurhash2::murmurhash2)
.map(|h| h as u64)
.last()
.unwrap()
});
}
#[bench]
fn bench_siphasher(bench: &mut Bencher) {
let v = String::from("abwer");
bench.iter(|| {
let mut h = DefaultHasher::new();
h.write(v.as_bytes());
h.finish()
});
}
}

View File

@@ -1,12 +1,29 @@
use std::cell::UnsafeCell;
use std::mem;
use std::ptr;
use byteorder::{NativeEndian, ByteOrder};
/// `BytesRef` refers to a slice in tantivy's custom `Heap`.
///
/// The heap slice encodes the length of the `&[u8]`
/// on 16 bits, followed by the data itself.
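///
/// Layout at `addr` in the heap (a sketch; native endianness, as used
/// by `allocate_and_set` in `heap.rs`):
///
/// ```text
/// addr        addr + 2             addr + 2 + len
/// +-----------+---------------------+
/// | len: u16  | data: [u8; len]     |
/// +-----------+---------------------+
/// ```
///
/// `u32::max_value()` is reserved as the null address.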
#[derive(Copy, Clone)]
pub struct BytesRef {
pub start: u32,
pub stop: u32,
pub struct BytesRef(u32);
impl BytesRef {
pub fn is_null(&self) -> bool {
self.0 == u32::max_value()
}
pub fn addr(&self) -> u32 {
self.0
}
}
impl Default for BytesRef {
fn default() -> BytesRef {
BytesRef(u32::max_value())
}
}
/// Object that can be allocated in tantivy's custom `Heap`.
@@ -38,11 +55,6 @@ impl Heap {
self.inner().clear();
}
/// Return the heap capacity.
pub fn capacity(&self) -> u32 {
self.inner().capacity()
}
/// Return amount of free space, in bytes.
pub fn num_free_bytes(&self) -> u32 {
self.inner().num_free_bytes()
@@ -70,7 +82,7 @@ impl Heap {
/// Fetches the `&[u8]` stored in the slice defined by the `BytesRef`
/// given as argument.
pub fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
self.inner().get_slice(bytes_ref.start, bytes_ref.stop)
self.inner().get_slice(bytes_ref)
}
/// Stores an item's data in the heap, at the given `address`.
@@ -115,10 +127,6 @@ impl InnerHeap {
self.next_heap = None;
}
pub fn capacity(&self) -> u32 {
self.buffer.len() as u32
}
// Returns the number of free bytes. If the buffer
// has reached its capacity and overflowed into another buffer, returns 0.
pub fn num_free_bytes(&self) -> u32 {
@@ -136,66 +144,64 @@ impl InnerHeap {
addr
} else {
if self.next_heap.is_none() {
info!(r#"Exceeded heap size.
The segment will be committed right after indexing this document."#,);
info!(r#"Exceeded heap size. The segment will be committed right
after indexing this document."#,);
self.next_heap = Some(Box::new(InnerHeap::with_capacity(self.buffer_len as usize)));
}
self.next_heap.as_mut().unwrap().allocate_space(num_bytes) + self.buffer_len
}
}
fn get_slice(&self, start: u32, stop: u32) -> &[u8] {
fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
let start = bytes_ref.0;
if start >= self.buffer_len {
self.next_heap
.as_ref()
.unwrap()
.get_slice(start - self.buffer_len, stop - self.buffer_len)
self.next_heap.as_ref().unwrap().get_slice(BytesRef(
start - self.buffer_len,
))
} else {
&self.buffer[start as usize..stop as usize]
let start = start as usize;
let len = NativeEndian::read_u16(&self.buffer[start..start + 2]) as usize;
&self.buffer[start + 2..start + 2 + len]
}
}
fn get_mut_slice(&mut self, start: u32, stop: u32) -> &mut [u8] {
if start >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.get_mut_slice(start - self.buffer_len, stop - self.buffer_len)
self.next_heap.as_mut().unwrap().get_mut_slice(
start - self.buffer_len,
stop - self.buffer_len,
)
} else {
&mut self.buffer[start as usize..stop as usize]
}
}
fn allocate_and_set(&mut self, data: &[u8]) -> BytesRef {
let start = self.allocate_space(data.len());
let stop = start + data.len() as u32;
self.get_mut_slice(start, stop).clone_from_slice(data);
BytesRef {
start: start as u32,
stop: stop as u32,
}
assert!(data.len() < u16::max_value() as usize);
let total_len = 2 + data.len();
let start = self.allocate_space(total_len);
let total_buff = self.get_mut_slice(start, start + total_len as u32);
NativeEndian::write_u16(&mut total_buff[0..2], data.len() as u16);
total_buff[2..].clone_from_slice(data);
BytesRef(start)
}
fn get_mut(&mut self, addr: u32) -> *mut u8 {
if addr >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.get_mut(addr - self.buffer_len)
self.next_heap.as_mut().unwrap().get_mut(
addr - self.buffer_len,
)
} else {
let addr_isize = addr as isize;
unsafe { self.buffer.as_mut_ptr().offset(addr_isize) }
}
}
fn get_mut_ref<Item>(&mut self, addr: u32) -> &mut Item {
if addr >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.get_mut_ref(addr - self.buffer_len)
self.next_heap.as_mut().unwrap().get_mut_ref(
addr - self.buffer_len,
)
} else {
let v_ptr_u8 = self.get_mut(addr) as *mut u8;
let v_ptr = v_ptr_u8 as *mut Item;
@@ -205,10 +211,10 @@ impl InnerHeap {
pub fn set<Item>(&mut self, addr: u32, val: &Item) {
if addr >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.set(addr - self.buffer_len, val);
self.next_heap.as_mut().unwrap().set(
addr - self.buffer_len,
val,
);
} else {
let v_ptr: *const Item = val as *const Item;
let v_ptr_u8: *const u8 = v_ptr as *const u8;

View File

@@ -1,16 +1,14 @@
mod hashmap;
pub(crate) mod hashmap;
mod heap;
mod expull;
pub use self::heap::{Heap, HeapAllocable};
pub use self::expull::ExpUnrolledLinkedList;
pub use self::hashmap::{HashMap, Entry};
pub use self::hashmap::HashMap;
#[test]
fn test_unrolled_linked_list() {
use std::collections;
let heap = Heap::with_capacity(30_000_000);
{
heap.clear();
@@ -21,24 +19,24 @@ fn test_unrolled_linked_list() {
let mut hashmap: HashMap = HashMap::new(10, &heap);
for j in 0..k {
for i in 0..500 {
let mut list: &mut ExpUnrolledLinkedList = hashmap.get_or_create(i.to_string());
list.push(i * j, &heap);
let v: &mut ExpUnrolledLinkedList = hashmap.get_or_create(i.to_string());
v.push(i * j, &heap);
}
}
let mut map_addr: collections::HashMap<Vec<u8>, u32> = collections::HashMap::new();
for (key, addr) in hashmap.iter() {
map_addr.insert(Vec::from(key), addr);
}
for i in 0..500 {
match hashmap.lookup(i.to_string()) {
Entry::Occupied(addr) => {
let v: &mut ExpUnrolledLinkedList = heap.get_mut_ref(addr);
let mut it = v.iter(addr, &heap);
for j in 0..k {
assert_eq!(it.next().unwrap(), i * j);
}
assert!(!it.next().is_some());
}
_ => {
panic!("should never happen");
}
let key: String = i.to_string();
let addr: u32 = *map_addr.get(key.as_bytes()).unwrap();
let exp_pull: &ExpUnrolledLinkedList = heap.get_ref(addr);
let mut it = exp_pull.iter(addr, &heap);
for j in 0..k {
assert_eq!(it.next().unwrap(), i * j);
}
assert!(!it.next().is_some());
}
}

View File

@@ -103,9 +103,11 @@ impl fmt::Display for OpenWriteError {
write!(f, "the file '{:?}' already exists", path)
}
OpenWriteError::IOError(ref err) => {
write!(f,
"an io error occurred while opening a file for writing: '{}'",
err)
write!(
f,
"an io error occurred while opening a file for writing: '{}'",
err
)
}
}
}
@@ -147,9 +149,11 @@ impl fmt::Display for OpenReadError {
write!(f, "the file '{:?}' does not exist", path)
}
OpenReadError::IOError(ref err) => {
write!(f,
"an io error occurred while opening a file for reading: '{}'",
err)
write!(
f,
"an io error occurred while opening a file for reading: '{}'",
err
)
}
}
}

View File

@@ -45,10 +45,9 @@ pub struct FileProtection {
}
fn unprotect_file_from_delete(directory: &ManagedDirectory, path: &Path) {
let mut meta_informations_wlock = directory
.meta_informations
.write()
.expect("Managed file lock poisoned");
let mut meta_informations_wlock = directory.meta_informations.write().expect(
"Managed file lock poisoned",
);
if let Some(counter_ref_mut) = meta_informations_wlock.protected_files.get_mut(path) {
(*counter_ref_mut) -= 1;
}
@@ -68,9 +67,10 @@ impl Drop for FileProtection {
/// Saves the file containing the list of existing files
/// that were created by tantivy.
fn save_managed_paths(directory: &mut Directory,
wlock: &RwLockWriteGuard<MetaInformation>)
-> io::Result<()> {
fn save_managed_paths(
directory: &mut Directory,
wlock: &RwLockWriteGuard<MetaInformation>,
) -> io::Result<()> {
let mut w = serde_json::to_vec(&wlock.managed_paths)?;
write!(&mut w, "\n")?;
directory.atomic_write(&MANAGED_FILEPATH, &w[..])?;
@@ -84,22 +84,22 @@ impl ManagedDirectory {
Ok(data) => {
let managed_files_json = String::from_utf8_lossy(&data);
let managed_files: HashSet<PathBuf> =
serde_json::from_str(&managed_files_json)
.chain_err(|| ErrorKind::CorruptedFile(MANAGED_FILEPATH.clone()))?;
serde_json::from_str(&managed_files_json).chain_err(|| {
ErrorKind::CorruptedFile(MANAGED_FILEPATH.clone())
})?;
Ok(ManagedDirectory {
directory: box directory,
meta_informations: Arc::new(RwLock::new(MetaInformation {
managed_paths: managed_files,
protected_files:
HashMap::default(),
})),
})
directory: box directory,
meta_informations: Arc::new(RwLock::new(MetaInformation {
managed_paths: managed_files,
protected_files: HashMap::default(),
})),
})
}
Err(OpenReadError::FileDoesNotExist(_)) => {
Ok(ManagedDirectory {
directory: box directory,
meta_informations: Arc::default(),
})
directory: box directory,
meta_informations: Arc::default(),
})
}
Err(OpenReadError::IOError(e)) => Err(From::from(e)),
}
@@ -116,14 +116,25 @@ impl ManagedDirectory {
/// If a file cannot be deleted (for permission reasons for instance)
/// an error is simply logged, and the file remains in the list of managed
/// files.
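///
/// A minimal sketch (assuming a `living_files: HashSet<PathBuf>` already
/// computed, as in the tests below):
///
/// ```ignore
/// managed_directory.garbage_collect(|| living_files);
/// ```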
pub fn garbage_collect(&mut self, living_files: HashSet<PathBuf>) {
pub fn garbage_collect<L: FnOnce() -> HashSet<PathBuf>>(&mut self, get_living_files: L) {
info!("Garbage collect");
let mut files_to_delete = vec![];
{
// releasing the lock as .delete() will use it too.
let meta_informations_rlock =
self.meta_informations
.read()
.expect("Managed directory rlock poisoned in garbage collect.");
let meta_informations_rlock = self.meta_informations.read().expect(
"Managed directory rlock poisoned in garbage collect.",
);
// It is crucial to get the living files *after* acquiring the
// read lock on the meta informations. That way, we
// avoid the following scenario:
//
// 1) we get the list of living files.
// 2) someone creates a new file.
// 3) we start garbage collection and remove this file
// even though it is a living file.
let living_files = get_living_files();
for managed_path in &meta_informations_rlock.managed_paths {
if !living_files.contains(managed_path) {
files_to_delete.push(managed_path.clone());
@@ -165,9 +176,9 @@ impl ManagedDirectory {
if !deleted_files.is_empty() {
// update the list of managed files by removing
// the file that were removed.
let mut meta_informations_wlock = self.meta_informations
.write()
.expect("Managed directory wlock poisoned (2).");
let mut meta_informations_wlock = self.meta_informations.write().expect(
"Managed directory wlock poisoned (2).",
);
{
let managed_paths_write = &mut meta_informations_wlock.managed_paths;
for delete_file in &deleted_files {
@@ -190,13 +201,13 @@ impl ManagedDirectory {
pub fn protect_file_from_delete(&self, path: &Path) -> FileProtection {
let pathbuf = path.to_owned();
{
let mut meta_informations_wlock = self.meta_informations
.write()
.expect("Managed file lock poisoned on protect");
let mut meta_informations_wlock = self.meta_informations.write().expect(
"Managed file lock poisoned on protect",
);
*meta_informations_wlock
.protected_files
.entry(pathbuf.clone())
.or_insert(0) += 1;
.protected_files
.entry(pathbuf.clone())
.or_insert(0) += 1;
}
FileProtection {
directory: self.clone(),
@@ -212,9 +223,9 @@ impl ManagedDirectory {
/// will not lead to garbage files that will
/// never get removed.
fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> {
let mut meta_wlock = self.meta_informations
.write()
.expect("Managed file lock poisoned");
let mut meta_wlock = self.meta_informations.write().expect(
"Managed file lock poisoned",
);
let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned());
if has_changed {
save_managed_paths(self.directory.as_mut(), &meta_wlock)?;
@@ -229,8 +240,9 @@ impl Directory for ManagedDirectory {
}
fn open_write(&mut self, path: &Path) -> result::Result<WritePtr, OpenWriteError> {
self.register_file_as_managed(path)
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
self.register_file_as_managed(path).map_err(|e| {
IOError::with_path(path.to_owned(), e)
})?;
self.directory.open_write(path)
}
@@ -245,9 +257,9 @@ impl Directory for ManagedDirectory {
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
{
let metas_rlock = self.meta_informations
.read()
.expect("poisoned lock in managed directory meta");
let metas_rlock = self.meta_informations.read().expect(
"poisoned lock in managed directory meta",
);
if let Some(counter) = metas_rlock.protected_files.get(path) {
if *counter > 0 {
return Err(DeleteError::FileProtected(path.to_owned()));
@@ -315,7 +327,7 @@ mod tests {
{
let living_files: HashSet<PathBuf> =
[TEST_PATH1.to_owned()].into_iter().cloned().collect();
managed_directory.garbage_collect(living_files);
managed_directory.garbage_collect(|| living_files);
}
{
assert!(managed_directory.exists(*TEST_PATH1));
@@ -331,7 +343,7 @@ mod tests {
}
{
let living_files: HashSet<PathBuf> = HashSet::new();
managed_directory.garbage_collect(living_files);
managed_directory.garbage_collect(|| living_files);
}
{
assert!(!managed_directory.exists(*TEST_PATH1));
@@ -354,7 +366,7 @@ mod tests {
assert!(managed_directory.exists(*TEST_PATH1));
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
managed_directory.garbage_collect(living_files.clone());
managed_directory.garbage_collect(|| living_files.clone());
if cfg!(target_os = "windows") {
// On Windows, gc should try and fail the file as it is mmapped.
assert!(managed_directory.exists(*TEST_PATH1));
@@ -362,7 +374,7 @@ mod tests {
drop(_mmap_read);
// The file should still be in the list of managed file and
// eventually be deleted once mmap is released.
managed_directory.garbage_collect(living_files);
managed_directory.garbage_collect(|| living_files);
assert!(!managed_directory.exists(*TEST_PATH1));
} else {
assert!(!managed_directory.exists(*TEST_PATH1));
@@ -386,11 +398,11 @@ mod tests {
{
let _file_protection = managed_directory.protect_file_from_delete(*TEST_PATH1);
managed_directory.garbage_collect(living_files.clone());
managed_directory.garbage_collect(|| living_files.clone());
assert!(managed_directory.exists(*TEST_PATH1));
}
managed_directory.garbage_collect(living_files.clone());
managed_directory.garbage_collect(|| living_files.clone());
assert!(!managed_directory.exists(*TEST_PATH1));

View File

@@ -24,15 +24,17 @@ use std::sync::Weak;
use tempdir::TempDir;
fn open_mmap(full_path: &PathBuf) -> result::Result<Option<Arc<Mmap>>, OpenReadError> {
let file = File::open(&full_path)
.map_err(|e| if e.kind() == io::ErrorKind::NotFound {
OpenReadError::FileDoesNotExist(full_path.clone())
} else {
OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e))
})?;
let file = File::open(&full_path).map_err(|e| if e.kind() ==
io::ErrorKind::NotFound
{
OpenReadError::FileDoesNotExist(full_path.clone())
} else {
OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e))
})?;
let meta_data = file.metadata()
.map_err(|e| IOError::with_path(full_path.to_owned(), e))?;
let meta_data = file.metadata().map_err(|e| {
IOError::with_path(full_path.to_owned(), e)
})?;
if meta_data.len() == 0 {
// if the file size is 0, it will not be possible
// to mmap the file, so we return an anonymous mmap_cache
@@ -46,7 +48,7 @@ fn open_mmap(full_path: &PathBuf) -> result::Result<Option<Arc<Mmap>>, OpenReadE
}
#[derive(Default,Clone,Debug,Serialize,Deserialize)]
#[derive(Default, Clone, Debug, Serialize, Deserialize)]
pub struct CacheCounters {
// Number of times the cache saved a call to `mmap`
pub hit: usize,
@@ -58,7 +60,7 @@ pub struct CacheCounters {
pub miss_weak: usize,
}
#[derive(Clone,Debug,Serialize,Deserialize)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct CacheInfo {
pub counters: CacheCounters,
pub mmapped: Vec<PathBuf>,
@@ -113,31 +115,31 @@ impl MmapCache {
self.cleanup();
}
Ok(match self.cache.entry(full_path.clone()) {
HashMapEntry::Occupied(mut occupied_entry) => {
if let Some(mmap_arc) = occupied_entry.get().upgrade() {
self.counters.hit += 1;
Some(mmap_arc.clone())
} else {
// The entry exists but the weak ref has been destroyed.
self.counters.miss_weak += 1;
if let Some(mmap_arc) = open_mmap(&full_path)? {
occupied_entry.insert(Arc::downgrade(&mmap_arc));
Some(mmap_arc)
} else {
None
}
}
}
HashMapEntry::Vacant(vacant_entry) => {
self.counters.miss_empty += 1;
if let Some(mmap_arc) = open_mmap(&full_path)? {
vacant_entry.insert(Arc::downgrade(&mmap_arc));
Some(mmap_arc)
} else {
None
}
}
})
HashMapEntry::Occupied(mut occupied_entry) => {
if let Some(mmap_arc) = occupied_entry.get().upgrade() {
self.counters.hit += 1;
Some(mmap_arc.clone())
} else {
// The entry exists but the weak ref has been destroyed.
self.counters.miss_weak += 1;
if let Some(mmap_arc) = open_mmap(&full_path)? {
occupied_entry.insert(Arc::downgrade(&mmap_arc));
Some(mmap_arc)
} else {
None
}
}
}
HashMapEntry::Vacant(vacant_entry) => {
self.counters.miss_empty += 1;
if let Some(mmap_arc) = open_mmap(&full_path)? {
vacant_entry.insert(Arc::downgrade(&mmap_arc));
Some(mmap_arc)
} else {
None
}
}
})
}
}
@@ -180,15 +182,19 @@ impl MmapDirectory {
/// exist or if it is not a directory.
pub fn open(directory_path: &Path) -> Result<MmapDirectory, OpenDirectoryError> {
if !directory_path.exists() {
Err(OpenDirectoryError::DoesNotExist(PathBuf::from(directory_path)))
Err(OpenDirectoryError::DoesNotExist(
PathBuf::from(directory_path),
))
} else if !directory_path.is_dir() {
Err(OpenDirectoryError::NotADirectory(PathBuf::from(directory_path)))
Err(OpenDirectoryError::NotADirectory(
PathBuf::from(directory_path),
))
} else {
Ok(MmapDirectory {
root_path: PathBuf::from(directory_path),
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
_temp_directory: Arc::new(None),
})
root_path: PathBuf::from(directory_path),
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
_temp_directory: Arc::new(None),
})
}
}
@@ -215,9 +221,9 @@ impl MmapDirectory {
use std::os::windows::fs::OpenOptionsExt;
use winapi::winbase;
open_opts
.write(true)
.custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
open_opts.write(true).custom_flags(
winbase::FILE_FLAG_BACKUP_SEMANTICS,
);
}
let fd = try!(open_opts.open(&self.root_path));
@@ -270,46 +276,50 @@ impl Directory for MmapDirectory {
debug!("Open Read {:?}", path);
let full_path = self.resolve_path(path);
let mut mmap_cache = self.mmap_cache
.write()
.map_err(|_| {
let msg = format!("Failed to acquired write lock \
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
let msg = format!(
"Failed to acquired write lock \
on mmap cache while reading {:?}",
path);
IOError::with_path(path.to_owned(), make_io_err(msg))
})?;
path
);
IOError::with_path(path.to_owned(), make_io_err(msg))
})?;
Ok(mmap_cache
.get_mmap(full_path)?
.map(MmapReadOnly::from)
.map(ReadOnlySource::Mmap)
.unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty())))
Ok(
mmap_cache
.get_mmap(full_path)?
.map(MmapReadOnly::from)
.map(ReadOnlySource::Mmap)
.unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty())),
)
}
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
debug!("Open Write {:?}", path);
let full_path = self.resolve_path(path);
let open_res = OpenOptions::new()
.write(true)
.create_new(true)
.open(full_path);
let open_res = OpenOptions::new().write(true).create_new(true).open(
full_path,
);
let mut file = open_res
.map_err(|err| if err.kind() == io::ErrorKind::AlreadyExists {
OpenWriteError::FileAlreadyExists(path.to_owned())
} else {
IOError::with_path(path.to_owned(), err).into()
})?;
let mut file = open_res.map_err(|err| if err.kind() ==
io::ErrorKind::AlreadyExists
{
OpenWriteError::FileAlreadyExists(path.to_owned())
} else {
IOError::with_path(path.to_owned(), err).into()
})?;
// making sure the file is created.
file.flush()
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
file.flush().map_err(
|e| IOError::with_path(path.to_owned(), e),
)?;
// Apparently, on some filesystems, syncing the parent
// directory is required.
self.sync_directory()
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
self.sync_directory().map_err(|e| {
IOError::with_path(path.to_owned(), e)
})?;
let writer = SafeFileWriter::new(file);
Ok(BufWriter::new(Box::new(writer)))
@@ -318,22 +328,23 @@ impl Directory for MmapDirectory {
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
debug!("Deleting file {:?}", path);
let full_path = self.resolve_path(path);
let mut mmap_cache = self.mmap_cache
.write()
.map_err(|_| {
let msg = format!("Failed to acquired write lock \
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
let msg = format!(
"Failed to acquired write lock \
on mmap cache while deleting {:?}",
path);
IOError::with_path(path.to_owned(), make_io_err(msg))
})?;
path
);
IOError::with_path(path.to_owned(), make_io_err(msg))
})?;
// Removing the entry in the MMap cache.
// The munmap will appear on Drop,
// when the last reference is gone.
mmap_cache.cache.remove(&full_path);
match fs::remove_file(&full_path) {
Ok(_) => {
self.sync_directory()
.map_err(|e| IOError::with_path(path.to_owned(), e).into())
self.sync_directory().map_err(|e| {
IOError::with_path(path.to_owned(), e).into()
})
}
Err(e) => {
if e.kind() == io::ErrorKind::NotFound {
@@ -355,8 +366,9 @@ impl Directory for MmapDirectory {
let mut buffer = Vec::new();
match File::open(&full_path) {
Ok(mut file) => {
file.read_to_end(&mut buffer)
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
file.read_to_end(&mut buffer).map_err(|e| {
IOError::with_path(path.to_owned(), e)
})?;
Ok(buffer)
}
Err(e) => {

View File

@@ -13,14 +13,15 @@ mod managed_directory;
/// Errors specific to the directory module.
pub mod error;
use std::io::{Write, Seek};
use std::io::{Write, Seek, BufWriter};
use std::io::BufWriter;
pub use self::read_only_source::ReadOnlySource;
pub use self::directory::Directory;
pub use self::ram_directory::RAMDirectory;
pub use self::mmap_directory::MmapDirectory;
pub use self::managed_directory::{ManagedDirectory, FileProtection};
pub(crate) use self::read_only_source::SourceRead;
pub(crate) use self::managed_directory::{ManagedDirectory, FileProtection};
/// Synonym of Seek + Write
pub trait SeekableWrite: Seek + Write {}

View File

@@ -41,8 +41,10 @@ impl VecWriter {
impl Drop for VecWriter {
fn drop(&mut self) {
if !self.is_flushed {
panic!("You forgot to flush {:?} before its writter got Drop. Do not rely on drop.",
self.path)
panic!(
"You forgot to flush {:?} before its writter got Drop. Do not rely on drop.",
self.path
)
}
}
}
@@ -62,8 +64,10 @@ impl Write for VecWriter {
fn flush(&mut self) -> io::Result<()> {
self.is_flushed = true;
try!(self.shared_directory
.write(self.path.clone(), self.data.get_ref()));
try!(self.shared_directory.write(
self.path.clone(),
self.data.get_ref(),
));
Ok(())
}
}
@@ -79,11 +83,11 @@ impl InnerDirectory {
}
fn write(&self, path: PathBuf, data: &[u8]) -> io::Result<bool> {
let mut map = try!(self.0
.write()
.map_err(|_| {
make_io_err(format!("Failed to lock the directory, when trying to write {:?}",
path))
let mut map = try!(self.0.write().map_err(|_| {
make_io_err(format!(
"Failed to lock the directory, when trying to write {:?}",
path
))
}));
let prev_value = map.insert(path, Arc::new(Vec::from(data)));
Ok(prev_value.is_some())
@@ -93,17 +97,21 @@ impl InnerDirectory {
self.0
.read()
.map_err(|_| {
let msg = format!("Failed to acquire read lock for the \
let msg = format!(
"Failed to acquire read lock for the \
directory when trying to read {:?}",
path);
let io_err = make_io_err(msg);
OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err))
})
path
);
let io_err = make_io_err(msg);
OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err))
})
.and_then(|readable_map| {
readable_map
.get(path)
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
.map(|data| ReadOnlySource::Anonymous(SharedVecSlice::new(data.clone())))
.map(|data| {
ReadOnlySource::Anonymous(SharedVecSlice::new(data.clone()))
})
})
}
@@ -111,16 +119,18 @@ impl InnerDirectory {
self.0
.write()
.map_err(|_| {
let msg = format!("Failed to acquire write lock for the \
let msg = format!(
"Failed to acquire write lock for the \
directory when trying to delete {:?}",
path);
let io_err = make_io_err(msg);
DeleteError::IOError(IOError::with_path(path.to_owned(), io_err))
})
path
);
let io_err = make_io_err(msg);
DeleteError::IOError(IOError::with_path(path.to_owned(), io_err))
})
.and_then(|mut writable_map| match writable_map.remove(path) {
Some(_) => Ok(()),
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
})
Some(_) => Ok(()),
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
})
}
fn exists(&self, path: &Path) -> bool {
@@ -164,9 +174,11 @@ impl Directory for RAMDirectory {
let path_buf = PathBuf::from(path);
let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
let exists = self.fs
.write(path_buf.clone(), &Vec::new())
.map_err(|err| IOError::with_path(path.to_owned(), err))?;
let exists = self.fs.write(path_buf.clone(), &Vec::new()).map_err(
|err| {
IOError::with_path(path.to_owned(), err)
},
)?;
// force the creation of the file to mimic the MMap directory.
if exists {

View File

@@ -2,6 +2,8 @@ use fst::raw::MmapReadOnly;
use std::ops::Deref;
use super::shared_vec_slice::SharedVecSlice;
use common::HasLen;
use std::slice;
use std::io::{self, Read};
use stable_deref_trait::StableDeref;
/// Read object that represents files in tantivy.
@@ -41,6 +43,14 @@ impl ReadOnlySource {
}
}
/// Splits into 2 `ReadOnlySource`, at the offset given
/// as an argument.
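///
/// For instance, `source.split(4)` returns the same pair as
/// `(source.slice(0, 4), source.slice_from(4))`.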
pub fn split(self, addr: usize) -> (ReadOnlySource, ReadOnlySource) {
let left = self.slice(0, addr);
let right = self.slice_from(addr);
(left, right)
}
/// Creates a ReadOnlySource that is just a
/// view over a slice of the data.
///
@@ -62,6 +72,23 @@ impl ReadOnlySource {
}
}
}
/// Like `.slice(...)` but enforcing only the `from`
/// boundary.
///
/// Equivalent to `.slice(from_offset, self.len())`
pub fn slice_from(&self, from_offset: usize) -> ReadOnlySource {
let len = self.len();
self.slice(from_offset, len)
}
/// Like `.slice(...)` but enforcing only the `to`
/// boundary.
///
/// Equivalent to `.slice(0, to_offset)`
pub fn slice_to(&self, to_offset: usize) -> ReadOnlySource {
self.slice(0, to_offset)
}
}
impl HasLen for ReadOnlySource {
@@ -82,3 +109,42 @@ impl From<Vec<u8>> for ReadOnlySource {
ReadOnlySource::Anonymous(shared_data)
}
}
/// Acts as an owning cursor over the data backed by a `ReadOnlySource`.
pub(crate) struct SourceRead {
_data_owner: ReadOnlySource,
cursor: &'static [u8],
}
impl SourceRead {
// Advances the cursor by the given number of bytes.
pub fn advance(&mut self, len: usize) {
self.cursor = &self.cursor[len..];
}
}
impl AsRef<[u8]> for SourceRead {
fn as_ref(&self) -> &[u8] {
self.cursor
}
}
impl From<ReadOnlySource> for SourceRead {
// Creates a new `SourceRead` from a given `ReadOnlySource`
fn from(source: ReadOnlySource) -> SourceRead {
let len = source.len();
let slice_ptr = source.as_slice().as_ptr();
let static_slice = unsafe { slice::from_raw_parts(slice_ptr, len) };
SourceRead {
_data_owner: source,
cursor: static_slice,
}
}
}
impl Read for SourceRead {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
self.cursor.read(buf)
}
}
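
// A hedged sketch (not part of the original change) showing the cursor in
// action; `ReadOnlySource::from(Vec<u8>)` is the in-memory variant above.
#[cfg(test)]
mod source_read_sketch {
use super::*;
use std::io::Read;
#[test]
fn source_read_reads_from_the_start() {
let source = ReadOnlySource::from(vec![1u8, 2, 3, 4]);
let mut reader = SourceRead::from(source);
let mut buf = [0u8; 2];
// `read` advances the internal cursor by the number of bytes read.
assert_eq!(reader.read(&mut buf).unwrap(), 2);
assert_eq!(buf, [1u8, 2]);
}
}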

View File

@@ -10,6 +10,7 @@ use schema;
use fastfield::FastFieldNotAvailableError;
use serde_json;
error_chain!(
errors {
/// Path does not exist.
@@ -111,12 +112,9 @@ impl From<schema::DocParsingError> for Error {
impl From<OpenWriteError> for Error {
fn from(error: OpenWriteError) -> Error {
match error {
OpenWriteError::FileAlreadyExists(filepath) => {
ErrorKind::FileAlreadyExists(filepath)
}
OpenWriteError::IOError(io_error) => ErrorKind::IOError(io_error),
}
.into()
OpenWriteError::FileAlreadyExists(filepath) => ErrorKind::FileAlreadyExists(filepath),
OpenWriteError::IOError(io_error) => ErrorKind::IOError(io_error),
}.into()
}
}

View File

@@ -1,5 +1,5 @@
/*!
Fast fields is a column oriented storage storage.
Fast fields is a column oriented storage.
It is the equivalent of `Lucene`'s `DocValues`.
@@ -32,7 +32,7 @@ mod delete;
pub use self::delete::write_delete_bitset;
pub use self::delete::DeleteBitSet;
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
pub use self::reader::{FastFieldsReader, U64FastFieldReader, I64FastFieldReader};
pub use self::reader::{U64FastFieldReader, I64FastFieldReader};
pub use self::reader::FastFieldReader;
pub use self::serializer::FastFieldSerializer;
pub use self::error::{Result, FastFieldNotAvailableError};
@@ -51,6 +51,7 @@ mod tests {
use fastfield::FastFieldReader;
use rand::Rng;
use rand::SeedableRng;
use common::CompositeFile;
use rand::XorShiftRng;
lazy_static! {
@@ -84,7 +85,7 @@ mod tests {
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
add_single_field_doc(&mut fast_field_writers, *FIELD, 13u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 14u64);
@@ -94,12 +95,12 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 38 as usize);
assert_eq!(source.len(), 35 as usize);
}
{
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
let fast_field_reader: U64FastFieldReader =
fast_field_readers.open_reader(*FIELD).unwrap();
let composite_file = CompositeFile::open(source).unwrap();
let field_source = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader: U64FastFieldReader = U64FastFieldReader::open(field_source);
assert_eq!(fast_field_reader.get(0), 13u64);
assert_eq!(fast_field_reader.get(1), 14u64);
assert_eq!(fast_field_reader.get(2), 2u64);
@@ -112,7 +113,7 @@ mod tests {
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
add_single_field_doc(&mut fast_field_writers, *FIELD, 4u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 14_082_001u64);
@@ -128,12 +129,12 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 63 as usize);
assert_eq!(source.len(), 60 as usize);
}
{
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
let fast_fields_composite = CompositeFile::open(source).unwrap();
let fast_field_reader: U64FastFieldReader =
fast_field_readers.open_reader(*FIELD).unwrap();
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
assert_eq!(fast_field_reader.get(0), 4u64);
assert_eq!(fast_field_reader.get(1), 14_082_001u64);
assert_eq!(fast_field_reader.get(2), 3_052u64);
@@ -154,7 +155,7 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for _ in 0..10_000 {
add_single_field_doc(&mut fast_field_writers, *FIELD, 100_000u64);
@@ -164,12 +165,12 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 36 as usize);
assert_eq!(source.len(), 33 as usize);
}
{
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
let fast_fields_composite = CompositeFile::open(source).unwrap();
let fast_field_reader: U64FastFieldReader =
fast_field_readers.open_reader(*FIELD).unwrap();
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
for doc in 0..10_000 {
assert_eq!(fast_field_reader.get(doc), 100_000u64);
}
@@ -183,35 +184,39 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
// forcing the amplitude to be high
add_single_field_doc(&mut fast_field_writers, *FIELD, 0u64);
for i in 0u64..10_000u64 {
add_single_field_doc(&mut fast_field_writers,
*FIELD,
5_000_000_000_000_000_000u64 + i);
add_single_field_doc(
&mut fast_field_writers,
*FIELD,
5_000_000_000_000_000_000u64 + i,
);
}
fast_field_writers.serialize(&mut serializer).unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 80044 as usize);
assert_eq!(source.len(), 80041 as usize);
}
{
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
let fast_fields_composite = CompositeFile::open(source).unwrap();
let fast_field_reader: U64FastFieldReader =
fast_field_readers.open_reader(*FIELD).unwrap();
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
assert_eq!(fast_field_reader.get(0), 0u64);
for doc in 1..10_001 {
assert_eq!(fast_field_reader.get(doc),
5_000_000_000_000_000_000u64 + doc as u64 - 1u64);
assert_eq!(
fast_field_reader.get(doc),
5_000_000_000_000_000_000u64 + doc as u64 - 1u64
);
}
}
}
#[test]
fn test_signed_intfastfield() {
let path = Path::new("test");
@@ -222,7 +227,7 @@ mod tests {
let schema = schema_builder.build();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
for i in -100i64..10_000i64 {
let mut doc = Document::default();
@@ -234,17 +239,23 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 17711 as usize);
assert_eq!(source.len(), 17708 as usize);
}
{
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
let fast_fields_composite = CompositeFile::open(source).unwrap();
let fast_field_reader: I64FastFieldReader =
fast_field_readers.open_reader(i64_field).unwrap();
I64FastFieldReader::open(fast_fields_composite.open_read(i64_field).unwrap());
assert_eq!(fast_field_reader.min_value(), -100i64);
assert_eq!(fast_field_reader.max_value(), 9_999i64);
for (doc, i) in (-100i64..10_000i64).enumerate() {
assert_eq!(fast_field_reader.get(doc as u32), i);
}
let mut buffer = vec![0i64; 100];
fast_field_reader.get_range(53, &mut buffer[..]);
for i in 0..100 {
assert_eq!(buffer[i], -100i64 + 53i64 + i as i64);
}
}
}
@@ -258,7 +269,7 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
let doc = Document::default();
fast_field_writers.add_document(&doc);
@@ -268,9 +279,10 @@ mod tests {
let source = directory.open_read(&path).unwrap();
{
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
let fast_fields_composite = CompositeFile::open(source).unwrap();
let fast_field_reader: I64FastFieldReader =
fast_field_readers.open_reader(i64_field).unwrap();
I64FastFieldReader::open(fast_fields_composite.open_read(i64_field).unwrap());
assert_eq!(fast_field_reader.get(0u32), 0i64);
}
}
@@ -291,7 +303,7 @@ mod tests {
let mut directory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for x in &permutation {
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
@@ -301,9 +313,10 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
let fast_fields_composite = CompositeFile::open(source).unwrap();
let fast_field_reader: U64FastFieldReader =
fast_field_readers.open_reader(*FIELD).unwrap();
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
let mut a = 0u64;
for _ in 0..n {
assert_eq!(fast_field_reader.get(a as u32), permutation[a as usize]);
@@ -316,26 +329,26 @@ mod tests {
fn bench_intfastfield_linear_veclookup(b: &mut Bencher) {
let permutation = generate_permutation();
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u64;
for i in (0u32..n).step_by(7) {
a ^= permutation[i as usize];
}
a
});
let n = test::black_box(7000u32);
let mut a = 0u64;
for i in Iterator::step_by((0u32..n), 7) {
a ^= permutation[i as usize];
}
a
});
}
#[bench]
fn bench_intfastfield_veclookup(b: &mut Bencher) {
let permutation = generate_permutation();
b.iter(|| {
let n = test::black_box(1000u32);
let mut a = 0u64;
for _ in 0u32..n {
a = permutation[a as usize];
}
a
});
let n = test::black_box(1000u32);
let mut a = 0u64;
for _ in 0u32..n {
a = permutation[a as usize];
}
a
});
}
#[bench]
@@ -345,7 +358,7 @@ mod tests {
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for x in &permutation {
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
@@ -355,17 +368,19 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
let fast_fields_composite = CompositeFile::open(source).unwrap();
let fast_field_reader: U64FastFieldReader =
fast_field_readers.open_reader(*FIELD).unwrap();
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u64;
for i in (0u32..n).step_by(7) {
a ^= fast_field_reader.get(i);
}
a
});
let n = test::black_box(7000u32);
let mut a = 0u64;
for i in Iterator::step_by((0u32..n), 7) {
a ^= fast_field_reader.get(i);
}
a
});
}
}
@@ -376,7 +391,7 @@ mod tests {
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for x in &permutation {
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
@@ -386,17 +401,18 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
let fast_fields_composite = CompositeFile::open(source).unwrap();
let fast_field_reader: U64FastFieldReader =
fast_field_readers.open_reader(*FIELD).unwrap();
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
b.iter(|| {
let n = test::black_box(1000u32);
let mut a = 0u32;
for _ in 0u32..n {
a = fast_field_reader.get(a) as u32;
}
a
});
let n = test::black_box(1000u32);
let mut a = 0u32;
for _ in 0u32..n {
a = fast_field_reader.get(a) as u32;
}
a
});
}
}
}
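
Every test in this module migrates the same way: instead of a monolithic `FastFieldsReader::from_source` resolving per-field offsets itself, a generic `CompositeFile` now owns the field-to-slice mapping, and each reader opens a single slice. The read path, condensed from the tests above (error handling collapsed to `unwrap`):

```rust
let source = directory.open_read(Path::new("test")).unwrap();
let composite_file = CompositeFile::open(source).unwrap();
let field_source = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader: U64FastFieldReader = U64FastFieldReader::open(field_source);
assert_eq!(fast_field_reader.get(0), 13u64);
```
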

View File

@@ -1,19 +1,15 @@
use std::io;
use std::collections::HashMap;
use directory::ReadOnlySource;
use common::BinarySerializable;
use common::{self, BinarySerializable};
use common::bitpacker::{compute_num_bits, BitUnpacker};
use DocId;
use schema::{Field, SchemaBuilder};
use schema::SchemaBuilder;
use std::path::Path;
use schema::FAST;
use directory::{WritePtr, RAMDirectory, Directory};
use fastfield::FastFieldSerializer;
use fastfield::FastFieldsWriter;
use common::bitpacker::compute_num_bits;
use common::bitpacker::BitUnpacker;
use fastfield::{FastFieldSerializer, FastFieldsWriter};
use schema::FieldType;
use error::ResultExt;
use common;
use std::mem;
use common::CompositeFile;
use owning_ref::OwningRef;
/// Trait for accessing a fastfield.
@@ -27,8 +23,22 @@ pub trait FastFieldReader: Sized {
/// Return the value associated with the given document.
///
/// This accessor should return as fast as possible.
///
/// # Panics
///
/// May panic if `doc` is greater than or equal
/// to the segment's `maxdoc`.
fn get(&self, doc: DocId) -> Self::ValueType;
/// Fills an output buffer with the fast field values
/// associated with the `DocId` going from
/// `start` to `start + output.len()`.
///
/// # Panics
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
fn get_range(&self, start: u32, output: &mut [Self::ValueType]);
/// Opens a fast field given a source.
fn open(source: ReadOnlySource) -> Self;
@@ -80,6 +90,13 @@ impl FastFieldReader for U64FastFieldReader {
}
}
fn get_range(&self, start: u32, output: &mut [Self::ValueType]) {
self.bit_unpacker.get_range(start, output);
for out in output.iter_mut() {
*out += self.min_value;
}
}
/// Opens a new fast field reader given a read only source.
///
/// # Panics
@@ -89,10 +106,10 @@ impl FastFieldReader for U64FastFieldReader {
let amplitude: u64;
{
let mut cursor = data.as_slice();
min_value = u64::deserialize(&mut cursor)
.expect("Failed to read the min_value of fast field.");
amplitude = u64::deserialize(&mut cursor)
.expect("Failed to read the amplitude of fast field.");
min_value =
u64::deserialize(&mut cursor).expect("Failed to read the min_value of fast field.");
amplitude =
u64::deserialize(&mut cursor).expect("Failed to read the amplitude of fast field.");
}
let max_value = min_value + amplitude;
@@ -113,33 +130,36 @@ impl From<Vec<u64>> for U64FastFieldReader {
let mut schema_builder = SchemaBuilder::default();
let field = schema_builder.add_u64_field("field", FAST);
let schema = schema_builder.build();
let path = Path::new("test");
let path = Path::new("__dummy__");
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let write: WritePtr = directory.open_write(path).expect(
"With a RAMDirectory, this should never fail.",
);
let mut serializer = FastFieldSerializer::from_write(write).expect(
"With a RAMDirectory, this should never fail.",
);
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
for val in vals {
let mut fast_field_writer = fast_field_writers.get_field_writer(field).unwrap();
fast_field_writer.add_val(val);
{
let fast_field_writer = fast_field_writers.get_field_writer(field).expect(
"With a RAMDirectory, this should never fail.",
);
for val in vals {
fast_field_writer.add_val(val);
}
}
fast_field_writers.serialize(&mut serializer).unwrap();
serializer.close().unwrap();
}
directory
.open_read(path)
.chain_err(|| "Failed to open the file")
.and_then(|source| {
FastFieldsReader::from_source(source)
.chain_err(|| "Failed to read the file.")
})
.and_then(|ff_readers| {
ff_readers
.open_reader(field)
.ok_or_else(|| "Failed to find the requested field".into())
})
.expect("This should never happen, please report.")
let source = directory.open_read(path).expect("Failed to open the file");
let composite_file =
CompositeFile::open(source).expect("Failed to read the composite file");
let field_source = composite_file.open_read(field).expect(
"File component not found",
);
U64FastFieldReader::open(field_source)
}
}
@@ -181,6 +201,19 @@ impl FastFieldReader for I64FastFieldReader {
common::u64_to_i64(self.underlying.get(doc))
}
///
/// # Panics
///
/// May panic or return an arbitrary result if `doc`
/// is greater than or equal to the segment's `maxdoc`.
fn get_range(&self, start: u32, output: &mut [Self::ValueType]) {
let output_u64: &mut [u64] = unsafe { mem::transmute(output) };
self.underlying.get_range(start, output_u64);
for mut_val in output_u64.iter_mut() {
*mut_val = common::u64_to_i64(*mut_val as u64) as u64;
}
}
/// Opens a new fast field reader given a read only source.
///
/// # Panics
@@ -196,67 +229,3 @@ impl FastFieldReader for I64FastFieldReader {
}
}
}
/// The `FastFieldsReader` is the datastructure containing
/// all of the fast fields' data.
///
/// It contains a mapping that associates these fields with
/// the proper slice in the fastfield reader file.
pub struct FastFieldsReader {
source: ReadOnlySource,
field_offsets: HashMap<Field, (u32, u32)>,
}
impl FastFieldsReader {
/// Opens a `FastFieldsReader`
///
/// When opening the fast field reader, the
/// list of field offsets is read (as a footer of the
/// data file).
pub fn from_source(source: ReadOnlySource) -> io::Result<FastFieldsReader> {
let header_offset;
let field_offsets: Vec<(Field, u32)>;
{
let buffer = source.as_slice();
{
let mut cursor = buffer;
header_offset = u32::deserialize(&mut cursor)?;
}
{
let mut cursor = &buffer[header_offset as usize..];
field_offsets = Vec::deserialize(&mut cursor)?;
}
}
let mut end_offsets: Vec<u32> = field_offsets.iter().map(|&(_, offset)| offset).collect();
end_offsets.push(header_offset);
let mut field_offsets_map: HashMap<Field, (u32, u32)> = HashMap::new();
for (field_start_offsets, stop_offset) in
field_offsets.iter().zip(end_offsets.iter().skip(1)) {
let (field, start_offset) = *field_start_offsets;
field_offsets_map.insert(field, (start_offset, *stop_offset));
}
Ok(FastFieldsReader {
field_offsets: field_offsets_map,
source: source,
})
}
/// Returns the u64 fast value reader if the field
/// is a u64 field indexed as "fast".
///
/// Return None if the field is not a u64 field
/// indexed with the fast option.
///
/// # Panics
/// May panic if the index is corrupted.
pub fn open_reader<FFReader: FastFieldReader>(&self, field: Field) -> Option<FFReader> {
self.field_offsets
.get(&field)
.map(|&(start, stop)| {
let field_source = self.source.slice(start as usize, stop as usize);
FFReader::open(field_source)
})
}
}
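
`I64FastFieldReader` simply wraps the u64 reader and converts through `common::u64_to_i64`, so min/max computation and bit-packing only ever deal with u64. A self-contained sketch of the classic order-preserving mapping this presumably relies on (flip the sign bit; the actual `common` implementation may differ):

```rust
/// Map an i64 to a u64 such that the u64 ordering matches the i64 ordering.
fn i64_to_u64(val: i64) -> u64 {
    (val as u64) ^ (1u64 << 63)
}

/// Inverse mapping.
fn u64_to_i64(val: u64) -> i64 {
    (val ^ (1u64 << 63)) as i64
}

fn main() {
    assert_eq!(u64_to_i64(i64_to_u64(-100)), -100);
    // Ordering is preserved, so min_value/amplitude computed on the
    // u64 side stay meaningful for the i64 view.
    assert!(i64_to_u64(-1) < i64_to_u64(0));
    assert_eq!(i64_to_u64(i64::min_value()), 0);
}
```
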

View File

@@ -3,7 +3,8 @@ use directory::WritePtr;
use schema::Field;
use common::bitpacker::{compute_num_bits, BitPacker};
use common::CountingWriter;
use std::io::{self, Write, Seek, SeekFrom};
use common::CompositeWrite;
use std::io::{self, Write};
/// `FastFieldSerializer` is in charge of serializing
/// fastfields on disk.
@@ -26,51 +27,61 @@ use std::io::{self, Write, Seek, SeekFrom};
/// * `close_field()`
/// * `close()`
pub struct FastFieldSerializer {
write: CountingWriter<WritePtr>,
fields: Vec<(Field, u32)>,
min_value: u64,
field_open: bool,
bit_packer: BitPacker,
composite_write: CompositeWrite<WritePtr>,
}
impl FastFieldSerializer {
/// Constructor
pub fn new(write: WritePtr) -> io::Result<FastFieldSerializer> {
pub fn from_write(write: WritePtr) -> io::Result<FastFieldSerializer> {
// just making room for the pointer to header.
let mut counting_writer = CountingWriter::wrap(write);
0u32.serialize(&mut counting_writer)?;
Ok(FastFieldSerializer {
write: counting_writer,
fields: Vec::new(),
min_value: 0,
field_open: false,
bit_packer: BitPacker::new(0),
})
let composite_write = CompositeWrite::wrap(write);
Ok(FastFieldSerializer { composite_write: composite_write })
}
/// Start serializing a new u64 fast field
pub fn new_u64_fast_field(&mut self,
field: Field,
min_value: u64,
max_value: u64)
-> io::Result<()> {
if self.field_open {
return Err(io::Error::new(io::ErrorKind::Other, "Previous field not closed"));
}
self.min_value = min_value;
self.field_open = true;
self.fields.push((field, self.write.written_bytes() as u32));
let write = &mut self.write;
pub fn new_u64_fast_field(
&mut self,
field: Field,
min_value: u64,
max_value: u64,
) -> io::Result<FastSingleFieldSerializer<CountingWriter<WritePtr>>> {
let field_write = self.composite_write.for_field(field);
FastSingleFieldSerializer::open(field_write, min_value, max_value)
}
/// Closes the serializer
///
/// After this call the data must be persistently saved on disk.
pub fn close(self) -> io::Result<()> {
self.composite_write.close()
}
}
pub struct FastSingleFieldSerializer<'a, W: Write + 'a> {
bit_packer: BitPacker,
write: &'a mut W,
min_value: u64,
}
impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
fn open(
write: &'a mut W,
min_value: u64,
max_value: u64,
) -> io::Result<FastSingleFieldSerializer<'a, W>> {
min_value.serialize(write)?;
let amplitude = max_value - min_value;
amplitude.serialize(write)?;
let num_bits = compute_num_bits(amplitude);
self.bit_packer = BitPacker::new(num_bits as usize);
Ok(())
let bit_packer = BitPacker::new(num_bits as usize);
Ok(FastSingleFieldSerializer {
write: write,
bit_packer: bit_packer,
min_value: min_value,
})
}
/// Pushes a new value to the currently open u64 fast field.
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
let val_to_write: u64 = val - self.min_value;
@@ -78,33 +89,7 @@ impl FastFieldSerializer {
Ok(())
}
/// Close the u64 fast field.
pub fn close_field(&mut self) -> io::Result<()> {
if !self.field_open {
return Err(io::Error::new(io::ErrorKind::Other, "Current field is already closed"));
}
self.field_open = false;
// adding some padding to make sure we
// can read the last elements with our u64
// cursor
self.bit_packer.close(&mut self.write)?;
Ok(())
}
/// Closes the serializer
///
/// After this call the data must be persistently saved on disk.
pub fn close(self) -> io::Result<usize> {
if self.field_open {
return Err(io::Error::new(io::ErrorKind::Other, "Last field not closed"));
}
let header_offset: usize = self.write.written_bytes() as usize;
let (mut write, written_size) = self.write.finish()?;
self.fields.serialize(&mut write)?;
write.seek(SeekFrom::Start(0))?;
(header_offset as u32).serialize(&mut write)?;
write.flush()?;
Ok(written_size)
pub fn close_field(mut self) -> io::Result<()> {
self.bit_packer.close(&mut self.write)
}
}
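
The net effect of this refactoring: `FastFieldSerializer` is now a thin shell around `CompositeWrite`, and all per-field state (min value, bit packer) lives in a short-lived `FastSingleFieldSerializer`. The intended call sequence, sketched with the API above (error handling elided):

```rust
let mut serializer = FastFieldSerializer::from_write(write)?;
{
    // One FastSingleFieldSerializer per field; it borrows `serializer`
    // and `close_field` consumes it, ending the borrow.
    let mut single = serializer.new_u64_fast_field(field, 2u64, 14u64)?;
    single.add_val(13u64)?;
    single.add_val(14u64)?;
    single.add_val(2u64)?;
    single.close_field()?;
}
serializer.close()?;
```

Because `close_field` takes `self` by value, leaving a field open or closing it twice is now a compile error rather than the old runtime `field_open` check.
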

View File

@@ -58,9 +58,9 @@ impl FastFieldsWriter {
/// Get the `FastFieldWriter` associated to a field.
pub fn get_field_writer(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
// TODO optimize
self.field_writers
.iter_mut()
.find(|field_writer| field_writer.field == field)
self.field_writers.iter_mut().find(|field_writer| {
field_writer.field == field
})
}
@@ -155,9 +155,9 @@ impl IntFastFieldWriter {
/// associated with the document whose `DocId` is n.
/// (Well, `n-1` actually because of 0-indexing)
pub fn add_val(&mut self, val: u64) {
VInt(val)
.serialize(&mut self.vals)
.expect("unable to serialize VInt to Vec");
VInt(val).serialize(&mut self.vals).expect(
"unable to serialize VInt to Vec",
);
if val > self.val_max {
self.val_max = val;
@@ -208,13 +208,14 @@ impl IntFastFieldWriter {
(self.val_min, self.val_max)
};
serializer.new_u64_fast_field(self.field, min, max)?;
let mut single_field_serializer = serializer.new_u64_fast_field(self.field, min, max)?;
let mut cursor = self.vals.as_slice();
while let Ok(VInt(val)) = VInt::deserialize(&mut cursor) {
serializer.add_val(val)?;
single_field_serializer.add_val(val)?;
}
serializer.close_field()
single_field_serializer.close_field()
}
}
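
`IntFastFieldWriter` buffers incoming values as `VInt` in an in-memory `Vec<u8>` and only replays them into the serializer once min/max are known. Assuming `VInt` is the usual LEB128-style varint (7 payload bits per byte, high bit as continuation flag) — an assumption for illustration; check `common::VInt` for the actual format:

```rust
// Hypothetical re-implementation of a VInt-style encoder.
fn vint_encode(mut val: u64, out: &mut Vec<u8>) {
    loop {
        let byte = (val & 0x7F) as u8;
        val >>= 7;
        if val == 0 {
            out.push(byte);
            return;
        }
        out.push(byte | 0x80); // high bit set: more bytes follow
    }
}

fn main() {
    let mut buf = Vec::new();
    for v in [5u64, 127, 128].iter() {
        vint_encode(*v, &mut buf);
    }
    // Small values cost a single byte, which is why buffering VInts
    // beats a plain Vec<u64> while min/max are still unknown.
    assert_eq!(buf, vec![5u8, 127, 0x80, 0x01]);
}
```
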

View File

@@ -40,9 +40,9 @@ impl DeleteQueue {
{
let mut delete_queue_wlock = delete_queue.inner.write().unwrap();
delete_queue_wlock.last_block = Some(Arc::new(Block {
operations: Arc::default(),
next: next_block,
}));
operations: Arc::default(),
next: next_block,
}));
}
delete_queue
@@ -59,9 +59,11 @@ impl DeleteQueue {
.expect("Read lock poisoned when opening delete queue cursor")
.last_block
.clone()
.expect("Failed to unwrap last_block. This should never happen
.expect(
"Failed to unwrap last_block. This should never happen
as the Option<> is only here to make
initialization possible");
initialization possible",
);
let operations_len = last_block.operations.len();
DeleteCursor {
block: last_block,
@@ -92,9 +94,9 @@ impl DeleteQueue {
// be some unflushed operations.
//
fn flush(&self) -> Option<Arc<Block>> {
let mut self_wlock = self.inner
.write()
.expect("Failed to acquire write lock on delete queue writer");
let mut self_wlock = self.inner.write().expect(
"Failed to acquire write lock on delete queue writer",
);
let delete_operations;
{
@@ -108,9 +110,9 @@ impl DeleteQueue {
let next_block = NextBlock::from(self.clone());
{
self_wlock.last_block = Some(Arc::new(Block {
operations: Arc::new(delete_operations),
next: next_block,
}));
operations: Arc::new(delete_operations),
next: next_block,
}));
}
self_wlock.last_block.clone()
}
@@ -132,18 +134,18 @@ impl From<DeleteQueue> for NextBlock {
impl NextBlock {
fn next_block(&self) -> Option<Arc<Block>> {
{
let next_read_lock = self.0
.read()
.expect("Failed to acquire write lock in delete queue");
let next_read_lock = self.0.read().expect(
"Failed to acquire write lock in delete queue",
);
if let InnerNextBlock::Closed(ref block) = *next_read_lock {
return Some(block.clone());
}
}
let next_block;
{
let mut next_write_lock = self.0
.write()
.expect("Failed to acquire write lock in delete queue");
let mut next_write_lock = self.0.write().expect(
"Failed to acquire write lock in delete queue",
);
match *next_write_lock {
InnerNextBlock::Closed(ref block) => {
return Some(block.clone());

View File

@@ -56,8 +56,10 @@ mod tests {
#[test]
fn test_doc_to_opstamp_mapping_none() {
let doc_to_opstamp_mapping = DocToOpstampMapping::None;
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(1),
u32::max_value());
assert_eq!(
doc_to_opstamp_mapping.compute_doc_limit(1),
u32::max_value()
);
}
#[test]

View File

@@ -14,6 +14,7 @@ use Directory;
use fastfield::write_delete_bitset;
use indexer::delete_queue::{DeleteCursor, DeleteQueue};
use futures::Canceled;
use datastruct::stacker::hashmap::split_memory;
use futures::Future;
use indexer::doc_opstamp_mapping::DocToOpstampMapping;
use indexer::MergePolicy;
@@ -101,17 +102,20 @@ impl !Sync for IndexWriter {}
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// # Panics
/// If the heap size per thread is too small, panics.
pub fn open_index_writer(index: &Index,
num_threads: usize,
heap_size_in_bytes_per_thread: usize)
-> Result<IndexWriter> {
pub fn open_index_writer(
index: &Index,
num_threads: usize,
heap_size_in_bytes_per_thread: usize,
) -> Result<IndexWriter> {
if heap_size_in_bytes_per_thread <= HEAP_SIZE_LIMIT as usize {
panic!(format!("The heap size per thread needs to be at least {}.",
HEAP_SIZE_LIMIT));
panic!(format!(
"The heap size per thread needs to be at least {}.",
HEAP_SIZE_LIMIT
));
}
let directory_lock = try!(DirectoryLock::lock(index.directory().box_clone()));
let directory_lock = DirectoryLock::lock(index.directory().box_clone())?;
let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) =
chan::sync(PIPELINE_MAX_SIZE_IN_DOCS);
@@ -149,18 +153,19 @@ pub fn open_index_writer(index: &Index,
worker_id: 0,
};
try!(index_writer.start_workers());
index_writer.start_workers()?;
Ok(index_writer)
}
pub fn compute_deleted_bitset(delete_bitset: &mut BitSet,
segment_reader: &SegmentReader,
delete_cursor: &mut DeleteCursor,
doc_opstamps: &DocToOpstampMapping,
target_opstamp: u64)
-> Result<bool> {
pub fn compute_deleted_bitset(
delete_bitset: &mut BitSet,
segment_reader: &SegmentReader,
delete_cursor: &mut DeleteCursor,
doc_opstamps: &DocToOpstampMapping,
target_opstamp: u64,
) -> Result<bool> {
let mut might_have_changed = false;
@@ -176,8 +181,12 @@ pub fn compute_deleted_bitset(delete_bitset: &mut BitSet,
// Limit doc helps identify the first document
// that may be affected by the delete operation.
let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp);
if let Some(mut docset) =
segment_reader.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) {
let inverted_index = segment_reader.inverted_index(delete_op.term.field());
if let Some(mut docset) = inverted_index.read_postings(
&delete_op.term,
SegmentPostingsOption::NoFreq,
)
{
while docset.advance() {
let deleted_doc = docset.doc();
if deleted_doc < limit_doc {
@@ -197,10 +206,11 @@ pub fn compute_deleted_bitset(delete_bitset: &mut BitSet,
/// Advance deletes for the given segment up
/// to the target opstamp.
pub fn advance_deletes(mut segment: Segment,
segment_entry: &mut SegmentEntry,
target_opstamp: u64)
-> Result<Option<FileProtection>> {
pub fn advance_deletes(
mut segment: Segment,
segment_entry: &mut SegmentEntry,
target_opstamp: u64,
) -> Result<Option<FileProtection>> {
let mut file_protect: Option<FileProtection> = None;
@@ -221,11 +231,13 @@ pub fn advance_deletes(mut segment: Segment,
let delete_cursor = segment_entry.delete_cursor();
compute_deleted_bitset(&mut delete_bitset,
&segment_reader,
delete_cursor,
&DocToOpstampMapping::None,
target_opstamp)?;
compute_deleted_bitset(
&mut delete_bitset,
&segment_reader,
delete_cursor,
&DocToOpstampMapping::None,
target_opstamp,
)?;
for doc in 0u32..max_doc {
if segment_reader.is_deleted(doc) {
@@ -246,25 +258,29 @@ pub fn advance_deletes(mut segment: Segment,
Ok(file_protect)
}
fn index_documents(heap: &mut Heap,
segment: Segment,
schema: &Schema,
generation: usize,
document_iterator: &mut Iterator<Item = AddOperation>,
segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor)
-> Result<bool> {
fn index_documents(
heap: &mut Heap,
table_size: usize,
segment: Segment,
schema: &Schema,
generation: usize,
document_iterator: &mut Iterator<Item = AddOperation>,
segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor,
) -> Result<bool> {
heap.clear();
let segment_id = segment.id();
let mut segment_writer = SegmentWriter::for_segment(heap, segment.clone(), schema)?;
let mut segment_writer = SegmentWriter::for_segment(heap, table_size, segment.clone(), schema)?;
for doc in document_iterator {
try!(segment_writer.add_document(&doc, schema));
// There are two possible conditions to close the segment.
// One is that the memory arena dedicated to the segment is
// getting full.
if segment_writer.is_buffer_full() {
info!("Buffer limit reached, flushing segment with maxdoc={}.",
segment_writer.max_doc());
info!(
"Buffer limit reached, flushing segment with maxdoc={}.",
segment_writer.max_doc()
);
break;
}
// The second is the term dictionary hash table
@@ -273,8 +289,10 @@ fn index_documents(heap: &mut Heap,
// Tantivy does not resize its hashtable. When it reaches
// capacity, we just stop indexing new documents.
if segment_writer.is_term_saturated() {
info!("Term dic saturated, flushing segment with maxdoc={}.",
segment_writer.max_doc());
info!(
"Term dic saturated, flushing segment with maxdoc={}.",
segment_writer.max_doc()
);
break;
}
}
@@ -294,11 +312,13 @@ fn index_documents(heap: &mut Heap,
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
let segment_reader = SegmentReader::open(segment)?;
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
let may_have_deletes = compute_deleted_bitset(&mut deleted_bitset,
&segment_reader,
&mut delete_cursor,
&doc_to_opstamps,
last_docstamp)?;
let may_have_deletes = compute_deleted_bitset(
&mut deleted_bitset,
&segment_reader,
&mut delete_cursor,
&doc_to_opstamps,
last_docstamp,
)?;
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, {
if may_have_deletes {
@@ -312,7 +332,6 @@ fn index_documents(heap: &mut Heap,
}
impl IndexWriter {
/// The index writer
pub fn wait_merging_threads(mut self) -> Result<()> {
@@ -321,20 +340,20 @@ impl IndexWriter {
// dropping the last reference to the segment_updater.
drop(self.document_sender);
let former_workers_handles = mem::replace(&mut self.workers_join_handle, vec![]);
for join_handle in former_workers_handles {
join_handle
.join()
.expect("Indexing Worker thread panicked")
.chain_err(|| ErrorKind::ErrorInThread("Error in indexing worker thread.".into()))?;
.chain_err(|| {
ErrorKind::ErrorInThread("Error in indexing worker thread.".into())
})?;
}
drop(self.workers_join_handle);
let result =
self.segment_updater
.wait_merging_thread()
.chain_err(|| ErrorKind::ErrorInThread("Failed to join merging thread.".into()));
let result = self.segment_updater.wait_merging_thread().chain_err(|| {
ErrorKind::ErrorInThread("Failed to join merging thread.".into())
});
if let Err(ref e) = result {
error!("Some merging thread failed {:?}", e);
@@ -347,8 +366,10 @@ impl IndexWriter {
pub fn add_segment(&mut self, segment_meta: SegmentMeta) {
let delete_cursor = self.delete_queue.cursor();
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None);
self.segment_updater
.add_segment(self.generation, segment_entry);
self.segment_updater.add_segment(
self.generation,
segment_entry,
);
}
#[doc(hidden)]
@@ -363,14 +384,20 @@ impl IndexWriter {
let schema = self.index.schema();
let document_receiver_clone = self.document_receiver.clone();
let mut segment_updater = self.segment_updater.clone();
let mut heap = Heap::with_capacity(self.heap_size_in_bytes_per_thread);
let (heap_size, table_size) = split_memory(self.heap_size_in_bytes_per_thread);
info!("heap size {}, table_size {}", heap_size, table_size);
let mut heap = Heap::with_capacity(heap_size);
let generation = self.generation;
let mut delete_cursor = self.delete_queue.cursor();
let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
.name(format!("indexing thread {} for gen {}", self.worker_id, generation))
.name(format!(
"indexing thread {} for gen {}",
self.worker_id,
generation
))
.spawn(move || {
loop {
@@ -394,13 +421,16 @@ impl IndexWriter {
return Ok(());
}
let segment = segment_updater.new_segment();
index_documents(&mut heap,
segment,
&schema,
generation,
&mut document_iterator,
&mut segment_updater,
delete_cursor.clone())?;
index_documents(
&mut heap,
table_size,
segment,
&schema,
generation,
&mut document_iterator,
&mut segment_updater,
delete_cursor.clone(),
)?;
}
})?;
@@ -433,9 +463,10 @@ impl IndexWriter {
}
/// Merges a given list of segments
pub fn merge(&mut self,
segment_ids: &[SegmentId])
-> impl Future<Item = SegmentMeta, Error = Canceled> {
pub fn merge(
&mut self,
segment_ids: &[SegmentId],
) -> impl Future<Item = SegmentMeta, Error = Canceled> {
self.segment_updater.start_merge(segment_ids)
}
@@ -519,14 +550,15 @@ impl IndexWriter {
self.recreate_document_channel();
let mut former_workers_join_handle = Vec::new();
swap(&mut former_workers_join_handle,
&mut self.workers_join_handle);
swap(
&mut former_workers_join_handle,
&mut self.workers_join_handle,
);
for worker_handle in former_workers_join_handle {
let indexing_worker_result =
worker_handle
.join()
.map_err(|e| Error::from_kind(ErrorKind::ErrorInThread(format!("{:?}", e))))?;
let indexing_worker_result = worker_handle.join().map_err(|e| {
Error::from_kind(ErrorKind::ErrorInThread(format!("{:?}", e)))
})?;
indexing_worker_result?;
// add a new worker for the next generation.
@@ -620,13 +652,17 @@ mod tests {
let schema_builder = schema::SchemaBuilder::default();
let index = Index::create_in_ram(schema_builder.build());
let index_writer = index.writer(40_000_000).unwrap();
assert_eq!(format!("{:?}", index_writer.get_merge_policy()),
"LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \
level_log_size: 0.75 }");
assert_eq!(
format!("{:?}", index_writer.get_merge_policy()),
"LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \
level_log_size: 0.75 }"
);
let merge_policy = box NoMergePolicy::default();
index_writer.set_merge_policy(merge_policy);
assert_eq!(format!("{:?}", index_writer.get_merge_policy()),
"NoMergePolicy");
assert_eq!(
format!("{:?}", index_writer.get_merge_policy()),
"NoMergePolicy"
);
}
#[test]
@@ -716,9 +752,9 @@ mod tests {
}
// this should create 8 segments and trigger a merge.
index_writer.commit().expect("commit failed");
index_writer
.wait_merging_threads()
.expect("waiting merging thread failed");
index_writer.wait_merging_threads().expect(
"waiting merging thread failed",
);
index.load_searchers().unwrap();
assert_eq!(num_docs_containing("a"), 200);
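
The worker thread now splits its per-thread budget between the indexing arena (`Heap`) and the term hashtable via `split_memory`, instead of handing everything to the heap. The diff does not show `split_memory`'s body; a plausible sketch of such a split (purely illustrative — the real heuristic in `datastruct::stacker::hashmap` may differ):

```rust
fn prev_power_of_two(n: usize) -> usize {
    let mut p = 1usize;
    while p * 2 <= n {
        p *= 2;
    }
    p
}

/// Hypothetical split: give the table about a third of the budget,
/// rounded down to a power of two, and the rest to the heap.
fn split_memory(per_thread_budget: usize) -> (usize, usize) {
    let table_size = prev_power_of_two(per_thread_budget / 3);
    (per_thread_budget - table_size, table_size)
}

fn main() {
    let (heap_size, table_size) = split_memory(40_000_000);
    assert_eq!(heap_size + table_size, 40_000_000);
    println!("heap size {}, table_size {}", heap_size, table_size);
}
```
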

View File

@@ -1,4 +1,3 @@
extern crate itertools;
use super::merge_policy::{MergePolicy, MergeCandidate};
use core::SegmentMeta;
use std::cmp;
@@ -58,11 +57,13 @@ impl MergePolicy for LogMergePolicy {
.enumerate()
.collect::<Vec<(usize, u32)>>();
size_sorted_tuples.sort_by(|x, y| y.cmp(x));
size_sorted_tuples.sort_by(|x, y| y.1.cmp(&(x.1)));
let size_sorted_log_tuples: Vec<_> = size_sorted_tuples
.into_iter()
.map(|(ind, num_docs)| (ind, (self.clip_min_size(num_docs) as f64).log2()))
.map(|(ind, num_docs)| {
(ind, (self.clip_min_size(num_docs) as f64).log2())
})
.collect();
let (first_ind, first_score) = size_sorted_log_tuples[0];
@@ -79,7 +80,9 @@ impl MergePolicy for LogMergePolicy {
levels
.iter()
.filter(|level| level.len() >= self.min_merge_size)
.map(|ind_vec| MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect()))
.map(|ind_vec| {
MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect())
})
.collect()
}
@@ -134,12 +137,23 @@ mod tests {
#[test]
fn test_log_merge_policy_levels() {
// multiple levels all get merged correctly
let test_input = vec![seg_meta(10),
seg_meta(10),
seg_meta(10),
seg_meta(1000),
seg_meta(1000),
seg_meta(1000)];
// 2 MergeCandidates expected:
// * one with the 6 * 10-docs segments
// * one with the 3 * 1000-docs segments
// no MergeCandidate expected for the 2 * 10_000-docs segments as min_merge_size=3
let test_input = vec![
seg_meta(10),
seg_meta(10),
seg_meta(10),
seg_meta(1000),
seg_meta(1000),
seg_meta(1000),
seg_meta(10000),
seg_meta(10000),
seg_meta(10),
seg_meta(10),
seg_meta(10),
];
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 2);
}
@@ -147,24 +161,28 @@ mod tests {
#[test]
fn test_log_merge_policy_within_levels() {
// multiple levels all get merged correctly
let test_input = vec![seg_meta(10),
seg_meta(11),
seg_meta(12),
seg_meta(1000),
seg_meta(1000),
seg_meta(1000)];
let test_input = vec![
seg_meta(10), // log2(10) = ~3.32 (> 3.58 - 0.75)
seg_meta(11), // log2(11) = ~3.46
seg_meta(12), // log2(12) = ~3.58
seg_meta(800), // log2(800) = ~9.64 (> 9.97 - 0.75)
seg_meta(1000), // log2(1000) = ~9.97
seg_meta(1000), // log2(1000) = ~9.97
];
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 2);
}
#[test]
fn test_log_merge_policy_small_segments() {
// multiple levels all get merged correctly
let test_input = vec![seg_meta(1),
seg_meta(1),
seg_meta(1),
seg_meta(2),
seg_meta(2),
seg_meta(2)];
// segments under min_layer_size are merged together
let test_input = vec![
seg_meta(1),
seg_meta(1),
seg_meta(1),
seg_meta(2),
seg_meta(2),
seg_meta(2),
];
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 1);
}
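
The comments added to these tests spell out the leveling rule: segments are sorted by size, sizes are clipped at `min_layer_size` and mapped to log2, and a segment joins the current level while its log-size stays within `level_log_size` of the level head; any level with at least `min_merge_size` segments becomes a `MergeCandidate`. A standalone sketch of that grouping, re-derived from the tests (with assumed parameter values; the real policy is in this file):

```rust
fn merge_levels(mut sizes: Vec<u32>, min_merge_size: usize,
                min_layer_size: u32, level_log_size: f64) -> Vec<Vec<u32>> {
    sizes.sort_by(|a, b| b.cmp(a)); // largest first
    let mut levels: Vec<Vec<u32>> = Vec::new();
    let mut head_log = ::std::f64::MAX;
    for size in sizes {
        let log = f64::from(size.max(min_layer_size)).log2();
        if log <= head_log - level_log_size {
            head_log = log; // too far below the head: start a new level
            levels.push(Vec::new());
        }
        levels.last_mut().unwrap().push(size);
    }
    levels.retain(|level| level.len() >= min_merge_size);
    levels
}

fn main() {
    // Same shape as test_log_merge_policy_levels, with assumed knobs:
    // min_merge_size=3, min_layer_size=0, level_log_size=0.75.
    let input = vec![10, 10, 10, 1000, 1000, 1000, 10000, 10000, 10, 10, 10];
    let candidates = merge_levels(input, 3, 0, 0.75);
    assert_eq!(candidates.len(), 2); // six 10s and three 1000s
}
```
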

View File

@@ -5,7 +5,7 @@ use DocId;
use core::SerializableSegment;
use schema::FieldValue;
use indexer::SegmentSerializer;
use postings::PostingsSerializer;
use postings::InvertedIndexSerializer;
use fastfield::U64FastFieldReader;
use itertools::Itertools;
use postings::Postings;
@@ -17,9 +17,9 @@ use fastfield::FastFieldSerializer;
use fastfield::FastFieldReader;
use store::StoreWriter;
use std::cmp::{min, max};
use termdict::TermDictionary;
use schema::Term;
use termdict::TermStreamer;
use postings::SegmentPostingsOption;
pub struct IndexMerger {
schema: Schema,
@@ -28,33 +28,11 @@ pub struct IndexMerger {
}
struct DeltaPositionComputer {
buffer: Vec<u32>,
}
impl DeltaPositionComputer {
fn new() -> DeltaPositionComputer {
DeltaPositionComputer { buffer: vec![0u32; 512] }
}
fn compute_delta_positions(&mut self, positions: &[u32]) -> &[u32] {
if positions.len() > self.buffer.len() {
self.buffer.resize(positions.len(), 0u32);
}
let mut last_pos = 0u32;
for (i, position) in positions.iter().cloned().enumerate() {
self.buffer[i] = position - last_pos;
last_pos = position;
}
&self.buffer[..positions.len()]
}
}
fn compute_min_max_val(u64_reader: &U64FastFieldReader,
max_doc: DocId,
delete_bitset: &DeleteBitSet)
-> Option<(u64, u64)> {
fn compute_min_max_val(
u64_reader: &U64FastFieldReader,
max_doc: DocId,
delete_bitset: &DeleteBitSet,
) -> Option<(u64, u64)> {
if max_doc == 0 {
None
} else if !delete_bitset.has_deletes() {
@@ -72,18 +50,46 @@ fn compute_min_max_val(u64_reader: &U64FastFieldReader,
}
}
fn extract_fieldnorm_reader(segment_reader: &SegmentReader,
field: Field)
-> Option<U64FastFieldReader> {
fn extract_fieldnorm_reader(
segment_reader: &SegmentReader,
field: Field,
) -> Option<U64FastFieldReader> {
segment_reader.get_fieldnorms_reader(field)
}
fn extract_fast_field_reader(segment_reader: &SegmentReader,
field: Field)
-> Option<U64FastFieldReader> {
segment_reader.fast_fields_reader().open_reader(field)
fn extract_fast_field_reader(
segment_reader: &SegmentReader,
field: Field,
) -> Option<U64FastFieldReader> {
segment_reader.get_fast_field_reader(field).ok()
}
struct DeltaComputer {
buffer: Vec<u32>,
}
impl DeltaComputer {
fn new() -> DeltaComputer {
DeltaComputer { buffer: vec![0u32; 512] }
}
fn compute_delta(&mut self, positions: &[u32]) -> &[u32] {
if positions.len() > self.buffer.len() {
self.buffer.resize(positions.len(), 0u32);
}
let mut last_pos = 0u32;
let num_positions = positions.len();
for i in 0..num_positions {
let cur_pos = positions[i];
self.buffer[i] = cur_pos - last_pos;
last_pos = cur_pos;
}
&self.buffer[..positions.len()]
}
}
impl IndexMerger {
pub fn open(schema: Schema, segments: &[Segment]) -> Result<IndexMerger> {
let mut readers = vec![];
@@ -96,10 +102,10 @@ impl IndexMerger {
}
}
Ok(IndexMerger {
schema: schema,
readers: readers,
max_doc: max_doc,
})
schema: schema,
readers: readers,
max_doc: max_doc,
})
}
fn write_fieldnorms(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
@@ -110,9 +116,11 @@ impl IndexMerger {
.filter(|&(_, field_entry)| field_entry.is_indexed())
.map(|(field_id, _)| Field(field_id as u32))
.collect();
self.generic_write_fast_field(fieldnorm_fastfields,
&extract_fieldnorm_reader,
fast_field_serializer)
self.generic_write_fast_field(
fieldnorm_fastfields,
&extract_fieldnorm_reader,
fast_field_serializer,
)
}
fn write_fast_fields(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
@@ -123,19 +131,21 @@ impl IndexMerger {
.filter(|&(_, field_entry)| field_entry.is_int_fast())
.map(|(field_id, _)| Field(field_id as u32))
.collect();
self.generic_write_fast_field(fast_fields,
&extract_fast_field_reader,
fast_field_serializer)
self.generic_write_fast_field(
fast_fields,
&extract_fast_field_reader,
fast_field_serializer,
)
}
// used both to merge field norms and regular u64 fast fields.
fn generic_write_fast_field(&self,
fields: Vec<Field>,
field_reader_extractor: &Fn(&SegmentReader, Field)
-> Option<U64FastFieldReader>,
fast_field_serializer: &mut FastFieldSerializer)
-> Result<()> {
fn generic_write_fast_field(
&self,
fields: Vec<Field>,
field_reader_extractor: &Fn(&SegmentReader, Field) -> Option<U64FastFieldReader>,
fast_field_serializer: &mut FastFieldSerializer,
) -> Result<()> {
for field in fields {
@@ -147,19 +157,25 @@ impl IndexMerger {
match field_reader_extractor(reader, field) {
Some(u64_reader) => {
if let Some((seg_min_val, seg_max_val)) =
compute_min_max_val(&u64_reader,
reader.max_doc(),
reader.delete_bitset()) {
compute_min_max_val(
&u64_reader,
reader.max_doc(),
reader.delete_bitset(),
)
{
// the segment has some non-deleted documents
min_val = min(min_val, seg_min_val);
max_val = max(max_val, seg_max_val);
u64_readers
.push((reader.max_doc(), u64_reader, reader.delete_bitset()));
u64_readers.push((
reader.max_doc(),
u64_reader,
reader.delete_bitset(),
));
}
}
None => {
let error_msg = format!("Failed to find a u64_reader for field {:?}",
field);
let error_msg =
format!("Failed to find a u64_reader for field {:?}", field);
error!("{}", error_msg);
bail!(ErrorKind::SchemaError(error_msg));
}
@@ -174,50 +190,68 @@ impl IndexMerger {
assert!(min_val <= max_val);
fast_field_serializer
.new_u64_fast_field(field, min_val, max_val)?;
let mut fast_single_field_serializer = fast_field_serializer.new_u64_fast_field(
field,
min_val,
max_val,
)?;
for (max_doc, u64_reader, delete_bitset) in u64_readers {
for doc_id in 0..max_doc {
if !delete_bitset.is_deleted(doc_id) {
let val = u64_reader.get(doc_id);
fast_field_serializer.add_val(val)?;
fast_single_field_serializer.add_val(val)?;
}
}
}
fast_field_serializer.close_field()?;
fast_single_field_serializer.close_field()?;
}
Ok(())
}
fn write_postings(&self, serializer: &mut PostingsSerializer) -> Result<()> {
fn write_postings(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> {
let mut merged_terms = TermMerger::from(&self.readers[..]);
let mut delta_position_computer = DeltaPositionComputer::new();
let mut delta_computer = DeltaComputer::new();
let mut max_doc = 0;
// map from segment doc ids to the resulting merged segment doc id.
let mut merged_doc_id_map: Vec<Vec<Option<DocId>>> = Vec::with_capacity(self.readers.len());
for reader in &self.readers {
let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize);
for doc_id in 0..reader.max_doc() {
if reader.is_deleted(doc_id) {
segment_local_map.push(None);
} else {
segment_local_map.push(Some(max_doc));
max_doc += 1u32;
}
let mut indexed_fields = vec![];
for (field_ord, field_entry) in self.schema.fields().iter().enumerate() {
if field_entry.is_indexed() {
indexed_fields.push(Field(field_ord as u32));
}
merged_doc_id_map.push(segment_local_map);
}
let mut last_field: Option<Field> = None;
for indexed_field in indexed_fields {
let mut segment_postings_option = SegmentPostingsOption::FreqAndPositions;
let field_readers = self.readers
.iter()
.map(|reader| reader.inverted_index(indexed_field))
.collect::<Vec<_>>();
while merged_terms.advance() {
let field_term_streams = field_readers
.iter()
.map(|field_reader| field_reader.terms().stream())
.collect();
let mut merged_terms = TermMerger::new(field_term_streams);
let mut max_doc = 0;
// map from segment doc ids to the resulting merged segment doc id.
let mut merged_doc_id_map: Vec<Vec<Option<DocId>>> =
Vec::with_capacity(self.readers.len());
for reader in &self.readers {
let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize);
for doc_id in 0..reader.max_doc() {
if reader.is_deleted(doc_id) {
segment_local_map.push(None);
} else {
segment_local_map.push(Some(max_doc));
max_doc += 1u32;
}
}
merged_doc_id_map.push(segment_local_map);
}
// Create the total list of doc ids
// by stacking the doc ids from the different segments.
@@ -229,85 +263,92 @@ impl IndexMerger {
// - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc,
// seg0.max_doc + seg1.max_doc + seg2.max_doc]
// ...
let term = Term::wrap(merged_terms.key());
let current_field = term.field();
if last_field != Some(current_field) {
// we reached a new field.
let field_entry = self.schema.get_field_entry(current_field);
// ... set segment postings option the new field.
segment_postings_option = field_entry
.field_type()
.get_segment_postings_option()
.expect("Encounterred a field that is not supposed to be
indexed. Have you modified the index?");
last_field = Some(current_field);
let mut field_serializer = serializer.new_field(indexed_field)?;
// it is perfectly safe to call `.new_field`
// even if there is no postings associated.
serializer.new_field(current_field);
}
let field_entry = self.schema.get_field_entry(indexed_field);
// Let's compute the list of non-empty posting lists
let segment_postings: Vec<_> = merged_terms
.current_kvs()
.iter()
.flat_map(|heap_item| {
let segment_ord = heap_item.segment_ord;
let term_info = heap_item.streamer.value();
let segment_reader = &self.readers[heap_item.segment_ord];
let mut segment_postings =
segment_reader
.read_postings_from_terminfo(term_info, segment_postings_option);
if segment_postings.advance() {
Some((segment_ord, segment_postings))
} else {
None
// ... set the segment postings option for the new field.
let segment_postings_option = field_entry
.field_type()
.get_segment_postings_option()
.expect(
"Encountered a field that is not supposed to be
indexed. Have you modified the schema?",
);
while merged_terms.advance() {
let term = Term::wrap(merged_terms.key());
// Let's compute the list of non-empty posting lists
let segment_postings: Vec<_> = merged_terms
.current_kvs()
.iter()
.flat_map(|heap_item| {
let segment_ord = heap_item.segment_ord;
let term_info = heap_item.streamer.value();
let segment_reader = &self.readers[heap_item.segment_ord];
let inverted_index = segment_reader.inverted_index(term.field());
let mut segment_postings = inverted_index.read_postings_from_terminfo(
term_info,
segment_postings_option,
);
if segment_postings.advance() {
Some((segment_ord, segment_postings))
} else {
None
}
})
.collect();
// At this point, `segment_postings` contains the posting lists
// of all of the segments containing the given term.
//
// These segments are non-empty and advance has already been called.
if !segment_postings.is_empty() {
// (If it were empty, the `term` would be entirely removed.)
// We know that there is at least one document containing
// the term, so we add it.
field_serializer.new_term(term.as_ref())?;
// We can now serialize these postings, by pushing each document to the
// postings serializer.
for (segment_ord, mut segment_postings) in segment_postings {
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
loop {
// `.advance()` has been called once before the loop.
// Hence we cannot use a `while segment_postings.advance()` loop.
if let Some(remapped_doc_id) =
old_to_new_doc_id[segment_postings.doc() as usize]
{
// we make sure to only write the term iff
// there is at least one document.
let positions: &[u32] = segment_postings.positions();
let term_freq = segment_postings.term_freq();
let delta_positions = delta_computer.compute_delta(positions);
field_serializer.write_doc(
remapped_doc_id,
term_freq,
delta_positions,
)?;
}
if !segment_postings.advance() {
break;
}
}
}
})
.collect();
// At this point, `segment_postings` contains the posting list
// of all of the segments containing the given term.
//
// These segments are non-empty and advance has already been called.
if segment_postings.is_empty() {
// by continuing here, the `term` will be entirely removed.
continue;
}
// We know that there is at least one document containing
// the term, so we add it.
serializer.new_term(term.as_ref())?;
// We can now serialize this postings, by pushing each document to the
// postings serializer.
for (segment_ord, mut segment_postings) in segment_postings {
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
loop {
// `.advance()` has been called once before the loop.
// Hence we cannot use a `while segment_postings.advance()` loop.
if let Some(remapped_doc_id) =
old_to_new_doc_id[segment_postings.doc() as usize] {
// we make sure to only write the term iff
// there is at least one document.
let delta_positions: &[u32] =
delta_position_computer
.compute_delta_positions(segment_postings.positions());
let term_freq = segment_postings.term_freq();
serializer
.write_doc(remapped_doc_id, term_freq, delta_positions)?;
}
if !segment_postings.advance() {
break;
}
// closing the term.
field_serializer.close_term()?;
}
}
// closing the term.
serializer.close_term()?;
field_serializer.close()?;
}
Ok(())
}
@@ -317,9 +358,9 @@ impl IndexMerger {
let store_reader = reader.get_store_reader();
for doc_id in 0..reader.max_doc() {
if !reader.is_deleted(doc_id) {
let doc = try!(store_reader.get(doc_id));
let doc = store_reader.get(doc_id)?;
let field_values: Vec<&FieldValue> = doc.field_values().iter().collect();
try!(store_writer.store(&field_values));
store_writer.store(&field_values)?;
}
}
}
@@ -329,11 +370,15 @@ impl IndexMerger {
impl SerializableSegment for IndexMerger {
fn write(&self, mut serializer: SegmentSerializer) -> Result<u32> {
try!(self.write_postings(serializer.get_postings_serializer()));
try!(self.write_fieldnorms(serializer.get_fieldnorms_serializer()));
try!(self.write_fast_fields(serializer.get_fast_field_serializer()));
try!(self.write_storable_fields(serializer.get_store_writer()));
try!(serializer.close());
self.write_postings(serializer.get_postings_serializer())?;
self.write_fieldnorms(
serializer.get_fieldnorms_serializer(),
)?;
self.write_fast_fields(
serializer.get_fast_field_serializer(),
)?;
self.write_storable_fields(serializer.get_store_writer())?;
serializer.close()?;
Ok(self.max_doc)
}
}
@@ -410,14 +455,13 @@ mod tests {
}
}
{
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let segment_ids = index.searchable_segment_ids().expect(
"Searchable segments failed.",
);
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer
.merge(&segment_ids)
.wait()
.expect("Merging failed");
index_writer.merge(&segment_ids).wait().expect(
"Merging failed",
);
index_writer.wait_merging_threads().unwrap();
}
{
@@ -430,14 +474,22 @@ mod tests {
collector.docs()
};
{
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
vec![1, 2, 4]);
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
vec![0, 3]);
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "g")]),
vec![4]);
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
vec![0, 1, 2, 3, 4]);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
vec![1, 2, 4]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
vec![0, 3]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "g")]),
vec![4]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
vec![0, 1, 2, 3, 4]
);
}
{
let doc = searcher.doc(&DocAddress(0, 0)).unwrap();
@@ -466,8 +518,10 @@ mod tests {
assert!(searcher.search(&query, &mut collector).is_ok());
collector.vals()
};
assert_eq!(get_fast_vals(vec![Term::from_field_text(text_field, "a")]),
vec![5, 7, 13]);
assert_eq!(
get_fast_vals(vec![Term::from_field_text(text_field, "a")]),
vec![5, 7, 13]
);
}
}
}
@@ -514,14 +568,22 @@ mod tests {
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
vec![1]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
vec![1]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
vec![3]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
vec![1, 3]);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "a")),
vec![1]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "b")),
vec![1]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "c")),
vec![3]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "d")),
vec![1, 3]
);
}
{
// a second commit
@@ -553,20 +615,34 @@ mod tests {
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
assert_eq!(searcher.segment_readers()[1].num_docs(), 2);
assert_eq!(searcher.segment_readers()[1].max_doc(), 4);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
vec![3]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
vec![3]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "c")),
vec![3]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "d")),
vec![3]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]
);
let score_field_reader: U64FastFieldReader = searcher
.segment_reader(0)
@@ -584,33 +660,46 @@ mod tests {
}
{
// merging the segments
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
index_writer
.merge(&segment_ids)
.wait()
.expect("Merging failed");
let segment_ids = index.searchable_segment_ids().expect(
"Searchable segments failed.",
);
index_writer.merge(&segment_ids).wait().expect(
"Merging failed",
);
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 3);
assert_eq!(searcher.segment_readers()[0].num_docs(), 3);
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
vec![3]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
vec![3]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "c")),
vec![3]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "d")),
vec![3]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]
);
let score_field_reader: U64FastFieldReader = searcher
.segment_reader(0)
.get_fast_field_reader(score_field)
@@ -629,20 +718,34 @@ mod tests {
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "c")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "d")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]
);
let score_field_reader: U64FastFieldReader = searcher
.segment_reader(0)
.get_fast_field_reader(score_field)
@@ -652,13 +755,12 @@ mod tests {
}
{
// Test merging a single segment in order to remove deletes.
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
index_writer
.merge(&segment_ids)
.wait()
.expect("Merging failed");
let segment_ids = index.searchable_segment_ids().expect(
"Searchable segments failed.",
);
index_writer.merge(&segment_ids).wait().expect(
"Merging failed",
);
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
@@ -666,20 +768,34 @@ mod tests {
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].max_doc(), 2);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "c")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "d")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]
);
let score_field_reader: U64FastFieldReader = searcher
.segment_reader(0)
.get_fast_field_reader(score_field)
@@ -691,13 +807,12 @@ mod tests {
{
// Test removing all docs
index_writer.delete_term(Term::from_field_text(text_field, "g"));
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
index_writer
.merge(&segment_ids)
.wait()
.expect("Merging failed");
let segment_ids = index.searchable_segment_ids().expect(
"Searchable segments failed.",
);
index_writer.merge(&segment_ids).wait().expect(
"Merging failed",
);
index.load_searchers().unwrap();
let ref searcher = *index.searcher();

View File

@@ -44,10 +44,11 @@ pub struct SegmentEntry {
impl SegmentEntry {
/// Create a new `SegmentEntry`
pub fn new(segment_meta: SegmentMeta,
delete_cursor: DeleteCursor,
delete_bitset: Option<BitSet>)
-> SegmentEntry {
pub fn new(
segment_meta: SegmentMeta,
delete_cursor: DeleteCursor,
delete_bitset: Option<BitSet>,
) -> SegmentEntry {
SegmentEntry {
meta: segment_meta,
state: SegmentState::Ready,

View File

@@ -32,31 +32,36 @@ pub struct SegmentManager {
impl Debug for SegmentManager {
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
let lock = self.read();
write!(f,
"{{ uncommitted: {:?}, committed: {:?} }}",
lock.uncommitted,
lock.committed)
write!(
f,
"{{ uncommitted: {:?}, committed: {:?} }}",
lock.uncommitted,
lock.committed
)
}
}
pub fn get_mergeable_segments(segment_manager: &SegmentManager)
-> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
pub fn get_mergeable_segments(
segment_manager: &SegmentManager,
) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
let registers_lock = segment_manager.read();
(registers_lock.committed.get_mergeable_segments(),
registers_lock.uncommitted.get_mergeable_segments())
(
registers_lock.committed.get_mergeable_segments(),
registers_lock.uncommitted.get_mergeable_segments(),
)
}
impl SegmentManager {
pub fn from_segments(segment_metas: Vec<SegmentMeta>,
delete_cursor: DeleteCursor)
-> SegmentManager {
pub fn from_segments(
segment_metas: Vec<SegmentMeta>,
delete_cursor: DeleteCursor,
) -> SegmentManager {
SegmentManager {
registers: RwLock::new(SegmentRegisters {
uncommitted: SegmentRegister::default(),
committed: SegmentRegister::new(segment_metas,
delete_cursor),
writing: HashSet::new(),
}),
uncommitted: SegmentRegister::default(),
committed: SegmentRegister::new(segment_metas, delete_cursor),
writing: HashSet::new(),
}),
}
}
@@ -94,25 +99,24 @@ impl SegmentManager {
pub fn segment_entry(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
let registers = self.read();
registers
.committed
.segment_entry(segment_id)
.or_else(|| registers.uncommitted.segment_entry(segment_id))
registers.committed.segment_entry(segment_id).or_else(|| {
registers.uncommitted.segment_entry(segment_id)
})
}
// Lock poisoning should never happen :
// The lock is acquired and released within this class,
// and the operations cannot panic.
fn read(&self) -> RwLockReadGuard<SegmentRegisters> {
self.registers
.read()
.expect("Failed to acquire read lock on SegmentManager.")
self.registers.read().expect(
"Failed to acquire read lock on SegmentManager.",
)
}
fn write(&self) -> RwLockWriteGuard<SegmentRegisters> {
self.registers
.write()
.expect("Failed to acquire write lock on SegmentManager.")
self.registers.write().expect(
"Failed to acquire write lock on SegmentManager.",
)
}
pub fn commit(&self, segment_entries: Vec<SegmentEntry>) {
@@ -140,9 +144,11 @@ impl SegmentManager {
}
pub fn cancel_merge(&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_id: SegmentId) {
pub fn cancel_merge(
&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_id: SegmentId,
) {
let mut registers_lock = self.write();
@@ -150,13 +156,15 @@ impl SegmentManager {
{
let target_segment_register: &mut SegmentRegister;
target_segment_register = {
if registers_lock
.uncommitted
.contains_all(before_merge_segment_ids) {
if registers_lock.uncommitted.contains_all(
before_merge_segment_ids,
)
{
&mut registers_lock.uncommitted
} else if registers_lock
.committed
.contains_all(before_merge_segment_ids) {
} else if registers_lock.committed.contains_all(
before_merge_segment_ids,
)
{
&mut registers_lock.committed
} else {
warn!("couldn't find segment in SegmentManager");
@@ -185,23 +193,26 @@ impl SegmentManager {
registers_lock.uncommitted.add_segment_entry(segment_entry);
}
pub fn end_merge(&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_entry: SegmentEntry) {
pub fn end_merge(
&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_entry: SegmentEntry,
) {
let mut registers_lock = self.write();
registers_lock
.writing
.remove(&after_merge_segment_entry.segment_id());
registers_lock.writing.remove(&after_merge_segment_entry
.segment_id());
let mut target_register: &mut SegmentRegister = {
if registers_lock
.uncommitted
.contains_all(before_merge_segment_ids) {
let target_register: &mut SegmentRegister = {
if registers_lock.uncommitted.contains_all(
before_merge_segment_ids,
)
{
&mut registers_lock.uncommitted
} else if registers_lock
.committed
.contains_all(before_merge_segment_ids) {
} else if registers_lock.committed.contains_all(
before_merge_segment_ids,
)
{
&mut registers_lock.committed
} else {
warn!("couldn't find segment in SegmentManager");

View File

@@ -24,7 +24,12 @@ impl Debug for SegmentRegister {
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
try!(write!(f, "SegmentRegister("));
for (k, v) in &self.segment_states {
try!(write!(f, "{}:{}, ", k.short_uuid_string(), v.state().letter_code()));
try!(write!(
f,
"{}:{}, ",
k.short_uuid_string(),
v.state().letter_code()
));
}
try!(write!(f, ")"));
Ok(())
@@ -74,9 +79,9 @@ impl SegmentRegister {
}
pub fn contains_all(&mut self, segment_ids: &[SegmentId]) -> bool {
segment_ids
.iter()
.all(|segment_id| self.segment_states.contains_key(segment_id))
segment_ids.iter().all(|segment_id| {
self.segment_states.contains_key(segment_id)
})
}
pub fn add_segment_entry(&mut self, segment_entry: SegmentEntry) {
@@ -91,14 +96,18 @@ impl SegmentRegister {
pub fn cancel_merge(&mut self, segment_id: &SegmentId) {
self.segment_states
.get_mut(segment_id)
.expect("Received a merge notification for a segment that is not registered")
.expect(
"Received a merge notification for a segment that is not registered",
)
.cancel_merge();
}
pub fn start_merge(&mut self, segment_id: &SegmentId) {
self.segment_states
.get_mut(segment_id)
.expect("Received a merge notification for a segment that is not registered")
.expect(
"Received a merge notification for a segment that is not registered",
)
.start_merge();
}
@@ -144,34 +153,42 @@ mod tests {
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry);
}
assert_eq!(segment_register
.segment_entry(&segment_id_a)
.unwrap()
.state(),
SegmentState::Ready);
assert_eq!(
segment_register
.segment_entry(&segment_id_a)
.unwrap()
.state(),
SegmentState::Ready
);
assert_eq!(segment_ids(&segment_register), vec![segment_id_a]);
{
let segment_meta = SegmentMeta::new(segment_id_b);
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry);
}
assert_eq!(segment_register
.segment_entry(&segment_id_b)
.unwrap()
.state(),
SegmentState::Ready);
assert_eq!(
segment_register
.segment_entry(&segment_id_b)
.unwrap()
.state(),
SegmentState::Ready
);
segment_register.start_merge(&segment_id_a);
segment_register.start_merge(&segment_id_b);
assert_eq!(segment_register
.segment_entry(&segment_id_a)
.unwrap()
.state(),
SegmentState::InMerge);
assert_eq!(segment_register
.segment_entry(&segment_id_b)
.unwrap()
.state(),
SegmentState::InMerge);
assert_eq!(
segment_register
.segment_entry(&segment_id_a)
.unwrap()
.state(),
SegmentState::InMerge
);
assert_eq!(
segment_register
.segment_entry(&segment_id_b)
.unwrap()
.state(),
SegmentState::InMerge
);
segment_register.remove_segment(&segment_id_a);
segment_register.remove_segment(&segment_id_b);
{

View File

@@ -4,8 +4,7 @@ use core::Segment;
use core::SegmentComponent;
use fastfield::FastFieldSerializer;
use store::StoreWriter;
use postings::PostingsSerializer;
use postings::InvertedIndexSerializer;
/// Segment serializer is in charge of laying out on disk
/// the data accumulated and sorted by the `SegmentWriter`.
@@ -13,7 +12,7 @@ pub struct SegmentSerializer {
store_writer: StoreWriter,
fast_field_serializer: FastFieldSerializer,
fieldnorms_serializer: FastFieldSerializer,
postings_serializer: PostingsSerializer,
postings_serializer: InvertedIndexSerializer,
}
impl SegmentSerializer {
@@ -22,22 +21,22 @@ impl SegmentSerializer {
let store_write = try!(segment.open_write(SegmentComponent::STORE));
let fast_field_write = try!(segment.open_write(SegmentComponent::FASTFIELDS));
let fast_field_serializer = try!(FastFieldSerializer::new(fast_field_write));
let fast_field_serializer = try!(FastFieldSerializer::from_write(fast_field_write));
let fieldnorms_write = try!(segment.open_write(SegmentComponent::FIELDNORMS));
let fieldnorms_serializer = try!(FastFieldSerializer::new(fieldnorms_write));
let fieldnorms_serializer = try!(FastFieldSerializer::from_write(fieldnorms_write));
let postings_serializer = try!(PostingsSerializer::open(segment));
let postings_serializer = try!(InvertedIndexSerializer::open(segment));
Ok(SegmentSerializer {
postings_serializer: postings_serializer,
store_writer: StoreWriter::new(store_write),
fast_field_serializer: fast_field_serializer,
fieldnorms_serializer: fieldnorms_serializer,
})
postings_serializer: postings_serializer,
store_writer: StoreWriter::new(store_write),
fast_field_serializer: fast_field_serializer,
fieldnorms_serializer: fieldnorms_serializer,
})
}
/// Accessor to the `PostingsSerializer`.
pub fn get_postings_serializer(&mut self) -> &mut PostingsSerializer {
pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer {
&mut self.postings_serializer
}

View File

@@ -61,11 +61,12 @@ pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) -
/// and flushed.
///
/// This method is not part of tantivy's public API
pub fn save_metas(segment_metas: Vec<SegmentMeta>,
schema: Schema,
opstamp: u64,
directory: &mut Directory)
-> Result<()> {
pub fn save_metas(
segment_metas: Vec<SegmentMeta>,
schema: Schema,
opstamp: u64,
directory: &mut Directory,
) -> Result<()> {
let metas = IndexMeta {
segments: segment_metas,
schema: schema,
@@ -89,11 +90,12 @@ pub struct SegmentUpdater(Arc<InnerSegmentUpdater>);
fn perform_merge(segment_ids: &[SegmentId],
segment_updater: &SegmentUpdater,
mut merged_segment: Segment,
target_opstamp: u64)
-> Result<SegmentEntry> {
fn perform_merge(
segment_ids: &[SegmentId],
segment_updater: &SegmentUpdater,
mut merged_segment: Segment,
target_opstamp: u64,
) -> Result<SegmentEntry> {
// first we need to apply deletes to our segment.
info!("Start merge: {:?}", segment_ids);
@@ -105,17 +107,21 @@ fn perform_merge(segment_ids: &[SegmentId],
for segment_id in segment_ids {
if let Some(mut segment_entry) =
segment_updater.0.segment_manager.segment_entry(segment_id) {
segment_updater.0.segment_manager.segment_entry(segment_id)
{
let segment = index.segment(segment_entry.meta().clone());
if let Some(file_protection) =
advance_deletes(segment, &mut segment_entry, target_opstamp)? {
advance_deletes(segment, &mut segment_entry, target_opstamp)?
{
file_protections.push(file_protection);
}
segment_entries.push(segment_entry);
} else {
error!("Error, had to abort merge as some of the segment is not managed anymore.");
let msg = format!("Segment {:?} requested for merge is not managed.",
segment_id);
let msg = format!(
"Segment {:?} requested for merge is not managed.",
segment_id
);
bail!(ErrorKind::InvalidArgument(msg));
}
}
@@ -134,12 +140,13 @@ fn perform_merge(segment_ids: &[SegmentId],
// ... we just serialize this index merger in our new segment
// to merge the two segments.
let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment)
.expect("Creating index serializer failed");
let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment).expect(
"Creating index serializer failed",
);
let num_docs = merger
.write(segment_serializer)
.expect("Serializing merged index failed");
let num_docs = merger.write(segment_serializer).expect(
"Serializing merged index failed",
);
let mut segment_meta = SegmentMeta::new(merged_segment.id());
segment_meta.set_max_doc(num_docs);
@@ -161,23 +168,24 @@ struct InnerSegmentUpdater {
}
impl SegmentUpdater {
pub fn new(index: Index,
stamper: Stamper,
delete_cursor: DeleteCursor)
-> Result<SegmentUpdater> {
pub fn new(
index: Index,
stamper: Stamper,
delete_cursor: DeleteCursor,
) -> Result<SegmentUpdater> {
let segments = index.searchable_segment_metas()?;
let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
Ok(SegmentUpdater(Arc::new(InnerSegmentUpdater {
pool: CpuPool::new(1),
index: index,
segment_manager: segment_manager,
merge_policy: RwLock::new(box DefaultMergePolicy::default()),
merging_thread_id: AtomicUsize::default(),
merging_threads: RwLock::new(HashMap::new()),
generation: AtomicUsize::default(),
killed: AtomicBool::new(false),
stamper: stamper,
})))
pool: CpuPool::new(1),
index: index,
segment_manager: segment_manager,
merge_policy: RwLock::new(box DefaultMergePolicy::default()),
merging_thread_id: AtomicUsize::default(),
merging_threads: RwLock::new(HashMap::new()),
generation: AtomicUsize::default(),
killed: AtomicBool::new(false),
stamper: stamper,
})))
}
pub fn new_segment(&self) -> Segment {
@@ -199,10 +207,10 @@ impl SegmentUpdater {
self.0.merging_thread_id.fetch_add(1, Ordering::SeqCst)
}
fn run_async<T: 'static + Send, F: 'static + Send + FnOnce(SegmentUpdater) -> T>
(&self,
f: F)
-> CpuFuture<T, Error> {
fn run_async<T: 'static + Send, F: 'static + Send + FnOnce(SegmentUpdater) -> T>(
&self,
f: F,
) -> CpuFuture<T, Error> {
let me_clone = self.clone();
self.0.pool.spawn_fn(move || Ok(f(me_clone)))
}
@@ -211,11 +219,10 @@ impl SegmentUpdater {
pub fn add_segment(&self, generation: usize, segment_entry: SegmentEntry) -> bool {
if generation >= self.0.generation.load(Ordering::Acquire) {
self.run_async(|segment_updater| {
segment_updater.0.segment_manager.add_segment(segment_entry);
segment_updater.consider_merge_options();
true
})
.forget();
segment_updater.0.segment_manager.add_segment(segment_entry);
segment_updater.consider_merge_options();
true
}).forget();
true
} else {
false
@@ -249,42 +256,46 @@ impl SegmentUpdater {
if self.is_alive() {
let index = &self.0.index;
let directory = index.directory();
save_metas(self.0.segment_manager.committed_segment_metas(),
index.schema(),
opstamp,
directory.box_clone().borrow_mut())
.expect("Could not save metas.");
save_metas(
self.0.segment_manager.committed_segment_metas(),
index.schema(),
opstamp,
directory.box_clone().borrow_mut(),
).expect("Could not save metas.");
}
}
pub fn garbage_collect_files(&self) -> Result<()> {
self.run_async(move |segment_updater| { segment_updater.garbage_collect_files_exec(); })
.wait()
self.run_async(move |segment_updater| {
segment_updater.garbage_collect_files_exec();
}).wait()
}
fn garbage_collect_files_exec(&self) {
let living_files = self.0.segment_manager.list_files();
info!("Running garbage collection");
let mut index = self.0.index.clone();
index.directory_mut().garbage_collect(living_files);
index.directory_mut().garbage_collect(
|| self.0.segment_manager.list_files(),
);
}
pub fn commit(&self, opstamp: u64) -> Result<()> {
self.run_async(move |segment_updater| if segment_updater.is_alive() {
let segment_entries = segment_updater
.purge_deletes(opstamp)
.expect("Failed purge deletes");
segment_updater.0.segment_manager.commit(segment_entries);
segment_updater.save_metas(opstamp);
segment_updater.garbage_collect_files_exec();
segment_updater.consider_merge_options();
})
.wait()
let segment_entries = segment_updater.purge_deletes(opstamp).expect(
"Failed purge deletes",
);
segment_updater.0.segment_manager.commit(segment_entries);
segment_updater.save_metas(opstamp);
segment_updater.garbage_collect_files_exec();
segment_updater.consider_merge_options();
}).wait()
}
pub fn start_merge(&self,
segment_ids: &[SegmentId])
-> impl Future<Item = SegmentMeta, Error = Canceled> {
pub fn start_merge(
&self,
segment_ids: &[SegmentId],
) -> impl Future<Item = SegmentMeta, Error = Canceled> {
self.0.segment_manager.start_merge(segment_ids);
let segment_updater_clone = self.clone();
@@ -304,10 +315,12 @@ impl SegmentUpdater {
// first we need to apply deletes to our segment.
let merged_segment = segment_updater_clone.new_segment();
let merged_segment_id = merged_segment.id();
let merge_result = perform_merge(&segment_ids_vec,
&segment_updater_clone,
merged_segment,
target_opstamp);
let merge_result = perform_merge(
&segment_ids_vec,
&segment_updater_clone,
merged_segment,
target_opstamp,
);
match merge_result {
Ok(after_merge_segment_entry) => {
@@ -341,11 +354,10 @@ impl SegmentUpdater {
.remove(&merging_thread_id);
Ok(())
});
self.0
.merging_threads
.write()
.unwrap()
.insert(merging_thread_id, merging_join_handle);
self.0.merging_threads.write().unwrap().insert(
merging_thread_id,
merging_join_handle,
);
merging_future_recv
}
@@ -364,22 +376,26 @@ impl SegmentUpdater {
}
}
fn cancel_merge(&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_entry: SegmentId) {
self.0
.segment_manager
.cancel_merge(before_merge_segment_ids, after_merge_segment_entry);
fn cancel_merge(
&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_entry: SegmentId,
) {
self.0.segment_manager.cancel_merge(
before_merge_segment_ids,
after_merge_segment_entry,
);
}
fn end_merge(&self,
before_merge_segment_ids: Vec<SegmentId>,
mut after_merge_segment_entry: SegmentEntry)
-> Result<()> {
fn end_merge(
&self,
before_merge_segment_ids: Vec<SegmentId>,
mut after_merge_segment_entry: SegmentEntry,
) -> Result<()> {
self.run_async(move |segment_updater| {
debug!("End merge {:?}", after_merge_segment_entry.meta());
info!("End merge {:?}", after_merge_segment_entry.meta());
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
let mut _file_protection_opt = None;
if let Some(delete_operation) = delete_cursor.get() {
@@ -387,30 +403,41 @@ impl SegmentUpdater {
if delete_operation.opstamp < committed_opstamp {
let index = &segment_updater.0.index;
let segment = index.segment(after_merge_segment_entry.meta().clone());
match advance_deletes(segment,
&mut after_merge_segment_entry,
committed_opstamp) {
match advance_deletes(
segment,
&mut after_merge_segment_entry,
committed_opstamp,
) {
Ok(file_protection_opt_res) => {
_file_protection_opt = file_protection_opt_res;
}
Err(e) => {
error!("Merge of {:?} was cancelled (advancing deletes failed): {:?}",
before_merge_segment_ids, e);
error!(
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
before_merge_segment_ids,
e
);
// ... cancel merge
if cfg!(test) {
panic!("Merge failed.");
}
segment_updater.cancel_merge(&before_merge_segment_ids,
after_merge_segment_entry.segment_id());
segment_updater.cancel_merge(
&before_merge_segment_ids,
after_merge_segment_entry.segment_id(),
);
return;
}
}
}
}
segment_updater.0.segment_manager.end_merge(&before_merge_segment_ids,
after_merge_segment_entry);
segment_updater.0.segment_manager.end_merge(
&before_merge_segment_ids,
after_merge_segment_entry,
);
segment_updater.consider_merge_options();
info!("save metas");
segment_updater.save_metas(segment_updater.0.index.opstamp());
segment_updater.garbage_collect_files_exec();
}).wait()
}
@@ -444,10 +471,9 @@ impl SegmentUpdater {
}
debug!("wait merging thread {}", new_merging_threads.len());
for (_, merging_thread_handle) in new_merging_threads {
merging_thread_handle
.join()
.map(|_| ())
.map_err(|_| ErrorKind::ErrorInThread("Merging thread failed.".into()))?;
merging_thread_handle.join().map(|_| ()).map_err(|_| {
ErrorKind::ErrorInThread("Merging thread failed.".into())
})?;
}
// Our merging thread may have queued their completed
self.run_async(move |_| {}).wait()?;
@@ -516,9 +542,9 @@ mod tests {
assert_eq!(index.searcher().num_docs(), 302);
{
index_writer
.wait_merging_threads()
.expect("waiting for merging threads");
index_writer.wait_merging_threads().expect(
"waiting for merging threads",
);
}
index.load_searchers().unwrap();

View File

@@ -58,11 +58,12 @@ impl<'a> SegmentWriter<'a> {
/// - segment: The segment being written
/// - schema
pub fn for_segment(heap: &'a Heap,
table_bits: usize,
mut segment: Segment,
schema: &Schema)
-> Result<SegmentWriter<'a>> {
let segment_serializer = try!(SegmentSerializer::for_segment(&mut segment));
let multifield_postings = MultiFieldPostingsWriter::new(schema, heap);
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits, heap);
let analyzers = schema.fields()
.iter()
.map(|field_entry| field_entry.field_type())
@@ -92,10 +93,12 @@ impl<'a> SegmentWriter<'a> {
/// Finalize consumes the `SegmentWriter`, so that it cannot
/// be used afterwards.
pub fn finalize(self) -> Result<Vec<u64>> {
write(&self.multifield_postings,
&self.fast_field_writers,
&self.fieldnorms_writer,
self.segment_serializer)?;
write(
&self.multifield_postings,
&self.fast_field_writers,
&self.fieldnorms_writer,
self.segment_serializer,
)?;
Ok(self.doc_opstamps)
}
@@ -122,15 +125,19 @@ impl<'a> SegmentWriter<'a> {
/// Indexes a new document
///
/// As a user, you should rather use `IndexWriter`'s add_document.
pub fn add_document(&mut self,
add_operation: &AddOperation,
schema: &Schema)
-> io::Result<()> {
pub fn add_document(
&mut self,
add_operation: &AddOperation,
schema: &Schema,
) -> io::Result<()> {
let doc_id = self.max_doc;
let doc = &add_operation.document;
self.doc_opstamps.push(add_operation.opstamp);
for (field, field_values) in doc.get_sorted_field_values() {
let field_options = schema.get_field_entry(field);
if !field_options.is_indexed() {
continue;
}
match *field_options.field_type() {
FieldType::Str(ref text_options) => {
let num_tokens: u32 =
@@ -166,8 +173,10 @@ impl<'a> SegmentWriter<'a> {
FieldType::U64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_u64(field_value.field(),
field_value.value().u64_value());
let term = Term::from_field_u64(
field_value.field(),
field_value.value().u64_value(),
);
self.multifield_postings.suscribe(doc_id, &term);
}
}
@@ -175,8 +184,10 @@ impl<'a> SegmentWriter<'a> {
FieldType::I64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_i64(field_value.field(),
field_value.value().i64_value());
let term = Term::from_field_i64(
field_value.field(),
field_value.value().i64_value(),
);
self.multifield_postings.suscribe(doc_id, &term);
}
}
@@ -187,7 +198,9 @@ impl<'a> SegmentWriter<'a> {
self.fast_field_writers.add_document(doc);
let stored_fieldvalues: Vec<&FieldValue> = doc.field_values()
.iter()
.filter(|field_value| schema.get_field_entry(field_value.field()).is_stored())
.filter(|field_value| {
schema.get_field_entry(field_value.field()).is_stored()
})
.collect();
let doc_writer = self.segment_serializer.get_store_writer();
try!(doc_writer.store(&stored_fieldvalues));
@@ -218,15 +231,22 @@ impl<'a> SegmentWriter<'a> {
}
// This method is used as a trick to work around the borrow checker
fn write(multifield_postings: &MultiFieldPostingsWriter,
fast_field_writers: &FastFieldsWriter,
fieldnorms_writer: &FastFieldsWriter,
mut serializer: SegmentSerializer)
-> Result<()> {
fn write(
multifield_postings: &MultiFieldPostingsWriter,
fast_field_writers: &FastFieldsWriter,
fieldnorms_writer: &FastFieldsWriter,
mut serializer: SegmentSerializer,
) -> Result<()> {
try!(multifield_postings.serialize(serializer.get_postings_serializer()));
try!(fast_field_writers.serialize(serializer.get_fast_field_serializer()));
try!(fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer()));
try!(multifield_postings.serialize(
serializer.get_postings_serializer(),
));
try!(fast_field_writers.serialize(
serializer.get_fast_field_serializer(),
));
try!(fieldnorms_writer.serialize(
serializer.get_fieldnorms_serializer(),
));
try!(serializer.close());
Ok(())
@@ -235,10 +255,12 @@ fn write(multifield_postings: &MultiFieldPostingsWriter,
impl<'a> SerializableSegment for SegmentWriter<'a> {
fn write(&self, serializer: SegmentSerializer) -> Result<u32> {
let max_doc = self.max_doc;
write(&self.multifield_postings,
&self.fast_field_writers,
&self.fieldnorms_writer,
serializer)?;
write(
&self.multifield_postings,
&self.fast_field_writers,
&self.fieldnorms_writer,
serializer,
)?;
Ok(max_doc)
}
}

View File

@@ -8,7 +8,8 @@
#![feature(integer_atomics)]
#![cfg_attr(test, feature(test))]
#![cfg_attr(test, feature(step_by))]
#![cfg_attr(test, feature(iterator_step_by))]
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
@@ -44,11 +45,9 @@ extern crate fst;
extern crate byteorder;
extern crate memmap;
extern crate regex;
extern crate tempfile;
extern crate atomicwrites;
extern crate tempdir;
extern crate serde;
extern crate bincode;
extern crate serde_json;
extern crate time;
extern crate lz4;
@@ -68,7 +67,7 @@ extern crate rust_stemmers;
#[cfg(test)]
extern crate env_logger;
#[cfg(feature="simdcompression")]
#[cfg(feature = "simdcompression")]
extern crate libc;
#[cfg(windows)]
@@ -99,29 +98,19 @@ mod core;
mod compression;
mod indexer;
mod common;
#[allow(unused_doc_comment)]
mod error;
pub mod analyzer;
mod datastruct;
pub mod termdict;
/// Row-oriented, slow, compressed storage of documents
pub mod store;
/// Query module
pub mod query;
pub mod directory;
/// Collector module
pub mod collector;
/// Postings module (also called inverted index)
pub mod postings;
/// Schema
pub mod schema;
pub mod fastfield;
@@ -129,7 +118,7 @@ pub use directory::Directory;
pub use core::{Index, Segment, SegmentId, SegmentMeta, Searcher};
pub use indexer::IndexWriter;
pub use schema::{Term, Document};
pub use core::SegmentReader;
pub use core::{SegmentReader, InvertedIndexReader};
pub use self::common::TimerTree;
pub use postings::DocSet;
@@ -157,15 +146,19 @@ pub mod merge_policy {
pub use indexer::DefaultMergePolicy;
}
/// u32 identifying a document within a segment.
/// Documents have their doc id assigned incrementally,
/// A `u32` identifying a document within a segment.
/// Documents have their `DocId` assigned incrementally,
/// as they are added in the segment.
pub type DocId = u32;
/// f32 the score of a document.
/// An `f32` that represents the relevance of the document to the query.
///
/// This is modelled internally as a `f32`. The
/// larger the number, the more relevant the document
/// is to the search.
pub type Score = f32;
/// A segment local id identifies a segment.
/// A `SegmentLocalId` identifies a segment.
/// It only makes sense for a given searcher.
pub type SegmentLocalId = u32;
@@ -263,7 +256,7 @@ mod tests {
}
#[test]
fn test_docfreq() {
fn test_docfreq1() {
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
@@ -302,7 +295,6 @@ mod tests {
}
}
#[test]
fn test_fieldnorm() {
let mut schema_builder = SchemaBuilder::default();
@@ -391,15 +383,24 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none());
let inverted_index = reader.inverted_index(text_field);
assert!(
inverted_index
.read_postings(&term_abcd, FreqAndPositions)
.is_none()
);
{
let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, FreqAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 5);
assert!(!postings.advance());
}
{
let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap();
let mut postings = inverted_index
.read_postings(&term_b, FreqAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 3);
assert!(postings.advance());
@@ -425,16 +426,25 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(term_abcd.field());
assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none());
assert!(
inverted_index
.read_postings(&term_abcd, FreqAndPositions)
.is_none()
);
{
let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, FreqAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 5);
assert!(!postings.advance());
}
{
let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap();
let mut postings = inverted_index
.read_postings(&term_b, FreqAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 3);
assert!(postings.advance());
@@ -460,13 +470,22 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none());
let inverted_index = reader.inverted_index(term_abcd.field());
assert!(
inverted_index
.read_postings(&term_abcd, FreqAndPositions)
.is_none()
);
{
let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, FreqAndPositions)
.unwrap();
assert!(!postings.advance());
}
{
let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap();
let mut postings = inverted_index
.read_postings(&term_b, FreqAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 3);
assert!(postings.advance());
@@ -474,7 +493,9 @@ mod tests {
assert!(!postings.advance());
}
{
let mut postings = reader.read_postings(&term_c, FreqAndPositions).unwrap();
let mut postings = inverted_index
.read_postings(&term_c, FreqAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 4);
assert!(!postings.advance());
@@ -498,6 +519,7 @@ mod tests {
let term = Term::from_field_u64(field, 1u64);
let mut postings = searcher
.segment_reader(0)
.inverted_index(term.field())
.read_postings(&term, SegmentPostingsOption::NoFreq)
.unwrap();
assert!(postings.advance());
@@ -521,6 +543,7 @@ mod tests {
let term = Term::from_field_i64(value_field, negative_val);
let mut postings = searcher
.segment_reader(0)
.inverted_index(term.field())
.read_postings(&term, SegmentPostingsOption::NoFreq)
.unwrap();
assert!(postings.advance());
@@ -583,10 +606,17 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(text_field);
let term_abcd = Term::from_field_text(text_field, "abcd");
assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none());
assert!(
inverted_index
.read_postings(&term_abcd, FreqAndPositions)
.is_none()
);
let term_af = Term::from_field_text(text_field, "af");
let mut postings = reader.read_postings(&term_af, FreqAndPositions).unwrap();
let mut postings = inverted_index
.read_postings(&term_af, FreqAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 0);
assert_eq!(postings.term_freq(), 3);
@@ -628,29 +658,43 @@ mod tests {
collector.docs()
};
{
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
vec![1, 2]);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
vec![1, 2]
);
}
{
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
vec![0]);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
vec![0]
);
}
{
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
vec![0, 1, 2]);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
vec![0, 1, 2]
);
}
{
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "c")]),
vec![1, 2]);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "c")]),
vec![1, 2]
);
}
{
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "d")]),
vec![2]);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "d")]),
vec![2]
);
}
{
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b"),
Term::from_field_text(text_field, "a")]),
vec![0, 1, 2]);
assert_eq!(
get_doc_ids(vec![
Term::from_field_text(text_field, "b"),
Term::from_field_text(text_field, "a"),
]),
vec![0, 1, 2]
);
}
}
}
@@ -687,7 +731,8 @@ mod tests {
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", TEXT);
let other_text_field = schema_builder.add_text_field("text2", TEXT);
let document = doc!(text_field => "tantivy",
let document =
doc!(text_field => "tantivy",
text_field => "some other value",
other_text_field => "short");
assert_eq!(document.len(), 3);

View File

@@ -52,6 +52,33 @@ pub trait DocSet {
}
}
/// Fills a given mutable buffer with the next doc ids from the
/// `DocSet`.
///
/// If that many `DocId`s are available, the method should
/// fill the entire buffer and return the length of the buffer.
///
/// If we reach the end of the `DocSet` before filling
/// it entirely, then the buffer is filled up to this point, and
/// the return value is the number of elements that were filled.
///
/// # Warning
///
/// This method is only here for specific high-performance
/// use cases where batching matters. The normal way to
/// go through the `DocId`s is to call `.advance()`.
fn fill_buffer(&mut self, buffer: &mut [DocId]) -> usize {
for (i, buffer_val) in buffer.iter_mut().enumerate() {
if self.advance() {
*buffer_val = self.doc();
} else {
return i;
}
}
return buffer.len();
}
/// Returns the current document
fn doc(&self) -> DocId;

View File

@@ -1,125 +0,0 @@
use compression::BlockDecoder;
use common::VInt;
use common::BinarySerializable;
use compression::{CompositeDecoder, VIntDecoder};
use postings::SegmentPostingsOption;
use compression::NUM_DOCS_PER_BLOCK;
/// `FreqHandler` is in charge of decompressing
/// frequencies and/or positions.
pub struct FreqHandler {
freq_decoder: BlockDecoder,
positions: Vec<u32>,
option: SegmentPostingsOption,
positions_offsets: [usize; NUM_DOCS_PER_BLOCK + 1],
}
fn read_positions(data: &[u8]) -> Vec<u32> {
let mut composite_reader = CompositeDecoder::new();
let mut readable: &[u8] = data;
let uncompressed_len = VInt::deserialize(&mut readable).unwrap().0 as usize;
composite_reader.uncompress_unsorted(readable, uncompressed_len);
composite_reader.into()
}
impl FreqHandler {
/// Returns a `FreqHandler` that just decodes `DocId`s.
pub fn new_without_freq() -> FreqHandler {
FreqHandler {
freq_decoder: BlockDecoder::with_val(1u32),
positions: Vec::new(),
option: SegmentPostingsOption::NoFreq,
positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1],
}
}
/// Returns a `FreqHandler` that decodes `DocId`s and term frequencies.
pub fn new_with_freq() -> FreqHandler {
FreqHandler {
freq_decoder: BlockDecoder::new(),
positions: Vec::new(),
option: SegmentPostingsOption::Freq,
positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1],
}
}
/// Returns a `FreqHandler` that decodes `DocId`s, term frequencies, and term positions.
pub fn new_with_freq_and_position(position_data: &[u8]) -> FreqHandler {
let positions = read_positions(position_data);
FreqHandler {
freq_decoder: BlockDecoder::new(),
positions: positions,
option: SegmentPostingsOption::FreqAndPositions,
positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1],
}
}
fn fill_positions_offset(&mut self) {
let mut cur_position: usize = self.positions_offsets[NUM_DOCS_PER_BLOCK];
let mut i: usize = 0;
self.positions_offsets[i] = cur_position;
let mut last_cur_position = cur_position;
for &doc_freq in self.freq_decoder.output_array() {
i += 1;
let mut cumulated_pos = 0u32;
// this next loop decodes delta positions into normal positions.
for j in last_cur_position..(last_cur_position + (doc_freq as usize)) {
cumulated_pos += self.positions[j];
self.positions[j] = cumulated_pos;
}
cur_position += doc_freq as usize;
self.positions_offsets[i] = cur_position;
last_cur_position = cur_position;
}
}
/// Accessor to term frequency
///
/// idx is the offset of the current doc in the block.
/// It takes value between 0 and 128.
pub fn freq(&self, idx: usize) -> u32 {
self.freq_decoder.output(idx)
}
/// Accessor to the positions
///
/// idx is the offset of the current doc in the block.
/// It takes value between 0 and 128.
pub fn positions(&self, idx: usize) -> &[u32] {
let start = self.positions_offsets[idx];
let stop = self.positions_offsets[idx + 1];
&self.positions[start..stop]
}
/// Decompresses a complete frequency block
pub fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
match self.option {
SegmentPostingsOption::NoFreq => data,
SegmentPostingsOption::Freq => self.freq_decoder.uncompress_block_unsorted(data),
SegmentPostingsOption::FreqAndPositions => {
let remaining: &'a [u8] = self.freq_decoder.uncompress_block_unsorted(data);
self.fill_positions_offset();
remaining
}
}
}
/// Decompresses an incomplete frequency block
pub fn read_freq_vint(&mut self, data: &[u8], num_els: usize) {
match self.option {
SegmentPostingsOption::NoFreq => {}
SegmentPostingsOption::Freq => {
self.freq_decoder.uncompress_vint_unsorted(data, num_els);
}
SegmentPostingsOption::FreqAndPositions => {
self.freq_decoder.uncompress_vint_unsorted(data, num_els);
self.fill_positions_offset();
}
}
}
}

View File

@@ -1,3 +1,7 @@
/*!
Postings module (also called inverted index)
*/
/// Postings module
///
/// Postings, also called inverted lists, are the key data structure
@@ -12,14 +16,14 @@ mod term_info;
mod vec_postings;
mod segment_postings;
mod intersection;
mod freq_handler;
mod docset;
mod segment_postings_option;
pub use self::docset::{SkipResult, DocSet};
use self::recorder::{Recorder, NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
pub use self::serializer::PostingsSerializer;
pub use self::serializer::{InvertedIndexSerializer, FieldSerializer};
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
pub use self::term_info::TermInfo;
pub use self::postings::Postings;
@@ -28,7 +32,6 @@ pub use self::vec_postings::VecPostings;
pub use self::segment_postings::{SegmentPostings, BlockSegmentPostings};
pub use self::intersection::IntersectionDocSet;
pub use self::freq_handler::FreqHandler;
pub use self::segment_postings_option::SegmentPostingsOption;
pub use common::HasLen;
@@ -60,30 +63,35 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut segment = index.new_segment();
let mut posting_serializer = PostingsSerializer::open(&mut segment).unwrap();
posting_serializer.new_field(text_field);
posting_serializer.new_term("abc".as_bytes()).unwrap();
for doc_id in 0u32..3u32 {
let positions = vec![1, 2, 3, 2];
posting_serializer.write_doc(doc_id, 2, &positions).unwrap();
let mut posting_serializer = InvertedIndexSerializer::open(&mut segment).unwrap();
{
let mut field_serializer = posting_serializer.new_field(text_field).unwrap();
field_serializer.new_term("abc".as_bytes()).unwrap();
for doc_id in 0u32..120u32 {
let delta_positions = vec![1, 2, 3, 2];
field_serializer
.write_doc(doc_id, 2, &delta_positions)
.unwrap();
}
field_serializer.close_term().unwrap();
}
posting_serializer.close_term().unwrap();
posting_serializer.close().unwrap();
let read = segment.open_read(SegmentComponent::POSITIONS).unwrap();
assert!(read.len() <= 16);
assert!(read.len() <= 140);
}
#[test]
pub fn test_position_and_fieldnorm() {
pub fn test_position_and_fieldnorm1() {
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let segment = index.new_segment();
let heap = Heap::with_capacity(10_000_000);
{
let mut segment_writer = SegmentWriter::for_segment(&heap, segment.clone(), &schema)
.unwrap();
let mut segment_writer =
SegmentWriter::for_segment(&heap, 18, segment.clone(), &schema).unwrap();
{
let mut doc = Document::default();
// checking that position works if the field has two values
@@ -129,13 +137,17 @@ mod tests {
}
{
let term_a = Term::from_field_text(text_field, "abcdef");
assert!(segment_reader
.read_postings(&term_a, FreqAndPositions)
.is_none());
assert!(
segment_reader
.inverted_index(term_a.field())
.read_postings(&term_a, FreqAndPositions)
.is_none()
);
}
{
let term_a = Term::from_field_text(text_field, "a");
let mut postings_a = segment_reader
.inverted_index(term_a.field())
.read_postings(&term_a, FreqAndPositions)
.unwrap();
assert_eq!(postings_a.len(), 1000);
@@ -143,6 +155,7 @@ mod tests {
assert_eq!(postings_a.doc(), 0);
assert_eq!(postings_a.term_freq(), 6);
assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]);
assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]);
assert!(postings_a.advance());
assert_eq!(postings_a.doc(), 1u32);
assert_eq!(postings_a.term_freq(), 1);
@@ -157,6 +170,7 @@ mod tests {
{
let term_e = Term::from_field_text(text_field, "e");
let mut postings_e = segment_reader
.inverted_index(term_e.field())
.read_postings(&term_e, FreqAndPositions)
.unwrap();
assert_eq!(postings_e.len(), 1000 - 2);
@@ -196,8 +210,10 @@ mod tests {
assert!(index_writer.commit().is_ok());
}
index.load_searchers().unwrap();
let term_query = TermQuery::new(Term::from_field_text(text_field, "a"),
SegmentPostingsOption::NoFreq);
let term_query = TermQuery::new(
Term::from_field_text(text_field, "a"),
SegmentPostingsOption::NoFreq,
);
let searcher = index.searcher();
let mut term_weight = term_query.specialized_weight(&*searcher);
term_weight.segment_postings_options = SegmentPostingsOption::FreqAndPositions;
@@ -244,6 +260,7 @@ mod tests {
for i in 0..num_docs - 1 {
for j in i + 1..num_docs {
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
.unwrap();
@@ -257,6 +274,7 @@ mod tests {
{
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
.unwrap();
@@ -277,6 +295,7 @@ mod tests {
// check that filtering works
{
let mut segment_postings = segment_reader
.inverted_index(term_0.field())
.read_postings(&term_0, SegmentPostingsOption::NoFreq)
.unwrap();
@@ -286,6 +305,7 @@ mod tests {
}
let mut segment_postings = segment_reader
.inverted_index(term_0.field())
.read_postings(&term_0, SegmentPostingsOption::NoFreq)
.unwrap();
@@ -310,6 +330,7 @@ mod tests {
// make sure seeking still works
for i in 0..num_docs {
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
.unwrap();
@@ -325,6 +346,7 @@ mod tests {
// now try with a longer sequence
{
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
.unwrap();
@@ -360,12 +382,14 @@ mod tests {
// finally, check that it's empty
{
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
.unwrap();
assert_eq!(segment_postings.skip_next(0), SkipResult::End);
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
.unwrap();
@@ -432,11 +456,12 @@ mod tests {
let segment_reader = searcher.segment_reader(0);
b.iter(|| {
let mut segment_postings = segment_reader
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
.unwrap();
while segment_postings.advance() {}
});
let mut segment_postings = segment_reader
.inverted_index(TERM_A.field())
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
.unwrap();
while segment_postings.advance() {}
});
}
#[bench]
@@ -445,21 +470,27 @@ mod tests {
let segment_reader = searcher.segment_reader(0);
b.iter(|| {
let segment_postings_a = segment_reader
.inverted_index(TERM_A.field())
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
.unwrap();
let segment_postings_b = segment_reader
.inverted_index(TERM_B.field())
.read_postings(&*TERM_B, SegmentPostingsOption::NoFreq)
.unwrap();
let segment_postings_c = segment_reader
.inverted_index(TERM_C.field())
.read_postings(&*TERM_C, SegmentPostingsOption::NoFreq)
.unwrap();
let segment_postings_d = segment_reader
.inverted_index(TERM_D.field())
.read_postings(&*TERM_D, SegmentPostingsOption::NoFreq)
.unwrap();
let mut intersection = IntersectionDocSet::from(vec![segment_postings_a,
segment_postings_b,
segment_postings_c,
segment_postings_d]);
let mut intersection = IntersectionDocSet::from(vec![
segment_postings_a,
segment_postings_b,
segment_postings_c,
segment_postings_d,
]);
while intersection.advance() {}
});
}
@@ -470,6 +501,7 @@ mod tests {
let docs = tests::sample(segment_reader.num_docs(), p);
let mut segment_postings = segment_reader
.inverted_index(TERM_A.field())
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
.unwrap();
@@ -486,6 +518,7 @@ mod tests {
b.iter(|| {
let mut segment_postings = segment_reader
.inverted_index(TERM_A.field())
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
.unwrap();
for doc in &existing_docs {
@@ -523,6 +556,7 @@ mod tests {
b.iter(|| {
let n: u32 = test::black_box(17);
let mut segment_postings = segment_reader
.inverted_index(TERM_A.field())
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
.unwrap();
let mut s = 0u32;

View File

@@ -1,6 +1,7 @@
use DocId;
use schema::Term;
use postings::PostingsSerializer;
use schema::FieldValue;
use postings::{InvertedIndexSerializer, FieldSerializer};
use std::io;
use postings::Recorder;
use Result;
@@ -15,9 +16,10 @@ use schema::FieldType;
use analyzer::TokenStream;
use schema::TextIndexingOptions;
fn posting_from_field_entry<'a>(field_entry: &FieldEntry,
heap: &'a Heap)
-> Box<PostingsWriter + 'a> {
fn posting_from_field_entry<'a>(
field_entry: &FieldEntry,
heap: &'a Heap,
) -> Box<PostingsWriter + 'a> {
match *field_entry.field_type() {
FieldType::Str(ref text_options) => {
match text_options.get_indexing_options() {
@@ -45,16 +47,14 @@ pub struct MultiFieldPostingsWriter<'a> {
impl<'a> MultiFieldPostingsWriter<'a> {
/// Create a new `MultiFieldPostingsWriter` given
/// a schema and a heap.
pub fn new(schema: &Schema, heap: &'a Heap) -> MultiFieldPostingsWriter<'a> {
let capacity = heap.capacity();
let hashmap_size = hashmap_size_in_bits(capacity);
let term_index = HashMap::new(hashmap_size, heap);
pub fn new(schema: &Schema, table_bits: usize, heap: &'a Heap) -> MultiFieldPostingsWriter<'a> {
let term_index = HashMap::new(table_bits, heap);
let per_field_postings_writers: Vec<_> = schema
.fields()
.iter()
.map(|field_entry| posting_from_field_entry(field_entry, heap))
.collect();
let mut per_field_postings_writers: Vec<_> = vec![];
for field_entry in schema.fields() {
let field_entry = posting_from_field_entry(field_entry, heap);
per_field_postings_writers.push(field_entry);
}
MultiFieldPostingsWriter {
heap: heap,
term_index: term_index,
@@ -77,7 +77,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
/// It pushes all terms, one field at a time, towards the
/// postings serializer.
#[allow(needless_range_loop)]
pub fn serialize(&self, serializer: &mut PostingsSerializer) -> Result<()> {
pub fn serialize(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> {
let mut term_offsets: Vec<(&[u8], u32)> = self.term_index.iter().collect();
term_offsets.sort_by_key(|&(k, _v)| k);
@@ -100,8 +100,13 @@ impl<'a> MultiFieldPostingsWriter<'a> {
let (field, start) = offsets[i];
let (_, stop) = offsets[i + 1];
let postings_writer = &self.per_field_postings_writers[field.0 as usize];
postings_writer
.serialize(field, &term_offsets[start..stop], serializer, self.heap)?;
let mut field_serializer = serializer.new_field(field)?;
postings_writer.serialize(
&term_offsets[start..stop],
&mut field_serializer,
self.heap,
)?;
field_serializer.close()?;
}
Ok(())
}
@@ -125,22 +130,23 @@ pub trait PostingsWriter {
/// * term - the term
/// * heap - heap used to store the postings information as well as the terms
/// in the hashmap.
fn suscribe(&mut self,
term_index: &mut HashMap,
doc: DocId,
pos: u32,
term: &Term,
heap: &Heap);
fn suscribe(
&mut self,
term_index: &mut HashMap,
doc: DocId,
pos: u32,
term: &Term,
heap: &Heap,
);
/// Serializes the postings on disk.
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(&self,
field: Field,
term_addrs: &[(&[u8], u32)],
serializer: &mut PostingsSerializer,
serializer: &mut FieldSerializer,
heap: &Heap)
-> io::Result<()>;
/// Tokenize a text and `suscribe` all of its tokens.
fn index_text<'a>(&mut self,
term_index: &mut HashMap,
@@ -149,7 +155,6 @@ pub trait PostingsWriter {
token_stream: &mut TokenStream,
heap: &Heap)
-> u32 {
let mut term = unsafe { Term::with_capacity(100) };
term.set_field(field);
let mut sink = |token: &Token| {
@@ -168,20 +173,6 @@ pub struct SpecializedPostingsWriter<'a, Rec: Recorder + 'static> {
_recorder_type: PhantomData<Rec>,
}
/// Given a `Heap` size, computes a relevant size for the `HashMap`.
fn hashmap_size_in_bits(heap_capacity: u32) -> usize {
let num_buckets_usable = heap_capacity / 100;
let hash_table_size = num_buckets_usable * 2;
let mut pow = 512;
for num_bits in 10..32 {
pow <<= 1;
if pow > hash_table_size {
return num_bits;
}
}
32
}
impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
/// constructor
pub fn new(heap: &'a Heap) -> SpecializedPostingsWriter<'a, Rec> {
@@ -198,13 +189,15 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
}
impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {
fn suscribe(&mut self,
term_index: &mut HashMap,
doc: DocId,
position: u32,
term: &Term,
heap: &Heap) {
fn suscribe(
&mut self,
term_index: &mut HashMap,
doc: DocId,
position: u32,
term: &Term,
heap: &Heap,
) {
debug_assert!(term.as_slice().len() >= 4);
let recorder: &mut Rec = term_index.get_or_create(term);
let current_doc = recorder.current_doc();
@@ -217,28 +210,20 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
recorder.record_position(position, heap);
}
fn serialize(&self,
field: Field,
term_addrs: &[(&[u8], u32)],
serializer: &mut PostingsSerializer,
heap: &Heap)
-> io::Result<()> {
serializer.new_field(field);
fn serialize(
&self,
term_addrs: &[(&[u8], u32)],
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()> {
for &(term_bytes, addr) in term_addrs {
let recorder: &mut Rec = self.heap.get_mut_ref(addr);
try!(serializer.new_term(term_bytes));
try!(recorder.serialize(addr, serializer, heap));
try!(serializer.close_term());
serializer.new_term(term_bytes)?;
recorder.serialize(addr, serializer, heap)?;
serializer.close_term()?;
}
Ok(())
}
}
#[test]
fn test_hashmap_size() {
assert_eq!(hashmap_size_in_bits(10), 10);
assert_eq!(hashmap_size_in_bits(0), 10);
assert_eq!(hashmap_size_in_bits(100_000), 11);
assert_eq!(hashmap_size_in_bits(300_000_000), 23);
}
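Editorial aside: the heuristic deleted above budgeted one hash bucket per ~100 bytes of heap, doubled that, and rounded up to the next power of two no smaller than 2^10. Worked through for the deleted test values: a 100,000-byte heap gives 100_000 / 100 * 2 = 2,000 slots, and the smallest power of two above 2,000 is 2^11, hence 11 bits; 300,000,000 bytes gives 6,000,000 slots, and 2^23 = 8,388,608 is the first power of two above that, hence 23. The commit replaces this implicit sizing with an explicit `table_bits` parameter threaded through `SegmentWriter::for_segment` and `MultiFieldPostingsWriter::new`.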

View File

@@ -1,6 +1,6 @@
use DocId;
use std::io;
use postings::PostingsSerializer;
use postings::FieldSerializer;
use datastruct::stacker::{ExpUnrolledLinkedList, Heap, HeapAllocable};
const EMPTY_ARRAY: [u32; 0] = [0u32; 0];
@@ -27,11 +27,12 @@ pub trait Recorder: HeapAllocable {
/// Close the document. It will help record the term frequency.
fn close_doc(&mut self, heap: &Heap);
/// Pushes the postings information to the serializer.
fn serialize(&self,
self_addr: u32,
serializer: &mut PostingsSerializer,
heap: &Heap)
-> io::Result<()>;
fn serialize(
&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()>;
}
/// Only records the doc ids
@@ -64,13 +65,14 @@ impl Recorder for NothingRecorder {
fn close_doc(&mut self, _heap: &Heap) {}
fn serialize(&self,
self_addr: u32,
serializer: &mut PostingsSerializer,
heap: &Heap)
-> io::Result<()> {
fn serialize(
&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()> {
for doc in self.stack.iter(self_addr, heap) {
try!(serializer.write_doc(doc, 0u32, &EMPTY_ARRAY));
serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)?;
}
Ok(())
}
@@ -116,21 +118,23 @@ impl Recorder for TermFrequencyRecorder {
}
fn serialize(&self,
self_addr: u32,
serializer: &mut PostingsSerializer,
heap: &Heap)
-> io::Result<()> {
fn serialize(
&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()> {
// the last document has not been closed...
// its term freq is self.current_tf.
let mut doc_iter = self.stack
.iter(self_addr, heap)
.chain(Some(self.current_tf).into_iter());
let mut doc_iter = self.stack.iter(self_addr, heap).chain(
Some(self.current_tf)
.into_iter(),
);
while let Some(doc) = doc_iter.next() {
let term_freq = doc_iter
.next()
.expect("The IndexWriter recorded a doc without a term freq.");
let term_freq = doc_iter.next().expect(
"The IndexWriter recorded a doc without a term freq.",
);
serializer.write_doc(doc, term_freq, &EMPTY_ARRAY)?;
}
Ok(())
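Editorial aside — a toy model (values invented) of the interleaved layout the comment above relies on: the recorder's stack alternates doc ids and term frequencies, and the still-open last document contributes its pending `current_tf` through the `chain`:

let stack = [0u32, 3, 5]; // doc 0 with term freq 3, then doc 5, not yet closed
let current_tf = 2u32; // pending term freq of the unclosed doc 5
let mut doc_iter = stack.iter().cloned().chain(Some(current_tf));
while let Some(doc) = doc_iter.next() {
    let term_freq = doc_iter.next().expect("doc without a term freq");
    assert!((doc == 0 && term_freq == 3) || (doc == 5 && term_freq == 2));
}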
@@ -171,11 +175,12 @@ impl Recorder for TFAndPositionRecorder {
self.stack.push(POSITION_END, heap);
}
fn serialize(&self,
self_addr: u32,
serializer: &mut PostingsSerializer,
heap: &Heap)
-> io::Result<()> {
fn serialize(
&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()> {
let mut doc_positions = Vec::with_capacity(100);
let mut positions_iter = self.stack.iter(self_addr, heap);
while let Some(doc) = positions_iter.next() {
@@ -189,7 +194,11 @@ impl Recorder for TFAndPositionRecorder {
prev_position = position;
}
}
try!(serializer.write_doc(doc, doc_positions.len() as u32, &doc_positions));
serializer.write_doc(
doc,
doc_positions.len() as u32,
&doc_positions,
)?;
}
Ok(())
}

View File

@@ -1,12 +1,65 @@
use compression::{NUM_DOCS_PER_BLOCK, BlockDecoder, VIntDecoder};
use compression::{COMPRESSION_BLOCK_SIZE, BlockDecoder, VIntDecoder, CompressedIntStream};
use DocId;
use postings::{Postings, FreqHandler, DocSet, HasLen, SkipResult};
use postings::{Postings, DocSet, HasLen, SkipResult};
use std::cmp;
use fastfield::DeleteBitSet;
use fst::Streamer;
use fastfield::DeleteBitSet;
use std::cell::UnsafeCell;
use directory::{SourceRead, ReadOnlySource};
const EMPTY_DATA: [u8; 0] = [0u8; 0];
const EMPTY_POSITIONS: [u32; 0] = [0u32; 0];
struct PositionComputer {
// stores the number of position integers to skip
// in the stream before reading the next positions.
//
// if None, the positions are already loaded in
// the positions vec.
position_to_skip: Option<usize>,
positions: Vec<u32>,
positions_stream: CompressedIntStream,
}
impl PositionComputer {
pub fn new(positions_stream: CompressedIntStream) -> PositionComputer {
PositionComputer {
position_to_skip: None,
positions: vec![],
positions_stream: positions_stream,
}
}
pub fn add_skip(&mut self, num_skip: usize) {
self.position_to_skip = Some(
self.position_to_skip
.map(|prev_skip| prev_skip + num_skip)
.unwrap_or(0),
);
}
pub fn positions(&mut self, term_freq: usize) -> &[u32] {
if let Some(num_skip) = self.position_to_skip {
self.positions.resize(term_freq, 0u32);
self.positions_stream.skip(num_skip);
self.positions_stream.read(&mut self.positions[..term_freq]);
let mut cum = 0u32;
for i in 0..term_freq as usize {
cum += self.positions[i];
self.positions[i] = cum;
}
self.position_to_skip = None;
}
&self.positions[..term_freq]
}
}
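Editorial aside: the cumulative-sum loop in `positions` above is a plain delta decode — the stream stores gaps between consecutive positions and the loop restores absolute positions. A self-contained sketch with invented gap values (picked to match the `[0, 2, 4, 6, 7, 13]` expectations in the tests further down):

let mut positions = vec![0u32, 2, 2, 2, 1, 6]; // gaps as read from the stream
let mut cum = 0u32;
for p in positions.iter_mut() {
    cum += *p;
    *p = cum; // overwrite the gap with the absolute position
}
assert_eq!(positions, vec![0, 2, 4, 6, 7, 13]);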
/// `SegmentPostings` represents the inverted list or postings associated to
@@ -14,42 +67,60 @@ const EMPTY_DATA: [u8; 0] = [0u8; 0];
///
/// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded.
/// Positions on the other hand, are optionally entirely decoded upfront.
pub struct SegmentPostings<'a> {
block_cursor: BlockSegmentPostings<'a>,
pub struct SegmentPostings {
block_cursor: BlockSegmentPostings,
cur: usize,
delete_bitset: DeleteBitSet,
position_computer: Option<UnsafeCell<PositionComputer>>,
}
impl<'a> SegmentPostings<'a> {
impl SegmentPostings {
/// Creates a `SegmentPostings` from a `BlockSegmentPostings` cursor.
///
/// * `segment_block_postings` - cursor over the blocks of the posting list.
/// * `delete_bitset` - set of documents deleted from the segment.
/// * `positions_stream_opt` - compressed stream of position deltas, if positions were indexed.
pub fn from_block_postings(segment_block_postings: BlockSegmentPostings<'a>,
delete_bitset: DeleteBitSet)
-> SegmentPostings<'a> {
pub fn from_block_postings(
segment_block_postings: BlockSegmentPostings,
delete_bitset: DeleteBitSet,
positions_stream_opt: Option<CompressedIntStream>,
) -> SegmentPostings {
let position_computer =
positions_stream_opt.map(|stream| UnsafeCell::new(PositionComputer::new(stream)));
SegmentPostings {
block_cursor: segment_block_postings,
cur: NUM_DOCS_PER_BLOCK, // cursor within the block
cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
delete_bitset: delete_bitset,
position_computer: position_computer,
}
}
/// Returns an empty segment postings object
pub fn empty() -> SegmentPostings<'static> {
pub fn empty() -> SegmentPostings {
let empty_block_cursor = BlockSegmentPostings::empty();
SegmentPostings {
block_cursor: empty_block_cursor,
delete_bitset: DeleteBitSet::empty(),
cur: NUM_DOCS_PER_BLOCK,
cur: COMPRESSION_BLOCK_SIZE,
position_computer: None,
}
}
fn position_add_skip<F: FnOnce() -> usize>(&self, num_skips_fn: F) {
if let Some(ref position_computer) = self.position_computer.as_ref() {
let num_skips = num_skips_fn();
unsafe {
(*position_computer.get()).add_skip(num_skips);
}
}
}
}
impl<'a> DocSet for SegmentPostings<'a> {
impl DocSet for SegmentPostings {
// goes to the next element.
// next needs to be called a first time to point to the correct element.
#[inline]
@@ -59,10 +130,11 @@ impl<'a> DocSet for SegmentPostings<'a> {
if self.cur >= self.block_cursor.block_len() {
self.cur = 0;
if !self.block_cursor.advance() {
self.cur = NUM_DOCS_PER_BLOCK;
self.cur = COMPRESSION_BLOCK_SIZE;
return false;
}
}
self.position_add_skip(|| self.term_freq() as usize);
if !self.delete_bitset.is_deleted(self.doc()) {
return true;
}
@@ -75,6 +147,10 @@ impl<'a> DocSet for SegmentPostings<'a> {
return SkipResult::End;
}
// in the following, thanks to the call to advance above,
// we know that the positions are not loaded yet, and that we
// need to skip the term frequencies of every doc we cross.
// skip blocks until one that might contain the target
loop {
// check if we need to go to the next block
@@ -83,13 +159,26 @@ impl<'a> DocSet for SegmentPostings<'a> {
(block_docs[self.cur], block_docs[block_docs.len() - 1])
};
if target > last_doc_in_block {
// we add the skip for the current term independently,
// so that position_add_skip can decide whether it should
// just set itself to Some(0) or effectively
// add the term freq.
self.position_add_skip(|| {
let freqs_skipped = &self.block_cursor.freqs()[self.cur..];
let sum_freq: u32 = freqs_skipped.iter().cloned().sum();
sum_freq as usize
});
if !self.block_cursor.advance() {
return SkipResult::End;
}
self.cur = 0;
} else {
if target < current_doc {
// We've overpassed the target after the first `advance` call
// We've passed the target after the first `advance` call
// or we're at the beginning of a block.
// Either way, we're on the first `DocId` greater than `target`
return SkipResult::OverStep;
@@ -135,6 +224,13 @@ impl<'a> DocSet for SegmentPostings<'a> {
// `doc` is now >= `target`
let doc = block_docs[start];
self.position_add_skip(|| {
let freqs_skipped = &self.block_cursor.freqs()[self.cur..start];
let sum_freqs: u32 = freqs_skipped.iter().sum();
sum_freqs as usize
});
self.cur = start;
if !self.delete_bitset.is_deleted(doc) {
@@ -156,31 +252,41 @@ impl<'a> DocSet for SegmentPostings<'a> {
self.len()
}
/// Return the current document's `DocId`.
#[inline]
fn doc(&self) -> DocId {
let docs = self.block_cursor.docs();
assert!(self.cur < docs.len(),
"Have you forgotten to call `.advance()` at least once before calling .doc().");
debug_assert!(
self.cur < docs.len(),
"Have you forgotten to call `.advance()` at least once before calling .doc()."
);
docs[self.cur]
}
}
impl<'a> HasLen for SegmentPostings<'a> {
impl HasLen for SegmentPostings {
fn len(&self) -> usize {
self.block_cursor.doc_freq()
}
}
impl<'a> Postings for SegmentPostings<'a> {
impl Postings for SegmentPostings {
fn term_freq(&self) -> u32 {
self.block_cursor.freq_handler().freq(self.cur)
self.block_cursor.freq(self.cur)
}
fn positions(&self) -> &[u32] {
self.block_cursor.freq_handler().positions(self.cur)
let term_freq = self.term_freq();
self.position_computer
.as_ref()
.map(|position_computer| unsafe {
(&mut *position_computer.get()).positions(term_freq as usize)
})
.unwrap_or(&EMPTY_POSITIONS[..])
}
}
/// `BlockSegmentPostings` is a cursor iterating over blocks
/// of documents.
///
@@ -188,28 +294,35 @@ impl<'a> Postings for SegmentPostings<'a> {
///
/// While it is useful for some very specific high-performance
/// use cases, you should prefer using `SegmentPostings` for most usage.
pub struct BlockSegmentPostings<'a> {
block_decoder: BlockDecoder,
pub struct BlockSegmentPostings {
doc_decoder: BlockDecoder,
freq_decoder: BlockDecoder,
has_freq: bool,
doc_freq: usize,
doc_offset: DocId,
num_binpacked_blocks: usize,
num_vint_docs: usize,
remaining_data: &'a [u8],
freq_handler: FreqHandler,
remaining_data: SourceRead,
}
impl<'a> BlockSegmentPostings<'a> {
pub(crate) fn from_data(doc_freq: usize,
data: &'a [u8],
freq_handler: FreqHandler)
-> BlockSegmentPostings<'a> {
let num_binpacked_blocks: usize = (doc_freq as usize) / NUM_DOCS_PER_BLOCK;
let num_vint_docs = (doc_freq as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks;
impl BlockSegmentPostings {
pub(crate) fn from_data(
doc_freq: usize,
data: SourceRead,
has_freq: bool,
) -> BlockSegmentPostings {
let num_binpacked_blocks: usize = (doc_freq as usize) / COMPRESSION_BLOCK_SIZE;
let num_vint_docs = (doc_freq as usize) - COMPRESSION_BLOCK_SIZE * num_binpacked_blocks;
BlockSegmentPostings {
num_binpacked_blocks: num_binpacked_blocks,
num_vint_docs: num_vint_docs,
block_decoder: BlockDecoder::new(),
freq_handler: freq_handler,
doc_decoder: BlockDecoder::new(),
freq_decoder: BlockDecoder::with_val(1),
has_freq: has_freq,
remaining_data: data,
doc_offset: 0,
doc_freq: doc_freq,
@@ -226,9 +339,9 @@ impl<'a> BlockSegmentPostings<'a> {
// # Warning
//
// This does not reset the positions list.
pub(crate) fn reset(&mut self, doc_freq: usize, postings_data: &'a [u8]) {
let num_binpacked_blocks: usize = (doc_freq as usize) / NUM_DOCS_PER_BLOCK;
let num_vint_docs = (doc_freq as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks;
pub(crate) fn reset(&mut self, doc_freq: usize, postings_data: SourceRead) {
let num_binpacked_blocks: usize = doc_freq / COMPRESSION_BLOCK_SIZE;
let num_vint_docs = doc_freq & (COMPRESSION_BLOCK_SIZE - 1);
self.num_binpacked_blocks = num_binpacked_blocks;
self.num_vint_docs = num_vint_docs;
self.remaining_data = postings_data;
@@ -250,7 +363,25 @@ impl<'a> BlockSegmentPostings<'a> {
/// returned by `.docs()` is empty.
#[inline]
pub fn docs(&self) -> &[DocId] {
self.block_decoder.output_array()
self.doc_decoder.output_array()
}
/// Return the document at index `idx` of the block.
#[inline]
pub fn doc(&self, idx: usize) -> u32 {
self.doc_decoder.output(idx)
}
/// Return the array of `term freq` in the block.
#[inline]
pub fn freqs(&self) -> &[u32] {
self.freq_decoder.output_array()
}
/// Return the frequency at index `idx` of the block.
#[inline]
pub fn freq(&self, idx: usize) -> u32 {
self.freq_decoder.output(idx)
}
/// Returns the length of the current block.
@@ -260,13 +391,7 @@ impl<'a> BlockSegmentPostings<'a> {
/// of any number between 1 and `COMPRESSION_BLOCK_SIZE - 1`
#[inline]
fn block_len(&self) -> usize {
self.block_decoder.output_len
}
/// Returns a reference to the frequency handler.
pub fn freq_handler(&self) -> &FreqHandler {
&self.freq_handler
self.doc_decoder.output_len
}
/// Advance to the next block.
@@ -274,21 +399,35 @@ impl<'a> BlockSegmentPostings<'a> {
/// Returns false iff there was no remaining blocks.
pub fn advance(&mut self) -> bool {
if self.num_binpacked_blocks > 0 {
self.remaining_data =
self.block_decoder
.uncompress_block_sorted(self.remaining_data, self.doc_offset);
self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data);
self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1);
let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
self.remaining_data.as_ref(),
self.doc_offset,
);
self.remaining_data.advance(num_consumed_bytes);
if self.has_freq {
let num_consumed_bytes = self.freq_decoder.uncompress_block_unsorted(
self.remaining_data.as_ref(),
);
self.remaining_data.advance(num_consumed_bytes);
}
// it will be used as the next offset.
self.doc_offset = self.doc_decoder.output(COMPRESSION_BLOCK_SIZE - 1);
self.num_binpacked_blocks -= 1;
true
} else if self.num_vint_docs > 0 {
self.remaining_data =
self.block_decoder
.uncompress_vint_sorted(self.remaining_data,
self.doc_offset,
self.num_vint_docs);
self.freq_handler
.read_freq_vint(self.remaining_data, self.num_vint_docs);
let num_compressed_bytes = self.doc_decoder.uncompress_vint_sorted(
self.remaining_data.as_ref(),
self.doc_offset,
self.num_vint_docs,
);
self.remaining_data.advance(num_compressed_bytes);
if self.has_freq {
self.freq_decoder.uncompress_vint_unsorted(
self.remaining_data.as_ref(),
self.num_vint_docs,
);
}
self.num_vint_docs = 0;
true
} else {
@@ -297,20 +436,23 @@ impl<'a> BlockSegmentPostings<'a> {
}
/// Returns an empty segment postings object
pub fn empty() -> BlockSegmentPostings<'static> {
pub fn empty() -> BlockSegmentPostings {
BlockSegmentPostings {
num_binpacked_blocks: 0,
num_vint_docs: 0,
block_decoder: BlockDecoder::new(),
freq_handler: FreqHandler::new_without_freq(),
remaining_data: &EMPTY_DATA,
doc_decoder: BlockDecoder::new(),
freq_decoder: BlockDecoder::with_val(1),
has_freq: false,
remaining_data: From::from(ReadOnlySource::empty()),
doc_offset: 0,
doc_freq: 0,
}
}
}
impl<'a, 'b> Streamer<'b> for BlockSegmentPostings<'a> {
impl<'b> Streamer<'b> for BlockSegmentPostings {
type Item = &'b [DocId];
fn next(&'b mut self) -> Option<&'b [DocId]> {
@@ -366,11 +508,13 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(int_field);
let term = Term::from_field_u64(int_field, 0u64);
let term_info = segment_reader.get_term_info(&term).unwrap();
let mut block_segments =
segment_reader
.read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq);
let term_info = inverted_index.get_term_info(&term).unwrap();
let mut block_segments = inverted_index.read_block_postings_from_terminfo(
&term_info,
SegmentPostingsOption::NoFreq,
);
let mut offset: u32 = 0u32;
// checking that the block before calling advance is empty
assert!(block_segments.docs().is_empty());
@@ -406,17 +550,20 @@ mod tests {
let mut block_segments;
{
let term = Term::from_field_u64(int_field, 0u64);
let term_info = segment_reader.get_term_info(&term).unwrap();
block_segments =
segment_reader
.read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq);
let inverted_index = segment_reader.inverted_index(int_field);
let term_info = inverted_index.get_term_info(&term).unwrap();
block_segments = inverted_index.read_block_postings_from_terminfo(
&term_info,
SegmentPostingsOption::NoFreq,
);
}
assert!(block_segments.advance());
assert!(block_segments.docs() == &[0, 2, 4]);
{
let term = Term::from_field_u64(int_field, 1u64);
let term_info = segment_reader.get_term_info(&term).unwrap();
segment_reader.reset_block_postings_from_terminfo(&term_info, &mut block_segments);
let inverted_index = segment_reader.inverted_index(int_field);
let term_info = inverted_index.get_term_info(&term).unwrap();
inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments);
}
assert!(block_segments.advance());
assert!(block_segments.docs() == &[1, 3, 5]);
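As the code above shows, a posting list of `doc_freq` documents is laid out as full bit-packed blocks of `COMPRESSION_BLOCK_SIZE` docs followed by a VInt-encoded tail; since the block size is a power of two (128 here, by assumption), the `& (COMPRESSION_BLOCK_SIZE - 1)` in `reset` is just the remainder. A quick standalone check:

const COMPRESSION_BLOCK_SIZE: usize = 128; // assumed: a power of two

fn decompose(doc_freq: usize) -> (usize, usize) {
    let num_binpacked_blocks = doc_freq / COMPRESSION_BLOCK_SIZE;
    // equivalent to doc_freq % COMPRESSION_BLOCK_SIZE for a power-of-two block size
    let num_vint_docs = doc_freq & (COMPRESSION_BLOCK_SIZE - 1);
    (num_binpacked_blocks, num_vint_docs)
}

fn main() {
    assert_eq!(decompose(300), (2, 44)); // two full blocks + 44 VInt docs
    assert_eq!(decompose(128), (1, 0)); // exactly one full block, no tail
    assert_eq!(decompose(5), (0, 5)); // short lists are VInt-only
}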

View File

@@ -16,6 +16,26 @@ pub enum SegmentPostingsOption {
FreqAndPositions,
}
impl SegmentPostingsOption {
/// Returns true iff this option includes encoding
/// term frequencies.
pub fn has_freq(&self) -> bool {
match *self {
SegmentPostingsOption::NoFreq => false,
_ => true,
}
}
/// Returns true iff this option includes encoding
/// term positions.
pub fn has_positions(&self) -> bool {
match *self {
SegmentPostingsOption::FreqAndPositions => true,
_ => false,
}
}
}
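A self-contained restatement of the two predicates above; the invariant worth noticing is that positions are only ever encoded together with term frequencies, so `has_positions()` implies `has_freq()`:

#[derive(Clone, Copy)]
enum SegmentPostingsOption {
    NoFreq,
    Freq,
    FreqAndPositions,
}

impl SegmentPostingsOption {
    fn has_freq(&self) -> bool {
        match *self {
            SegmentPostingsOption::NoFreq => false,
            _ => true,
        }
    }
    fn has_positions(&self) -> bool {
        match *self {
            SegmentPostingsOption::FreqAndPositions => true,
            _ => false,
        }
    }
}

fn main() {
    use SegmentPostingsOption::*;
    for &opt in &[NoFreq, Freq, FreqAndPositions] {
        // positions imply frequencies
        assert!(!opt.has_positions() || opt.has_freq());
    }
}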
#[cfg(test)]
mod tests {

View File

@@ -5,16 +5,14 @@ use schema::Field;
use schema::FieldEntry;
use schema::FieldType;
use schema::Schema;
use schema::TextIndexingOptions;
use directory::WritePtr;
use compression::{NUM_DOCS_PER_BLOCK, BlockEncoder, CompositeEncoder};
use compression::{COMPRESSION_BLOCK_SIZE, BlockEncoder};
use DocId;
use core::Segment;
use std::io::{self, Write};
use compression::VIntEncoder;
use common::VInt;
use common::BinarySerializable;
use common::CountingWriter;
use common::CompositeWrite;
use termdict::TermDictionaryBuilder;
@@ -49,74 +47,127 @@ use termdict::TermDictionaryBuilder;
///
/// A description of the serialization format is
/// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html).
pub struct PostingsSerializer {
terms_fst_builder: TermDictionaryBuilderImpl<WritePtr, TermInfo>,
postings_write: CountingWriter<WritePtr>,
positions_write: CountingWriter<WritePtr>,
last_doc_id_encoded: u32,
positions_encoder: CompositeEncoder,
block_encoder: BlockEncoder,
doc_ids: Vec<DocId>,
term_freqs: Vec<u32>,
position_deltas: Vec<u32>,
pub struct InvertedIndexSerializer {
terms_write: CompositeWrite<WritePtr>,
postings_write: CompositeWrite<WritePtr>,
positions_write: CompositeWrite<WritePtr>,
schema: Schema,
text_indexing_options: TextIndexingOptions,
term_open: bool,
current_term_info: TermInfo,
}
impl PostingsSerializer {
impl InvertedIndexSerializer {
/// Open a new `PostingsSerializer` for the given segment
pub fn new(terms_write: WritePtr,
postings_write: WritePtr,
positions_write: WritePtr,
schema: Schema)
-> Result<PostingsSerializer> {
let terms_fst_builder = try!(TermDictionaryBuilderImpl::new(terms_write));
Ok(PostingsSerializer {
terms_fst_builder: terms_fst_builder,
postings_write: CountingWriter::wrap(postings_write),
positions_write: CountingWriter::wrap(positions_write),
last_doc_id_encoded: 0u32,
positions_encoder: CompositeEncoder::new(),
block_encoder: BlockEncoder::new(),
doc_ids: Vec::new(),
term_freqs: Vec::new(),
position_deltas: Vec::new(),
schema: schema,
text_indexing_options: TextIndexingOptions::Unindexed,
term_open: false,
current_term_info: TermInfo::default(),
})
fn new(
terms_write: CompositeWrite<WritePtr>,
postings_write: CompositeWrite<WritePtr>,
positions_write: CompositeWrite<WritePtr>,
schema: Schema,
) -> Result<InvertedIndexSerializer> {
Ok(InvertedIndexSerializer {
terms_write: terms_write,
postings_write: postings_write,
positions_write: positions_write,
schema: schema,
})
}
/// Open a new `PostingsSerializer` for the given segment
pub fn open(segment: &mut Segment) -> Result<PostingsSerializer> {
pub fn open(segment: &mut Segment) -> Result<InvertedIndexSerializer> {
use SegmentComponent::{TERMS, POSTINGS, POSITIONS};
PostingsSerializer::new(segment.open_write(TERMS)?,
segment.open_write(POSTINGS)?,
segment.open_write(POSITIONS)?,
segment.schema())
InvertedIndexSerializer::new(
CompositeWrite::wrap(segment.open_write(TERMS)?),
CompositeWrite::wrap(segment.open_write(POSTINGS)?),
CompositeWrite::wrap(segment.open_write(POSITIONS)?),
segment.schema(),
)
}
/// Must be called before starting to push the terms of
/// a given field.
///
/// Loads the indexing options for the given field.
pub fn new_field(&mut self, field: Field) {
pub fn new_field(&mut self, field: Field) -> io::Result<FieldSerializer> {
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
self.text_indexing_options = match *field_entry.field_type() {
FieldType::Str(ref text_options) => text_options.get_indexing_options(),
FieldType::U64(ref int_options) |
FieldType::I64(ref int_options) => {
if int_options.is_indexed() {
TextIndexingOptions::Unindexed
} else {
TextIndexingOptions::Untokenized
}
let term_dictionary_write = self.terms_write.for_field(field);
let postings_write = self.postings_write.for_field(field);
let positions_write = self.positions_write.for_field(field);
FieldSerializer::new(
field_entry.field_type().clone(),
term_dictionary_write,
postings_write,
positions_write,
)
}
/// Closes the serializer.
pub fn close(self) -> io::Result<()> {
self.terms_write.close()?;
self.postings_write.close()?;
self.positions_write.close()?;
Ok(())
}
}
/// The field serializer is in charge of
/// the serialization of a specific field.
pub struct FieldSerializer<'a> {
term_dictionary_builder: TermDictionaryBuilderImpl<&'a mut CountingWriter<WritePtr>>,
postings_serializer: PostingsSerializer<&'a mut CountingWriter<WritePtr>>,
positions_serializer_opt: Option<PositionSerializer<&'a mut CountingWriter<WritePtr>>>,
current_term_info: TermInfo,
term_open: bool,
}
impl<'a> FieldSerializer<'a> {
fn new(
field_type: FieldType,
term_dictionary_write: &'a mut CountingWriter<WritePtr>,
postings_write: &'a mut CountingWriter<WritePtr>,
positions_write: &'a mut CountingWriter<WritePtr>,
) -> io::Result<FieldSerializer<'a>> {
let (term_freq_enabled, position_enabled): (bool, bool) = match field_type {
FieldType::Str(ref text_options) => {
let text_indexing_options = text_options.get_indexing_options();
(
text_indexing_options.is_termfreq_enabled(),
text_indexing_options.is_position_enabled(),
)
}
_ => (false, false),
};
let term_dictionary_builder =
TermDictionaryBuilderImpl::new(term_dictionary_write, field_type)?;
let postings_serializer = PostingsSerializer::new(postings_write, term_freq_enabled);
let positions_serializer_opt = if position_enabled {
Some(PositionSerializer::new(positions_write))
} else {
None
};
Ok(FieldSerializer {
term_dictionary_builder: term_dictionary_builder,
postings_serializer: postings_serializer,
positions_serializer_opt: positions_serializer_opt,
current_term_info: TermInfo::default(),
term_open: false,
})
}
fn current_term_info(&self) -> TermInfo {
let (filepos, offset) = self.positions_serializer_opt
.as_ref()
.map(|positions_serializer| positions_serializer.addr())
.unwrap_or((0u32, 0u8));
TermInfo {
doc_freq: 0,
postings_offset: self.postings_serializer.addr(),
positions_offset: filepos,
positions_inner_offset: offset,
}
}
/// Starts the postings for a new term.
@@ -124,72 +175,16 @@ impl PostingsSerializer {
/// to the lexicographical order.
/// * doc_freq - the number of documents containing the term.
pub fn new_term(&mut self, term: &[u8]) -> io::Result<()> {
if self.term_open {
panic!("Called new_term, while the previous term was not closed.");
}
assert!(
!self.term_open,
"Called new_term, while the previous term was not closed."
);
self.term_open = true;
self.doc_ids.clear();
self.last_doc_id_encoded = 0;
self.term_freqs.clear();
self.position_deltas.clear();
self.current_term_info = TermInfo {
doc_freq: 0,
postings_offset: self.postings_write.written_bytes() as u32,
positions_offset: self.positions_write.written_bytes() as u32,
};
self.terms_fst_builder.insert_key(term)
self.postings_serializer.clear();
self.current_term_info = self.current_term_info();
self.term_dictionary_builder.insert_key(term)
}
/// Finish the serialization for this term's postings.
///
/// If the current block is incomplete, it needs to be encoded
/// using `VInt` encoding.
pub fn close_term(&mut self) -> io::Result<()> {
if self.term_open {
self.terms_fst_builder
.insert_value(&self.current_term_info)?;
if !self.doc_ids.is_empty() {
// we have doc ids waiting to be written
// this happens when the number of doc ids is
// not a perfect multiple of our block size.
//
// In that case, the remaining part is encoded
// using variable int encoding.
{
let block_encoded =
self.block_encoder
.compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
self.postings_write.write_all(block_encoded)?;
self.doc_ids.clear();
}
// ... Idem for term frequencies
if self.text_indexing_options.is_termfreq_enabled() {
let block_encoded = self.block_encoder
.compress_vint_unsorted(&self.term_freqs[..]);
for num in block_encoded {
num.serialize(&mut self.postings_write)?;
}
self.term_freqs.clear();
}
}
// On the other hand, positions are entirely buffered until the
// end of the term, at which point they are compressed and written.
if self.text_indexing_options.is_position_enabled() {
let posdelta_len = VInt(self.position_deltas.len() as u64);
posdelta_len.serialize(&mut self.positions_write)?;
let positions_encoded: &[u8] = self.positions_encoder
.compress_unsorted(&self.position_deltas[..]);
self.positions_write.write_all(positions_encoded)?;
self.position_deltas.clear();
}
self.term_open = false;
}
Ok(())
}
/// Serialize the information that a document contains the current term,
/// its term frequency, and the position deltas.
///
@@ -199,32 +194,93 @@ impl PostingsSerializer {
///
/// Term frequencies and positions may be ignored by the serializer depending
/// on the configuration of the field in the `Schema`.
pub fn write_doc(&mut self,
doc_id: DocId,
term_freq: u32,
position_deltas: &[u32])
-> io::Result<()> {
pub fn write_doc(
&mut self,
doc_id: DocId,
term_freq: u32,
position_deltas: &[u32],
) -> io::Result<()> {
self.current_term_info.doc_freq += 1;
self.postings_serializer.write_doc(doc_id, term_freq)?;
if let Some(ref mut positions_serializer) = self.positions_serializer_opt.as_mut() {
positions_serializer.write(position_deltas)?;
}
Ok(())
}
/// Finish the serialization for this term's postings.
///
/// If the current block is incomplete, it needs to be encoded
/// using `VInt` encoding.
pub fn close_term(&mut self) -> io::Result<()> {
if self.term_open {
self.term_dictionary_builder.insert_value(
&self.current_term_info,
)?;
self.postings_serializer.close_term()?;
self.term_open = false;
}
Ok(())
}
/// Closes the current field.
pub fn close(mut self) -> io::Result<()> {
self.close_term()?;
if let Some(positions_serializer) = self.positions_serializer_opt {
positions_serializer.close()?;
}
self.postings_serializer.close()?;
self.term_dictionary_builder.finish()?;
Ok(())
}
}
struct PostingsSerializer<W: Write> {
postings_write: CountingWriter<W>,
last_doc_id_encoded: u32,
block_encoder: BlockEncoder,
doc_ids: Vec<DocId>,
term_freqs: Vec<u32>,
termfreq_enabled: bool,
}
impl<W: Write> PostingsSerializer<W> {
fn new(write: W, termfreq_enabled: bool) -> PostingsSerializer<W> {
PostingsSerializer {
postings_write: CountingWriter::wrap(write),
block_encoder: BlockEncoder::new(),
doc_ids: vec![],
term_freqs: vec![],
last_doc_id_encoded: 0u32,
termfreq_enabled: termfreq_enabled,
}
}
fn write_doc(&mut self, doc_id: DocId, term_freq: u32) -> io::Result<()> {
self.doc_ids.push(doc_id);
if self.text_indexing_options.is_termfreq_enabled() {
if self.termfreq_enabled {
self.term_freqs.push(term_freq as u32);
}
if self.text_indexing_options.is_position_enabled() {
self.position_deltas.extend_from_slice(position_deltas);
}
if self.doc_ids.len() == NUM_DOCS_PER_BLOCK {
if self.doc_ids.len() == COMPRESSION_BLOCK_SIZE {
{
// encode the doc ids
let block_encoded: &[u8] =
self.block_encoder
.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
let block_encoded: &[u8] = self.block_encoder.compress_block_sorted(
&self.doc_ids,
self.last_doc_id_encoded,
);
self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1];
self.postings_write.write_all(block_encoded)?;
}
if self.text_indexing_options.is_termfreq_enabled() {
if self.termfreq_enabled {
// encode the term_freqs
let block_encoded: &[u8] = self.block_encoder
.compress_block_unsorted(&self.term_freqs);
let block_encoded: &[u8] =
self.block_encoder.compress_block_unsorted(&self.term_freqs);
self.postings_write.write_all(block_encoded)?;
self.term_freqs.clear();
}
@@ -233,12 +289,93 @@ impl PostingsSerializer {
Ok(())
}
/// Closes the serializer.
pub fn close(mut self) -> io::Result<()> {
try!(self.close_term());
try!(self.terms_fst_builder.finish());
try!(self.postings_write.flush());
try!(self.positions_write.flush());
fn close_term(&mut self) -> io::Result<()> {
if !self.doc_ids.is_empty() {
// we have doc ids waiting to be written
// this happens when the number of doc ids is
// not a perfect multiple of our block size.
//
// In that case, the remaining part is encoded
// using variable int encoding.
{
let block_encoded = self.block_encoder.compress_vint_sorted(
&self.doc_ids,
self.last_doc_id_encoded,
);
self.postings_write.write_all(block_encoded)?;
self.doc_ids.clear();
}
// ... Idem for term frequencies
if self.termfreq_enabled {
let block_encoded = self.block_encoder.compress_vint_unsorted(
&self.term_freqs[..],
);
self.postings_write.write_all(block_encoded)?;
self.term_freqs.clear();
}
}
Ok(())
}
fn close(mut self) -> io::Result<()> {
self.postings_write.flush()
}
fn addr(&self) -> u32 {
self.postings_write.written_bytes() as u32
}
fn clear(&mut self) {
self.doc_ids.clear();
self.term_freqs.clear();
self.last_doc_id_encoded = 0;
}
}
struct PositionSerializer<W: Write> {
buffer: Vec<u32>,
write: CountingWriter<W>, // See if we can offset the original counting writer.
block_encoder: BlockEncoder,
}
impl<W: Write> PositionSerializer<W> {
fn new(write: W) -> PositionSerializer<W> {
PositionSerializer {
buffer: Vec::with_capacity(COMPRESSION_BLOCK_SIZE),
write: CountingWriter::wrap(write),
block_encoder: BlockEncoder::new(),
}
}
fn addr(&self) -> (u32, u8) {
(self.write.written_bytes() as u32, self.buffer.len() as u8)
}
fn write_block(&mut self) -> io::Result<()> {
assert_eq!(self.buffer.len(), COMPRESSION_BLOCK_SIZE);
let block_compressed: &[u8] = self.block_encoder.compress_block_unsorted(&self.buffer);
self.write.write_all(block_compressed)?;
self.buffer.clear();
Ok(())
}
fn write(&mut self, mut vals: &[u32]) -> io::Result<()> {
let mut buffer_len = self.buffer.len();
while vals.len() + buffer_len >= COMPRESSION_BLOCK_SIZE {
let len_to_completion = COMPRESSION_BLOCK_SIZE - buffer_len;
self.buffer.extend_from_slice(&vals[..len_to_completion]);
self.write_block()?;
vals = &vals[len_to_completion..];
buffer_len = self.buffer.len();
}
self.buffer.extend_from_slice(&vals);
Ok(())
}
fn close(mut self) -> io::Result<()> {
self.buffer.resize(COMPRESSION_BLOCK_SIZE, 0u32);
self.write_block()?;
self.write.flush()
}
}
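`PositionSerializer::write` stages values in a 128-slot buffer and flushes one compressed block each time the buffer fills; `close` zero-pads the final partial block so readers can always decode whole blocks. A simplified sketch of that buffering, with compression replaced by a plain copy and all names illustrative:

const BLOCK: usize = 128;

struct BlockBufferedWriter {
    buffer: Vec<u32>,
    flushed_blocks: Vec<Vec<u32>>, // stands in for the compressed output
}

impl BlockBufferedWriter {
    fn new() -> BlockBufferedWriter {
        BlockBufferedWriter {
            buffer: Vec::with_capacity(BLOCK),
            flushed_blocks: vec![],
        }
    }

    fn write_block(&mut self) {
        assert_eq!(self.buffer.len(), BLOCK);
        self.flushed_blocks.push(self.buffer.clone());
        self.buffer.clear();
    }

    fn write(&mut self, mut vals: &[u32]) {
        // Flush as many full blocks as the incoming slice completes.
        while vals.len() + self.buffer.len() >= BLOCK {
            let len_to_completion = BLOCK - self.buffer.len();
            self.buffer.extend_from_slice(&vals[..len_to_completion]);
            self.write_block();
            vals = &vals[len_to_completion..];
        }
        self.buffer.extend_from_slice(vals);
    }

    fn close(mut self) -> Vec<Vec<u32>> {
        // Zero-pad the trailing partial block, as `PositionSerializer::close` does.
        self.buffer.resize(BLOCK, 0);
        self.write_block();
        self.flushed_blocks
    }
}

fn main() {
    let mut w = BlockBufferedWriter::new();
    w.write(&vec![7u32; 200]); // 200 values: one full block + 72 buffered
    let blocks = w.close();    // padding turns the 72 leftovers into a block
    assert_eq!(blocks.len(), 2);
    assert_eq!(&blocks[1][..72], &vec![7u32; 72][..]);
    assert_eq!(&blocks[1][72..], &vec![0u32; 56][..]);
}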

View File

@@ -12,7 +12,7 @@ use std::io;
/// * `postings_offset` : an offset in the `.idx` file
/// addressing the start of the posting list associated
/// to this term.
#[derive(Debug,Default,Ord,PartialOrd,Eq,PartialEq,Clone)]
#[derive(Debug, Default, Ord, PartialOrd, Eq, PartialEq, Clone)]
pub struct TermInfo {
/// Number of documents in the segment containing the term
pub doc_freq: u32,
@@ -20,6 +20,8 @@ pub struct TermInfo {
pub postings_offset: u32,
/// Offset within the position (`.pos`) file.
pub positions_offset: u32,
/// Offset within the position block.
pub positions_inner_offset: u8,
}
@@ -27,17 +29,20 @@ impl BinarySerializable for TermInfo {
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
self.doc_freq.serialize(writer)?;
self.postings_offset.serialize(writer)?;
self.positions_offset.serialize(writer)
self.positions_offset.serialize(writer)?;
self.positions_inner_offset.serialize(writer)
}
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let doc_freq = try!(u32::deserialize(reader));
let postings_offset = try!(u32::deserialize(reader));
let positions_offset = try!(u32::deserialize(reader));
let doc_freq = u32::deserialize(reader)?;
let postings_offset = u32::deserialize(reader)?;
let positions_offset = u32::deserialize(reader)?;
let positions_inner_offset = u8::deserialize(reader)?;
Ok(TermInfo {
doc_freq: doc_freq,
postings_offset: postings_offset,
positions_offset: positions_offset,
})
doc_freq: doc_freq,
postings_offset: postings_offset,
positions_offset: positions_offset,
positions_inner_offset: positions_inner_offset,
})
}
}
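The new `positions_inner_offset` makes the positions address two-level: a byte offset to a compressed block plus an index inside the decoded block, which stays below 128 and therefore fits in a `u8`. A standalone round-trip sketch of the 13-byte fixed layout (the byte order here is illustrative, not necessarily tantivy's):

use std::io::{self, Read, Write};

#[derive(Debug, Default, PartialEq)]
struct TermInfo {
    doc_freq: u32,
    postings_offset: u32,
    positions_offset: u32,
    positions_inner_offset: u8, // < 128: an index inside one decoded block
}

impl TermInfo {
    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        writer.write_all(&self.doc_freq.to_le_bytes())?;
        writer.write_all(&self.postings_offset.to_le_bytes())?;
        writer.write_all(&self.positions_offset.to_le_bytes())?;
        writer.write_all(&[self.positions_inner_offset])
    }

    fn deserialize<R: Read>(reader: &mut R) -> io::Result<TermInfo> {
        let mut buf = [0u8; 13];
        reader.read_exact(&mut buf)?;
        let u32_at = |i: usize| u32::from_le_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]);
        Ok(TermInfo {
            doc_freq: u32_at(0),
            postings_offset: u32_at(4),
            positions_offset: u32_at(8),
            positions_inner_offset: buf[12],
        })
    }
}

fn main() -> io::Result<()> {
    let info = TermInfo {
        doc_freq: 42,
        postings_offset: 1024,
        positions_offset: 4096,
        positions_inner_offset: 17,
    };
    let mut bytes = Vec::new();
    info.serialize(&mut bytes)?;
    assert_eq!(bytes.len(), 13);
    assert_eq!(TermInfo::deserialize(&mut bytes.as_slice())?, info);
    Ok(())
}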

View File

@@ -37,10 +37,12 @@ impl Query for BooleanQuery {
}
fn weight(&self, searcher: &Searcher) -> Result<Box<Weight>> {
let sub_weights = try!(self.subqueries
.iter()
.map(|&(ref _occur, ref subquery)| subquery.weight(searcher))
.collect());
let sub_weights = try!(
self.subqueries
.iter()
.map(|&(ref _occur, ref subquery)| subquery.weight(searcher))
.collect()
);
let occurs: Vec<Occur> = self.subqueries
.iter()
.map(|&(ref occur, ref _subquery)| *occur)
@@ -57,10 +59,9 @@ impl BooleanQuery {
let occur_term_queries: Vec<(Occur, Box<Query>)> = terms
.into_iter()
.map(|term| {
let term_query: Box<Query> = box TermQuery::new(term,
SegmentPostingsOption::Freq);
(Occur::Should, term_query)
})
let term_query: Box<Query> = box TermQuery::new(term, SegmentPostingsOption::Freq);
(Occur::Should, term_query)
})
.collect();
BooleanQuery::from(occur_term_queries)
}

View File

@@ -55,11 +55,11 @@ impl<TScorer: Scorer> BooleanScorer<TScorer> {
.map(|posting| posting.doc())
.enumerate()
.map(|(ord, doc)| {
HeapItem {
doc: doc,
ord: ord as u32,
}
})
HeapItem {
doc: doc,
ord: ord as u32,
}
})
.collect();
BooleanScorer {
scorers: non_empty_scorers,

View File

@@ -22,11 +22,12 @@ impl BooleanWeight {
impl Weight for BooleanWeight {
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
let sub_scorers: Vec<Box<Scorer + 'a>> =
try!(self.weights
.iter()
.map(|weight| weight.scorer(reader))
.collect());
let sub_scorers: Vec<Box<Scorer + 'a>> = try!(
self.weights
.iter()
.map(|weight| weight.scorer(reader))
.collect()
);
let boolean_scorer = BooleanScorer::new(sub_scorers, self.occur_filter);
Ok(box boolean_scorer)
}

View File

@@ -64,8 +64,10 @@ mod tests {
}
let make_term_query = |text: &str| {
let term_query = TermQuery::new(Term::from_field_text(text_field, text),
SegmentPostingsOption::NoFreq);
let term_query = TermQuery::new(
Term::from_field_text(text_field, text),
SegmentPostingsOption::NoFreq,
);
let query: Box<Query> = box term_query;
query
};
@@ -87,19 +89,25 @@ mod tests {
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]);
}
{
let boolean_query = BooleanQuery::from(vec![(Occur::Should, make_term_query("a")),
(Occur::Should, make_term_query("b"))]);
let boolean_query = BooleanQuery::from(vec![
(Occur::Should, make_term_query("a")),
(Occur::Should, make_term_query("b")),
]);
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 2, 3]);
}
{
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")),
(Occur::Should, make_term_query("b"))]);
let boolean_query = BooleanQuery::from(vec![
(Occur::Must, make_term_query("a")),
(Occur::Should, make_term_query("b")),
]);
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]);
}
{
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")),
(Occur::Should, make_term_query("b")),
(Occur::MustNot, make_term_query("d"))]);
let boolean_query = BooleanQuery::from(vec![
(Occur::Must, make_term_query("a")),
(Occur::Should, make_term_query("b")),
(Occur::MustNot, make_term_query("d")),
]);
assert_eq!(matching_docs(&boolean_query), vec![0, 1]);
}
{

View File

@@ -1,7 +1,8 @@
/// Query module
///
/// The query module regroups all of tantivy's query objects
///
/*!
Query module
The query module groups all of tantivy's query objects.
*/
mod query;
mod boolean_query;

View File

@@ -2,7 +2,7 @@ use query::Occur;
/// An `OccurFilter` represents a filter over a bitset of
// at most 64 elements.
/// at most 64 elements.
///
/// It wraps some simple bitmask to compute the filter
/// rapidly.
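One way such a bitmask filter can be implemented (a hedged sketch, not necessarily tantivy's exact code): bit i of a `u64` is set when clause i matches the candidate document, and the Must/MustNot constraints reduce to a single mask-and-compare:

#[derive(Clone, Copy)]
enum Occur {
    Should,
    Must,
    MustNot,
}

struct OccurFilter {
    and_mask: u64, // bits we constrain (Must and MustNot clauses)
    expected: u64, // the value those bits must take (1 for Must, 0 for MustNot)
}

impl OccurFilter {
    fn new(occurs: &[Occur]) -> OccurFilter {
        assert!(occurs.len() <= 64);
        let mut and_mask = 0u64;
        let mut expected = 0u64;
        for (i, occur) in occurs.iter().enumerate() {
            let bit = 1u64 << i;
            match *occur {
                Occur::Must => {
                    and_mask |= bit;
                    expected |= bit;
                }
                Occur::MustNot => and_mask |= bit,
                Occur::Should => {}
            }
        }
        OccurFilter { and_mask: and_mask, expected: expected }
    }

    fn accept(&self, matching: u64) -> bool {
        (matching & self.and_mask) == self.expected
    }
}

fn main() {
    // clause 0: Must, clause 1: Should, clause 2: MustNot
    let filter = OccurFilter::new(&[Occur::Must, Occur::Should, Occur::MustNot]);
    assert!(filter.accept(0b001));  // the Must clause matched
    assert!(filter.accept(0b011));  // Must + Should
    assert!(!filter.accept(0b110)); // MustNot matched: rejected
    assert!(!filter.accept(0b010)); // Must missing: rejected
}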

View File

@@ -61,9 +61,9 @@ mod tests {
.map(|text| Term::from_field_text(text_field, text))
.collect();
let phrase_query = PhraseQuery::from(terms);
searcher
.search(&phrase_query, &mut test_collector)
.expect("search should succeed");
searcher.search(&phrase_query, &mut test_collector).expect(
"search should succeed",
);
test_collector.docs()
};

View File

@@ -7,7 +7,8 @@ use query::Weight;
use Result;
/// `PhraseQuery` matches a specific sequence of word.
/// `PhraseQuery` matches a specific sequence of words.
///
/// For instance the phrase query for `"part time"` will match
/// the sentence
///

View File

@@ -5,12 +5,12 @@ use postings::Postings;
use postings::IntersectionDocSet;
use DocId;
pub struct PhraseScorer<'a> {
pub intersection_docset: IntersectionDocSet<SegmentPostings<'a>>,
pub struct PhraseScorer {
pub intersection_docset: IntersectionDocSet<SegmentPostings>,
}
impl<'a> PhraseScorer<'a> {
impl PhraseScorer {
fn phrase_match(&self) -> bool {
let mut positions_arr: Vec<&[u32]> = self.intersection_docset
.docsets()
@@ -54,7 +54,7 @@ impl<'a> PhraseScorer<'a> {
}
}
impl<'a> DocSet for PhraseScorer<'a> {
impl DocSet for PhraseScorer {
fn advance(&mut self) -> bool {
while self.intersection_docset.advance() {
if self.phrase_match() {
@@ -74,7 +74,7 @@ impl<'a> DocSet for PhraseScorer<'a> {
}
impl<'a> Scorer for PhraseScorer<'a> {
impl Scorer for PhraseScorer {
fn score(&self) -> f32 {
1f32
}
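The scorer above advances the intersection of the terms' doc sets and keeps a document only when `phrase_match` finds the terms at consecutive positions. A standalone sketch of that position check, assuming each slice holds the sorted positions of one phrase term (the real `phrase_match` applies the same idea to the intersected postings, in a more optimized form):

fn phrase_match(positions_per_term: &[&[u32]]) -> bool {
    match positions_per_term.split_first() {
        None => false,
        // the phrase matches if some position p of the first term
        // has term i occurring at position p + i
        Some((first, rest)) => first.iter().any(|&p| {
            rest.iter()
                .enumerate()
                .all(|(i, positions)| positions.contains(&(p + 1 + i as u32)))
        }),
    }
}

fn main() {
    // "part time": "part" at 3, "time" at 4 -> adjacent -> match
    assert!(phrase_match(&[&[0, 3], &[4, 9]]));
    // no adjacent pair -> no match
    assert!(!phrase_match(&[&[0, 3], &[2, 9]]));
}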

View File

@@ -22,14 +22,17 @@ impl Weight for PhraseWeight {
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
let mut term_postings_list = Vec::new();
for term in &self.phrase_terms {
let inverted_index = reader.inverted_index(term.field());
let term_postings_option =
reader.read_postings(term, SegmentPostingsOption::FreqAndPositions);
inverted_index.read_postings(term, SegmentPostingsOption::FreqAndPositions);
if let Some(term_postings) = term_postings_option {
term_postings_list.push(term_postings);
} else {
return Ok(box EmptyScorer);
}
}
Ok(box PhraseScorer { intersection_docset: IntersectionDocSet::from(term_postings_list) })
Ok(box PhraseScorer {
intersection_docset: IntersectionDocSet::from(term_postings_list),
})
}
}

View File

@@ -8,7 +8,10 @@ use std::fmt;
use std::any::Any;
/// Query trait are in charge of defining :
/// The `Query` trait defines a set of documents and a scoring method
/// for those documents.
///
/// The `Query` trait is in charge of defining:
///
/// - a set of documents
/// - a way to score these documents
@@ -58,17 +61,18 @@ pub trait Query: fmt::Debug {
/// - iterate through the matched documents and push them to the collector.
///
fn search(&self, searcher: &Searcher, collector: &mut Collector) -> Result<TimerTree> {
let mut timer_tree = TimerTree::default();
let weight = try!(self.weight(searcher));
{
let mut search_timer = timer_tree.open("search");
for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() {
let mut segment_search_timer = search_timer.open("segment_search");
{
let _ = segment_search_timer.open("set_segment");
try!(collector.set_segment(segment_ord as SegmentLocalId, segment_reader));
try!(collector.set_segment(
segment_ord as SegmentLocalId,
segment_reader,
));
}
let mut scorer = try!(weight.scorer(segment_reader));
{
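In user code this whole flow (weight, then one scorer per segment, each feeding the collector) is reached through the searcher. A hedged end-to-end sketch against the public API of this era; names such as `TopCollector`, `create_in_ram`, and the `doc!` macro are assumed from the period's examples, so treat the signatures as approximate:

#[macro_use]
extern crate tantivy;
use tantivy::collector::TopCollector;
use tantivy::query::QueryParser;
use tantivy::schema::{SchemaBuilder, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = SchemaBuilder::default();
    let title = schema_builder.add_text_field("title", TEXT);
    let schema = schema_builder.build();

    let index = Index::create_in_ram(schema.clone());
    let mut writer = index.writer(50_000_000)?;
    let _ = writer.add_document(doc!(title => "The Old Man and the Sea"));
    writer.commit()?;
    index.load_searchers()?;

    let searcher = index.searcher();
    let query_parser = QueryParser::new(schema, vec![title]);
    let query = query_parser.parse_query("old sea").expect("invalid query");

    // `search` builds a Weight, then a Scorer per segment, and feeds
    // every matching doc to the collector -- the loop shown above.
    let mut collector = TopCollector::with_limit(10);
    searcher.search(&*query, &mut collector)?;
    Ok(())
}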

View File

@@ -3,7 +3,8 @@ use combine::char::*;
use super::user_input_ast::*;
fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
where I: Stream<Item = char>
where
I: Stream<Item = char>,
{
let term_val = || {
let word = many1(satisfy(|c: char| c.is_alphanumeric()));
@@ -11,27 +12,29 @@ fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
phrase.or(word)
};
let negative_numbers = (char('-'), many1(satisfy(|c: char| c.is_numeric())))
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
let negative_numbers = (char('-'), many1(satisfy(|c: char| c.is_numeric()))).map(
|(s1, s2): (char, String)| format!("{}{}", s1, s2),
);
let field = (letter(), many(satisfy(|c: char| c.is_alphanumeric() || c == '_')))
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
let field = (
letter(),
many(satisfy(|c: char| c.is_alphanumeric() || c == '_')),
).map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
let term_val_with_field = negative_numbers.or(term_val());
let term_query = (field, char(':'), term_val_with_field).map(|(field_name, _, phrase)| {
UserInputLiteral {
field_name:
Some(field_name),
phrase: phrase,
}
});
UserInputLiteral {
field_name: Some(field_name),
phrase: phrase,
}
});
let term_default_field = term_val().map(|phrase| {
UserInputLiteral {
field_name: None,
phrase: phrase,
}
});
UserInputLiteral {
field_name: None,
phrase: phrase,
}
});
try(term_query)
.or(term_default_field)
.map(UserInputAST::from)
@@ -40,25 +43,29 @@ fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
fn leaf<I>(input: I) -> ParseResult<UserInputAST, I>
where I: Stream<Item = char>
where
I: Stream<Item = char>,
{
(char('-'), parser(literal))
.map(|(_, expr)| UserInputAST::Not(box expr))
.or((char('+'), parser(literal)).map(|(_, expr)| UserInputAST::Must(box expr)))
.or((char('+'), parser(literal)).map(|(_, expr)| {
UserInputAST::Must(box expr)
}))
.or(parser(literal))
.parse_stream(input)
}
pub fn parse_to_ast<I>(input: I) -> ParseResult<UserInputAST, I>
where I: Stream<Item = char>
where
I: Stream<Item = char>,
{
sep_by(parser(leaf), spaces())
.map(|subqueries: Vec<UserInputAST>| if subqueries.len() == 1 {
subqueries.into_iter().next().unwrap()
} else {
UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect())
})
subqueries.into_iter().next().unwrap()
} else {
UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect())
})
.parse_stream(input)
}

View File

@@ -124,32 +124,35 @@ impl QueryParser {
}
/// Parse the user query into an AST.
fn parse_query_to_logical_ast(&mut self, query: &str) -> Result<LogicalAST, QueryParserError> {
let (user_input_ast, _remaining) = parse_to_ast(query)
.map_err(|_| QueryParserError::SyntaxError)?;
fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAST, QueryParserError> {
let (user_input_ast, _remaining) = parse_to_ast(query).map_err(
|_| QueryParserError::SyntaxError,
)?;
self.compute_logical_ast(user_input_ast)
}
fn resolve_field_name(&self, field_name: &str) -> Result<Field, QueryParserError> {
self.schema
.get_field(field_name)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(String::from(field_name)))
self.schema.get_field(field_name).ok_or_else(|| {
QueryParserError::FieldDoesNotExist(String::from(field_name))
})
}
fn compute_logical_ast(&mut self,
user_input_ast: UserInputAST)
-> Result<LogicalAST, QueryParserError> {
fn compute_logical_ast(
&self,
user_input_ast: UserInputAST,
) -> Result<LogicalAST, QueryParserError> {
let (occur, ast) = self.compute_logical_ast_with_occur(user_input_ast)?;
if occur == Occur::MustNot {
return Err(QueryParserError::AllButQueryForbidden);
}
Ok(ast)
}
fn compute_logical_ast_for_leaf(&mut self,
field: Field,
phrase: &str)
-> Result<Option<LogicalLiteral>, QueryParserError> {
fn compute_logical_ast_for_leaf(
&self,
field: Field,
phrase: &str,
) -> Result<Option<LogicalLiteral>, QueryParserError> {
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
@@ -187,7 +190,9 @@ impl QueryParser {
if terms.is_empty() {
Ok(None)
} else if terms.len() == 1 {
Ok(Some(LogicalLiteral::Term(terms.into_iter().next().unwrap())))
Ok(Some(
LogicalLiteral::Term(terms.into_iter().next().unwrap()),
))
} else {
Ok(Some(LogicalLiteral::Phrase(terms)))
}
@@ -204,18 +209,26 @@ impl QueryParser {
}
}
fn compute_logical_ast_with_occur(&mut self,
user_input_ast: UserInputAST)
-> Result<(Occur, LogicalAST), QueryParserError> {
fn compute_logical_ast_with_occur(
&self,
user_input_ast: UserInputAST,
) -> Result<(Occur, LogicalAST), QueryParserError> {
match user_input_ast {
UserInputAST::Clause(sub_queries) => {
let default_occur = self.default_occur();
let logical_sub_queries: Vec<(Occur, LogicalAST)> = try!(sub_queries.into_iter()
.map(|sub_query| self.compute_logical_ast_with_occur(*sub_query))
.map(|res| {
res.map(|(occur, sub_ast)| (compose_occur(default_occur, occur), sub_ast))
})
.collect());
let logical_sub_queries: Vec<(Occur, LogicalAST)> = try!(
sub_queries
.into_iter()
.map(|sub_query| self.compute_logical_ast_with_occur(*sub_query))
.map(|res| {
res.map(|(occur, sub_ast)| {
(compose_occur(default_occur, occur), sub_ast)
})
})
.collect()
);
Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries)))
}
UserInputAST::Not(subquery) => {
@@ -335,9 +348,10 @@ mod test {
}
fn parse_query_to_logical_ast(query: &str,
default_conjunction: bool)
-> Result<LogicalAST, QueryParserError> {
fn parse_query_to_logical_ast(
query: &str,
default_conjunction: bool,
) -> Result<LogicalAST, QueryParserError> {
let mut query_parser = make_query_parser();
if default_conjunction {
query_parser.set_conjunction_by_default();
@@ -345,9 +359,11 @@ mod test {
query_parser.parse_query_to_logical_ast(query)
}
fn test_parse_query_to_logical_ast_helper(query: &str,
expected: &str,
default_conjunction: bool) {
fn test_parse_query_to_logical_ast_helper(
query: &str,
expected: &str,
default_conjunction: bool,
) {
let query = parse_query_to_logical_ast(query, default_conjunction).unwrap();
let query_str = format!("{:?}", query);
assert_eq!(query_str, expected);
@@ -373,21 +389,29 @@ mod test {
}
};
assert_eq!(is_not_indexed_err("notindexed_text:titi"),
Some(String::from("notindexed_text")));
assert_eq!(is_not_indexed_err("notindexed_u64:23424"),
Some(String::from("notindexed_u64")));
assert_eq!(is_not_indexed_err("notindexed_i64:-234324"),
Some(String::from("notindexed_i64")));
assert_eq!(
is_not_indexed_err("notindexed_text:titi"),
Some(String::from("notindexed_text"))
);
assert_eq!(
is_not_indexed_err("notindexed_u64:23424"),
Some(String::from("notindexed_u64"))
);
assert_eq!(
is_not_indexed_err("notindexed_i64:-234324"),
Some(String::from("notindexed_i64"))
);
}
#[test]
pub fn test_parse_query_untokenized() {
test_parse_query_to_logical_ast_helper("nottokenized:\"wordone wordtwo\"",
"Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, \
test_parse_query_to_logical_ast_helper(
"nottokenized:\"wordone wordtwo\"",
"Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, \
101, 32, 119, 111, 114, 100, 116, 119, 111])",
false);
false,
);
}
#[test]
@@ -396,82 +420,115 @@ mod test {
assert!(query_parser.parse_query("signed:2324").is_ok());
assert!(query_parser.parse_query("signed:\"22\"").is_ok());
assert!(query_parser.parse_query("signed:\"-2234\"").is_ok());
assert!(query_parser
.parse_query("signed:\"-9999999999999\"")
.is_ok());
assert!(
query_parser
.parse_query("signed:\"-9999999999999\"")
.is_ok()
);
assert!(query_parser.parse_query("signed:\"a\"").is_err());
assert!(query_parser.parse_query("signed:\"2a\"").is_err());
assert!(query_parser
.parse_query("signed:\"18446744073709551615\"")
.is_err());
assert!(
query_parser
.parse_query("signed:\"18446744073709551615\"")
.is_err()
);
assert!(query_parser.parse_query("unsigned:\"2\"").is_ok());
assert!(query_parser.parse_query("unsigned:\"-2\"").is_err());
assert!(query_parser
.parse_query("unsigned:\"18446744073709551615\"")
.is_ok());
test_parse_query_to_logical_ast_helper("unsigned:2324",
"Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",
false);
assert!(
query_parser
.parse_query("unsigned:\"18446744073709551615\"")
.is_ok()
);
test_parse_query_to_logical_ast_helper(
"unsigned:2324",
"Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",
false,
);
test_parse_query_to_logical_ast_helper("signed:-2324",
&format!("{:?}",
Term::from_field_i64(Field(2u32), -2324)),
false);
test_parse_query_to_logical_ast_helper(
"signed:-2324",
&format!("{:?}", Term::from_field_i64(Field(2u32), -2324)),
false,
);
}
#[test]
pub fn test_parse_query_to_ast_disjunction() {
test_parse_query_to_logical_ast_helper("title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
false);
test_parse_query_to_logical_ast_helper("+title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
false);
test_parse_query_to_logical_ast_helper("+title:toto -titi",
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
test_parse_query_to_logical_ast_helper(
"title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
false,
);
test_parse_query_to_logical_ast_helper(
"+title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
false,
);
test_parse_query_to_logical_ast_helper(
"+title:toto -titi",
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
-(Term([0, 0, 0, 0, 116, 105, 116, 105]) \
Term([0, 0, 0, 1, 116, 105, 116, 105])))",
false);
assert_eq!(parse_query_to_logical_ast("-title:toto", false)
.err()
.unwrap(),
QueryParserError::AllButQueryForbidden);
test_parse_query_to_logical_ast_helper("title:a b",
"(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) \
false,
);
assert_eq!(
parse_query_to_logical_ast("-title:toto", false)
.err()
.unwrap(),
QueryParserError::AllButQueryForbidden
);
test_parse_query_to_logical_ast_helper(
"title:a b",
"(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) \
Term([0, 0, 0, 1, 98])))",
false);
test_parse_query_to_logical_ast_helper("title:\"a b\"",
"\"[Term([0, 0, 0, 0, 97]), \
false,
);
test_parse_query_to_logical_ast_helper(
"title:\"a b\"",
"\"[Term([0, 0, 0, 0, 97]), \
Term([0, 0, 0, 0, 98])]\"",
false);
false,
);
}
#[test]
pub fn test_parse_query_to_ast_conjunction() {
test_parse_query_to_logical_ast_helper("title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
true);
test_parse_query_to_logical_ast_helper("+title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
true);
test_parse_query_to_logical_ast_helper("+title:toto -titi",
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
test_parse_query_to_logical_ast_helper(
"title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
true,
);
test_parse_query_to_logical_ast_helper(
"+title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
true,
);
test_parse_query_to_logical_ast_helper(
"+title:toto -titi",
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
-(Term([0, 0, 0, 0, 116, 105, 116, 105]) \
Term([0, 0, 0, 1, 116, 105, 116, 105])))",
true);
assert_eq!(parse_query_to_logical_ast("-title:toto", true)
.err()
.unwrap(),
QueryParserError::AllButQueryForbidden);
test_parse_query_to_logical_ast_helper("title:a b",
"(+Term([0, 0, 0, 0, 97]) \
true,
);
assert_eq!(
parse_query_to_logical_ast("-title:toto", true)
.err()
.unwrap(),
QueryParserError::AllButQueryForbidden
);
test_parse_query_to_logical_ast_helper(
"title:a b",
"(+Term([0, 0, 0, 0, 97]) \
+(Term([0, 0, 0, 0, 98]) \
Term([0, 0, 0, 1, 98])))",
true);
test_parse_query_to_logical_ast_helper("title:\"a b\"",
"\"[Term([0, 0, 0, 0, 97]), \
true,
);
test_parse_query_to_logical_ast_helper(
"title:\"a b\"",
"\"[Term([0, 0, 0, 0, 97]), \
Term([0, 0, 0, 0, 98])]\"",
true);
true,
);
}
}

View File

@@ -44,8 +44,10 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let term_query = TermQuery::new(Term::from_field_text(text_field, "a"),
SegmentPostingsOption::NoFreq);
let term_query = TermQuery::new(
Term::from_field_text(text_field, "a"),
SegmentPostingsOption::NoFreq,
);
let term_weight = term_query.weight(&searcher).unwrap();
let segment_reader = searcher.segment_reader(0);
let mut term_scorer = term_weight.scorer(segment_reader).unwrap();

View File

@@ -7,7 +7,8 @@ use postings::Postings;
use fastfield::FastFieldReader;
pub struct TermScorer<TPostings>
where TPostings: Postings
where
TPostings: Postings,
{
pub idf: Score,
pub fieldnorm_reader_opt: Option<U64FastFieldReader>,
@@ -15,7 +16,8 @@ pub struct TermScorer<TPostings>
}
impl<TPostings> TermScorer<TPostings>
where TPostings: Postings
where
TPostings: Postings,
{
pub fn postings(&self) -> &TPostings {
&self.postings
@@ -23,7 +25,8 @@ impl<TPostings> TermScorer<TPostings>
}
impl<TPostings> DocSet for TermScorer<TPostings>
where TPostings: Postings
where
TPostings: Postings,
{
fn advance(&mut self) -> bool {
self.postings.advance()
@@ -40,7 +43,8 @@ impl<TPostings> DocSet for TermScorer<TPostings>
}
impl<TPostings> Scorer for TermScorer<TPostings>
where TPostings: Postings
where
TPostings: Postings,
{
fn score(&self) -> Score {
let doc = self.postings.doc();

View File

@@ -27,24 +27,28 @@ impl TermWeight {
1.0 + (self.num_docs as f32 / (self.doc_freq as f32 + 1.0)).ln()
}
pub fn specialized_scorer<'a>(&'a self,
reader: &'a SegmentReader)
-> Result<TermScorer<SegmentPostings<'a>>> {
/// If the field is not found, returns an empty `DocSet`.
pub fn specialized_scorer(
&self,
reader: &SegmentReader,
) -> Result<TermScorer<SegmentPostings>> {
let field = self.term.field();
let inverted_index = reader.inverted_index(field);
let fieldnorm_reader_opt = reader.get_fieldnorms_reader(field);
Ok(reader
.read_postings(&self.term, self.segment_postings_options)
.map(|segment_postings| {
TermScorer {
idf: self.idf(),
fieldnorm_reader_opt: fieldnorm_reader_opt,
postings: segment_postings,
}
})
.unwrap_or(TermScorer {
idf: 1f32,
fieldnorm_reader_opt: None,
postings: SegmentPostings::empty(),
}))
let postings_opt: Option<SegmentPostings> =
inverted_index.read_postings(&self.term, self.segment_postings_options);
if let Some(segment_postings) = postings_opt {
Ok(TermScorer {
idf: self.idf(),
fieldnorm_reader_opt: fieldnorm_reader_opt,
postings: segment_postings,
})
} else {
Ok(TermScorer {
idf: 1f32,
fieldnorm_reader_opt: None,
postings: SegmentPostings::empty(),
})
}
}
}
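The `idf` above is `1 + ln(num_docs / (doc_freq + 1))`. A quick worked example of how it behaves at the extremes:

fn idf(num_docs: u64, doc_freq: u64) -> f32 {
    1.0 + (num_docs as f32 / (doc_freq as f32 + 1.0)).ln()
}

fn main() {
    // a term present in nearly every doc is barely informative: idf ~ 1
    assert!(idf(1000, 999) < 1.01);
    // a rare term gets a large boost: 1 + ln(100) ~ 5.6
    assert!(idf(1000, 9) > 5.0);
    println!("common: {:.3}, rare: {:.3}", idf(1000, 999), idf(1000, 9));
}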

View File

@@ -10,7 +10,7 @@ use common::BinarySerializable;
///
/// The field id is a `u32` (older versions used a `u8`,
/// which capped an index at 255 fields; that limit no longer applies).
#[derive(Copy, Clone, Debug, PartialEq,PartialOrd,Eq,Ord,Hash, Serialize, Deserialize)]
#[derive(Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash, Serialize, Deserialize)]
pub struct Field(pub u32);
impl BinarySerializable for Field {

View File

@@ -89,7 +89,8 @@ impl FieldEntry {
impl Serialize for FieldEntry {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer
where
S: Serializer,
{
let mut s = serializer.serialize_struct("field_entry", 3)?;
s.serialize_field("name", &self.name)?;
@@ -115,7 +116,8 @@ impl Serialize for FieldEntry {
impl<'de> Deserialize<'de> for FieldEntry {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de>
where
D: Deserializer<'de>,
{
#[derive(Deserialize)]
#[serde(field_identifier, rename_all = "lowercase")]
@@ -137,7 +139,8 @@ impl<'de> Deserialize<'de> for FieldEntry {
}
fn visit_map<V>(self, mut map: V) -> Result<FieldEntry, V::Error>
where V: MapAccess<'de>
where
V: MapAccess<'de>,
{
let mut name = None;
let mut ty = None;
@@ -187,13 +190,14 @@ impl<'de> Deserialize<'de> for FieldEntry {
let name = name.ok_or_else(|| de::Error::missing_field("name"))?;
ty.ok_or_else(|| de::Error::missing_field("ty"))?;
let field_type = field_type
.ok_or_else(|| de::Error::missing_field("options"))?;
let field_type = field_type.ok_or_else(
|| de::Error::missing_field("options"),
)?;
Ok(FieldEntry {
name: name,
field_type: field_type,
})
name: name,
field_type: field_type,
})
}
}

View File

@@ -9,7 +9,7 @@ use schema::TextIndexingOptions;
/// At this point the JSON is known to be valid.
#[derive(Debug)]
pub enum ValueParsingError {
/// Encounterred a numerical value that overflows or underflow its integer type.
/// Encountered a numerical value that overflows or underflows its integer type.
OverflowError(String),
/// The json node is not of the correct type.
/// (e.g. 3 for a `Str` type or `"abc"` for a u64 type)
@@ -80,8 +80,9 @@ impl FieldType {
FieldType::Str(_) => Ok(Value::Str(field_text.clone())),
FieldType::U64(_) |
FieldType::I64(_) => {
Err(ValueParsingError::TypeError(format!("Expected an integer, got {:?}",
json)))
Err(ValueParsingError::TypeError(
format!("Expected an integer, got {:?}", json),
))
}
}
}
@@ -110,9 +111,11 @@ impl FieldType {
}
}
_ => {
let msg = format!("Json value not supported error {:?}. Expected {:?}",
json,
self);
let msg = format!(
"Json value not supported error {:?}. Expected {:?}",
json,
self
);
Err(ValueParsingError::TypeError(msg))
}
}

View File

@@ -1,7 +1,7 @@
/*!
# Schema definition
# Setting your schema in Tantivy
Tantivy has a very strict schema.
The schema defines information about the fields your index contains, that is, for each field:

View File

@@ -105,9 +105,9 @@ impl SchemaBuilder {
/// This will consume your `SchemaBuilder`
pub fn build(self) -> Schema {
Schema(Arc::new(InnerSchema {
fields: self.fields,
fields_map: self.fields_map,
}))
fields: self.fields,
fields_map: self.fields_map,
}))
}
}
@@ -206,15 +206,14 @@ impl Schema {
/// Build a document object from a json-object.
pub fn parse_document(&self, doc_json: &str) -> Result<Document, DocParsingError> {
let json_obj: JsonObject<String, JsonValue> = serde_json::from_str(doc_json)
.map_err(|_| {
let doc_json_sample: String = if doc_json.len() < 20 {
String::from(doc_json)
} else {
format!("{:?}...", &doc_json[0..20])
};
DocParsingError::NotJSON(doc_json_sample)
})?;
let json_obj: JsonObject<String, JsonValue> = serde_json::from_str(doc_json).map_err(|_| {
let doc_json_sample: String = if doc_json.len() < 20 {
String::from(doc_json)
} else {
format!("{:?}...", &doc_json[0..20])
};
DocParsingError::NotJSON(doc_json_sample)
})?;
let mut doc = Document::default();
for (field_name, json_value) in json_obj.iter() {
@@ -225,18 +224,15 @@ impl Schema {
match *json_value {
JsonValue::Array(ref json_items) => {
for json_item in json_items {
let value = try!(field_type
.value_from_json(json_item)
.map_err(|e| {
DocParsingError::ValueError(field_name.clone(), e)
}));
let value =
try!(field_type.value_from_json(json_item).map_err(|e| {
DocParsingError::ValueError(field_name.clone(), e)
}));
doc.add(FieldValue::new(field, value));
}
}
_ => {
let value = try!(field_type
.value_from_json(json_value)
.map_err(|e| {
let value = try!(field_type.value_from_json(json_value).map_err(|e| {
DocParsingError::ValueError(field_name.clone(), e)
}));
doc.add(FieldValue::new(field, value));
@@ -259,7 +255,8 @@ impl fmt::Debug for Schema {
impl Serialize for Schema {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer
where
S: Serializer,
{
let mut seq = serializer.serialize_seq(Some(self.0.fields.len()))?;
for e in &self.0.fields {
@@ -271,7 +268,8 @@ impl Serialize for Schema {
impl<'de> Deserialize<'de> for Schema {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de>
where
D: Deserializer<'de>,
{
struct SchemaVisitor;
@@ -283,7 +281,8 @@ impl<'de> Deserialize<'de> for Schema {
}
fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
where A: SeqAccess<'de>
where
A: SeqAccess<'de>,
{
let mut schema = SchemaBuilder {
fields: Vec::with_capacity(seq.size_hint().unwrap_or(0)),
@@ -430,12 +429,14 @@ mod tests {
}
{
let doc = schema
.parse_document(r#"{
.parse_document(
r#"{
"title": "my title",
"author": "fulmicoton",
"count": 4,
"popularity": 10
}"#)
}"#,
)
.unwrap();
assert_eq!(doc.get_first(title_field).unwrap().text(), "my title");
assert_eq!(doc.get_first(author_field).unwrap().text(), "fulmicoton");
@@ -443,13 +444,15 @@ mod tests {
assert_eq!(doc.get_first(popularity_field).unwrap().i64_value(), 10);
}
{
let json_err = schema.parse_document(r#"{
let json_err = schema.parse_document(
r#"{
"title": "my title",
"author": "fulmicoton",
"count": 4,
"popularity": 10,
"jambon": "bayonne"
}"#);
}"#,
);
match json_err {
Err(DocParsingError::NoSuchFieldInSchema(field_name)) => {
assert_eq!(field_name, "jambon");
@@ -460,13 +463,15 @@ mod tests {
}
}
{
let json_err = schema.parse_document(r#"{
let json_err = schema.parse_document(
r#"{
"title": "my title",
"author": "fulmicoton",
"count": "5",
"popularity": "10",
"jambon": "bayonne"
}"#);
}"#,
);
match json_err {
Err(DocParsingError::ValueError(_, ValueParsingError::TypeError(_))) => {
assert!(true);
@@ -477,12 +482,14 @@ mod tests {
}
}
{
let json_err = schema.parse_document(r#"{
let json_err = schema.parse_document(
r#"{
"title": "my title",
"author": "fulmicoton",
"count": -5,
"popularity": 10
}"#);
}"#,
);
match json_err {
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
assert!(true);
@@ -493,12 +500,14 @@ mod tests {
}
}
{
let json_err = schema.parse_document(r#"{
let json_err = schema.parse_document(
r#"{
"title": "my title",
"author": "fulmicoton",
"count": 9223372036854775808,
"popularity": 10
}"#);
}"#,
);
match json_err {
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
panic!("expected 9223372036854775808 to fit into u64, but it didn't");
@@ -509,12 +518,14 @@ mod tests {
}
}
{
let json_err = schema.parse_document(r#"{
let json_err = schema.parse_document(
r#"{
"title": "my title",
"author": "fulmicoton",
"count": 50,
"popularity": 9223372036854775808
}"#);
}"#,
);
match json_err {
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
assert!(true);
@@ -525,11 +536,13 @@ mod tests {
}
}
{
let json_err = schema.parse_document(r#"{
let json_err = schema.parse_document(
r#"{
"title": "my title",
"author": "fulmicoton",
"count": 50,
}"#);
}"#,
);
match json_err {
Err(NotJSON(_)) => {
assert!(true);

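The schema tests above all funnel through Schema::parse_document, which looks each JSON key up in the schema and type-checks the value: unknown keys surface as DocParsingError::NoSuchFieldInSchema, ill-typed or overflowing values as DocParsingError::ValueError. A minimal sketch of the happy path, using only calls visible in the hunks above (the field name is illustrative; TEXT | STORED is the schema module's shorthand for a stored, tokenized text field):

use tantivy::schema::{SchemaBuilder, STORED, TEXT};

fn main() {
    let mut schema_builder = SchemaBuilder::default();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();
    // Unknown keys would yield NoSuchFieldInSchema; ill-typed values, ValueError.
    let doc = schema
        .parse_document(r#"{"title": "my title"}"#)
        .unwrap();
    assert_eq!(doc.get_first(title).unwrap().text(), "my title");
}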
View File

@@ -13,7 +13,9 @@ const INT_TERM_LEN: usize = 4 + 8;
///
/// It actually wraps a `Vec<u8>`.
#[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
pub struct Term<B = Vec<u8>>(B) where B: AsRef<[u8]>;
pub struct Term<B = Vec<u8>>(B)
where
B: AsRef<[u8]>;
impl Term {
/// Builds a term given a field, and a u64-value
@@ -109,7 +111,8 @@ impl Term {
}
impl<B> Term<B>
where B: AsRef<[u8]>
where
B: AsRef<[u8]>,
{
/// Wraps a source of data
pub fn wrap(data: B) -> Term<B> {
@@ -166,7 +169,8 @@ impl<B> Term<B>
}
impl<B> AsRef<[u8]> for Term<B>
where B: AsRef<[u8]>
where
B: AsRef<[u8]>,
{
fn as_ref(&self) -> &[u8] {
self.0.as_ref()

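As the INT_TERM_LEN constant above suggests, a term over an integer field is just 4 bytes of field id followed by the 8 bytes of the value, wrapped in the Vec<u8> behind Term. A sketch, assuming the u64 constructor described by the "Builds a term given a field, and a u64-value" doc comment is named Term::from_field_u64:

use tantivy::schema::{Field, Term};

fn main() {
    let field = Field(1); // hypothetical field id
    let term = Term::from_field_u64(field, 42u64); // constructor name assumed
    // 4 bytes of field id + 8 bytes of value, per INT_TERM_LEN = 4 + 8:
    assert_eq!(term.as_ref().len(), 4 + 8);
}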
View File

@@ -2,7 +2,7 @@ use std::ops::BitOr;
/// Define how a text field should be handled by tantivy.
#[derive(Clone,Debug,PartialEq,Eq, Serialize, Deserialize)]
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct TextOptions {
indexing: TextIndexingOptions,
stored: bool,
@@ -45,10 +45,10 @@ impl Default for TextOptions {
/// Describe how a field should be indexed
#[derive(Clone,Copy,Debug,PartialEq,PartialOrd,Eq,Hash, Serialize, Deserialize)]
#[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Hash, Serialize, Deserialize)]
pub enum TextIndexingOptions {
/// Unindexed fields will not generate any postings. They will not be searchable either.
#[serde(rename="unindexed")]
#[serde(rename = "unindexed")]
Unindexed,
/// Untokenized means that the field text will not be split into tokens before being indexed.
/// A field with the value "Hello world" will have the document subscribe to one single
@@ -56,23 +56,23 @@ pub enum TextIndexingOptions {
///
/// It will **not** be searchable if the user enters "hello", for instance.
/// This can be useful for tags or ids.
#[serde(rename="untokenized")]
#[serde(rename = "untokenized")]
Untokenized,
/// TokenizedNoFreq will tokenize the field value, and append the document's doc id
/// to the posting lists associated with all of the tokens.
/// The frequency of appearance of the term in the document, however, will be lost.
/// The term frequency used in the TfIdf formula will always be 1.
#[serde(rename="tokenize")]
#[serde(rename = "tokenize")]
TokenizedNoFreq,
/// TokenizedWithFreq will tokenize the field value, and encode
/// both the docid and the term frequency in the posting lists associated with all
#[serde(rename="freq")]
#[serde(rename = "freq")]
TokenizedWithFreq,
/// Like TokenizedWithFreq, but also encodes the positions of the
/// terms in a separate file. This option is required for phrase queries.
/// Don't use this if you are certain you won't need it; the term positions file
/// can be very big.
#[serde(rename="position")]
#[serde(rename = "position")]
TokenizedWithFreqAndPosition,
}
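The serde attribute changes above only touch spacing; the JSON spelling of each variant ("unindexed", "untokenized", "tokenize", "freq", "position") is unchanged. Wiring one of these variants into a field looks roughly like this; set_indexing_options is an assumed setter from this era of the TextOptions API (only set_stored appears in these hunks):

use tantivy::schema::{SchemaBuilder, TextIndexingOptions, TextOptions};

fn main() {
    let mut schema_builder = SchemaBuilder::default();
    let opts = TextOptions::default()
        .set_stored() // keep the raw text in the doc store
        // positions are required for phrase queries, at the cost of a larger index:
        .set_indexing_options(TextIndexingOptions::TokenizedWithFreqAndPosition); // setter assumed
    let _body = schema_builder.add_text_field("body", opts);
    let _schema = schema_builder.build();
}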

View File

@@ -16,7 +16,8 @@ pub enum Value {
impl Serialize for Value {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer
where
S: Serializer,
{
match *self {
Value::Str(ref v) => serializer.serialize_str(v),
@@ -28,7 +29,8 @@ impl Serialize for Value {
impl<'de> Deserialize<'de> for Value {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de>
where
D: Deserializer<'de>,
{
struct ValueVisitor;
@@ -162,9 +164,13 @@ mod binary_serialize {
Ok(Value::I64(value))
}
_ => {
Err(io::Error::new(io::ErrorKind::InvalidData,
format!("No field type is associated with code {:?}",
type_code)))
Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"No field type is associated with code {:?}",
type_code
),
))
}
}
}

View File

@@ -54,17 +54,19 @@ mod tests {
fn write_lorem_ipsum_store(writer: WritePtr, num_docs: usize) -> Schema {
let mut schema_builder = SchemaBuilder::default();
let field_body = schema_builder.add_text_field("body", TextOptions::default().set_stored());
let field_title = schema_builder
.add_text_field("title", TextOptions::default().set_stored());
let field_title =
schema_builder.add_text_field("title", TextOptions::default().set_stored());
let schema = schema_builder.build();
let lorem = String::from("Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
let lorem = String::from(
"Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
do eiusmod tempor incididunt ut labore et dolore magna aliqua. \
Ut enim ad minim veniam, quis nostrud exercitation ullamco \
laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure \
dolor in reprehenderit in voluptate velit esse cillum dolore eu \
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non \
proident, sunt in culpa qui officia deserunt mollit anim id est \
laborum.");
laborum.",
);
{
let mut store_writer = StoreWriter::new(writer);
for i in 0..num_docs {
@@ -96,8 +98,10 @@ mod tests {
let store_source = directory.open_read(path).unwrap();
let store = StoreReader::from_source(store_source);
for i in 0..1_000 {
assert_eq!(*store.get(i).unwrap().get_first(field_title).unwrap().text(),
format!("Doc {}", i));
assert_eq!(
*store.get(i).unwrap().get_first(field_title).unwrap().text(),
format!("Doc {}", i)
);
}
}
@@ -106,9 +110,9 @@ mod tests {
let mut directory = MmapDirectory::create_from_tempdir().unwrap();
let path = Path::new("store");
b.iter(|| {
write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000);
directory.delete(path).unwrap();
});
write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000);
directory.delete(path).unwrap();
});
}

View File

@@ -49,7 +49,7 @@ impl StoreReader {
let mut cursor = &total_buffer[block_offset..];
let block_length = u32::deserialize(&mut cursor).unwrap();
let block_array: &[u8] = &total_buffer[(block_offset + 4 as usize)..
(block_offset + 4 + block_length as usize)];
(block_offset + 4 + block_length as usize)];
let mut lz4_decoder = try!(lz4::Decoder::new(block_array));
*self.current_block_offset.borrow_mut() = usize::max_value();
try!(lz4_decoder.read_to_end(&mut current_block_mut).map(|_| ()));
@@ -94,5 +94,9 @@ fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId)
let offset = u64::deserialize(&mut serialized_offset_buf).unwrap();
let offset = offset as usize;
let max_doc = u32::deserialize(&mut serialized_offset_buf).unwrap();
(data.slice(0, offset), data.slice(offset, footer_offset), max_doc)
(
data.slice(0, offset),
data.slice(offset, footer_offset),
max_doc,
)
}
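split_source above implies the store's file layout: the last 12 bytes form a footer holding a u64 offset (where the block data ends and the offset index begins) and the u32 max_doc. A standalone sketch of that decode in plain std (the real code goes through BinarySerializable; little-endian byte order is assumed here):

use std::convert::TryInto;

// [ lz4 blocks | offset index | offset: u64 | max_doc: u32 ]
fn split(data: &[u8]) -> (&[u8], &[u8], u32) {
    let footer_offset = data.len() - (8 + 4); // trailing u64 + u32 footer
    let footer = &data[footer_offset..];
    let offset = u64::from_le_bytes(footer[0..8].try_into().unwrap()) as usize;
    let max_doc = u32::from_le_bytes(footer[8..12].try_into().unwrap());
    (&data[..offset], &data[offset..footer_offset], max_doc)
}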

View File

@@ -49,12 +49,15 @@ impl StoreWriter {
///
pub fn store<'a>(&mut self, field_values: &[&'a FieldValue]) -> io::Result<()> {
self.intermediary_buffer.clear();
try!((field_values.len() as u32).serialize(&mut self.intermediary_buffer));
try!((field_values.len() as u32).serialize(
&mut self.intermediary_buffer,
));
for field_value in field_values {
try!((*field_value).serialize(&mut self.intermediary_buffer));
}
(self.intermediary_buffer.len() as u32)
.serialize(&mut self.current_block)?;
(self.intermediary_buffer.len() as u32).serialize(
&mut self.current_block,
)?;
self.current_block.write_all(&self.intermediary_buffer[..])?;
self.doc += 1;
if self.current_block.len() > BLOCK_SIZE {
@@ -66,16 +69,22 @@ impl StoreWriter {
fn write_and_compress_block(&mut self) -> io::Result<()> {
self.intermediary_buffer.clear();
{
let mut encoder = try!(lz4::EncoderBuilder::new().build(&mut self.intermediary_buffer));
let mut encoder = try!(lz4::EncoderBuilder::new().build(
&mut self.intermediary_buffer,
));
try!(encoder.write_all(&self.current_block));
let (_, encoder_result) = encoder.finish();
try!(encoder_result);
}
(self.intermediary_buffer.len() as u32)
.serialize(&mut self.writer)?;
(self.intermediary_buffer.len() as u32).serialize(
&mut self.writer,
)?;
self.writer.write_all(&self.intermediary_buffer)?;
self.offset_index_writer
.insert(self.doc, &(self.writer.written_bytes() as u64))?;
self.offset_index_writer.insert(
self.doc,
&(self.writer.written_bytes() as u64),
)?;
self.current_block.clear();
Ok(())
}
@@ -90,8 +99,7 @@ impl StoreWriter {
try!(self.write_and_compress_block());
}
let header_offset: u64 = self.writer.written_bytes() as u64;
try!(self.offset_index_writer
.write(&mut self.writer));
try!(self.offset_index_writer.write(&mut self.writer));
try!(header_offset.serialize(&mut self.writer));
try!(self.doc.serialize(&mut self.writer));
self.writer.flush()

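Reformatting aside, the write path above keeps a simple two-level framing: store() length-prefixes each serialized document into current_block, and write_and_compress_block() lz4-compresses the block and length-prefixes the result into the writer, recording the doc-to-byte-offset mapping in the offset index. A framing-only sketch, with lz4 and the offset index elided and little-endian byte order assumed:

// Per-document record appended to the in-memory block:
//   [record_len: u32][num_field_values: u32][field values...]
// (serialized_doc already starts with the num_field_values prefix)
fn append_doc(block: &mut Vec<u8>, serialized_doc: &[u8]) {
    block.extend_from_slice(&(serialized_doc.len() as u32).to_le_bytes());
    block.extend_from_slice(serialized_doc);
}

// Per-block record appended to the file once the block exceeds BLOCK_SIZE:
//   [compressed_len: u32][lz4-compressed block...]
fn flush_block(writer: &mut Vec<u8>, compressed_block: &[u8]) {
    writer.extend_from_slice(&(compressed_block.len() as u32).to_le_bytes());
    writer.extend_from_slice(compressed_block);
}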
Some files were not shown because too many files have changed in this diff.