Merge branch 'imhotep-new-codec'

Conflicts:
	src/common/bitpacker.rs
	src/compression/pack/compression_pack_nosimd.rs
	src/indexer/log_merge_policy.rs
This commit is contained in:
Paul Masurel
2017-08-28 19:29:49 +09:00
98 changed files with 4220 additions and 2823 deletions

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.4.3"
version = "0.5.0-dev"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
build = "build.rs"
license = "MIT"

View File

@@ -91,9 +91,11 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
let mut old_man_doc = Document::default();
old_man_doc.add_text(title, "The Old Man and the Sea");
old_man_doc.add_text(body,
"He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish.");
old_man_doc.add_text(
body,
"He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish.",
);
// ... and add it to the `IndexWriter`.
index_writer.add_document(old_man_doc);

View File

@@ -38,10 +38,11 @@ impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
}
impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Right> {
fn set_segment(&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader)
-> Result<()> {
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
try!(self.left.set_segment(segment_local_id, segment));
try!(self.right.set_segment(segment_local_id, segment));
Ok(())

View File

@@ -45,11 +45,11 @@ mod tests {
#[bench]
fn build_collector(b: &mut Bencher) {
b.iter(|| {
let mut count_collector = CountCollector::default();
for doc in 0..1_000_000 {
count_collector.collect(doc, 1f32);
}
count_collector.count()
});
let mut count_collector = CountCollector::default();
for doc in 0..1_000_000 {
count_collector.collect(doc, 1f32);
}
count_collector.count()
});
}
}

View File

@@ -15,8 +15,9 @@ use SegmentLocalId;
/// Facet collector for i64/u64 fast field
pub struct FacetCollector<T>
where T: FastFieldReader,
T::ValueType: Eq + Hash
where
T: FastFieldReader,
T::ValueType: Eq + Hash,
{
counters: HashMap<T::ValueType, u64>,
field: Field,
@@ -25,8 +26,9 @@ pub struct FacetCollector<T>
impl<T> FacetCollector<T>
where T: FastFieldReader,
T::ValueType: Eq + Hash
where
T: FastFieldReader,
T::ValueType: Eq + Hash,
{
/// Creates a new facet collector for aggregating a given field.
pub fn new(field: Field) -> FacetCollector<T> {
@@ -40,8 +42,9 @@ impl<T> FacetCollector<T>
impl<T> Collector for FacetCollector<T>
where T: FastFieldReader,
T::ValueType: Eq + Hash
where
T: FastFieldReader,
T::ValueType: Eq + Hash,
{
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.ff_reader = Some(reader.get_fast_field_reader(self.field)?);
@@ -51,7 +54,9 @@ impl<T> Collector for FacetCollector<T>
fn collect(&mut self, doc: DocId, _: Score) {
let val = self.ff_reader
.as_ref()
.expect("collect() was called before set_segment. This should never happen.")
.expect(
"collect() was called before set_segment. This should never happen.",
)
.get(doc);
*(self.counters.entry(val).or_insert(0)) += 1;
}

View File

@@ -54,20 +54,22 @@ pub use self::chained_collector::chain;
pub trait Collector {
/// `set_segment` is called before beginning to enumerate
/// on this segment.
fn set_segment(&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader)
-> Result<()>;
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()>;
/// The query pushes the scored document to the collector via this method.
fn collect(&mut self, doc: DocId, score: Score);
}
impl<'a, C: Collector> Collector for &'a mut C {
fn set_segment(&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader)
-> Result<()> {
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
(*self).set_segment(segment_local_id, segment)
}
/// The query pushes the scored document to the collector via this method.
@@ -172,12 +174,12 @@ pub mod tests {
#[bench]
fn build_collector(b: &mut Bencher) {
b.iter(|| {
let mut count_collector = CountCollector::default();
let docs: Vec<u32> = (0..1_000_000).collect();
for doc in docs {
count_collector.collect(doc, 1f32);
}
count_collector.count()
});
let mut count_collector = CountCollector::default();
let docs: Vec<u32> = (0..1_000_000).collect();
for doc in docs {
count_collector.collect(doc, 1f32);
}
count_collector.count()
});
}
}

View File

@@ -23,10 +23,11 @@ impl<'a> MultiCollector<'a> {
impl<'a> Collector for MultiCollector<'a> {
fn set_segment(&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader)
-> Result<()> {
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
for collector in &mut self.collectors {
try!(collector.set_segment(segment_local_id, segment));
}
@@ -53,8 +54,8 @@ mod tests {
let mut top_collector = TopCollector::with_limit(2);
let mut count_collector = CountCollector::default();
{
let mut collectors = MultiCollector::from(vec![&mut top_collector,
&mut count_collector]);
let mut collectors =
MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
collectors.collect(1, 0.2);
collectors.collect(2, 0.1);
collectors.collect(3, 0.5);

View File

@@ -24,10 +24,9 @@ impl PartialOrd for GlobalScoredDoc {
impl Ord for GlobalScoredDoc {
#[inline]
fn cmp(&self, other: &GlobalScoredDoc) -> Ordering {
other
.score
.partial_cmp(&self.score)
.unwrap_or_else(|| other.doc_address.cmp(&self.doc_address))
other.score.partial_cmp(&self.score).unwrap_or_else(|| {
other.doc_address.cmp(&self.doc_address)
})
}
}
@@ -87,7 +86,9 @@ impl TopCollector {
scored_docs.sort();
scored_docs
.into_iter()
.map(|GlobalScoredDoc { score, doc_address }| (score, doc_address))
.map(|GlobalScoredDoc { score, doc_address }| {
(score, doc_address)
})
.collect()
}
@@ -108,14 +109,13 @@ impl Collector for TopCollector {
fn collect(&mut self, doc: DocId, score: Score) {
if self.at_capacity() {
// It's ok to unwrap as long as a limit of 0 is forbidden.
let limit_doc: GlobalScoredDoc =
*self.heap
.peek()
.expect("Top collector with size 0 is forbidden");
let limit_doc: GlobalScoredDoc = *self.heap.peek().expect(
"Top collector with size 0 is forbidden",
);
if limit_doc.score < score {
let mut mut_head = self.heap
.peek_mut()
.expect("Top collector with size 0 is forbidden");
let mut mut_head = self.heap.peek_mut().expect(
"Top collector with size 0 is forbidden",
);
mut_head.score = score;
mut_head.doc_address = DocAddress(self.segment_id, doc);
}

View File

@@ -15,7 +15,7 @@ use std::ops::Deref;
/// reasons, we want to ensure that a value spawns over at most 8 bytes
/// of aligns bytes.
///
/// Spawning over 9 bytes is possible for instance, if we do
/// Spanning over 9 bytes is possible for instance, if we do
/// bitpacking with an amplitude of 63 bits.
/// In this case, the second int will start on bit
/// 63 (which belongs to byte 7) and ends at byte 15;
@@ -88,7 +88,8 @@ impl BitPacker {
pub struct BitUnpacker<Data>
where Data: Deref<Target = [u8]>
where
Data: Deref<Target = [u8]>,
{
num_bits: usize,
mask: u64,
@@ -96,7 +97,8 @@ pub struct BitUnpacker<Data>
}
impl<Data> BitUnpacker<Data>
where Data: Deref<Target = [u8]>
where
Data: Deref<Target = [u8]>,
{
pub fn new(data: Data, num_bits: usize) -> BitUnpacker<Data> {
let mask: u64 = if num_bits == 64 {
@@ -121,33 +123,13 @@ impl<Data> BitUnpacker<Data>
let addr_in_bits = idx * num_bits;
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
if cfg!(feature = "simdcompression") {
// for simdcompression,
// the bitpacker is only used for fastfields,
// and we expect them to be always padded.
debug_assert!(
addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes."
);
let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
(val_shifted & mask)
}
else {
let val_unshifted_unmasked: u64;
if addr + 8 <= data.len() {
val_unshifted_unmasked = unsafe { *(data[addr..].as_ptr() as *const u64) };
}
else {
let mut buffer = [0u8; 8];
for i in addr..data.len() {
buffer[i - addr] += data[i];
}
val_unshifted_unmasked = unsafe { *(buffer[..].as_ptr() as *const u64) };
}
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
(val_shifted & mask)
}
debug_assert!(
addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes."
);
let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
(val_shifted & mask)
}
pub fn get_range(&self, start: u32, output: &mut [u64]) {

View File

@@ -0,0 +1,191 @@
use std::io::Write;
use common::CountingWriter;
use std::collections::HashMap;
use schema::Field;
use common::VInt;
use directory::WritePtr;
use std::io;
use directory::ReadOnlySource;
use common::BinarySerializable;
/// A `CompositeWrite` is used to write a `CompositeFile`.
pub struct CompositeWrite<W = WritePtr> {
    // Wraps the underlying writer so we can track how many bytes
    // have been written so far; that running count is what gives
    // us each field's start offset.
    write: CountingWriter<W>,
    // Maps each field to the byte offset at which its payload
    // starts within the composite file.
    offsets: HashMap<Field, usize>,
}

impl<W: Write> CompositeWrite<W> {
    /// Create a new composite-file writer that writes
    /// into the given `Write`.
    pub fn wrap(w: W) -> CompositeWrite<W> {
        CompositeWrite {
            write: CountingWriter::wrap(w),
            offsets: HashMap::new(),
        }
    }

    /// Start writing a new field.
    ///
    /// Records the current write position as the start of the
    /// field's payload and returns the writer the payload should
    /// be pushed into.
    ///
    /// Panics (via `assert!`) if the same field is started twice.
    pub fn for_field(&mut self, field: Field) -> &mut CountingWriter<W> {
        let offset = self.write.written_bytes();
        assert!(!self.offsets.contains_key(&field));
        self.offsets.insert(field, offset);
        &mut self.write
    }

    /// Close the composite file.
    ///
    /// An index of the different field offsets
    /// will be written as a footer:
    /// - a `VInt` holding the number of fields,
    /// - for each field, in increasing offset order, the
    ///   delta-encoded start offset followed by the field itself,
    /// - finally the footer length, serialized as a `u32`,
    ///   so a reader can locate the footer from the end of the file.
    pub fn close(mut self) -> io::Result<()> {
        let footer_offset = self.write.written_bytes();
        VInt(self.offsets.len() as u64).serialize(&mut self.write)?;

        let mut offset_fields: Vec<_> = self.offsets
            .iter()
            .map(|(field, offset)| (offset, field))
            .collect();
        // Sort by offset so the offsets can be delta-encoded below.
        offset_fields.sort();

        let mut prev_offset = 0;
        for (offset, field) in offset_fields {
            // Delta from the previous field's offset keeps the VInts small.
            VInt((offset - prev_offset) as u64).serialize(
                &mut self.write,
            )?;
            field.serialize(&mut self.write)?;
            prev_offset = *offset;
        }

        let footer_len = (self.write.written_bytes() - footer_offset) as u32;
        footer_len.serialize(&mut self.write)?;
        self.write.flush()?;
        Ok(())
    }
}
/// A composite file is an abstraction to store a
/// file partitioned by field.
///
/// The file needs to be written field by field.
/// A footer describes the start and stop offsets
/// for each field.
#[derive(Clone)]
pub struct CompositeFile {
    // Body of the composite file: all field payloads concatenated.
    // The footer has been sliced off (see `open`).
    data: ReadOnlySource,
    // Maps each field to its `(start, stop)` byte range within `data`.
    offsets_index: HashMap<Field, (usize, usize)>,
}

impl CompositeFile {
    /// Opens a composite file stored in a given
    /// `ReadOnlySource`.
    ///
    /// Parses the footer written by `CompositeWrite::close` to
    /// rebuild the per-field `(start, stop)` offset index.
    ///
    /// NOTE(review): assumes `data` holds a well-formed composite
    /// file of at least 4 bytes (the trailing footer-length `u32`);
    /// shorter or corrupted input would panic on the slice
    /// arithmetic rather than return an `Err` — confirm callers
    /// only pass files produced by `CompositeWrite`.
    pub fn open(data: ReadOnlySource) -> io::Result<CompositeFile> {
        let end = data.len();
        // The last 4 bytes encode the footer length.
        let footer_len_data = data.slice_from(end - 4);
        let footer_len = u32::deserialize(&mut footer_len_data.as_slice())? as usize;
        let footer_start = end - 4 - footer_len;
        let footer_data = data.slice(footer_start, footer_start + footer_len);
        let mut footer_buffer = footer_data.as_slice();
        let num_fields = VInt::deserialize(&mut footer_buffer)?.0 as usize;

        let mut fields = vec![];
        let mut offsets = vec![];
        let mut field_index = HashMap::new();

        // Field offsets are delta-encoded in the footer: accumulate
        // the deltas to recover the absolute start offsets.
        let mut offset = 0;
        for _ in 0..num_fields {
            offset += VInt::deserialize(&mut footer_buffer)?.0 as usize;
            let field = Field::deserialize(&mut footer_buffer)?;
            offsets.push(offset);
            fields.push(field);
        }
        // Sentinel: the last field's payload ends where the footer starts.
        offsets.push(footer_start);
        for i in 0..num_fields {
            let field = fields[i];
            let start_offset = offsets[i];
            let end_offset = offsets[i + 1];
            field_index.insert(field, (start_offset, end_offset));
        }

        Ok(CompositeFile {
            // Keep only the payload area; the footer is no longer needed.
            data: data.slice_to(footer_start),
            offsets_index: field_index,
        })
    }

    /// Returns a composite file that stores
    /// no fields.
    pub fn empty() -> CompositeFile {
        CompositeFile {
            offsets_index: HashMap::new(),
            data: ReadOnlySource::empty(),
        }
    }

    /// Returns the `ReadOnlySource` associated
    /// to a given `Field` and stored in a `CompositeFile`,
    /// or `None` if the field is not present.
    pub fn open_read(&self, field: Field) -> Option<ReadOnlySource> {
        self.offsets_index.get(&field).map(|&(from, to)| {
            self.data.slice(from, to)
        })
    }
}
#[cfg(test)]
mod test {
    use std::io::Write;
    use super::{CompositeWrite, CompositeFile};
    use directory::{RAMDirectory, Directory};
    use schema::Field;
    use common::VInt;
    use common::BinarySerializable;
    use std::path::Path;

    /// Round-trip test: write two fields (0 and 4) through
    /// `CompositeWrite` into a RAM directory, reopen the file with
    /// `CompositeFile::open`, and check that each field's payload
    /// is recovered exactly and fully consumed.
    #[test]
    fn test_composite_file() {
        let path = Path::new("test_path");
        let mut directory = RAMDirectory::create();
        {
            // Write phase.
            let w = directory.open_write(path).unwrap();
            let mut composite_write = CompositeWrite::wrap(w);
            {
                let mut write_0 = composite_write.for_field(Field(0u32));
                VInt(32431123u64).serialize(&mut write_0).unwrap();
                write_0.flush().unwrap();
            }
            {
                let mut write_4 = composite_write.for_field(Field(4u32));
                VInt(2).serialize(&mut write_4).unwrap();
                write_4.flush().unwrap();
            }
            composite_write.close().unwrap();
        }
        {
            // Read phase.
            let r = directory.open_read(path).unwrap();
            let composite_file = CompositeFile::open(r).unwrap();
            {
                let file0 = composite_file.open_read(Field(0u32)).unwrap();
                let mut file0_buf = file0.as_slice();
                let payload_0 = VInt::deserialize(&mut file0_buf).unwrap().0;
                // The slice contains exactly the payload, nothing more.
                assert_eq!(file0_buf.len(), 0);
                assert_eq!(payload_0, 32431123u64);
            }
            {
                let file4 = composite_file.open_read(Field(4u32)).unwrap();
                let mut file4_buf = file4.as_slice();
                let payload_4 = VInt::deserialize(&mut file4_buf).unwrap().0;
                assert_eq!(file4_buf.len(), 0);
                assert_eq!(payload_4, 2u64);
            }
        }
    }
}

View File

@@ -2,7 +2,7 @@ use std::io::Write;
use std::io;
pub struct CountingWriter<W: Write> {
pub struct CountingWriter<W> {
underlying: W,
written_bytes: usize,
}

View File

@@ -1,9 +1,13 @@
mod serialize;
mod timer;
mod vint;
mod counting_writer;
mod composite_file;
pub mod bitpacker;
pub(crate) use self::composite_file::{CompositeWrite, CompositeFile};
pub use self::serialize::BinarySerializable;
pub use self::timer::Timing;
pub use self::timer::TimerTree;

View File

@@ -101,9 +101,9 @@ impl BinarySerializable for String {
fn deserialize<R: Read>(reader: &mut R) -> io::Result<String> {
let string_length = VInt::deserialize(reader)?.val() as usize;
let mut result = String::with_capacity(string_length);
reader
.take(string_length as u64)
.read_to_string(&mut result)?;
reader.take(string_length as u64).read_to_string(
&mut result,
)?;
Ok(result)
}
}

View File

@@ -24,16 +24,14 @@ impl<'a> OpenTimer<'a> {
impl<'a> Drop for OpenTimer<'a> {
fn drop(&mut self) {
self.timer_tree
.timings
.push(Timing {
name: self.name,
duration: self.start
.to(PreciseTime::now())
.num_microseconds()
.unwrap(),
depth: self.depth,
});
self.timer_tree.timings.push(Timing {
name: self.name,
duration: self.start
.to(PreciseTime::now())
.num_microseconds()
.unwrap(),
depth: self.depth,
});
}
}

View File

@@ -47,7 +47,12 @@ impl BinarySerializable for VInt {
}
shift += 7;
}
_ => return Err(io::Error::new(io::ErrorKind::InvalidData, "Reach end of buffer")),
_ => {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"Reach end of buffer",
))
}
}
}
Ok(VInt(result))

View File

@@ -1,170 +0,0 @@
use super::{BlockEncoder, BlockDecoder};
use super::NUM_DOCS_PER_BLOCK;
use compression::{VIntEncoder, VIntDecoder};
/// Compresses a sequence of `u32`s of arbitrary length by
/// bit-packing full blocks of `NUM_DOCS_PER_BLOCK` values and
/// falling back to variable-byte encoding for the trailing,
/// incomplete block.
///
/// (Part of the file removed by this commit.)
pub struct CompositeEncoder {
    // Reused to compress each full block.
    block_encoder: BlockEncoder,
    // Accumulates the compressed bytes across all blocks.
    output: Vec<u8>,
}

impl CompositeEncoder {
    /// Creates a new encoder with a preallocated output buffer.
    pub fn new() -> CompositeEncoder {
        CompositeEncoder {
            block_encoder: BlockEncoder::new(),
            output: Vec::with_capacity(500_000),
        }
    }

    /// Compresses a sorted slice of `u32`s, delta-encoding each
    /// block against the last value of the previous block.
    ///
    /// Returns a slice borrowing the encoder's internal buffer;
    /// it is invalidated by the next `compress_*` call.
    pub fn compress_sorted(&mut self, vals: &[u32]) -> &[u8] {
        self.output.clear();
        let num_blocks = vals.len() / NUM_DOCS_PER_BLOCK;
        // `offset` carries the previous block's last value, used as
        // the delta-encoding base for the next block.
        let mut offset = 0u32;
        for i in 0..num_blocks {
            let vals_slice = &vals[i * NUM_DOCS_PER_BLOCK..(i + 1) * NUM_DOCS_PER_BLOCK];
            let block_compressed = self.block_encoder.compress_block_sorted(vals_slice, offset);
            offset = vals_slice[NUM_DOCS_PER_BLOCK - 1];
            self.output.extend_from_slice(block_compressed);
        }
        // Remaining (< one block) values use variable-byte encoding.
        let vint_compressed =
            self.block_encoder
                .compress_vint_sorted(&vals[num_blocks * NUM_DOCS_PER_BLOCK..], offset);
        self.output.extend_from_slice(vint_compressed);
        &self.output
    }

    /// Compresses an unsorted slice of `u32`s (no delta-encoding).
    ///
    /// Returns a slice borrowing the encoder's internal buffer;
    /// it is invalidated by the next `compress_*` call.
    pub fn compress_unsorted(&mut self, vals: &[u32]) -> &[u8] {
        self.output.clear();
        let num_blocks = vals.len() / NUM_DOCS_PER_BLOCK;
        for i in 0..num_blocks {
            let vals_slice = &vals[i * NUM_DOCS_PER_BLOCK..(i + 1) * NUM_DOCS_PER_BLOCK];
            let block_compressed = self.block_encoder.compress_block_unsorted(vals_slice);
            self.output.extend_from_slice(block_compressed);
        }
        // Remaining (< one block) values use variable-byte encoding.
        let vint_compressed = self.block_encoder
            .compress_vint_unsorted(&vals[num_blocks * NUM_DOCS_PER_BLOCK..]);
        self.output.extend_from_slice(vint_compressed);
        &self.output
    }
}
/// Decompresses sequences produced by `CompositeEncoder`:
/// full bit-packed blocks followed by a variable-byte-encoded tail.
///
/// (Part of the file removed by this commit.)
pub struct CompositeDecoder {
    // Reused to decompress each block.
    block_decoder: BlockDecoder,
    // Accumulates the decompressed values across all blocks.
    vals: Vec<u32>,
}

impl CompositeDecoder {
    /// Creates a new decoder with a preallocated output buffer.
    pub fn new() -> CompositeDecoder {
        CompositeDecoder {
            block_decoder: BlockDecoder::new(),
            vals: Vec::with_capacity(500_000),
        }
    }

    /// Decompresses `uncompressed_len` sorted values from
    /// `compressed_data` (the counterpart of `compress_sorted`).
    ///
    /// Returns a slice borrowing the decoder's internal buffer;
    /// it is invalidated by the next `uncompress_*` call.
    pub fn uncompress_sorted(&mut self,
                             mut compressed_data: &[u8],
                             uncompressed_len: usize)
                             -> &[u32] {
        // Grow the buffer up-front so the loop below never reallocates.
        if uncompressed_len > self.vals.capacity() {
            let extra_capacity = uncompressed_len - self.vals.capacity();
            self.vals.reserve(extra_capacity);
        }
        // `offset` carries the previous block's last value, the
        // delta-decoding base for the next block.
        let mut offset = 0u32;
        self.vals.clear();
        let num_blocks = uncompressed_len / NUM_DOCS_PER_BLOCK;
        for _ in 0..num_blocks {
            compressed_data = self.block_decoder
                .uncompress_block_sorted(compressed_data, offset);
            offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1);
            self.vals
                .extend_from_slice(self.block_decoder.output_array());
        }
        // Remaining (< one block) values were vint-encoded.
        self.block_decoder
            .uncompress_vint_sorted(compressed_data,
                                    offset,
                                    uncompressed_len % NUM_DOCS_PER_BLOCK);
        self.vals
            .extend_from_slice(self.block_decoder.output_array());
        &self.vals
    }

    /// Decompresses `uncompressed_len` unsorted values from
    /// `compressed_data` (the counterpart of `compress_unsorted`).
    ///
    /// Returns a slice borrowing the decoder's internal buffer;
    /// it is invalidated by the next `uncompress_*` call.
    pub fn uncompress_unsorted(&mut self,
                               mut compressed_data: &[u8],
                               uncompressed_len: usize)
                               -> &[u32] {
        self.vals.clear();
        let num_blocks = uncompressed_len / NUM_DOCS_PER_BLOCK;
        for _ in 0..num_blocks {
            compressed_data = self.block_decoder
                .uncompress_block_unsorted(compressed_data);
            self.vals
                .extend_from_slice(self.block_decoder.output_array());
        }
        // Remaining (< one block) values were vint-encoded.
        self.block_decoder
            .uncompress_vint_unsorted(compressed_data, uncompressed_len % NUM_DOCS_PER_BLOCK);
        self.vals
            .extend_from_slice(self.block_decoder.output_array());
        &self.vals
    }
}

// Consumes the decoder and hands over its output buffer without copying.
impl Into<Vec<u32>> for CompositeDecoder {
    fn into(self) -> Vec<u32> {
        self.vals
    }
}
#[cfg(test)]
pub mod tests {
    use test::Bencher;
    use super::*;
    use tests;

    /// Round-trip an unsorted array through the composite
    /// encoder/decoder and check every value survives, with a
    /// loose upper bound on the compressed size.
    #[test]
    fn test_composite_unsorted() {
        let data = tests::generate_array(10_000, 0.1);
        let mut encoder = CompositeEncoder::new();
        let compressed = encoder.compress_unsorted(&data);
        assert!(compressed.len() <= 19_794);
        let mut decoder = CompositeDecoder::new();
        let result = decoder.uncompress_unsorted(&compressed, data.len());
        for i in 0..data.len() {
            assert_eq!(data[i], result[i]);
        }
    }

    /// Same round-trip as above for the sorted (delta-encoded)
    /// code path; sorted data compresses tighter.
    #[test]
    fn test_composite_sorted() {
        let data = tests::generate_array(10_000, 0.1);
        let mut encoder = CompositeEncoder::new();
        let compressed = encoder.compress_sorted(&data);
        assert!(compressed.len() <= 7_826);
        let mut decoder = CompositeDecoder::new();
        let result = decoder.uncompress_sorted(&compressed, data.len());
        for i in 0..data.len() {
            assert_eq!(data[i], result[i]);
        }
    }

    // Sized to cover many full blocks plus a vint-encoded tail.
    const BENCH_NUM_INTS: usize = 99_968;

    /// Benchmarks sorted compression throughput.
    #[bench]
    fn bench_compress(b: &mut Bencher) {
        let mut encoder = CompositeEncoder::new();
        let data = tests::generate_array(BENCH_NUM_INTS, 0.1);
        b.iter(|| { encoder.compress_sorted(&data); });
    }

    /// Benchmarks sorted decompression throughput.
    #[bench]
    fn bench_uncompress(b: &mut Bencher) {
        let mut encoder = CompositeEncoder::new();
        let data = tests::generate_array(BENCH_NUM_INTS, 0.1);
        let compressed = encoder.compress_sorted(&data);
        let mut decoder = CompositeDecoder::new();
        b.iter(|| { decoder.uncompress_sorted(compressed, BENCH_NUM_INTS); });
    }
}

View File

@@ -1,52 +1,88 @@
#![allow(dead_code)]
mod composite;
pub use self::composite::{CompositeEncoder, CompositeDecoder};
mod stream;
pub use self::stream::CompressedIntStream;
#[cfg(not(feature="simdcompression"))]
#[cfg(not(feature = "simdcompression"))]
mod pack {
mod compression_pack_nosimd;
pub use self::compression_pack_nosimd::*;
pub use self::compression_pack_nosimd::{BlockEncoder, BlockDecoder};
}
#[cfg(feature="simdcompression")]
#[cfg(feature = "simdcompression")]
mod pack {
mod compression_pack_simd;
pub use self::compression_pack_simd::*;
pub use self::compression_pack_simd::{BlockEncoder, BlockDecoder};
}
pub use self::pack::{BlockEncoder, BlockDecoder};
#[cfg( any(not(feature="simdcompression"), target_env="msvc") )]
#[cfg(any(not(feature = "simdcompression"), target_env = "msvc"))]
mod vint {
mod compression_vint_nosimd;
pub use self::compression_vint_nosimd::*;
pub(crate) use self::compression_vint_nosimd::*;
}
#[cfg( all(feature="simdcompression", not(target_env="msvc")) )]
#[cfg(all(feature = "simdcompression", not(target_env = "msvc")))]
mod vint {
mod compression_vint_simd;
pub use self::compression_vint_simd::*;
pub(crate) use self::compression_vint_simd::*;
}
/// Returns the size in bytes of a compressed block, given num_bits.
pub fn compressed_block_size(num_bits: u8) -> usize {
1 + (num_bits as usize) * 16
}
pub trait VIntEncoder {
/// Compresses an array of `u32` integers,
/// using [delta-encoding](https://en.wikipedia.org/wiki/Delta_encoding)
/// and variable bytes encoding.
///
/// The method takes an array of ints to compress, and returns
/// a `&[u8]` representing the compressed data.
///
/// The method also takes an offset to give the value of the
/// hypothetical previous element in the delta-encoding.
fn compress_vint_sorted(&mut self, input: &[u32], offset: u32) -> &[u8];
/// Compresses an array of `u32` integers,
/// using variable bytes encoding.
///
/// The method takes an array of ints to compress, and returns
/// a `&[u8]` representing the compressed data.
fn compress_vint_unsorted(&mut self, input: &[u32]) -> &[u8];
}
pub trait VIntDecoder {
fn uncompress_vint_sorted<'a>(&mut self,
compressed_data: &'a [u8],
offset: u32,
num_els: usize)
-> &'a [u8];
fn uncompress_vint_unsorted<'a>(&mut self,
compressed_data: &'a [u8],
num_els: usize)
-> &'a [u8];
/// Uncompress an array of `u32` integers,
/// that were compressed using [delta-encoding](https://en.wikipedia.org/wiki/Delta_encoding)
/// and variable bytes encoding.
///
/// The method takes a number of int to decompress, and returns
/// the amount of bytes that were read to decompress them.
///
/// The method also takes an offset to give the value of the
/// hypothetical previous element in the delta-encoding.
///
/// For instance, if delta encoded are `1, 3, 9`, and the
/// `offset` is 5, then the output will be:
/// `5 + 1 = 6, 6 + 3= 9, 9 + 9 = 18`
fn uncompress_vint_sorted<'a>(
&mut self,
compressed_data: &'a [u8],
offset: u32,
num_els: usize,
) -> usize;
/// Uncompress an array of `u32s`, compressed using variable
/// byte encoding.
///
/// The method takes a number of int to decompress, and returns
/// the amount of bytes that were read to decompress them.
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize;
}
impl VIntEncoder for BlockEncoder {
@@ -60,26 +96,24 @@ impl VIntEncoder for BlockEncoder {
}
impl VIntDecoder for BlockDecoder {
fn uncompress_vint_sorted<'a>(&mut self,
compressed_data: &'a [u8],
offset: u32,
num_els: usize)
-> &'a [u8] {
fn uncompress_vint_sorted<'a>(
&mut self,
compressed_data: &'a [u8],
offset: u32,
num_els: usize,
) -> usize {
self.output_len = num_els;
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
}
fn uncompress_vint_unsorted<'a>(&mut self,
compressed_data: &'a [u8],
num_els: usize)
-> &'a [u8] {
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize {
self.output_len = num_els;
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
}
}
pub const NUM_DOCS_PER_BLOCK: usize = 128; //< should be a power of 2 to let the compiler optimize.
pub const COMPRESSION_BLOCK_SIZE: usize = 128;
#[cfg(test)]
pub mod tests {
@@ -95,8 +129,8 @@ pub mod tests {
let compressed_data = encoder.compress_block_sorted(&vals, 0);
let mut decoder = BlockDecoder::new();
{
let remaining_data = decoder.uncompress_block_sorted(compressed_data, 0);
assert_eq!(remaining_data.len(), 0);
let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 0);
assert_eq!(consumed_num_bytes, compressed_data.len());
}
for i in 0..128 {
assert_eq!(vals[i], decoder.output(i));
@@ -110,8 +144,8 @@ pub mod tests {
let compressed_data = encoder.compress_block_sorted(&vals, 10);
let mut decoder = BlockDecoder::new();
{
let remaining_data = decoder.uncompress_block_sorted(compressed_data, 10);
assert_eq!(remaining_data.len(), 0);
let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 10);
assert_eq!(consumed_num_bytes, compressed_data.len());
}
for i in 0..128 {
assert_eq!(vals[i], decoder.output(i));
@@ -129,9 +163,9 @@ pub mod tests {
compressed.push(173u8);
let mut decoder = BlockDecoder::new();
{
let remaining_data = decoder.uncompress_block_sorted(&compressed, 10);
assert_eq!(remaining_data.len(), 1);
assert_eq!(remaining_data[0], 173u8);
let consumed_num_bytes = decoder.uncompress_block_sorted(&compressed, 10);
assert_eq!(consumed_num_bytes, compressed.len() - 1);
assert_eq!(compressed[consumed_num_bytes], 173u8);
}
for i in 0..n {
assert_eq!(vals[i], decoder.output(i));
@@ -149,9 +183,9 @@ pub mod tests {
compressed.push(173u8);
let mut decoder = BlockDecoder::new();
{
let remaining_data = decoder.uncompress_block_unsorted(&compressed);
assert_eq!(remaining_data.len(), 1);
assert_eq!(remaining_data[0], 173u8);
let consumed_num_bytes = decoder.uncompress_block_unsorted(&compressed);
assert_eq!(consumed_num_bytes + 1, compressed.len());
assert_eq!(compressed[consumed_num_bytes], 173u8);
}
for i in 0..n {
assert_eq!(vals[i], decoder.output(i));
@@ -169,9 +203,9 @@ pub mod tests {
let encoded_data = encoder.compress_vint_sorted(&input, *offset);
assert!(encoded_data.len() <= expected_length);
let mut decoder = BlockDecoder::new();
let remaining_data =
let consumed_num_bytes =
decoder.uncompress_vint_sorted(&encoded_data, *offset, input.len());
assert_eq!(0, remaining_data.len());
assert_eq!(consumed_num_bytes, encoded_data.len());
assert_eq!(input, decoder.output_array());
}
}
@@ -181,19 +215,32 @@ pub mod tests {
#[bench]
fn bench_compress(b: &mut Bencher) {
let mut encoder = BlockEncoder::new();
let data = tests::generate_array(NUM_DOCS_PER_BLOCK, 0.1);
let data = tests::generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
b.iter(|| { encoder.compress_block_sorted(&data, 0u32); });
}
#[bench]
fn bench_uncompress(b: &mut Bencher) {
let mut encoder = BlockEncoder::new();
let data = tests::generate_array(NUM_DOCS_PER_BLOCK, 0.1);
let data = tests::generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
let compressed = encoder.compress_block_sorted(&data, 0u32);
let mut decoder = BlockDecoder::new();
b.iter(|| { decoder.uncompress_block_sorted(compressed, 0u32); });
}
#[test]
fn test_all_docs_compression_numbits() {
for num_bits in 0..33 {
let mut data = [0u32; 128];
if num_bits > 0 {
data[0] = 1 << (num_bits - 1);
}
let mut encoder = BlockEncoder::new();
let compressed = encoder.compress_block_unsorted(&data);
assert_eq!(compressed[0] as usize, num_bits);
assert_eq!(compressed.len(), compressed_block_size(compressed[0]));
}
}
const NUM_INTS_BENCH_VINT: usize = 10;
@@ -210,7 +257,9 @@ pub mod tests {
let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001);
let compressed = encoder.compress_vint_sorted(&data, 0u32);
let mut decoder = BlockDecoder::new();
b.iter(|| { decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT); });
b.iter(|| {
decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT);
});
}
}

View File

@@ -3,15 +3,15 @@ use common::bitpacker::{BitPacker, BitUnpacker};
use common::CountingWriter;
use std::cmp;
use std::io::Write;
use super::super::NUM_DOCS_PER_BLOCK;
use super::super::COMPRESSION_BLOCK_SIZE;
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
pub fn compress_sorted(vals: &mut [u32], output: &mut [u8], offset: u32) -> usize {
let mut max_delta = 0;
{
let mut local_offset = offset;
for i in 0..NUM_DOCS_PER_BLOCK {
for i in 0..COMPRESSION_BLOCK_SIZE {
let val = vals[i];
let delta = val - local_offset;
max_delta = cmp::max(max_delta, delta);
@@ -22,6 +22,7 @@ pub fn compress_sorted(vals: &mut [u32], output: &mut [u8], offset: u32) -> usiz
let mut counting_writer = CountingWriter::wrap(output);
let num_bits = compute_num_bits(max_delta as u64);
counting_writer.write_all(&[num_bits]).unwrap();
let mut bit_packer = BitPacker::new(num_bits as usize);
for val in vals {
bit_packer.write(*val as u64, &mut counting_writer).unwrap();
@@ -34,7 +35,7 @@ pub fn compress_sorted(vals: &mut [u32], output: &mut [u8], offset: u32) -> usiz
pub struct BlockEncoder {
pub output: [u8; COMPRESSED_BLOCK_MAX_SIZE],
pub output_len: usize,
input_buffer: [u32; NUM_DOCS_PER_BLOCK],
input_buffer: [u32; COMPRESSION_BLOCK_SIZE],
}
impl BlockEncoder {
@@ -42,7 +43,7 @@ impl BlockEncoder {
BlockEncoder {
output: [0u8; COMPRESSED_BLOCK_MAX_SIZE],
output_len: 0,
input_buffer: [0u32; NUM_DOCS_PER_BLOCK],
input_buffer: [0u32; COMPRESSION_BLOCK_SIZE],
}
}
@@ -55,10 +56,9 @@ impl BlockEncoder {
pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] {
let compressed_size = {
let output: &mut [u8] = &mut self.output;
let max = vals.iter()
.cloned()
.max()
.expect("compress unsorted called with an empty array");
let max = vals.iter().cloned().max().expect(
"compress unsorted called with an empty array",
);
let num_bits = compute_num_bits(max as u64);
let mut counting_writer = CountingWriter::wrap(output);
counting_writer.write_all(&[num_bits]).unwrap();
@@ -66,8 +66,16 @@ impl BlockEncoder {
for val in vals {
bit_packer.write(*val as u64, &mut counting_writer).unwrap();
}
bit_packer.flush(&mut counting_writer);
// we voluntarility avoid writing "closing", because we
for _ in vals.len()..COMPRESSION_BLOCK_SIZE {
bit_packer
.write(vals[0] as u64, &mut counting_writer)
.unwrap();
}
bit_packer.flush(&mut counting_writer).expect(
"Flushing the bitpacking \
in an in RAM buffer should never fail",
);
// we avoid writing "closing", because we
// do not want 7 bytes of padding here.
counting_writer.written_bytes()
};
@@ -93,34 +101,35 @@ impl BlockDecoder {
}
}
pub fn uncompress_block_sorted<'a>(&mut self,
compressed_data: &'a [u8],
mut offset: u32)
-> &'a [u8] {
pub fn uncompress_block_sorted<'a>(
&mut self,
compressed_data: &'a [u8],
mut offset: u32,
) -> usize {
let consumed_size = {
let num_bits = compressed_data[0];
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize);
for i in 0..NUM_DOCS_PER_BLOCK {
for i in 0..COMPRESSION_BLOCK_SIZE {
let delta = bit_unpacker.get(i);
let val = offset + delta as u32;
self.output[i] = val;
offset = val;
}
1 + (num_bits as usize * NUM_DOCS_PER_BLOCK + 7) / 8
1 + (num_bits as usize * COMPRESSION_BLOCK_SIZE + 7) / 8
};
self.output_len = NUM_DOCS_PER_BLOCK;
&compressed_data[consumed_size..]
self.output_len = COMPRESSION_BLOCK_SIZE;
consumed_size
}
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> &'a [u8] {
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
let num_bits = compressed_data[0];
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize);
for i in 0..NUM_DOCS_PER_BLOCK {
for i in 0..COMPRESSION_BLOCK_SIZE {
self.output[i] = bit_unpacker.get(i) as u32;
}
let consumed_size = 1 + (num_bits as usize * NUM_DOCS_PER_BLOCK + 7) / 8;
self.output_len = NUM_DOCS_PER_BLOCK;
&compressed_data[consumed_size..]
let consumed_size = 1 + (num_bits as usize * COMPRESSION_BLOCK_SIZE + 7) / 8;
self.output_len = COMPRESSION_BLOCK_SIZE;
consumed_size
}
#[inline]

View File

@@ -1,6 +1,6 @@
use super::super::NUM_DOCS_PER_BLOCK;
use super::super::COMPRESSION_BLOCK_SIZE;
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
mod simdcomp {
use libc::size_t;
@@ -8,10 +8,11 @@ mod simdcomp {
extern "C" {
pub fn compress_sorted(data: *const u32, output: *mut u8, offset: u32) -> size_t;
pub fn uncompress_sorted(compressed_data: *const u8,
output: *mut u32,
offset: u32)
-> size_t;
pub fn uncompress_sorted(
compressed_data: *const u8,
output: *mut u32,
offset: u32,
) -> size_t;
pub fn compress_unsorted(data: *const u32, output: *mut u8) -> size_t;
@@ -78,19 +79,16 @@ impl BlockDecoder {
}
}
pub fn uncompress_block_sorted<'a>(&mut self,
compressed_data: &'a [u8],
offset: u32)
-> &'a [u8] {
pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32) -> usize {
let consumed_size = uncompress_sorted(compressed_data, &mut self.output, offset);
self.output_len = NUM_DOCS_PER_BLOCK;
&compressed_data[consumed_size..]
self.output_len = COMPRESSION_BLOCK_SIZE;
consumed_size
}
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> &'a [u8] {
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
let consumed_size = uncompress_unsorted(compressed_data, &mut self.output);
self.output_len = NUM_DOCS_PER_BLOCK;
&compressed_data[consumed_size..]
self.output_len = COMPRESSION_BLOCK_SIZE;
consumed_size
}
#[inline]
@@ -117,4 +115,5 @@ mod tests {
let compressed = encoder.compress_block_sorted(&data, 0u32);
assert_eq!(compressed.len(), 17);
}
}

135
src/compression/stream.rs Normal file
View File

@@ -0,0 +1,135 @@
use compression::BlockDecoder;
use compression::COMPRESSION_BLOCK_SIZE;
use compression::compressed_block_size;
use directory::{ReadOnlySource, SourceRead};
/// Reads a stream of compressed ints.
///
/// Tantivy uses `CompressedIntStream` to read
/// the position file.
/// The `.skip(...)` makes it possible to avoid
/// decompressing blocks that are not required.
pub struct CompressedIntStream {
    // Raw compressed bytes; advanced block by block as they get decoded.
    buffer: SourceRead,
    // Holds the most recently decompressed block of
    // `COMPRESSION_BLOCK_SIZE` integers (see `output_array()` usage below).
    block_decoder: BlockDecoder,
    // Cursor position within the current decompressed block.
    // `COMPRESSION_BLOCK_SIZE` means "exhausted, decode the next block".
    inner_offset: usize,
}
impl CompressedIntStream {
    /// Opens a compressed int stream.
    pub(crate) fn wrap(source: ReadOnlySource) -> CompressedIntStream {
        CompressedIntStream {
            buffer: SourceRead::from(source),
            block_decoder: BlockDecoder::new(),
            // Start in the "exhausted" state so that the first call to
            // `read` or `skip` triggers decompression of the first block.
            inner_offset: COMPRESSION_BLOCK_SIZE,
        }
    }

    /// Fills a buffer with the next `output.len()` integers,
    /// and advances the stream by that many elements.
    pub fn read(&mut self, output: &mut [u32]) {
        let mut num_els: usize = output.len();
        let mut start: usize = 0;
        loop {
            let available = COMPRESSION_BLOCK_SIZE - self.inner_offset;
            if num_els >= available {
                // Drain whatever is left of the current block...
                if available > 0 {
                    let uncompressed_block =
                        &self.block_decoder.output_array()[self.inner_offset..];
                    // Fix: the original wrote `&mut output[..].clone_from_slice(..)`,
                    // borrowing the `()` returned by `clone_from_slice` — a no-op
                    // reference that only triggers an unused-expression warning.
                    output[start..start + available].clone_from_slice(uncompressed_block);
                }
                num_els -= available;
                start += available;
                // ... then decompress the next block and restart the loop.
                let num_consumed_bytes =
                    self.block_decoder.uncompress_block_unsorted(self.buffer.as_ref());
                self.buffer.advance(num_consumed_bytes);
                self.inner_offset = 0;
            } else {
                // The current block contains all of the remaining requested values.
                let uncompressed_block = &self.block_decoder.output_array()
                    [self.inner_offset..self.inner_offset + num_els];
                output[start..start + num_els].clone_from_slice(uncompressed_block);
                self.inner_offset += num_els;
                break;
            }
        }
    }

    /// Skip the next `skip_len` integers.
    ///
    /// If a full block is skipped, calling
    /// `.skip(...)` will avoid decompressing it.
    pub fn skip(&mut self, mut skip_len: usize) {
        let available = COMPRESSION_BLOCK_SIZE - self.inner_offset;
        if available >= skip_len {
            // The skip stays within the already-decompressed block.
            self.inner_offset += skip_len;
        } else {
            skip_len -= available;
            // Entirely skip decompressing some blocks: a block's byte length
            // can be computed from its leading `num_bits` header byte alone.
            while skip_len >= COMPRESSION_BLOCK_SIZE {
                skip_len -= COMPRESSION_BLOCK_SIZE;
                let num_bits: u8 = self.buffer.as_ref()[0];
                let block_len = compressed_block_size(num_bits);
                self.buffer.advance(block_len);
            }
            // The landing block must be decompressed; position the cursor
            // at the remainder of the skip within it.
            let num_consumed_bytes =
                self.block_decoder.uncompress_block_unsorted(self.buffer.as_ref());
            self.buffer.advance(num_consumed_bytes);
            self.inner_offset = skip_len;
        }
    }
}
#[cfg(test)]
pub mod tests {

    use super::CompressedIntStream;
    use compression::compressed_block_size;
    use compression::COMPRESSION_BLOCK_SIZE;
    use compression::BlockEncoder;
    use directory::ReadOnlySource;

    /// Builds a buffer holding the compressed blocks for the integers
    /// `0..1025` (eight full blocks plus one partial block).
    fn create_stream_buffer() -> ReadOnlySource {
        let values: Vec<u32> = (0u32..1_025u32).collect();
        let mut encoder = BlockEncoder::new();
        let mut bytes: Vec<u8> = Vec::new();
        for chunk in values.chunks(COMPRESSION_BLOCK_SIZE) {
            let compressed_block = encoder.compress_block_unsorted(chunk);
            let num_bits = compressed_block[0];
            // The header byte alone must determine the block's byte length,
            // since `skip` relies on this to jump over whole blocks.
            assert_eq!(compressed_block_size(num_bits), compressed_block.len());
            bytes.extend_from_slice(compressed_block);
        }
        if cfg!(simd) {
            // SIMD decompression may read a few bytes past the end.
            bytes.extend_from_slice(&[0u8; 7]);
        }
        ReadOnlySource::from(bytes)
    }

    #[test]
    fn test_compressed_int_stream() {
        let mut stream = CompressedIntStream::wrap(create_stream_buffer());
        let mut block: [u32; COMPRESSION_BLOCK_SIZE] = [0u32; COMPRESSION_BLOCK_SIZE];

        // Interleave reads and skips, crossing block boundaries.
        stream.read(&mut block[0..2]);
        assert_eq!(block[0], 0);
        assert_eq!(block[1], 1);

        stream.skip(5);
        stream.read(&mut block[0..3]);
        assert_eq!(block[0], 7);
        assert_eq!(block[1], 8);
        assert_eq!(block[2], 9);

        stream.skip(500);
        stream.read(&mut block[0..3]);
        assert_eq!(block[0], 510);
        assert_eq!(block[1], 511);
        assert_eq!(block[2], 512);

        stream.skip(511);
        stream.read(&mut block[..1]);
        assert_eq!(block[0], 1024);
    }
}

View File

@@ -1,6 +1,10 @@
#[inline(always)]
pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32) -> &'a [u8] {
pub(crate) fn compress_sorted<'a>(
input: &[u32],
output: &'a mut [u8],
mut offset: u32,
) -> &'a [u8] {
let mut byte_written = 0;
for &v in input {
let mut to_encode: u32 = v - offset;
@@ -22,7 +26,7 @@ pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32)
}
#[inline(always)]
pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
let mut byte_written = 0;
for &v in input {
let mut to_encode: u32 = v;
@@ -43,10 +47,11 @@ pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
}
#[inline(always)]
pub fn uncompress_sorted<'a>(compressed_data: &'a [u8],
output: &mut [u32],
offset: u32)
-> &'a [u8] {
pub(crate) fn uncompress_sorted<'a>(
compressed_data: &'a [u8],
output: &mut [u32],
offset: u32,
) -> usize {
let mut read_byte = 0;
let mut result = offset;
let num_els = output.len();
@@ -63,11 +68,11 @@ pub fn uncompress_sorted<'a>(compressed_data: &'a [u8],
}
output[i] = result;
}
&compressed_data[read_byte..]
read_byte
}
#[inline(always)]
pub fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> &'a [u8] {
pub(crate) fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> usize {
let mut read_byte = 0;
let num_els = output.len();
for i in 0..num_els {
@@ -84,5 +89,5 @@ pub fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) ->
}
output[i] = result;
}
&compressed_data[read_byte..]
read_byte
}

View File

@@ -4,41 +4,47 @@ mod streamvbyte {
use libc::size_t;
extern "C" {
pub fn streamvbyte_delta_encode(data: *const u32,
num_els: u32,
output: *mut u8,
offset: u32)
-> size_t;
pub fn streamvbyte_delta_encode(
data: *const u32,
num_els: u32,
output: *mut u8,
offset: u32,
) -> size_t;
pub fn streamvbyte_delta_decode(compressed_data: *const u8,
output: *mut u32,
num_els: u32,
offset: u32)
-> size_t;
pub fn streamvbyte_delta_decode(
compressed_data: *const u8,
output: *mut u32,
num_els: u32,
offset: u32,
) -> size_t;
pub fn streamvbyte_encode(data: *const u32, num_els: u32, output: *mut u8) -> size_t;
pub fn streamvbyte_decode(compressed_data: *const u8,
output: *mut u32,
num_els: usize)
-> size_t;
pub fn streamvbyte_decode(
compressed_data: *const u8,
output: *mut u32,
num_els: usize,
) -> size_t;
}
}
#[inline(always)]
pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], offset: u32) -> &'a [u8] {
pub(crate) fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], offset: u32) -> &'a [u8] {
let compress_length = unsafe {
streamvbyte::streamvbyte_delta_encode(input.as_ptr(),
input.len() as u32,
output.as_mut_ptr(),
offset)
streamvbyte::streamvbyte_delta_encode(
input.as_ptr(),
input.len() as u32,
output.as_mut_ptr(),
offset,
)
};
&output[..compress_length]
}
#[inline(always)]
pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
let compress_length = unsafe {
streamvbyte::streamvbyte_encode(input.as_ptr(), input.len() as u32, output.as_mut_ptr())
};
@@ -46,23 +52,24 @@ pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
}
#[inline(always)]
pub fn uncompress_sorted<'a>(compressed_data: &'a [u8],
output: &mut [u32],
offset: u32)
-> &'a [u8] {
let consumed_bytes = unsafe {
streamvbyte::streamvbyte_delta_decode(compressed_data.as_ptr(),
output.as_mut_ptr(),
output.len() as u32,
offset)
};
&compressed_data[consumed_bytes..]
pub(crate) fn uncompress_sorted<'a>(
compressed_data: &'a [u8],
output: &mut [u32],
offset: u32,
) -> usize {
unsafe {
streamvbyte::streamvbyte_delta_decode(
compressed_data.as_ptr(),
output.as_mut_ptr(),
output.len() as u32,
offset,
)
}
}
#[inline(always)]
pub fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> &'a [u8] {
let consumed_bytes = unsafe {
pub(crate) fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> usize {
unsafe {
streamvbyte::streamvbyte_decode(compressed_data.as_ptr(), output.as_mut_ptr(), output.len())
};
&compressed_data[consumed_bytes..]
}
}

View File

@@ -48,9 +48,10 @@ impl Index {
pub fn create_in_ram(schema: Schema) -> Index {
let ram_directory = RAMDirectory::create();
// unwrap is ok here
let directory = ManagedDirectory::new(ram_directory)
.expect("Creating a managed directory from a brand new RAM directory \
should never fail.");
let directory = ManagedDirectory::new(ram_directory).expect(
"Creating a managed directory from a brand new RAM directory \
should never fail.",
);
Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail")
}
@@ -127,10 +128,11 @@ impl Index {
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// # Panics
/// If the heap size per thread is too small, panics.
pub fn writer_with_num_threads(&self,
num_threads: usize,
heap_size_in_bytes: usize)
-> Result<IndexWriter> {
pub fn writer_with_num_threads(
&self,
num_threads: usize,
heap_size_in_bytes: usize,
) -> Result<IndexWriter> {
open_index_writer(self, num_threads, heap_size_in_bytes)
}
@@ -155,10 +157,12 @@ impl Index {
/// Returns the list of segments that are searchable
pub fn searchable_segments(&self) -> Result<Vec<Segment>> {
Ok(self.searchable_segment_metas()?
.into_iter()
.map(|segment_meta| self.segment(segment_meta))
.collect())
Ok(
self.searchable_segment_metas()?
.into_iter()
.map(|segment_meta| self.segment(segment_meta))
.collect(),
)
}
#[doc(hidden)]
@@ -190,10 +194,12 @@ impl Index {
/// Returns the list of segment ids that are searchable.
pub fn searchable_segment_ids(&self) -> Result<Vec<SegmentId>> {
Ok(self.searchable_segment_metas()?
.iter()
.map(|segment_meta| segment_meta.id())
.collect())
Ok(
self.searchable_segment_metas()?
.iter()
.map(|segment_meta| segment_meta.id())
.collect(),
)
}
/// Creates a new generation of searchers after
@@ -203,10 +209,12 @@ impl Index {
/// published or after a merge.
pub fn load_searchers(&self) -> Result<()> {
let searchable_segments = self.searchable_segments()?;
let segment_readers: Vec<SegmentReader> = try!(searchable_segments
.into_iter()
.map(SegmentReader::open)
.collect());
let segment_readers: Vec<SegmentReader> = try!(
searchable_segments
.into_iter()
.map(SegmentReader::open)
.collect()
);
let searchers = (0..NUM_SEARCHERS)
.map(|_| Searcher::from(segment_readers.clone()))
.collect();

View File

@@ -9,7 +9,7 @@ use core::SegmentMeta;
/// * the index docstamp
/// * the schema
///
#[derive(Clone,Debug,Serialize, Deserialize)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct IndexMeta {
pub segments: Vec<SegmentMeta>,
pub schema: Schema,

View File

@@ -0,0 +1,164 @@
use directory::{SourceRead, ReadOnlySource};
use termdict::{TermDictionary, TermDictionaryImpl};
use postings::{SegmentPostings, BlockSegmentPostings};
use postings::TermInfo;
use postings::SegmentPostingsOption;
use schema::Term;
use std::cmp;
use fastfield::DeleteBitSet;
use schema::Schema;
use compression::CompressedIntStream;
/// The inverted index reader is in charge of accessing
/// the inverted index associated to a specific field.
///
/// # Note
///
/// It is safe to delete the segment associated to
/// an `InvertedIndexReader`. As long as it is open,
/// the `ReadOnlySource` it is relying on should
/// stay available.
///
///
/// `InvertedIndexReader` are created by calling
/// the `SegmentReader`'s [`.inverted_index(...)`] method
pub struct InvertedIndexReader {
    // Term dictionary: maps a term's bytes to its `TermInfo`.
    termdict: TermDictionaryImpl,
    // Raw postings data (doc ids / frequencies) for this field.
    postings_source: ReadOnlySource,
    // Raw positions data for this field.
    positions_source: ReadOnlySource,
    // Deleted-document bitset of the segment, cloned into the
    // postings objects handed out by this reader.
    delete_bitset: DeleteBitSet,
    schema: Schema,
}
impl InvertedIndexReader {
    // Builds the reader from the raw per-field data sources.
    // The term dictionary is deserialized eagerly; postings and
    // positions stay as raw sources, sliced lazily per term.
    pub(crate) fn new(
        termdict_source: ReadOnlySource,
        postings_source: ReadOnlySource,
        positions_source: ReadOnlySource,
        delete_bitset: DeleteBitSet,
        schema: Schema,
    ) -> InvertedIndexReader {
        InvertedIndexReader {
            termdict: TermDictionaryImpl::from_source(termdict_source),
            postings_source: postings_source,
            positions_source: positions_source,
            delete_bitset: delete_bitset,
            schema: schema,
        }
    }

    /// Returns the term info associated with the term.
    pub fn get_term_info(&self, term: &Term) -> Option<TermInfo> {
        self.termdict.get(term.as_slice())
    }

    /// Return the term dictionary datastructure.
    pub fn terms(&self) -> &TermDictionaryImpl {
        &self.termdict
    }

    /// Resets the block segment to another position of the postings
    /// file.
    ///
    /// This is useful for enumerating through a list of terms,
    /// and consuming the associated posting lists while avoiding
    /// reallocating a `BlockSegmentPostings`.
    ///
    /// # Warning
    ///
    /// This does not reset the positions list.
    pub fn reset_block_postings_from_terminfo(
        &self,
        term_info: &TermInfo,
        block_postings: &mut BlockSegmentPostings,
    ) {
        let offset = term_info.postings_offset as usize;
        let end_source = self.postings_source.len();
        // Slice from this term's postings offset to the end of the file;
        // the block postings is bounded by `doc_freq`, not by the slice end.
        let postings_slice = self.postings_source.slice(offset, end_source);
        let postings_reader = SourceRead::from(postings_slice);
        block_postings.reset(term_info.doc_freq as usize, postings_reader);
    }

    /// Returns a block postings given a `term_info`.
    /// This method is for an advanced usage only.
    ///
    /// Most user should prefer using `read_postings` instead.
    pub fn read_block_postings_from_terminfo(
        &self,
        term_info: &TermInfo,
        option: SegmentPostingsOption,
    ) -> BlockSegmentPostings {
        let offset = term_info.postings_offset as usize;
        let postings_data = self.postings_source.slice_from(offset);
        let has_freq = option.has_freq();
        BlockSegmentPostings::from_data(
            term_info.doc_freq as usize,
            SourceRead::from(postings_data),
            has_freq,
        )
    }

    /// Returns a posting object given a `term_info`.
    /// This method is for an advanced usage only.
    ///
    /// Most user should prefer using `read_postings` instead.
    pub fn read_postings_from_terminfo(
        &self,
        term_info: &TermInfo,
        option: SegmentPostingsOption,
    ) -> SegmentPostings {
        let block_postings = self.read_block_postings_from_terminfo(term_info, option);
        let delete_bitset = self.delete_bitset.clone();
        let position_stream = {
            if option.has_positions() {
                let position_offset = term_info.positions_offset;
                let positions_source = self.positions_source.slice_from(position_offset as usize);
                let mut stream = CompressedIntStream::wrap(positions_source);
                // A term's positions may start in the middle of a compressed
                // block: skip to the exact in-block offset.
                stream.skip(term_info.positions_inner_offset as usize);
                Some(stream)
            } else {
                None
            }
        };
        SegmentPostings::from_block_postings(block_postings, delete_bitset, position_stream)
    }

    /// Returns the segment postings associated with the term, and with the given option,
    /// or `None` if the term has never been encountered and indexed.
    ///
    /// If the field was not indexed with the indexing options that cover
    /// the requested options, the method does not fail and returns a
    /// `SegmentPostings` with as much information as possible.
    ///
    /// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a
    /// `TextIndexingOptions` that does not index position will return a `SegmentPostings`
    /// with `DocId`s and frequencies.
    pub fn read_postings(
        &self,
        term: &Term,
        option: SegmentPostingsOption,
    ) -> Option<SegmentPostings> {
        let field = term.field();
        let field_entry = self.schema.get_field_entry(field);
        // NOTE(review): `get!` is a project macro — presumably it
        // early-returns `None` when the value is absent; confirm at its definition.
        let term_info = get!(self.get_term_info(term));
        let maximum_option = get!(field_entry.field_type().get_segment_postings_option());
        // Serve at most what the field was actually indexed with.
        let best_effort_option = cmp::min(maximum_option, option);
        Some(self.read_postings_from_terminfo(
            &term_info,
            best_effort_option,
        ))
    }

    /// Returns the number of documents containing the term.
    pub fn doc_freq(&self, term: &Term) -> u32 {
        match self.get_term_info(term) {
            Some(term_info) => term_info.doc_freq,
            None => 0,
        }
    }
}

View File

@@ -7,7 +7,9 @@ mod segment;
mod index_meta;
mod pool;
mod segment_meta;
mod inverted_index_reader;
pub use self::inverted_index_reader::InvertedIndexReader;
pub use self::searcher::Searcher;
pub use self::segment_component::SegmentComponent;
pub use self::segment_id::SegmentId;
@@ -18,7 +20,6 @@ pub use self::index::Index;
pub use self::segment_meta::SegmentMeta;
pub use self::index_meta::IndexMeta;
use std::path::PathBuf;
lazy_static! {

View File

@@ -76,8 +76,11 @@ impl<T> Pool<T> {
if former_generation >= generation {
break;
}
self.freshest_generation
.compare_and_swap(former_generation, generation, Ordering::SeqCst);
self.freshest_generation.compare_and_swap(
former_generation,
generation,
Ordering::SeqCst,
);
}
}
@@ -91,9 +94,9 @@ impl<T> Pool<T> {
let gen_item = self.queue.pop();
if gen_item.generation >= generation {
return LeasedItem {
gen_item: Some(gen_item),
recycle_queue: self.queue.clone(),
};
gen_item: Some(gen_item),
recycle_queue: self.queue.clone(),
};
} else {
// this searcher is obsolete,
// removing it from the pool.
@@ -113,25 +116,26 @@ impl<T> Deref for LeasedItem<T> {
fn deref(&self) -> &T {
&self.gen_item
.as_ref()
.expect("Unwrapping a leased item should never fail")
.item // unwrap is safe here
.as_ref()
.expect("Unwrapping a leased item should never fail")
.item // unwrap is safe here
}
}
impl<T> DerefMut for LeasedItem<T> {
fn deref_mut(&mut self) -> &mut T {
&mut self.gen_item
.as_mut()
.expect("Unwrapping a mut leased item should never fail")
.item // unwrap is safe here
.as_mut()
.expect("Unwrapping a mut leased item should never fail")
.item // unwrap is safe here
}
}
impl<T> Drop for LeasedItem<T> {
fn drop(&mut self) {
let gen_item: GenerationItem<T> = mem::replace(&mut self.gen_item, None)
.expect("Unwrapping a leased item should never fail");
let gen_item: GenerationItem<T> = mem::replace(&mut self.gen_item, None).expect(
"Unwrapping a leased item should never fail",
);
self.recycle_queue.push(gen_item);
}
}

View File

@@ -6,10 +6,11 @@ use common::TimerTree;
use query::Query;
use DocId;
use DocAddress;
use schema::Term;
use termdict::TermMerger;
use schema::{Term, Field};
use termdict::{TermMerger, TermDictionary};
use std::sync::Arc;
use std::fmt;
use postings::TermInfo;
use core::InvertedIndexReader;
/// Holds a list of `SegmentReader`s ready for search.
@@ -21,7 +22,6 @@ pub struct Searcher {
segment_readers: Vec<SegmentReader>,
}
impl Searcher {
/// Fetches a document from tantivy's store given a `DocAddress`.
///
@@ -46,7 +46,9 @@ impl Searcher {
pub fn doc_freq(&self, term: &Term) -> u32 {
self.segment_readers
.iter()
.map(|segment_reader| segment_reader.doc_freq(term))
.map(|segment_reader| {
segment_reader.inverted_index(term.field()).doc_freq(term)
})
.fold(0u32, |acc, val| acc + val)
}
@@ -65,20 +67,41 @@ impl Searcher {
query.search(self, collector)
}
/// Returns a Stream over all of the sorted unique terms of
/// the searcher.
///
/// This includes all of the fields from all of the segment_readers.
/// See [`TermIterator`](struct.TermIterator.html).
///
/// # Warning
/// This API is very likely to change in the future.
pub fn terms(&self) -> TermMerger<TermInfo> {
TermMerger::from(self.segment_readers())
/// Return the field searcher associated to a `Field`.
pub fn field(&self, field: Field) -> FieldSearcher {
let inv_index_readers = self.segment_readers
.iter()
.map(|segment_reader| segment_reader.inverted_index(field))
.collect::<Vec<_>>();
FieldSearcher::new(inv_index_readers)
}
}
/// View over a single field across all of a searcher's segments.
pub struct FieldSearcher {
    // One inverted index reader per segment, in segment order.
    inv_index_readers: Vec<Arc<InvertedIndexReader>>,
}
impl FieldSearcher {
    /// Builds a `FieldSearcher` from the per-segment inverted index readers.
    fn new(inv_index_readers: Vec<Arc<InvertedIndexReader>>) -> FieldSearcher {
        FieldSearcher { inv_index_readers: inv_index_readers }
    }

    /// Returns a stream over all of the sorted unique terms
    /// for the given field.
    pub fn terms(&self) -> TermMerger {
        let mut streams = Vec::with_capacity(self.inv_index_readers.len());
        for inv_index_reader in &self.inv_index_readers {
            streams.push(inv_index_reader.terms().stream());
        }
        TermMerger::new(streams)
    }
}
impl From<Vec<SegmentReader>> for Searcher {
fn from(segment_readers: Vec<SegmentReader>) -> Searcher {
Searcher { segment_readers: segment_readers }

View File

@@ -76,18 +76,20 @@ impl Segment {
}
/// Open one of the component file for a *regular* read.
pub fn open_read(&self,
component: SegmentComponent)
-> result::Result<ReadOnlySource, OpenReadError> {
pub fn open_read(
&self,
component: SegmentComponent,
) -> result::Result<ReadOnlySource, OpenReadError> {
let path = self.relative_path(component);
let source = try!(self.index.directory().open_read(&path));
Ok(source)
}
/// Open one of the component file for *regular* write.
pub fn open_write(&mut self,
component: SegmentComponent)
-> result::Result<WritePtr, OpenWriteError> {
pub fn open_write(
&mut self,
component: SegmentComponent,
) -> result::Result<WritePtr, OpenWriteError> {
let path = self.relative_path(component);
let write = try!(self.index.directory_mut().open_write(&path));
Ok(write)
@@ -125,11 +127,11 @@ mod tests {
{
let _file_protection = segment.protect_from_delete(SegmentComponent::POSTINGS);
assert!(directory.exists(&*path));
directory.garbage_collect(|| { living_files.clone() });
directory.garbage_collect(|| living_files.clone());
assert!(directory.exists(&*path));
}
directory.garbage_collect(|| { living_files });
directory.garbage_collect(|| living_files);
assert!(!directory.exists(&*path));
}

View File

@@ -28,13 +28,15 @@ pub enum SegmentComponent {
impl SegmentComponent {
/// Iterates through the components.
pub fn iterator() -> impl Iterator<Item = &'static SegmentComponent> {
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [SegmentComponent::POSTINGS,
SegmentComponent::POSITIONS,
SegmentComponent::FASTFIELDS,
SegmentComponent::FIELDNORMS,
SegmentComponent::TERMS,
SegmentComponent::STORE,
SegmentComponent::DELETE];
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [
SegmentComponent::POSTINGS,
SegmentComponent::POSITIONS,
SegmentComponent::FASTFIELDS,
SegmentComponent::FIELDNORMS,
SegmentComponent::TERMS,
SegmentComponent::STORE,
SegmentComponent::DELETE,
];
SEGMENT_COMPONENTS.into_iter()
}
}

View File

@@ -64,16 +64,14 @@ impl SegmentMeta {
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
let mut path = self.id().uuid_string();
path.push_str(&*match component {
SegmentComponent::POSITIONS => ".pos".to_string(),
SegmentComponent::POSTINGS => ".idx".to_string(),
SegmentComponent::TERMS => ".term".to_string(),
SegmentComponent::STORE => ".store".to_string(),
SegmentComponent::FASTFIELDS => ".fast".to_string(),
SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
SegmentComponent::DELETE => {
format!(".{}.del", self.delete_opstamp().unwrap_or(0))
}
});
SegmentComponent::POSITIONS => ".pos".to_string(),
SegmentComponent::POSTINGS => ".idx".to_string(),
SegmentComponent::TERMS => ".term".to_string(),
SegmentComponent::STORE => ".store".to_string(),
SegmentComponent::FASTFIELDS => ".fast".to_string(),
SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
SegmentComponent::DELETE => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
});
PathBuf::from(path)
}
@@ -111,8 +109,8 @@ impl SegmentMeta {
#[doc(hidden)]
pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) {
self.deletes = Some(DeleteMeta {
num_deleted_docs: num_deleted_docs,
opstamp: opstamp,
});
num_deleted_docs: num_deleted_docs,
opstamp: opstamp,
});
}
}

View File

@@ -2,28 +2,24 @@ use Result;
use core::Segment;
use core::SegmentId;
use core::SegmentComponent;
use schema::Term;
use std::sync::RwLock;
use common::HasLen;
use core::SegmentMeta;
use fastfield::{self, FastFieldNotAvailableError};
use fastfield::DeleteBitSet;
use store::StoreReader;
use schema::Document;
use directory::ReadOnlySource;
use schema::Document;
use DocId;
use std::str;
use termdict::TermDictionary;
use std::cmp;
use postings::TermInfo;
use termdict::TermDictionaryImpl;
use std::sync::Arc;
use std::collections::HashMap;
use common::CompositeFile;
use std::fmt;
use core::InvertedIndexReader;
use schema::Field;
use postings::SegmentPostingsOption;
use postings::{SegmentPostings, BlockSegmentPostings};
use fastfield::{FastFieldsReader, FastFieldReader, U64FastFieldReader};
use fastfield::{FastFieldReader, U64FastFieldReader};
use schema::Schema;
use postings::FreqHandler;
@@ -40,15 +36,19 @@ use postings::FreqHandler;
///
#[derive(Clone)]
pub struct SegmentReader {
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,
segment_id: SegmentId,
segment_meta: SegmentMeta,
terms: Arc<TermDictionaryImpl>,
postings_data: ReadOnlySource,
termdict_composite: CompositeFile,
postings_composite: CompositeFile,
positions_composite: CompositeFile,
fast_fields_composite: CompositeFile,
fieldnorms_composite: CompositeFile,
store_reader: StoreReader,
fast_fields_reader: Arc<FastFieldsReader>,
fieldnorms_reader: Arc<FastFieldsReader>,
delete_bitset: DeleteBitSet,
positions_data: ReadOnlySource,
schema: Schema,
}
@@ -76,11 +76,6 @@ impl SegmentReader {
self.delete_bitset.len() as DocId
}
#[doc(hidden)]
pub fn fast_fields_reader(&self) -> &FastFieldsReader {
&*self.fast_fields_reader
}
/// Accessor to a segment's fast field reader given a field.
///
/// Returns the u64 fast value reader if the field
@@ -91,17 +86,18 @@ impl SegmentReader {
///
/// # Panics
/// May panic if the index is corrupted.
pub fn get_fast_field_reader<TFastFieldReader: FastFieldReader>
(&self,
field: Field)
-> fastfield::Result<TFastFieldReader> {
pub fn get_fast_field_reader<TFastFieldReader: FastFieldReader>(
&self,
field: Field,
) -> fastfield::Result<TFastFieldReader> {
let field_entry = self.schema.get_field_entry(field);
if !TFastFieldReader::is_enabled(field_entry.field_type()) {
Err(FastFieldNotAvailableError::new(field_entry))
} else {
Ok(self.fast_fields_reader
.open_reader(field)
.expect("Fast field file corrupted."))
self.fast_fields_composite
.open_read(field)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
.map(TFastFieldReader::open)
}
}
@@ -114,15 +110,9 @@ impl SegmentReader {
/// They are simply stored as a fast field, serialized in
/// the `.fieldnorm` file of the segment.
pub fn get_fieldnorms_reader(&self, field: Field) -> Option<U64FastFieldReader> {
self.fieldnorms_reader.open_reader(field)
}
/// Returns the number of documents containing the term.
pub fn doc_freq(&self, term: &Term) -> u32 {
match self.get_term_info(term) {
Some(term_info) => term_info.doc_freq,
None => 0,
}
self.fieldnorms_composite.open_read(field).map(
U64FastFieldReader::open,
)
}
/// Accessor to the segment's `StoreReader`.
@@ -133,23 +123,30 @@ impl SegmentReader {
/// Open a new segment for reading.
pub fn open(segment: Segment) -> Result<SegmentReader> {
let source = segment.open_read(SegmentComponent::TERMS)?;
let terms = TermDictionaryImpl::from_source(source)?;
let termdict_source = segment.open_read(SegmentComponent::TERMS)?;
let termdict_composite = CompositeFile::open(termdict_source)?;
let store_source = segment.open_read(SegmentComponent::STORE)?;
let store_reader = StoreReader::from_source(store_source);
let postings_shared_mmap = segment.open_read(SegmentComponent::POSTINGS)?;
let postings_source = segment.open_read(SegmentComponent::POSTINGS)?;
let postings_composite = CompositeFile::open(postings_source)?;
let fast_field_data = segment.open_read(SegmentComponent::FASTFIELDS)?;
let fast_fields_reader = FastFieldsReader::from_source(fast_field_data)?;
let positions_composite = {
if let Ok(source) = segment.open_read(SegmentComponent::POSITIONS) {
CompositeFile::open(source)?
} else {
CompositeFile::empty()
}
};
let fast_fields_data = segment.open_read(SegmentComponent::FASTFIELDS)?;
let fast_fields_composite = CompositeFile::open(fast_fields_data)?;
let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
let fieldnorms_reader = FastFieldsReader::from_source(fieldnorms_data)?;
let fieldnorms_composite = CompositeFile::open(fieldnorms_data)?;
let positions_data = segment
.open_read(SegmentComponent::POSITIONS)
.unwrap_or_else(|_| ReadOnlySource::empty());
let delete_bitset = if segment.meta().has_deletes() {
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
@@ -160,22 +157,66 @@ impl SegmentReader {
let schema = segment.schema();
Ok(SegmentReader {
segment_meta: segment.meta().clone(),
postings_data: postings_shared_mmap,
terms: Arc::new(terms),
segment_id: segment.id(),
store_reader: store_reader,
fast_fields_reader: Arc::new(fast_fields_reader),
fieldnorms_reader: Arc::new(fieldnorms_reader),
delete_bitset: delete_bitset,
positions_data: positions_data,
schema: schema,
})
inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
segment_meta: segment.meta().clone(),
termdict_composite: termdict_composite,
postings_composite: postings_composite,
fast_fields_composite: fast_fields_composite,
fieldnorms_composite: fieldnorms_composite,
segment_id: segment.id(),
store_reader: store_reader,
delete_bitset: delete_bitset,
positions_composite: positions_composite,
schema: schema,
})
}
/// Return the term dictionary datastructure.
pub fn terms(&self) -> &TermDictionaryImpl {
&self.terms
/// Returns a field reader associated to the field given in argument.
///
/// The field reader is in charge of iterating through the
/// term dictionary associated to a specific field,
/// and opening the posting list associated to any term.
///
/// The reader is opened lazily on first access and cached.
pub fn inverted_index(&self, field: Field) -> Arc<InvertedIndexReader> {
    // Fast path: reuse the reader if this field was already opened.
    if let Some(inv_idx_reader) =
        self.inv_idx_reader_cache
            .read()
            .expect("Lock poisoned. This should never happen")
            .get(&field)
    {
        // BUG FIX: the original computed `inv_idx_reader.clone();` as a
        // discarded statement instead of returning it, so the cache was
        // never actually used and every call re-opened the inverted index.
        return inv_idx_reader.clone();
    }
    let termdict_source: ReadOnlySource = self.termdict_composite.open_read(field).expect(
        "Index corrupted. Failed to open field term dictionary in composite file.",
    );
    let postings_source = self.postings_composite.open_read(field).expect(
        "Index corrupted. Failed to open field postings in composite file.",
    );
    let positions_source = self.positions_composite.open_read(field).expect(
        "Index corrupted. Failed to open field positions in composite file.",
    );
    let inv_idx_reader = Arc::new(InvertedIndexReader::new(
        termdict_source,
        postings_source,
        positions_source,
        self.delete_bitset.clone(),
        self.schema.clone(),
    ));
    // By releasing the read lock before taking the write lock, two threads
    // may race and open the inverted index twice; this is fine — both
    // readers are equivalent and the last insert simply wins.
    self.inv_idx_reader_cache
        .write()
        .expect(
            "Field reader cache lock poisoned. This should never happen.",
        )
        .insert(field, inv_idx_reader.clone());
    inv_idx_reader
}
/// Returns the document (or to be accurate, its stored field)
@@ -187,89 +228,6 @@ impl SegmentReader {
}
/// Returns the segment postings associated with the term, and with the given option,
/// or `None` if the term has never been encountered and indexed.
///
/// If the field was not indexed with the indexing options that cover
/// the requested options, the returned `SegmentPostings` the method does not fail
/// and returns a `SegmentPostings` with as much information as possible.
///
/// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a
/// `TextIndexingOptions` that does not index position will return a `SegmentPostings`
/// with `DocId`s and frequencies.
pub fn read_postings(&self,
term: &Term,
option: SegmentPostingsOption)
-> Option<SegmentPostings> {
let field = term.field();
let field_entry = self.schema.get_field_entry(field);
let term_info = get!(self.get_term_info(term));
let maximum_option = get!(field_entry.field_type().get_segment_postings_option());
let best_effort_option = cmp::min(maximum_option, option);
Some(self.read_postings_from_terminfo(&term_info, best_effort_option))
}
/// Returns a posting object given a `term_info`.
/// This method is for an advanced usage only.
///
/// Most user should prefer using `read_postings` instead.
pub fn read_postings_from_terminfo(&self,
term_info: &TermInfo,
option: SegmentPostingsOption)
-> SegmentPostings {
let block_postings = self.read_block_postings_from_terminfo(term_info, option);
let delete_bitset = self.delete_bitset.clone();
SegmentPostings::from_block_postings(block_postings, delete_bitset)
}
/// Returns a block postings given a `term_info`.
/// This method is for an advanced usage only.
///
/// Most user should prefer using `read_postings` instead.
pub fn read_block_postings_from_terminfo(&self,
term_info: &TermInfo,
option: SegmentPostingsOption)
-> BlockSegmentPostings {
let offset = term_info.postings_offset as usize;
let postings_data = &self.postings_data[offset..];
let freq_handler = match option {
SegmentPostingsOption::NoFreq => FreqHandler::new_without_freq(),
SegmentPostingsOption::Freq => FreqHandler::new_with_freq(),
SegmentPostingsOption::FreqAndPositions => {
let offset = term_info.positions_offset as usize;
let offseted_position_data = &self.positions_data[offset..];
FreqHandler::new_with_freq_and_position(offseted_position_data)
}
};
BlockSegmentPostings::from_data(term_info.doc_freq as usize, postings_data, freq_handler)
}
/// Resets the block segment to another position of the postings
/// file.
///
/// This is useful for enumerating through a list of terms,
/// and consuming the associated posting lists while avoiding
/// reallocating a `BlockSegmentPostings`.
///
/// # Warning
///
/// This does not reset the positions list.
pub fn reset_block_postings_from_terminfo<'a>(&'a self,
term_info: &TermInfo,
block_postings: &mut BlockSegmentPostings<'a>) {
let offset = term_info.postings_offset as usize;
let postings_data: &'a [u8] = &self.postings_data[offset..];
block_postings.reset(term_info.doc_freq as usize, postings_data);
}
/// Returns the term info associated with the term.
pub fn get_term_info(&self, term: &Term) -> Option<TermInfo> {
self.terms.get(term.as_slice())
}
/// Returns the segment id
pub fn segment_id(&self) -> SegmentId {
self.segment_id

View File

@@ -39,11 +39,11 @@ impl<T: BinarySerializable> LayerBuilder<T> {
doc_id.serialize(&mut self.buffer)?;
value.serialize(&mut self.buffer)?;
Ok(if self.remaining == 0 {
self.remaining = self.period;
Some((doc_id, offset))
} else {
None
})
self.remaining = self.period;
Some((doc_id, offset))
} else {
None
})
}
}
@@ -78,8 +78,10 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
loop {
skip_pointer = match skip_pointer {
Some((skip_doc_id, skip_offset)) => {
try!(self.get_skip_layer(layer_id)
.insert(skip_doc_id, &skip_offset))
try!(self.get_skip_layer(layer_id).insert(
skip_doc_id,
&skip_offset,
))
}
None => {
return Ok(());

View File

@@ -68,9 +68,14 @@ pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
};
let table_num_bits: usize = (1..)
.into_iter()
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
.take_while(|num_bits: &usize| {
compute_table_size(*num_bits) < table_size_limit
})
.last()
.expect(&format!("Per thread memory is too small: {}", per_thread_memory_budget));
.expect(&format!(
"Per thread memory is too small: {}",
per_thread_memory_budget
));
let table_size = compute_table_size(table_num_bits);
let heap_size = per_thread_memory_budget - table_size;
(heap_size, table_num_bits)
@@ -174,13 +179,10 @@ impl<'a> HashMap<'a> {
}
pub fn iter<'b: 'a>(&'b self) -> impl Iterator<Item = (&'a [u8], u32)> + 'b {
self.occupied
.iter()
.cloned()
.map(move |bucket: usize| {
let kv = self.table[bucket];
self.get_key_value(kv.key_value_addr)
})
self.occupied.iter().cloned().map(move |bucket: usize| {
let kv = self.table[bucket];
self.get_key_value(kv.key_value_addr)
})
}
@@ -282,8 +284,10 @@ mod tests {
let s1 = "abcdef";
let s2 = "abcdeg";
for i in 0..5 {
assert_eq!(murmurhash2(&s1[i..5].as_bytes()),
murmurhash2(&s2[i..5].as_bytes()));
assert_eq!(
murmurhash2(&s1[i..5].as_bytes()),
murmurhash2(&s2[i..5].as_bytes())
);
}
}
@@ -303,13 +307,13 @@ mod tests {
let keys: Vec<&'static str> =
vec!["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
b.iter(|| {
keys.iter()
.map(|&s| s.as_bytes())
.map(murmurhash2::murmurhash2)
.map(|h| h as u64)
.last()
.unwrap()
});
keys.iter()
.map(|&s| s.as_bytes())
.map(murmurhash2::murmurhash2)
.map(|h| h as u64)
.last()
.unwrap()
});
}

View File

@@ -144,7 +144,8 @@ impl InnerHeap {
addr
} else {
if self.next_heap.is_none() {
info!(r#"Exceeded heap size. The segment will be committed right after indexing this document."#,);
info!(r#"Exceeded heap size. The segment will be committed right
after indexing this document."#,);
self.next_heap = Some(Box::new(InnerHeap::with_capacity(self.buffer_len as usize)));
}
self.next_heap.as_mut().unwrap().allocate_space(num_bytes) + self.buffer_len
@@ -154,10 +155,9 @@ impl InnerHeap {
fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
let start = bytes_ref.0;
if start >= self.buffer_len {
self.next_heap
.as_ref()
.unwrap()
.get_slice(BytesRef(start - self.buffer_len))
self.next_heap.as_ref().unwrap().get_slice(BytesRef(
start - self.buffer_len,
))
} else {
let start = start as usize;
let len = NativeEndian::read_u16(&self.buffer[start..start + 2]) as usize;
@@ -167,10 +167,10 @@ impl InnerHeap {
fn get_mut_slice(&mut self, start: u32, stop: u32) -> &mut [u8] {
if start >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.get_mut_slice(start - self.buffer_len, stop - self.buffer_len)
self.next_heap.as_mut().unwrap().get_mut_slice(
start - self.buffer_len,
stop - self.buffer_len,
)
} else {
&mut self.buffer[start as usize..stop as usize]
}
@@ -188,10 +188,9 @@ impl InnerHeap {
fn get_mut(&mut self, addr: u32) -> *mut u8 {
if addr >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.get_mut(addr - self.buffer_len)
self.next_heap.as_mut().unwrap().get_mut(
addr - self.buffer_len,
)
} else {
let addr_isize = addr as isize;
unsafe { self.buffer.as_mut_ptr().offset(addr_isize) }
@@ -200,10 +199,9 @@ impl InnerHeap {
fn get_mut_ref<Item>(&mut self, addr: u32) -> &mut Item {
if addr >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.get_mut_ref(addr - self.buffer_len)
self.next_heap.as_mut().unwrap().get_mut_ref(
addr - self.buffer_len,
)
} else {
let v_ptr_u8 = self.get_mut(addr) as *mut u8;
let v_ptr = v_ptr_u8 as *mut Item;
@@ -213,10 +211,10 @@ impl InnerHeap {
pub fn set<Item>(&mut self, addr: u32, val: &Item) {
if addr >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.set(addr - self.buffer_len, val);
self.next_heap.as_mut().unwrap().set(
addr - self.buffer_len,
val,
);
} else {
let v_ptr: *const Item = val as *const Item;
let v_ptr_u8: *const u8 = v_ptr as *const u8;

View File

@@ -103,9 +103,11 @@ impl fmt::Display for OpenWriteError {
write!(f, "the file '{:?}' already exists", path)
}
OpenWriteError::IOError(ref err) => {
write!(f,
"an io error occurred while opening a file for writing: '{}'",
err)
write!(
f,
"an io error occurred while opening a file for writing: '{}'",
err
)
}
}
}
@@ -147,9 +149,11 @@ impl fmt::Display for OpenReadError {
write!(f, "the file '{:?}' does not exist", path)
}
OpenReadError::IOError(ref err) => {
write!(f,
"an io error occurred while opening a file for reading: '{}'",
err)
write!(
f,
"an io error occurred while opening a file for reading: '{}'",
err
)
}
}
}

View File

@@ -45,10 +45,9 @@ pub struct FileProtection {
}
fn unprotect_file_from_delete(directory: &ManagedDirectory, path: &Path) {
let mut meta_informations_wlock = directory
.meta_informations
.write()
.expect("Managed file lock poisoned");
let mut meta_informations_wlock = directory.meta_informations.write().expect(
"Managed file lock poisoned",
);
if let Some(counter_ref_mut) = meta_informations_wlock.protected_files.get_mut(path) {
(*counter_ref_mut) -= 1;
}
@@ -68,9 +67,10 @@ impl Drop for FileProtection {
/// Saves the file containing the list of existing files
/// that were created by tantivy.
fn save_managed_paths(directory: &mut Directory,
wlock: &RwLockWriteGuard<MetaInformation>)
-> io::Result<()> {
fn save_managed_paths(
directory: &mut Directory,
wlock: &RwLockWriteGuard<MetaInformation>,
) -> io::Result<()> {
let mut w = serde_json::to_vec(&wlock.managed_paths)?;
write!(&mut w, "\n")?;
directory.atomic_write(&MANAGED_FILEPATH, &w[..])?;
@@ -84,22 +84,22 @@ impl ManagedDirectory {
Ok(data) => {
let managed_files_json = String::from_utf8_lossy(&data);
let managed_files: HashSet<PathBuf> =
serde_json::from_str(&managed_files_json)
.chain_err(|| ErrorKind::CorruptedFile(MANAGED_FILEPATH.clone()))?;
serde_json::from_str(&managed_files_json).chain_err(|| {
ErrorKind::CorruptedFile(MANAGED_FILEPATH.clone())
})?;
Ok(ManagedDirectory {
directory: box directory,
meta_informations: Arc::new(RwLock::new(MetaInformation {
managed_paths: managed_files,
protected_files:
HashMap::default(),
})),
})
directory: box directory,
meta_informations: Arc::new(RwLock::new(MetaInformation {
managed_paths: managed_files,
protected_files: HashMap::default(),
})),
})
}
Err(OpenReadError::FileDoesNotExist(_)) => {
Ok(ManagedDirectory {
directory: box directory,
meta_informations: Arc::default(),
})
directory: box directory,
meta_informations: Arc::default(),
})
}
Err(OpenReadError::IOError(e)) => Err(From::from(e)),
}
@@ -116,15 +116,14 @@ impl ManagedDirectory {
/// If a file cannot be deleted (for permission reasons for instance)
/// an error is simply logged, and the file remains in the list of managed
/// files.
pub fn garbage_collect<L: FnOnce()-> HashSet<PathBuf> >(&mut self, get_living_files: L) {
pub fn garbage_collect<L: FnOnce() -> HashSet<PathBuf>>(&mut self, get_living_files: L) {
info!("Garbage collect");
let mut files_to_delete = vec![];
{
// releasing the lock as .delete() will use it too.
let meta_informations_rlock =
self.meta_informations
.read()
.expect("Managed directory rlock poisoned in garbage collect.");
let meta_informations_rlock = self.meta_informations.read().expect(
"Managed directory rlock poisoned in garbage collect.",
);
// It is crucial to get the living files after acquiring the
// read lock of meta informations. That way, we
@@ -177,9 +176,9 @@ impl ManagedDirectory {
if !deleted_files.is_empty() {
// update the list of managed files by removing
// the file that were removed.
let mut meta_informations_wlock = self.meta_informations
.write()
.expect("Managed directory wlock poisoned (2).");
let mut meta_informations_wlock = self.meta_informations.write().expect(
"Managed directory wlock poisoned (2).",
);
{
let managed_paths_write = &mut meta_informations_wlock.managed_paths;
for delete_file in &deleted_files {
@@ -202,13 +201,13 @@ impl ManagedDirectory {
pub fn protect_file_from_delete(&self, path: &Path) -> FileProtection {
let pathbuf = path.to_owned();
{
let mut meta_informations_wlock = self.meta_informations
.write()
.expect("Managed file lock poisoned on protect");
let mut meta_informations_wlock = self.meta_informations.write().expect(
"Managed file lock poisoned on protect",
);
*meta_informations_wlock
.protected_files
.entry(pathbuf.clone())
.or_insert(0) += 1;
.protected_files
.entry(pathbuf.clone())
.or_insert(0) += 1;
}
FileProtection {
directory: self.clone(),
@@ -224,9 +223,9 @@ impl ManagedDirectory {
/// will not lead to garbage files that will
/// never get removed.
fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> {
let mut meta_wlock = self.meta_informations
.write()
.expect("Managed file lock poisoned");
let mut meta_wlock = self.meta_informations.write().expect(
"Managed file lock poisoned",
);
let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned());
if has_changed {
save_managed_paths(self.directory.as_mut(), &meta_wlock)?;
@@ -241,8 +240,9 @@ impl Directory for ManagedDirectory {
}
fn open_write(&mut self, path: &Path) -> result::Result<WritePtr, OpenWriteError> {
self.register_file_as_managed(path)
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
self.register_file_as_managed(path).map_err(|e| {
IOError::with_path(path.to_owned(), e)
})?;
self.directory.open_write(path)
}
@@ -257,9 +257,9 @@ impl Directory for ManagedDirectory {
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
{
let metas_rlock = self.meta_informations
.read()
.expect("poisoned lock in managed directory meta");
let metas_rlock = self.meta_informations.read().expect(
"poisoned lock in managed directory meta",
);
if let Some(counter) = metas_rlock.protected_files.get(path) {
if *counter > 0 {
return Err(DeleteError::FileProtected(path.to_owned()));
@@ -327,7 +327,7 @@ mod tests {
{
let living_files: HashSet<PathBuf> =
[TEST_PATH1.to_owned()].into_iter().cloned().collect();
managed_directory.garbage_collect(|| { living_files });
managed_directory.garbage_collect(|| living_files);
}
{
assert!(managed_directory.exists(*TEST_PATH1));
@@ -343,7 +343,7 @@ mod tests {
}
{
let living_files: HashSet<PathBuf> = HashSet::new();
managed_directory.garbage_collect(|| { living_files });
managed_directory.garbage_collect(|| living_files);
}
{
assert!(!managed_directory.exists(*TEST_PATH1));
@@ -366,7 +366,7 @@ mod tests {
assert!(managed_directory.exists(*TEST_PATH1));
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
managed_directory.garbage_collect(|| { living_files.clone() });
managed_directory.garbage_collect(|| living_files.clone());
if cfg!(target_os = "windows") {
// On Windows, gc should try and fail the file as it is mmapped.
assert!(managed_directory.exists(*TEST_PATH1));
@@ -374,7 +374,7 @@ mod tests {
drop(_mmap_read);
// The file should still be in the list of managed file and
// eventually be deleted once mmap is released.
managed_directory.garbage_collect(|| { living_files });
managed_directory.garbage_collect(|| living_files);
assert!(!managed_directory.exists(*TEST_PATH1));
} else {
assert!(!managed_directory.exists(*TEST_PATH1));
@@ -398,11 +398,11 @@ mod tests {
{
let _file_protection = managed_directory.protect_file_from_delete(*TEST_PATH1);
managed_directory.garbage_collect(|| { living_files.clone() });
managed_directory.garbage_collect(|| living_files.clone());
assert!(managed_directory.exists(*TEST_PATH1));
}
managed_directory.garbage_collect(|| { living_files.clone() });
managed_directory.garbage_collect(|| living_files.clone());
assert!(!managed_directory.exists(*TEST_PATH1));

View File

@@ -24,15 +24,17 @@ use std::sync::Weak;
use tempdir::TempDir;
fn open_mmap(full_path: &PathBuf) -> result::Result<Option<Arc<Mmap>>, OpenReadError> {
let file = File::open(&full_path)
.map_err(|e| if e.kind() == io::ErrorKind::NotFound {
OpenReadError::FileDoesNotExist(full_path.clone())
} else {
OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e))
})?;
let file = File::open(&full_path).map_err(|e| if e.kind() ==
io::ErrorKind::NotFound
{
OpenReadError::FileDoesNotExist(full_path.clone())
} else {
OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e))
})?;
let meta_data = file.metadata()
.map_err(|e| IOError::with_path(full_path.to_owned(), e))?;
let meta_data = file.metadata().map_err(|e| {
IOError::with_path(full_path.to_owned(), e)
})?;
if meta_data.len() == 0 {
// if the file size is 0, it will not be possible
// to mmap the file, so we return an anonymous mmap_cache
@@ -46,7 +48,7 @@ fn open_mmap(full_path: &PathBuf) -> result::Result<Option<Arc<Mmap>>, OpenReadE
}
#[derive(Default,Clone,Debug,Serialize,Deserialize)]
#[derive(Default, Clone, Debug, Serialize, Deserialize)]
pub struct CacheCounters {
// Number of time the cache prevents to call `mmap`
pub hit: usize,
@@ -58,7 +60,7 @@ pub struct CacheCounters {
pub miss_weak: usize,
}
#[derive(Clone,Debug,Serialize,Deserialize)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct CacheInfo {
pub counters: CacheCounters,
pub mmapped: Vec<PathBuf>,
@@ -113,31 +115,31 @@ impl MmapCache {
self.cleanup();
}
Ok(match self.cache.entry(full_path.clone()) {
HashMapEntry::Occupied(mut occupied_entry) => {
if let Some(mmap_arc) = occupied_entry.get().upgrade() {
self.counters.hit += 1;
Some(mmap_arc.clone())
} else {
// The entry exists but the weak ref has been destroyed.
self.counters.miss_weak += 1;
if let Some(mmap_arc) = open_mmap(&full_path)? {
occupied_entry.insert(Arc::downgrade(&mmap_arc));
Some(mmap_arc)
} else {
None
}
}
}
HashMapEntry::Vacant(vacant_entry) => {
self.counters.miss_empty += 1;
if let Some(mmap_arc) = open_mmap(&full_path)? {
vacant_entry.insert(Arc::downgrade(&mmap_arc));
Some(mmap_arc)
} else {
None
}
}
})
HashMapEntry::Occupied(mut occupied_entry) => {
if let Some(mmap_arc) = occupied_entry.get().upgrade() {
self.counters.hit += 1;
Some(mmap_arc.clone())
} else {
// The entry exists but the weak ref has been destroyed.
self.counters.miss_weak += 1;
if let Some(mmap_arc) = open_mmap(&full_path)? {
occupied_entry.insert(Arc::downgrade(&mmap_arc));
Some(mmap_arc)
} else {
None
}
}
}
HashMapEntry::Vacant(vacant_entry) => {
self.counters.miss_empty += 1;
if let Some(mmap_arc) = open_mmap(&full_path)? {
vacant_entry.insert(Arc::downgrade(&mmap_arc));
Some(mmap_arc)
} else {
None
}
}
})
}
}
@@ -180,15 +182,19 @@ impl MmapDirectory {
/// exist or if it is not a directory.
pub fn open(directory_path: &Path) -> Result<MmapDirectory, OpenDirectoryError> {
if !directory_path.exists() {
Err(OpenDirectoryError::DoesNotExist(PathBuf::from(directory_path)))
Err(OpenDirectoryError::DoesNotExist(
PathBuf::from(directory_path),
))
} else if !directory_path.is_dir() {
Err(OpenDirectoryError::NotADirectory(PathBuf::from(directory_path)))
Err(OpenDirectoryError::NotADirectory(
PathBuf::from(directory_path),
))
} else {
Ok(MmapDirectory {
root_path: PathBuf::from(directory_path),
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
_temp_directory: Arc::new(None),
})
root_path: PathBuf::from(directory_path),
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
_temp_directory: Arc::new(None),
})
}
}
@@ -215,9 +221,9 @@ impl MmapDirectory {
use std::os::windows::fs::OpenOptionsExt;
use winapi::winbase;
open_opts
.write(true)
.custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
open_opts.write(true).custom_flags(
winbase::FILE_FLAG_BACKUP_SEMANTICS,
);
}
let fd = try!(open_opts.open(&self.root_path));
@@ -270,46 +276,50 @@ impl Directory for MmapDirectory {
debug!("Open Read {:?}", path);
let full_path = self.resolve_path(path);
let mut mmap_cache = self.mmap_cache
.write()
.map_err(|_| {
let msg = format!("Failed to acquired write lock \
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
let msg = format!(
"Failed to acquired write lock \
on mmap cache while reading {:?}",
path);
IOError::with_path(path.to_owned(), make_io_err(msg))
})?;
path
);
IOError::with_path(path.to_owned(), make_io_err(msg))
})?;
Ok(mmap_cache
.get_mmap(full_path)?
.map(MmapReadOnly::from)
.map(ReadOnlySource::Mmap)
.unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty())))
Ok(
mmap_cache
.get_mmap(full_path)?
.map(MmapReadOnly::from)
.map(ReadOnlySource::Mmap)
.unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty())),
)
}
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
debug!("Open Write {:?}", path);
let full_path = self.resolve_path(path);
let open_res = OpenOptions::new()
.write(true)
.create_new(true)
.open(full_path);
let open_res = OpenOptions::new().write(true).create_new(true).open(
full_path,
);
let mut file = open_res
.map_err(|err| if err.kind() == io::ErrorKind::AlreadyExists {
OpenWriteError::FileAlreadyExists(path.to_owned())
} else {
IOError::with_path(path.to_owned(), err).into()
})?;
let mut file = open_res.map_err(|err| if err.kind() ==
io::ErrorKind::AlreadyExists
{
OpenWriteError::FileAlreadyExists(path.to_owned())
} else {
IOError::with_path(path.to_owned(), err).into()
})?;
// making sure the file is created.
file.flush()
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
file.flush().map_err(
|e| IOError::with_path(path.to_owned(), e),
)?;
// Apparetntly, on some filesystem syncing the parent
// directory is required.
self.sync_directory()
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
self.sync_directory().map_err(|e| {
IOError::with_path(path.to_owned(), e)
})?;
let writer = SafeFileWriter::new(file);
Ok(BufWriter::new(Box::new(writer)))
@@ -318,22 +328,23 @@ impl Directory for MmapDirectory {
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
debug!("Deleting file {:?}", path);
let full_path = self.resolve_path(path);
let mut mmap_cache = self.mmap_cache
.write()
.map_err(|_| {
let msg = format!("Failed to acquired write lock \
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
let msg = format!(
"Failed to acquired write lock \
on mmap cache while deleting {:?}",
path);
IOError::with_path(path.to_owned(), make_io_err(msg))
})?;
path
);
IOError::with_path(path.to_owned(), make_io_err(msg))
})?;
// Removing the entry in the MMap cache.
// The munmap will appear on Drop,
// when the last reference is gone.
mmap_cache.cache.remove(&full_path);
match fs::remove_file(&full_path) {
Ok(_) => {
self.sync_directory()
.map_err(|e| IOError::with_path(path.to_owned(), e).into())
self.sync_directory().map_err(|e| {
IOError::with_path(path.to_owned(), e).into()
})
}
Err(e) => {
if e.kind() == io::ErrorKind::NotFound {
@@ -355,8 +366,9 @@ impl Directory for MmapDirectory {
let mut buffer = Vec::new();
match File::open(&full_path) {
Ok(mut file) => {
file.read_to_end(&mut buffer)
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
file.read_to_end(&mut buffer).map_err(|e| {
IOError::with_path(path.to_owned(), e)
})?;
Ok(buffer)
}
Err(e) => {

View File

@@ -13,14 +13,15 @@ mod managed_directory;
/// Errors specific to the directory module.
pub mod error;
use std::io::{Write, Seek};
use std::io::{Write, Seek, BufWriter};
use std::io::BufWriter;
pub use self::read_only_source::ReadOnlySource;
pub use self::directory::Directory;
pub use self::ram_directory::RAMDirectory;
pub use self::mmap_directory::MmapDirectory;
pub use self::managed_directory::{ManagedDirectory, FileProtection};
pub(crate) use self::read_only_source::SourceRead;
pub(crate) use self::managed_directory::{ManagedDirectory, FileProtection};
/// Synonym of Seek + Write
pub trait SeekableWrite: Seek + Write {}

View File

@@ -41,8 +41,10 @@ impl VecWriter {
impl Drop for VecWriter {
fn drop(&mut self) {
if !self.is_flushed {
panic!("You forgot to flush {:?} before its writter got Drop. Do not rely on drop.",
self.path)
panic!(
"You forgot to flush {:?} before its writter got Drop. Do not rely on drop.",
self.path
)
}
}
}
@@ -62,8 +64,10 @@ impl Write for VecWriter {
fn flush(&mut self) -> io::Result<()> {
self.is_flushed = true;
try!(self.shared_directory
.write(self.path.clone(), self.data.get_ref()));
try!(self.shared_directory.write(
self.path.clone(),
self.data.get_ref(),
));
Ok(())
}
}
@@ -79,11 +83,11 @@ impl InnerDirectory {
}
fn write(&self, path: PathBuf, data: &[u8]) -> io::Result<bool> {
let mut map = try!(self.0
.write()
.map_err(|_| {
make_io_err(format!("Failed to lock the directory, when trying to write {:?}",
path))
let mut map = try!(self.0.write().map_err(|_| {
make_io_err(format!(
"Failed to lock the directory, when trying to write {:?}",
path
))
}));
let prev_value = map.insert(path, Arc::new(Vec::from(data)));
Ok(prev_value.is_some())
@@ -93,17 +97,21 @@ impl InnerDirectory {
self.0
.read()
.map_err(|_| {
let msg = format!("Failed to acquire read lock for the \
let msg = format!(
"Failed to acquire read lock for the \
directory when trying to read {:?}",
path);
let io_err = make_io_err(msg);
OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err))
})
path
);
let io_err = make_io_err(msg);
OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err))
})
.and_then(|readable_map| {
readable_map
.get(path)
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
.map(|data| ReadOnlySource::Anonymous(SharedVecSlice::new(data.clone())))
.map(|data| {
ReadOnlySource::Anonymous(SharedVecSlice::new(data.clone()))
})
})
}
@@ -111,16 +119,18 @@ impl InnerDirectory {
self.0
.write()
.map_err(|_| {
let msg = format!("Failed to acquire write lock for the \
let msg = format!(
"Failed to acquire write lock for the \
directory when trying to delete {:?}",
path);
let io_err = make_io_err(msg);
DeleteError::IOError(IOError::with_path(path.to_owned(), io_err))
})
path
);
let io_err = make_io_err(msg);
DeleteError::IOError(IOError::with_path(path.to_owned(), io_err))
})
.and_then(|mut writable_map| match writable_map.remove(path) {
Some(_) => Ok(()),
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
})
Some(_) => Ok(()),
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
})
}
fn exists(&self, path: &Path) -> bool {
@@ -164,9 +174,11 @@ impl Directory for RAMDirectory {
let path_buf = PathBuf::from(path);
let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
let exists = self.fs
.write(path_buf.clone(), &Vec::new())
.map_err(|err| IOError::with_path(path.to_owned(), err))?;
let exists = self.fs.write(path_buf.clone(), &Vec::new()).map_err(
|err| {
IOError::with_path(path.to_owned(), err)
},
)?;
// force the creation of the file to mimic the MMap directory.
if exists {

View File

@@ -2,6 +2,8 @@ use fst::raw::MmapReadOnly;
use std::ops::Deref;
use super::shared_vec_slice::SharedVecSlice;
use common::HasLen;
use std::slice;
use std::io::{self, Read};
use stable_deref_trait::StableDeref;
/// Read object that represents files in tantivy.
@@ -41,6 +43,14 @@ impl ReadOnlySource {
}
}
/// Splits into 2 `ReadOnlySource`, at the offset given
/// as an argument.
pub fn split(self, addr: usize) -> (ReadOnlySource, ReadOnlySource) {
let left = self.slice(0, addr);
let right = self.slice_from(addr);
(left, right)
}
/// Creates a ReadOnlySource that is just a
/// view over a slice of the data.
///
@@ -62,6 +72,23 @@ impl ReadOnlySource {
}
}
}
/// Like `.slice(...)` but enforcing only the `from`
/// boundary.
///
/// Equivalent to `.slice(from_offset, self.len())`
pub fn slice_from(&self, from_offset: usize) -> ReadOnlySource {
let len = self.len();
self.slice(from_offset, len)
}
/// Like `.slice(...)` but enforcing only the `to`
/// boundary.
///
/// Equivalent to `.slice(0, to_offset)`
pub fn slice_to(&self, to_offset: usize) -> ReadOnlySource {
self.slice(0, to_offset)
}
}
impl HasLen for ReadOnlySource {
@@ -82,3 +109,42 @@ impl From<Vec<u8>> for ReadOnlySource {
ReadOnlySource::Anonymous(shared_data)
}
}
/// Acts as a owning cursor over the data backed up by a ReadOnlySource
pub(crate) struct SourceRead {
_data_owner: ReadOnlySource,
cursor: &'static [u8],
}
impl SourceRead {
// Advance the cursor by a given number of bytes.
pub fn advance(&mut self, len: usize) {
self.cursor = &self.cursor[len..];
}
}
impl AsRef<[u8]> for SourceRead {
fn as_ref(&self) -> &[u8] {
self.cursor
}
}
impl From<ReadOnlySource> for SourceRead {
// Creates a new `SourceRead` from a given `ReadOnlySource`
fn from(source: ReadOnlySource) -> SourceRead {
let len = source.len();
let slice_ptr = source.as_slice().as_ptr();
let static_slice = unsafe { slice::from_raw_parts(slice_ptr, len) };
SourceRead {
_data_owner: source,
cursor: static_slice,
}
}
}
impl Read for SourceRead {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
self.cursor.read(buf)
}
}

View File

@@ -10,6 +10,7 @@ use schema;
use fastfield::FastFieldNotAvailableError;
use serde_json;
error_chain!(
errors {
/// Path does not exist.
@@ -111,12 +112,9 @@ impl From<schema::DocParsingError> for Error {
impl From<OpenWriteError> for Error {
fn from(error: OpenWriteError) -> Error {
match error {
OpenWriteError::FileAlreadyExists(filepath) => {
ErrorKind::FileAlreadyExists(filepath)
}
OpenWriteError::IOError(io_error) => ErrorKind::IOError(io_error),
}
.into()
OpenWriteError::FileAlreadyExists(filepath) => ErrorKind::FileAlreadyExists(filepath),
OpenWriteError::IOError(io_error) => ErrorKind::IOError(io_error),
}.into()
}
}

View File

@@ -32,7 +32,7 @@ mod delete;
pub use self::delete::write_delete_bitset;
pub use self::delete::DeleteBitSet;
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
pub use self::reader::{FastFieldsReader, U64FastFieldReader, I64FastFieldReader};
pub use self::reader::{U64FastFieldReader, I64FastFieldReader};
pub use self::reader::FastFieldReader;
pub use self::serializer::FastFieldSerializer;
pub use self::error::{Result, FastFieldNotAvailableError};
@@ -51,6 +51,7 @@ mod tests {
use fastfield::FastFieldReader;
use rand::Rng;
use rand::SeedableRng;
use common::CompositeFile;
use rand::XorShiftRng;
lazy_static! {
@@ -84,7 +85,7 @@ mod tests {
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
add_single_field_doc(&mut fast_field_writers, *FIELD, 13u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 14u64);
@@ -94,12 +95,12 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 38 as usize);
assert_eq!(source.len(), 35 as usize);
}
{
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
let fast_field_reader: U64FastFieldReader =
fast_field_readers.open_reader(*FIELD).unwrap();
let composite_file = CompositeFile::open(source).unwrap();
let field_source = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader: U64FastFieldReader = U64FastFieldReader::open(field_source);
assert_eq!(fast_field_reader.get(0), 13u64);
assert_eq!(fast_field_reader.get(1), 14u64);
assert_eq!(fast_field_reader.get(2), 2u64);
@@ -112,7 +113,7 @@ mod tests {
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
add_single_field_doc(&mut fast_field_writers, *FIELD, 4u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 14_082_001u64);
@@ -128,12 +129,12 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 63 as usize);
assert_eq!(source.len(), 60 as usize);
}
{
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
let fast_fields_composite = CompositeFile::open(source).unwrap();
let fast_field_reader: U64FastFieldReader =
fast_field_readers.open_reader(*FIELD).unwrap();
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
assert_eq!(fast_field_reader.get(0), 4u64);
assert_eq!(fast_field_reader.get(1), 14_082_001u64);
assert_eq!(fast_field_reader.get(2), 3_052u64);
@@ -154,7 +155,7 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for _ in 0..10_000 {
add_single_field_doc(&mut fast_field_writers, *FIELD, 100_000u64);
@@ -164,12 +165,12 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 36 as usize);
assert_eq!(source.len(), 33 as usize);
}
{
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
let fast_fields_composite = CompositeFile::open(source).unwrap();
let fast_field_reader: U64FastFieldReader =
fast_field_readers.open_reader(*FIELD).unwrap();
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
for doc in 0..10_000 {
assert_eq!(fast_field_reader.get(doc), 100_000u64);
}
@@ -183,30 +184,35 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
// forcing the amplitude to be high
add_single_field_doc(&mut fast_field_writers, *FIELD, 0u64);
for i in 0u64..10_000u64 {
add_single_field_doc(&mut fast_field_writers,
*FIELD,
5_000_000_000_000_000_000u64 + i);
add_single_field_doc(
&mut fast_field_writers,
*FIELD,
5_000_000_000_000_000_000u64 + i,
);
}
fast_field_writers.serialize(&mut serializer).unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 80044 as usize);
assert_eq!(source.len(), 80041 as usize);
}
{
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
let fast_fields_composite = CompositeFile::open(source).unwrap();
let fast_field_reader: U64FastFieldReader =
fast_field_readers.open_reader(*FIELD).unwrap();
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
assert_eq!(fast_field_reader.get(0), 0u64);
for doc in 1..10_001 {
assert_eq!(fast_field_reader.get(doc),
5_000_000_000_000_000_000u64 + doc as u64 - 1u64);
assert_eq!(
fast_field_reader.get(doc),
5_000_000_000_000_000_000u64 + doc as u64 - 1u64
);
}
}
}
@@ -221,7 +227,7 @@ mod tests {
let schema = schema_builder.build();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
for i in -100i64..10_000i64 {
let mut doc = Document::default();
@@ -233,12 +239,13 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 17711 as usize);
assert_eq!(source.len(), 17708 as usize);
}
{
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
let fast_fields_composite = CompositeFile::open(source).unwrap();
let fast_field_reader: I64FastFieldReader =
fast_field_readers.open_reader(i64_field).unwrap();
I64FastFieldReader::open(fast_fields_composite.open_read(i64_field).unwrap());
assert_eq!(fast_field_reader.min_value(), -100i64);
assert_eq!(fast_field_reader.max_value(), 9_999i64);
for (doc, i) in (-100i64..10_000i64).enumerate() {
@@ -262,7 +269,7 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
let doc = Document::default();
fast_field_writers.add_document(&doc);
@@ -272,9 +279,10 @@ mod tests {
let source = directory.open_read(&path).unwrap();
{
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
let fast_fields_composite = CompositeFile::open(source).unwrap();
let fast_field_reader: I64FastFieldReader =
fast_field_readers.open_reader(i64_field).unwrap();
I64FastFieldReader::open(fast_fields_composite.open_read(i64_field).unwrap());
assert_eq!(fast_field_reader.get(0u32), 0i64);
}
}
@@ -295,7 +303,7 @@ mod tests {
let mut directory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for x in &permutation {
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
@@ -305,9 +313,10 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
let fast_fields_composite = CompositeFile::open(source).unwrap();
let fast_field_reader: U64FastFieldReader =
fast_field_readers.open_reader(*FIELD).unwrap();
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
let mut a = 0u64;
for _ in 0..n {
assert_eq!(fast_field_reader.get(a as u32), permutation[a as usize]);
@@ -333,13 +342,13 @@ mod tests {
fn bench_intfastfield_veclookup(b: &mut Bencher) {
let permutation = generate_permutation();
b.iter(|| {
let n = test::black_box(1000u32);
let mut a = 0u64;
for _ in 0u32..n {
a = permutation[a as usize];
}
a
});
let n = test::black_box(1000u32);
let mut a = 0u64;
for _ in 0u32..n {
a = permutation[a as usize];
}
a
});
}
#[bench]
@@ -349,7 +358,7 @@ mod tests {
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for x in &permutation {
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
@@ -359,9 +368,11 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
let fast_fields_composite = CompositeFile::open(source).unwrap();
let fast_field_reader: U64FastFieldReader =
fast_field_readers.open_reader(*FIELD).unwrap();
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u64;
@@ -380,7 +391,7 @@ mod tests {
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for x in &permutation {
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
@@ -390,17 +401,18 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
let fast_fields_composite = CompositeFile::open(source).unwrap();
let fast_field_reader: U64FastFieldReader =
fast_field_readers.open_reader(*FIELD).unwrap();
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
b.iter(|| {
let n = test::black_box(1000u32);
let mut a = 0u32;
for _ in 0u32..n {
a = fast_field_reader.get(a) as u32;
}
a
});
let n = test::black_box(1000u32);
let mut a = 0u32;
for _ in 0u32..n {
a = fast_field_reader.get(a) as u32;
}
a
});
}
}
}

View File

@@ -1,20 +1,15 @@
use std::io;
use std::collections::HashMap;
use directory::ReadOnlySource;
use common::BinarySerializable;
use common::{self, BinarySerializable};
use common::bitpacker::{compute_num_bits, BitUnpacker};
use DocId;
use schema::{Field, SchemaBuilder};
use schema::SchemaBuilder;
use std::path::Path;
use schema::FAST;
use directory::{WritePtr, RAMDirectory, Directory};
use fastfield::FastFieldSerializer;
use fastfield::FastFieldsWriter;
use common::bitpacker::compute_num_bits;
use common::bitpacker::BitUnpacker;
use fastfield::{FastFieldSerializer, FastFieldsWriter};
use schema::FieldType;
use error::ResultExt;
use std::mem;
use common;
use common::CompositeFile;
use owning_ref::OwningRef;
/// Trait for accessing a fastfield.
@@ -111,10 +106,10 @@ impl FastFieldReader for U64FastFieldReader {
let amplitude: u64;
{
let mut cursor = data.as_slice();
min_value = u64::deserialize(&mut cursor)
.expect("Failed to read the min_value of fast field.");
amplitude = u64::deserialize(&mut cursor)
.expect("Failed to read the amplitude of fast field.");
min_value =
u64::deserialize(&mut cursor).expect("Failed to read the min_value of fast field.");
amplitude =
u64::deserialize(&mut cursor).expect("Failed to read the amplitude of fast field.");
}
let max_value = min_value + amplitude;
@@ -135,33 +130,36 @@ impl From<Vec<u64>> for U64FastFieldReader {
let mut schema_builder = SchemaBuilder::default();
let field = schema_builder.add_u64_field("field", FAST);
let schema = schema_builder.build();
let path = Path::new("test");
let path = Path::new("__dummy__");
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let write: WritePtr = directory.open_write(path).expect(
"With a RAMDirectory, this should never fail.",
);
let mut serializer = FastFieldSerializer::from_write(write).expect(
"With a RAMDirectory, this should never fail.",
);
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
for val in vals {
let mut fast_field_writer = fast_field_writers.get_field_writer(field).unwrap();
fast_field_writer.add_val(val);
{
let fast_field_writer = fast_field_writers.get_field_writer(field).expect(
"With a RAMDirectory, this should never fail.",
);
for val in vals {
fast_field_writer.add_val(val);
}
}
fast_field_writers.serialize(&mut serializer).unwrap();
serializer.close().unwrap();
}
directory
.open_read(path)
.chain_err(|| "Failed to open the file")
.and_then(|source| {
FastFieldsReader::from_source(source)
.chain_err(|| "Failed to read the file.")
})
.and_then(|ff_readers| {
ff_readers
.open_reader(field)
.ok_or_else(|| "Failed to find the requested field".into())
})
.expect("This should never happen, please report.")
let source = directory.open_read(path).expect("Failed to open the file");
let composite_file =
CompositeFile::open(source).expect("Failed to read the composite file");
let field_source = composite_file.open_read(field).expect(
"File component not found",
);
U64FastFieldReader::open(field_source)
}
}
@@ -212,7 +210,7 @@ impl FastFieldReader for I64FastFieldReader {
let output_u64: &mut [u64] = unsafe { mem::transmute(output) };
self.underlying.get_range(start, output_u64);
for mut_val in output_u64.iter_mut() {
*mut_val ^= 1 << 63;
*mut_val = common::u64_to_i64(*mut_val as u64) as u64;
}
}
@@ -231,67 +229,3 @@ impl FastFieldReader for I64FastFieldReader {
}
}
}
/// The `FastFieldsReader` is the datastructure containing
/// all of the fast fields' data.
///
/// It contains a mapping that associated these fields to
/// the proper slice in the fastfield reader file.
pub struct FastFieldsReader {
source: ReadOnlySource,
field_offsets: HashMap<Field, (u32, u32)>,
}
impl FastFieldsReader {
/// Opens a `FastFieldsReader`
///
/// When opening the fast field reader, the
/// the list of the offset is read (as a footer of the
/// data file).
pub fn from_source(source: ReadOnlySource) -> io::Result<FastFieldsReader> {
let header_offset;
let field_offsets: Vec<(Field, u32)>;
{
let buffer = source.as_slice();
{
let mut cursor = buffer;
header_offset = u32::deserialize(&mut cursor)?;
}
{
let mut cursor = &buffer[header_offset as usize..];
field_offsets = Vec::deserialize(&mut cursor)?;
}
}
let mut end_offsets: Vec<u32> = field_offsets.iter().map(|&(_, offset)| offset).collect();
end_offsets.push(header_offset);
let mut field_offsets_map: HashMap<Field, (u32, u32)> = HashMap::new();
for (field_start_offsets, stop_offset) in
field_offsets.iter().zip(end_offsets.iter().skip(1)) {
let (field, start_offset) = *field_start_offsets;
field_offsets_map.insert(field, (start_offset, *stop_offset));
}
Ok(FastFieldsReader {
field_offsets: field_offsets_map,
source: source,
})
}
/// Returns the u64 fast value reader if the field
/// is a u64 field indexed as "fast".
///
/// Return None if the field is not a u64 field
/// indexed with the fast option.
///
/// # Panics
/// May panic if the index is corrupted.
pub fn open_reader<FFReader: FastFieldReader>(&self, field: Field) -> Option<FFReader> {
self.field_offsets
.get(&field)
.map(|&(start, stop)| {
let field_source = self.source.slice(start as usize, stop as usize);
FFReader::open(field_source)
})
}
}

View File

@@ -3,7 +3,8 @@ use directory::WritePtr;
use schema::Field;
use common::bitpacker::{compute_num_bits, BitPacker};
use common::CountingWriter;
use std::io::{self, Write, Seek, SeekFrom};
use common::CompositeWrite;
use std::io::{self, Write};
/// `FastFieldSerializer` is in charge of serializing
/// fastfields on disk.
@@ -26,51 +27,61 @@ use std::io::{self, Write, Seek, SeekFrom};
/// * `close_field()`
/// * `close()`
pub struct FastFieldSerializer {
write: CountingWriter<WritePtr>,
fields: Vec<(Field, u32)>,
min_value: u64,
field_open: bool,
bit_packer: BitPacker,
composite_write: CompositeWrite<WritePtr>,
}
impl FastFieldSerializer {
/// Constructor
pub fn new(write: WritePtr) -> io::Result<FastFieldSerializer> {
pub fn from_write(write: WritePtr) -> io::Result<FastFieldSerializer> {
// just making room for the pointer to header.
let mut counting_writer = CountingWriter::wrap(write);
0u32.serialize(&mut counting_writer)?;
Ok(FastFieldSerializer {
write: counting_writer,
fields: Vec::new(),
min_value: 0,
field_open: false,
bit_packer: BitPacker::new(0),
})
let composite_write = CompositeWrite::wrap(write);
Ok(FastFieldSerializer { composite_write: composite_write })
}
/// Start serializing a new u64 fast field
pub fn new_u64_fast_field(&mut self,
field: Field,
min_value: u64,
max_value: u64)
-> io::Result<()> {
if self.field_open {
return Err(io::Error::new(io::ErrorKind::Other, "Previous field not closed"));
}
self.min_value = min_value;
self.field_open = true;
self.fields.push((field, self.write.written_bytes() as u32));
let write = &mut self.write;
pub fn new_u64_fast_field(
&mut self,
field: Field,
min_value: u64,
max_value: u64,
) -> io::Result<FastSingleFieldSerializer<CountingWriter<WritePtr>>> {
let field_write = self.composite_write.for_field(field);
FastSingleFieldSerializer::open(field_write, min_value, max_value)
}
/// Closes the serializer
///
/// After this call the data must be persistently save on disk.
pub fn close(self) -> io::Result<()> {
self.composite_write.close()
}
}
pub struct FastSingleFieldSerializer<'a, W: Write + 'a> {
bit_packer: BitPacker,
write: &'a mut W,
min_value: u64,
}
impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
fn open(
write: &'a mut W,
min_value: u64,
max_value: u64,
) -> io::Result<FastSingleFieldSerializer<'a, W>> {
min_value.serialize(write)?;
let amplitude = max_value - min_value;
amplitude.serialize(write)?;
let num_bits = compute_num_bits(amplitude);
self.bit_packer = BitPacker::new(num_bits as usize);
Ok(())
let bit_packer = BitPacker::new(num_bits as usize);
Ok(FastSingleFieldSerializer {
write: write,
bit_packer: bit_packer,
min_value: min_value,
})
}
/// Pushes a new value to the currently open u64 fast field.
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
let val_to_write: u64 = val - self.min_value;
@@ -78,33 +89,7 @@ impl FastFieldSerializer {
Ok(())
}
/// Close the u64 fast field.
pub fn close_field(&mut self) -> io::Result<()> {
if !self.field_open {
return Err(io::Error::new(io::ErrorKind::Other, "Current field is already closed"));
}
self.field_open = false;
// adding some padding to make sure we
// can read the last elements with our u64
// cursor
self.bit_packer.close(&mut self.write)?;
Ok(())
}
/// Closes the serializer
///
/// After this call the data must be persistently save on disk.
pub fn close(self) -> io::Result<usize> {
if self.field_open {
return Err(io::Error::new(io::ErrorKind::Other, "Last field not closed"));
}
let header_offset: usize = self.write.written_bytes() as usize;
let (mut write, written_size) = self.write.finish()?;
self.fields.serialize(&mut write)?;
write.seek(SeekFrom::Start(0))?;
(header_offset as u32).serialize(&mut write)?;
write.flush()?;
Ok(written_size)
pub fn close_field(mut self) -> io::Result<()> {
self.bit_packer.close(&mut self.write)
}
}

View File

@@ -58,9 +58,9 @@ impl FastFieldsWriter {
/// Get the `FastFieldWriter` associated to a field.
pub fn get_field_writer(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
// TODO optimize
self.field_writers
.iter_mut()
.find(|field_writer| field_writer.field == field)
self.field_writers.iter_mut().find(|field_writer| {
field_writer.field == field
})
}
@@ -155,9 +155,9 @@ impl IntFastFieldWriter {
/// associated to the document with the `DocId` n.
/// (Well, `n-1` actually because of 0-indexing)
pub fn add_val(&mut self, val: u64) {
VInt(val)
.serialize(&mut self.vals)
.expect("unable to serialize VInt to Vec");
VInt(val).serialize(&mut self.vals).expect(
"unable to serialize VInt to Vec",
);
if val > self.val_max {
self.val_max = val;
@@ -208,13 +208,14 @@ impl IntFastFieldWriter {
(self.val_min, self.val_max)
};
serializer.new_u64_fast_field(self.field, min, max)?;
let mut single_field_serializer = serializer.new_u64_fast_field(self.field, min, max)?;
let mut cursor = self.vals.as_slice();
while let Ok(VInt(val)) = VInt::deserialize(&mut cursor) {
serializer.add_val(val)?;
single_field_serializer.add_val(val)?;
}
serializer.close_field()
single_field_serializer.close_field()
}
}

View File

@@ -40,9 +40,9 @@ impl DeleteQueue {
{
let mut delete_queue_wlock = delete_queue.inner.write().unwrap();
delete_queue_wlock.last_block = Some(Arc::new(Block {
operations: Arc::default(),
next: next_block,
}));
operations: Arc::default(),
next: next_block,
}));
}
delete_queue
@@ -59,9 +59,11 @@ impl DeleteQueue {
.expect("Read lock poisoned when opening delete queue cursor")
.last_block
.clone()
.expect("Failed to unwrap last_block. This should never happen
.expect(
"Failed to unwrap last_block. This should never happen
as the Option<> is only here to make
initialization possible");
initialization possible",
);
let operations_len = last_block.operations.len();
DeleteCursor {
block: last_block,
@@ -92,9 +94,9 @@ impl DeleteQueue {
// be some unflushed operations.
//
fn flush(&self) -> Option<Arc<Block>> {
let mut self_wlock = self.inner
.write()
.expect("Failed to acquire write lock on delete queue writer");
let mut self_wlock = self.inner.write().expect(
"Failed to acquire write lock on delete queue writer",
);
let delete_operations;
{
@@ -108,9 +110,9 @@ impl DeleteQueue {
let next_block = NextBlock::from(self.clone());
{
self_wlock.last_block = Some(Arc::new(Block {
operations: Arc::new(delete_operations),
next: next_block,
}));
operations: Arc::new(delete_operations),
next: next_block,
}));
}
self_wlock.last_block.clone()
}
@@ -132,18 +134,18 @@ impl From<DeleteQueue> for NextBlock {
impl NextBlock {
fn next_block(&self) -> Option<Arc<Block>> {
{
let next_read_lock = self.0
.read()
.expect("Failed to acquire write lock in delete queue");
let next_read_lock = self.0.read().expect(
"Failed to acquire write lock in delete queue",
);
if let InnerNextBlock::Closed(ref block) = *next_read_lock {
return Some(block.clone());
}
}
let next_block;
{
let mut next_write_lock = self.0
.write()
.expect("Failed to acquire write lock in delete queue");
let mut next_write_lock = self.0.write().expect(
"Failed to acquire write lock in delete queue",
);
match *next_write_lock {
InnerNextBlock::Closed(ref block) => {
return Some(block.clone());

View File

@@ -56,8 +56,10 @@ mod tests {
#[test]
fn test_doc_to_opstamp_mapping_none() {
let doc_to_opstamp_mapping = DocToOpstampMapping::None;
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(1),
u32::max_value());
assert_eq!(
doc_to_opstamp_mapping.compute_doc_limit(1),
u32::max_value()
);
}
#[test]

View File

@@ -102,14 +102,17 @@ impl !Sync for IndexWriter {}
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// # Panics
/// If the heap size per thread is too small, panics.
pub fn open_index_writer(index: &Index,
num_threads: usize,
heap_size_in_bytes_per_thread: usize)
-> Result<IndexWriter> {
pub fn open_index_writer(
index: &Index,
num_threads: usize,
heap_size_in_bytes_per_thread: usize,
) -> Result<IndexWriter> {
if heap_size_in_bytes_per_thread <= HEAP_SIZE_LIMIT as usize {
panic!(format!("The heap size per thread needs to be at least {}.",
HEAP_SIZE_LIMIT));
panic!(format!(
"The heap size per thread needs to be at least {}.",
HEAP_SIZE_LIMIT
));
}
let directory_lock = DirectoryLock::lock(index.directory().box_clone())?;
@@ -156,12 +159,13 @@ pub fn open_index_writer(index: &Index,
pub fn compute_deleted_bitset(delete_bitset: &mut BitSet,
segment_reader: &SegmentReader,
delete_cursor: &mut DeleteCursor,
doc_opstamps: &DocToOpstampMapping,
target_opstamp: u64)
-> Result<bool> {
pub fn compute_deleted_bitset(
delete_bitset: &mut BitSet,
segment_reader: &SegmentReader,
delete_cursor: &mut DeleteCursor,
doc_opstamps: &DocToOpstampMapping,
target_opstamp: u64,
) -> Result<bool> {
let mut might_have_changed = false;
@@ -177,8 +181,12 @@ pub fn compute_deleted_bitset(delete_bitset: &mut BitSet,
// Limit doc helps identify the first document
// that may be affected by the delete operation.
let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp);
if let Some(mut docset) =
segment_reader.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) {
let inverted_index = segment_reader.inverted_index(delete_op.term.field());
if let Some(mut docset) = inverted_index.read_postings(
&delete_op.term,
SegmentPostingsOption::NoFreq,
)
{
while docset.advance() {
let deleted_doc = docset.doc();
if deleted_doc < limit_doc {
@@ -198,10 +206,11 @@ pub fn compute_deleted_bitset(delete_bitset: &mut BitSet,
/// Advance delete for the given segment up
/// to the target opstamp.
pub fn advance_deletes(mut segment: Segment,
segment_entry: &mut SegmentEntry,
target_opstamp: u64)
-> Result<Option<FileProtection>> {
pub fn advance_deletes(
mut segment: Segment,
segment_entry: &mut SegmentEntry,
target_opstamp: u64,
) -> Result<Option<FileProtection>> {
let mut file_protect: Option<FileProtection> = None;
@@ -222,11 +231,13 @@ pub fn advance_deletes(mut segment: Segment,
let delete_cursor = segment_entry.delete_cursor();
compute_deleted_bitset(&mut delete_bitset,
&segment_reader,
delete_cursor,
&DocToOpstampMapping::None,
target_opstamp)?;
compute_deleted_bitset(
&mut delete_bitset,
&segment_reader,
delete_cursor,
&DocToOpstampMapping::None,
target_opstamp,
)?;
for doc in 0u32..max_doc {
if segment_reader.is_deleted(doc) {
@@ -247,15 +258,16 @@ pub fn advance_deletes(mut segment: Segment,
Ok(file_protect)
}
fn index_documents(heap: &mut Heap,
table_size: usize,
segment: Segment,
schema: &Schema,
generation: usize,
document_iterator: &mut Iterator<Item = AddOperation>,
segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor)
-> Result<bool> {
fn index_documents(
heap: &mut Heap,
table_size: usize,
segment: Segment,
schema: &Schema,
generation: usize,
document_iterator: &mut Iterator<Item = AddOperation>,
segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor,
) -> Result<bool> {
heap.clear();
let segment_id = segment.id();
let mut segment_writer = SegmentWriter::for_segment(heap, table_size, segment.clone(), schema)?;
@@ -265,8 +277,10 @@ fn index_documents(heap: &mut Heap,
// One is the memory arena dedicated to the segment is
// getting full.
if segment_writer.is_buffer_full() {
info!("Buffer limit reached, flushing segment with maxdoc={}.",
segment_writer.max_doc());
info!(
"Buffer limit reached, flushing segment with maxdoc={}.",
segment_writer.max_doc()
);
break;
}
// The second is the term dictionary hash table
@@ -275,8 +289,10 @@ fn index_documents(heap: &mut Heap,
// Tantivy does not resize its hashtable. When it reaches
// capacity, we just stop indexing new document.
if segment_writer.is_term_saturated() {
info!("Term dic saturated, flushing segment with maxdoc={}.",
segment_writer.max_doc());
info!(
"Term dic saturated, flushing segment with maxdoc={}.",
segment_writer.max_doc()
);
break;
}
}
@@ -296,11 +312,13 @@ fn index_documents(heap: &mut Heap,
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
let segment_reader = SegmentReader::open(segment)?;
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
let may_have_deletes = compute_deleted_bitset(&mut deleted_bitset,
&segment_reader,
&mut delete_cursor,
&doc_to_opstamps,
last_docstamp)?;
let may_have_deletes = compute_deleted_bitset(
&mut deleted_bitset,
&segment_reader,
&mut delete_cursor,
&doc_to_opstamps,
last_docstamp,
)?;
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, {
if may_have_deletes {
@@ -327,14 +345,15 @@ impl IndexWriter {
join_handle
.join()
.expect("Indexing Worker thread panicked")
.chain_err(|| ErrorKind::ErrorInThread("Error in indexing worker thread.".into()))?;
.chain_err(|| {
ErrorKind::ErrorInThread("Error in indexing worker thread.".into())
})?;
}
drop(self.workers_join_handle);
let result =
self.segment_updater
.wait_merging_thread()
.chain_err(|| ErrorKind::ErrorInThread("Failed to join merging thread.".into()));
let result = self.segment_updater.wait_merging_thread().chain_err(|| {
ErrorKind::ErrorInThread("Failed to join merging thread.".into())
});
if let Err(ref e) = result {
error!("Some merging thread failed {:?}", e);
@@ -347,8 +366,10 @@ impl IndexWriter {
pub fn add_segment(&mut self, segment_meta: SegmentMeta) {
let delete_cursor = self.delete_queue.cursor();
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None);
self.segment_updater
.add_segment(self.generation, segment_entry);
self.segment_updater.add_segment(
self.generation,
segment_entry,
);
}
#[doc(hidden)]
@@ -372,7 +393,11 @@ impl IndexWriter {
let mut delete_cursor = self.delete_queue.cursor();
let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
.name(format!("indexing thread {} for gen {}", self.worker_id, generation))
.name(format!(
"indexing thread {} for gen {}",
self.worker_id,
generation
))
.spawn(move || {
loop {
@@ -396,14 +421,16 @@ impl IndexWriter {
return Ok(());
}
let segment = segment_updater.new_segment();
index_documents(&mut heap,
table_size,
segment,
&schema,
generation,
&mut document_iterator,
&mut segment_updater,
delete_cursor.clone())?;
index_documents(
&mut heap,
table_size,
segment,
&schema,
generation,
&mut document_iterator,
&mut segment_updater,
delete_cursor.clone(),
)?;
}
})?;
@@ -436,9 +463,10 @@ impl IndexWriter {
}
/// Merges a given list of segments
pub fn merge(&mut self,
segment_ids: &[SegmentId])
-> impl Future<Item = SegmentMeta, Error = Canceled> {
pub fn merge(
&mut self,
segment_ids: &[SegmentId],
) -> impl Future<Item = SegmentMeta, Error = Canceled> {
self.segment_updater.start_merge(segment_ids)
}
@@ -522,14 +550,15 @@ impl IndexWriter {
self.recreate_document_channel();
let mut former_workers_join_handle = Vec::new();
swap(&mut former_workers_join_handle,
&mut self.workers_join_handle);
swap(
&mut former_workers_join_handle,
&mut self.workers_join_handle,
);
for worker_handle in former_workers_join_handle {
let indexing_worker_result =
worker_handle
.join()
.map_err(|e| Error::from_kind(ErrorKind::ErrorInThread(format!("{:?}", e))))?;
let indexing_worker_result = worker_handle.join().map_err(|e| {
Error::from_kind(ErrorKind::ErrorInThread(format!("{:?}", e)))
})?;
indexing_worker_result?;
// add a new worker for the next generation.
@@ -623,13 +652,17 @@ mod tests {
let schema_builder = schema::SchemaBuilder::default();
let index = Index::create_in_ram(schema_builder.build());
let index_writer = index.writer(40_000_000).unwrap();
assert_eq!(format!("{:?}", index_writer.get_merge_policy()),
"LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \
level_log_size: 0.75 }");
assert_eq!(
format!("{:?}", index_writer.get_merge_policy()),
"LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \
level_log_size: 0.75 }"
);
let merge_policy = box NoMergePolicy::default();
index_writer.set_merge_policy(merge_policy);
assert_eq!(format!("{:?}", index_writer.get_merge_policy()),
"NoMergePolicy");
assert_eq!(
format!("{:?}", index_writer.get_merge_policy()),
"NoMergePolicy"
);
}
#[test]
@@ -719,9 +752,9 @@ mod tests {
}
// this should create 8 segments and trigger a merge.
index_writer.commit().expect("commit failed");
index_writer
.wait_merging_threads()
.expect("waiting merging thread failed");
index_writer.wait_merging_threads().expect(
"waiting merging thread failed",
);
index.load_searchers().unwrap();
assert_eq!(num_docs_containing("a"), 200);

View File

@@ -62,7 +62,9 @@ impl MergePolicy for LogMergePolicy {
let size_sorted_log_tuples: Vec<_> = size_sorted_tuples
.into_iter()
.map(|(ind, num_docs)| (ind, (self.clip_min_size(num_docs) as f64).log2()))
.map(|(ind, num_docs)| {
(ind, (self.clip_min_size(num_docs) as f64).log2())
})
.collect();
let (first_ind, first_score) = size_sorted_log_tuples[0];
@@ -79,7 +81,9 @@ impl MergePolicy for LogMergePolicy {
levels
.iter()
.filter(|level| level.len() >= self.min_merge_size)
.map(|ind_vec| MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect()))
.map(|ind_vec| {
MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect())
})
.collect()
}
@@ -138,17 +142,19 @@ mod tests {
// * one with the 6 * 10-docs segments
// * one with the 3 * 1000-docs segments
// no MergeCandidate expected for the 2 * 10_000-docs segments as min_merge_size=3
let test_input = vec![seg_meta(10),
seg_meta(10),
seg_meta(10),
seg_meta(1000),
seg_meta(1000),
seg_meta(1000),
seg_meta(10000),
seg_meta(10000),
seg_meta(10),
seg_meta(10),
seg_meta(10)];
let test_input = vec![
seg_meta(10),
seg_meta(10),
seg_meta(10),
seg_meta(1000),
seg_meta(1000),
seg_meta(1000),
seg_meta(10000),
seg_meta(10000),
seg_meta(10),
seg_meta(10),
seg_meta(10),
];
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 2);
}
@@ -156,24 +162,28 @@ mod tests {
#[test]
fn test_log_merge_policy_within_levels() {
// multiple levels all get merged correctly
let test_input = vec![seg_meta(10), // log2(10) = ~3.32 (> 3.58 - 0.75)
seg_meta(11), // log2(11) = ~3.46
seg_meta(12), // log2(12) = ~3.58
seg_meta(800), // log2(800) = ~9.64 (> 9.97 - 0.75)
seg_meta(1000), // log2(1000) = ~9.97
seg_meta(1000)]; // log2(1000) = ~9.97
let test_input = vec![
seg_meta(10), // log2(10) = ~3.32 (> 3.58 - 0.75)
seg_meta(11), // log2(11) = ~3.46
seg_meta(12), // log2(12) = ~3.58
seg_meta(800), // log2(800) = ~9.64 (> 9.97 - 0.75)
seg_meta(1000), // log2(1000) = ~9.97
seg_meta(1000),
]; // log2(1000) = ~9.97
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 2);
}
#[test]
fn test_log_merge_policy_small_segments() {
// segments under min_layer_size are merged together
let test_input = vec![seg_meta(1),
seg_meta(1),
seg_meta(1),
seg_meta(2),
seg_meta(2),
seg_meta(2)];
let test_input = vec![
seg_meta(1),
seg_meta(1),
seg_meta(1),
seg_meta(2),
seg_meta(2),
seg_meta(2),
];
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 1);
}

View File

@@ -5,7 +5,7 @@ use DocId;
use core::SerializableSegment;
use schema::FieldValue;
use indexer::SegmentSerializer;
use postings::PostingsSerializer;
use postings::InvertedIndexSerializer;
use fastfield::U64FastFieldReader;
use itertools::Itertools;
use postings::Postings;
@@ -17,9 +17,9 @@ use fastfield::FastFieldSerializer;
use fastfield::FastFieldReader;
use store::StoreWriter;
use std::cmp::{min, max};
use termdict::TermDictionary;
use schema::Term;
use termdict::TermStreamer;
use postings::SegmentPostingsOption;
pub struct IndexMerger {
schema: Schema,
@@ -28,33 +28,11 @@ pub struct IndexMerger {
}
struct DeltaPositionComputer {
buffer: Vec<u32>,
}
impl DeltaPositionComputer {
fn new() -> DeltaPositionComputer {
DeltaPositionComputer { buffer: vec![0u32; 512] }
}
fn compute_delta_positions(&mut self, positions: &[u32]) -> &[u32] {
if positions.len() > self.buffer.len() {
self.buffer.resize(positions.len(), 0u32);
}
let mut last_pos = 0u32;
for (i, position) in positions.iter().cloned().enumerate() {
self.buffer[i] = position - last_pos;
last_pos = position;
}
&self.buffer[..positions.len()]
}
}
fn compute_min_max_val(u64_reader: &U64FastFieldReader,
max_doc: DocId,
delete_bitset: &DeleteBitSet)
-> Option<(u64, u64)> {
fn compute_min_max_val(
u64_reader: &U64FastFieldReader,
max_doc: DocId,
delete_bitset: &DeleteBitSet,
) -> Option<(u64, u64)> {
if max_doc == 0 {
None
} else if !delete_bitset.has_deletes() {
@@ -72,18 +50,46 @@ fn compute_min_max_val(u64_reader: &U64FastFieldReader,
}
}
fn extract_fieldnorm_reader(segment_reader: &SegmentReader,
field: Field)
-> Option<U64FastFieldReader> {
fn extract_fieldnorm_reader(
segment_reader: &SegmentReader,
field: Field,
) -> Option<U64FastFieldReader> {
segment_reader.get_fieldnorms_reader(field)
}
fn extract_fast_field_reader(segment_reader: &SegmentReader,
field: Field)
-> Option<U64FastFieldReader> {
segment_reader.fast_fields_reader().open_reader(field)
fn extract_fast_field_reader(
segment_reader: &SegmentReader,
field: Field,
) -> Option<U64FastFieldReader> {
segment_reader.get_fast_field_reader(field).ok()
}
struct DeltaComputer {
buffer: Vec<u32>,
}
impl DeltaComputer {
fn new() -> DeltaComputer {
DeltaComputer { buffer: vec![0u32; 512] }
}
fn compute_delta(&mut self, positions: &[u32]) -> &[u32] {
if positions.len() > self.buffer.len() {
self.buffer.resize(positions.len(), 0u32);
}
let mut last_pos = 0u32;
let num_positions = positions.len();
for i in 0..num_positions {
let cur_pos = positions[i];
self.buffer[i] = cur_pos - last_pos;
last_pos = cur_pos;
}
&self.buffer[..positions.len()]
}
}
impl IndexMerger {
pub fn open(schema: Schema, segments: &[Segment]) -> Result<IndexMerger> {
let mut readers = vec![];
@@ -96,10 +102,10 @@ impl IndexMerger {
}
}
Ok(IndexMerger {
schema: schema,
readers: readers,
max_doc: max_doc,
})
schema: schema,
readers: readers,
max_doc: max_doc,
})
}
fn write_fieldnorms(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
@@ -110,9 +116,11 @@ impl IndexMerger {
.filter(|&(_, field_entry)| field_entry.is_indexed())
.map(|(field_id, _)| Field(field_id as u32))
.collect();
self.generic_write_fast_field(fieldnorm_fastfields,
&extract_fieldnorm_reader,
fast_field_serializer)
self.generic_write_fast_field(
fieldnorm_fastfields,
&extract_fieldnorm_reader,
fast_field_serializer,
)
}
fn write_fast_fields(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
@@ -123,19 +131,21 @@ impl IndexMerger {
.filter(|&(_, field_entry)| field_entry.is_int_fast())
.map(|(field_id, _)| Field(field_id as u32))
.collect();
self.generic_write_fast_field(fast_fields,
&extract_fast_field_reader,
fast_field_serializer)
self.generic_write_fast_field(
fast_fields,
&extract_fast_field_reader,
fast_field_serializer,
)
}
// used both to merge field norms and regular u64 fast fields.
fn generic_write_fast_field(&self,
fields: Vec<Field>,
field_reader_extractor: &Fn(&SegmentReader, Field)
-> Option<U64FastFieldReader>,
fast_field_serializer: &mut FastFieldSerializer)
-> Result<()> {
fn generic_write_fast_field(
&self,
fields: Vec<Field>,
field_reader_extractor: &Fn(&SegmentReader, Field) -> Option<U64FastFieldReader>,
fast_field_serializer: &mut FastFieldSerializer,
) -> Result<()> {
for field in fields {
@@ -147,19 +157,25 @@ impl IndexMerger {
match field_reader_extractor(reader, field) {
Some(u64_reader) => {
if let Some((seg_min_val, seg_max_val)) =
compute_min_max_val(&u64_reader,
reader.max_doc(),
reader.delete_bitset()) {
compute_min_max_val(
&u64_reader,
reader.max_doc(),
reader.delete_bitset(),
)
{
// the segment has some non-deleted documents
min_val = min(min_val, seg_min_val);
max_val = max(max_val, seg_max_val);
u64_readers
.push((reader.max_doc(), u64_reader, reader.delete_bitset()));
u64_readers.push((
reader.max_doc(),
u64_reader,
reader.delete_bitset(),
));
}
}
None => {
let error_msg = format!("Failed to find a u64_reader for field {:?}",
field);
let error_msg =
format!("Failed to find a u64_reader for field {:?}", field);
error!("{}", error_msg);
bail!(ErrorKind::SchemaError(error_msg));
}
@@ -174,50 +190,68 @@ impl IndexMerger {
assert!(min_val <= max_val);
fast_field_serializer
.new_u64_fast_field(field, min_val, max_val)?;
let mut fast_single_field_serializer = fast_field_serializer.new_u64_fast_field(
field,
min_val,
max_val,
)?;
for (max_doc, u64_reader, delete_bitset) in u64_readers {
for doc_id in 0..max_doc {
if !delete_bitset.is_deleted(doc_id) {
let val = u64_reader.get(doc_id);
fast_field_serializer.add_val(val)?;
fast_single_field_serializer.add_val(val)?;
}
}
}
fast_field_serializer.close_field()?;
fast_single_field_serializer.close_field()?;
}
Ok(())
}
fn write_postings(&self, serializer: &mut PostingsSerializer) -> Result<()> {
fn write_postings(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> {
let mut merged_terms = TermMerger::from(&self.readers[..]);
let mut delta_position_computer = DeltaPositionComputer::new();
let mut delta_computer = DeltaComputer::new();
let mut max_doc = 0;
// map from segment doc ids to the resulting merged segment doc id.
let mut merged_doc_id_map: Vec<Vec<Option<DocId>>> = Vec::with_capacity(self.readers.len());
for reader in &self.readers {
let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize);
for doc_id in 0..reader.max_doc() {
if reader.is_deleted(doc_id) {
segment_local_map.push(None);
} else {
segment_local_map.push(Some(max_doc));
max_doc += 1u32;
}
let mut indexed_fields = vec![];
for (field_ord, field_entry) in self.schema.fields().iter().enumerate() {
if field_entry.is_indexed() {
indexed_fields.push(Field(field_ord as u32));
}
merged_doc_id_map.push(segment_local_map);
}
let mut last_field: Option<Field> = None;
for indexed_field in indexed_fields {
let mut segment_postings_option = SegmentPostingsOption::FreqAndPositions;
let field_readers = self.readers
.iter()
.map(|reader| reader.inverted_index(indexed_field))
.collect::<Vec<_>>();
while merged_terms.advance() {
let field_term_streams = field_readers
.iter()
.map(|field_reader| field_reader.terms().stream())
.collect();
let mut merged_terms = TermMerger::new(field_term_streams);
let mut max_doc = 0;
// map from segment doc ids to the resulting merged segment doc id.
let mut merged_doc_id_map: Vec<Vec<Option<DocId>>> =
Vec::with_capacity(self.readers.len());
for reader in &self.readers {
let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize);
for doc_id in 0..reader.max_doc() {
if reader.is_deleted(doc_id) {
segment_local_map.push(None);
} else {
segment_local_map.push(Some(max_doc));
max_doc += 1u32;
}
}
merged_doc_id_map.push(segment_local_map);
}
// Create the total list of doc ids
// by stacking the doc ids from the different segment.
@@ -229,86 +263,92 @@ impl IndexMerger {
// - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc,
// seg0.max_doc + seg1.max_doc + seg2.max_doc]
// ...
let term = Term::wrap(merged_terms.key());
let current_field = term.field();
if last_field != Some(current_field) {
// we reached a new field.
let field_entry = self.schema.get_field_entry(current_field);
// ... set segment postings option the new field.
segment_postings_option = field_entry
.field_type()
.get_segment_postings_option()
.expect("Encountered a field that is not supposed to be
indexed. Have you modified the schema?");
let mut field_serializer = serializer.new_field(indexed_field)?;
last_field = Some(current_field);
let field_entry = self.schema.get_field_entry(indexed_field);
// it is perfectly safe to call `.new_field`
// even if there is no postings associated.
serializer.new_field(current_field);
}
// ... set segment postings option the new field.
let segment_postings_option = field_entry
.field_type()
.get_segment_postings_option()
.expect(
"Encountered a field that is not supposed to be
indexed. Have you modified the schema?",
);
// Let's compute the list of non-empty posting lists
let segment_postings: Vec<_> = merged_terms
.current_kvs()
.iter()
.flat_map(|heap_item| {
let segment_ord = heap_item.segment_ord;
let term_info = heap_item.streamer.value();
let segment_reader = &self.readers[heap_item.segment_ord];
let mut segment_postings =
segment_reader
.read_postings_from_terminfo(term_info, segment_postings_option);
if segment_postings.advance() {
Some((segment_ord, segment_postings))
} else {
None
while merged_terms.advance() {
let term = Term::wrap(merged_terms.key());
// Let's compute the list of non-empty posting lists
let segment_postings: Vec<_> = merged_terms
.current_kvs()
.iter()
.flat_map(|heap_item| {
let segment_ord = heap_item.segment_ord;
let term_info = heap_item.streamer.value();
let segment_reader = &self.readers[heap_item.segment_ord];
let inverted_index = segment_reader.inverted_index(term.field());
let mut segment_postings = inverted_index.read_postings_from_terminfo(
term_info,
segment_postings_option,
);
if segment_postings.advance() {
Some((segment_ord, segment_postings))
} else {
None
}
})
.collect();
// At this point, `segment_postings` contains the posting list
// of all of the segments containing the given term.
//
// These segments are non-empty and advance has already been called.
if !segment_postings.is_empty() {
// If not, the `term` will be entirely removed.
// We know that there is at least one document containing
// the term, so we add it.
field_serializer.new_term(term.as_ref())?;
// We can now serialize this postings, by pushing each document to the
// postings serializer.
for (segment_ord, mut segment_postings) in segment_postings {
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
loop {
// `.advance()` has been called once before the loop.
// Hence we cannot use a `while segment_postings.advance()` loop.
if let Some(remapped_doc_id) =
old_to_new_doc_id[segment_postings.doc() as usize]
{
// we make sure to only write the term iff
// there is at least one document.
let positions: &[u32] = segment_postings.positions();
let term_freq = segment_postings.term_freq();
let delta_positions = delta_computer.compute_delta(positions);
field_serializer.write_doc(
remapped_doc_id,
term_freq,
delta_positions,
)?;
}
if !segment_postings.advance() {
break;
}
}
}
})
.collect();
// At this point, `segment_postings` contains the posting list
// of all of the segments containing the given term.
//
// These segments are non-empty and advance has already been called.
if segment_postings.is_empty() {
// by continuing here, the `term` will be entirely removed.
continue;
}
// We know that there is at least one document containing
// the term, so we add it.
serializer.new_term(term.as_ref())?;
// We can now serialize this postings, by pushing each document to the
// postings serializer.
for (segment_ord, mut segment_postings) in segment_postings {
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
loop {
// `.advance()` has been called once before the loop.
// Hence we cannot use a `while segment_postings.advance()` loop.
if let Some(remapped_doc_id) =
old_to_new_doc_id[segment_postings.doc() as usize] {
// we make sure to only write the term iff
// there is at least one document.
let delta_positions: &[u32] =
delta_position_computer
.compute_delta_positions(segment_postings.positions());
let term_freq = segment_postings.term_freq();
serializer
.write_doc(remapped_doc_id, term_freq, delta_positions)?;
}
if !segment_postings.advance() {
break;
}
// closing the term.
field_serializer.close_term()?;
}
}
// closing the term.
serializer.close_term()?;
field_serializer.close()?;
}
Ok(())
}
@@ -318,9 +358,9 @@ impl IndexMerger {
let store_reader = reader.get_store_reader();
for doc_id in 0..reader.max_doc() {
if !reader.is_deleted(doc_id) {
let doc = try!(store_reader.get(doc_id));
let doc = store_reader.get(doc_id)?;
let field_values: Vec<&FieldValue> = doc.field_values().iter().collect();
try!(store_writer.store(&field_values));
store_writer.store(&field_values)?;
}
}
}
@@ -330,11 +370,15 @@ impl IndexMerger {
impl SerializableSegment for IndexMerger {
fn write(&self, mut serializer: SegmentSerializer) -> Result<u32> {
try!(self.write_postings(serializer.get_postings_serializer()));
try!(self.write_fieldnorms(serializer.get_fieldnorms_serializer()));
try!(self.write_fast_fields(serializer.get_fast_field_serializer()));
try!(self.write_storable_fields(serializer.get_store_writer()));
try!(serializer.close());
self.write_postings(serializer.get_postings_serializer())?;
self.write_fieldnorms(
serializer.get_fieldnorms_serializer(),
)?;
self.write_fast_fields(
serializer.get_fast_field_serializer(),
)?;
self.write_storable_fields(serializer.get_store_writer())?;
serializer.close()?;
Ok(self.max_doc)
}
}
@@ -411,14 +455,13 @@ mod tests {
}
}
{
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let segment_ids = index.searchable_segment_ids().expect(
"Searchable segments failed.",
);
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer
.merge(&segment_ids)
.wait()
.expect("Merging failed");
index_writer.merge(&segment_ids).wait().expect(
"Merging failed",
);
index_writer.wait_merging_threads().unwrap();
}
{
@@ -431,14 +474,22 @@ mod tests {
collector.docs()
};
{
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
vec![1, 2, 4]);
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
vec![0, 3]);
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "g")]),
vec![4]);
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
vec![0, 1, 2, 3, 4]);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
vec![1, 2, 4]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
vec![0, 3]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "g")]),
vec![4]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
vec![0, 1, 2, 3, 4]
);
}
{
let doc = searcher.doc(&DocAddress(0, 0)).unwrap();
@@ -467,8 +518,10 @@ mod tests {
assert!(searcher.search(&query, &mut collector).is_ok());
collector.vals()
};
assert_eq!(get_fast_vals(vec![Term::from_field_text(text_field, "a")]),
vec![5, 7, 13]);
assert_eq!(
get_fast_vals(vec![Term::from_field_text(text_field, "a")]),
vec![5, 7, 13]
);
}
}
}
@@ -515,14 +568,22 @@ mod tests {
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
vec![1]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
vec![1]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
vec![3]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
vec![1, 3]);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "a")),
vec![1]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "b")),
vec![1]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "c")),
vec![3]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "d")),
vec![1, 3]
);
}
{
// a second commit
@@ -554,20 +615,34 @@ mod tests {
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
assert_eq!(searcher.segment_readers()[1].num_docs(), 2);
assert_eq!(searcher.segment_readers()[1].max_doc(), 4);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
vec![3]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
vec![3]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "c")),
vec![3]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "d")),
vec![3]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]
);
let score_field_reader: U64FastFieldReader = searcher
.segment_reader(0)
@@ -585,33 +660,46 @@ mod tests {
}
{
// merging the segments
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
index_writer
.merge(&segment_ids)
.wait()
.expect("Merging failed");
let segment_ids = index.searchable_segment_ids().expect(
"Searchable segments failed.",
);
index_writer.merge(&segment_ids).wait().expect(
"Merging failed",
);
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 3);
assert_eq!(searcher.segment_readers()[0].num_docs(), 3);
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
vec![3]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
vec![3]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "c")),
vec![3]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "d")),
vec![3]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]
);
let score_field_reader: U64FastFieldReader = searcher
.segment_reader(0)
.get_fast_field_reader(score_field)
@@ -630,20 +718,34 @@ mod tests {
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "c")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "d")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]
);
let score_field_reader: U64FastFieldReader = searcher
.segment_reader(0)
.get_fast_field_reader(score_field)
@@ -653,13 +755,12 @@ mod tests {
}
{
// Test merging a single segment in order to remove deletes.
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
index_writer
.merge(&segment_ids)
.wait()
.expect("Merging failed");
let segment_ids = index.searchable_segment_ids().expect(
"Searchable segments failed.",
);
index_writer.merge(&segment_ids).wait().expect(
"Merging failed",
);
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
@@ -667,20 +768,34 @@ mod tests {
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].max_doc(), 2);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "c")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "d")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]
);
let score_field_reader: U64FastFieldReader = searcher
.segment_reader(0)
.get_fast_field_reader(score_field)
@@ -692,13 +807,12 @@ mod tests {
{
// Test removing all docs
index_writer.delete_term(Term::from_field_text(text_field, "g"));
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
index_writer
.merge(&segment_ids)
.wait()
.expect("Merging failed");
let segment_ids = index.searchable_segment_ids().expect(
"Searchable segments failed.",
);
index_writer.merge(&segment_ids).wait().expect(
"Merging failed",
);
index.load_searchers().unwrap();
let ref searcher = *index.searcher();

View File

@@ -44,10 +44,11 @@ pub struct SegmentEntry {
impl SegmentEntry {
/// Create a new `SegmentEntry`
pub fn new(segment_meta: SegmentMeta,
delete_cursor: DeleteCursor,
delete_bitset: Option<BitSet>)
-> SegmentEntry {
pub fn new(
segment_meta: SegmentMeta,
delete_cursor: DeleteCursor,
delete_bitset: Option<BitSet>,
) -> SegmentEntry {
SegmentEntry {
meta: segment_meta,
state: SegmentState::Ready,

View File

@@ -32,31 +32,36 @@ pub struct SegmentManager {
impl Debug for SegmentManager {
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
let lock = self.read();
write!(f,
"{{ uncommitted: {:?}, committed: {:?} }}",
lock.uncommitted,
lock.committed)
write!(
f,
"{{ uncommitted: {:?}, committed: {:?} }}",
lock.uncommitted,
lock.committed
)
}
}
pub fn get_mergeable_segments(segment_manager: &SegmentManager)
-> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
pub fn get_mergeable_segments(
segment_manager: &SegmentManager,
) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
let registers_lock = segment_manager.read();
(registers_lock.committed.get_mergeable_segments(),
registers_lock.uncommitted.get_mergeable_segments())
(
registers_lock.committed.get_mergeable_segments(),
registers_lock.uncommitted.get_mergeable_segments(),
)
}
impl SegmentManager {
pub fn from_segments(segment_metas: Vec<SegmentMeta>,
delete_cursor: DeleteCursor)
-> SegmentManager {
pub fn from_segments(
segment_metas: Vec<SegmentMeta>,
delete_cursor: DeleteCursor,
) -> SegmentManager {
SegmentManager {
registers: RwLock::new(SegmentRegisters {
uncommitted: SegmentRegister::default(),
committed: SegmentRegister::new(segment_metas,
delete_cursor),
writing: HashSet::new(),
}),
uncommitted: SegmentRegister::default(),
committed: SegmentRegister::new(segment_metas, delete_cursor),
writing: HashSet::new(),
}),
}
}
@@ -94,25 +99,24 @@ impl SegmentManager {
pub fn segment_entry(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
let registers = self.read();
registers
.committed
.segment_entry(segment_id)
.or_else(|| registers.uncommitted.segment_entry(segment_id))
registers.committed.segment_entry(segment_id).or_else(|| {
registers.uncommitted.segment_entry(segment_id)
})
}
// Lock poisoning should never happen :
// The lock is acquired and released within this class,
// and the operations cannot panic.
fn read(&self) -> RwLockReadGuard<SegmentRegisters> {
self.registers
.read()
.expect("Failed to acquire read lock on SegmentManager.")
self.registers.read().expect(
"Failed to acquire read lock on SegmentManager.",
)
}
fn write(&self) -> RwLockWriteGuard<SegmentRegisters> {
self.registers
.write()
.expect("Failed to acquire write lock on SegmentManager.")
self.registers.write().expect(
"Failed to acquire write lock on SegmentManager.",
)
}
pub fn commit(&self, segment_entries: Vec<SegmentEntry>) {
@@ -140,9 +144,11 @@ impl SegmentManager {
}
pub fn cancel_merge(&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_id: SegmentId) {
pub fn cancel_merge(
&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_id: SegmentId,
) {
let mut registers_lock = self.write();
@@ -150,13 +156,15 @@ impl SegmentManager {
{
let target_segment_register: &mut SegmentRegister;
target_segment_register = {
if registers_lock
.uncommitted
.contains_all(before_merge_segment_ids) {
if registers_lock.uncommitted.contains_all(
before_merge_segment_ids,
)
{
&mut registers_lock.uncommitted
} else if registers_lock
.committed
.contains_all(before_merge_segment_ids) {
} else if registers_lock.committed.contains_all(
before_merge_segment_ids,
)
{
&mut registers_lock.committed
} else {
warn!("couldn't find segment in SegmentManager");
@@ -185,23 +193,26 @@ impl SegmentManager {
registers_lock.uncommitted.add_segment_entry(segment_entry);
}
pub fn end_merge(&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_entry: SegmentEntry) {
pub fn end_merge(
&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_entry: SegmentEntry,
) {
let mut registers_lock = self.write();
registers_lock
.writing
.remove(&after_merge_segment_entry.segment_id());
registers_lock.writing.remove(&after_merge_segment_entry
.segment_id());
let mut target_register: &mut SegmentRegister = {
if registers_lock
.uncommitted
.contains_all(before_merge_segment_ids) {
let target_register: &mut SegmentRegister = {
if registers_lock.uncommitted.contains_all(
before_merge_segment_ids,
)
{
&mut registers_lock.uncommitted
} else if registers_lock
.committed
.contains_all(before_merge_segment_ids) {
} else if registers_lock.committed.contains_all(
before_merge_segment_ids,
)
{
&mut registers_lock.committed
} else {
warn!("couldn't find segment in SegmentManager");

View File

@@ -24,7 +24,12 @@ impl Debug for SegmentRegister {
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
try!(write!(f, "SegmentRegister("));
for (k, v) in &self.segment_states {
try!(write!(f, "{}:{}, ", k.short_uuid_string(), v.state().letter_code()));
try!(write!(
f,
"{}:{}, ",
k.short_uuid_string(),
v.state().letter_code()
));
}
try!(write!(f, ")"));
Ok(())
@@ -74,9 +79,9 @@ impl SegmentRegister {
}
pub fn contains_all(&mut self, segment_ids: &[SegmentId]) -> bool {
segment_ids
.iter()
.all(|segment_id| self.segment_states.contains_key(segment_id))
segment_ids.iter().all(|segment_id| {
self.segment_states.contains_key(segment_id)
})
}
pub fn add_segment_entry(&mut self, segment_entry: SegmentEntry) {
@@ -91,14 +96,18 @@ impl SegmentRegister {
pub fn cancel_merge(&mut self, segment_id: &SegmentId) {
self.segment_states
.get_mut(segment_id)
.expect("Received a merge notification for a segment that is not registered")
.expect(
"Received a merge notification for a segment that is not registered",
)
.cancel_merge();
}
pub fn start_merge(&mut self, segment_id: &SegmentId) {
self.segment_states
.get_mut(segment_id)
.expect("Received a merge notification for a segment that is not registered")
.expect(
"Received a merge notification for a segment that is not registered",
)
.start_merge();
}
@@ -144,34 +153,42 @@ mod tests {
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry);
}
assert_eq!(segment_register
.segment_entry(&segment_id_a)
.unwrap()
.state(),
SegmentState::Ready);
assert_eq!(
segment_register
.segment_entry(&segment_id_a)
.unwrap()
.state(),
SegmentState::Ready
);
assert_eq!(segment_ids(&segment_register), vec![segment_id_a]);
{
let segment_meta = SegmentMeta::new(segment_id_b);
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry);
}
assert_eq!(segment_register
.segment_entry(&segment_id_b)
.unwrap()
.state(),
SegmentState::Ready);
assert_eq!(
segment_register
.segment_entry(&segment_id_b)
.unwrap()
.state(),
SegmentState::Ready
);
segment_register.start_merge(&segment_id_a);
segment_register.start_merge(&segment_id_b);
assert_eq!(segment_register
.segment_entry(&segment_id_a)
.unwrap()
.state(),
SegmentState::InMerge);
assert_eq!(segment_register
.segment_entry(&segment_id_b)
.unwrap()
.state(),
SegmentState::InMerge);
assert_eq!(
segment_register
.segment_entry(&segment_id_a)
.unwrap()
.state(),
SegmentState::InMerge
);
assert_eq!(
segment_register
.segment_entry(&segment_id_b)
.unwrap()
.state(),
SegmentState::InMerge
);
segment_register.remove_segment(&segment_id_a);
segment_register.remove_segment(&segment_id_b);
{

View File

@@ -4,8 +4,7 @@ use core::Segment;
use core::SegmentComponent;
use fastfield::FastFieldSerializer;
use store::StoreWriter;
use postings::PostingsSerializer;
use postings::InvertedIndexSerializer;
/// Segment serializer is in charge of laying out on disk
/// the data accumulated and sorted by the `SegmentWriter`.
@@ -13,7 +12,7 @@ pub struct SegmentSerializer {
store_writer: StoreWriter,
fast_field_serializer: FastFieldSerializer,
fieldnorms_serializer: FastFieldSerializer,
postings_serializer: PostingsSerializer,
postings_serializer: InvertedIndexSerializer,
}
impl SegmentSerializer {
@@ -22,22 +21,22 @@ impl SegmentSerializer {
let store_write = try!(segment.open_write(SegmentComponent::STORE));
let fast_field_write = try!(segment.open_write(SegmentComponent::FASTFIELDS));
let fast_field_serializer = try!(FastFieldSerializer::new(fast_field_write));
let fast_field_serializer = try!(FastFieldSerializer::from_write(fast_field_write));
let fieldnorms_write = try!(segment.open_write(SegmentComponent::FIELDNORMS));
let fieldnorms_serializer = try!(FastFieldSerializer::new(fieldnorms_write));
let fieldnorms_serializer = try!(FastFieldSerializer::from_write(fieldnorms_write));
let postings_serializer = try!(PostingsSerializer::open(segment));
let postings_serializer = try!(InvertedIndexSerializer::open(segment));
Ok(SegmentSerializer {
postings_serializer: postings_serializer,
store_writer: StoreWriter::new(store_write),
fast_field_serializer: fast_field_serializer,
fieldnorms_serializer: fieldnorms_serializer,
})
postings_serializer: postings_serializer,
store_writer: StoreWriter::new(store_write),
fast_field_serializer: fast_field_serializer,
fieldnorms_serializer: fieldnorms_serializer,
})
}
/// Accessor to the `PostingsSerializer`.
pub fn get_postings_serializer(&mut self) -> &mut PostingsSerializer {
pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer {
&mut self.postings_serializer
}

View File

@@ -61,11 +61,12 @@ pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) -
/// and flushed.
///
/// This method is not part of tantivy's public API
pub fn save_metas(segment_metas: Vec<SegmentMeta>,
schema: Schema,
opstamp: u64,
directory: &mut Directory)
-> Result<()> {
pub fn save_metas(
segment_metas: Vec<SegmentMeta>,
schema: Schema,
opstamp: u64,
directory: &mut Directory,
) -> Result<()> {
let metas = IndexMeta {
segments: segment_metas,
schema: schema,
@@ -89,11 +90,12 @@ pub struct SegmentUpdater(Arc<InnerSegmentUpdater>);
fn perform_merge(segment_ids: &[SegmentId],
segment_updater: &SegmentUpdater,
mut merged_segment: Segment,
target_opstamp: u64)
-> Result<SegmentEntry> {
fn perform_merge(
segment_ids: &[SegmentId],
segment_updater: &SegmentUpdater,
mut merged_segment: Segment,
target_opstamp: u64,
) -> Result<SegmentEntry> {
// first we need to apply deletes to our segment.
info!("Start merge: {:?}", segment_ids);
@@ -105,17 +107,21 @@ fn perform_merge(segment_ids: &[SegmentId],
for segment_id in segment_ids {
if let Some(mut segment_entry) =
segment_updater.0.segment_manager.segment_entry(segment_id) {
segment_updater.0.segment_manager.segment_entry(segment_id)
{
let segment = index.segment(segment_entry.meta().clone());
if let Some(file_protection) =
advance_deletes(segment, &mut segment_entry, target_opstamp)? {
advance_deletes(segment, &mut segment_entry, target_opstamp)?
{
file_protections.push(file_protection);
}
segment_entries.push(segment_entry);
} else {
error!("Error, had to abort merge as some of the segment is not managed anymore.");
let msg = format!("Segment {:?} requested for merge is not managed.",
segment_id);
let msg = format!(
"Segment {:?} requested for merge is not managed.",
segment_id
);
bail!(ErrorKind::InvalidArgument(msg));
}
}
@@ -134,12 +140,13 @@ fn perform_merge(segment_ids: &[SegmentId],
// ... we just serialize this index merger in our new segment
// to merge the two segments.
let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment)
.expect("Creating index serializer failed");
let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment).expect(
"Creating index serializer failed",
);
let num_docs = merger
.write(segment_serializer)
.expect("Serializing merged index failed");
let num_docs = merger.write(segment_serializer).expect(
"Serializing merged index failed",
);
let mut segment_meta = SegmentMeta::new(merged_segment.id());
segment_meta.set_max_doc(num_docs);
@@ -161,23 +168,24 @@ struct InnerSegmentUpdater {
}
impl SegmentUpdater {
pub fn new(index: Index,
stamper: Stamper,
delete_cursor: DeleteCursor)
-> Result<SegmentUpdater> {
pub fn new(
index: Index,
stamper: Stamper,
delete_cursor: DeleteCursor,
) -> Result<SegmentUpdater> {
let segments = index.searchable_segment_metas()?;
let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
Ok(SegmentUpdater(Arc::new(InnerSegmentUpdater {
pool: CpuPool::new(1),
index: index,
segment_manager: segment_manager,
merge_policy: RwLock::new(box DefaultMergePolicy::default()),
merging_thread_id: AtomicUsize::default(),
merging_threads: RwLock::new(HashMap::new()),
generation: AtomicUsize::default(),
killed: AtomicBool::new(false),
stamper: stamper,
})))
pool: CpuPool::new(1),
index: index,
segment_manager: segment_manager,
merge_policy: RwLock::new(box DefaultMergePolicy::default()),
merging_thread_id: AtomicUsize::default(),
merging_threads: RwLock::new(HashMap::new()),
generation: AtomicUsize::default(),
killed: AtomicBool::new(false),
stamper: stamper,
})))
}
pub fn new_segment(&self) -> Segment {
@@ -199,10 +207,10 @@ impl SegmentUpdater {
self.0.merging_thread_id.fetch_add(1, Ordering::SeqCst)
}
fn run_async<T: 'static + Send, F: 'static + Send + FnOnce(SegmentUpdater) -> T>
(&self,
f: F)
-> CpuFuture<T, Error> {
fn run_async<T: 'static + Send, F: 'static + Send + FnOnce(SegmentUpdater) -> T>(
&self,
f: F,
) -> CpuFuture<T, Error> {
let me_clone = self.clone();
self.0.pool.spawn_fn(move || Ok(f(me_clone)))
}
@@ -211,11 +219,10 @@ impl SegmentUpdater {
pub fn add_segment(&self, generation: usize, segment_entry: SegmentEntry) -> bool {
if generation >= self.0.generation.load(Ordering::Acquire) {
self.run_async(|segment_updater| {
segment_updater.0.segment_manager.add_segment(segment_entry);
segment_updater.consider_merge_options();
true
})
.forget();
segment_updater.0.segment_manager.add_segment(segment_entry);
segment_updater.consider_merge_options();
true
}).forget();
true
} else {
false
@@ -249,46 +256,46 @@ impl SegmentUpdater {
if self.is_alive() {
let index = &self.0.index;
let directory = index.directory();
save_metas(self.0.segment_manager.committed_segment_metas(),
index.schema(),
opstamp,
directory.box_clone().borrow_mut())
.expect("Could not save metas.");
save_metas(
self.0.segment_manager.committed_segment_metas(),
index.schema(),
opstamp,
directory.box_clone().borrow_mut(),
).expect("Could not save metas.");
}
}
pub fn garbage_collect_files(&self) -> Result<()> {
self.run_async(move |segment_updater| {
segment_updater.garbage_collect_files_exec();
})
.wait()
}).wait()
}
fn garbage_collect_files_exec(&self) {
info!("Running garbage collection");
let mut index = self.0.index.clone();
index.directory_mut().garbage_collect(|| {
self.0.segment_manager.list_files()
});
index.directory_mut().garbage_collect(
|| self.0.segment_manager.list_files(),
);
}
pub fn commit(&self, opstamp: u64) -> Result<()> {
self.run_async(move |segment_updater| if segment_updater.is_alive() {
let segment_entries = segment_updater
.purge_deletes(opstamp)
.expect("Failed purge deletes");
segment_updater.0.segment_manager.commit(segment_entries);
segment_updater.save_metas(opstamp);
segment_updater.garbage_collect_files_exec();
segment_updater.consider_merge_options();
})
.wait()
let segment_entries = segment_updater.purge_deletes(opstamp).expect(
"Failed purge deletes",
);
segment_updater.0.segment_manager.commit(segment_entries);
segment_updater.save_metas(opstamp);
segment_updater.garbage_collect_files_exec();
segment_updater.consider_merge_options();
}).wait()
}
pub fn start_merge(&self,
segment_ids: &[SegmentId])
-> impl Future<Item = SegmentMeta, Error = Canceled> {
pub fn start_merge(
&self,
segment_ids: &[SegmentId],
) -> impl Future<Item = SegmentMeta, Error = Canceled> {
self.0.segment_manager.start_merge(segment_ids);
let segment_updater_clone = self.clone();
@@ -308,10 +315,12 @@ impl SegmentUpdater {
// first we need to apply deletes to our segment.
let merged_segment = segment_updater_clone.new_segment();
let merged_segment_id = merged_segment.id();
let merge_result = perform_merge(&segment_ids_vec,
&segment_updater_clone,
merged_segment,
target_opstamp);
let merge_result = perform_merge(
&segment_ids_vec,
&segment_updater_clone,
merged_segment,
target_opstamp,
);
match merge_result {
Ok(after_merge_segment_entry) => {
@@ -345,11 +354,10 @@ impl SegmentUpdater {
.remove(&merging_thread_id);
Ok(())
});
self.0
.merging_threads
.write()
.unwrap()
.insert(merging_thread_id, merging_join_handle);
self.0.merging_threads.write().unwrap().insert(
merging_thread_id,
merging_join_handle,
);
merging_future_recv
}
@@ -368,19 +376,23 @@ impl SegmentUpdater {
}
}
fn cancel_merge(&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_entry: SegmentId) {
self.0
.segment_manager
.cancel_merge(before_merge_segment_ids, after_merge_segment_entry);
fn cancel_merge(
&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_entry: SegmentId,
) {
self.0.segment_manager.cancel_merge(
before_merge_segment_ids,
after_merge_segment_entry,
);
}
fn end_merge(&self,
before_merge_segment_ids: Vec<SegmentId>,
mut after_merge_segment_entry: SegmentEntry)
-> Result<()> {
fn end_merge(
&self,
before_merge_segment_ids: Vec<SegmentId>,
mut after_merge_segment_entry: SegmentEntry,
) -> Result<()> {
self.run_async(move |segment_updater| {
info!("End merge {:?}", after_merge_segment_entry.meta());
@@ -391,28 +403,37 @@ impl SegmentUpdater {
if delete_operation.opstamp < committed_opstamp {
let index = &segment_updater.0.index;
let segment = index.segment(after_merge_segment_entry.meta().clone());
match advance_deletes(segment,
&mut after_merge_segment_entry,
committed_opstamp) {
match advance_deletes(
segment,
&mut after_merge_segment_entry,
committed_opstamp,
) {
Ok(file_protection_opt_res) => {
_file_protection_opt = file_protection_opt_res;
}
Err(e) => {
error!("Merge of {:?} was cancelled (advancing deletes failed): {:?}",
before_merge_segment_ids, e);
error!(
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
before_merge_segment_ids,
e
);
// ... cancel merge
if cfg!(test) {
panic!("Merge failed.");
}
segment_updater.cancel_merge(&before_merge_segment_ids,
after_merge_segment_entry.segment_id());
segment_updater.cancel_merge(
&before_merge_segment_ids,
after_merge_segment_entry.segment_id(),
);
return;
}
}
}
}
segment_updater.0.segment_manager.end_merge(&before_merge_segment_ids,
after_merge_segment_entry);
segment_updater.0.segment_manager.end_merge(
&before_merge_segment_ids,
after_merge_segment_entry,
);
segment_updater.consider_merge_options();
info!("save metas");
segment_updater.save_metas(segment_updater.0.index.opstamp());
@@ -450,10 +471,9 @@ impl SegmentUpdater {
}
debug!("wait merging thread {}", new_merging_threads.len());
for (_, merging_thread_handle) in new_merging_threads {
merging_thread_handle
.join()
.map(|_| ())
.map_err(|_| ErrorKind::ErrorInThread("Merging thread failed.".into()))?;
merging_thread_handle.join().map(|_| ()).map_err(|_| {
ErrorKind::ErrorInThread("Merging thread failed.".into())
})?;
}
// Our merging thread may have queued their completed
self.run_async(move |_| {}).wait()?;
@@ -522,9 +542,9 @@ mod tests {
assert_eq!(index.searcher().num_docs(), 302);
{
index_writer
.wait_merging_threads()
.expect("waiting for merging threads");
index_writer.wait_merging_threads().expect(
"waiting for merging threads",
);
}
index.load_searchers().unwrap();

View File

@@ -54,22 +54,23 @@ impl<'a> SegmentWriter<'a> {
/// the flushing behavior as a buffer limit
/// - segment: The segment being written
/// - schema
pub fn for_segment(heap: &'a Heap,
table_bits: usize,
mut segment: Segment,
schema: &Schema)
-> Result<SegmentWriter<'a>> {
pub fn for_segment(
heap: &'a Heap,
table_bits: usize,
mut segment: Segment,
schema: &Schema,
) -> Result<SegmentWriter<'a>> {
let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits, heap);
Ok(SegmentWriter {
heap: heap,
max_doc: 0,
multifield_postings: multifield_postings,
fieldnorms_writer: create_fieldnorms_writer(schema),
segment_serializer: segment_serializer,
fast_field_writers: FastFieldsWriter::from_schema(schema),
doc_opstamps: Vec::with_capacity(1_000),
})
heap: heap,
max_doc: 0,
multifield_postings: multifield_postings,
fieldnorms_writer: create_fieldnorms_writer(schema),
segment_serializer: segment_serializer,
fast_field_writers: FastFieldsWriter::from_schema(schema),
doc_opstamps: Vec::with_capacity(1_000),
})
}
/// Lay on disk the current content of the `SegmentWriter`
@@ -77,10 +78,12 @@ impl<'a> SegmentWriter<'a> {
/// Finalize consumes the `SegmentWriter`, so that it cannot
/// be used afterwards.
pub fn finalize(self) -> Result<Vec<u64>> {
write(&self.multifield_postings,
&self.fast_field_writers,
&self.fieldnorms_writer,
self.segment_serializer)?;
write(
&self.multifield_postings,
&self.fast_field_writers,
&self.fieldnorms_writer,
self.segment_serializer,
)?;
Ok(self.doc_opstamps)
}
@@ -107,10 +110,11 @@ impl<'a> SegmentWriter<'a> {
/// Indexes a new document
///
/// As a user, you should rather use `IndexWriter`'s add_document.
pub fn add_document(&mut self,
add_operation: &AddOperation,
schema: &Schema)
-> io::Result<()> {
pub fn add_document(
&mut self,
add_operation: &AddOperation,
schema: &Schema,
) -> io::Result<()> {
let doc_id = self.max_doc;
let doc = &add_operation.document;
self.doc_opstamps.push(add_operation.opstamp);
@@ -122,8 +126,11 @@ impl<'a> SegmentWriter<'a> {
match *field_options.field_type() {
FieldType::Str(ref text_options) => {
let num_tokens: u32 = if text_options.get_indexing_options().is_tokenized() {
self.multifield_postings
.index_text(doc_id, field, &field_values)
self.multifield_postings.index_text(
doc_id,
field,
&field_values,
)
} else {
let num_field_values = field_values.len() as u32;
for field_value in field_values {
@@ -132,15 +139,17 @@ impl<'a> SegmentWriter<'a> {
}
num_field_values
};
self.fieldnorms_writer
.get_field_writer(field)
.map(|field_norms_writer| field_norms_writer.add_val(num_tokens as u64));
self.fieldnorms_writer.get_field_writer(field).map(
|field_norms_writer| field_norms_writer.add_val(num_tokens as u64),
);
}
FieldType::U64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_u64(field_value.field(),
field_value.value().u64_value());
let term = Term::from_field_u64(
field_value.field(),
field_value.value().u64_value(),
);
self.multifield_postings.suscribe(doc_id, &term);
}
}
@@ -148,8 +157,10 @@ impl<'a> SegmentWriter<'a> {
FieldType::I64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_i64(field_value.field(),
field_value.value().i64_value());
let term = Term::from_field_i64(
field_value.field(),
field_value.value().i64_value(),
);
self.multifield_postings.suscribe(doc_id, &term);
}
}
@@ -160,7 +171,9 @@ impl<'a> SegmentWriter<'a> {
self.fast_field_writers.add_document(doc);
let stored_fieldvalues: Vec<&FieldValue> = doc.field_values()
.iter()
.filter(|field_value| schema.get_field_entry(field_value.field()).is_stored())
.filter(|field_value| {
schema.get_field_entry(field_value.field()).is_stored()
})
.collect();
let doc_writer = self.segment_serializer.get_store_writer();
try!(doc_writer.store(&stored_fieldvalues));
@@ -191,15 +204,22 @@ impl<'a> SegmentWriter<'a> {
}
// This method is used as a trick to workaround the borrow checker
fn write(multifield_postings: &MultiFieldPostingsWriter,
fast_field_writers: &FastFieldsWriter,
fieldnorms_writer: &FastFieldsWriter,
mut serializer: SegmentSerializer)
-> Result<()> {
fn write(
multifield_postings: &MultiFieldPostingsWriter,
fast_field_writers: &FastFieldsWriter,
fieldnorms_writer: &FastFieldsWriter,
mut serializer: SegmentSerializer,
) -> Result<()> {
try!(multifield_postings.serialize(serializer.get_postings_serializer()));
try!(fast_field_writers.serialize(serializer.get_fast_field_serializer()));
try!(fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer()));
try!(multifield_postings.serialize(
serializer.get_postings_serializer(),
));
try!(fast_field_writers.serialize(
serializer.get_fast_field_serializer(),
));
try!(fieldnorms_writer.serialize(
serializer.get_fieldnorms_serializer(),
));
try!(serializer.close());
Ok(())
@@ -208,10 +228,12 @@ fn write(multifield_postings: &MultiFieldPostingsWriter,
impl<'a> SerializableSegment for SegmentWriter<'a> {
fn write(&self, serializer: SegmentSerializer) -> Result<u32> {
let max_doc = self.max_doc;
write(&self.multifield_postings,
&self.fast_field_writers,
&self.fieldnorms_writer,
serializer)?;
write(
&self.multifield_postings,
&self.fast_field_writers,
&self.fieldnorms_writer,
serializer,
)?;
Ok(max_doc)
}
}

View File

@@ -68,7 +68,7 @@ extern crate stable_deref_trait;
#[cfg(test)]
extern crate env_logger;
#[cfg(feature="simdcompression")]
#[cfg(feature = "simdcompression")]
extern crate libc;
#[cfg(windows)]
@@ -98,6 +98,8 @@ mod core;
mod compression;
mod indexer;
mod common;
#[allow(unused_doc_comment)]
mod error;
mod analyzer;
mod datastruct;
@@ -116,7 +118,7 @@ pub use directory::Directory;
pub use core::{Index, Segment, SegmentId, SegmentMeta, Searcher};
pub use indexer::IndexWriter;
pub use schema::{Term, Document};
pub use core::SegmentReader;
pub use core::{SegmentReader, InvertedIndexReader};
pub use self::common::TimerTree;
pub use postings::DocSet;
@@ -254,7 +256,7 @@ mod tests {
}
#[test]
fn test_docfreq() {
fn test_docfreq1() {
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
@@ -293,7 +295,6 @@ mod tests {
}
}
#[test]
fn test_fieldnorm() {
let mut schema_builder = SchemaBuilder::default();
@@ -382,15 +383,24 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none());
let inverted_index = reader.inverted_index(text_field);
assert!(
inverted_index
.read_postings(&term_abcd, FreqAndPositions)
.is_none()
);
{
let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, FreqAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 5);
assert!(!postings.advance());
}
{
let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap();
let mut postings = inverted_index
.read_postings(&term_b, FreqAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 3);
assert!(postings.advance());
@@ -416,16 +426,25 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(term_abcd.field());
assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none());
assert!(
inverted_index
.read_postings(&term_abcd, FreqAndPositions)
.is_none()
);
{
let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, FreqAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 5);
assert!(!postings.advance());
}
{
let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap();
let mut postings = inverted_index
.read_postings(&term_b, FreqAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 3);
assert!(postings.advance());
@@ -451,13 +470,22 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none());
let inverted_index = reader.inverted_index(term_abcd.field());
assert!(
inverted_index
.read_postings(&term_abcd, FreqAndPositions)
.is_none()
);
{
let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, FreqAndPositions)
.unwrap();
assert!(!postings.advance());
}
{
let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap();
let mut postings = inverted_index
.read_postings(&term_b, FreqAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 3);
assert!(postings.advance());
@@ -465,7 +493,9 @@ mod tests {
assert!(!postings.advance());
}
{
let mut postings = reader.read_postings(&term_c, FreqAndPositions).unwrap();
let mut postings = inverted_index
.read_postings(&term_c, FreqAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 4);
assert!(!postings.advance());
@@ -489,6 +519,7 @@ mod tests {
let term = Term::from_field_u64(field, 1u64);
let mut postings = searcher
.segment_reader(0)
.inverted_index(term.field())
.read_postings(&term, SegmentPostingsOption::NoFreq)
.unwrap();
assert!(postings.advance());
@@ -512,6 +543,7 @@ mod tests {
let term = Term::from_field_i64(value_field, negative_val);
let mut postings = searcher
.segment_reader(0)
.inverted_index(term.field())
.read_postings(&term, SegmentPostingsOption::NoFreq)
.unwrap();
assert!(postings.advance());
@@ -574,10 +606,17 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(text_field);
let term_abcd = Term::from_field_text(text_field, "abcd");
assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none());
assert!(
inverted_index
.read_postings(&term_abcd, FreqAndPositions)
.is_none()
);
let term_af = Term::from_field_text(text_field, "af");
let mut postings = reader.read_postings(&term_af, FreqAndPositions).unwrap();
let mut postings = inverted_index
.read_postings(&term_af, FreqAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 0);
assert_eq!(postings.term_freq(), 3);
@@ -619,29 +658,43 @@ mod tests {
collector.docs()
};
{
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
vec![1, 2]);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
vec![1, 2]
);
}
{
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
vec![0]);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
vec![0]
);
}
{
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
vec![0, 1, 2]);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
vec![0, 1, 2]
);
}
{
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "c")]),
vec![1, 2]);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "c")]),
vec![1, 2]
);
}
{
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "d")]),
vec![2]);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "d")]),
vec![2]
);
}
{
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b"),
Term::from_field_text(text_field, "a")]),
vec![0, 1, 2]);
assert_eq!(
get_doc_ids(vec![
Term::from_field_text(text_field, "b"),
Term::from_field_text(text_field, "a"),
]),
vec![0, 1, 2]
);
}
}
}
@@ -678,7 +731,8 @@ mod tests {
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", TEXT);
let other_text_field = schema_builder.add_text_field("text2", TEXT);
let document = doc!(text_field => "tantivy",
let document =
doc!(text_field => "tantivy",
text_field => "some other value",
other_text_field => "short");
assert_eq!(document.len(), 3);

View File

@@ -52,6 +52,33 @@ pub trait DocSet {
}
}
/// Fills a given mutable buffer with the next doc ids from the
/// `DocSet`
///
/// If that many `DocId`s are available, the method should
/// fill the entire buffer and return the length of the buffer.
///
/// If we reach the end of the `DocSet` before filling
/// it entirely, then the buffer is filled up to this point, and
/// return value is the number of elements that were filled.
///
/// # Warning
///
/// This method is only here for specific high-performance
/// use case where batching. The normal way to
/// go through the `DocId`'s is to call `.advance()`.
fn fill_buffer(&mut self, buffer: &mut [DocId]) -> usize {
for (i, buffer_val) in buffer.iter_mut().enumerate() {
if self.advance() {
*buffer_val = self.doc();
} else {
return i;
}
}
return buffer.len();
}
/// Returns the current document
fn doc(&self) -> DocId;

View File

@@ -1,125 +0,0 @@
use compression::BlockDecoder;
use common::VInt;
use common::BinarySerializable;
use compression::{CompositeDecoder, VIntDecoder};
use postings::SegmentPostingsOption;
use compression::NUM_DOCS_PER_BLOCK;
/// `FreqHandler` is in charge of decompressing
/// frequencies and/or positions.
pub struct FreqHandler {
freq_decoder: BlockDecoder,
positions: Vec<u32>,
option: SegmentPostingsOption,
positions_offsets: [usize; NUM_DOCS_PER_BLOCK + 1],
}
fn read_positions(data: &[u8]) -> Vec<u32> {
let mut composite_reader = CompositeDecoder::new();
let mut readable: &[u8] = data;
let uncompressed_len = VInt::deserialize(&mut readable).unwrap().0 as usize;
composite_reader.uncompress_unsorted(readable, uncompressed_len);
composite_reader.into()
}
impl FreqHandler {
/// Returns a `FreqHandler` that just decodes `DocId`s.
pub fn new_without_freq() -> FreqHandler {
FreqHandler {
freq_decoder: BlockDecoder::with_val(1u32),
positions: Vec::new(),
option: SegmentPostingsOption::NoFreq,
positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1],
}
}
/// Returns a `FreqHandler` that decodes `DocId`s and term frequencies.
pub fn new_with_freq() -> FreqHandler {
FreqHandler {
freq_decoder: BlockDecoder::new(),
positions: Vec::new(),
option: SegmentPostingsOption::Freq,
positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1],
}
}
/// Returns a `FreqHandler` that decodes `DocId`s, term frequencies, and term positions.
pub fn new_with_freq_and_position(position_data: &[u8]) -> FreqHandler {
let positions = read_positions(position_data);
FreqHandler {
freq_decoder: BlockDecoder::new(),
positions: positions,
option: SegmentPostingsOption::FreqAndPositions,
positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1],
}
}
fn fill_positions_offset(&mut self) {
let mut cur_position: usize = self.positions_offsets[NUM_DOCS_PER_BLOCK];
let mut i: usize = 0;
self.positions_offsets[i] = cur_position;
let mut last_cur_position = cur_position;
for &doc_freq in self.freq_decoder.output_array() {
i += 1;
let mut cumulated_pos = 0u32;
// this next loop decodes delta positions into normal positions.
for j in last_cur_position..(last_cur_position + (doc_freq as usize)) {
cumulated_pos += self.positions[j];
self.positions[j] = cumulated_pos;
}
cur_position += doc_freq as usize;
self.positions_offsets[i] = cur_position;
last_cur_position = cur_position;
}
}
/// Accessor to term frequency
///
/// idx is the offset of the current doc in the block.
/// It takes value between 0 and 128.
pub fn freq(&self, idx: usize) -> u32 {
self.freq_decoder.output(idx)
}
/// Accessor to the positions
///
/// idx is the offset of the current doc in the block.
/// It takes value between 0 and 128.
pub fn positions(&self, idx: usize) -> &[u32] {
let start = self.positions_offsets[idx];
let stop = self.positions_offsets[idx + 1];
&self.positions[start..stop]
}
/// Decompresses a complete frequency block
pub fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
match self.option {
SegmentPostingsOption::NoFreq => data,
SegmentPostingsOption::Freq => self.freq_decoder.uncompress_block_unsorted(data),
SegmentPostingsOption::FreqAndPositions => {
let remaining: &'a [u8] = self.freq_decoder.uncompress_block_unsorted(data);
self.fill_positions_offset();
remaining
}
}
}
/// Decompresses an incomplete frequency block
pub fn read_freq_vint(&mut self, data: &[u8], num_els: usize) {
match self.option {
SegmentPostingsOption::NoFreq => {}
SegmentPostingsOption::Freq => {
self.freq_decoder.uncompress_vint_unsorted(data, num_els);
}
SegmentPostingsOption::FreqAndPositions => {
self.freq_decoder.uncompress_vint_unsorted(data, num_els);
self.fill_positions_offset();
}
}
}
}

View File

@@ -16,14 +16,14 @@ mod term_info;
mod vec_postings;
mod segment_postings;
mod intersection;
mod freq_handler;
mod docset;
mod segment_postings_option;
pub use self::docset::{SkipResult, DocSet};
use self::recorder::{Recorder, NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
pub use self::serializer::PostingsSerializer;
pub use self::serializer::{InvertedIndexSerializer, FieldSerializer};
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
pub use self::term_info::TermInfo;
pub use self::postings::Postings;
@@ -32,7 +32,6 @@ pub use self::vec_postings::VecPostings;
pub use self::segment_postings::{SegmentPostings, BlockSegmentPostings};
pub use self::intersection::IntersectionDocSet;
pub use self::freq_handler::FreqHandler;
pub use self::segment_postings_option::SegmentPostingsOption;
pub use common::HasLen;
@@ -64,21 +63,25 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut segment = index.new_segment();
let mut posting_serializer = PostingsSerializer::open(&mut segment).unwrap();
posting_serializer.new_field(text_field);
posting_serializer.new_term("abc".as_bytes()).unwrap();
for doc_id in 0u32..3u32 {
let positions = vec![1, 2, 3, 2];
posting_serializer.write_doc(doc_id, 2, &positions).unwrap();
let mut posting_serializer = InvertedIndexSerializer::open(&mut segment).unwrap();
{
let mut field_serializer = posting_serializer.new_field(text_field).unwrap();
field_serializer.new_term("abc".as_bytes()).unwrap();
for doc_id in 0u32..120u32 {
let delta_positions = vec![1, 2, 3, 2];
field_serializer
.write_doc(doc_id, 2, &delta_positions)
.unwrap();
}
field_serializer.close_term().unwrap();
}
posting_serializer.close_term().unwrap();
posting_serializer.close().unwrap();
let read = segment.open_read(SegmentComponent::POSITIONS).unwrap();
assert!(read.len() <= 16);
assert!(read.len() <= 140);
}
#[test]
pub fn test_position_and_fieldnorm() {
pub fn test_position_and_fieldnorm1() {
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
@@ -87,8 +90,8 @@ mod tests {
let heap = Heap::with_capacity(10_000_000);
{
let mut segment_writer = SegmentWriter::for_segment(&heap, 18, segment.clone(), &schema)
.unwrap();
let mut segment_writer =
SegmentWriter::for_segment(&heap, 18, segment.clone(), &schema).unwrap();
{
let mut doc = Document::default();
// checking that position works if the field has two values
@@ -134,13 +137,17 @@ mod tests {
}
{
let term_a = Term::from_field_text(text_field, "abcdef");
assert!(segment_reader
.read_postings(&term_a, FreqAndPositions)
.is_none());
assert!(
segment_reader
.inverted_index(term_a.field())
.read_postings(&term_a, FreqAndPositions)
.is_none()
);
}
{
let term_a = Term::from_field_text(text_field, "a");
let mut postings_a = segment_reader
.inverted_index(term_a.field())
.read_postings(&term_a, FreqAndPositions)
.unwrap();
assert_eq!(postings_a.len(), 1000);
@@ -148,6 +155,7 @@ mod tests {
assert_eq!(postings_a.doc(), 0);
assert_eq!(postings_a.term_freq(), 6);
assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]);
assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]);
assert!(postings_a.advance());
assert_eq!(postings_a.doc(), 1u32);
assert_eq!(postings_a.term_freq(), 1);
@@ -162,6 +170,7 @@ mod tests {
{
let term_e = Term::from_field_text(text_field, "e");
let mut postings_e = segment_reader
.inverted_index(term_e.field())
.read_postings(&term_e, FreqAndPositions)
.unwrap();
assert_eq!(postings_e.len(), 1000 - 2);
@@ -201,8 +210,10 @@ mod tests {
assert!(index_writer.commit().is_ok());
}
index.load_searchers().unwrap();
let term_query = TermQuery::new(Term::from_field_text(text_field, "a"),
SegmentPostingsOption::NoFreq);
let term_query = TermQuery::new(
Term::from_field_text(text_field, "a"),
SegmentPostingsOption::NoFreq,
);
let searcher = index.searcher();
let mut term_weight = term_query.specialized_weight(&*searcher);
term_weight.segment_postings_options = SegmentPostingsOption::FreqAndPositions;
@@ -249,6 +260,7 @@ mod tests {
for i in 0..num_docs - 1 {
for j in i + 1..num_docs {
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
.unwrap();
@@ -262,6 +274,7 @@ mod tests {
{
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
.unwrap();
@@ -282,6 +295,7 @@ mod tests {
// check that filtering works
{
let mut segment_postings = segment_reader
.inverted_index(term_0.field())
.read_postings(&term_0, SegmentPostingsOption::NoFreq)
.unwrap();
@@ -291,6 +305,7 @@ mod tests {
}
let mut segment_postings = segment_reader
.inverted_index(term_0.field())
.read_postings(&term_0, SegmentPostingsOption::NoFreq)
.unwrap();
@@ -315,6 +330,7 @@ mod tests {
// make sure seeking still works
for i in 0..num_docs {
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
.unwrap();
@@ -330,6 +346,7 @@ mod tests {
// now try with a longer sequence
{
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
.unwrap();
@@ -365,12 +382,14 @@ mod tests {
// finally, check that it's empty
{
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
.unwrap();
assert_eq!(segment_postings.skip_next(0), SkipResult::End);
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
.unwrap();
@@ -437,11 +456,12 @@ mod tests {
let segment_reader = searcher.segment_reader(0);
b.iter(|| {
let mut segment_postings = segment_reader
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
.unwrap();
while segment_postings.advance() {}
});
let mut segment_postings = segment_reader
.inverted_index(TERM_A.field())
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
.unwrap();
while segment_postings.advance() {}
});
}
#[bench]
@@ -450,21 +470,27 @@ mod tests {
let segment_reader = searcher.segment_reader(0);
b.iter(|| {
let segment_postings_a = segment_reader
.inverted_index(TERM_A.field())
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
.unwrap();
let segment_postings_b = segment_reader
.inverted_index(TERM_B.field())
.read_postings(&*TERM_B, SegmentPostingsOption::NoFreq)
.unwrap();
let segment_postings_c = segment_reader
.inverted_index(TERM_C.field())
.read_postings(&*TERM_C, SegmentPostingsOption::NoFreq)
.unwrap();
let segment_postings_d = segment_reader
.inverted_index(TERM_D.field())
.read_postings(&*TERM_D, SegmentPostingsOption::NoFreq)
.unwrap();
let mut intersection = IntersectionDocSet::from(vec![segment_postings_a,
segment_postings_b,
segment_postings_c,
segment_postings_d]);
let mut intersection = IntersectionDocSet::from(vec![
segment_postings_a,
segment_postings_b,
segment_postings_c,
segment_postings_d,
]);
while intersection.advance() {}
});
}
@@ -475,6 +501,7 @@ mod tests {
let docs = tests::sample(segment_reader.num_docs(), p);
let mut segment_postings = segment_reader
.inverted_index(TERM_A.field())
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
.unwrap();
@@ -491,6 +518,7 @@ mod tests {
b.iter(|| {
let mut segment_postings = segment_reader
.inverted_index(TERM_A.field())
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
.unwrap();
for doc in &existing_docs {
@@ -528,6 +556,7 @@ mod tests {
b.iter(|| {
let n: u32 = test::black_box(17);
let mut segment_postings = segment_reader
.inverted_index(TERM_A.field())
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
.unwrap();
let mut s = 0u32;

View File

@@ -1,7 +1,7 @@
use DocId;
use schema::Term;
use schema::FieldValue;
use postings::PostingsSerializer;
use postings::{InvertedIndexSerializer, FieldSerializer};
use std::io;
use postings::Recorder;
use analyzer::SimpleTokenizer;
@@ -16,9 +16,10 @@ use schema::FieldEntry;
use schema::FieldType;
use schema::TextIndexingOptions;
fn posting_from_field_entry<'a>(field_entry: &FieldEntry,
heap: &'a Heap)
-> Box<PostingsWriter + 'a> {
fn posting_from_field_entry<'a>(
field_entry: &FieldEntry,
heap: &'a Heap,
) -> Box<PostingsWriter + 'a> {
match *field_entry.field_type() {
FieldType::Str(ref text_options) => {
match text_options.get_indexing_options() {
@@ -51,9 +52,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
let per_field_postings_writers: Vec<_> = schema
.fields()
.iter()
.map(|field_entry| {
posting_from_field_entry(field_entry, heap)
})
.map(|field_entry| posting_from_field_entry(field_entry, heap))
.collect();
MultiFieldPostingsWriter {
@@ -78,7 +77,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
/// It pushes all term, one field at a time, towards the
/// postings serializer.
#[allow(needless_range_loop)]
pub fn serialize(&self, serializer: &mut PostingsSerializer) -> Result<()> {
pub fn serialize(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> {
let mut term_offsets: Vec<(&[u8], u32)> = self.term_index.iter().collect();
term_offsets.sort_by_key(|&(k, _v)| k);
@@ -101,8 +100,13 @@ impl<'a> MultiFieldPostingsWriter<'a> {
let (field, start) = offsets[i];
let (_, stop) = offsets[i + 1];
let postings_writer = &self.per_field_postings_writers[field.0 as usize];
postings_writer
.serialize(field, &term_offsets[start..stop], serializer, self.heap)?;
let mut field_serializer = serializer.new_field(field)?;
postings_writer.serialize(
&term_offsets[start..stop],
&mut field_serializer,
self.heap,
)?;
field_serializer.close()?;
}
Ok(())
}
@@ -126,30 +130,33 @@ pub trait PostingsWriter {
/// * term - the term
/// * heap - heap used to store the postings informations as well as the terms
/// in the hashmap.
fn suscribe(&mut self,
term_index: &mut HashMap,
doc: DocId,
pos: u32,
term: &Term,
heap: &Heap);
fn suscribe(
&mut self,
term_index: &mut HashMap,
doc: DocId,
pos: u32,
term: &Term,
heap: &Heap,
);
/// Serializes the postings on disk.
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(&self,
field: Field,
term_addrs: &[(&[u8], u32)],
serializer: &mut PostingsSerializer,
heap: &Heap)
-> io::Result<()>;
fn serialize(
&self,
term_addrs: &[(&[u8], u32)],
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()>;
/// Tokenize a text and suscribe all of its token.
fn index_text<'a>(&mut self,
term_index: &mut HashMap,
doc_id: DocId,
field: Field,
field_values: &[&'a FieldValue],
heap: &Heap)
-> u32 {
fn index_text<'a>(
&mut self,
term_index: &mut HashMap,
doc_id: DocId,
field: Field,
field_values: &[&'a FieldValue],
heap: &Heap,
) -> u32 {
let mut pos = 0u32;
let mut num_tokens: u32 = 0u32;
let mut term = unsafe { Term::with_capacity(100) };
@@ -195,12 +202,14 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
}
impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {
fn suscribe(&mut self,
term_index: &mut HashMap,
doc: DocId,
position: u32,
term: &Term,
heap: &Heap) {
fn suscribe(
&mut self,
term_index: &mut HashMap,
doc: DocId,
position: u32,
term: &Term,
heap: &Heap,
) {
debug_assert!(term.as_slice().len() >= 4);
let recorder: &mut Rec = term_index.get_or_create(term);
let current_doc = recorder.current_doc();
@@ -213,20 +222,18 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
recorder.record_position(position, heap);
}
fn serialize(&self,
field: Field,
term_addrs: &[(&[u8], u32)],
serializer: &mut PostingsSerializer,
heap: &Heap)
-> io::Result<()> {
serializer.new_field(field);
fn serialize(
&self,
term_addrs: &[(&[u8], u32)],
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()> {
for &(term_bytes, addr) in term_addrs {
let recorder: &mut Rec = self.heap.get_mut_ref(addr);
try!(serializer.new_term(term_bytes));
try!(recorder.serialize(addr, serializer, heap));
try!(serializer.close_term());
serializer.new_term(term_bytes)?;
recorder.serialize(addr, serializer, heap)?;
serializer.close_term()?;
}
Ok(())
}
}

View File

@@ -1,6 +1,6 @@
use DocId;
use std::io;
use postings::PostingsSerializer;
use postings::FieldSerializer;
use datastruct::stacker::{ExpUnrolledLinkedList, Heap, HeapAllocable};
const EMPTY_ARRAY: [u32; 0] = [0u32; 0];
@@ -27,11 +27,12 @@ pub trait Recorder: HeapAllocable {
/// Close the document. It will help record the term frequency.
fn close_doc(&mut self, heap: &Heap);
/// Pushes the postings information to the serializer.
fn serialize(&self,
self_addr: u32,
serializer: &mut PostingsSerializer,
heap: &Heap)
-> io::Result<()>;
fn serialize(
&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()>;
}
/// Only records the doc ids
@@ -64,13 +65,14 @@ impl Recorder for NothingRecorder {
fn close_doc(&mut self, _heap: &Heap) {}
fn serialize(&self,
self_addr: u32,
serializer: &mut PostingsSerializer,
heap: &Heap)
-> io::Result<()> {
fn serialize(
&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()> {
for doc in self.stack.iter(self_addr, heap) {
try!(serializer.write_doc(doc, 0u32, &EMPTY_ARRAY));
serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)?;
}
Ok(())
}
@@ -116,21 +118,23 @@ impl Recorder for TermFrequencyRecorder {
}
fn serialize(&self,
self_addr: u32,
serializer: &mut PostingsSerializer,
heap: &Heap)
-> io::Result<()> {
fn serialize(
&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()> {
// the last document has not been closed...
// its term freq is self.current_tf.
let mut doc_iter = self.stack
.iter(self_addr, heap)
.chain(Some(self.current_tf).into_iter());
let mut doc_iter = self.stack.iter(self_addr, heap).chain(
Some(self.current_tf)
.into_iter(),
);
while let Some(doc) = doc_iter.next() {
let term_freq = doc_iter
.next()
.expect("The IndexWriter recorded a doc without a term freq.");
let term_freq = doc_iter.next().expect(
"The IndexWriter recorded a doc without a term freq.",
);
serializer.write_doc(doc, term_freq, &EMPTY_ARRAY)?;
}
Ok(())
@@ -171,11 +175,12 @@ impl Recorder for TFAndPositionRecorder {
self.stack.push(POSITION_END, heap);
}
fn serialize(&self,
self_addr: u32,
serializer: &mut PostingsSerializer,
heap: &Heap)
-> io::Result<()> {
fn serialize(
&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()> {
let mut doc_positions = Vec::with_capacity(100);
let mut positions_iter = self.stack.iter(self_addr, heap);
while let Some(doc) = positions_iter.next() {
@@ -189,7 +194,11 @@ impl Recorder for TFAndPositionRecorder {
prev_position = position;
}
}
try!(serializer.write_doc(doc, doc_positions.len() as u32, &doc_positions));
serializer.write_doc(
doc,
doc_positions.len() as u32,
&doc_positions,
)?;
}
Ok(())
}

View File

@@ -1,12 +1,65 @@
use compression::{NUM_DOCS_PER_BLOCK, BlockDecoder, VIntDecoder};
use compression::{COMPRESSION_BLOCK_SIZE, BlockDecoder, VIntDecoder, CompressedIntStream};
use DocId;
use postings::{Postings, FreqHandler, DocSet, HasLen, SkipResult};
use postings::{Postings, DocSet, HasLen, SkipResult};
use std::cmp;
use fastfield::DeleteBitSet;
use fst::Streamer;
use fastfield::DeleteBitSet;
use std::cell::UnsafeCell;
use directory::{SourceRead, ReadOnlySource};
const EMPTY_DATA: [u8; 0] = [0u8; 0];
const EMPTY_POSITIONS: [u32; 0] = [0u32; 0];
struct PositionComputer {
// store the amount of position int
// before reading positions.
//
// if none, position are already loaded in
// the positions vec.
position_to_skip: Option<usize>,
positions: Vec<u32>,
positions_stream: CompressedIntStream,
}
impl PositionComputer {
pub fn new(positions_stream: CompressedIntStream) -> PositionComputer {
PositionComputer {
position_to_skip: None,
positions: vec![],
positions_stream: positions_stream,
}
}
pub fn add_skip(&mut self, num_skip: usize) {
self.position_to_skip = Some(
self.position_to_skip
.map(|prev_skip| prev_skip + num_skip)
.unwrap_or(0),
);
}
pub fn positions(&mut self, term_freq: usize) -> &[u32] {
if let Some(num_skip) = self.position_to_skip {
self.positions.resize(term_freq, 0u32);
self.positions_stream.skip(num_skip);
self.positions_stream.read(&mut self.positions[..term_freq]);
let mut cum = 0u32;
for i in 0..term_freq as usize {
cum += self.positions[i];
self.positions[i] = cum;
}
self.position_to_skip = None;
}
&self.positions[..term_freq]
}
}
/// `SegmentPostings` represents the inverted list or postings associated to
@@ -14,42 +67,60 @@ const EMPTY_DATA: [u8; 0] = [0u8; 0];
///
/// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded.
/// Positions on the other hand, are optionally entirely decoded upfront.
pub struct SegmentPostings<'a> {
block_cursor: BlockSegmentPostings<'a>,
pub struct SegmentPostings {
block_cursor: BlockSegmentPostings,
cur: usize,
delete_bitset: DeleteBitSet,
position_computer: Option<UnsafeCell<PositionComputer>>,
}
impl<'a> SegmentPostings<'a> {
impl SegmentPostings {
/// Reads a Segment postings from an &[u8]
///
/// * `len` - number of document in the posting lists.
/// * `data` - data array. The complete data is not necessarily used.
/// * `freq_handler` - the freq handler is in charge of decoding
/// frequencies and/or positions
pub fn from_block_postings(segment_block_postings: BlockSegmentPostings<'a>,
delete_bitset: DeleteBitSet)
-> SegmentPostings<'a> {
pub fn from_block_postings(
segment_block_postings: BlockSegmentPostings,
delete_bitset: DeleteBitSet,
positions_stream_opt: Option<CompressedIntStream>,
) -> SegmentPostings {
let position_computer =
positions_stream_opt.map(|stream| UnsafeCell::new(PositionComputer::new(stream)));
SegmentPostings {
block_cursor: segment_block_postings,
cur: NUM_DOCS_PER_BLOCK, // cursor within the block
cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
delete_bitset: delete_bitset,
position_computer: position_computer,
}
}
/// Returns an empty segment postings object
pub fn empty() -> SegmentPostings<'static> {
pub fn empty() -> SegmentPostings {
let empty_block_cursor = BlockSegmentPostings::empty();
SegmentPostings {
block_cursor: empty_block_cursor,
delete_bitset: DeleteBitSet::empty(),
cur: NUM_DOCS_PER_BLOCK,
cur: COMPRESSION_BLOCK_SIZE,
position_computer: None,
}
}
fn position_add_skip<F: FnOnce() -> usize>(&self, num_skips_fn: F) {
if let Some(ref position_computer) = self.position_computer.as_ref() {
let num_skips = num_skips_fn();
unsafe {
(*position_computer.get()).add_skip(num_skips);
}
}
}
}
impl<'a> DocSet for SegmentPostings<'a> {
impl DocSet for SegmentPostings {
// goes to the next element.
// next needs to be called a first time to point to the correct element.
#[inline]
@@ -59,10 +130,11 @@ impl<'a> DocSet for SegmentPostings<'a> {
if self.cur >= self.block_cursor.block_len() {
self.cur = 0;
if !self.block_cursor.advance() {
self.cur = NUM_DOCS_PER_BLOCK;
self.cur = COMPRESSION_BLOCK_SIZE;
return false;
}
}
self.position_add_skip(|| self.term_freq() as usize);
if !self.delete_bitset.is_deleted(self.doc()) {
return true;
}
@@ -75,6 +147,10 @@ impl<'a> DocSet for SegmentPostings<'a> {
return SkipResult::End;
}
// in the following, thanks to the call to advance above,
// we know that the position is not loaded and we need
// to skip every doc_freq we cross.
// skip blocks until one that might contain the target
loop {
// check if we need to go to the next block
@@ -83,13 +159,26 @@ impl<'a> DocSet for SegmentPostings<'a> {
(block_docs[self.cur], block_docs[block_docs.len() - 1])
};
if target > last_doc_in_block {
// we add skip for the current term independantly,
// so that position_add_skip will decide if it should
// just set itself to Some(0) or effectively
// add the term freq.
//let num_skips: u32 = ;
self.position_add_skip(|| {
let freqs_skipped = &self.block_cursor.freqs()[self.cur..];
let sum_freq: u32 = freqs_skipped.iter().cloned().sum();
sum_freq as usize
});
if !self.block_cursor.advance() {
return SkipResult::End;
}
self.cur = 0;
} else {
if target < current_doc {
// We've overpassed the target after the first `advance` call
// We've passed the target after the first `advance` call
// or we're at the beginning of a block.
// Either way, we're on the first `DocId` greater than `target`
return SkipResult::OverStep;
@@ -135,6 +224,13 @@ impl<'a> DocSet for SegmentPostings<'a> {
// `doc` is now >= `target`
let doc = block_docs[start];
self.position_add_skip(|| {
let freqs_skipped = &self.block_cursor.freqs()[self.cur..start];
let sum_freqs: u32 = freqs_skipped.iter().sum();
sum_freqs as usize
});
self.cur = start;
if !self.delete_bitset.is_deleted(doc) {
@@ -156,31 +252,41 @@ impl<'a> DocSet for SegmentPostings<'a> {
self.len()
}
/// Return the current document's `DocId`.
#[inline]
fn doc(&self) -> DocId {
let docs = self.block_cursor.docs();
assert!(self.cur < docs.len(),
"Have you forgotten to call `.advance()` at least once before calling .doc().");
debug_assert!(
self.cur < docs.len(),
"Have you forgotten to call `.advance()` at least once before calling .doc()."
);
docs[self.cur]
}
}
impl<'a> HasLen for SegmentPostings<'a> {
impl HasLen for SegmentPostings {
fn len(&self) -> usize {
self.block_cursor.doc_freq()
}
}
impl<'a> Postings for SegmentPostings<'a> {
impl Postings for SegmentPostings {
fn term_freq(&self) -> u32 {
self.block_cursor.freq_handler().freq(self.cur)
self.block_cursor.freq(self.cur)
}
fn positions(&self) -> &[u32] {
self.block_cursor.freq_handler().positions(self.cur)
let term_freq = self.term_freq();
self.position_computer
.as_ref()
.map(|position_computer| unsafe {
(&mut *position_computer.get()).positions(term_freq as usize)
})
.unwrap_or(&EMPTY_POSITIONS[..])
}
}
/// `BlockSegmentPostings` is a cursor iterating over blocks
/// of documents.
///
@@ -188,28 +294,35 @@ impl<'a> Postings for SegmentPostings<'a> {
///
/// While it is useful for some very specific high-performance
/// use cases, you should prefer using `SegmentPostings` for most usage.
pub struct BlockSegmentPostings<'a> {
block_decoder: BlockDecoder,
pub struct BlockSegmentPostings {
doc_decoder: BlockDecoder,
freq_decoder: BlockDecoder,
has_freq: bool,
doc_freq: usize,
doc_offset: DocId,
num_binpacked_blocks: usize,
num_vint_docs: usize,
remaining_data: &'a [u8],
freq_handler: FreqHandler,
remaining_data: SourceRead,
}
impl<'a> BlockSegmentPostings<'a> {
pub(crate) fn from_data(doc_freq: usize,
data: &'a [u8],
freq_handler: FreqHandler)
-> BlockSegmentPostings<'a> {
let num_binpacked_blocks: usize = (doc_freq as usize) / NUM_DOCS_PER_BLOCK;
let num_vint_docs = (doc_freq as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks;
impl BlockSegmentPostings {
pub(crate) fn from_data(
doc_freq: usize,
data: SourceRead,
has_freq: bool,
) -> BlockSegmentPostings {
let num_binpacked_blocks: usize = (doc_freq as usize) / COMPRESSION_BLOCK_SIZE;
let num_vint_docs = (doc_freq as usize) - COMPRESSION_BLOCK_SIZE * num_binpacked_blocks;
BlockSegmentPostings {
num_binpacked_blocks: num_binpacked_blocks,
num_vint_docs: num_vint_docs,
block_decoder: BlockDecoder::new(),
freq_handler: freq_handler,
doc_decoder: BlockDecoder::new(),
freq_decoder: BlockDecoder::with_val(1),
has_freq: has_freq,
remaining_data: data,
doc_offset: 0,
doc_freq: doc_freq,
@@ -226,9 +339,9 @@ impl<'a> BlockSegmentPostings<'a> {
// # Warning
//
// This does not reset the positions list.
pub(crate) fn reset(&mut self, doc_freq: usize, postings_data: &'a [u8]) {
let num_binpacked_blocks: usize = doc_freq / NUM_DOCS_PER_BLOCK;
let num_vint_docs = doc_freq & (NUM_DOCS_PER_BLOCK - 1);
pub(crate) fn reset(&mut self, doc_freq: usize, postings_data: SourceRead) {
let num_binpacked_blocks: usize = doc_freq / COMPRESSION_BLOCK_SIZE;
let num_vint_docs = doc_freq & (COMPRESSION_BLOCK_SIZE - 1);
self.num_binpacked_blocks = num_binpacked_blocks;
self.num_vint_docs = num_vint_docs;
self.remaining_data = postings_data;
@@ -250,7 +363,25 @@ impl<'a> BlockSegmentPostings<'a> {
/// returned by `.docs()` is empty.
#[inline]
pub fn docs(&self) -> &[DocId] {
self.block_decoder.output_array()
self.doc_decoder.output_array()
}
/// Return the document at index `idx` of the block.
#[inline]
pub fn doc(&self, idx: usize) -> u32 {
self.doc_decoder.output(idx)
}
/// Return the array of `term freq` in the block.
#[inline]
pub fn freqs(&self) -> &[u32] {
self.freq_decoder.output_array()
}
/// Return the frequency at index `idx` of the block.
#[inline]
pub fn freq(&self, idx: usize) -> u32 {
self.freq_decoder.output(idx)
}
/// Returns the length of the current block.
@@ -260,13 +391,7 @@ impl<'a> BlockSegmentPostings<'a> {
/// of any number between 1 and `NUM_DOCS_PER_BLOCK - 1`
#[inline]
fn block_len(&self) -> usize {
self.block_decoder.output_len
}
/// Returns a reference to the frequency handler.
pub fn freq_handler(&self) -> &FreqHandler {
&self.freq_handler
self.doc_decoder.output_len
}
/// Advance to the next block.
@@ -274,21 +399,35 @@ impl<'a> BlockSegmentPostings<'a> {
/// Returns false iff there was no remaining blocks.
pub fn advance(&mut self) -> bool {
if self.num_binpacked_blocks > 0 {
self.remaining_data =
self.block_decoder
.uncompress_block_sorted(self.remaining_data, self.doc_offset);
self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data);
self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1);
let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
self.remaining_data.as_ref(),
self.doc_offset,
);
self.remaining_data.advance(num_consumed_bytes);
if self.has_freq {
let num_consumed_bytes = self.freq_decoder.uncompress_block_unsorted(
self.remaining_data.as_ref(),
);
self.remaining_data.advance(num_consumed_bytes);
}
// it will be used as the next offset.
self.doc_offset = self.doc_decoder.output(COMPRESSION_BLOCK_SIZE - 1);
self.num_binpacked_blocks -= 1;
true
} else if self.num_vint_docs > 0 {
self.remaining_data =
self.block_decoder
.uncompress_vint_sorted(self.remaining_data,
self.doc_offset,
self.num_vint_docs);
self.freq_handler
.read_freq_vint(self.remaining_data, self.num_vint_docs);
let num_compressed_bytes = self.doc_decoder.uncompress_vint_sorted(
self.remaining_data.as_ref(),
self.doc_offset,
self.num_vint_docs,
);
self.remaining_data.advance(num_compressed_bytes);
if self.has_freq {
self.freq_decoder.uncompress_vint_unsorted(
self.remaining_data.as_ref(),
self.num_vint_docs,
);
}
self.num_vint_docs = 0;
true
} else {
@@ -297,20 +436,23 @@ impl<'a> BlockSegmentPostings<'a> {
}
/// Returns an empty segment postings object
pub fn empty() -> BlockSegmentPostings<'static> {
pub fn empty() -> BlockSegmentPostings {
BlockSegmentPostings {
num_binpacked_blocks: 0,
num_vint_docs: 0,
block_decoder: BlockDecoder::new(),
freq_handler: FreqHandler::new_without_freq(),
remaining_data: &EMPTY_DATA,
doc_decoder: BlockDecoder::new(),
freq_decoder: BlockDecoder::with_val(1),
has_freq: false,
remaining_data: From::from(ReadOnlySource::empty()),
doc_offset: 0,
doc_freq: 0,
}
}
}
impl<'a, 'b> Streamer<'b> for BlockSegmentPostings<'a> {
impl<'b> Streamer<'b> for BlockSegmentPostings {
type Item = &'b [DocId];
fn next(&'b mut self) -> Option<&'b [DocId]> {
@@ -366,11 +508,13 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(int_field);
let term = Term::from_field_u64(int_field, 0u64);
let term_info = segment_reader.get_term_info(&term).unwrap();
let mut block_segments =
segment_reader
.read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq);
let term_info = inverted_index.get_term_info(&term).unwrap();
let mut block_segments = inverted_index.read_block_postings_from_terminfo(
&term_info,
SegmentPostingsOption::NoFreq,
);
let mut offset: u32 = 0u32;
// checking that the block before calling advance is empty
assert!(block_segments.docs().is_empty());
@@ -406,17 +550,20 @@ mod tests {
let mut block_segments;
{
let term = Term::from_field_u64(int_field, 0u64);
let term_info = segment_reader.get_term_info(&term).unwrap();
block_segments =
segment_reader
.read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq);
let inverted_index = segment_reader.inverted_index(int_field);
let term_info = inverted_index.get_term_info(&term).unwrap();
block_segments = inverted_index.read_block_postings_from_terminfo(
&term_info,
SegmentPostingsOption::NoFreq,
);
}
assert!(block_segments.advance());
assert!(block_segments.docs() == &[0, 2, 4]);
{
let term = Term::from_field_u64(int_field, 1u64);
let term_info = segment_reader.get_term_info(&term).unwrap();
segment_reader.reset_block_postings_from_terminfo(&term_info, &mut block_segments);
let inverted_index = segment_reader.inverted_index(int_field);
let term_info = inverted_index.get_term_info(&term).unwrap();
inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments);
}
assert!(block_segments.advance());
assert!(block_segments.docs() == &[1, 3, 5]);

View File

@@ -16,6 +16,26 @@ pub enum SegmentPostingsOption {
FreqAndPositions,
}
impl SegmentPostingsOption {
/// Returns true iff this option includes encoding
/// term frequencies.
pub fn has_freq(&self) -> bool {
match *self {
SegmentPostingsOption::NoFreq => false,
_ => true,
}
}
/// Returns true iff this option include encoding
/// term positions.
pub fn has_positions(&self) -> bool {
match *self {
SegmentPostingsOption::FreqAndPositions => true,
_ => false,
}
}
}
#[cfg(test)]
mod tests {

View File

@@ -5,16 +5,14 @@ use schema::Field;
use schema::FieldEntry;
use schema::FieldType;
use schema::Schema;
use schema::TextIndexingOptions;
use directory::WritePtr;
use compression::{NUM_DOCS_PER_BLOCK, BlockEncoder, CompositeEncoder};
use compression::{COMPRESSION_BLOCK_SIZE, BlockEncoder};
use DocId;
use core::Segment;
use std::io::{self, Write};
use compression::VIntEncoder;
use common::VInt;
use common::BinarySerializable;
use common::CountingWriter;
use common::CompositeWrite;
use termdict::TermDictionaryBuilder;
@@ -49,74 +47,127 @@ use termdict::TermDictionaryBuilder;
///
/// A description of the serialization format is
/// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html).
pub struct PostingsSerializer {
terms_fst_builder: TermDictionaryBuilderImpl<WritePtr, TermInfo>,
postings_write: CountingWriter<WritePtr>,
positions_write: CountingWriter<WritePtr>,
last_doc_id_encoded: u32,
positions_encoder: CompositeEncoder,
block_encoder: BlockEncoder,
doc_ids: Vec<DocId>,
term_freqs: Vec<u32>,
position_deltas: Vec<u32>,
pub struct InvertedIndexSerializer {
terms_write: CompositeWrite<WritePtr>,
postings_write: CompositeWrite<WritePtr>,
positions_write: CompositeWrite<WritePtr>,
schema: Schema,
text_indexing_options: TextIndexingOptions,
term_open: bool,
current_term_info: TermInfo,
}
impl PostingsSerializer {
impl InvertedIndexSerializer {
/// Open a new `PostingsSerializer` for the given segment
pub fn new(terms_write: WritePtr,
postings_write: WritePtr,
positions_write: WritePtr,
schema: Schema)
-> Result<PostingsSerializer> {
let terms_fst_builder = try!(TermDictionaryBuilderImpl::new(terms_write));
Ok(PostingsSerializer {
terms_fst_builder: terms_fst_builder,
postings_write: CountingWriter::wrap(postings_write),
positions_write: CountingWriter::wrap(positions_write),
last_doc_id_encoded: 0u32,
positions_encoder: CompositeEncoder::new(),
block_encoder: BlockEncoder::new(),
doc_ids: Vec::new(),
term_freqs: Vec::new(),
position_deltas: Vec::new(),
schema: schema,
text_indexing_options: TextIndexingOptions::Unindexed,
term_open: false,
current_term_info: TermInfo::default(),
})
fn new(
terms_write: CompositeWrite<WritePtr>,
postings_write: CompositeWrite<WritePtr>,
positions_write: CompositeWrite<WritePtr>,
schema: Schema,
) -> Result<InvertedIndexSerializer> {
Ok(InvertedIndexSerializer {
terms_write: terms_write,
postings_write: postings_write,
positions_write: positions_write,
schema: schema,
})
}
/// Open a new `PostingsSerializer` for the given segment
pub fn open(segment: &mut Segment) -> Result<PostingsSerializer> {
pub fn open(segment: &mut Segment) -> Result<InvertedIndexSerializer> {
use SegmentComponent::{TERMS, POSTINGS, POSITIONS};
PostingsSerializer::new(segment.open_write(TERMS)?,
segment.open_write(POSTINGS)?,
segment.open_write(POSITIONS)?,
segment.schema())
InvertedIndexSerializer::new(
CompositeWrite::wrap(segment.open_write(TERMS)?),
CompositeWrite::wrap(segment.open_write(POSTINGS)?),
CompositeWrite::wrap(segment.open_write(POSITIONS)?),
segment.schema(),
)
}
/// Must be called before starting pushing terms of
/// a given field.
///
/// Loads the indexing options for the given field.
pub fn new_field(&mut self, field: Field) {
pub fn new_field(&mut self, field: Field) -> io::Result<FieldSerializer> {
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
self.text_indexing_options = match *field_entry.field_type() {
FieldType::Str(ref text_options) => text_options.get_indexing_options(),
FieldType::U64(ref int_options) |
FieldType::I64(ref int_options) => {
if int_options.is_indexed() {
TextIndexingOptions::Unindexed
} else {
TextIndexingOptions::Untokenized
}
let term_dictionary_write = self.terms_write.for_field(field);
let postings_write = self.postings_write.for_field(field);
let positions_write = self.positions_write.for_field(field);
FieldSerializer::new(
field_entry.field_type().clone(),
term_dictionary_write,
postings_write,
positions_write,
)
}
/// Closes the serializer.
pub fn close(self) -> io::Result<()> {
self.terms_write.close()?;
self.postings_write.close()?;
self.positions_write.close()?;
Ok(())
}
}
/// The field serializer is in charge of
/// the serialization of a specific field.
pub struct FieldSerializer<'a> {
term_dictionary_builder: TermDictionaryBuilderImpl<&'a mut CountingWriter<WritePtr>>,
postings_serializer: PostingsSerializer<&'a mut CountingWriter<WritePtr>>,
positions_serializer_opt: Option<PositionSerializer<&'a mut CountingWriter<WritePtr>>>,
current_term_info: TermInfo,
term_open: bool,
}
impl<'a> FieldSerializer<'a> {
fn new(
field_type: FieldType,
term_dictionary_write: &'a mut CountingWriter<WritePtr>,
postings_write: &'a mut CountingWriter<WritePtr>,
positions_write: &'a mut CountingWriter<WritePtr>,
) -> io::Result<FieldSerializer<'a>> {
let (term_freq_enabled, position_enabled): (bool, bool) = match field_type {
FieldType::Str(ref text_options) => {
let text_indexing_options = text_options.get_indexing_options();
(
text_indexing_options.is_termfreq_enabled(),
text_indexing_options.is_position_enabled(),
)
}
_ => (false, false),
};
let term_dictionary_builder =
TermDictionaryBuilderImpl::new(term_dictionary_write, field_type)?;
let postings_serializer = PostingsSerializer::new(postings_write, term_freq_enabled);
let positions_serializer_opt = if position_enabled {
Some(PositionSerializer::new(positions_write))
} else {
None
};
Ok(FieldSerializer {
term_dictionary_builder: term_dictionary_builder,
postings_serializer: postings_serializer,
positions_serializer_opt: positions_serializer_opt,
current_term_info: TermInfo::default(),
term_open: false,
})
}
fn current_term_info(&self) -> TermInfo {
let (filepos, offset) = self.positions_serializer_opt
.as_ref()
.map(|positions_serializer| positions_serializer.addr())
.unwrap_or((0u32, 0u8));
TermInfo {
doc_freq: 0,
postings_offset: self.postings_serializer.addr(),
positions_offset: filepos,
positions_inner_offset: offset,
}
}
/// Starts the postings for a new term.
@@ -124,70 +175,16 @@ impl PostingsSerializer {
/// to the lexicographical order.
/// * doc_freq - return the number of document containing the term.
pub fn new_term(&mut self, term: &[u8]) -> io::Result<()> {
if self.term_open {
panic!("Called new_term, while the previous term was not closed.");
}
assert!(
!self.term_open,
"Called new_term, while the previous term was not closed."
);
self.term_open = true;
self.doc_ids.clear();
self.last_doc_id_encoded = 0;
self.term_freqs.clear();
self.position_deltas.clear();
self.current_term_info = TermInfo {
doc_freq: 0,
postings_offset: self.postings_write.written_bytes() as u32,
positions_offset: self.positions_write.written_bytes() as u32,
};
self.terms_fst_builder.insert_key(term)
self.postings_serializer.clear();
self.current_term_info = self.current_term_info();
self.term_dictionary_builder.insert_key(term)
}
/// Finish the serialization for this term postings.
///
/// If the current block is incomplete, it need to be encoded
/// using `VInt` encoding.
pub fn close_term(&mut self) -> io::Result<()> {
if self.term_open {
self.terms_fst_builder
.insert_value(&self.current_term_info)?;
if !self.doc_ids.is_empty() {
// we have doc ids waiting to be written
// this happens when the number of doc ids is
// not a perfect multiple of our block size.
//
// In that case, the remaining part is encoded
// using variable int encoding.
{
let block_encoded =
self.block_encoder
.compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
self.postings_write.write_all(block_encoded)?;
self.doc_ids.clear();
}
// ... Idem for term frequencies
if self.text_indexing_options.is_termfreq_enabled() {
let block_encoded = self.block_encoder
.compress_vint_unsorted(&self.term_freqs[..]);
self.postings_write.write_all(block_encoded)?;
self.term_freqs.clear();
}
}
// On the other hand, positions are entirely buffered until the
// end of the term, at which point they are compressed and written.
if self.text_indexing_options.is_position_enabled() {
let posdelta_len = VInt(self.position_deltas.len() as u64);
posdelta_len.serialize(&mut self.positions_write)?;
let positions_encoded: &[u8] = self.positions_encoder
.compress_unsorted(&self.position_deltas[..]);
self.positions_write.write_all(positions_encoded)?;
self.position_deltas.clear();
}
self.term_open = false;
}
Ok(())
}
/// Serialize the information that a document contains the current term,
/// its term frequency, and the position deltas.
///
@@ -197,32 +194,93 @@ impl PostingsSerializer {
///
/// Term frequencies and positions may be ignored by the serializer depending
/// on the configuration of the field in the `Schema`.
pub fn write_doc(&mut self,
doc_id: DocId,
term_freq: u32,
position_deltas: &[u32])
-> io::Result<()> {
pub fn write_doc(
&mut self,
doc_id: DocId,
term_freq: u32,
position_deltas: &[u32],
) -> io::Result<()> {
self.current_term_info.doc_freq += 1;
self.postings_serializer.write_doc(doc_id, term_freq)?;
if let Some(ref mut positions_serializer) = self.positions_serializer_opt.as_mut() {
positions_serializer.write(position_deltas)?;
}
Ok(())
}
/// Finish the serialization for this term postings.
///
/// If the current block is incomplete, it need to be encoded
/// using `VInt` encoding.
pub fn close_term(&mut self) -> io::Result<()> {
if self.term_open {
self.term_dictionary_builder.insert_value(
&self.current_term_info,
)?;
self.postings_serializer.close_term()?;
self.term_open = false;
}
Ok(())
}
/// Closes the current current field.
pub fn close(mut self) -> io::Result<()> {
self.close_term()?;
if let Some(positions_serializer) = self.positions_serializer_opt {
positions_serializer.close()?;
}
self.postings_serializer.close()?;
self.term_dictionary_builder.finish()?;
Ok(())
}
}
struct PostingsSerializer<W: Write> {
postings_write: CountingWriter<W>,
last_doc_id_encoded: u32,
block_encoder: BlockEncoder,
doc_ids: Vec<DocId>,
term_freqs: Vec<u32>,
termfreq_enabled: bool,
}
impl<W: Write> PostingsSerializer<W> {
fn new(write: W, termfreq_enabled: bool) -> PostingsSerializer<W> {
PostingsSerializer {
postings_write: CountingWriter::wrap(write),
block_encoder: BlockEncoder::new(),
doc_ids: vec![],
term_freqs: vec![],
last_doc_id_encoded: 0u32,
termfreq_enabled: termfreq_enabled,
}
}
fn write_doc(&mut self, doc_id: DocId, term_freq: u32) -> io::Result<()> {
self.doc_ids.push(doc_id);
if self.text_indexing_options.is_termfreq_enabled() {
if self.termfreq_enabled {
self.term_freqs.push(term_freq as u32);
}
if self.text_indexing_options.is_position_enabled() {
self.position_deltas.extend_from_slice(position_deltas);
}
if self.doc_ids.len() == NUM_DOCS_PER_BLOCK {
if self.doc_ids.len() == COMPRESSION_BLOCK_SIZE {
{
// encode the doc ids
let block_encoded: &[u8] =
self.block_encoder
.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
let block_encoded: &[u8] = self.block_encoder.compress_block_sorted(
&self.doc_ids,
self.last_doc_id_encoded,
);
self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1];
self.postings_write.write_all(block_encoded)?;
}
if self.text_indexing_options.is_termfreq_enabled() {
if self.termfreq_enabled {
// encode the term_freqs
let block_encoded: &[u8] = self.block_encoder
.compress_block_unsorted(&self.term_freqs);
let block_encoded: &[u8] =
self.block_encoder.compress_block_unsorted(&self.term_freqs);
self.postings_write.write_all(block_encoded)?;
self.term_freqs.clear();
}
@@ -231,12 +289,93 @@ impl PostingsSerializer {
Ok(())
}
/// Closes the serializer.
pub fn close(mut self) -> io::Result<()> {
try!(self.close_term());
try!(self.terms_fst_builder.finish());
try!(self.postings_write.flush());
try!(self.positions_write.flush());
fn close_term(&mut self) -> io::Result<()> {
if !self.doc_ids.is_empty() {
// we have doc ids waiting to be written
// this happens when the number of doc ids is
// not a perfect multiple of our block size.
//
// In that case, the remaining part is encoded
// using variable int encoding.
{
let block_encoded = self.block_encoder.compress_vint_sorted(
&self.doc_ids,
self.last_doc_id_encoded,
);
self.postings_write.write_all(block_encoded)?;
self.doc_ids.clear();
}
// ... Idem for term frequencies
if self.termfreq_enabled {
let block_encoded = self.block_encoder.compress_vint_unsorted(
&self.term_freqs[..],
);
self.postings_write.write_all(block_encoded)?;
self.term_freqs.clear();
}
}
Ok(())
}
fn close(mut self) -> io::Result<()> {
self.postings_write.flush()
}
fn addr(&self) -> u32 {
self.postings_write.written_bytes() as u32
}
fn clear(&mut self) {
self.doc_ids.clear();
self.term_freqs.clear();
self.last_doc_id_encoded = 0;
}
}
struct PositionSerializer<W: Write> {
buffer: Vec<u32>,
write: CountingWriter<W>, // See if we can offset the original counting writer.
block_encoder: BlockEncoder,
}
impl<W: Write> PositionSerializer<W> {
fn new(write: W) -> PositionSerializer<W> {
PositionSerializer {
buffer: Vec::with_capacity(COMPRESSION_BLOCK_SIZE),
write: CountingWriter::wrap(write),
block_encoder: BlockEncoder::new(),
}
}
fn addr(&self) -> (u32, u8) {
(self.write.written_bytes() as u32, self.buffer.len() as u8)
}
fn write_block(&mut self) -> io::Result<()> {
assert_eq!(self.buffer.len(), COMPRESSION_BLOCK_SIZE);
let block_compressed: &[u8] = self.block_encoder.compress_block_unsorted(&self.buffer);
self.write.write_all(block_compressed)?;
self.buffer.clear();
Ok(())
}
fn write(&mut self, mut vals: &[u32]) -> io::Result<()> {
let mut buffer_len = self.buffer.len();
while vals.len() + buffer_len >= COMPRESSION_BLOCK_SIZE {
let len_to_completion = COMPRESSION_BLOCK_SIZE - buffer_len;
self.buffer.extend_from_slice(&vals[..len_to_completion]);
self.write_block()?;
vals = &vals[len_to_completion..];
buffer_len = self.buffer.len();
}
self.buffer.extend_from_slice(&vals);
Ok(())
}
fn close(mut self) -> io::Result<()> {
self.buffer.resize(COMPRESSION_BLOCK_SIZE, 0u32);
self.write_block()?;
self.write.flush()
}
}

View File

@@ -12,7 +12,7 @@ use std::io;
/// * `postings_offset` : an offset in the `.idx` file
/// addressing the start of the posting list associated
/// to this term.
#[derive(Debug,Default,Ord,PartialOrd,Eq,PartialEq,Clone)]
#[derive(Debug, Default, Ord, PartialOrd, Eq, PartialEq, Clone)]
pub struct TermInfo {
/// Number of documents in the segment containing the term
pub doc_freq: u32,
@@ -20,6 +20,8 @@ pub struct TermInfo {
pub postings_offset: u32,
/// Offset within the position (`.pos`) file.
pub positions_offset: u32,
/// Offset within the position block.
pub positions_inner_offset: u8,
}
@@ -27,17 +29,20 @@ impl BinarySerializable for TermInfo {
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
self.doc_freq.serialize(writer)?;
self.postings_offset.serialize(writer)?;
self.positions_offset.serialize(writer)
self.positions_offset.serialize(writer)?;
self.positions_inner_offset.serialize(writer)
}
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let doc_freq = try!(u32::deserialize(reader));
let postings_offset = try!(u32::deserialize(reader));
let positions_offset = try!(u32::deserialize(reader));
let doc_freq = u32::deserialize(reader)?;
let postings_offset = u32::deserialize(reader)?;
let positions_offset = u32::deserialize(reader)?;
let positions_inner_offset = u8::deserialize(reader)?;
Ok(TermInfo {
doc_freq: doc_freq,
postings_offset: postings_offset,
positions_offset: positions_offset,
})
doc_freq: doc_freq,
postings_offset: postings_offset,
positions_offset: positions_offset,
positions_inner_offset: positions_inner_offset,
})
}
}

View File

@@ -37,10 +37,12 @@ impl Query for BooleanQuery {
}
fn weight(&self, searcher: &Searcher) -> Result<Box<Weight>> {
let sub_weights = try!(self.subqueries
.iter()
.map(|&(ref _occur, ref subquery)| subquery.weight(searcher))
.collect());
let sub_weights = try!(
self.subqueries
.iter()
.map(|&(ref _occur, ref subquery)| subquery.weight(searcher))
.collect()
);
let occurs: Vec<Occur> = self.subqueries
.iter()
.map(|&(ref occur, ref _subquery)| *occur)
@@ -57,10 +59,9 @@ impl BooleanQuery {
let occur_term_queries: Vec<(Occur, Box<Query>)> = terms
.into_iter()
.map(|term| {
let term_query: Box<Query> = box TermQuery::new(term,
SegmentPostingsOption::Freq);
(Occur::Should, term_query)
})
let term_query: Box<Query> = box TermQuery::new(term, SegmentPostingsOption::Freq);
(Occur::Should, term_query)
})
.collect();
BooleanQuery::from(occur_term_queries)
}

View File

@@ -55,11 +55,11 @@ impl<TScorer: Scorer> BooleanScorer<TScorer> {
.map(|posting| posting.doc())
.enumerate()
.map(|(ord, doc)| {
HeapItem {
doc: doc,
ord: ord as u32,
}
})
HeapItem {
doc: doc,
ord: ord as u32,
}
})
.collect();
BooleanScorer {
scorers: non_empty_scorers,

View File

@@ -22,11 +22,12 @@ impl BooleanWeight {
impl Weight for BooleanWeight {
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
let sub_scorers: Vec<Box<Scorer + 'a>> =
try!(self.weights
.iter()
.map(|weight| weight.scorer(reader))
.collect());
let sub_scorers: Vec<Box<Scorer + 'a>> = try!(
self.weights
.iter()
.map(|weight| weight.scorer(reader))
.collect()
);
let boolean_scorer = BooleanScorer::new(sub_scorers, self.occur_filter);
Ok(box boolean_scorer)
}

View File

@@ -64,8 +64,10 @@ mod tests {
}
let make_term_query = |text: &str| {
let term_query = TermQuery::new(Term::from_field_text(text_field, text),
SegmentPostingsOption::NoFreq);
let term_query = TermQuery::new(
Term::from_field_text(text_field, text),
SegmentPostingsOption::NoFreq,
);
let query: Box<Query> = box term_query;
query
};
@@ -87,19 +89,25 @@ mod tests {
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]);
}
{
let boolean_query = BooleanQuery::from(vec![(Occur::Should, make_term_query("a")),
(Occur::Should, make_term_query("b"))]);
let boolean_query = BooleanQuery::from(vec![
(Occur::Should, make_term_query("a")),
(Occur::Should, make_term_query("b")),
]);
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 2, 3]);
}
{
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")),
(Occur::Should, make_term_query("b"))]);
let boolean_query = BooleanQuery::from(vec![
(Occur::Must, make_term_query("a")),
(Occur::Should, make_term_query("b")),
]);
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]);
}
{
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")),
(Occur::Should, make_term_query("b")),
(Occur::MustNot, make_term_query("d"))]);
let boolean_query = BooleanQuery::from(vec![
(Occur::Must, make_term_query("a")),
(Occur::Should, make_term_query("b")),
(Occur::MustNot, make_term_query("d")),
]);
assert_eq!(matching_docs(&boolean_query), vec![0, 1]);
}
{

View File

@@ -61,9 +61,9 @@ mod tests {
.map(|text| Term::from_field_text(text_field, text))
.collect();
let phrase_query = PhraseQuery::from(terms);
searcher
.search(&phrase_query, &mut test_collector)
.expect("search should succeed");
searcher.search(&phrase_query, &mut test_collector).expect(
"search should succeed",
);
test_collector.docs()
};

View File

@@ -5,12 +5,12 @@ use postings::Postings;
use postings::IntersectionDocSet;
use DocId;
pub struct PhraseScorer<'a> {
pub intersection_docset: IntersectionDocSet<SegmentPostings<'a>>,
pub struct PhraseScorer {
pub intersection_docset: IntersectionDocSet<SegmentPostings>,
}
impl<'a> PhraseScorer<'a> {
impl PhraseScorer {
fn phrase_match(&self) -> bool {
let mut positions_arr: Vec<&[u32]> = self.intersection_docset
.docsets()
@@ -54,7 +54,7 @@ impl<'a> PhraseScorer<'a> {
}
}
impl<'a> DocSet for PhraseScorer<'a> {
impl DocSet for PhraseScorer {
fn advance(&mut self) -> bool {
while self.intersection_docset.advance() {
if self.phrase_match() {
@@ -74,7 +74,7 @@ impl<'a> DocSet for PhraseScorer<'a> {
}
impl<'a> Scorer for PhraseScorer<'a> {
impl Scorer for PhraseScorer {
fn score(&self) -> f32 {
1f32
}

View File

@@ -22,14 +22,17 @@ impl Weight for PhraseWeight {
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
let mut term_postings_list = Vec::new();
for term in &self.phrase_terms {
let inverted_index = reader.inverted_index(term.field());
let term_postings_option =
reader.read_postings(term, SegmentPostingsOption::FreqAndPositions);
inverted_index.read_postings(term, SegmentPostingsOption::FreqAndPositions);
if let Some(term_postings) = term_postings_option {
term_postings_list.push(term_postings);
} else {
return Ok(box EmptyScorer);
}
}
Ok(box PhraseScorer { intersection_docset: IntersectionDocSet::from(term_postings_list) })
Ok(box PhraseScorer {
intersection_docset: IntersectionDocSet::from(term_postings_list),
})
}
}

View File

@@ -61,10 +61,8 @@ pub trait Query: fmt::Debug {
/// - iterate throw the matched documents and push them to the collector.
///
fn search(&self, searcher: &Searcher, collector: &mut Collector) -> Result<TimerTree> {
let mut timer_tree = TimerTree::default();
let weight = try!(self.weight(searcher));
{
let mut search_timer = timer_tree.open("search");
for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() {

View File

@@ -3,7 +3,8 @@ use combine::char::*;
use super::user_input_ast::*;
fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
where I: Stream<Item = char>
where
I: Stream<Item = char>,
{
let term_val = || {
let word = many1(satisfy(|c: char| c.is_alphanumeric()));
@@ -11,27 +12,29 @@ fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
phrase.or(word)
};
let negative_numbers = (char('-'), many1(satisfy(|c: char| c.is_numeric())))
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
let negative_numbers = (char('-'), many1(satisfy(|c: char| c.is_numeric()))).map(
|(s1, s2): (char, String)| format!("{}{}", s1, s2),
);
let field = (letter(), many(satisfy(|c: char| c.is_alphanumeric() || c == '_')))
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
let field = (
letter(),
many(satisfy(|c: char| c.is_alphanumeric() || c == '_')),
).map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
let term_val_with_field = negative_numbers.or(term_val());
let term_query = (field, char(':'), term_val_with_field).map(|(field_name, _, phrase)| {
UserInputLiteral {
field_name:
Some(field_name),
phrase: phrase,
}
});
UserInputLiteral {
field_name: Some(field_name),
phrase: phrase,
}
});
let term_default_field = term_val().map(|phrase| {
UserInputLiteral {
field_name: None,
phrase: phrase,
}
});
UserInputLiteral {
field_name: None,
phrase: phrase,
}
});
try(term_query)
.or(term_default_field)
.map(UserInputAST::from)
@@ -40,25 +43,29 @@ fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
fn leaf<I>(input: I) -> ParseResult<UserInputAST, I>
where I: Stream<Item = char>
where
I: Stream<Item = char>,
{
(char('-'), parser(literal))
.map(|(_, expr)| UserInputAST::Not(box expr))
.or((char('+'), parser(literal)).map(|(_, expr)| UserInputAST::Must(box expr)))
.or((char('+'), parser(literal)).map(|(_, expr)| {
UserInputAST::Must(box expr)
}))
.or(parser(literal))
.parse_stream(input)
}
pub fn parse_to_ast<I>(input: I) -> ParseResult<UserInputAST, I>
where I: Stream<Item = char>
where
I: Stream<Item = char>,
{
sep_by(parser(leaf), spaces())
.map(|subqueries: Vec<UserInputAST>| if subqueries.len() == 1 {
subqueries.into_iter().next().unwrap()
} else {
UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect())
})
subqueries.into_iter().next().unwrap()
} else {
UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect())
})
.parse_stream(input)
}

View File

@@ -117,20 +117,22 @@ impl QueryParser {
/// Parse the user query into an AST.
fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAST, QueryParserError> {
let (user_input_ast, _remaining) = parse_to_ast(query)
.map_err(|_| QueryParserError::SyntaxError)?;
let (user_input_ast, _remaining) = parse_to_ast(query).map_err(
|_| QueryParserError::SyntaxError,
)?;
self.compute_logical_ast(user_input_ast)
}
fn resolve_field_name(&self, field_name: &str) -> Result<Field, QueryParserError> {
self.schema
.get_field(field_name)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(String::from(field_name)))
self.schema.get_field(field_name).ok_or_else(|| {
QueryParserError::FieldDoesNotExist(String::from(field_name))
})
}
fn compute_logical_ast(&self,
user_input_ast: UserInputAST)
-> Result<LogicalAST, QueryParserError> {
fn compute_logical_ast(
&self,
user_input_ast: UserInputAST,
) -> Result<LogicalAST, QueryParserError> {
let (occur, ast) = self.compute_logical_ast_with_occur(user_input_ast)?;
if occur == Occur::MustNot {
return Err(QueryParserError::AllButQueryForbidden);
@@ -138,10 +140,11 @@ impl QueryParser {
Ok(ast)
}
fn compute_logical_ast_for_leaf(&self,
field: Field,
phrase: &str)
-> Result<Option<LogicalLiteral>, QueryParserError> {
fn compute_logical_ast_for_leaf(
&self,
field: Field,
phrase: &str,
) -> Result<Option<LogicalLiteral>, QueryParserError> {
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
@@ -174,7 +177,9 @@ impl QueryParser {
if terms.is_empty() {
Ok(None)
} else if terms.len() == 1 {
Ok(Some(LogicalLiteral::Term(terms.into_iter().next().unwrap())))
Ok(Some(
LogicalLiteral::Term(terms.into_iter().next().unwrap()),
))
} else {
Ok(Some(LogicalLiteral::Phrase(terms)))
}
@@ -191,18 +196,24 @@ impl QueryParser {
}
}
fn compute_logical_ast_with_occur(&self,
user_input_ast: UserInputAST)
-> Result<(Occur, LogicalAST), QueryParserError> {
fn compute_logical_ast_with_occur(
&self,
user_input_ast: UserInputAST,
) -> Result<(Occur, LogicalAST), QueryParserError> {
match user_input_ast {
UserInputAST::Clause(sub_queries) => {
let default_occur = self.default_occur();
let logical_sub_queries: Vec<(Occur, LogicalAST)> = try!(sub_queries.into_iter()
.map(|sub_query| self.compute_logical_ast_with_occur(*sub_query))
.map(|res| {
res.map(|(occur, sub_ast)| (compose_occur(default_occur, occur), sub_ast))
})
.collect());
let logical_sub_queries: Vec<(Occur, LogicalAST)> = try!(
sub_queries
.into_iter()
.map(|sub_query| self.compute_logical_ast_with_occur(*sub_query))
.map(|res| {
res.map(|(occur, sub_ast)| {
(compose_occur(default_occur, occur), sub_ast)
})
})
.collect()
);
Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries)))
}
UserInputAST::Not(subquery) => {
@@ -320,9 +331,10 @@ mod test {
}
fn parse_query_to_logical_ast(query: &str,
default_conjunction: bool)
-> Result<LogicalAST, QueryParserError> {
fn parse_query_to_logical_ast(
query: &str,
default_conjunction: bool,
) -> Result<LogicalAST, QueryParserError> {
let mut query_parser = make_query_parser();
if default_conjunction {
query_parser.set_conjunction_by_default();
@@ -330,9 +342,11 @@ mod test {
query_parser.parse_query_to_logical_ast(query)
}
fn test_parse_query_to_logical_ast_helper(query: &str,
expected: &str,
default_conjunction: bool) {
fn test_parse_query_to_logical_ast_helper(
query: &str,
expected: &str,
default_conjunction: bool,
) {
let query = parse_query_to_logical_ast(query, default_conjunction).unwrap();
let query_str = format!("{:?}", query);
assert_eq!(query_str, expected);
@@ -358,21 +372,29 @@ mod test {
}
};
assert_eq!(is_not_indexed_err("notindexed_text:titi"),
Some(String::from("notindexed_text")));
assert_eq!(is_not_indexed_err("notindexed_u64:23424"),
Some(String::from("notindexed_u64")));
assert_eq!(is_not_indexed_err("notindexed_i64:-234324"),
Some(String::from("notindexed_i64")));
assert_eq!(
is_not_indexed_err("notindexed_text:titi"),
Some(String::from("notindexed_text"))
);
assert_eq!(
is_not_indexed_err("notindexed_u64:23424"),
Some(String::from("notindexed_u64"))
);
assert_eq!(
is_not_indexed_err("notindexed_i64:-234324"),
Some(String::from("notindexed_i64"))
);
}
#[test]
pub fn test_parse_query_untokenized() {
test_parse_query_to_logical_ast_helper("nottokenized:\"wordone wordtwo\"",
"Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, \
test_parse_query_to_logical_ast_helper(
"nottokenized:\"wordone wordtwo\"",
"Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, \
101, 32, 119, 111, 114, 100, 116, 119, 111])",
false);
false,
);
}
#[test]
@@ -381,82 +403,115 @@ mod test {
assert!(query_parser.parse_query("signed:2324").is_ok());
assert!(query_parser.parse_query("signed:\"22\"").is_ok());
assert!(query_parser.parse_query("signed:\"-2234\"").is_ok());
assert!(query_parser
.parse_query("signed:\"-9999999999999\"")
.is_ok());
assert!(
query_parser
.parse_query("signed:\"-9999999999999\"")
.is_ok()
);
assert!(query_parser.parse_query("signed:\"a\"").is_err());
assert!(query_parser.parse_query("signed:\"2a\"").is_err());
assert!(query_parser
.parse_query("signed:\"18446744073709551615\"")
.is_err());
assert!(
query_parser
.parse_query("signed:\"18446744073709551615\"")
.is_err()
);
assert!(query_parser.parse_query("unsigned:\"2\"").is_ok());
assert!(query_parser.parse_query("unsigned:\"-2\"").is_err());
assert!(query_parser
.parse_query("unsigned:\"18446744073709551615\"")
.is_ok());
test_parse_query_to_logical_ast_helper("unsigned:2324",
"Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",
false);
assert!(
query_parser
.parse_query("unsigned:\"18446744073709551615\"")
.is_ok()
);
test_parse_query_to_logical_ast_helper(
"unsigned:2324",
"Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",
false,
);
test_parse_query_to_logical_ast_helper("signed:-2324",
&format!("{:?}",
Term::from_field_i64(Field(2u32), -2324)),
false);
test_parse_query_to_logical_ast_helper(
"signed:-2324",
&format!("{:?}", Term::from_field_i64(Field(2u32), -2324)),
false,
);
}
#[test]
pub fn test_parse_query_to_ast_disjunction() {
test_parse_query_to_logical_ast_helper("title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
false);
test_parse_query_to_logical_ast_helper("+title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
false);
test_parse_query_to_logical_ast_helper("+title:toto -titi",
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
test_parse_query_to_logical_ast_helper(
"title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
false,
);
test_parse_query_to_logical_ast_helper(
"+title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
false,
);
test_parse_query_to_logical_ast_helper(
"+title:toto -titi",
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
-(Term([0, 0, 0, 0, 116, 105, 116, 105]) \
Term([0, 0, 0, 1, 116, 105, 116, 105])))",
false);
assert_eq!(parse_query_to_logical_ast("-title:toto", false)
.err()
.unwrap(),
QueryParserError::AllButQueryForbidden);
test_parse_query_to_logical_ast_helper("title:a b",
"(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) \
false,
);
assert_eq!(
parse_query_to_logical_ast("-title:toto", false)
.err()
.unwrap(),
QueryParserError::AllButQueryForbidden
);
test_parse_query_to_logical_ast_helper(
"title:a b",
"(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) \
Term([0, 0, 0, 1, 98])))",
false);
test_parse_query_to_logical_ast_helper("title:\"a b\"",
"\"[Term([0, 0, 0, 0, 97]), \
false,
);
test_parse_query_to_logical_ast_helper(
"title:\"a b\"",
"\"[Term([0, 0, 0, 0, 97]), \
Term([0, 0, 0, 0, 98])]\"",
false);
false,
);
}
#[test]
pub fn test_parse_query_to_ast_conjunction() {
test_parse_query_to_logical_ast_helper("title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
true);
test_parse_query_to_logical_ast_helper("+title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
true);
test_parse_query_to_logical_ast_helper("+title:toto -titi",
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
test_parse_query_to_logical_ast_helper(
"title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
true,
);
test_parse_query_to_logical_ast_helper(
"+title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
true,
);
test_parse_query_to_logical_ast_helper(
"+title:toto -titi",
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
-(Term([0, 0, 0, 0, 116, 105, 116, 105]) \
Term([0, 0, 0, 1, 116, 105, 116, 105])))",
true);
assert_eq!(parse_query_to_logical_ast("-title:toto", true)
.err()
.unwrap(),
QueryParserError::AllButQueryForbidden);
test_parse_query_to_logical_ast_helper("title:a b",
"(+Term([0, 0, 0, 0, 97]) \
true,
);
assert_eq!(
parse_query_to_logical_ast("-title:toto", true)
.err()
.unwrap(),
QueryParserError::AllButQueryForbidden
);
test_parse_query_to_logical_ast_helper(
"title:a b",
"(+Term([0, 0, 0, 0, 97]) \
+(Term([0, 0, 0, 0, 98]) \
Term([0, 0, 0, 1, 98])))",
true);
test_parse_query_to_logical_ast_helper("title:\"a b\"",
"\"[Term([0, 0, 0, 0, 97]), \
true,
);
test_parse_query_to_logical_ast_helper(
"title:\"a b\"",
"\"[Term([0, 0, 0, 0, 97]), \
Term([0, 0, 0, 0, 98])]\"",
true);
true,
);
}
}

View File

@@ -44,8 +44,10 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let term_query = TermQuery::new(Term::from_field_text(text_field, "a"),
SegmentPostingsOption::NoFreq);
let term_query = TermQuery::new(
Term::from_field_text(text_field, "a"),
SegmentPostingsOption::NoFreq,
);
let term_weight = term_query.weight(&searcher).unwrap();
let segment_reader = searcher.segment_reader(0);
let mut term_scorer = term_weight.scorer(segment_reader).unwrap();

View File

@@ -7,7 +7,8 @@ use postings::Postings;
use fastfield::FastFieldReader;
pub struct TermScorer<TPostings>
where TPostings: Postings
where
TPostings: Postings,
{
pub idf: Score,
pub fieldnorm_reader_opt: Option<U64FastFieldReader>,
@@ -15,7 +16,8 @@ pub struct TermScorer<TPostings>
}
impl<TPostings> TermScorer<TPostings>
where TPostings: Postings
where
TPostings: Postings,
{
pub fn postings(&self) -> &TPostings {
&self.postings
@@ -23,7 +25,8 @@ impl<TPostings> TermScorer<TPostings>
}
impl<TPostings> DocSet for TermScorer<TPostings>
where TPostings: Postings
where
TPostings: Postings,
{
fn advance(&mut self) -> bool {
self.postings.advance()
@@ -40,7 +43,8 @@ impl<TPostings> DocSet for TermScorer<TPostings>
}
impl<TPostings> Scorer for TermScorer<TPostings>
where TPostings: Postings
where
TPostings: Postings,
{
fn score(&self) -> Score {
let doc = self.postings.doc();

View File

@@ -27,24 +27,28 @@ impl TermWeight {
1.0 + (self.num_docs as f32 / (self.doc_freq as f32 + 1.0)).ln()
}
pub fn specialized_scorer<'a>(&'a self,
reader: &'a SegmentReader)
-> Result<TermScorer<SegmentPostings<'a>>> {
/// If the field is not found, returns an empty `DocSet`.
pub fn specialized_scorer(
&self,
reader: &SegmentReader,
) -> Result<TermScorer<SegmentPostings>> {
let field = self.term.field();
let inverted_index = reader.inverted_index(field);
let fieldnorm_reader_opt = reader.get_fieldnorms_reader(field);
Ok(reader
.read_postings(&self.term, self.segment_postings_options)
.map(|segment_postings| {
TermScorer {
idf: self.idf(),
fieldnorm_reader_opt: fieldnorm_reader_opt,
postings: segment_postings,
}
})
.unwrap_or(TermScorer {
idf: 1f32,
fieldnorm_reader_opt: None,
postings: SegmentPostings::empty(),
}))
let postings_opt: Option<SegmentPostings> =
inverted_index.read_postings(&self.term, self.segment_postings_options);
if let Some(segment_postings) = postings_opt {
Ok(TermScorer {
idf: self.idf(),
fieldnorm_reader_opt: fieldnorm_reader_opt,
postings: segment_postings,
})
} else {
Ok(TermScorer {
idf: 1f32,
fieldnorm_reader_opt: None,
postings: SegmentPostings::empty(),
})
}
}
}

View File

@@ -10,7 +10,7 @@ use common::BinarySerializable;
///
/// Because the field id is a `u8`, tantivy can only have at most `255` fields.
/// Value 255 is reserved.
#[derive(Copy, Clone, Debug, PartialEq,PartialOrd,Eq,Ord,Hash, Serialize, Deserialize)]
#[derive(Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash, Serialize, Deserialize)]
pub struct Field(pub u32);
impl BinarySerializable for Field {

View File

@@ -89,7 +89,8 @@ impl FieldEntry {
impl Serialize for FieldEntry {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer
where
S: Serializer,
{
let mut s = serializer.serialize_struct("field_entry", 3)?;
s.serialize_field("name", &self.name)?;
@@ -115,7 +116,8 @@ impl Serialize for FieldEntry {
impl<'de> Deserialize<'de> for FieldEntry {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de>
where
D: Deserializer<'de>,
{
#[derive(Deserialize)]
#[serde(field_identifier, rename_all = "lowercase")]
@@ -137,7 +139,8 @@ impl<'de> Deserialize<'de> for FieldEntry {
}
fn visit_map<V>(self, mut map: V) -> Result<FieldEntry, V::Error>
where V: MapAccess<'de>
where
V: MapAccess<'de>,
{
let mut name = None;
let mut ty = None;
@@ -187,13 +190,14 @@ impl<'de> Deserialize<'de> for FieldEntry {
let name = name.ok_or_else(|| de::Error::missing_field("name"))?;
ty.ok_or_else(|| de::Error::missing_field("ty"))?;
let field_type = field_type
.ok_or_else(|| de::Error::missing_field("options"))?;
let field_type = field_type.ok_or_else(
|| de::Error::missing_field("options"),
)?;
Ok(FieldEntry {
name: name,
field_type: field_type,
})
name: name,
field_type: field_type,
})
}
}

View File

@@ -80,8 +80,9 @@ impl FieldType {
FieldType::Str(_) => Ok(Value::Str(field_text.clone())),
FieldType::U64(_) |
FieldType::I64(_) => {
Err(ValueParsingError::TypeError(format!("Expected an integer, got {:?}",
json)))
Err(ValueParsingError::TypeError(
format!("Expected an integer, got {:?}", json),
))
}
}
}
@@ -110,9 +111,11 @@ impl FieldType {
}
}
_ => {
let msg = format!("Json value not supported error {:?}. Expected {:?}",
json,
self);
let msg = format!(
"Json value not supported error {:?}. Expected {:?}",
json,
self
);
Err(ValueParsingError::TypeError(msg))
}
}

View File

@@ -105,9 +105,9 @@ impl SchemaBuilder {
/// This will consume your `SchemaBuilder`
pub fn build(self) -> Schema {
Schema(Arc::new(InnerSchema {
fields: self.fields,
fields_map: self.fields_map,
}))
fields: self.fields,
fields_map: self.fields_map,
}))
}
}
@@ -206,15 +206,14 @@ impl Schema {
/// Build a document object from a json-object.
pub fn parse_document(&self, doc_json: &str) -> Result<Document, DocParsingError> {
let json_obj: JsonObject<String, JsonValue> = serde_json::from_str(doc_json)
.map_err(|_| {
let doc_json_sample: String = if doc_json.len() < 20 {
String::from(doc_json)
} else {
format!("{:?}...", &doc_json[0..20])
};
DocParsingError::NotJSON(doc_json_sample)
})?;
let json_obj: JsonObject<String, JsonValue> = serde_json::from_str(doc_json).map_err(|_| {
let doc_json_sample: String = if doc_json.len() < 20 {
String::from(doc_json)
} else {
format!("{:?}...", &doc_json[0..20])
};
DocParsingError::NotJSON(doc_json_sample)
})?;
let mut doc = Document::default();
for (field_name, json_value) in json_obj.iter() {
@@ -225,18 +224,15 @@ impl Schema {
match *json_value {
JsonValue::Array(ref json_items) => {
for json_item in json_items {
let value = try!(field_type
.value_from_json(json_item)
.map_err(|e| {
DocParsingError::ValueError(field_name.clone(), e)
}));
let value =
try!(field_type.value_from_json(json_item).map_err(|e| {
DocParsingError::ValueError(field_name.clone(), e)
}));
doc.add(FieldValue::new(field, value));
}
}
_ => {
let value = try!(field_type
.value_from_json(json_value)
.map_err(|e| {
let value = try!(field_type.value_from_json(json_value).map_err(|e| {
DocParsingError::ValueError(field_name.clone(), e)
}));
doc.add(FieldValue::new(field, value));
@@ -259,7 +255,8 @@ impl fmt::Debug for Schema {
impl Serialize for Schema {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer
where
S: Serializer,
{
let mut seq = serializer.serialize_seq(Some(self.0.fields.len()))?;
for e in &self.0.fields {
@@ -271,7 +268,8 @@ impl Serialize for Schema {
impl<'de> Deserialize<'de> for Schema {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de>
where
D: Deserializer<'de>,
{
struct SchemaVisitor;
@@ -283,7 +281,8 @@ impl<'de> Deserialize<'de> for Schema {
}
fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
where A: SeqAccess<'de>
where
A: SeqAccess<'de>,
{
let mut schema = SchemaBuilder {
fields: Vec::with_capacity(seq.size_hint().unwrap_or(0)),
@@ -430,12 +429,14 @@ mod tests {
}
{
let doc = schema
.parse_document(r#"{
.parse_document(
r#"{
"title": "my title",
"author": "fulmicoton",
"count": 4,
"popularity": 10
}"#)
}"#,
)
.unwrap();
assert_eq!(doc.get_first(title_field).unwrap().text(), "my title");
assert_eq!(doc.get_first(author_field).unwrap().text(), "fulmicoton");
@@ -443,13 +444,15 @@ mod tests {
assert_eq!(doc.get_first(popularity_field).unwrap().i64_value(), 10);
}
{
let json_err = schema.parse_document(r#"{
let json_err = schema.parse_document(
r#"{
"title": "my title",
"author": "fulmicoton",
"count": 4,
"popularity": 10,
"jambon": "bayonne"
}"#);
}"#,
);
match json_err {
Err(DocParsingError::NoSuchFieldInSchema(field_name)) => {
assert_eq!(field_name, "jambon");
@@ -460,13 +463,15 @@ mod tests {
}
}
{
let json_err = schema.parse_document(r#"{
let json_err = schema.parse_document(
r#"{
"title": "my title",
"author": "fulmicoton",
"count": "5",
"popularity": "10",
"jambon": "bayonne"
}"#);
}"#,
);
match json_err {
Err(DocParsingError::ValueError(_, ValueParsingError::TypeError(_))) => {
assert!(true);
@@ -477,12 +482,14 @@ mod tests {
}
}
{
let json_err = schema.parse_document(r#"{
let json_err = schema.parse_document(
r#"{
"title": "my title",
"author": "fulmicoton",
"count": -5,
"popularity": 10
}"#);
}"#,
);
match json_err {
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
assert!(true);
@@ -493,12 +500,14 @@ mod tests {
}
}
{
let json_err = schema.parse_document(r#"{
let json_err = schema.parse_document(
r#"{
"title": "my title",
"author": "fulmicoton",
"count": 9223372036854775808,
"popularity": 10
}"#);
}"#,
);
match json_err {
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
panic!("expected 9223372036854775808 to fit into u64, but it didn't");
@@ -509,12 +518,14 @@ mod tests {
}
}
{
let json_err = schema.parse_document(r#"{
let json_err = schema.parse_document(
r#"{
"title": "my title",
"author": "fulmicoton",
"count": 50,
"popularity": 9223372036854775808
}"#);
}"#,
);
match json_err {
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
assert!(true);
@@ -525,11 +536,13 @@ mod tests {
}
}
{
let json_err = schema.parse_document(r#"{
let json_err = schema.parse_document(
r#"{
"title": "my title",
"author": "fulmicoton",
"count": 50,
}"#);
}"#,
);
match json_err {
Err(NotJSON(_)) => {
assert!(true);

View File

@@ -13,7 +13,9 @@ const INT_TERM_LEN: usize = 4 + 8;
///
/// It actually wraps a `Vec<u8>`.
#[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
pub struct Term<B = Vec<u8>>(B) where B: AsRef<[u8]>;
pub struct Term<B = Vec<u8>>(B)
where
B: AsRef<[u8]>;
impl Term {
/// Builds a term given a field, and a u64-value
@@ -109,7 +111,8 @@ impl Term {
}
impl<B> Term<B>
where B: AsRef<[u8]>
where
B: AsRef<[u8]>,
{
/// Wraps a source of data
pub fn wrap(data: B) -> Term<B> {
@@ -166,7 +169,8 @@ impl<B> Term<B>
}
impl<B> AsRef<[u8]> for Term<B>
where B: AsRef<[u8]>
where
B: AsRef<[u8]>,
{
fn as_ref(&self) -> &[u8] {
self.0.as_ref()

View File

@@ -2,7 +2,7 @@ use std::ops::BitOr;
/// Define how a text field should be handled by tantivy.
#[derive(Clone,Debug,PartialEq,Eq, Serialize, Deserialize)]
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct TextOptions {
indexing: TextIndexingOptions,
stored: bool,
@@ -45,10 +45,10 @@ impl Default for TextOptions {
/// Describe how a field should be indexed
#[derive(Clone,Copy,Debug,PartialEq,PartialOrd,Eq,Hash, Serialize, Deserialize)]
#[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Hash, Serialize, Deserialize)]
pub enum TextIndexingOptions {
/// Unindexed fields will not generate any postings. They will not be searchable either.
#[serde(rename="unindexed")]
#[serde(rename = "unindexed")]
Unindexed,
/// Untokenized means that the field text will not be split into tokens before being indexed.
/// A field with the value "Hello world", will have the document suscribe to one single
@@ -56,23 +56,23 @@ pub enum TextIndexingOptions {
///
/// It will **not** be searchable if the user enter "hello" for instance.
/// This can be useful for tags, or ids for instance.
#[serde(rename="untokenized")]
#[serde(rename = "untokenized")]
Untokenized,
/// TokenizedNoFreq will tokenize the field value, and append the document doc id
/// to the posting lists associated to all of the tokens.
/// The frequence of appearance of the term in the document however will be lost.
/// The term frequency used in the TfIdf formula will always be 1.
#[serde(rename="tokenize")]
#[serde(rename = "tokenize")]
TokenizedNoFreq,
/// TokenizedWithFreq will tokenize the field value, and encode
/// both the docid and the term frequency in the posting lists associated to all
#[serde(rename="freq")]
#[serde(rename = "freq")]
TokenizedWithFreq,
/// Like TokenizedWithFreq, but also encodes the positions of the
/// terms in a separate file. This option is required for phrase queries.
/// Don't use this if you are certain you won't need it, the term positions file
/// can be very big.
#[serde(rename="position")]
#[serde(rename = "position")]
TokenizedWithFreqAndPosition,
}

View File

@@ -16,7 +16,8 @@ pub enum Value {
impl Serialize for Value {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer
where
S: Serializer,
{
match *self {
Value::Str(ref v) => serializer.serialize_str(v),
@@ -28,7 +29,8 @@ impl Serialize for Value {
impl<'de> Deserialize<'de> for Value {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de>
where
D: Deserializer<'de>,
{
struct ValueVisitor;
@@ -162,9 +164,13 @@ mod binary_serialize {
Ok(Value::I64(value))
}
_ => {
Err(io::Error::new(io::ErrorKind::InvalidData,
format!("No field type is associated with code {:?}",
type_code)))
Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"No field type is associated with code {:?}",
type_code
),
))
}
}
}

View File

@@ -54,17 +54,19 @@ mod tests {
fn write_lorem_ipsum_store(writer: WritePtr, num_docs: usize) -> Schema {
let mut schema_builder = SchemaBuilder::default();
let field_body = schema_builder.add_text_field("body", TextOptions::default().set_stored());
let field_title = schema_builder
.add_text_field("title", TextOptions::default().set_stored());
let field_title =
schema_builder.add_text_field("title", TextOptions::default().set_stored());
let schema = schema_builder.build();
let lorem = String::from("Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
let lorem = String::from(
"Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
do eiusmod tempor incididunt ut labore et dolore magna aliqua. \
Ut enim ad minim veniam, quis nostrud exercitation ullamco \
laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure \
dolor in reprehenderit in voluptate velit esse cillum dolore eu \
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non \
proident, sunt in culpa qui officia deserunt mollit anim id est \
laborum.");
laborum.",
);
{
let mut store_writer = StoreWriter::new(writer);
for i in 0..num_docs {
@@ -96,8 +98,10 @@ mod tests {
let store_source = directory.open_read(path).unwrap();
let store = StoreReader::from_source(store_source);
for i in 0..1_000 {
assert_eq!(*store.get(i).unwrap().get_first(field_title).unwrap().text(),
format!("Doc {}", i));
assert_eq!(
*store.get(i).unwrap().get_first(field_title).unwrap().text(),
format!("Doc {}", i)
);
}
}
@@ -106,9 +110,9 @@ mod tests {
let mut directory = MmapDirectory::create_from_tempdir().unwrap();
let path = Path::new("store");
b.iter(|| {
write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000);
directory.delete(path).unwrap();
});
write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000);
directory.delete(path).unwrap();
});
}

View File

@@ -49,7 +49,7 @@ impl StoreReader {
let mut cursor = &total_buffer[block_offset..];
let block_length = u32::deserialize(&mut cursor).unwrap();
let block_array: &[u8] = &total_buffer[(block_offset + 4 as usize)..
(block_offset + 4 + block_length as usize)];
(block_offset + 4 + block_length as usize)];
let mut lz4_decoder = try!(lz4::Decoder::new(block_array));
*self.current_block_offset.borrow_mut() = usize::max_value();
try!(lz4_decoder.read_to_end(&mut current_block_mut).map(|_| ()));
@@ -94,5 +94,9 @@ fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId)
let offset = u64::deserialize(&mut serialized_offset_buf).unwrap();
let offset = offset as usize;
let max_doc = u32::deserialize(&mut serialized_offset_buf).unwrap();
(data.slice(0, offset), data.slice(offset, footer_offset), max_doc)
(
data.slice(0, offset),
data.slice(offset, footer_offset),
max_doc,
)
}

View File

@@ -49,12 +49,15 @@ impl StoreWriter {
///
pub fn store<'a>(&mut self, field_values: &[&'a FieldValue]) -> io::Result<()> {
self.intermediary_buffer.clear();
try!((field_values.len() as u32).serialize(&mut self.intermediary_buffer));
try!((field_values.len() as u32).serialize(
&mut self.intermediary_buffer,
));
for field_value in field_values {
try!((*field_value).serialize(&mut self.intermediary_buffer));
}
(self.intermediary_buffer.len() as u32)
.serialize(&mut self.current_block)?;
(self.intermediary_buffer.len() as u32).serialize(
&mut self.current_block,
)?;
self.current_block.write_all(&self.intermediary_buffer[..])?;
self.doc += 1;
if self.current_block.len() > BLOCK_SIZE {
@@ -66,16 +69,22 @@ impl StoreWriter {
fn write_and_compress_block(&mut self) -> io::Result<()> {
self.intermediary_buffer.clear();
{
let mut encoder = try!(lz4::EncoderBuilder::new().build(&mut self.intermediary_buffer));
let mut encoder = try!(lz4::EncoderBuilder::new().build(
&mut self.intermediary_buffer,
));
try!(encoder.write_all(&self.current_block));
let (_, encoder_result) = encoder.finish();
try!(encoder_result);
}
(self.intermediary_buffer.len() as u32)
.serialize(&mut self.writer)?;
(self.intermediary_buffer.len() as u32).serialize(
&mut self.writer,
)?;
self.writer.write_all(&self.intermediary_buffer)?;
self.offset_index_writer
.insert(self.doc, &(self.writer.written_bytes() as u64))?;
self.offset_index_writer.insert(
self.doc,
&(self.writer.written_bytes() as
u64),
)?;
self.current_block.clear();
Ok(())
}
@@ -90,8 +99,7 @@ impl StoreWriter {
try!(self.write_and_compress_block());
}
let header_offset: u64 = self.writer.written_bytes() as u64;
try!(self.offset_index_writer
.write(&mut self.writer));
try!(self.offset_index_writer.write(&mut self.writer));
try!(header_offset.serialize(&mut self.writer));
try!(self.doc.serialize(&mut self.writer));
self.writer.flush()

View File

@@ -1,23 +1,17 @@
use fst::{IntoStreamer, Streamer};
use fst::map::{StreamBuilder, Stream};
use common::BinarySerializable;
use postings::TermInfo;
use super::TermDictionaryImpl;
use termdict::{TermStreamerBuilder, TermStreamer};
/// See [`TermStreamerBuilder`](./trait.TermStreamerBuilder.html)
pub struct TermStreamerBuilderImpl<'a, V>
where V: 'a + BinarySerializable + Default
{
fst_map: &'a TermDictionaryImpl<V>,
pub struct TermStreamerBuilderImpl<'a> {
fst_map: &'a TermDictionaryImpl,
stream_builder: StreamBuilder<'a>,
}
impl<'a, V> TermStreamerBuilderImpl<'a, V>
where V: 'a + BinarySerializable + Default
{
pub(crate) fn new(fst_map: &'a TermDictionaryImpl<V>,
stream_builder: StreamBuilder<'a>)
-> Self {
impl<'a> TermStreamerBuilderImpl<'a> {
pub(crate) fn new(fst_map: &'a TermDictionaryImpl, stream_builder: StreamBuilder<'a>) -> Self {
TermStreamerBuilderImpl {
fst_map: fst_map,
stream_builder: stream_builder,
@@ -25,10 +19,8 @@ impl<'a, V> TermStreamerBuilderImpl<'a, V>
}
}
impl<'a, V> TermStreamerBuilder<V> for TermStreamerBuilderImpl<'a, V>
where V: 'a + BinarySerializable + Default
{
type Streamer = TermStreamerImpl<'a, V>;
impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> {
type Streamer = TermStreamerImpl<'a>;
fn ge<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
self.stream_builder = self.stream_builder.ge(bound);
@@ -56,35 +48,30 @@ impl<'a, V> TermStreamerBuilder<V> for TermStreamerBuilderImpl<'a, V>
stream: self.stream_builder.into_stream(),
offset: 0u64,
current_key: Vec::with_capacity(100),
current_value: V::default(),
current_value: TermInfo::default(),
}
}
}
/// See [`TermStreamer`](./trait.TermStreamer.html)
pub struct TermStreamerImpl<'a, V>
where V: 'a + BinarySerializable + Default
{
fst_map: &'a TermDictionaryImpl<V>,
pub struct TermStreamerImpl<'a> {
fst_map: &'a TermDictionaryImpl,
stream: Stream<'a>,
offset: u64,
current_key: Vec<u8>,
current_value: V,
current_value: TermInfo,
}
impl<'a, V> TermStreamer<V> for TermStreamerImpl<'a, V>
where V: BinarySerializable + Default
{
impl<'a> TermStreamer for TermStreamerImpl<'a> {
fn advance(&mut self) -> bool {
if let Some((term, offset)) = self.stream.next() {
self.current_key.clear();
self.current_key.extend_from_slice(term);
self.offset = offset;
self.current_value =
self.fst_map
.read_value(self.offset)
.expect("Fst data is corrupted. Failed to deserialize a value.");
self.current_value = self.fst_map.read_value(self.offset).expect(
"Fst data is corrupted. Failed to deserialize a value.",
);
true
} else {
false
@@ -95,7 +82,7 @@ impl<'a, V> TermStreamer<V> for TermStreamerImpl<'a, V>
&self.current_key
}
fn value(&self) -> &V {
fn value(&self) -> &TermInfo {
&self.current_value
}
}

View File

@@ -3,7 +3,7 @@ use fst;
use fst::raw::Fst;
use directory::ReadOnlySource;
use common::BinarySerializable;
use std::marker::PhantomData;
use schema::FieldType;
use postings::TermInfo;
use termdict::{TermDictionary, TermDictionaryBuilder};
use super::{TermStreamerImpl, TermStreamerBuilderImpl};
@@ -13,18 +13,14 @@ fn convert_fst_error(e: fst::Error) -> io::Error {
}
/// See [`TermDictionaryBuilder`](./trait.TermDictionaryBuilder.html)
pub struct TermDictionaryBuilderImpl<W, V = TermInfo>
where W: Write,
V: BinarySerializable + Default
{
pub struct TermDictionaryBuilderImpl<W> {
fst_builder: fst::MapBuilder<W>,
data: Vec<u8>,
_phantom_: PhantomData<V>,
}
impl<W, V> TermDictionaryBuilderImpl<W, V>
where W: Write,
V: BinarySerializable + Default
impl<W> TermDictionaryBuilderImpl<W>
where
W: Write,
{
/// # Warning
/// Horribly dangerous internal API
@@ -43,26 +39,25 @@ impl<W, V> TermDictionaryBuilderImpl<W, V>
/// # Warning
///
/// Horribly dangerous internal API. See `.insert_key(...)`.
pub(crate) fn insert_value(&mut self, value: &V) -> io::Result<()> {
pub(crate) fn insert_value(&mut self, value: &TermInfo) -> io::Result<()> {
value.serialize(&mut self.data)?;
Ok(())
}
}
impl<W, V> TermDictionaryBuilder<W, V> for TermDictionaryBuilderImpl<W, V>
where W: Write,
V: BinarySerializable + Default
impl<W> TermDictionaryBuilder<W> for TermDictionaryBuilderImpl<W>
where
W: Write,
{
fn new(w: W) -> io::Result<Self> {
fn new(w: W, _field_type: FieldType) -> io::Result<Self> {
let fst_builder = fst::MapBuilder::new(w).map_err(convert_fst_error)?;
Ok(TermDictionaryBuilderImpl {
fst_builder: fst_builder,
data: Vec::new(),
_phantom_: PhantomData,
})
fst_builder: fst_builder,
data: Vec::new(),
})
}
fn insert<K: AsRef<[u8]>>(&mut self, key_ref: K, value: &V) -> io::Result<()> {
fn insert<K: AsRef<[u8]>>(&mut self, key_ref: K, value: &TermInfo) -> io::Result<()> {
let key = key_ref.as_ref();
self.fst_builder
.insert(key, self.data.len() as u64)
@@ -81,73 +76,65 @@ impl<W, V> TermDictionaryBuilder<W, V> for TermDictionaryBuilderImpl<W, V>
}
}
fn open_fst_index(source: ReadOnlySource) -> io::Result<fst::Map> {
fn open_fst_index(source: ReadOnlySource) -> fst::Map {
let fst = match source {
ReadOnlySource::Anonymous(data) => {
Fst::from_shared_bytes(data.data, data.start, data.len)
.map_err(convert_fst_error)?
Fst::from_shared_bytes(data.data, data.start, data.len).expect("FST data is corrupted")
}
ReadOnlySource::Mmap(mmap_readonly) => {
Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)?
Fst::from_mmap(mmap_readonly).expect("FST data is corrupted")
}
};
Ok(fst::Map::from(fst))
fst::Map::from(fst)
}
/// See [`TermDictionary`](./trait.TermDictionary.html)
pub struct TermDictionaryImpl<V = TermInfo>
where V: BinarySerializable + Default
{
pub struct TermDictionaryImpl {
fst_index: fst::Map,
values_mmap: ReadOnlySource,
_phantom_: PhantomData<V>,
}
impl<V> TermDictionaryImpl<V>
where V: BinarySerializable + Default
{
impl TermDictionaryImpl {
/// Deserialize and returns the value at address `offset`
pub(crate) fn read_value(&self, offset: u64) -> io::Result<V> {
pub(crate) fn read_value(&self, offset: u64) -> io::Result<TermInfo> {
let buffer = self.values_mmap.as_slice();
let mut cursor = &buffer[(offset as usize)..];
V::deserialize(&mut cursor)
TermInfo::deserialize(&mut cursor)
}
}
impl<'a, V> TermDictionary<'a, V> for TermDictionaryImpl<V>
where V: BinarySerializable + Default + 'a
{
type Streamer = TermStreamerImpl<'a, V>;
impl<'a> TermDictionary<'a> for TermDictionaryImpl {
type Streamer = TermStreamerImpl<'a>;
type StreamBuilder = TermStreamerBuilderImpl<'a, V>;
type StreamBuilder = TermStreamerBuilderImpl<'a>;
fn from_source(source: ReadOnlySource) -> io::Result<Self> {
fn from_source(source: ReadOnlySource) -> Self {
let total_len = source.len();
let length_offset = total_len - 4;
let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..];
let footer_size = u32::deserialize(&mut split_len_buffer)? as usize;
let footer_size = u32::deserialize(&mut split_len_buffer).expect(
"Deserializing 4 bytes should always work",
) as usize;
let split_len = length_offset - footer_size;
let fst_source = source.slice(0, split_len);
let values_source = source.slice(split_len, length_offset);
let fst_index = open_fst_index(fst_source)?;
Ok(TermDictionaryImpl {
fst_index: fst_index,
values_mmap: values_source,
_phantom_: PhantomData,
})
let fst_index = open_fst_index(fst_source);
TermDictionaryImpl {
fst_index: fst_index,
values_mmap: values_source,
}
}
fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<V> {
self.fst_index
.get(key)
.map(|offset| {
self.read_value(offset)
.expect("The fst is corrupted. Failed to deserialize a value.")
})
fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<TermInfo> {
self.fst_index.get(key).map(|offset| {
self.read_value(offset).expect(
"The fst is corrupted. Failed to deserialize a value.",
)
})
}
fn range(&self) -> TermStreamerBuilderImpl<V> {
fn range(&self) -> TermStreamerBuilderImpl {
TermStreamerBuilderImpl::new(self, self.fst_index.range())
}
}

View File

@@ -1,42 +1,30 @@
use std::collections::BinaryHeap;
use core::SegmentReader;
use termdict::TermStreamerImpl;
use common::BinarySerializable;
use postings::TermInfo;
use std::cmp::Ordering;
use termdict::TermStreamer;
use termdict::TermDictionary;
use schema::Term;
pub struct HeapItem<'a, V>
where V: 'a + BinarySerializable + Default
{
pub streamer: TermStreamerImpl<'a, V>,
pub struct HeapItem<'a> {
pub streamer: TermStreamerImpl<'a>,
pub segment_ord: usize,
}
impl<'a, V> PartialEq for HeapItem<'a, V>
where V: 'a + BinarySerializable + Default
{
impl<'a> PartialEq for HeapItem<'a> {
fn eq(&self, other: &Self) -> bool {
self.segment_ord == other.segment_ord
}
}
impl<'a, V> Eq for HeapItem<'a, V> where V: 'a + BinarySerializable + Default {}
impl<'a> Eq for HeapItem<'a> {}
impl<'a, V> PartialOrd for HeapItem<'a, V>
where V: 'a + BinarySerializable + Default
{
fn partial_cmp(&self, other: &HeapItem<'a, V>) -> Option<Ordering> {
impl<'a> PartialOrd for HeapItem<'a> {
fn partial_cmp(&self, other: &HeapItem<'a>) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl<'a, V> Ord for HeapItem<'a, V>
where V: 'a + BinarySerializable + Default
{
fn cmp(&self, other: &HeapItem<'a, V>) -> Ordering {
impl<'a> Ord for HeapItem<'a> {
fn cmp(&self, other: &HeapItem<'a>) -> Ordering {
(&other.streamer.key(), &other.segment_ord).cmp(&(&self.streamer.key(), &self.segment_ord))
}
}
@@ -48,28 +36,27 @@ impl<'a, V> Ord for HeapItem<'a, V>
/// - the term
/// - a slice with the ordinal of the segments containing
/// the terms.
pub struct TermMerger<'a, V>
where V: 'a + BinarySerializable + Default
{
heap: BinaryHeap<HeapItem<'a, V>>,
current_streamers: Vec<HeapItem<'a, V>>,
pub struct TermMerger<'a> {
heap: BinaryHeap<HeapItem<'a>>,
current_streamers: Vec<HeapItem<'a>>,
}
impl<'a, V> TermMerger<'a, V>
where V: 'a + BinarySerializable + Default
{
fn new(streams: Vec<TermStreamerImpl<'a, V>>) -> TermMerger<'a, V> {
impl<'a> TermMerger<'a> {
/// Stream of merged term dictionary
///
///
pub fn new(streams: Vec<TermStreamerImpl<'a>>) -> TermMerger<'a> {
TermMerger {
heap: BinaryHeap::new(),
current_streamers: streams
.into_iter()
.enumerate()
.map(|(ord, streamer)| {
HeapItem {
streamer: streamer,
segment_ord: ord,
}
})
HeapItem {
streamer: streamer,
segment_ord: ord,
}
})
.collect(),
}
}
@@ -125,7 +112,7 @@ impl<'a, V> TermMerger<'a, V>
/// This method may be called
/// iff advance() has been called before
/// and "true" was returned.
pub fn current_kvs(&self) -> &[HeapItem<'a, V>] {
pub fn current_kvs(&self) -> &[HeapItem<'a>] {
&self.current_streamers[..]
}
@@ -139,14 +126,3 @@ impl<'a, V> TermMerger<'a, V>
}
}
}
impl<'a> From<&'a [SegmentReader]> for TermMerger<'a, TermInfo> {
fn from(segment_readers: &'a [SegmentReader]) -> TermMerger<'a, TermInfo> {
TermMerger::new(segment_readers
.iter()
.map(|reader| reader.terms().stream())
.collect())
}
}

View File

@@ -1,36 +1,10 @@
/*!
The term dictionary is one of the key datastructure of
tantivy. It associates sorted `terms` to their respective
posting list.
tantivy. It associates sorted `terms` to a `TermInfo` struct
that serves as an address in their respective posting list.
The term dictionary makes it possible to iterate through
the keys in a sorted manner.
# Example
```
extern crate tantivy;
use tantivy::termdict::*;
use tantivy::directory::ReadOnlySource;
# fn main() {
# run().expect("Test failed");
# }
# fn run() -> tantivy::Result<()> {
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec!())?;
// keys have to be insert in order.
term_dictionary_builder.insert("apple", &1u32)?;
term_dictionary_builder.insert("grape", &2u32)?;
term_dictionary_builder.insert("pear", &3u32)?;
let buffer: Vec<u8> = term_dictionary_builder.finish()?;
let source = ReadOnlySource::from(buffer);
let term_dictionary = TermDictionaryImpl::from_source(source)?;
assert_eq!(term_dictionary.get("grape"), Some(2u32));
# Ok(())
# }
The term dictionary API makes it possible to iterate through
a range of keys in a sorted manner.
```
@@ -74,48 +48,45 @@ followed by a streaming through at most `1024` elements in the
term `stream`.
*/
use schema::{Field, Term};
use common::BinarySerializable;
use schema::{Field, Term, FieldType};
use directory::ReadOnlySource;
use postings::TermInfo;
pub use self::merger::TermMerger;
#[cfg(not(feature="streamdict"))]
#[cfg(not(feature = "streamdict"))]
mod fstdict;
#[cfg(not(feature="streamdict"))]
#[cfg(not(feature = "streamdict"))]
pub use self::fstdict::{TermDictionaryImpl, TermDictionaryBuilderImpl, TermStreamerImpl,
TermStreamerBuilderImpl};
#[cfg(feature="streamdict")]
#[cfg(feature = "streamdict")]
mod streamdict;
#[cfg(feature="streamdict")]
#[cfg(feature = "streamdict")]
pub use self::streamdict::{TermDictionaryImpl, TermDictionaryBuilderImpl, TermStreamerImpl,
TermStreamerBuilderImpl};
mod merger;
use std::io;
/// Dictionary associating sorted `&[u8]` to values
pub trait TermDictionary<'a, V>
where V: BinarySerializable + Default + 'a,
Self: Sized
pub trait TermDictionary<'a>
where
Self: Sized,
{
/// Streamer type associated to the term dictionary
type Streamer: TermStreamer<V> + 'a;
type Streamer: TermStreamer + 'a;
/// StreamerBuilder type associated to the term dictionary
type StreamBuilder: TermStreamerBuilder<V, Streamer = Self::Streamer> + 'a;
type StreamBuilder: TermStreamerBuilder<Streamer = Self::Streamer> + 'a;
/// Opens a `TermDictionary` given a data source.
fn from_source(source: ReadOnlySource) -> io::Result<Self>;
fn from_source(source: ReadOnlySource) -> Self;
/// Lookups the value corresponding to the key.
fn get<K: AsRef<[u8]>>(&self, target_key: K) -> Option<V>;
fn get<K: AsRef<[u8]>>(&self, target_key: K) -> Option<TermInfo>;
/// Returns a range builder, to stream all of the terms
/// within an interval.
@@ -140,17 +111,17 @@ pub trait TermDictionary<'a, V>
/// Builder for the new term dictionary.
///
/// Inserting must be done in the order of the `keys`.
pub trait TermDictionaryBuilder<W, V>: Sized
where W: io::Write,
V: BinarySerializable + Default
pub trait TermDictionaryBuilder<W>: Sized
where
W: io::Write,
{
/// Creates a new `TermDictionaryBuilder`
fn new(write: W) -> io::Result<Self>;
fn new(write: W, field_type: FieldType) -> io::Result<Self>;
/// Inserts a `(key, value)` pair in the term dictionary.
///
/// *Keys have to be inserted in order.*
fn insert<K: AsRef<[u8]>>(&mut self, key: K, value: &V) -> io::Result<()>;
fn insert<K: AsRef<[u8]>>(&mut self, key: K, value: &TermInfo) -> io::Result<()>;
/// Finalize writing the builder, and returns the underlying
/// `Write` object.
@@ -160,7 +131,7 @@ pub trait TermDictionaryBuilder<W, V>: Sized
/// `TermStreamer` acts as a cursor over a range of terms of a segment.
/// Terms are guaranteed to be sorted.
pub trait TermStreamer<V>: Sized {
pub trait TermStreamer: Sized {
/// Advance position the stream on the next item.
/// Before the first call to `.advance()`, the stream
/// is an unitialized state.
@@ -187,10 +158,10 @@ pub trait TermStreamer<V>: Sized {
///
/// Calling `.value()` before the first call to `.advance()` returns
/// `V::default()`.
fn value(&self) -> &V;
fn value(&self) -> &TermInfo;
/// Return the next `(key, value)` pair.
fn next(&mut self) -> Option<(Term<&[u8]>, &V)> {
fn next(&mut self) -> Option<(Term<&[u8]>, &TermInfo)> {
if self.advance() {
Some((Term::wrap(self.key()), self.value()))
} else {
@@ -202,11 +173,9 @@ pub trait TermStreamer<V>: Sized {
/// `TermStreamerBuilder` is an helper object used to define
/// a range of terms that should be streamed.
pub trait TermStreamerBuilder<V>
where V: BinarySerializable + Default
{
pub trait TermStreamerBuilder {
/// Associated `TermStreamer` type that this builder is building.
type Streamer: TermStreamer<V>;
type Streamer: TermStreamer;
/// Limit the range to terms greater or equal to the bound
fn ge<T: AsRef<[u8]>>(self, bound: T) -> Self;
@@ -231,60 +200,70 @@ mod tests {
use super::{TermDictionaryImpl, TermDictionaryBuilderImpl, TermStreamerImpl};
use directory::{RAMDirectory, Directory, ReadOnlySource};
use std::path::PathBuf;
use schema::{Term, SchemaBuilder, Document, TEXT};
use schema::{FieldType, Term, SchemaBuilder, Document, TEXT};
use core::Index;
use std::str;
use termdict::TermStreamer;
use termdict::TermStreamerBuilder;
use termdict::TermDictionary;
use termdict::TermDictionaryBuilder;
use postings::TermInfo;
const BLOCK_SIZE: usize = 1_500;
fn make_term_info(val: u32) -> TermInfo {
TermInfo {
doc_freq: val,
positions_offset: val * 2u32,
postings_offset: val * 3u32,
positions_inner_offset: 5u8,
}
}
#[test]
fn test_term_dictionary() {
fn test_term_dictionary_simple() {
let mut directory = RAMDirectory::create();
let path = PathBuf::from("TermDictionary");
{
let write = directory.open_write(&path).unwrap();
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(write).unwrap();
term_dictionary_builder
.insert("abc".as_bytes(), &34u32)
let field_type = FieldType::Str(TEXT);
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(write, field_type)
.unwrap();
term_dictionary_builder
.insert("abcd".as_bytes(), &346u32)
.insert("abc".as_bytes(), &make_term_info(34u32))
.unwrap();
term_dictionary_builder
.insert("abcd".as_bytes(), &make_term_info(346u32))
.unwrap();
term_dictionary_builder.finish().unwrap();
}
let source = directory.open_read(&path).unwrap();
let term_dict: TermDictionaryImpl<u32> = TermDictionaryImpl::from_source(source).unwrap();
assert_eq!(term_dict.get("abc"), Some(34u32));
assert_eq!(term_dict.get("abcd"), Some(346u32));
let term_dict: TermDictionaryImpl = TermDictionaryImpl::from_source(source);
assert_eq!(term_dict.get("abc").unwrap().doc_freq, 34u32);
assert_eq!(term_dict.get("abcd").unwrap().doc_freq, 346u32);
let mut stream = term_dict.stream();
{
{
let (k, v) = stream.next().unwrap();
assert_eq!(k.as_ref(), "abc".as_bytes());
assert_eq!(v, &34u32);
assert_eq!(v.doc_freq, 34u32);
}
assert_eq!(stream.key(), "abc".as_bytes());
assert_eq!(*stream.value(), 34u32);
assert_eq!(stream.value().doc_freq, 34u32);
}
{
{
let (k, v) = stream.next().unwrap();
assert_eq!(k.as_slice(), "abcd".as_bytes());
assert_eq!(v, &346u32);
assert_eq!(v.doc_freq, 346u32);
}
assert_eq!(stream.key(), "abcd".as_bytes());
assert_eq!(*stream.value(), 346u32);
assert_eq!(stream.value().doc_freq, 346u32);
}
assert!(!stream.advance());
}
#[test]
fn test_term_iterator() {
let mut schema_builder = SchemaBuilder::default();
@@ -319,7 +298,9 @@ mod tests {
}
index.load_searchers().unwrap();
let searcher = index.searcher();
let mut term_it = searcher.terms();
let field_searcher = searcher.field(text_field);
let mut term_it = field_searcher.terms();
let mut term_string = String::new();
while term_it.advance() {
let term = Term::from_bytes(term_it.key());
@@ -334,23 +315,26 @@ mod tests {
let ids: Vec<_> = (0u32..10_000u32)
.map(|i| (format!("doc{:0>6}", i), i))
.collect();
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![]).unwrap();
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type)
.unwrap();
for &(ref id, ref i) in &ids {
term_dictionary_builder.insert(id.as_bytes(), i).unwrap();
term_dictionary_builder
.insert(id.as_bytes(), &make_term_info(*i))
.unwrap();
}
term_dictionary_builder.finish().unwrap()
};
let source = ReadOnlySource::from(buffer);
let term_dictionary: TermDictionaryImpl<u32> = TermDictionaryImpl::from_source(source)
.unwrap();
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source);
{
let mut streamer = term_dictionary.stream();
let mut i = 0;
while let Some((streamer_k, streamer_v)) = streamer.next() {
let &(ref key, ref v) = &ids[i];
assert_eq!(streamer_k.as_ref(), key.as_bytes());
assert_eq!(streamer_v, v);
assert_eq!(streamer_v, &make_term_info(*v));
i += 1;
}
}
@@ -359,23 +343,59 @@ mod tests {
term_dictionary.get(key.as_bytes());
}
#[test]
fn test_stream_high_range_prefix_suffix() {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type)
.unwrap();
// term requires more than 16bits
term_dictionary_builder
.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))
.unwrap();
term_dictionary_builder
.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2))
.unwrap();
term_dictionary_builder
.insert("abr", &make_term_info(2))
.unwrap();
term_dictionary_builder.finish().unwrap()
};
let source = ReadOnlySource::from(buffer);
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source);
let mut kv_stream = term_dictionary.stream();
assert!(kv_stream.advance());
assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxy".as_bytes());
assert_eq!(kv_stream.value(), &make_term_info(1));
assert!(kv_stream.advance());
assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxyz".as_bytes());
assert_eq!(kv_stream.value(), &make_term_info(2));
assert!(kv_stream.advance());
assert_eq!(kv_stream.key(), "abr".as_bytes());
assert!(!kv_stream.advance());
}
#[test]
fn test_stream_range() {
let ids: Vec<_> = (0u32..50_000u32)
let ids: Vec<_> = (0u32..10_000u32)
.map(|i| (format!("doc{:0>6}", i), i))
.collect();
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![]).unwrap();
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type)
.unwrap();
for &(ref id, ref i) in &ids {
term_dictionary_builder.insert(id.as_bytes(), i).unwrap();
term_dictionary_builder
.insert(id.as_bytes(), &make_term_info(*i))
.unwrap();
}
term_dictionary_builder.finish().unwrap()
};
let source = ReadOnlySource::from(buffer);
let term_dictionary: TermDictionaryImpl<u32> = TermDictionaryImpl::from_source(source)
.unwrap();
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source);
{
for i in (0..20).chain(6000..8_000) {
let &(ref target_key, _) = &ids[i];
@@ -387,7 +407,8 @@ mod tests {
let (streamer_k, streamer_v) = streamer.next().unwrap();
let &(ref key, ref v) = &ids[i + j];
assert_eq!(str::from_utf8(streamer_k.as_ref()).unwrap(), key);
assert_eq!(streamer_v, v);
assert_eq!(streamer_v.doc_freq, *v);
assert_eq!(streamer_v, &make_term_info(*v));
}
}
}
@@ -403,7 +424,7 @@ mod tests {
let (streamer_k, streamer_v) = streamer.next().unwrap();
let &(ref key, ref v) = &ids[i + j + 1];
assert_eq!(streamer_k.as_ref(), key.as_bytes());
assert_eq!(streamer_v, v);
assert_eq!(streamer_v.doc_freq, *v);
}
}
}
@@ -430,45 +451,56 @@ mod tests {
#[test]
fn test_stream_range_boundaries() {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![]).unwrap();
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type)
.unwrap();
for i in 0u8..10u8 {
let number_arr = [i; 1];
term_dictionary_builder.insert(&number_arr, &i).unwrap();
term_dictionary_builder
.insert(&number_arr, &make_term_info(i as u32))
.unwrap();
}
term_dictionary_builder.finish().unwrap()
};
let source = ReadOnlySource::from(buffer);
let term_dictionary: TermDictionaryImpl<u8> = TermDictionaryImpl::from_source(source)
.unwrap();
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source);
let value_list = |mut streamer: TermStreamerImpl<u8>| {
let mut res: Vec<u8> = vec![];
while let Some((_, &v)) = streamer.next() {
res.push(v);
let value_list = |mut streamer: TermStreamerImpl| {
let mut res: Vec<u32> = vec![];
while let Some((_, ref v)) = streamer.next() {
res.push(v.doc_freq);
}
res
};
{
let range = term_dictionary.range().ge([2u8]).into_stream();
assert_eq!(value_list(range),
vec![2u8, 3u8, 4u8, 5u8, 6u8, 7u8, 8u8, 9u8]);
assert_eq!(
value_list(range),
vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().gt([2u8]).into_stream();
assert_eq!(value_list(range), vec![3u8, 4u8, 5u8, 6u8, 7u8, 8u8, 9u8]);
assert_eq!(
value_list(range),
vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().lt([6u8]).into_stream();
assert_eq!(value_list(range), vec![0u8, 1u8, 2u8, 3u8, 4u8, 5u8]);
assert_eq!(value_list(range), vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32]);
}
{
let range = term_dictionary.range().le([6u8]).into_stream();
assert_eq!(value_list(range), vec![0u8, 1u8, 2u8, 3u8, 4u8, 5u8, 6u8]);
assert_eq!(
value_list(range),
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32]
);
}
{
let range = term_dictionary.range().ge([0u8]).lt([5u8]).into_stream();
assert_eq!(value_list(range), vec![0u8, 1u8, 2u8, 3u8, 4u8]);
assert_eq!(value_list(range), vec![0u32, 1u32, 2u32, 3u32, 4u32]);
}
}

View File

@@ -0,0 +1,175 @@
use postings::TermInfo;
use super::CheckPoint;
use std::mem;
use common::BinarySerializable;
/// Returns the length of the longest common prefix of `s1` and `s2`.
///
/// i.e. the greatest `L` such that
/// for all `0 <= i < L`, `s1[i] == s2[i]`
fn common_prefix_len(s1: &[u8], s2: &[u8]) -> usize {
    // The common prefix can never be longer than the shorter slice.
    let limit = if s1.len() < s2.len() { s1.len() } else { s2.len() };
    let mut len = 0;
    while len < limit && s1[len] == s2[len] {
        len += 1;
    }
    len
}
/// Front-codes a sorted stream of terms: each call to `encode`
/// records the length of the prefix shared with the previous term
/// and copies only the new suffix bytes.
#[derive(Default)]
pub struct TermDeltaEncoder {
    // Current term, kept in full (also serves as the previous term
    // for the next `encode` call).
    last_term: Vec<u8>,
    // Length of the prefix shared between the current term and the
    // previous one.
    prefix_len: usize,
}

impl TermDeltaEncoder {
    /// Registers `term` as the current term.
    ///
    /// Only the bytes past the common prefix with the previous term
    /// are appended to the internal buffer.
    pub fn encode<'a>(&mut self, term: &'a [u8]) {
        let shared = common_prefix_len(term, &self.last_term);
        self.last_term.truncate(shared);
        self.last_term.extend_from_slice(&term[shared..]);
        self.prefix_len = shared;
    }

    /// Returns the current term in full.
    pub fn term(&self) -> &[u8] {
        self.last_term.as_slice()
    }

    /// Returns the `(prefix length, suffix bytes)` pair describing the
    /// current term relative to the previous one.
    pub fn prefix_suffix(&mut self) -> (usize, &[u8]) {
        let suffix = &self.last_term[self.prefix_len..];
        (self.prefix_len, suffix)
    }
}
/// Decodes the front-coded terms produced by `TermDeltaEncoder`,
/// rebuilding each full term from the previous one.
#[derive(Default)]
pub struct TermDeltaDecoder {
    // Fully reconstructed current term.
    term: Vec<u8>,
}

impl TermDeltaDecoder {
    /// Creates a decoder seeded with a previously decoded term, so that
    /// decoding can resume from a checkpoint.
    pub fn with_previous_term(term: Vec<u8>) -> TermDeltaDecoder {
        TermDeltaDecoder { term: term }
    }

    /// Decodes one term from `cursor` and returns the remaining bytes.
    ///
    /// If bit 0 of `code` is set, the prefix and suffix lengths are
    /// packed into a single byte (4 bits each); otherwise they are each
    /// serialized as a `u32`.
    #[inline(always)]
    pub fn decode<'a>(&mut self, code: u8, mut cursor: &'a [u8]) -> &'a [u8] {
        let (prefix_len, suffix_len): (usize, usize) = if (code & 1u8) == 1u8 {
            // Compact encoding: prefix in the low nibble, suffix in the
            // high nibble.
            let b = cursor[0];
            cursor = &cursor[1..];
            let prefix_len = (b & 15u8) as usize;
            let suffix_len = (b >> 4u8) as usize;
            (prefix_len, suffix_len)
        } else {
            let prefix_len = u32::deserialize(&mut cursor).unwrap();
            let suffix_len = u32::deserialize(&mut cursor).unwrap();
            (prefix_len as usize, suffix_len as usize)
        };
        // `truncate` is equivalent to the previous `unsafe set_len` call
        // on a well-formed stream (`prefix_len <= self.term.len()`), but
        // it is safe: on corrupted input it cannot grow the vector into
        // uninitialized memory.
        self.term.truncate(prefix_len);
        self.term.extend_from_slice(&cursor[..suffix_len]);
        &cursor[suffix_len..]
    }

    /// Returns the last decoded term.
    pub fn term(&self) -> &[u8] {
        &self.term[..]
    }
}
#[derive(Default)]
pub struct DeltaTermInfo {
pub doc_freq: u32,
pub delta_postings_offset: u32,
pub delta_positions_offset: u32,
pub positions_inner_offset: u8,
}
/// Delta-encodes successive `TermInfo` values against the previously
/// encoded one.
pub struct TermInfoDeltaEncoder {
    // Last `TermInfo` handed to `encode`; serves as the delta reference.
    term_info: TermInfo,
    /// Whether the field records positions. When `false`, position
    /// offsets are neither computed nor meaningful in the output.
    pub has_positions: bool,
}

impl TermInfoDeltaEncoder {
    /// Creates an encoder whose initial reference is the default
    /// (all-zero) `TermInfo`.
    pub fn new(has_positions: bool) -> Self {
        TermInfoDeltaEncoder {
            term_info: TermInfo::default(),
            has_positions: has_positions,
        }
    }

    /// Returns the last encoded `TermInfo` (the current delta reference).
    pub fn term_info(&self) -> &TermInfo {
        &self.term_info
    }

    /// Delta-encodes `term_info` against the previous one and makes it
    /// the new reference.
    ///
    /// Offsets are assumed to be monotonically increasing; the
    /// subtractions would underflow otherwise.
    pub fn encode(&mut self, term_info: TermInfo) -> DeltaTermInfo {
        let mut delta_term_info = DeltaTermInfo {
            doc_freq: term_info.doc_freq,
            delta_postings_offset: term_info.postings_offset - self.term_info.postings_offset,
            delta_positions_offset: 0,
            positions_inner_offset: 0,
        };
        if self.has_positions {
            delta_term_info.delta_positions_offset = term_info.positions_offset -
                self.term_info.positions_offset;
            delta_term_info.positions_inner_offset = term_info.positions_inner_offset;
        }
        // Plain assignment; the previous `mem::replace` discarded its
        // return value, so this is strictly equivalent.
        self.term_info = term_info;
        delta_term_info
    }
}
/// Incrementally rebuilds absolute `TermInfo` values from the
/// delta-encoded stream produced by `TermInfoDeltaEncoder`.
pub struct TermInfoDeltaDecoder {
    // Current, fully-resolved `TermInfo`; its offsets are accumulated
    // across successive `decode` calls.
    term_info: TermInfo,
    // Whether positions data is present in the stream.
    has_positions: bool,
}
/// Returns a mask keeping the low `num_bytes` bytes of a `u32`.
///
/// `num_bytes` must be in `1..=4`. Out-of-range values now panic via the
/// bounds check; the previous `get_unchecked` + `wrapping_sub` version
/// was undefined behavior for `num_bytes == 0` or `num_bytes > 4`, with
/// no measurable benefit for the valid range.
#[inline(always)]
pub fn make_mask(num_bytes: usize) -> u32 {
    const MASK: [u32; 4] = [0xffu32, 0xffffu32, 0xffffffu32, 0xffffffffu32];
    MASK[num_bytes - 1]
}
impl TermInfoDeltaDecoder {
    /// Creates a decoder resuming from a known absolute `term_info`.
    pub fn from_term_info(term_info: TermInfo, has_positions: bool) -> TermInfoDeltaDecoder {
        TermInfoDeltaDecoder {
            term_info: term_info,
            has_positions: has_positions,
        }
    }

    /// Creates a decoder positioned at an index `CheckPoint`: offsets
    /// start from the checkpoint's values, doc_freq starts at zero.
    pub fn from_checkpoint(checkpoint: &CheckPoint, has_positions: bool) -> TermInfoDeltaDecoder {
        TermInfoDeltaDecoder {
            term_info: TermInfo {
                doc_freq: 0u32,
                postings_offset: checkpoint.postings_offset,
                positions_offset: checkpoint.positions_offset,
                positions_inner_offset: 0u8,
            },
            has_positions: has_positions,
        }
    }

    /// Decodes one delta-encoded `TermInfo` from `cursor` and returns
    /// the remaining bytes.
    ///
    /// `code` packs the byte-widths chosen by the writer:
    /// bits 1-2 = doc_freq width - 1, bits 3-4 = postings-offset delta
    /// width - 1, bits 5-6 = positions-offset delta width - 1
    /// (see `write_term_kv`).
    #[inline(always)]
    pub fn decode<'a>(&mut self, code: u8, mut cursor: &'a [u8]) -> &'a [u8] {
        let num_bytes_docfreq: usize = ((code >> 1) & 3) as usize + 1;
        let num_bytes_postings_offset: usize = ((code >> 3) & 3) as usize + 1;
        // NOTE(review): unaligned, native-endian 8-byte read that may
        // extend past the meaningful bytes of the entry. Presumably the
        // file's trailing padding guarantees the read stays in bounds —
        // TODO confirm against the writer's PADDING_SIZE.
        let mut v: u64 = unsafe { *(cursor.as_ptr() as *const u64) };
        // Extract doc_freq from the low bytes, then shift it off to
        // expose the postings-offset delta.
        let doc_freq: u32 = (v as u32) & make_mask(num_bytes_docfreq);
        v >>= (num_bytes_docfreq as u64) * 8u64;
        let delta_postings_offset: u32 = (v as u32) & make_mask(num_bytes_postings_offset);
        cursor = &cursor[num_bytes_docfreq + num_bytes_postings_offset..];
        // doc_freq is absolute; postings_offset accumulates deltas.
        self.term_info.doc_freq = doc_freq;
        self.term_info.postings_offset += delta_postings_offset;
        if self.has_positions {
            let num_bytes_positions_offset = ((code >> 5) & 3) as usize + 1;
            // Same unaligned native-endian read pattern as above, 4 bytes.
            let delta_positions_offset: u32 = unsafe { *(cursor.as_ptr() as *const u32) } &
                make_mask(num_bytes_positions_offset);
            self.term_info.positions_offset += delta_positions_offset;
            // The inner offset is stored verbatim right after the delta.
            self.term_info.positions_inner_offset = cursor[num_bytes_positions_offset];
            &cursor[num_bytes_positions_offset + 1..]
        } else {
            cursor
        }
    }

    /// Returns the last decoded, fully-resolved `TermInfo`.
    pub fn term_info(&self) -> &TermInfo {
        &self.term_info
    }
}

View File

@@ -1,8 +1,42 @@
use std::io::{self, Write, Read};
use common::BinarySerializable;
mod termdict;
mod streamer;
mod delta_encoder;
pub use self::delta_encoder::{TermDeltaEncoder, TermDeltaDecoder};
pub use self::delta_encoder::{TermInfoDeltaEncoder, TermInfoDeltaDecoder, DeltaTermInfo};
pub use self::termdict::TermDictionaryImpl;
pub use self::termdict::TermDictionaryBuilderImpl;
pub use self::streamer::TermStreamerImpl;
pub use self::streamer::TermStreamerBuilderImpl;
/// Snapshot of the three write offsets, taken periodically while
/// building the term dictionary, so that decoding can resume from the
/// nearest checkpoint instead of the beginning of the stream.
#[derive(Debug)]
pub struct CheckPoint {
    // Offset within the delta-encoded term stream.
    pub stream_offset: u32,
    // Absolute postings offset at this point of the stream.
    pub postings_offset: u32,
    // Absolute positions offset at this point of the stream.
    pub positions_offset: u32,
}
impl BinarySerializable for CheckPoint {
    /// Writes the three offsets in fixed order:
    /// stream, postings, positions.
    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        for offset in &[
            self.stream_offset,
            self.postings_offset,
            self.positions_offset,
        ] {
            offset.serialize(writer)?;
        }
        Ok(())
    }

    /// Reads the offsets back in the same order `serialize` wrote them.
    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
        Ok(CheckPoint {
            stream_offset: u32::deserialize(reader)?,
            postings_offset: u32::deserialize(reader)?,
            positions_offset: u32::deserialize(reader)?,
        })
    }
}

View File

@@ -1,47 +1,54 @@
#![allow(should_implement_trait)]
use std::cmp::max;
use common::BinarySerializable;
use super::TermDictionaryImpl;
use termdict::{TermStreamerBuilder, TermStreamer};
use postings::TermInfo;
use super::delta_encoder::{TermInfoDeltaDecoder, TermDeltaDecoder};
pub(crate) fn stream_before<'a, V>(term_dictionary: &'a TermDictionaryImpl<V>,
target_key: &[u8])
-> TermStreamerImpl<'a, V>
where V: 'a + BinarySerializable + Default
{
let (prev_key, offset) = term_dictionary.strictly_previous_key(target_key.as_ref());
let offset: usize = offset as usize;
fn stream_before<'a>(
term_dictionary: &'a TermDictionaryImpl,
target_key: &[u8],
has_positions: bool,
) -> TermStreamerImpl<'a> {
let (prev_key, checkpoint) = term_dictionary.strictly_previous_key(target_key.as_ref());
let stream_data: &'a [u8] = &term_dictionary.stream_data()[checkpoint.stream_offset as usize..];
TermStreamerImpl {
cursor: &term_dictionary.stream_data()[offset..],
current_key: Vec::from(prev_key),
current_value: V::default(),
cursor: stream_data,
term_delta_decoder: TermDeltaDecoder::with_previous_term(prev_key),
term_info_decoder: TermInfoDeltaDecoder::from_checkpoint(&checkpoint, has_positions),
}
}
/// See [`TermStreamerBuilder`](./trait.TermStreamerBuilder.html)
pub struct TermStreamerBuilderImpl<'a, V>
where V: 'a + BinarySerializable + Default
{
term_dictionary: &'a TermDictionaryImpl<V>,
pub struct TermStreamerBuilderImpl<'a> {
term_dictionary: &'a TermDictionaryImpl,
origin: usize,
offset_from: usize,
offset_to: usize,
current_key: Vec<u8>,
term_info: TermInfo,
has_positions: bool,
}
impl<'a, V> TermStreamerBuilder<V> for TermStreamerBuilderImpl<'a, V>
where V: 'a + BinarySerializable + Default
{
type Streamer = TermStreamerImpl<'a, V>;
impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> {
type Streamer = TermStreamerImpl<'a>;
/// Limit the range to terms greater or equal to the bound
fn ge<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
let target_key = bound.as_ref();
let streamer = stream_before(self.term_dictionary, target_key.as_ref());
let streamer = stream_before(
self.term_dictionary,
target_key.as_ref(),
self.has_positions,
);
let smaller_than = |k: &[u8]| k.lt(target_key);
let (offset_before, current_key) = get_offset(smaller_than, streamer);
let (offset_before, current_key, term_info) = get_offset(smaller_than, streamer);
self.current_key = current_key;
self.term_info = term_info;
self.offset_from = offset_before - self.origin;
self
}
@@ -49,10 +56,15 @@ impl<'a, V> TermStreamerBuilder<V> for TermStreamerBuilderImpl<'a, V>
/// Limit the range to terms strictly greater than the bound
fn gt<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
let target_key = bound.as_ref();
let streamer = stream_before(self.term_dictionary, target_key.as_ref());
let streamer = stream_before(
self.term_dictionary,
target_key.as_ref(),
self.has_positions,
);
let smaller_than = |k: &[u8]| k.le(target_key);
let (offset_before, current_key) = get_offset(smaller_than, streamer);
let (offset_before, current_key, term_info) = get_offset(smaller_than, streamer);
self.current_key = current_key;
self.term_info = term_info;
self.offset_from = offset_before - self.origin;
self
}
@@ -60,9 +72,13 @@ impl<'a, V> TermStreamerBuilder<V> for TermStreamerBuilderImpl<'a, V>
/// Limit the range to terms lesser or equal to the bound
fn lt<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
let target_key = bound.as_ref();
let streamer = stream_before(self.term_dictionary, target_key.as_ref());
let streamer = stream_before(
self.term_dictionary,
target_key.as_ref(),
self.has_positions,
);
let smaller_than = |k: &[u8]| k.lt(target_key);
let (offset_before, _) = get_offset(smaller_than, streamer);
let (offset_before, _, _) = get_offset(smaller_than, streamer);
self.offset_to = offset_before - self.origin;
self
}
@@ -70,9 +86,13 @@ impl<'a, V> TermStreamerBuilder<V> for TermStreamerBuilderImpl<'a, V>
/// Limit the range to terms lesser or equal to the bound
fn le<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
let target_key = bound.as_ref();
let streamer = stream_before(self.term_dictionary, target_key.as_ref());
let streamer = stream_before(
self.term_dictionary,
target_key.as_ref(),
self.has_positions,
);
let smaller_than = |k: &[u8]| k.le(target_key);
let (offset_before, _) = get_offset(smaller_than, streamer);
let (offset_before, _, _) = get_offset(smaller_than, streamer);
self.offset_to = offset_before - self.origin;
self
}
@@ -82,10 +102,13 @@ impl<'a, V> TermStreamerBuilder<V> for TermStreamerBuilderImpl<'a, V>
let data: &[u8] = self.term_dictionary.stream_data();
let start = self.offset_from;
let stop = max(self.offset_to, start);
let term_delta_decoder = TermDeltaDecoder::with_previous_term(self.current_key);
let term_info_decoder =
TermInfoDeltaDecoder::from_term_info(self.term_info, self.has_positions);
TermStreamerImpl {
cursor: &data[start..stop],
current_key: self.current_key,
current_value: V::default(),
term_delta_decoder: term_delta_decoder,
term_info_decoder: term_info_decoder,
}
}
}
@@ -93,100 +116,77 @@ impl<'a, V> TermStreamerBuilder<V> for TermStreamerBuilderImpl<'a, V>
/// Returns offset information for the first
/// key in the stream matching a given predicate.
///
/// returns (start offset, the data required to load the value)
fn get_offset<'a, V, P: Fn(&[u8]) -> bool>(predicate: P,
mut streamer: TermStreamerImpl<V>)
-> (usize, Vec<u8>)
where V: 'a + BinarySerializable + Default
{
/// returns
/// - the block start
/// - the index within this block
/// - the term_buffer state to initialize the block)
fn get_offset<'a, P: Fn(&[u8]) -> bool>(
predicate: P,
mut streamer: TermStreamerImpl<'a>,
) -> (usize, Vec<u8>, TermInfo) {
let mut prev: &[u8] = streamer.cursor;
let mut prev_data: Vec<u8> = streamer.current_key.clone();
let mut term_info = streamer.value().clone();
let mut prev_data: Vec<u8> = Vec::from(streamer.term_delta_decoder.term());
while let Some((iter_key, _)) = streamer.next() {
while let Some((iter_key, iter_term_info)) = streamer.next() {
if !predicate(iter_key.as_ref()) {
return (prev.as_ptr() as usize, prev_data);
return (prev.as_ptr() as usize, prev_data, term_info);
}
prev = streamer.cursor;
prev_data.clear();
prev_data.extend_from_slice(iter_key.as_ref());
term_info = iter_term_info.clone();
}
(prev.as_ptr() as usize, prev_data)
(prev.as_ptr() as usize, prev_data, term_info)
}
impl<'a, V> TermStreamerBuilderImpl<'a, V>
where V: 'a + BinarySerializable + Default
{
pub(crate) fn new(term_dictionary: &'a TermDictionaryImpl<V>) -> Self {
impl<'a> TermStreamerBuilderImpl<'a> {
pub(crate) fn new(term_dictionary: &'a TermDictionaryImpl, has_positions: bool) -> Self {
let data = term_dictionary.stream_data();
let origin = data.as_ptr() as usize;
TermStreamerBuilderImpl {
term_dictionary: term_dictionary,
term_info: TermInfo::default(),
origin: origin,
offset_from: 0,
offset_to: data.len(),
current_key: Vec::with_capacity(300),
has_positions: has_positions,
}
}
}
/// See [`TermStreamer`](./trait.TermStreamer.html)
pub struct TermStreamerImpl<'a, V>
where V: 'a + BinarySerializable + Default
{
pub struct TermStreamerImpl<'a> {
cursor: &'a [u8],
current_key: Vec<u8>,
current_value: V,
term_delta_decoder: TermDeltaDecoder,
term_info_decoder: TermInfoDeltaDecoder,
}
impl<'a, V: BinarySerializable> TermStreamerImpl<'a, V>
where V: 'a + BinarySerializable + Default
{
pub(crate) fn extract_value(self) -> V {
self.current_value
}
}
fn deserialize_vint(data: &mut &[u8]) -> u64 {
let mut res = 0;
let mut shift = 0;
for i in 0.. {
let b = data[i];
res |= ((b % 128u8) as u64) << shift;
if b & 128u8 != 0u8 {
*data = &data[(i + 1)..];
break;
}
shift += 7;
}
res
}
impl<'a, V> TermStreamer<V> for TermStreamerImpl<'a, V>
where V: BinarySerializable + Default
{
impl<'a> TermStreamer for TermStreamerImpl<'a> {
fn advance(&mut self) -> bool {
if self.cursor.is_empty() {
return false;
}
let common_length: usize = deserialize_vint(&mut self.cursor) as usize;
self.current_key.truncate(common_length);
let added_length: usize = deserialize_vint(&mut self.cursor) as usize;
self.current_key.extend(&self.cursor[..added_length]);
self.cursor = &self.cursor[added_length..];
self.current_value =
V::deserialize(&mut self.cursor)
.expect("Term dictionary corrupted. Failed to deserialize a value");
let mut cursor: &[u8] = &self.cursor;
let code: u8 = cursor[0];
cursor = self.term_delta_decoder.decode(code, &cursor[1..]);
cursor = self.term_info_decoder.decode(code, cursor);
self.cursor = cursor;
true
}
fn key(&self) -> &[u8] {
&self.current_key
self.term_delta_decoder.term()
}
fn value(&self) -> &V {
&self.current_value
fn value(&self) -> &TermInfo {
&self.term_info_decoder.term_info()
}
}

View File

@@ -1,46 +1,54 @@
#![allow(should_implement_trait)]
use std::io::{self, Write};
use super::CheckPoint;
use fst;
use fst::raw::Fst;
use common::VInt;
use directory::ReadOnlySource;
use common::BinarySerializable;
use std::marker::PhantomData;
use common::CountingWriter;
use std::cmp::Ordering;
use postings::TermInfo;
use schema::FieldType;
use super::{TermDeltaEncoder, TermInfoDeltaEncoder, DeltaTermInfo};
use fst::raw::Node;
use super::streamer::stream_before;
use termdict::{TermDictionary, TermDictionaryBuilder, TermStreamer};
use super::{TermStreamerImpl, TermStreamerBuilderImpl};
use termdict::TermStreamerBuilder;
use std::mem::transmute;
const BLOCK_SIZE: usize = 1024;
const PADDING_SIZE: usize = 4;
const INDEX_INTERVAL: usize = 1024;
fn convert_fst_error(e: fst::Error) -> io::Error {
io::Error::new(io::ErrorKind::Other, e)
}
/// See [`TermDictionaryBuilder`](./trait.TermDictionaryBuilder.html)
pub struct TermDictionaryBuilderImpl<W, V = TermInfo>
where W: Write,
V: BinarySerializable + Default
{
write: CountingWriter<W>,
block_index: fst::MapBuilder<Vec<u8>>,
last_key: Vec<u8>,
len: usize,
_phantom_: PhantomData<V>,
/// Returns `true` iff the field's indexing options record token
/// positions, in which case the term dictionary also serializes
/// position offsets for each term.
fn has_positions(field_type: &FieldType) -> bool {
    match *field_type {
        // Only text fields may be indexed with positions; the predicate
        // is returned directly instead of the former
        // `if … { true } else { false }`.
        FieldType::Str(ref text_options) => {
            text_options.get_indexing_options().is_position_enabled()
        }
        _ => false,
    }
}
fn common_prefix_length(left: &[u8], right: &[u8]) -> usize {
left.iter()
.cloned()
.zip(right.iter().cloned())
.take_while(|&(b1, b2)| b1 == b2)
.count()
/// See [`TermDictionaryBuilder`](./trait.TermDictionaryBuilder.html)
pub struct TermDictionaryBuilderImpl<W> {
write: CountingWriter<W>,
term_delta_encoder: TermDeltaEncoder,
term_info_encoder: TermInfoDeltaEncoder,
block_index: fst::MapBuilder<Vec<u8>>,
checkpoints: Vec<u8>,
len: usize,
}
fn fill_last<'a>(fst: &'a Fst, mut node: Node<'a>, buffer: &mut Vec<u8>) {
while let Some(transition) = node.transitions().last() {
buffer.push(transition.inp);
@@ -48,14 +56,32 @@ fn fill_last<'a>(fst: &'a Fst, mut node: Node<'a>, buffer: &mut Vec<u8>) {
}
}
impl<W, V> TermDictionaryBuilderImpl<W, V>
where W: Write,
V: BinarySerializable + Default
impl<W> TermDictionaryBuilderImpl<W>
where
W: Write,
{
fn add_index_entry(&mut self) {
let stream_offset = self.write.written_bytes() as u32;
let term_info = self.term_info_encoder.term_info();
let postings_offset = term_info.postings_offset as u32;
let positions_offset = term_info.positions_offset as u32;
let checkpoint = CheckPoint {
stream_offset: stream_offset,
postings_offset: postings_offset,
positions_offset: positions_offset,
};
self.block_index
.insert(&self.last_key, self.write.written_bytes() as u64)
.unwrap();
.insert(
&self.term_delta_encoder.term(),
self.checkpoints.len() as u64,
)
.expect(
"Serializing fst on a Vec<u8> should never fail. \
Where your terms not in order maybe?",
);
checkpoint.serialize(&mut self.checkpoints).expect(
"Serializing checkpoint on a Vec<u8> should never fail.",
);
}
/// # Warning
@@ -66,59 +92,131 @@ impl<W, V> TermDictionaryBuilderImpl<W, V>
///
/// Prefer using `.insert(key, value)`
pub(crate) fn insert_key(&mut self, key: &[u8]) -> io::Result<()> {
if self.len % BLOCK_SIZE == 0 {
if self.len % INDEX_INTERVAL == 0 {
self.add_index_entry();
}
self.len += 1;
let common_len = common_prefix_length(key, &self.last_key);
VInt(common_len as u64).serialize(&mut self.write)?;
self.last_key.truncate(common_len);
self.last_key.extend_from_slice(&key[common_len..]);
VInt((key.len() - common_len) as u64)
.serialize(&mut self.write)?;
self.write.write_all(&key[common_len..])?;
self.term_delta_encoder.encode(key);
Ok(())
}
pub(crate) fn insert_value(&mut self, value: &V) -> io::Result<()> {
value.serialize(&mut self.write)?;
pub(crate) fn insert_value(&mut self, term_info: &TermInfo) -> io::Result<()> {
let delta_term_info = self.term_info_encoder.encode(term_info.clone());
let (prefix_len, suffix) = self.term_delta_encoder.prefix_suffix();
write_term_kv(
prefix_len,
suffix,
&delta_term_info,
self.term_info_encoder.has_positions,
&mut self.write,
)?;
self.len += 1;
Ok(())
}
}
impl<W, V> TermDictionaryBuilder<W, V> for TermDictionaryBuilderImpl<W, V>
where W: Write,
V: BinarySerializable + Default
/// Returns the number of bytes (1 to 4) needed to represent `n`.
///
/// `0` still requires one byte.
fn num_bytes_required(n: u32) -> u8 {
    // Bit-count arithmetic replaces the divide-by-256 loop:
    // round the number of significant bits up to whole bytes,
    // with a floor of one byte for n == 0.
    let significant_bits = 32 - n.leading_zeros();
    let num_bytes = (significant_bits + 7) / 8;
    std::cmp::max(num_bytes, 1) as u8
}
/// Serializes one `(term, term_info)` entry.
///
/// Layout: a 1-byte `code` whose bits encode the byte-widths used below
/// (bits 1-2: doc_freq width - 1; bits 3-4: postings-offset delta
/// width - 1; bits 5-6: positions-offset delta width - 1; bit 0: compact
/// prefix/suffix lengths), then the prefix/suffix lengths, the suffix
/// bytes, and the variable-width integer fields.
fn write_term_kv<W: Write>(
    prefix_len: usize,
    suffix: &[u8],
    delta_term_info: &DeltaTermInfo,
    has_positions: bool,
    write: &mut W,
) -> io::Result<()> {
    let suffix_len = suffix.len();
    let mut code = 0u8;
    // Minimal byte-widths for each field; widths are 1..=4 so that
    // (width - 1) fits in the 2 bits reserved per field.
    let num_bytes_docfreq = num_bytes_required(delta_term_info.doc_freq);
    let num_bytes_postings_offset = num_bytes_required(delta_term_info.delta_postings_offset);
    let num_bytes_positions_offset = num_bytes_required(delta_term_info.delta_positions_offset);
    code |= (num_bytes_docfreq - 1) << 1u8;
    code |= (num_bytes_postings_offset - 1) << 3u8;
    code |= (num_bytes_positions_offset - 1) << 5u8;
    if (prefix_len < 16) && (suffix_len < 16) {
        // Compact path: both lengths fit in one nibble each.
        code |= 1u8;
        write.write_all(
            &[
                code,
                (prefix_len as u8) | ((suffix_len as u8) << 4u8),
            ],
        )?;
    } else {
        // Fallback path: lengths serialized as full u32s.
        write.write_all(&[code])?;
        (prefix_len as u32).serialize(write)?;
        (suffix_len as u32).serialize(write)?;
    }
    write.write_all(suffix)?;
    // NOTE(review): `transmute` yields native-endian bytes, so the
    // on-disk format is endian-dependent; presumably the decoder's raw
    // pointer reads match — confirm if portability matters.
    {
        let bytes: [u8; 4] = unsafe { transmute(delta_term_info.doc_freq) };
        write.write_all(&bytes[0..num_bytes_docfreq as usize])?;
    }
    {
        let bytes: [u8; 4] = unsafe { transmute(delta_term_info.delta_postings_offset) };
        write.write_all(
            &bytes[0..num_bytes_postings_offset as usize],
        )?;
    }
    if has_positions {
        // Positions data is only written for fields that store positions.
        let bytes: [u8; 4] = unsafe { transmute(delta_term_info.delta_positions_offset) };
        write.write_all(
            &bytes[0..num_bytes_positions_offset as usize],
        )?;
        write.write_all(&[delta_term_info.positions_inner_offset])?;
    }
    Ok(())
}
impl<W> TermDictionaryBuilder<W> for TermDictionaryBuilderImpl<W>
where
W: Write,
{
/// Creates a new `TermDictionaryBuilder`
fn new(write: W) -> io::Result<Self> {
let buffer: Vec<u8> = vec![];
fn new(mut write: W, field_type: FieldType) -> io::Result<Self> {
let has_positions = has_positions(&field_type);
let has_positions_code = if has_positions { 255u8 } else { 0u8 };
write.write_all(&[has_positions_code])?;
Ok(TermDictionaryBuilderImpl {
write: CountingWriter::wrap(write),
block_index: fst::MapBuilder::new(buffer).expect("This cannot fail"),
last_key: Vec::with_capacity(128),
len: 0,
_phantom_: PhantomData,
})
write: CountingWriter::wrap(write),
term_delta_encoder: TermDeltaEncoder::default(),
term_info_encoder: TermInfoDeltaEncoder::new(has_positions),
block_index: fst::MapBuilder::new(vec![]).expect("This cannot fail"),
checkpoints: vec![],
len: 0,
})
}
/// Inserts a `(key, value)` pair in the term dictionary.
///
/// *Keys have to be inserted in order.*
fn insert<K: AsRef<[u8]>>(&mut self, key_ref: K, value: &V) -> io::Result<()> {
fn insert<K: AsRef<[u8]>>(&mut self, key_ref: K, value: &TermInfo) -> io::Result<()> {
let key = key_ref.as_ref();
self.insert_key(key)?;
self.insert_value(value)
self.insert_value(value)?;
Ok(())
}
/// Finalize writing the builder, and returns the underlying
/// `Write` object.
fn finish(mut self) -> io::Result<W> {
self.add_index_entry();
let (mut w, split_len) = self.write.finish()?;
self.write.write_all(&[0u8; PADDING_SIZE])?;
let fst_addr = self.write.written_bytes();
let fst_write = self.block_index.into_inner().map_err(convert_fst_error)?;
w.write_all(&fst_write)?;
(split_len as u64).serialize(&mut w)?;
self.write.write_all(&fst_write)?;
let check_points_addr = self.write.written_bytes();
let (mut w, _) = self.write.finish()?;
w.write_all(&self.checkpoints)?;
(fst_addr as u64).serialize(&mut w)?;
(check_points_addr as u64).serialize(&mut w)?;
w.flush()?;
Ok(w)
}
@@ -126,34 +224,37 @@ impl<W, V> TermDictionaryBuilder<W, V> for TermDictionaryBuilderImpl<W, V>
fn open_fst_index(source: ReadOnlySource) -> io::Result<fst::Map> {
Ok(fst::Map::from(match source {
ReadOnlySource::Anonymous(data) => {
try!(Fst::from_shared_bytes(data.data, data.start, data.len)
.map_err(convert_fst_error))
}
ReadOnlySource::Mmap(mmap_readonly) => {
try!(Fst::from_mmap(mmap_readonly).map_err(convert_fst_error))
}
}))
use self::ReadOnlySource::*;
let fst_result = match source {
Anonymous(data) => Fst::from_shared_bytes(data.data, data.start, data.len),
Mmap(mmap_readonly) => Fst::from_mmap(mmap_readonly),
};
let fst = fst_result.map_err(convert_fst_error)?;
Ok(fst::Map::from(fst))
}
/// See [`TermDictionary`](./trait.TermDictionary.html)
pub struct TermDictionaryImpl<V = TermInfo>
where V: BinarySerializable + Default
{
pub struct TermDictionaryImpl {
stream_data: ReadOnlySource,
fst_index: fst::Map,
_phantom_: PhantomData<V>,
checkpoints_data: ReadOnlySource,
has_positions: bool,
}
impl<V> TermDictionaryImpl<V>
where V: BinarySerializable + Default
{
impl TermDictionaryImpl {
pub(crate) fn stream_data(&self) -> &[u8] {
self.stream_data.as_slice()
}
pub(crate) fn strictly_previous_key(&self, key: &[u8]) -> (Vec<u8>, u64) {
pub(crate) fn strictly_previous_key(&self, key: &[u8]) -> (Vec<u8>, CheckPoint) {
let (term, checkpoint_offset) = self.strictly_previous_key_checkpoint_offset(key);
let mut checkpoint_data = &self.checkpoints_data.as_slice()[checkpoint_offset..];
let checkpoint =
CheckPoint::deserialize(&mut checkpoint_data).expect("Checkpoint data is corrupted");
(term, checkpoint)
}
fn strictly_previous_key_checkpoint_offset(&self, key: &[u8]) -> (Vec<u8>, usize) {
let fst_map = &self.fst_index;
let fst = fst_map.as_fst();
let mut node = fst.root();
@@ -186,12 +287,12 @@ impl<V> TermDictionaryImpl<V>
result.push(last_transition.inp);
let fork_node = fst.node(last_transition.addr);
fill_last(fst, fork_node, &mut result);
let val = fst_map.get(&result).unwrap();
let val = fst_map.get(&result).expect("Fst data corrupted") as usize;
return (result, val);
} else if cur_node.is_final() {
// the previous key is a prefix
let result_buffer = Vec::from(&key[..i]);
let val = fst_map.get(&result_buffer).unwrap();
let val = fst_map.get(&result_buffer).expect("Fst data corrupted") as usize;
return (result_buffer, val);
}
}
@@ -200,51 +301,70 @@ impl<V> TermDictionaryImpl<V>
}
impl<'a, V> TermDictionary<'a, V> for TermDictionaryImpl<V>
where V: BinarySerializable + Default + 'a
{
type Streamer = TermStreamerImpl<'a, V>;
type StreamBuilder = TermStreamerBuilderImpl<'a, V>;
impl<'a> TermDictionary<'a> for TermDictionaryImpl {
type Streamer = TermStreamerImpl<'a>;
type StreamBuilder = TermStreamerBuilderImpl<'a>;
/// Opens a `TermDictionary` given a data source.
fn from_source(source: ReadOnlySource) -> io::Result<Self> {
let total_len = source.len();
let length_offset = total_len - 8;
let split_len: usize = {
let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..];
u64::deserialize(&mut split_len_buffer)? as usize
};
let stream_data = source.slice(0, split_len);
let fst_data = source.slice(split_len, length_offset);
let fst_index = open_fst_index(fst_data)?;
fn from_source(mut source: ReadOnlySource) -> Self {
let has_positions = source.slice(0, 1)[0] == 255u8;
source = source.slice_from(1);
Ok(TermDictionaryImpl {
stream_data: stream_data,
fst_index: fst_index,
_phantom_: PhantomData,
})
let total_len = source.len();
let (body, footer) = source.split(total_len - 16);
let mut footer_buffer: &[u8] = footer.as_slice();
let fst_addr = u64::deserialize(&mut footer_buffer).expect(
"deserializing 8 byte should never fail",
) as usize;
let checkpoints_addr = u64::deserialize(&mut footer_buffer).expect(
"deserializing 8 byte should never fail",
) as usize;
let stream_data = body.slice(0, fst_addr - PADDING_SIZE);
let fst_data = body.slice(fst_addr, checkpoints_addr);
let checkpoints_data = body.slice_from(checkpoints_addr);
let fst_index = open_fst_index(fst_data).expect("Index FST data corrupted");
TermDictionaryImpl {
has_positions: has_positions,
stream_data: stream_data,
checkpoints_data: checkpoints_data,
fst_index: fst_index,
}
}
/// Lookups the value corresponding to the key.
fn get<K: AsRef<[u8]>>(&self, target_key: K) -> Option<V> {
let mut streamer = stream_before(self, target_key.as_ref());
while streamer.advance() {
let position = streamer.key().cmp(target_key.as_ref());
match position {
Ordering::Less => {}
Ordering::Equal => return Some(streamer.extract_value()),
Ordering::Greater => {
return None;
}
}
fn get<K: AsRef<[u8]>>(&self, target_key: K) -> Option<TermInfo> {
let mut streamer = self.range().ge(&target_key).into_stream();
if streamer.advance() && streamer.key() == target_key.as_ref() {
Some(streamer.value().clone())
} else {
None
}
None
}
/// Returns a range builder, to stream all of the terms
/// within an interval.
fn range(&'a self) -> Self::StreamBuilder {
Self::StreamBuilder::new(self)
Self::StreamBuilder::new(self, self.has_positions)
}
}
#[cfg(test)]
mod tests {
use super::num_bytes_required;
#[test]
fn test_num_bytes_required() {
assert_eq!(num_bytes_required(0), 1);
assert_eq!(num_bytes_required(1), 1);
assert_eq!(num_bytes_required(255), 1);
assert_eq!(num_bytes_required(256), 2);
assert_eq!(num_bytes_required(u32::max_value()), 4);
}
}