This commit is contained in:
Paul Masurel
2017-08-28 17:42:26 +09:00
parent 8d05b8f7b2
commit f8710bd4b0
90 changed files with 2291 additions and 1795 deletions

View File

@@ -91,9 +91,11 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
let mut old_man_doc = Document::default();
old_man_doc.add_text(title, "The Old Man and the Sea");
old_man_doc.add_text(body,
"He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish.");
old_man_doc.add_text(
body,
"He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish.",
);
// ... and add it to the `IndexWriter`.
index_writer.add_document(old_man_doc);

View File

@@ -38,10 +38,11 @@ impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
}
impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Right> {
fn set_segment(&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader)
-> Result<()> {
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
try!(self.left.set_segment(segment_local_id, segment));
try!(self.right.set_segment(segment_local_id, segment));
Ok(())

View File

@@ -45,11 +45,11 @@ mod tests {
#[bench]
fn build_collector(b: &mut Bencher) {
b.iter(|| {
let mut count_collector = CountCollector::default();
for doc in 0..1_000_000 {
count_collector.collect(doc, 1f32);
}
count_collector.count()
});
let mut count_collector = CountCollector::default();
for doc in 0..1_000_000 {
count_collector.collect(doc, 1f32);
}
count_collector.count()
});
}
}

View File

@@ -15,8 +15,9 @@ use SegmentLocalId;
/// Facet collector for i64/u64 fast field
pub struct FacetCollector<T>
where T: FastFieldReader,
T::ValueType: Eq + Hash
where
T: FastFieldReader,
T::ValueType: Eq + Hash,
{
counters: HashMap<T::ValueType, u64>,
field: Field,
@@ -25,8 +26,9 @@ pub struct FacetCollector<T>
impl<T> FacetCollector<T>
where T: FastFieldReader,
T::ValueType: Eq + Hash
where
T: FastFieldReader,
T::ValueType: Eq + Hash,
{
/// Creates a new facet collector for aggregating a given field.
pub fn new(field: Field) -> FacetCollector<T> {
@@ -40,8 +42,9 @@ impl<T> FacetCollector<T>
impl<T> Collector for FacetCollector<T>
where T: FastFieldReader,
T::ValueType: Eq + Hash
where
T: FastFieldReader,
T::ValueType: Eq + Hash,
{
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.ff_reader = Some(reader.get_fast_field_reader(self.field)?);
@@ -51,7 +54,9 @@ impl<T> Collector for FacetCollector<T>
fn collect(&mut self, doc: DocId, _: Score) {
let val = self.ff_reader
.as_ref()
.expect("collect() was called before set_segment. This should never happen.")
.expect(
"collect() was called before set_segment. This should never happen.",
)
.get(doc);
*(self.counters.entry(val).or_insert(0)) += 1;
}

View File

@@ -51,20 +51,22 @@ pub use self::chained_collector::chain;
pub trait Collector {
/// `set_segment` is called before beginning to enumerate
/// on this segment.
fn set_segment(&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader)
-> Result<()>;
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()>;
/// The query pushes the scored document to the collector via this method.
fn collect(&mut self, doc: DocId, score: Score);
}
impl<'a, C: Collector> Collector for &'a mut C {
fn set_segment(&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader)
-> Result<()> {
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
(*self).set_segment(segment_local_id, segment)
}
/// The query pushes the scored document to the collector via this method.
@@ -169,12 +171,12 @@ pub mod tests {
#[bench]
fn build_collector(b: &mut Bencher) {
b.iter(|| {
let mut count_collector = CountCollector::default();
let docs: Vec<u32> = (0..1_000_000).collect();
for doc in docs {
count_collector.collect(doc, 1f32);
}
count_collector.count()
});
let mut count_collector = CountCollector::default();
let docs: Vec<u32> = (0..1_000_000).collect();
for doc in docs {
count_collector.collect(doc, 1f32);
}
count_collector.count()
});
}
}

View File

@@ -23,10 +23,11 @@ impl<'a> MultiCollector<'a> {
impl<'a> Collector for MultiCollector<'a> {
fn set_segment(&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader)
-> Result<()> {
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
for collector in &mut self.collectors {
try!(collector.set_segment(segment_local_id, segment));
}
@@ -53,8 +54,8 @@ mod tests {
let mut top_collector = TopCollector::with_limit(2);
let mut count_collector = CountCollector::default();
{
let mut collectors = MultiCollector::from(vec![&mut top_collector,
&mut count_collector]);
let mut collectors =
MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
collectors.collect(1, 0.2);
collectors.collect(2, 0.1);
collectors.collect(3, 0.5);

View File

@@ -24,10 +24,9 @@ impl PartialOrd for GlobalScoredDoc {
impl Ord for GlobalScoredDoc {
#[inline]
fn cmp(&self, other: &GlobalScoredDoc) -> Ordering {
other
.score
.partial_cmp(&self.score)
.unwrap_or_else(|| other.doc_address.cmp(&self.doc_address))
other.score.partial_cmp(&self.score).unwrap_or_else(|| {
other.doc_address.cmp(&self.doc_address)
})
}
}
@@ -87,7 +86,9 @@ impl TopCollector {
scored_docs.sort();
scored_docs
.into_iter()
.map(|GlobalScoredDoc { score, doc_address }| (score, doc_address))
.map(|GlobalScoredDoc { score, doc_address }| {
(score, doc_address)
})
.collect()
}
@@ -108,14 +109,13 @@ impl Collector for TopCollector {
fn collect(&mut self, doc: DocId, score: Score) {
if self.at_capacity() {
// It's ok to unwrap as long as a limit of 0 is forbidden.
let limit_doc: GlobalScoredDoc =
*self.heap
.peek()
.expect("Top collector with size 0 is forbidden");
let limit_doc: GlobalScoredDoc = *self.heap.peek().expect(
"Top collector with size 0 is forbidden",
);
if limit_doc.score < score {
let mut mut_head = self.heap
.peek_mut()
.expect("Top collector with size 0 is forbidden");
let mut mut_head = self.heap.peek_mut().expect(
"Top collector with size 0 is forbidden",
);
mut_head.score = score;
mut_head.doc_address = DocAddress(self.segment_id, doc);
}

View File

@@ -88,7 +88,8 @@ impl BitPacker {
pub struct BitUnpacker<Data>
where Data: Deref<Target = [u8]>
where
Data: Deref<Target = [u8]>,
{
num_bits: usize,
mask: u64,
@@ -96,7 +97,8 @@ pub struct BitUnpacker<Data>
}
impl<Data> BitUnpacker<Data>
where Data: Deref<Target = [u8]>
where
Data: Deref<Target = [u8]>,
{
pub fn new(data: Data, num_bits: usize) -> BitUnpacker<Data> {
let mask: u64 = if num_bits == 64 {
@@ -121,8 +123,10 @@ impl<Data> BitUnpacker<Data>
let addr_in_bits = idx * num_bits;
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
debug_assert!(addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes.");
debug_assert!(
addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes."
);
let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
(val_shifted & mask)

View File

@@ -10,13 +10,12 @@ use common::BinarySerializable;
/// A `CompositeWrite` is used to write a `CompositeFile`.
pub struct CompositeWrite<W=WritePtr> {
pub struct CompositeWrite<W = WritePtr> {
write: CountingWriter<W>,
offsets: HashMap<Field, usize>,
}
impl<W: Write> CompositeWrite<W> {
/// Create a new API writer that writes a composite file
/// in a given write.
pub fn wrap(w: W) -> CompositeWrite<W> {
@@ -43,7 +42,8 @@ impl<W: Write> CompositeWrite<W> {
let footer_offset = self.write.written_bytes();
VInt(self.offsets.len() as u64).serialize(&mut self.write)?;
let mut offset_fields: Vec<_> = self.offsets.iter()
let mut offset_fields: Vec<_> = self.offsets
.iter()
.map(|(field, offset)| (offset, field))
.collect();
@@ -51,7 +51,9 @@ impl<W: Write> CompositeWrite<W> {
let mut prev_offset = 0;
for (offset, field) in offset_fields {
VInt( (offset - prev_offset) as u64).serialize(&mut self.write)?;
VInt((offset - prev_offset) as u64).serialize(
&mut self.write,
)?;
field.serialize(&mut self.write)?;
prev_offset = *offset;
}
@@ -77,7 +79,6 @@ pub struct CompositeFile {
}
impl CompositeFile {
/// Opens a composite file stored in a given
/// `ReadOnlySource`.
pub fn open(data: ReadOnlySource) -> io::Result<CompositeFile> {
@@ -90,8 +91,8 @@ impl CompositeFile {
let mut footer_buffer = footer_data.as_slice();
let num_fields = VInt::deserialize(&mut footer_buffer)?.0 as usize;
let mut fields = vec!();
let mut offsets = vec!();
let mut fields = vec![];
let mut offsets = vec![];
let mut field_index = HashMap::new();
@@ -106,7 +107,7 @@ impl CompositeFile {
for i in 0..num_fields {
let field = fields[i];
let start_offset = offsets[i];
let end_offset = offsets[i+1];
let end_offset = offsets[i + 1];
field_index.insert(field, (start_offset, end_offset));
}
@@ -128,11 +129,9 @@ impl CompositeFile {
/// Returns the `ReadOnlySource` associated
/// to a given `Field` and stored in a `CompositeFile`.
pub fn open_read(&self, field: Field) -> Option<ReadOnlySource> {
self.offsets_index
.get(&field)
.map(|&(from, to)| {
self.data.slice(from, to)
})
self.offsets_index.get(&field).map(|&(from, to)| {
self.data.slice(from, to)
})
}
}
@@ -189,4 +188,4 @@ mod test {
}
}
}

View File

@@ -101,9 +101,9 @@ impl BinarySerializable for String {
fn deserialize<R: Read>(reader: &mut R) -> io::Result<String> {
let string_length = VInt::deserialize(reader)?.val() as usize;
let mut result = String::with_capacity(string_length);
reader
.take(string_length as u64)
.read_to_string(&mut result)?;
reader.take(string_length as u64).read_to_string(
&mut result,
)?;
Ok(result)
}
}

View File

@@ -24,16 +24,14 @@ impl<'a> OpenTimer<'a> {
impl<'a> Drop for OpenTimer<'a> {
fn drop(&mut self) {
self.timer_tree
.timings
.push(Timing {
name: self.name,
duration: self.start
.to(PreciseTime::now())
.num_microseconds()
.unwrap(),
depth: self.depth,
});
self.timer_tree.timings.push(Timing {
name: self.name,
duration: self.start
.to(PreciseTime::now())
.num_microseconds()
.unwrap(),
depth: self.depth,
});
}
}

View File

@@ -47,7 +47,12 @@ impl BinarySerializable for VInt {
}
shift += 7;
}
_ => return Err(io::Error::new(io::ErrorKind::InvalidData, "Reach end of buffer")),
_ => {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"Reach end of buffer",
))
}
}
}
Ok(VInt(result))

View File

@@ -5,13 +5,13 @@ mod stream;
pub use self::stream::CompressedIntStream;
#[cfg(not(feature="simdcompression"))]
#[cfg(not(feature = "simdcompression"))]
mod pack {
mod compression_pack_nosimd;
pub use self::compression_pack_nosimd::{BlockEncoder, BlockDecoder};
}
#[cfg(feature="simdcompression")]
#[cfg(feature = "simdcompression")]
mod pack {
mod compression_pack_simd;
pub use self::compression_pack_simd::{BlockEncoder, BlockDecoder};
@@ -19,13 +19,13 @@ mod pack {
pub use self::pack::{BlockEncoder, BlockDecoder};
#[cfg( any(not(feature="simdcompression"), target_env="msvc") )]
#[cfg(any(not(feature = "simdcompression"), target_env = "msvc"))]
mod vint {
mod compression_vint_nosimd;
pub(crate) use self::compression_vint_nosimd::*;
}
#[cfg( all(feature="simdcompression", not(target_env="msvc")) )]
#[cfg(all(feature = "simdcompression", not(target_env = "msvc")))]
mod vint {
mod compression_vint_simd;
pub(crate) use self::compression_vint_simd::*;
@@ -70,21 +70,19 @@ pub trait VIntDecoder {
/// For instance, if delta encoded are `1, 3, 9`, and the
/// `offset` is 5, then the output will be:
/// `5 + 1 = 6, 6 + 3= 9, 9 + 9 = 18`
fn uncompress_vint_sorted<'a>(&mut self,
compressed_data: &'a [u8],
offset: u32,
num_els: usize)
-> usize;
fn uncompress_vint_sorted<'a>(
&mut self,
compressed_data: &'a [u8],
offset: u32,
num_els: usize,
) -> usize;
/// Uncompress an array of `u32s`, compressed using variable
/// byte encoding.
///
/// The method takes the number of ints to decompress, and returns
/// the number of bytes that were read to decompress them.
fn uncompress_vint_unsorted<'a>(&mut self,
compressed_data: &'a [u8],
num_els: usize)
-> usize;
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize;
}
impl VIntEncoder for BlockEncoder {
@@ -98,19 +96,17 @@ impl VIntEncoder for BlockEncoder {
}
impl VIntDecoder for BlockDecoder {
fn uncompress_vint_sorted<'a>(&mut self,
compressed_data: &'a [u8],
offset: u32,
num_els: usize)
-> usize {
fn uncompress_vint_sorted<'a>(
&mut self,
compressed_data: &'a [u8],
offset: u32,
num_els: usize,
) -> usize {
self.output_len = num_els;
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
}
fn uncompress_vint_unsorted<'a>(&mut self,
compressed_data: &'a [u8],
num_els: usize)
-> usize {
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize {
self.output_len = num_els;
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
}
@@ -125,7 +121,6 @@ pub mod tests {
use super::*;
use tests;
use test::Bencher;
use std::iter;
#[test]
fn test_encode_sorted_block() {
@@ -236,7 +231,7 @@ pub mod tests {
#[test]
fn test_all_docs_compression_numbits() {
for num_bits in 0..33 {
let mut data: Vec<u32> = iter::repeat(0u32).take(128).collect();
let mut data = [0u32; 128];
if num_bits > 0 {
data[0] = 1 << (num_bits - 1);
}
@@ -262,7 +257,9 @@ pub mod tests {
let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001);
let compressed = encoder.compress_vint_sorted(&data, 0u32);
let mut decoder = BlockDecoder::new();
b.iter(|| { decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT); });
b.iter(|| {
decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT);
});
}
}

View File

@@ -25,9 +25,9 @@ pub fn compress_sorted(vals: &mut [u32], mut output: &mut [u8], offset: u32) ->
bit_packer.write(*val, &mut output).unwrap();
}
1 +
bit_packer
.close(&mut output)
.expect("packing in memory should never fail")
bit_packer.close(&mut output).expect(
"packing in memory should never fail",
)
}
@@ -56,10 +56,9 @@ impl BlockEncoder {
pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] {
let compressed_size: usize = {
let mut output: &mut [u8] = &mut self.output;
let max = vals.iter()
.cloned()
.max()
.expect("compress unsorted called with an empty array");
let max = vals.iter().cloned().max().expect(
"compress unsorted called with an empty array",
);
let num_bits = compute_num_bits(max);
output.write_all(&[num_bits]).unwrap();
let mut bit_packer = BitPacker::new(num_bits as usize);
@@ -67,9 +66,9 @@ impl BlockEncoder {
bit_packer.write(*val, &mut output).unwrap();
}
1 +
bit_packer
.close(&mut output)
.expect("packing in memory should never fail")
bit_packer.close(&mut output).expect(
"packing in memory should never fail",
)
};
&self.output[..compressed_size]
}
@@ -93,10 +92,11 @@ impl BlockDecoder {
}
}
pub fn uncompress_block_sorted<'a>(&mut self,
compressed_data: &'a [u8],
mut offset: u32)
-> &'a [u8] {
pub fn uncompress_block_sorted<'a>(
&mut self,
compressed_data: &'a [u8],
mut offset: u32,
) -> &'a [u8] {
let consumed_size = {
let num_bits = compressed_data[0];
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize);

View File

@@ -8,10 +8,11 @@ mod simdcomp {
extern "C" {
pub fn compress_sorted(data: *const u32, output: *mut u8, offset: u32) -> size_t;
pub fn uncompress_sorted(compressed_data: *const u8,
output: *mut u32,
offset: u32)
-> size_t;
pub fn uncompress_sorted(
compressed_data: *const u8,
output: *mut u32,
offset: u32,
) -> size_t;
pub fn compress_unsorted(data: *const u32, output: *mut u8) -> size_t;
@@ -78,10 +79,7 @@ impl BlockDecoder {
}
}
pub fn uncompress_block_sorted(&mut self,
compressed_data: &[u8],
offset: u32)
-> usize {
pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32) -> usize {
let consumed_size = uncompress_sorted(compressed_data, &mut self.output, offset);
self.output_len = COMPRESSION_BLOCK_SIZE;
consumed_size

View File

@@ -16,7 +16,6 @@ pub struct CompressedIntStream {
}
impl CompressedIntStream {
/// Opens a compressed int stream.
pub(crate) fn wrap(source: ReadOnlySource) -> CompressedIntStream {
CompressedIntStream {
@@ -35,17 +34,21 @@ impl CompressedIntStream {
let available = COMPRESSION_BLOCK_SIZE - self.inner_offset;
if num_els >= available {
if available > 0 {
let uncompressed_block = &self.block_decoder.output_array()[self.inner_offset..];
let uncompressed_block = &self.block_decoder.output_array()
[self.inner_offset..];
&mut output[start..start + available].clone_from_slice(uncompressed_block);
}
num_els -= available;
start += available;
let num_consumed_bytes = self.block_decoder.uncompress_block_unsorted(self.buffer.as_ref());
let num_consumed_bytes = self.block_decoder.uncompress_block_unsorted(
self.buffer.as_ref(),
);
self.buffer.advance(num_consumed_bytes);
self.inner_offset = 0;
}
else {
let uncompressed_block = &self.block_decoder.output_array()[self.inner_offset..self.inner_offset + num_els];
} else {
let uncompressed_block = &self.block_decoder.output_array()[self.inner_offset..
self.inner_offset +
num_els];
&output[start..start + num_els].clone_from_slice(uncompressed_block);
self.inner_offset += num_els;
break;
@@ -62,8 +65,7 @@ impl CompressedIntStream {
let available = COMPRESSION_BLOCK_SIZE - self.inner_offset;
if available >= skip_len {
self.inner_offset += skip_len;
}
else {
} else {
skip_len -= available;
// entirely skip decompressing some blocks.
while skip_len >= COMPRESSION_BLOCK_SIZE {
@@ -72,7 +74,9 @@ impl CompressedIntStream {
let block_len = compressed_block_size(num_bits);
self.buffer.advance(block_len);
}
let num_consumed_bytes = self.block_decoder.uncompress_block_unsorted(self.buffer.as_ref());
let num_consumed_bytes = self.block_decoder.uncompress_block_unsorted(
self.buffer.as_ref(),
);
self.buffer.advance(num_consumed_bytes);
self.inner_offset = skip_len;
}
@@ -90,7 +94,7 @@ pub mod tests {
use directory::ReadOnlySource;
fn create_stream_buffer() -> ReadOnlySource {
let mut buffer: Vec<u8> = vec!();
let mut buffer: Vec<u8> = vec![];
let mut encoder = BlockEncoder::new();
let vals: Vec<u32> = (0u32..1_025u32).collect();
for chunk in vals.chunks(COMPRESSION_BLOCK_SIZE) {

View File

@@ -1,6 +1,10 @@
#[inline(always)]
pub(crate) fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32) -> &'a [u8] {
pub(crate) fn compress_sorted<'a>(
input: &[u32],
output: &'a mut [u8],
mut offset: u32,
) -> &'a [u8] {
let mut byte_written = 0;
for &v in input {
let mut to_encode: u32 = v - offset;
@@ -43,10 +47,11 @@ pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a
}
#[inline(always)]
pub(crate) fn uncompress_sorted<'a>(compressed_data: &'a [u8],
output: &mut [u32],
offset: u32)
-> &'a [u8] {
pub(crate) fn uncompress_sorted<'a>(
compressed_data: &'a [u8],
output: &mut [u32],
offset: u32,
) -> &'a [u8] {
let mut read_byte = 0;
let mut result = offset;
let num_els = output.len();

View File

@@ -4,24 +4,27 @@ mod streamvbyte {
use libc::size_t;
extern "C" {
pub fn streamvbyte_delta_encode(data: *const u32,
num_els: u32,
output: *mut u8,
offset: u32)
-> size_t;
pub fn streamvbyte_delta_encode(
data: *const u32,
num_els: u32,
output: *mut u8,
offset: u32,
) -> size_t;
pub fn streamvbyte_delta_decode(compressed_data: *const u8,
output: *mut u32,
num_els: u32,
offset: u32)
-> size_t;
pub fn streamvbyte_delta_decode(
compressed_data: *const u8,
output: *mut u32,
num_els: u32,
offset: u32,
) -> size_t;
pub fn streamvbyte_encode(data: *const u32, num_els: u32, output: *mut u8) -> size_t;
pub fn streamvbyte_decode(compressed_data: *const u8,
output: *mut u32,
num_els: usize)
-> size_t;
pub fn streamvbyte_decode(
compressed_data: *const u8,
output: *mut u32,
num_els: usize,
) -> size_t;
}
}
@@ -29,10 +32,12 @@ mod streamvbyte {
#[inline(always)]
pub(crate) fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], offset: u32) -> &'a [u8] {
let compress_length = unsafe {
streamvbyte::streamvbyte_delta_encode(input.as_ptr(),
input.len() as u32,
output.as_mut_ptr(),
offset)
streamvbyte::streamvbyte_delta_encode(
input.as_ptr(),
input.len() as u32,
output.as_mut_ptr(),
offset,
)
};
&output[..compress_length]
}
@@ -47,15 +52,18 @@ pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a
}
#[inline(always)]
pub(crate) fn uncompress_sorted<'a>(compressed_data: &'a [u8],
output: &mut [u32],
offset: u32)
-> usize {
pub(crate) fn uncompress_sorted<'a>(
compressed_data: &'a [u8],
output: &mut [u32],
offset: u32,
) -> usize {
unsafe {
streamvbyte::streamvbyte_delta_decode(compressed_data.as_ptr(),
output.as_mut_ptr(),
output.len() as u32,
offset)
streamvbyte::streamvbyte_delta_decode(
compressed_data.as_ptr(),
output.as_mut_ptr(),
output.len() as u32,
offset,
)
}
}

View File

@@ -48,9 +48,10 @@ impl Index {
pub fn create_in_ram(schema: Schema) -> Index {
let ram_directory = RAMDirectory::create();
// unwrap is ok here
let directory = ManagedDirectory::new(ram_directory)
.expect("Creating a managed directory from a brand new RAM directory \
should never fail.");
let directory = ManagedDirectory::new(ram_directory).expect(
"Creating a managed directory from a brand new RAM directory \
should never fail.",
);
Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail")
}
@@ -127,10 +128,11 @@ impl Index {
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// # Panics
/// If the heap size per thread is too small, panics.
pub fn writer_with_num_threads(&self,
num_threads: usize,
heap_size_in_bytes: usize)
-> Result<IndexWriter> {
pub fn writer_with_num_threads(
&self,
num_threads: usize,
heap_size_in_bytes: usize,
) -> Result<IndexWriter> {
open_index_writer(self, num_threads, heap_size_in_bytes)
}
@@ -155,10 +157,12 @@ impl Index {
/// Returns the list of segments that are searchable
pub fn searchable_segments(&self) -> Result<Vec<Segment>> {
Ok(self.searchable_segment_metas()?
.into_iter()
.map(|segment_meta| self.segment(segment_meta))
.collect())
Ok(
self.searchable_segment_metas()?
.into_iter()
.map(|segment_meta| self.segment(segment_meta))
.collect(),
)
}
#[doc(hidden)]
@@ -190,10 +194,12 @@ impl Index {
/// Returns the list of segment ids that are searchable.
pub fn searchable_segment_ids(&self) -> Result<Vec<SegmentId>> {
Ok(self.searchable_segment_metas()?
.iter()
.map(|segment_meta| segment_meta.id())
.collect())
Ok(
self.searchable_segment_metas()?
.iter()
.map(|segment_meta| segment_meta.id())
.collect(),
)
}
/// Creates a new generation of searchers after
@@ -203,10 +209,12 @@ impl Index {
/// published or after a merge.
pub fn load_searchers(&self) -> Result<()> {
let searchable_segments = self.searchable_segments()?;
let segment_readers: Vec<SegmentReader> = try!(searchable_segments
.into_iter()
.map(SegmentReader::open)
.collect());
let segment_readers: Vec<SegmentReader> = try!(
searchable_segments
.into_iter()
.map(SegmentReader::open)
.collect()
);
let searchers = (0..NUM_SEARCHERS)
.map(|_| Searcher::from(segment_readers.clone()))
.collect();

View File

@@ -9,7 +9,7 @@ use core::SegmentMeta;
/// * the index docstamp
/// * the schema
///
#[derive(Clone,Debug,Serialize, Deserialize)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct IndexMeta {
pub segments: Vec<SegmentMeta>,
pub schema: Schema,

View File

@@ -1,6 +1,5 @@
use directory::{SourceRead, ReadOnlySource};
use termdict::{TermDictionary, TermDictionaryImpl};
use std::io;
use postings::{SegmentPostings, BlockSegmentPostings};
use postings::TermInfo;
use postings::SegmentPostingsOption;
@@ -33,22 +32,21 @@ pub struct InvertedIndexReader {
}
impl InvertedIndexReader {
pub(crate) fn new(
termdict_source: ReadOnlySource,
postings_source: ReadOnlySource,
positions_source: ReadOnlySource,
delete_bitset: DeleteBitSet,
schema: Schema,
) -> io::Result<InvertedIndexReader> {
) -> InvertedIndexReader {
Ok(InvertedIndexReader {
termdict: TermDictionaryImpl::from_source(termdict_source)?,
InvertedIndexReader {
termdict: TermDictionaryImpl::from_source(termdict_source),
postings_source: postings_source,
positions_source: positions_source,
delete_bitset: delete_bitset,
schema: schema,
})
}
}
/// Returns the term info associated with the term.
@@ -72,9 +70,11 @@ impl InvertedIndexReader {
/// # Warning
///
/// This does not reset the positions list.
pub fn reset_block_postings_from_terminfo(&self,
term_info: &TermInfo,
block_postings: &mut BlockSegmentPostings) {
pub fn reset_block_postings_from_terminfo(
&self,
term_info: &TermInfo,
block_postings: &mut BlockSegmentPostings,
) {
let offset = term_info.postings_offset as usize;
let end_source = self.postings_source.len();
let postings_slice = self.postings_source.slice(offset, end_source);
@@ -88,27 +88,30 @@ impl InvertedIndexReader {
/// This method is for an advanced usage only.
///
/// Most users should prefer using `read_postings` instead.
pub fn read_block_postings_from_terminfo(&self,
term_info: &TermInfo,
option: SegmentPostingsOption)
-> BlockSegmentPostings {
pub fn read_block_postings_from_terminfo(
&self,
term_info: &TermInfo,
option: SegmentPostingsOption,
) -> BlockSegmentPostings {
let offset = term_info.postings_offset as usize;
let postings_data = self.postings_source.slice_from(offset);
let has_freq = option.has_freq();
BlockSegmentPostings::from_data(
term_info.doc_freq as usize,
SourceRead::from(postings_data),
has_freq)
has_freq,
)
}
/// Returns a posting object given a `term_info`.
/// This method is for an advanced usage only.
///
/// Most users should prefer using `read_postings` instead.
pub fn read_postings_from_terminfo(&self,
term_info: &TermInfo,
option: SegmentPostingsOption)
-> SegmentPostings {
pub fn read_postings_from_terminfo(
&self,
term_info: &TermInfo,
option: SegmentPostingsOption,
) -> SegmentPostings {
let block_postings = self.read_block_postings_from_terminfo(term_info, option);
let delete_bitset = self.delete_bitset.clone();
let position_stream = {
@@ -118,16 +121,11 @@ impl InvertedIndexReader {
let mut stream = CompressedIntStream::wrap(positions_source);
stream.skip(term_info.positions_inner_offset as usize);
Some(stream)
}
else {
} else {
None
}
};
SegmentPostings::from_block_postings(
block_postings,
delete_bitset,
position_stream
)
SegmentPostings::from_block_postings(block_postings, delete_bitset, position_stream)
}
/// Returns the segment postings associated with the term, and with the given option,
@@ -140,16 +138,20 @@ impl InvertedIndexReader {
/// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a
/// `TextIndexingOptions` that does not index position will return a `SegmentPostings`
/// with `DocId`s and frequencies.
pub fn read_postings(&self,
term: &Term,
option: SegmentPostingsOption)
-> Option<SegmentPostings> {
pub fn read_postings(
&self,
term: &Term,
option: SegmentPostingsOption,
) -> Option<SegmentPostings> {
let field = term.field();
let field_entry = self.schema.get_field_entry(field);
let term_info = get!(self.get_term_info(term));
let maximum_option = get!(field_entry.field_type().get_segment_postings_option());
let best_effort_option = cmp::min(maximum_option, option);
Some(self.read_postings_from_terminfo(&term_info, best_effort_option))
Some(self.read_postings_from_terminfo(
&term_info,
best_effort_option,
))
}
/// Returns the number of documents containing the term.

View File

@@ -76,8 +76,11 @@ impl<T> Pool<T> {
if former_generation >= generation {
break;
}
self.freshest_generation
.compare_and_swap(former_generation, generation, Ordering::SeqCst);
self.freshest_generation.compare_and_swap(
former_generation,
generation,
Ordering::SeqCst,
);
}
}
@@ -91,9 +94,9 @@ impl<T> Pool<T> {
let gen_item = self.queue.pop();
if gen_item.generation >= generation {
return LeasedItem {
gen_item: Some(gen_item),
recycle_queue: self.queue.clone(),
};
gen_item: Some(gen_item),
recycle_queue: self.queue.clone(),
};
} else {
// this searcher is obsolete,
// removing it from the pool.
@@ -113,25 +116,26 @@ impl<T> Deref for LeasedItem<T> {
fn deref(&self) -> &T {
&self.gen_item
.as_ref()
.expect("Unwrapping a leased item should never fail")
.item // unwrap is safe here
.as_ref()
.expect("Unwrapping a leased item should never fail")
.item // unwrap is safe here
}
}
impl<T> DerefMut for LeasedItem<T> {
fn deref_mut(&mut self) -> &mut T {
&mut self.gen_item
.as_mut()
.expect("Unwrapping a mut leased item should never fail")
.item // unwrap is safe here
.as_mut()
.expect("Unwrapping a mut leased item should never fail")
.item // unwrap is safe here
}
}
impl<T> Drop for LeasedItem<T> {
fn drop(&mut self) {
let gen_item: GenerationItem<T> = mem::replace(&mut self.gen_item, None)
.expect("Unwrapping a leased item should never fail");
let gen_item: GenerationItem<T> = mem::replace(&mut self.gen_item, None).expect(
"Unwrapping a leased item should never fail",
);
self.recycle_queue.push(gen_item);
}
}

View File

@@ -47,10 +47,7 @@ impl Searcher {
self.segment_readers
.iter()
.map(|segment_reader| {
segment_reader
.inverted_index(term.field())
.unwrap() // TODO error handling
.doc_freq(term)
segment_reader.inverted_index(term.field()).doc_freq(term)
})
.fold(0u32, |acc, val| acc + val)
}
@@ -70,16 +67,13 @@ impl Searcher {
query.search(self, collector)
}
///
pub fn field(&self, field: Field) -> Result<FieldSearcher> {
/// Return the field searcher associated to a `Field`.
pub fn field(&self, field: Field) -> FieldSearcher {
let inv_index_readers = self.segment_readers
.iter()
.map(|segment_reader| {
segment_reader.inverted_index(field)
})
.collect::<Result<Vec<_>>>()?;
Ok(FieldSearcher::new(inv_index_readers))
.map(|segment_reader| segment_reader.inverted_index(field))
.collect::<Vec<_>>();
FieldSearcher::new(inv_index_readers)
}
}
@@ -92,11 +86,8 @@ pub struct FieldSearcher {
impl FieldSearcher {
fn new(inv_index_readers: Vec<Arc<InvertedIndexReader>>) -> FieldSearcher {
FieldSearcher {
inv_index_readers: inv_index_readers,
}
FieldSearcher { inv_index_readers: inv_index_readers }
}
@@ -105,9 +96,7 @@ impl FieldSearcher {
pub fn terms(&self) -> TermMerger {
let term_streamers: Vec<_> = self.inv_index_readers
.iter()
.map(|inverted_index| {
inverted_index.terms().stream()
})
.map(|inverted_index| inverted_index.terms().stream())
.collect();
TermMerger::new(term_streamers)
}

View File

@@ -76,18 +76,20 @@ impl Segment {
}
/// Open one of the component files for a *regular* read.
pub fn open_read(&self,
component: SegmentComponent)
-> result::Result<ReadOnlySource, OpenReadError> {
pub fn open_read(
&self,
component: SegmentComponent,
) -> result::Result<ReadOnlySource, OpenReadError> {
let path = self.relative_path(component);
let source = try!(self.index.directory().open_read(&path));
Ok(source)
}
/// Open one of the component files for a *regular* write.
pub fn open_write(&mut self,
component: SegmentComponent)
-> result::Result<WritePtr, OpenWriteError> {
pub fn open_write(
&mut self,
component: SegmentComponent,
) -> result::Result<WritePtr, OpenWriteError> {
let path = self.relative_path(component);
let write = try!(self.index.directory_mut().open_write(&path));
Ok(write)
@@ -125,11 +127,11 @@ mod tests {
{
let _file_protection = segment.protect_from_delete(SegmentComponent::POSTINGS);
assert!(directory.exists(&*path));
directory.garbage_collect(|| { living_files.clone() });
directory.garbage_collect(|| living_files.clone());
assert!(directory.exists(&*path));
}
directory.garbage_collect(|| { living_files });
directory.garbage_collect(|| living_files);
assert!(!directory.exists(&*path));
}

View File

@@ -28,13 +28,15 @@ pub enum SegmentComponent {
impl SegmentComponent {
/// Iterates through the components.
pub fn iterator() -> impl Iterator<Item = &'static SegmentComponent> {
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [SegmentComponent::POSTINGS,
SegmentComponent::POSITIONS,
SegmentComponent::FASTFIELDS,
SegmentComponent::FIELDNORMS,
SegmentComponent::TERMS,
SegmentComponent::STORE,
SegmentComponent::DELETE];
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [
SegmentComponent::POSTINGS,
SegmentComponent::POSITIONS,
SegmentComponent::FASTFIELDS,
SegmentComponent::FIELDNORMS,
SegmentComponent::TERMS,
SegmentComponent::STORE,
SegmentComponent::DELETE,
];
SEGMENT_COMPONENTS.into_iter()
}
}

View File

@@ -64,16 +64,14 @@ impl SegmentMeta {
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
let mut path = self.id().uuid_string();
path.push_str(&*match component {
SegmentComponent::POSITIONS => ".pos".to_string(),
SegmentComponent::POSTINGS => ".idx".to_string(),
SegmentComponent::TERMS => ".term".to_string(),
SegmentComponent::STORE => ".store".to_string(),
SegmentComponent::FASTFIELDS => ".fast".to_string(),
SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
SegmentComponent::DELETE => {
format!(".{}.del", self.delete_opstamp().unwrap_or(0))
}
});
SegmentComponent::POSITIONS => ".pos".to_string(),
SegmentComponent::POSTINGS => ".idx".to_string(),
SegmentComponent::TERMS => ".term".to_string(),
SegmentComponent::STORE => ".store".to_string(),
SegmentComponent::FASTFIELDS => ".fast".to_string(),
SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
SegmentComponent::DELETE => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
});
PathBuf::from(path)
}
@@ -111,8 +109,8 @@ impl SegmentMeta {
#[doc(hidden)]
pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) {
self.deletes = Some(DeleteMeta {
num_deleted_docs: num_deleted_docs,
opstamp: opstamp,
});
num_deleted_docs: num_deleted_docs,
opstamp: opstamp,
});
}
}

View File

@@ -4,7 +4,6 @@ use core::SegmentId;
use core::SegmentComponent;
use std::sync::RwLock;
use common::HasLen;
use error::ErrorKind;
use core::SegmentMeta;
use fastfield::{self, FastFieldNotAvailableError};
use fastfield::DeleteBitSet;
@@ -87,17 +86,17 @@ impl SegmentReader {
///
/// # Panics
/// May panic if the index is corrupted.
pub fn get_fast_field_reader<TFastFieldReader: FastFieldReader>
(&self, field: Field) -> fastfield::Result<TFastFieldReader> {
pub fn get_fast_field_reader<TFastFieldReader: FastFieldReader>(
&self,
field: Field,
) -> fastfield::Result<TFastFieldReader> {
let field_entry = self.schema.get_field_entry(field);
if !TFastFieldReader::is_enabled(field_entry.field_type()) {
Err(FastFieldNotAvailableError::new(field_entry))
} else {
self.fast_fields_composite
.open_read(field)
.ok_or_else(|| {
FastFieldNotAvailableError::new(field_entry)
})
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
.map(TFastFieldReader::open)
}
}
@@ -111,9 +110,9 @@ impl SegmentReader {
/// They are simply stored as a fast field, serialized in
/// the `.fieldnorm` file of the segment.
pub fn get_fieldnorms_reader(&self, field: Field) -> Option<U64FastFieldReader> {
self.fieldnorms_composite
.open_read(field)
.map(U64FastFieldReader::open)
self.fieldnorms_composite.open_read(field).map(
U64FastFieldReader::open,
)
}
/// Accessor to the segment's `StoreReader`.
@@ -131,13 +130,12 @@ impl SegmentReader {
let store_reader = StoreReader::from_source(store_source);
let postings_source = segment.open_read(SegmentComponent::POSTINGS)?;
let postings_composite = CompositeFile::open(postings_source)?;
let postings_composite = CompositeFile::open(postings_source)?;
let positions_composite = {
if let Ok(source) = segment.open_read(SegmentComponent::POSITIONS) {
CompositeFile::open(source)?
}
else {
} else {
CompositeFile::empty()
}
};
@@ -159,17 +157,17 @@ impl SegmentReader {
let schema = segment.schema();
Ok(SegmentReader {
inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
segment_meta: segment.meta().clone(),
termdict_composite: termdict_composite,
postings_composite: postings_composite,
fast_fields_composite: fast_fields_composite,
fieldnorms_composite: fieldnorms_composite,
segment_id: segment.id(),
store_reader: store_reader,
delete_bitset: delete_bitset,
positions_composite: positions_composite,
schema: schema,
inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
segment_meta: segment.meta().clone(),
termdict_composite: termdict_composite,
postings_composite: postings_composite,
fast_fields_composite: fast_fields_composite,
fieldnorms_composite: fieldnorms_composite,
segment_id: segment.id(),
store_reader: store_reader,
delete_bitset: delete_bitset,
positions_composite: positions_composite,
schema: schema,
})
}
@@ -179,32 +177,27 @@ impl SegmentReader {
/// The field reader is in charge of iterating through the
/// term dictionary associated to a specific field,
/// and opening the posting list associated to any term.
pub fn inverted_index(&self, field: Field) -> Result<Arc<InvertedIndexReader>> {
if let Some(inv_idx_reader) = self.inv_idx_reader_cache.read()
.expect("Lock poisoned. This should never happen")
.get(&field) {
return Ok(inv_idx_reader.clone());
pub fn inverted_index(&self, field: Field) -> Arc<InvertedIndexReader> {
// Fast path: reuse the reader already cached for this field.
// The early `return` is essential: without it the clone is discarded and
// the reader is rebuilt (and re-inserted into the cache) on every call.
if let Some(inv_idx_reader) = self.inv_idx_reader_cache
    .read()
    .expect("Lock poisoned. This should never happen")
    .get(&field)
{
    return inv_idx_reader.clone();
}
let termdict_source: ReadOnlySource = self.termdict_composite
.open_read(field)
.ok_or_else(|| {
ErrorKind::SchemaError(
format!("Could not find {:?} term dictionary", field)
)
})?;
let termdict_source: ReadOnlySource = self.termdict_composite.open_read(field).expect(
"Index corrupted. Failed to open field term dictionary in composite file.",
);
let postings_source = self.postings_composite
.open_read(field)
.ok_or_else(|| {
ErrorKind::SchemaError(format!("Could not find {:?} postings", field))
})?;
let postings_source = self.postings_composite.open_read(field).expect(
"Index corrupted. Failed to open field postings in composite file.",
);
let positions_source = self.positions_composite
.open_read(field)
.ok_or_else(|| {
ErrorKind::SchemaError(format!("Could not find {:?} positions", field))
})?;
let positions_source = self.positions_composite.open_read(field).expect(
"Index corrupted. Failed to open field positions in composite file.",
);
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
termdict_source,
@@ -212,15 +205,18 @@ impl SegmentReader {
positions_source,
self.delete_bitset.clone(),
self.schema.clone(),
)?);
));
// By releasing the lock in between, we may end up opening the inverted index
// twice, but this is fine.
self.inv_idx_reader_cache
.write()
.expect("Field reader cache lock poisoned. This should never happen.")
.expect(
"Field reader cache lock poisoned. This should never happen.",
)
.insert(field, inv_idx_reader.clone());
Ok(inv_idx_reader)
inv_idx_reader
}
/// Returns the document (or to be accurate, its stored field)

View File

@@ -39,11 +39,11 @@ impl<T: BinarySerializable> LayerBuilder<T> {
doc_id.serialize(&mut self.buffer)?;
value.serialize(&mut self.buffer)?;
Ok(if self.remaining == 0 {
self.remaining = self.period;
Some((doc_id, offset))
} else {
None
})
self.remaining = self.period;
Some((doc_id, offset))
} else {
None
})
}
}
@@ -78,8 +78,10 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
loop {
skip_pointer = match skip_pointer {
Some((skip_doc_id, skip_offset)) => {
try!(self.get_skip_layer(layer_id)
.insert(skip_doc_id, &skip_offset))
try!(self.get_skip_layer(layer_id).insert(
skip_doc_id,
&skip_offset,
))
}
None => {
return Ok(());

View File

@@ -68,9 +68,14 @@ pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
};
let table_num_bits: usize = (1..)
.into_iter()
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
.take_while(|num_bits: &usize| {
compute_table_size(*num_bits) < table_size_limit
})
.last()
.expect(&format!("Per thread memory is too small: {}", per_thread_memory_budget));
.expect(&format!(
"Per thread memory is too small: {}",
per_thread_memory_budget
));
let table_size = compute_table_size(table_num_bits);
let heap_size = per_thread_memory_budget - table_size;
(heap_size, table_num_bits)
@@ -174,13 +179,10 @@ impl<'a> HashMap<'a> {
}
pub fn iter<'b: 'a>(&'b self) -> impl Iterator<Item = (&'a [u8], u32)> + 'b {
self.occupied
.iter()
.cloned()
.map(move |bucket: usize| {
let kv = self.table[bucket];
self.get_key_value(kv.key_value_addr)
})
self.occupied.iter().cloned().map(move |bucket: usize| {
let kv = self.table[bucket];
self.get_key_value(kv.key_value_addr)
})
}
@@ -282,8 +284,10 @@ mod tests {
let s1 = "abcdef";
let s2 = "abcdeg";
for i in 0..5 {
assert_eq!(murmurhash2(&s1[i..5].as_bytes()),
murmurhash2(&s2[i..5].as_bytes()));
assert_eq!(
murmurhash2(&s1[i..5].as_bytes()),
murmurhash2(&s2[i..5].as_bytes())
);
}
}
@@ -303,13 +307,13 @@ mod tests {
let keys: Vec<&'static str> =
vec!["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
b.iter(|| {
keys.iter()
.map(|&s| s.as_bytes())
.map(murmurhash2::murmurhash2)
.map(|h| h as u64)
.last()
.unwrap()
});
keys.iter()
.map(|&s| s.as_bytes())
.map(murmurhash2::murmurhash2)
.map(|h| h as u64)
.last()
.unwrap()
});
}

View File

@@ -144,7 +144,8 @@ impl InnerHeap {
addr
} else {
if self.next_heap.is_none() {
info!(r#"Exceeded heap size. The segment will be committed right after indexing this document."#,);
// A regular string with a `\` continuation keeps the logged message on a
// single line; a raw string split across source lines would embed the line
// break (and the source indentation) in the log output.
info!(
    "Exceeded heap size. The segment will be committed right \
     after indexing this document."
);
self.next_heap = Some(Box::new(InnerHeap::with_capacity(self.buffer_len as usize)));
}
self.next_heap.as_mut().unwrap().allocate_space(num_bytes) + self.buffer_len
@@ -154,10 +155,9 @@ impl InnerHeap {
fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
let start = bytes_ref.0;
if start >= self.buffer_len {
self.next_heap
.as_ref()
.unwrap()
.get_slice(BytesRef(start - self.buffer_len))
self.next_heap.as_ref().unwrap().get_slice(BytesRef(
start - self.buffer_len,
))
} else {
let start = start as usize;
let len = NativeEndian::read_u16(&self.buffer[start..start + 2]) as usize;
@@ -167,10 +167,10 @@ impl InnerHeap {
fn get_mut_slice(&mut self, start: u32, stop: u32) -> &mut [u8] {
if start >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.get_mut_slice(start - self.buffer_len, stop - self.buffer_len)
self.next_heap.as_mut().unwrap().get_mut_slice(
start - self.buffer_len,
stop - self.buffer_len,
)
} else {
&mut self.buffer[start as usize..stop as usize]
}
@@ -188,10 +188,9 @@ impl InnerHeap {
fn get_mut(&mut self, addr: u32) -> *mut u8 {
if addr >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.get_mut(addr - self.buffer_len)
self.next_heap.as_mut().unwrap().get_mut(
addr - self.buffer_len,
)
} else {
let addr_isize = addr as isize;
unsafe { self.buffer.as_mut_ptr().offset(addr_isize) }
@@ -200,10 +199,9 @@ impl InnerHeap {
fn get_mut_ref<Item>(&mut self, addr: u32) -> &mut Item {
if addr >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.get_mut_ref(addr - self.buffer_len)
self.next_heap.as_mut().unwrap().get_mut_ref(
addr - self.buffer_len,
)
} else {
let v_ptr_u8 = self.get_mut(addr) as *mut u8;
let v_ptr = v_ptr_u8 as *mut Item;
@@ -213,10 +211,10 @@ impl InnerHeap {
pub fn set<Item>(&mut self, addr: u32, val: &Item) {
if addr >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.set(addr - self.buffer_len, val);
self.next_heap.as_mut().unwrap().set(
addr - self.buffer_len,
val,
);
} else {
let v_ptr: *const Item = val as *const Item;
let v_ptr_u8: *const u8 = v_ptr as *const u8;

View File

@@ -103,9 +103,11 @@ impl fmt::Display for OpenWriteError {
write!(f, "the file '{:?}' already exists", path)
}
OpenWriteError::IOError(ref err) => {
write!(f,
"an io error occurred while opening a file for writing: '{}'",
err)
write!(
f,
"an io error occurred while opening a file for writing: '{}'",
err
)
}
}
}
@@ -147,9 +149,11 @@ impl fmt::Display for OpenReadError {
write!(f, "the file '{:?}' does not exist", path)
}
OpenReadError::IOError(ref err) => {
write!(f,
"an io error occurred while opening a file for reading: '{}'",
err)
write!(
f,
"an io error occurred while opening a file for reading: '{}'",
err
)
}
}
}

View File

@@ -45,10 +45,9 @@ pub struct FileProtection {
}
fn unprotect_file_from_delete(directory: &ManagedDirectory, path: &Path) {
let mut meta_informations_wlock = directory
.meta_informations
.write()
.expect("Managed file lock poisoned");
let mut meta_informations_wlock = directory.meta_informations.write().expect(
"Managed file lock poisoned",
);
if let Some(counter_ref_mut) = meta_informations_wlock.protected_files.get_mut(path) {
(*counter_ref_mut) -= 1;
}
@@ -68,9 +67,10 @@ impl Drop for FileProtection {
/// Saves the file containing the list of existing files
/// that were created by tantivy.
fn save_managed_paths(directory: &mut Directory,
wlock: &RwLockWriteGuard<MetaInformation>)
-> io::Result<()> {
fn save_managed_paths(
directory: &mut Directory,
wlock: &RwLockWriteGuard<MetaInformation>,
) -> io::Result<()> {
let mut w = serde_json::to_vec(&wlock.managed_paths)?;
write!(&mut w, "\n")?;
directory.atomic_write(&MANAGED_FILEPATH, &w[..])?;
@@ -84,22 +84,22 @@ impl ManagedDirectory {
Ok(data) => {
let managed_files_json = String::from_utf8_lossy(&data);
let managed_files: HashSet<PathBuf> =
serde_json::from_str(&managed_files_json)
.chain_err(|| ErrorKind::CorruptedFile(MANAGED_FILEPATH.clone()))?;
serde_json::from_str(&managed_files_json).chain_err(|| {
ErrorKind::CorruptedFile(MANAGED_FILEPATH.clone())
})?;
Ok(ManagedDirectory {
directory: box directory,
meta_informations: Arc::new(RwLock::new(MetaInformation {
managed_paths: managed_files,
protected_files:
HashMap::default(),
})),
})
directory: box directory,
meta_informations: Arc::new(RwLock::new(MetaInformation {
managed_paths: managed_files,
protected_files: HashMap::default(),
})),
})
}
Err(OpenReadError::FileDoesNotExist(_)) => {
Ok(ManagedDirectory {
directory: box directory,
meta_informations: Arc::default(),
})
directory: box directory,
meta_informations: Arc::default(),
})
}
Err(OpenReadError::IOError(e)) => Err(From::from(e)),
}
@@ -116,15 +116,14 @@ impl ManagedDirectory {
/// If a file cannot be deleted (for permission reasons for instance)
/// an error is simply logged, and the file remains in the list of managed
/// files.
pub fn garbage_collect<L: FnOnce()-> HashSet<PathBuf> >(&mut self, get_living_files: L) {
pub fn garbage_collect<L: FnOnce() -> HashSet<PathBuf>>(&mut self, get_living_files: L) {
info!("Garbage collect");
let mut files_to_delete = vec![];
{
// releasing the lock as .delete() will use it too.
let meta_informations_rlock =
self.meta_informations
.read()
.expect("Managed directory rlock poisoned in garbage collect.");
let meta_informations_rlock = self.meta_informations.read().expect(
"Managed directory rlock poisoned in garbage collect.",
);
// It is crucial to get the living files after acquiring the
// read lock of meta informations. That way, we
@@ -177,9 +176,9 @@ impl ManagedDirectory {
if !deleted_files.is_empty() {
// Update the list of managed files by removing
// the files that were deleted.
let mut meta_informations_wlock = self.meta_informations
.write()
.expect("Managed directory wlock poisoned (2).");
let mut meta_informations_wlock = self.meta_informations.write().expect(
"Managed directory wlock poisoned (2).",
);
{
let managed_paths_write = &mut meta_informations_wlock.managed_paths;
for delete_file in &deleted_files {
@@ -202,13 +201,13 @@ impl ManagedDirectory {
pub fn protect_file_from_delete(&self, path: &Path) -> FileProtection {
let pathbuf = path.to_owned();
{
let mut meta_informations_wlock = self.meta_informations
.write()
.expect("Managed file lock poisoned on protect");
let mut meta_informations_wlock = self.meta_informations.write().expect(
"Managed file lock poisoned on protect",
);
*meta_informations_wlock
.protected_files
.entry(pathbuf.clone())
.or_insert(0) += 1;
.protected_files
.entry(pathbuf.clone())
.or_insert(0) += 1;
}
FileProtection {
directory: self.clone(),
@@ -224,9 +223,9 @@ impl ManagedDirectory {
/// will not lead to garbage files that will
/// never get removed.
fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> {
let mut meta_wlock = self.meta_informations
.write()
.expect("Managed file lock poisoned");
let mut meta_wlock = self.meta_informations.write().expect(
"Managed file lock poisoned",
);
let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned());
if has_changed {
save_managed_paths(self.directory.as_mut(), &meta_wlock)?;
@@ -241,8 +240,9 @@ impl Directory for ManagedDirectory {
}
fn open_write(&mut self, path: &Path) -> result::Result<WritePtr, OpenWriteError> {
self.register_file_as_managed(path)
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
self.register_file_as_managed(path).map_err(|e| {
IOError::with_path(path.to_owned(), e)
})?;
self.directory.open_write(path)
}
@@ -257,9 +257,9 @@ impl Directory for ManagedDirectory {
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
{
let metas_rlock = self.meta_informations
.read()
.expect("poisoned lock in managed directory meta");
let metas_rlock = self.meta_informations.read().expect(
"poisoned lock in managed directory meta",
);
if let Some(counter) = metas_rlock.protected_files.get(path) {
if *counter > 0 {
return Err(DeleteError::FileProtected(path.to_owned()));
@@ -327,7 +327,7 @@ mod tests {
{
let living_files: HashSet<PathBuf> =
[TEST_PATH1.to_owned()].into_iter().cloned().collect();
managed_directory.garbage_collect(|| { living_files });
managed_directory.garbage_collect(|| living_files);
}
{
assert!(managed_directory.exists(*TEST_PATH1));
@@ -343,7 +343,7 @@ mod tests {
}
{
let living_files: HashSet<PathBuf> = HashSet::new();
managed_directory.garbage_collect(|| { living_files });
managed_directory.garbage_collect(|| living_files);
}
{
assert!(!managed_directory.exists(*TEST_PATH1));
@@ -366,7 +366,7 @@ mod tests {
assert!(managed_directory.exists(*TEST_PATH1));
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
managed_directory.garbage_collect(|| { living_files.clone() });
managed_directory.garbage_collect(|| living_files.clone());
if cfg!(target_os = "windows") {
// On Windows, gc should try and fail to delete the file, as it is mmapped.
assert!(managed_directory.exists(*TEST_PATH1));
@@ -374,7 +374,7 @@ mod tests {
drop(_mmap_read);
// The file should still be in the list of managed files and
// eventually be deleted once the mmap is released.
managed_directory.garbage_collect(|| { living_files });
managed_directory.garbage_collect(|| living_files);
assert!(!managed_directory.exists(*TEST_PATH1));
} else {
assert!(!managed_directory.exists(*TEST_PATH1));
@@ -398,11 +398,11 @@ mod tests {
{
let _file_protection = managed_directory.protect_file_from_delete(*TEST_PATH1);
managed_directory.garbage_collect(|| { living_files.clone() });
managed_directory.garbage_collect(|| living_files.clone());
assert!(managed_directory.exists(*TEST_PATH1));
}
managed_directory.garbage_collect(|| { living_files.clone() });
managed_directory.garbage_collect(|| living_files.clone());
assert!(!managed_directory.exists(*TEST_PATH1));

View File

@@ -24,15 +24,17 @@ use std::sync::Weak;
use tempdir::TempDir;
fn open_mmap(full_path: &PathBuf) -> result::Result<Option<Arc<Mmap>>, OpenReadError> {
let file = File::open(&full_path)
.map_err(|e| if e.kind() == io::ErrorKind::NotFound {
OpenReadError::FileDoesNotExist(full_path.clone())
} else {
OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e))
})?;
let file = File::open(&full_path).map_err(|e| if e.kind() ==
io::ErrorKind::NotFound
{
OpenReadError::FileDoesNotExist(full_path.clone())
} else {
OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e))
})?;
let meta_data = file.metadata()
.map_err(|e| IOError::with_path(full_path.to_owned(), e))?;
let meta_data = file.metadata().map_err(|e| {
IOError::with_path(full_path.to_owned(), e)
})?;
if meta_data.len() == 0 {
// if the file size is 0, it will not be possible
// to mmap the file, so we return an anonymous mmap_cache
@@ -46,7 +48,7 @@ fn open_mmap(full_path: &PathBuf) -> result::Result<Option<Arc<Mmap>>, OpenReadE
}
#[derive(Default,Clone,Debug,Serialize,Deserialize)]
#[derive(Default, Clone, Debug, Serialize, Deserialize)]
pub struct CacheCounters {
// Number of times the cache avoided a call to `mmap`
pub hit: usize,
@@ -58,7 +60,7 @@ pub struct CacheCounters {
pub miss_weak: usize,
}
#[derive(Clone,Debug,Serialize,Deserialize)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct CacheInfo {
pub counters: CacheCounters,
pub mmapped: Vec<PathBuf>,
@@ -113,31 +115,31 @@ impl MmapCache {
self.cleanup();
}
Ok(match self.cache.entry(full_path.clone()) {
HashMapEntry::Occupied(mut occupied_entry) => {
if let Some(mmap_arc) = occupied_entry.get().upgrade() {
self.counters.hit += 1;
Some(mmap_arc.clone())
} else {
// The entry exists but the weak ref has been destroyed.
self.counters.miss_weak += 1;
if let Some(mmap_arc) = open_mmap(&full_path)? {
occupied_entry.insert(Arc::downgrade(&mmap_arc));
Some(mmap_arc)
} else {
None
}
}
}
HashMapEntry::Vacant(vacant_entry) => {
self.counters.miss_empty += 1;
if let Some(mmap_arc) = open_mmap(&full_path)? {
vacant_entry.insert(Arc::downgrade(&mmap_arc));
Some(mmap_arc)
} else {
None
}
}
})
HashMapEntry::Occupied(mut occupied_entry) => {
if let Some(mmap_arc) = occupied_entry.get().upgrade() {
self.counters.hit += 1;
Some(mmap_arc.clone())
} else {
// The entry exists but the weak ref has been destroyed.
self.counters.miss_weak += 1;
if let Some(mmap_arc) = open_mmap(&full_path)? {
occupied_entry.insert(Arc::downgrade(&mmap_arc));
Some(mmap_arc)
} else {
None
}
}
}
HashMapEntry::Vacant(vacant_entry) => {
self.counters.miss_empty += 1;
if let Some(mmap_arc) = open_mmap(&full_path)? {
vacant_entry.insert(Arc::downgrade(&mmap_arc));
Some(mmap_arc)
} else {
None
}
}
})
}
}
@@ -180,15 +182,19 @@ impl MmapDirectory {
/// exist or if it is not a directory.
pub fn open(directory_path: &Path) -> Result<MmapDirectory, OpenDirectoryError> {
if !directory_path.exists() {
Err(OpenDirectoryError::DoesNotExist(PathBuf::from(directory_path)))
Err(OpenDirectoryError::DoesNotExist(
PathBuf::from(directory_path),
))
} else if !directory_path.is_dir() {
Err(OpenDirectoryError::NotADirectory(PathBuf::from(directory_path)))
Err(OpenDirectoryError::NotADirectory(
PathBuf::from(directory_path),
))
} else {
Ok(MmapDirectory {
root_path: PathBuf::from(directory_path),
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
_temp_directory: Arc::new(None),
})
root_path: PathBuf::from(directory_path),
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
_temp_directory: Arc::new(None),
})
}
}
@@ -215,9 +221,9 @@ impl MmapDirectory {
use std::os::windows::fs::OpenOptionsExt;
use winapi::winbase;
open_opts
.write(true)
.custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
open_opts.write(true).custom_flags(
winbase::FILE_FLAG_BACKUP_SEMANTICS,
);
}
let fd = try!(open_opts.open(&self.root_path));
@@ -270,46 +276,50 @@ impl Directory for MmapDirectory {
debug!("Open Read {:?}", path);
let full_path = self.resolve_path(path);
let mut mmap_cache = self.mmap_cache
.write()
.map_err(|_| {
let msg = format!("Failed to acquired write lock \
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
let msg = format!(
"Failed to acquired write lock \
on mmap cache while reading {:?}",
path);
IOError::with_path(path.to_owned(), make_io_err(msg))
})?;
path
);
IOError::with_path(path.to_owned(), make_io_err(msg))
})?;
Ok(mmap_cache
.get_mmap(full_path)?
.map(MmapReadOnly::from)
.map(ReadOnlySource::Mmap)
.unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty())))
Ok(
mmap_cache
.get_mmap(full_path)?
.map(MmapReadOnly::from)
.map(ReadOnlySource::Mmap)
.unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty())),
)
}
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
debug!("Open Write {:?}", path);
let full_path = self.resolve_path(path);
let open_res = OpenOptions::new()
.write(true)
.create_new(true)
.open(full_path);
let open_res = OpenOptions::new().write(true).create_new(true).open(
full_path,
);
let mut file = open_res
.map_err(|err| if err.kind() == io::ErrorKind::AlreadyExists {
OpenWriteError::FileAlreadyExists(path.to_owned())
} else {
IOError::with_path(path.to_owned(), err).into()
})?;
let mut file = open_res.map_err(|err| if err.kind() ==
io::ErrorKind::AlreadyExists
{
OpenWriteError::FileAlreadyExists(path.to_owned())
} else {
IOError::with_path(path.to_owned(), err).into()
})?;
// making sure the file is created.
file.flush()
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
file.flush().map_err(
|e| IOError::with_path(path.to_owned(), e),
)?;
// Apparently, on some filesystems, syncing the parent
// directory is required.
self.sync_directory()
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
self.sync_directory().map_err(|e| {
IOError::with_path(path.to_owned(), e)
})?;
let writer = SafeFileWriter::new(file);
Ok(BufWriter::new(Box::new(writer)))
@@ -318,22 +328,23 @@ impl Directory for MmapDirectory {
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
debug!("Deleting file {:?}", path);
let full_path = self.resolve_path(path);
let mut mmap_cache = self.mmap_cache
.write()
.map_err(|_| {
let msg = format!("Failed to acquired write lock \
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
let msg = format!(
"Failed to acquired write lock \
on mmap cache while deleting {:?}",
path);
IOError::with_path(path.to_owned(), make_io_err(msg))
})?;
path
);
IOError::with_path(path.to_owned(), make_io_err(msg))
})?;
// Removing the entry in the MMap cache.
// The munmap will appear on Drop,
// when the last reference is gone.
mmap_cache.cache.remove(&full_path);
match fs::remove_file(&full_path) {
Ok(_) => {
self.sync_directory()
.map_err(|e| IOError::with_path(path.to_owned(), e).into())
self.sync_directory().map_err(|e| {
IOError::with_path(path.to_owned(), e).into()
})
}
Err(e) => {
if e.kind() == io::ErrorKind::NotFound {
@@ -355,8 +366,9 @@ impl Directory for MmapDirectory {
let mut buffer = Vec::new();
match File::open(&full_path) {
Ok(mut file) => {
file.read_to_end(&mut buffer)
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
file.read_to_end(&mut buffer).map_err(|e| {
IOError::with_path(path.to_owned(), e)
})?;
Ok(buffer)
}
Err(e) => {

View File

@@ -41,8 +41,10 @@ impl VecWriter {
impl Drop for VecWriter {
fn drop(&mut self) {
if !self.is_flushed {
panic!("You forgot to flush {:?} before its writter got Drop. Do not rely on drop.",
self.path)
panic!(
"You forgot to flush {:?} before its writter got Drop. Do not rely on drop.",
self.path
)
}
}
}
@@ -62,8 +64,10 @@ impl Write for VecWriter {
fn flush(&mut self) -> io::Result<()> {
self.is_flushed = true;
try!(self.shared_directory
.write(self.path.clone(), self.data.get_ref()));
try!(self.shared_directory.write(
self.path.clone(),
self.data.get_ref(),
));
Ok(())
}
}
@@ -79,11 +83,11 @@ impl InnerDirectory {
}
fn write(&self, path: PathBuf, data: &[u8]) -> io::Result<bool> {
let mut map = try!(self.0
.write()
.map_err(|_| {
make_io_err(format!("Failed to lock the directory, when trying to write {:?}",
path))
let mut map = try!(self.0.write().map_err(|_| {
make_io_err(format!(
"Failed to lock the directory, when trying to write {:?}",
path
))
}));
let prev_value = map.insert(path, Arc::new(Vec::from(data)));
Ok(prev_value.is_some())
@@ -93,17 +97,21 @@ impl InnerDirectory {
self.0
.read()
.map_err(|_| {
let msg = format!("Failed to acquire read lock for the \
let msg = format!(
"Failed to acquire read lock for the \
directory when trying to read {:?}",
path);
let io_err = make_io_err(msg);
OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err))
})
path
);
let io_err = make_io_err(msg);
OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err))
})
.and_then(|readable_map| {
readable_map
.get(path)
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
.map(|data| ReadOnlySource::Anonymous(SharedVecSlice::new(data.clone())))
.map(|data| {
ReadOnlySource::Anonymous(SharedVecSlice::new(data.clone()))
})
})
}
@@ -111,16 +119,18 @@ impl InnerDirectory {
self.0
.write()
.map_err(|_| {
let msg = format!("Failed to acquire write lock for the \
let msg = format!(
"Failed to acquire write lock for the \
directory when trying to delete {:?}",
path);
let io_err = make_io_err(msg);
DeleteError::IOError(IOError::with_path(path.to_owned(), io_err))
})
path
);
let io_err = make_io_err(msg);
DeleteError::IOError(IOError::with_path(path.to_owned(), io_err))
})
.and_then(|mut writable_map| match writable_map.remove(path) {
Some(_) => Ok(()),
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
})
Some(_) => Ok(()),
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
})
}
fn exists(&self, path: &Path) -> bool {
@@ -164,9 +174,11 @@ impl Directory for RAMDirectory {
let path_buf = PathBuf::from(path);
let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
let exists = self.fs
.write(path_buf.clone(), &Vec::new())
.map_err(|err| IOError::with_path(path.to_owned(), err))?;
let exists = self.fs.write(path_buf.clone(), &Vec::new()).map_err(
|err| {
IOError::with_path(path.to_owned(), err)
},
)?;
// force the creation of the file to mimic the MMap directory.
if exists {

View File

@@ -114,7 +114,7 @@ impl From<Vec<u8>> for ReadOnlySource {
/// Acts as an owning cursor over the data backed by a `ReadOnlySource`.
pub(crate) struct SourceRead {
_data_owner: ReadOnlySource,
cursor: &'static [u8]
cursor: &'static [u8],
}
impl SourceRead {
@@ -131,7 +131,6 @@ impl AsRef<[u8]> for SourceRead {
}
impl From<ReadOnlySource> for SourceRead {
// Creates a new `SourceRead` from a given `ReadOnlySource`
fn from(source: ReadOnlySource) -> SourceRead {
let len = source.len();

View File

@@ -112,12 +112,9 @@ impl From<schema::DocParsingError> for Error {
impl From<OpenWriteError> for Error {
fn from(error: OpenWriteError) -> Error {
match error {
OpenWriteError::FileAlreadyExists(filepath) => {
ErrorKind::FileAlreadyExists(filepath)
}
OpenWriteError::IOError(io_error) => ErrorKind::IOError(io_error),
}
.into()
OpenWriteError::FileAlreadyExists(filepath) => ErrorKind::FileAlreadyExists(filepath),
OpenWriteError::IOError(io_error) => ErrorKind::IOError(io_error),
}.into()
}
}

View File

@@ -100,8 +100,7 @@ mod tests {
{
let composite_file = CompositeFile::open(source).unwrap();
let field_source = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader: U64FastFieldReader =
U64FastFieldReader::open(field_source);
let fast_field_reader: U64FastFieldReader = U64FastFieldReader::open(field_source);
assert_eq!(fast_field_reader.get(0), 13u64);
assert_eq!(fast_field_reader.get(1), 14u64);
assert_eq!(fast_field_reader.get(2), 2u64);
@@ -190,9 +189,11 @@ mod tests {
// forcing the amplitude to be high
add_single_field_doc(&mut fast_field_writers, *FIELD, 0u64);
for i in 0u64..10_000u64 {
add_single_field_doc(&mut fast_field_writers,
*FIELD,
5_000_000_000_000_000_000u64 + i);
add_single_field_doc(
&mut fast_field_writers,
*FIELD,
5_000_000_000_000_000_000u64 + i,
);
}
fast_field_writers.serialize(&mut serializer).unwrap();
serializer.close().unwrap();
@@ -208,8 +209,10 @@ mod tests {
assert_eq!(fast_field_reader.get(0), 0u64);
for doc in 1..10_001 {
assert_eq!(fast_field_reader.get(doc),
5_000_000_000_000_000_000u64 + doc as u64 - 1u64);
assert_eq!(
fast_field_reader.get(doc),
5_000_000_000_000_000_000u64 + doc as u64 - 1u64
);
}
}
}
@@ -339,13 +342,13 @@ mod tests {
fn bench_intfastfield_veclookup(b: &mut Bencher) {
let permutation = generate_permutation();
b.iter(|| {
let n = test::black_box(1000u32);
let mut a = 0u64;
for _ in 0u32..n {
a = permutation[a as usize];
}
a
});
let n = test::black_box(1000u32);
let mut a = 0u64;
for _ in 0u32..n {
a = permutation[a as usize];
}
a
});
}
#[bench]
@@ -403,13 +406,13 @@ mod tests {
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
b.iter(|| {
let n = test::black_box(1000u32);
let mut a = 0u32;
for _ in 0u32..n {
a = fast_field_reader.get(a) as u32;
}
a
});
let n = test::black_box(1000u32);
let mut a = 0u32;
for _ in 0u32..n {
a = fast_field_reader.get(a) as u32;
}
a
});
}
}
}

View File

@@ -2,7 +2,7 @@ use directory::ReadOnlySource;
use common::{self, BinarySerializable};
use common::bitpacker::{compute_num_bits, BitUnpacker};
use DocId;
use schema::{SchemaBuilder};
use schema::SchemaBuilder;
use std::path::Path;
use schema::FAST;
use directory::{WritePtr, RAMDirectory, Directory};
@@ -106,10 +106,10 @@ impl FastFieldReader for U64FastFieldReader {
let amplitude: u64;
{
let mut cursor = data.as_slice();
min_value = u64::deserialize(&mut cursor)
.expect("Failed to read the min_value of fast field.");
amplitude = u64::deserialize(&mut cursor)
.expect("Failed to read the amplitude of fast field.");
min_value =
u64::deserialize(&mut cursor).expect("Failed to read the min_value of fast field.");
amplitude =
u64::deserialize(&mut cursor).expect("Failed to read the amplitude of fast field.");
}
let max_value = min_value + amplitude;
@@ -130,15 +130,14 @@ impl From<Vec<u64>> for U64FastFieldReader {
let mut schema_builder = SchemaBuilder::default();
let field = schema_builder.add_u64_field("field", FAST);
let schema = schema_builder.build();
let path = Path::new("test");
let path = Path::new("__dummy__");
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let write: WritePtr = directory.open_write(path).expect("With a RAMDirectory, this should never fail.");
let mut serializer = FastFieldSerializer::from_write(write).expect("With a RAMDirectory, this should never fail.");
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
// TODO Error not unwrap
{
let fast_field_writer = fast_field_writers.get_field_writer(field).unwrap();
let fast_field_writer = fast_field_writers.get_field_writer(field).expect("With a RAMDirectory, this should never fail.");
for val in vals {
fast_field_writer.add_val(val);
}
@@ -147,13 +146,12 @@ impl From<Vec<u64>> for U64FastFieldReader {
serializer.close().unwrap();
}
let source = directory
.open_read(path)
.expect("Failed to open the file");
let composite_file = CompositeFile::open(source)
.expect("Failed to read the composite file");
let source = directory.open_read(path).expect("Failed to open the file");
let composite_file =
CompositeFile::open(source).expect("Failed to read the composite file");
let field_source = composite_file.open_read(field)
let field_source = composite_file
.open_read(field)
.expect("File component not found");
U64FastFieldReader::open(field_source)
}

View File

@@ -31,30 +31,22 @@ pub struct FastFieldSerializer {
}
impl FastFieldSerializer {
/// Constructor
pub fn from_write(write: WritePtr) -> io::Result<FastFieldSerializer> {
// just making room for the pointer to header.
let composite_write = CompositeWrite::wrap(write);
Ok(FastFieldSerializer {
composite_write: composite_write,
})
Ok(FastFieldSerializer { composite_write: composite_write })
}
/// Start serializing a new u64 fast field
pub fn new_u64_fast_field(&mut self,
field: Field,
min_value: u64,
max_value: u64)
-> io::Result<FastSingleFieldSerializer<CountingWriter<WritePtr>>> {
let field_write = self
.composite_write
.for_field(field);
FastSingleFieldSerializer::open(
field_write,
min_value,
max_value)
pub fn new_u64_fast_field(
&mut self,
field: Field,
min_value: u64,
max_value: u64,
) -> io::Result<FastSingleFieldSerializer<CountingWriter<WritePtr>>> {
let field_write = self.composite_write.for_field(field);
FastSingleFieldSerializer::open(field_write, min_value, max_value)
}
@@ -73,10 +65,11 @@ pub struct FastSingleFieldSerializer<'a, W: Write + 'a> {
}
impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
fn open(write: &'a mut W,
min_value: u64,
max_value: u64) -> io::Result<FastSingleFieldSerializer<'a, W>> {
fn open(
write: &'a mut W,
min_value: u64,
max_value: u64,
) -> io::Result<FastSingleFieldSerializer<'a, W>> {
min_value.serialize(write)?;
let amplitude = max_value - min_value;
amplitude.serialize(write)?;

View File

@@ -58,9 +58,9 @@ impl FastFieldsWriter {
/// Get the `FastFieldWriter` associated to a field.
pub fn get_field_writer(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
// TODO optimize
self.field_writers
.iter_mut()
.find(|field_writer| field_writer.field == field)
self.field_writers.iter_mut().find(|field_writer| {
field_writer.field == field
})
}
@@ -155,9 +155,9 @@ impl IntFastFieldWriter {
/// associated to the document with the `DocId` n.
/// (Well, `n-1` actually because of 0-indexing)
pub fn add_val(&mut self, val: u64) {
VInt(val)
.serialize(&mut self.vals)
.expect("unable to serialize VInt to Vec");
VInt(val).serialize(&mut self.vals).expect(
"unable to serialize VInt to Vec",
);
if val > self.val_max {
self.val_max = val;

View File

@@ -40,9 +40,9 @@ impl DeleteQueue {
{
let mut delete_queue_wlock = delete_queue.inner.write().unwrap();
delete_queue_wlock.last_block = Some(Arc::new(Block {
operations: Arc::default(),
next: next_block,
}));
operations: Arc::default(),
next: next_block,
}));
}
delete_queue
@@ -59,9 +59,11 @@ impl DeleteQueue {
.expect("Read lock poisoned when opening delete queue cursor")
.last_block
.clone()
.expect("Failed to unwrap last_block. This should never happen
.expect(
"Failed to unwrap last_block. This should never happen
as the Option<> is only here to make
initialization possible");
initialization possible",
);
let operations_len = last_block.operations.len();
DeleteCursor {
block: last_block,
@@ -92,9 +94,9 @@ impl DeleteQueue {
// be some unflushed operations.
//
fn flush(&self) -> Option<Arc<Block>> {
let mut self_wlock = self.inner
.write()
.expect("Failed to acquire write lock on delete queue writer");
let mut self_wlock = self.inner.write().expect(
"Failed to acquire write lock on delete queue writer",
);
let delete_operations;
{
@@ -108,9 +110,9 @@ impl DeleteQueue {
let next_block = NextBlock::from(self.clone());
{
self_wlock.last_block = Some(Arc::new(Block {
operations: Arc::new(delete_operations),
next: next_block,
}));
operations: Arc::new(delete_operations),
next: next_block,
}));
}
self_wlock.last_block.clone()
}
@@ -132,18 +134,18 @@ impl From<DeleteQueue> for NextBlock {
impl NextBlock {
fn next_block(&self) -> Option<Arc<Block>> {
{
let next_read_lock = self.0
.read()
.expect("Failed to acquire write lock in delete queue");
let next_read_lock = self.0.read().expect(
"Failed to acquire write lock in delete queue",
);
if let InnerNextBlock::Closed(ref block) = *next_read_lock {
return Some(block.clone());
}
}
let next_block;
{
let mut next_write_lock = self.0
.write()
.expect("Failed to acquire write lock in delete queue");
let mut next_write_lock = self.0.write().expect(
"Failed to acquire write lock in delete queue",
);
match *next_write_lock {
InnerNextBlock::Closed(ref block) => {
return Some(block.clone());

View File

@@ -56,8 +56,10 @@ mod tests {
#[test]
fn test_doc_to_opstamp_mapping_none() {
let doc_to_opstamp_mapping = DocToOpstampMapping::None;
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(1),
u32::max_value());
assert_eq!(
doc_to_opstamp_mapping.compute_doc_limit(1),
u32::max_value()
);
}
#[test]

View File

@@ -102,14 +102,17 @@ impl !Sync for IndexWriter {}
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// # Panics
/// If the heap size per thread is too small, panics.
pub fn open_index_writer(index: &Index,
num_threads: usize,
heap_size_in_bytes_per_thread: usize)
-> Result<IndexWriter> {
pub fn open_index_writer(
index: &Index,
num_threads: usize,
heap_size_in_bytes_per_thread: usize,
) -> Result<IndexWriter> {
if heap_size_in_bytes_per_thread <= HEAP_SIZE_LIMIT as usize {
panic!(format!("The heap size per thread needs to be at least {}.",
HEAP_SIZE_LIMIT));
panic!(format!(
"The heap size per thread needs to be at least {}.",
HEAP_SIZE_LIMIT
));
}
let directory_lock = DirectoryLock::lock(index.directory().box_clone())?;
@@ -156,12 +159,13 @@ pub fn open_index_writer(index: &Index,
pub fn compute_deleted_bitset(delete_bitset: &mut BitSet,
segment_reader: &SegmentReader,
delete_cursor: &mut DeleteCursor,
doc_opstamps: &DocToOpstampMapping,
target_opstamp: u64)
-> Result<bool> {
pub fn compute_deleted_bitset(
delete_bitset: &mut BitSet,
segment_reader: &SegmentReader,
delete_cursor: &mut DeleteCursor,
doc_opstamps: &DocToOpstampMapping,
target_opstamp: u64,
) -> Result<bool> {
let mut might_have_changed = false;
@@ -177,9 +181,12 @@ pub fn compute_deleted_bitset(delete_bitset: &mut BitSet,
// Limit doc helps identify the first document
// that may be affected by the delete operation.
let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp);
let inverted_index = segment_reader.inverted_index(delete_op.term.field())?;
if let Some(mut docset) =
inverted_index.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) {
let inverted_index = segment_reader.inverted_index(delete_op.term.field());
if let Some(mut docset) = inverted_index.read_postings(
&delete_op.term,
SegmentPostingsOption::NoFreq,
)
{
while docset.advance() {
let deleted_doc = docset.doc();
if deleted_doc < limit_doc {
@@ -199,10 +206,11 @@ pub fn compute_deleted_bitset(delete_bitset: &mut BitSet,
/// Advance delete for the given segment up
/// to the target opstamp.
pub fn advance_deletes(mut segment: Segment,
segment_entry: &mut SegmentEntry,
target_opstamp: u64)
-> Result<Option<FileProtection>> {
pub fn advance_deletes(
mut segment: Segment,
segment_entry: &mut SegmentEntry,
target_opstamp: u64,
) -> Result<Option<FileProtection>> {
let mut file_protect: Option<FileProtection> = None;
@@ -223,11 +231,13 @@ pub fn advance_deletes(mut segment: Segment,
let delete_cursor = segment_entry.delete_cursor();
compute_deleted_bitset(&mut delete_bitset,
&segment_reader,
delete_cursor,
&DocToOpstampMapping::None,
target_opstamp)?;
compute_deleted_bitset(
&mut delete_bitset,
&segment_reader,
delete_cursor,
&DocToOpstampMapping::None,
target_opstamp,
)?;
for doc in 0u32..max_doc {
if segment_reader.is_deleted(doc) {
@@ -248,15 +258,16 @@ pub fn advance_deletes(mut segment: Segment,
Ok(file_protect)
}
fn index_documents(heap: &mut Heap,
table_size: usize,
segment: Segment,
schema: &Schema,
generation: usize,
document_iterator: &mut Iterator<Item = AddOperation>,
segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor)
-> Result<bool> {
fn index_documents(
heap: &mut Heap,
table_size: usize,
segment: Segment,
schema: &Schema,
generation: usize,
document_iterator: &mut Iterator<Item = AddOperation>,
segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor,
) -> Result<bool> {
heap.clear();
let segment_id = segment.id();
let mut segment_writer = SegmentWriter::for_segment(heap, table_size, segment.clone(), schema)?;
@@ -266,8 +277,10 @@ fn index_documents(heap: &mut Heap,
// One is that the memory arena dedicated to the segment is
// getting full.
if segment_writer.is_buffer_full() {
info!("Buffer limit reached, flushing segment with maxdoc={}.",
segment_writer.max_doc());
info!(
"Buffer limit reached, flushing segment with maxdoc={}.",
segment_writer.max_doc()
);
break;
}
// The second is the term dictionary hash table
@@ -276,8 +289,10 @@ fn index_documents(heap: &mut Heap,
// Tantivy does not resize its hashtable. When it reaches
// capacity, we just stop indexing new documents.
if segment_writer.is_term_saturated() {
info!("Term dic saturated, flushing segment with maxdoc={}.",
segment_writer.max_doc());
info!(
"Term dic saturated, flushing segment with maxdoc={}.",
segment_writer.max_doc()
);
break;
}
}
@@ -297,11 +312,13 @@ fn index_documents(heap: &mut Heap,
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
let segment_reader = SegmentReader::open(segment)?;
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
let may_have_deletes = compute_deleted_bitset(&mut deleted_bitset,
&segment_reader,
&mut delete_cursor,
&doc_to_opstamps,
last_docstamp)?;
let may_have_deletes = compute_deleted_bitset(
&mut deleted_bitset,
&segment_reader,
&mut delete_cursor,
&doc_to_opstamps,
last_docstamp,
)?;
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, {
if may_have_deletes {
@@ -328,14 +345,15 @@ impl IndexWriter {
join_handle
.join()
.expect("Indexing Worker thread panicked")
.chain_err(|| ErrorKind::ErrorInThread("Error in indexing worker thread.".into()))?;
.chain_err(|| {
ErrorKind::ErrorInThread("Error in indexing worker thread.".into())
})?;
}
drop(self.workers_join_handle);
let result =
self.segment_updater
.wait_merging_thread()
.chain_err(|| ErrorKind::ErrorInThread("Failed to join merging thread.".into()));
let result = self.segment_updater.wait_merging_thread().chain_err(|| {
ErrorKind::ErrorInThread("Failed to join merging thread.".into())
});
if let Err(ref e) = result {
error!("Some merging thread failed {:?}", e);
@@ -348,8 +366,10 @@ impl IndexWriter {
pub fn add_segment(&mut self, segment_meta: SegmentMeta) {
let delete_cursor = self.delete_queue.cursor();
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None);
self.segment_updater
.add_segment(self.generation, segment_entry);
self.segment_updater.add_segment(
self.generation,
segment_entry,
);
}
#[doc(hidden)]
@@ -373,7 +393,11 @@ impl IndexWriter {
let mut delete_cursor = self.delete_queue.cursor();
let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
.name(format!("indexing thread {} for gen {}", self.worker_id, generation))
.name(format!(
"indexing thread {} for gen {}",
self.worker_id,
generation
))
.spawn(move || {
loop {
@@ -397,14 +421,16 @@ impl IndexWriter {
return Ok(());
}
let segment = segment_updater.new_segment();
index_documents(&mut heap,
table_size,
segment,
&schema,
generation,
&mut document_iterator,
&mut segment_updater,
delete_cursor.clone())?;
index_documents(
&mut heap,
table_size,
segment,
&schema,
generation,
&mut document_iterator,
&mut segment_updater,
delete_cursor.clone(),
)?;
}
})?;
@@ -437,9 +463,10 @@ impl IndexWriter {
}
/// Merges a given list of segments
pub fn merge(&mut self,
segment_ids: &[SegmentId])
-> impl Future<Item = SegmentMeta, Error = Canceled> {
pub fn merge(
&mut self,
segment_ids: &[SegmentId],
) -> impl Future<Item = SegmentMeta, Error = Canceled> {
self.segment_updater.start_merge(segment_ids)
}
@@ -523,14 +550,15 @@ impl IndexWriter {
self.recreate_document_channel();
let mut former_workers_join_handle = Vec::new();
swap(&mut former_workers_join_handle,
&mut self.workers_join_handle);
swap(
&mut former_workers_join_handle,
&mut self.workers_join_handle,
);
for worker_handle in former_workers_join_handle {
let indexing_worker_result =
worker_handle
.join()
.map_err(|e| Error::from_kind(ErrorKind::ErrorInThread(format!("{:?}", e))))?;
let indexing_worker_result = worker_handle.join().map_err(|e| {
Error::from_kind(ErrorKind::ErrorInThread(format!("{:?}", e)))
})?;
indexing_worker_result?;
// add a new worker for the next generation.
@@ -624,13 +652,17 @@ mod tests {
let schema_builder = schema::SchemaBuilder::default();
let index = Index::create_in_ram(schema_builder.build());
let index_writer = index.writer(40_000_000).unwrap();
assert_eq!(format!("{:?}", index_writer.get_merge_policy()),
"LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \
level_log_size: 0.75 }");
assert_eq!(
format!("{:?}", index_writer.get_merge_policy()),
"LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \
level_log_size: 0.75 }"
);
let merge_policy = box NoMergePolicy::default();
index_writer.set_merge_policy(merge_policy);
assert_eq!(format!("{:?}", index_writer.get_merge_policy()),
"NoMergePolicy");
assert_eq!(
format!("{:?}", index_writer.get_merge_policy()),
"NoMergePolicy"
);
}
#[test]
@@ -720,9 +752,9 @@ mod tests {
}
// this should create 8 segments and trigger a merge.
index_writer.commit().expect("commit failed");
index_writer
.wait_merging_threads()
.expect("waiting merging thread failed");
index_writer.wait_merging_threads().expect(
"waiting merging thread failed",
);
index.load_searchers().unwrap();
assert_eq!(num_docs_containing("a"), 200);

View File

@@ -62,7 +62,9 @@ impl MergePolicy for LogMergePolicy {
let size_sorted_log_tuples: Vec<_> = size_sorted_tuples
.into_iter()
.map(|(ind, num_docs)| (ind, (self.clip_min_size(num_docs) as f64).log2()))
.map(|(ind, num_docs)| {
(ind, (self.clip_min_size(num_docs) as f64).log2())
})
.collect();
let (first_ind, first_score) = size_sorted_log_tuples[0];
@@ -79,7 +81,9 @@ impl MergePolicy for LogMergePolicy {
levels
.iter()
.filter(|level| level.len() >= self.min_merge_size)
.map(|ind_vec| MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect()))
.map(|ind_vec| {
MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect())
})
.collect()
}
@@ -134,12 +138,14 @@ mod tests {
#[test]
fn test_log_merge_policy_levels() {
// multiple levels all get merged correctly
let test_input = vec![seg_meta(10),
seg_meta(10),
seg_meta(10),
seg_meta(1000),
seg_meta(1000),
seg_meta(1000)];
let test_input = vec![
seg_meta(10),
seg_meta(10),
seg_meta(10),
seg_meta(1000),
seg_meta(1000),
seg_meta(1000),
];
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 2);
}
@@ -147,24 +153,28 @@ mod tests {
#[test]
fn test_log_merge_policy_within_levels() {
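// segments of slightly different sizes (10, 11, 12) are still grouped together,
// so two merge candidates are expected (the small group and the 1000-doc group)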
let test_input = vec![seg_meta(10),
seg_meta(11),
seg_meta(12),
seg_meta(1000),
seg_meta(1000),
seg_meta(1000)];
let test_input = vec![
seg_meta(10),
seg_meta(11),
seg_meta(12),
seg_meta(1000),
seg_meta(1000),
seg_meta(1000),
];
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 2);
}
#[test]
fn test_log_merge_policy_small_segments() {
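// very small segments of differing sizes (1 and 2 docs) all end up in a single
// merge candidate (presumably because sizes are clipped by `clip_min_size` — confirm)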
let test_input = vec![seg_meta(1),
seg_meta(1),
seg_meta(1),
seg_meta(2),
seg_meta(2),
seg_meta(2)];
let test_input = vec![
seg_meta(1),
seg_meta(1),
seg_meta(1),
seg_meta(2),
seg_meta(2),
seg_meta(2),
];
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 1);
}

View File

@@ -28,10 +28,11 @@ pub struct IndexMerger {
}
fn compute_min_max_val(u64_reader: &U64FastFieldReader,
max_doc: DocId,
delete_bitset: &DeleteBitSet)
-> Option<(u64, u64)> {
fn compute_min_max_val(
u64_reader: &U64FastFieldReader,
max_doc: DocId,
delete_bitset: &DeleteBitSet,
) -> Option<(u64, u64)> {
if max_doc == 0 {
None
} else if !delete_bitset.has_deletes() {
@@ -49,17 +50,18 @@ fn compute_min_max_val(u64_reader: &U64FastFieldReader,
}
}
fn extract_fieldnorm_reader(segment_reader: &SegmentReader,
field: Field)
-> Option<U64FastFieldReader> {
fn extract_fieldnorm_reader(
segment_reader: &SegmentReader,
field: Field,
) -> Option<U64FastFieldReader> {
segment_reader.get_fieldnorms_reader(field)
}
fn extract_fast_field_reader(segment_reader: &SegmentReader,
field: Field)
-> Option<U64FastFieldReader> {
segment_reader.get_fast_field_reader(field)
.ok()
fn extract_fast_field_reader(
segment_reader: &SegmentReader,
field: Field,
) -> Option<U64FastFieldReader> {
segment_reader.get_fast_field_reader(field).ok()
}
@@ -100,10 +102,10 @@ impl IndexMerger {
}
}
Ok(IndexMerger {
schema: schema,
readers: readers,
max_doc: max_doc,
})
schema: schema,
readers: readers,
max_doc: max_doc,
})
}
fn write_fieldnorms(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
@@ -114,9 +116,11 @@ impl IndexMerger {
.filter(|&(_, field_entry)| field_entry.is_indexed())
.map(|(field_id, _)| Field(field_id as u32))
.collect();
self.generic_write_fast_field(fieldnorm_fastfields,
&extract_fieldnorm_reader,
fast_field_serializer)
self.generic_write_fast_field(
fieldnorm_fastfields,
&extract_fieldnorm_reader,
fast_field_serializer,
)
}
fn write_fast_fields(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
@@ -127,19 +131,21 @@ impl IndexMerger {
.filter(|&(_, field_entry)| field_entry.is_int_fast())
.map(|(field_id, _)| Field(field_id as u32))
.collect();
self.generic_write_fast_field(fast_fields,
&extract_fast_field_reader,
fast_field_serializer)
self.generic_write_fast_field(
fast_fields,
&extract_fast_field_reader,
fast_field_serializer,
)
}
// used both to merge field norms and regular u64 fast fields.
fn generic_write_fast_field(&self,
fields: Vec<Field>,
field_reader_extractor: &Fn(&SegmentReader, Field)
-> Option<U64FastFieldReader>,
fast_field_serializer: &mut FastFieldSerializer)
-> Result<()> {
fn generic_write_fast_field(
&self,
fields: Vec<Field>,
field_reader_extractor: &Fn(&SegmentReader, Field) -> Option<U64FastFieldReader>,
fast_field_serializer: &mut FastFieldSerializer,
) -> Result<()> {
for field in fields {
@@ -151,19 +157,25 @@ impl IndexMerger {
match field_reader_extractor(reader, field) {
Some(u64_reader) => {
if let Some((seg_min_val, seg_max_val)) =
compute_min_max_val(&u64_reader,
reader.max_doc(),
reader.delete_bitset()) {
compute_min_max_val(
&u64_reader,
reader.max_doc(),
reader.delete_bitset(),
)
{
// the segment has some non-deleted documents
min_val = min(min_val, seg_min_val);
max_val = max(max_val, seg_max_val);
u64_readers
.push((reader.max_doc(), u64_reader, reader.delete_bitset()));
u64_readers.push((
reader.max_doc(),
u64_reader,
reader.delete_bitset(),
));
}
}
None => {
let error_msg = format!("Failed to find a u64_reader for field {:?}",
field);
let error_msg =
format!("Failed to find a u64_reader for field {:?}", field);
error!("{}", error_msg);
bail!(ErrorKind::SchemaError(error_msg));
}
@@ -179,8 +191,11 @@ impl IndexMerger {
assert!(min_val <= max_val);
let mut fast_single_field_serializer = fast_field_serializer
.new_u64_fast_field(field, min_val, max_val)?;
let mut fast_single_field_serializer = fast_field_serializer.new_u64_fast_field(
field,
min_val,
max_val,
)?;
for (max_doc, u64_reader, delete_bitset) in u64_readers {
for doc_id in 0..max_doc {
if !delete_bitset.is_deleted(doc_id) {
@@ -199,9 +214,8 @@ impl IndexMerger {
let mut delta_computer = DeltaComputer::new();
let mut indexed_fields = vec!();
let mut indexed_fields = vec![];
for (field_ord, field_entry) in self.schema.fields().iter().enumerate() {
// if field_entry
if field_entry.is_indexed() {
indexed_fields.push(Field(field_ord as u32));
}
@@ -211,9 +225,8 @@ impl IndexMerger {
let field_readers = self.readers
.iter()
.map(|reader|
reader.inverted_index(indexed_field))
.collect::<Result<Vec<_>>>()?;
.map(|reader| reader.inverted_index(indexed_field))
.collect::<Vec<_>>();
let field_term_streams = field_readers
.iter()
@@ -224,7 +237,8 @@ impl IndexMerger {
let mut max_doc = 0;
// map from segment doc ids to the resulting merged segment doc id.
let mut merged_doc_id_map: Vec<Vec<Option<DocId>>> = Vec::with_capacity(self.readers.len());
let mut merged_doc_id_map: Vec<Vec<Option<DocId>>> =
Vec::with_capacity(self.readers.len());
for reader in &self.readers {
let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize);
@@ -258,8 +272,10 @@ impl IndexMerger {
let segment_postings_option = field_entry
.field_type()
.get_segment_postings_option()
.expect("Encountered a field that is not supposed to be
indexed. Have you modified the schema?");
.expect(
"Encountered a field that is not supposed to be
indexed. Have you modified the schema?",
);
while merged_terms.advance() {
@@ -273,9 +289,11 @@ impl IndexMerger {
let segment_ord = heap_item.segment_ord;
let term_info = heap_item.streamer.value();
let segment_reader = &self.readers[heap_item.segment_ord];
let inverted_index = segment_reader.inverted_index(term.field()).unwrap(); // TODO fix unwrap
let mut segment_postings = inverted_index
.read_postings_from_terminfo(term_info, segment_postings_option);
let inverted_index = segment_reader.inverted_index(term.field());
let mut segment_postings = inverted_index.read_postings_from_terminfo(
term_info,
segment_postings_option,
);
if segment_postings.advance() {
Some((segment_ord, segment_postings))
} else {
@@ -304,14 +322,18 @@ impl IndexMerger {
// `.advance()` has been called once before the loop.
// Hence we cannot use a `while segment_postings.advance()` loop.
if let Some(remapped_doc_id) =
old_to_new_doc_id[segment_postings.doc() as usize] {
old_to_new_doc_id[segment_postings.doc() as usize]
{
// we make sure to only write the term iff
// there is at least one document.
let positions: &[u32] = segment_postings.positions();
let term_freq = segment_postings.term_freq();
let delta_positions = delta_computer.compute_delta(positions);
field_serializer
.write_doc(remapped_doc_id, term_freq, delta_positions)?;
field_serializer.write_doc(
remapped_doc_id,
term_freq,
delta_positions,
)?;
}
if !segment_postings.advance() {
break;
@@ -349,8 +371,12 @@ impl IndexMerger {
impl SerializableSegment for IndexMerger {
fn write(&self, mut serializer: SegmentSerializer) -> Result<u32> {
self.write_postings(serializer.get_postings_serializer())?;
self.write_fieldnorms(serializer.get_fieldnorms_serializer())?;
self.write_fast_fields(serializer.get_fast_field_serializer())?;
self.write_fieldnorms(
serializer.get_fieldnorms_serializer(),
)?;
self.write_fast_fields(
serializer.get_fast_field_serializer(),
)?;
self.write_storable_fields(serializer.get_store_writer())?;
serializer.close()?;
Ok(self.max_doc)
@@ -429,14 +455,13 @@ mod tests {
}
}
{
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let segment_ids = index.searchable_segment_ids().expect(
"Searchable segments failed.",
);
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer
.merge(&segment_ids)
.wait()
.expect("Merging failed");
index_writer.merge(&segment_ids).wait().expect(
"Merging failed",
);
index_writer.wait_merging_threads().unwrap();
}
{
@@ -449,14 +474,22 @@ mod tests {
collector.docs()
};
{
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
vec![1, 2, 4]);
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
vec![0, 3]);
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "g")]),
vec![4]);
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
vec![0, 1, 2, 3, 4]);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
vec![1, 2, 4]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
vec![0, 3]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "g")]),
vec![4]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
vec![0, 1, 2, 3, 4]
);
}
{
let doc = searcher.doc(&DocAddress(0, 0)).unwrap();
@@ -485,8 +518,10 @@ mod tests {
assert!(searcher.search(&query, &mut collector).is_ok());
collector.vals()
};
assert_eq!(get_fast_vals(vec![Term::from_field_text(text_field, "a")]),
vec![5, 7, 13]);
assert_eq!(
get_fast_vals(vec![Term::from_field_text(text_field, "a")]),
vec![5, 7, 13]
);
}
}
}
@@ -533,14 +568,22 @@ mod tests {
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
vec![1]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
vec![1]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
vec![3]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
vec![1, 3]);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "a")),
vec![1]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "b")),
vec![1]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "c")),
vec![3]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "d")),
vec![1, 3]
);
}
{
// a second commit
@@ -572,20 +615,34 @@ mod tests {
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
assert_eq!(searcher.segment_readers()[1].num_docs(), 2);
assert_eq!(searcher.segment_readers()[1].max_doc(), 4);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
vec![3]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
vec![3]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "c")),
vec![3]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "d")),
vec![3]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]
);
let score_field_reader: U64FastFieldReader = searcher
.segment_reader(0)
@@ -603,33 +660,46 @@ mod tests {
}
{
// merging the segments
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
index_writer
.merge(&segment_ids)
.wait()
.expect("Merging failed");
let segment_ids = index.searchable_segment_ids().expect(
"Searchable segments failed.",
);
index_writer.merge(&segment_ids).wait().expect(
"Merging failed",
);
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 3);
assert_eq!(searcher.segment_readers()[0].num_docs(), 3);
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
vec![3]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
vec![3]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "c")),
vec![3]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "d")),
vec![3]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]
);
let score_field_reader: U64FastFieldReader = searcher
.segment_reader(0)
.get_fast_field_reader(score_field)
@@ -648,20 +718,34 @@ mod tests {
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "c")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "d")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]
);
let score_field_reader: U64FastFieldReader = searcher
.segment_reader(0)
.get_fast_field_reader(score_field)
@@ -671,13 +755,12 @@ mod tests {
}
{
// Test merging a single segment in order to remove deletes.
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
index_writer
.merge(&segment_ids)
.wait()
.expect("Merging failed");
let segment_ids = index.searchable_segment_ids().expect(
"Searchable segments failed.",
);
index_writer.merge(&segment_ids).wait().expect(
"Merging failed",
);
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
@@ -685,20 +768,34 @@ mod tests {
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].max_doc(), 2);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "c")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "d")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]
);
let score_field_reader: U64FastFieldReader = searcher
.segment_reader(0)
.get_fast_field_reader(score_field)
@@ -710,13 +807,12 @@ mod tests {
{
// Test removing all docs
index_writer.delete_term(Term::from_field_text(text_field, "g"));
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
index_writer
.merge(&segment_ids)
.wait()
.expect("Merging failed");
let segment_ids = index.searchable_segment_ids().expect(
"Searchable segments failed.",
);
index_writer.merge(&segment_ids).wait().expect(
"Merging failed",
);
index.load_searchers().unwrap();
let ref searcher = *index.searcher();

View File

@@ -44,10 +44,11 @@ pub struct SegmentEntry {
impl SegmentEntry {
/// Create a new `SegmentEntry`
pub fn new(segment_meta: SegmentMeta,
delete_cursor: DeleteCursor,
delete_bitset: Option<BitSet>)
-> SegmentEntry {
pub fn new(
segment_meta: SegmentMeta,
delete_cursor: DeleteCursor,
delete_bitset: Option<BitSet>,
) -> SegmentEntry {
SegmentEntry {
meta: segment_meta,
state: SegmentState::Ready,

View File

@@ -32,31 +32,36 @@ pub struct SegmentManager {
impl Debug for SegmentManager {
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
let lock = self.read();
write!(f,
"{{ uncommitted: {:?}, committed: {:?} }}",
lock.uncommitted,
lock.committed)
write!(
f,
"{{ uncommitted: {:?}, committed: {:?} }}",
lock.uncommitted,
lock.committed
)
}
}
pub fn get_mergeable_segments(segment_manager: &SegmentManager)
-> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
pub fn get_mergeable_segments(
segment_manager: &SegmentManager,
) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
let registers_lock = segment_manager.read();
(registers_lock.committed.get_mergeable_segments(),
registers_lock.uncommitted.get_mergeable_segments())
(
registers_lock.committed.get_mergeable_segments(),
registers_lock.uncommitted.get_mergeable_segments(),
)
}
impl SegmentManager {
pub fn from_segments(segment_metas: Vec<SegmentMeta>,
delete_cursor: DeleteCursor)
-> SegmentManager {
pub fn from_segments(
segment_metas: Vec<SegmentMeta>,
delete_cursor: DeleteCursor,
) -> SegmentManager {
SegmentManager {
registers: RwLock::new(SegmentRegisters {
uncommitted: SegmentRegister::default(),
committed: SegmentRegister::new(segment_metas,
delete_cursor),
writing: HashSet::new(),
}),
uncommitted: SegmentRegister::default(),
committed: SegmentRegister::new(segment_metas, delete_cursor),
writing: HashSet::new(),
}),
}
}
@@ -94,25 +99,24 @@ impl SegmentManager {
pub fn segment_entry(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
let registers = self.read();
registers
.committed
.segment_entry(segment_id)
.or_else(|| registers.uncommitted.segment_entry(segment_id))
registers.committed.segment_entry(segment_id).or_else(|| {
registers.uncommitted.segment_entry(segment_id)
})
}
// Lock poisoning should never happen :
// The lock is acquired and released within this class,
// and the operations cannot panic.
fn read(&self) -> RwLockReadGuard<SegmentRegisters> {
self.registers
.read()
.expect("Failed to acquire read lock on SegmentManager.")
self.registers.read().expect(
"Failed to acquire read lock on SegmentManager.",
)
}
fn write(&self) -> RwLockWriteGuard<SegmentRegisters> {
self.registers
.write()
.expect("Failed to acquire write lock on SegmentManager.")
self.registers.write().expect(
"Failed to acquire write lock on SegmentManager.",
)
}
pub fn commit(&self, segment_entries: Vec<SegmentEntry>) {
@@ -140,9 +144,11 @@ impl SegmentManager {
}
pub fn cancel_merge(&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_id: SegmentId) {
pub fn cancel_merge(
&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_id: SegmentId,
) {
let mut registers_lock = self.write();
@@ -150,13 +156,15 @@ impl SegmentManager {
{
let target_segment_register: &mut SegmentRegister;
target_segment_register = {
if registers_lock
.uncommitted
.contains_all(before_merge_segment_ids) {
if registers_lock.uncommitted.contains_all(
before_merge_segment_ids,
)
{
&mut registers_lock.uncommitted
} else if registers_lock
.committed
.contains_all(before_merge_segment_ids) {
} else if registers_lock.committed.contains_all(
before_merge_segment_ids,
)
{
&mut registers_lock.committed
} else {
warn!("couldn't find segment in SegmentManager");
@@ -185,23 +193,26 @@ impl SegmentManager {
registers_lock.uncommitted.add_segment_entry(segment_entry);
}
pub fn end_merge(&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_entry: SegmentEntry) {
pub fn end_merge(
&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_entry: SegmentEntry,
) {
let mut registers_lock = self.write();
registers_lock
.writing
.remove(&after_merge_segment_entry.segment_id());
registers_lock.writing.remove(&after_merge_segment_entry
.segment_id());
let target_register: &mut SegmentRegister = {
if registers_lock
.uncommitted
.contains_all(before_merge_segment_ids) {
if registers_lock.uncommitted.contains_all(
before_merge_segment_ids,
)
{
&mut registers_lock.uncommitted
} else if registers_lock
.committed
.contains_all(before_merge_segment_ids) {
} else if registers_lock.committed.contains_all(
before_merge_segment_ids,
)
{
&mut registers_lock.committed
} else {
warn!("couldn't find segment in SegmentManager");

View File

@@ -24,7 +24,12 @@ impl Debug for SegmentRegister {
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
try!(write!(f, "SegmentRegister("));
for (k, v) in &self.segment_states {
try!(write!(f, "{}:{}, ", k.short_uuid_string(), v.state().letter_code()));
try!(write!(
f,
"{}:{}, ",
k.short_uuid_string(),
v.state().letter_code()
));
}
try!(write!(f, ")"));
Ok(())
@@ -74,9 +79,9 @@ impl SegmentRegister {
}
pub fn contains_all(&mut self, segment_ids: &[SegmentId]) -> bool {
segment_ids
.iter()
.all(|segment_id| self.segment_states.contains_key(segment_id))
segment_ids.iter().all(|segment_id| {
self.segment_states.contains_key(segment_id)
})
}
pub fn add_segment_entry(&mut self, segment_entry: SegmentEntry) {
@@ -91,14 +96,18 @@ impl SegmentRegister {
pub fn cancel_merge(&mut self, segment_id: &SegmentId) {
self.segment_states
.get_mut(segment_id)
.expect("Received a merge notification for a segment that is not registered")
.expect(
"Received a merge notification for a segment that is not registered",
)
.cancel_merge();
}
pub fn start_merge(&mut self, segment_id: &SegmentId) {
self.segment_states
.get_mut(segment_id)
.expect("Received a merge notification for a segment that is not registered")
.expect(
"Received a merge notification for a segment that is not registered",
)
.start_merge();
}
@@ -144,34 +153,42 @@ mod tests {
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry);
}
assert_eq!(segment_register
.segment_entry(&segment_id_a)
.unwrap()
.state(),
SegmentState::Ready);
assert_eq!(
segment_register
.segment_entry(&segment_id_a)
.unwrap()
.state(),
SegmentState::Ready
);
assert_eq!(segment_ids(&segment_register), vec![segment_id_a]);
{
let segment_meta = SegmentMeta::new(segment_id_b);
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry);
}
assert_eq!(segment_register
.segment_entry(&segment_id_b)
.unwrap()
.state(),
SegmentState::Ready);
assert_eq!(
segment_register
.segment_entry(&segment_id_b)
.unwrap()
.state(),
SegmentState::Ready
);
segment_register.start_merge(&segment_id_a);
segment_register.start_merge(&segment_id_b);
assert_eq!(segment_register
.segment_entry(&segment_id_a)
.unwrap()
.state(),
SegmentState::InMerge);
assert_eq!(segment_register
.segment_entry(&segment_id_b)
.unwrap()
.state(),
SegmentState::InMerge);
assert_eq!(
segment_register
.segment_entry(&segment_id_a)
.unwrap()
.state(),
SegmentState::InMerge
);
assert_eq!(
segment_register
.segment_entry(&segment_id_b)
.unwrap()
.state(),
SegmentState::InMerge
);
segment_register.remove_segment(&segment_id_a);
segment_register.remove_segment(&segment_id_b);
{

View File

@@ -28,11 +28,11 @@ impl SegmentSerializer {
let postings_serializer = try!(InvertedIndexSerializer::open(segment));
Ok(SegmentSerializer {
postings_serializer: postings_serializer,
store_writer: StoreWriter::new(store_write),
fast_field_serializer: fast_field_serializer,
fieldnorms_serializer: fieldnorms_serializer,
})
postings_serializer: postings_serializer,
store_writer: StoreWriter::new(store_write),
fast_field_serializer: fast_field_serializer,
fieldnorms_serializer: fieldnorms_serializer,
})
}
/// Accessor to the `PostingsSerializer`.

View File

@@ -61,11 +61,12 @@ pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) -
/// and flushed.
///
/// This method is not part of tantivy's public API
pub fn save_metas(segment_metas: Vec<SegmentMeta>,
schema: Schema,
opstamp: u64,
directory: &mut Directory)
-> Result<()> {
pub fn save_metas(
segment_metas: Vec<SegmentMeta>,
schema: Schema,
opstamp: u64,
directory: &mut Directory,
) -> Result<()> {
let metas = IndexMeta {
segments: segment_metas,
schema: schema,
@@ -89,11 +90,12 @@ pub struct SegmentUpdater(Arc<InnerSegmentUpdater>);
fn perform_merge(segment_ids: &[SegmentId],
segment_updater: &SegmentUpdater,
mut merged_segment: Segment,
target_opstamp: u64)
-> Result<SegmentEntry> {
fn perform_merge(
segment_ids: &[SegmentId],
segment_updater: &SegmentUpdater,
mut merged_segment: Segment,
target_opstamp: u64,
) -> Result<SegmentEntry> {
// first we need to apply deletes to our segment.
info!("Start merge: {:?}", segment_ids);
@@ -105,17 +107,21 @@ fn perform_merge(segment_ids: &[SegmentId],
for segment_id in segment_ids {
if let Some(mut segment_entry) =
segment_updater.0.segment_manager.segment_entry(segment_id) {
segment_updater.0.segment_manager.segment_entry(segment_id)
{
let segment = index.segment(segment_entry.meta().clone());
if let Some(file_protection) =
advance_deletes(segment, &mut segment_entry, target_opstamp)? {
advance_deletes(segment, &mut segment_entry, target_opstamp)?
{
file_protections.push(file_protection);
}
segment_entries.push(segment_entry);
} else {
error!("Error, had to abort merge as some of the segment is not managed anymore.");
let msg = format!("Segment {:?} requested for merge is not managed.",
segment_id);
let msg = format!(
"Segment {:?} requested for merge is not managed.",
segment_id
);
bail!(ErrorKind::InvalidArgument(msg));
}
}
@@ -134,12 +140,13 @@ fn perform_merge(segment_ids: &[SegmentId],
// ... we just serialize this index merger in our new segment
// to merge the two segments.
let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment)
.expect("Creating index serializer failed");
let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment).expect(
"Creating index serializer failed",
);
let num_docs = merger
.write(segment_serializer)
.expect("Serializing merged index failed");
let num_docs = merger.write(segment_serializer).expect(
"Serializing merged index failed",
);
let mut segment_meta = SegmentMeta::new(merged_segment.id());
segment_meta.set_max_doc(num_docs);
@@ -161,23 +168,24 @@ struct InnerSegmentUpdater {
}
impl SegmentUpdater {
pub fn new(index: Index,
stamper: Stamper,
delete_cursor: DeleteCursor)
-> Result<SegmentUpdater> {
pub fn new(
index: Index,
stamper: Stamper,
delete_cursor: DeleteCursor,
) -> Result<SegmentUpdater> {
let segments = index.searchable_segment_metas()?;
let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
Ok(SegmentUpdater(Arc::new(InnerSegmentUpdater {
pool: CpuPool::new(1),
index: index,
segment_manager: segment_manager,
merge_policy: RwLock::new(box DefaultMergePolicy::default()),
merging_thread_id: AtomicUsize::default(),
merging_threads: RwLock::new(HashMap::new()),
generation: AtomicUsize::default(),
killed: AtomicBool::new(false),
stamper: stamper,
})))
pool: CpuPool::new(1),
index: index,
segment_manager: segment_manager,
merge_policy: RwLock::new(box DefaultMergePolicy::default()),
merging_thread_id: AtomicUsize::default(),
merging_threads: RwLock::new(HashMap::new()),
generation: AtomicUsize::default(),
killed: AtomicBool::new(false),
stamper: stamper,
})))
}
pub fn new_segment(&self) -> Segment {
@@ -199,10 +207,10 @@ impl SegmentUpdater {
self.0.merging_thread_id.fetch_add(1, Ordering::SeqCst)
}
fn run_async<T: 'static + Send, F: 'static + Send + FnOnce(SegmentUpdater) -> T>
(&self,
f: F)
-> CpuFuture<T, Error> {
fn run_async<T: 'static + Send, F: 'static + Send + FnOnce(SegmentUpdater) -> T>(
&self,
f: F,
) -> CpuFuture<T, Error> {
let me_clone = self.clone();
self.0.pool.spawn_fn(move || Ok(f(me_clone)))
}
@@ -211,11 +219,10 @@ impl SegmentUpdater {
pub fn add_segment(&self, generation: usize, segment_entry: SegmentEntry) -> bool {
if generation >= self.0.generation.load(Ordering::Acquire) {
self.run_async(|segment_updater| {
segment_updater.0.segment_manager.add_segment(segment_entry);
segment_updater.consider_merge_options();
true
})
.forget();
segment_updater.0.segment_manager.add_segment(segment_entry);
segment_updater.consider_merge_options();
true
}).forget();
true
} else {
false
@@ -249,46 +256,46 @@ impl SegmentUpdater {
if self.is_alive() {
let index = &self.0.index;
let directory = index.directory();
save_metas(self.0.segment_manager.committed_segment_metas(),
index.schema(),
opstamp,
directory.box_clone().borrow_mut())
.expect("Could not save metas.");
save_metas(
self.0.segment_manager.committed_segment_metas(),
index.schema(),
opstamp,
directory.box_clone().borrow_mut(),
).expect("Could not save metas.");
}
}
pub fn garbage_collect_files(&self) -> Result<()> {
self.run_async(move |segment_updater| {
segment_updater.garbage_collect_files_exec();
})
.wait()
}).wait()
}
fn garbage_collect_files_exec(&self) {
info!("Running garbage collection");
let mut index = self.0.index.clone();
index.directory_mut().garbage_collect(|| {
self.0.segment_manager.list_files()
});
index.directory_mut().garbage_collect(
|| self.0.segment_manager.list_files(),
);
}
pub fn commit(&self, opstamp: u64) -> Result<()> {
self.run_async(move |segment_updater| if segment_updater.is_alive() {
let segment_entries = segment_updater
.purge_deletes(opstamp)
.expect("Failed purge deletes");
segment_updater.0.segment_manager.commit(segment_entries);
segment_updater.save_metas(opstamp);
segment_updater.garbage_collect_files_exec();
segment_updater.consider_merge_options();
})
.wait()
let segment_entries = segment_updater.purge_deletes(opstamp).expect(
"Failed purge deletes",
);
segment_updater.0.segment_manager.commit(segment_entries);
segment_updater.save_metas(opstamp);
segment_updater.garbage_collect_files_exec();
segment_updater.consider_merge_options();
}).wait()
}
pub fn start_merge(&self,
segment_ids: &[SegmentId])
-> impl Future<Item = SegmentMeta, Error = Canceled> {
pub fn start_merge(
&self,
segment_ids: &[SegmentId],
) -> impl Future<Item = SegmentMeta, Error = Canceled> {
self.0.segment_manager.start_merge(segment_ids);
let segment_updater_clone = self.clone();
@@ -308,10 +315,12 @@ impl SegmentUpdater {
// first we need to apply deletes to our segment.
let merged_segment = segment_updater_clone.new_segment();
let merged_segment_id = merged_segment.id();
let merge_result = perform_merge(&segment_ids_vec,
&segment_updater_clone,
merged_segment,
target_opstamp);
let merge_result = perform_merge(
&segment_ids_vec,
&segment_updater_clone,
merged_segment,
target_opstamp,
);
match merge_result {
Ok(after_merge_segment_entry) => {
@@ -345,11 +354,10 @@ impl SegmentUpdater {
.remove(&merging_thread_id);
Ok(())
});
self.0
.merging_threads
.write()
.unwrap()
.insert(merging_thread_id, merging_join_handle);
self.0.merging_threads.write().unwrap().insert(
merging_thread_id,
merging_join_handle,
);
merging_future_recv
}
@@ -368,19 +376,23 @@ impl SegmentUpdater {
}
}
fn cancel_merge(&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_entry: SegmentId) {
self.0
.segment_manager
.cancel_merge(before_merge_segment_ids, after_merge_segment_entry);
fn cancel_merge(
&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_entry: SegmentId,
) {
self.0.segment_manager.cancel_merge(
before_merge_segment_ids,
after_merge_segment_entry,
);
}
fn end_merge(&self,
before_merge_segment_ids: Vec<SegmentId>,
mut after_merge_segment_entry: SegmentEntry)
-> Result<()> {
fn end_merge(
&self,
before_merge_segment_ids: Vec<SegmentId>,
mut after_merge_segment_entry: SegmentEntry,
) -> Result<()> {
self.run_async(move |segment_updater| {
info!("End merge {:?}", after_merge_segment_entry.meta());
@@ -391,28 +403,37 @@ impl SegmentUpdater {
if delete_operation.opstamp < committed_opstamp {
let index = &segment_updater.0.index;
let segment = index.segment(after_merge_segment_entry.meta().clone());
match advance_deletes(segment,
&mut after_merge_segment_entry,
committed_opstamp) {
match advance_deletes(
segment,
&mut after_merge_segment_entry,
committed_opstamp,
) {
Ok(file_protection_opt_res) => {
_file_protection_opt = file_protection_opt_res;
}
Err(e) => {
error!("Merge of {:?} was cancelled (advancing deletes failed): {:?}",
before_merge_segment_ids, e);
error!(
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
before_merge_segment_ids,
e
);
// ... cancel merge
if cfg!(test) {
panic!("Merge failed.");
}
segment_updater.cancel_merge(&before_merge_segment_ids,
after_merge_segment_entry.segment_id());
segment_updater.cancel_merge(
&before_merge_segment_ids,
after_merge_segment_entry.segment_id(),
);
return;
}
}
}
}
segment_updater.0.segment_manager.end_merge(&before_merge_segment_ids,
after_merge_segment_entry);
segment_updater.0.segment_manager.end_merge(
&before_merge_segment_ids,
after_merge_segment_entry,
);
segment_updater.consider_merge_options();
info!("save metas");
segment_updater.save_metas(segment_updater.0.index.opstamp());
@@ -450,10 +471,9 @@ impl SegmentUpdater {
}
debug!("wait merging thread {}", new_merging_threads.len());
for (_, merging_thread_handle) in new_merging_threads {
merging_thread_handle
.join()
.map(|_| ())
.map_err(|_| ErrorKind::ErrorInThread("Merging thread failed.".into()))?;
merging_thread_handle.join().map(|_| ()).map_err(|_| {
ErrorKind::ErrorInThread("Merging thread failed.".into())
})?;
}
// Our merging thread may have queued their completed
self.run_async(move |_| {}).wait()?;
@@ -522,9 +542,9 @@ mod tests {
assert_eq!(index.searcher().num_docs(), 302);
{
index_writer
.wait_merging_threads()
.expect("waiting for merging threads");
index_writer.wait_merging_threads().expect(
"waiting for merging threads",
);
}
index.load_searchers().unwrap();

View File

@@ -54,22 +54,23 @@ impl<'a> SegmentWriter<'a> {
/// the flushing behavior as a buffer limit
/// - segment: The segment being written
/// - schema
pub fn for_segment(heap: &'a Heap,
table_bits: usize,
mut segment: Segment,
schema: &Schema)
-> Result<SegmentWriter<'a>> {
pub fn for_segment(
heap: &'a Heap,
table_bits: usize,
mut segment: Segment,
schema: &Schema,
) -> Result<SegmentWriter<'a>> {
let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits, heap);
Ok(SegmentWriter {
heap: heap,
max_doc: 0,
multifield_postings: multifield_postings,
fieldnorms_writer: create_fieldnorms_writer(schema),
segment_serializer: segment_serializer,
fast_field_writers: FastFieldsWriter::from_schema(schema),
doc_opstamps: Vec::with_capacity(1_000),
})
heap: heap,
max_doc: 0,
multifield_postings: multifield_postings,
fieldnorms_writer: create_fieldnorms_writer(schema),
segment_serializer: segment_serializer,
fast_field_writers: FastFieldsWriter::from_schema(schema),
doc_opstamps: Vec::with_capacity(1_000),
})
}
/// Lay on disk the current content of the `SegmentWriter`
@@ -77,10 +78,12 @@ impl<'a> SegmentWriter<'a> {
/// Finalize consumes the `SegmentWriter`, so that it cannot
/// be used afterwards.
pub fn finalize(self) -> Result<Vec<u64>> {
write(&self.multifield_postings,
&self.fast_field_writers,
&self.fieldnorms_writer,
self.segment_serializer)?;
write(
&self.multifield_postings,
&self.fast_field_writers,
&self.fieldnorms_writer,
self.segment_serializer,
)?;
Ok(self.doc_opstamps)
}
@@ -107,10 +110,11 @@ impl<'a> SegmentWriter<'a> {
/// Indexes a new document
///
/// As a user, you should rather use `IndexWriter`'s add_document.
pub fn add_document(&mut self,
add_operation: &AddOperation,
schema: &Schema)
-> io::Result<()> {
pub fn add_document(
&mut self,
add_operation: &AddOperation,
schema: &Schema,
) -> io::Result<()> {
let doc_id = self.max_doc;
let doc = &add_operation.document;
self.doc_opstamps.push(add_operation.opstamp);
@@ -122,8 +126,11 @@ impl<'a> SegmentWriter<'a> {
match *field_options.field_type() {
FieldType::Str(ref text_options) => {
let num_tokens: u32 = if text_options.get_indexing_options().is_tokenized() {
self.multifield_postings
.index_text(doc_id, field, &field_values)
self.multifield_postings.index_text(
doc_id,
field,
&field_values,
)
} else {
let num_field_values = field_values.len() as u32;
for field_value in field_values {
@@ -132,15 +139,17 @@ impl<'a> SegmentWriter<'a> {
}
num_field_values
};
self.fieldnorms_writer
.get_field_writer(field)
.map(|field_norms_writer| field_norms_writer.add_val(num_tokens as u64));
self.fieldnorms_writer.get_field_writer(field).map(
|field_norms_writer| field_norms_writer.add_val(num_tokens as u64),
);
}
FieldType::U64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_u64(field_value.field(),
field_value.value().u64_value());
let term = Term::from_field_u64(
field_value.field(),
field_value.value().u64_value(),
);
self.multifield_postings.suscribe(doc_id, &term);
}
}
@@ -148,8 +157,10 @@ impl<'a> SegmentWriter<'a> {
FieldType::I64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_i64(field_value.field(),
field_value.value().i64_value());
let term = Term::from_field_i64(
field_value.field(),
field_value.value().i64_value(),
);
self.multifield_postings.suscribe(doc_id, &term);
}
}
@@ -160,7 +171,9 @@ impl<'a> SegmentWriter<'a> {
self.fast_field_writers.add_document(doc);
let stored_fieldvalues: Vec<&FieldValue> = doc.field_values()
.iter()
.filter(|field_value| schema.get_field_entry(field_value.field()).is_stored())
.filter(|field_value| {
schema.get_field_entry(field_value.field()).is_stored()
})
.collect();
let doc_writer = self.segment_serializer.get_store_writer();
try!(doc_writer.store(&stored_fieldvalues));
@@ -191,15 +204,22 @@ impl<'a> SegmentWriter<'a> {
}
// This method is used as a trick to workaround the borrow checker
fn write(multifield_postings: &MultiFieldPostingsWriter,
fast_field_writers: &FastFieldsWriter,
fieldnorms_writer: &FastFieldsWriter,
mut serializer: SegmentSerializer)
-> Result<()> {
fn write(
multifield_postings: &MultiFieldPostingsWriter,
fast_field_writers: &FastFieldsWriter,
fieldnorms_writer: &FastFieldsWriter,
mut serializer: SegmentSerializer,
) -> Result<()> {
try!(multifield_postings.serialize(serializer.get_postings_serializer()));
try!(fast_field_writers.serialize(serializer.get_fast_field_serializer()));
try!(fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer()));
try!(multifield_postings.serialize(
serializer.get_postings_serializer(),
));
try!(fast_field_writers.serialize(
serializer.get_fast_field_serializer(),
));
try!(fieldnorms_writer.serialize(
serializer.get_fieldnorms_serializer(),
));
try!(serializer.close());
Ok(())
@@ -208,10 +228,12 @@ fn write(multifield_postings: &MultiFieldPostingsWriter,
impl<'a> SerializableSegment for SegmentWriter<'a> {
fn write(&self, serializer: SegmentSerializer) -> Result<u32> {
let max_doc = self.max_doc;
write(&self.multifield_postings,
&self.fast_field_writers,
&self.fieldnorms_writer,
serializer)?;
write(
&self.multifield_postings,
&self.fast_field_writers,
&self.fieldnorms_writer,
serializer,
)?;
Ok(max_doc)
}
}

View File

@@ -68,7 +68,7 @@ extern crate stable_deref_trait;
#[cfg(test)]
extern crate env_logger;
#[cfg(feature="simdcompression")]
#[cfg(feature = "simdcompression")]
extern crate libc;
#[cfg(windows)]
@@ -391,16 +391,24 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(text_field).unwrap();
assert!(inverted_index.read_postings(&term_abcd, FreqAndPositions).is_none());
let inverted_index = reader.inverted_index(text_field);
assert!(
inverted_index
.read_postings(&term_abcd, FreqAndPositions)
.is_none()
);
{
let mut postings = inverted_index.read_postings(&term_a, FreqAndPositions).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, FreqAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 5);
assert!(!postings.advance());
}
{
let mut postings = inverted_index.read_postings(&term_b, FreqAndPositions).unwrap();
let mut postings = inverted_index
.read_postings(&term_b, FreqAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 3);
assert!(postings.advance());
@@ -426,17 +434,25 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(term_abcd.field()).unwrap();
let inverted_index = reader.inverted_index(term_abcd.field());
assert!(inverted_index.read_postings(&term_abcd, FreqAndPositions).is_none());
assert!(
inverted_index
.read_postings(&term_abcd, FreqAndPositions)
.is_none()
);
{
let mut postings = inverted_index.read_postings(&term_a, FreqAndPositions).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, FreqAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 5);
assert!(!postings.advance());
}
{
let mut postings = inverted_index.read_postings(&term_b, FreqAndPositions).unwrap();
let mut postings = inverted_index
.read_postings(&term_b, FreqAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 3);
assert!(postings.advance());
@@ -462,14 +478,22 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(term_abcd.field()).unwrap();
assert!(inverted_index.read_postings(&term_abcd, FreqAndPositions).is_none());
let inverted_index = reader.inverted_index(term_abcd.field());
assert!(
inverted_index
.read_postings(&term_abcd, FreqAndPositions)
.is_none()
);
{
let mut postings = inverted_index.read_postings(&term_a, FreqAndPositions).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, FreqAndPositions)
.unwrap();
assert!(!postings.advance());
}
{
let mut postings = inverted_index.read_postings(&term_b, FreqAndPositions).unwrap();
let mut postings = inverted_index
.read_postings(&term_b, FreqAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 3);
assert!(postings.advance());
@@ -477,7 +501,9 @@ mod tests {
assert!(!postings.advance());
}
{
let mut postings = inverted_index.read_postings(&term_c, FreqAndPositions).unwrap();
let mut postings = inverted_index
.read_postings(&term_c, FreqAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 4);
assert!(!postings.advance());
@@ -501,7 +527,7 @@ mod tests {
let term = Term::from_field_u64(field, 1u64);
let mut postings = searcher
.segment_reader(0)
.inverted_index(term.field()).unwrap()
.inverted_index(term.field())
.read_postings(&term, SegmentPostingsOption::NoFreq)
.unwrap();
assert!(postings.advance());
@@ -525,7 +551,7 @@ mod tests {
let term = Term::from_field_i64(value_field, negative_val);
let mut postings = searcher
.segment_reader(0)
.inverted_index(term.field()).unwrap()
.inverted_index(term.field())
.read_postings(&term, SegmentPostingsOption::NoFreq)
.unwrap();
assert!(postings.advance());
@@ -588,11 +614,17 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(text_field).unwrap();
let inverted_index = reader.inverted_index(text_field);
let term_abcd = Term::from_field_text(text_field, "abcd");
assert!(inverted_index.read_postings(&term_abcd, FreqAndPositions).is_none());
assert!(
inverted_index
.read_postings(&term_abcd, FreqAndPositions)
.is_none()
);
let term_af = Term::from_field_text(text_field, "af");
let mut postings = inverted_index.read_postings(&term_af, FreqAndPositions).unwrap();
let mut postings = inverted_index
.read_postings(&term_af, FreqAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 0);
assert_eq!(postings.term_freq(), 3);
@@ -634,29 +666,43 @@ mod tests {
collector.docs()
};
{
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
vec![1, 2]);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
vec![1, 2]
);
}
{
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
vec![0]);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
vec![0]
);
}
{
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
vec![0, 1, 2]);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
vec![0, 1, 2]
);
}
{
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "c")]),
vec![1, 2]);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "c")]),
vec![1, 2]
);
}
{
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "d")]),
vec![2]);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "d")]),
vec![2]
);
}
{
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b"),
Term::from_field_text(text_field, "a")]),
vec![0, 1, 2]);
assert_eq!(
get_doc_ids(vec![
Term::from_field_text(text_field, "b"),
Term::from_field_text(text_field, "a"),
]),
vec![0, 1, 2]
);
}
}
}
@@ -693,7 +739,8 @@ mod tests {
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", TEXT);
let other_text_field = schema_builder.add_text_field("text2", TEXT);
let document = doc!(text_field => "tantivy",
let document =
doc!(text_field => "tantivy",
text_field => "some other value",
other_text_field => "short");
assert_eq!(document.len(), 3);

View File

@@ -72,8 +72,7 @@ pub trait DocSet {
for (i, buffer_val) in buffer.iter_mut().enumerate() {
if self.advance() {
*buffer_val = self.doc();
}
else {
} else {
return i;
}
}

View File

@@ -65,7 +65,9 @@ mod tests {
field_serializer.new_term("abc".as_bytes()).unwrap();
for doc_id in 0u32..120u32 {
let delta_positions = vec![1, 2, 3, 2];
field_serializer.write_doc(doc_id, 2, &delta_positions).unwrap();
field_serializer
.write_doc(doc_id, 2, &delta_positions)
.unwrap();
}
field_serializer.close_term().unwrap();
}
@@ -84,8 +86,8 @@ mod tests {
let heap = Heap::with_capacity(10_000_000);
{
let mut segment_writer = SegmentWriter::for_segment(&heap, 18, segment.clone(), &schema)
.unwrap();
let mut segment_writer =
SegmentWriter::for_segment(&heap, 18, segment.clone(), &schema).unwrap();
{
let mut doc = Document::default();
// checking that position works if the field has two values
@@ -131,15 +133,17 @@ mod tests {
}
{
let term_a = Term::from_field_text(text_field, "abcdef");
assert!(segment_reader
.inverted_index(term_a.field()).unwrap()
assert!(
segment_reader
.inverted_index(term_a.field())
.read_postings(&term_a, FreqAndPositions)
.is_none());
.is_none()
);
}
{
let term_a = Term::from_field_text(text_field, "a");
let mut postings_a = segment_reader
.inverted_index(term_a.field()).unwrap()
.inverted_index(term_a.field())
.read_postings(&term_a, FreqAndPositions)
.unwrap();
assert_eq!(postings_a.len(), 1000);
@@ -162,7 +166,7 @@ mod tests {
{
let term_e = Term::from_field_text(text_field, "e");
let mut postings_e = segment_reader
.inverted_index(term_e.field()).unwrap()
.inverted_index(term_e.field())
.read_postings(&term_e, FreqAndPositions)
.unwrap();
assert_eq!(postings_e.len(), 1000 - 2);
@@ -202,8 +206,10 @@ mod tests {
assert!(index_writer.commit().is_ok());
}
index.load_searchers().unwrap();
let term_query = TermQuery::new(Term::from_field_text(text_field, "a"),
SegmentPostingsOption::NoFreq);
let term_query = TermQuery::new(
Term::from_field_text(text_field, "a"),
SegmentPostingsOption::NoFreq,
);
let searcher = index.searcher();
let mut term_weight = term_query.specialized_weight(&*searcher);
term_weight.segment_postings_options = SegmentPostingsOption::FreqAndPositions;
@@ -250,7 +256,7 @@ mod tests {
for i in 0..num_docs - 1 {
for j in i + 1..num_docs {
let mut segment_postings = segment_reader
.inverted_index(term_2.field()).unwrap()
.inverted_index(term_2.field())
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
.unwrap();
@@ -264,7 +270,7 @@ mod tests {
{
let mut segment_postings = segment_reader
.inverted_index(term_2.field()).unwrap()
.inverted_index(term_2.field())
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
.unwrap();
@@ -285,7 +291,7 @@ mod tests {
// check that filtering works
{
let mut segment_postings = segment_reader
.inverted_index(term_0.field()).unwrap()
.inverted_index(term_0.field())
.read_postings(&term_0, SegmentPostingsOption::NoFreq)
.unwrap();
@@ -295,7 +301,7 @@ mod tests {
}
let mut segment_postings = segment_reader
.inverted_index(term_0.field()).unwrap()
.inverted_index(term_0.field())
.read_postings(&term_0, SegmentPostingsOption::NoFreq)
.unwrap();
@@ -320,7 +326,7 @@ mod tests {
// make sure seeking still works
for i in 0..num_docs {
let mut segment_postings = segment_reader
.inverted_index(term_2.field()).unwrap()
.inverted_index(term_2.field())
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
.unwrap();
@@ -336,7 +342,7 @@ mod tests {
// now try with a longer sequence
{
let mut segment_postings = segment_reader
.inverted_index(term_2.field()).unwrap()
.inverted_index(term_2.field())
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
.unwrap();
@@ -372,14 +378,14 @@ mod tests {
// finally, check that it's empty
{
let mut segment_postings = segment_reader
.inverted_index(term_2.field()).unwrap()
.inverted_index(term_2.field())
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
.unwrap();
assert_eq!(segment_postings.skip_next(0), SkipResult::End);
let mut segment_postings = segment_reader
.inverted_index(term_2.field()).unwrap()
.inverted_index(term_2.field())
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
.unwrap();
@@ -446,12 +452,12 @@ mod tests {
let segment_reader = searcher.segment_reader(0);
b.iter(|| {
let mut segment_postings = segment_reader
.inverted_index(TERM_A.field()).unwrap()
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
.unwrap();
while segment_postings.advance() {}
});
let mut segment_postings = segment_reader
.inverted_index(TERM_A.field())
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
.unwrap();
while segment_postings.advance() {}
});
}
#[bench]
@@ -460,25 +466,27 @@ mod tests {
let segment_reader = searcher.segment_reader(0);
b.iter(|| {
let segment_postings_a = segment_reader
.inverted_index(TERM_A.field()).unwrap()
.inverted_index(TERM_A.field())
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
.unwrap();
let segment_postings_b = segment_reader
.inverted_index(TERM_B.field()).unwrap()
.inverted_index(TERM_B.field())
.read_postings(&*TERM_B, SegmentPostingsOption::NoFreq)
.unwrap();
let segment_postings_c = segment_reader
.inverted_index(TERM_C.field()).unwrap()
.inverted_index(TERM_C.field())
.read_postings(&*TERM_C, SegmentPostingsOption::NoFreq)
.unwrap();
let segment_postings_d = segment_reader
.inverted_index(TERM_D.field()).unwrap()
.inverted_index(TERM_D.field())
.read_postings(&*TERM_D, SegmentPostingsOption::NoFreq)
.unwrap();
let mut intersection = IntersectionDocSet::from(vec![segment_postings_a,
segment_postings_b,
segment_postings_c,
segment_postings_d]);
let mut intersection = IntersectionDocSet::from(vec![
segment_postings_a,
segment_postings_b,
segment_postings_c,
segment_postings_d,
]);
while intersection.advance() {}
});
}
@@ -489,7 +497,7 @@ mod tests {
let docs = tests::sample(segment_reader.num_docs(), p);
let mut segment_postings = segment_reader
.inverted_index(TERM_A.field()).unwrap()
.inverted_index(TERM_A.field())
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
.unwrap();
@@ -506,7 +514,7 @@ mod tests {
b.iter(|| {
let mut segment_postings = segment_reader
.inverted_index(TERM_A.field()).unwrap()
.inverted_index(TERM_A.field())
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
.unwrap();
for doc in &existing_docs {
@@ -544,7 +552,7 @@ mod tests {
b.iter(|| {
let n: u32 = test::black_box(17);
let mut segment_postings = segment_reader
.inverted_index(TERM_A.field()).unwrap()
.inverted_index(TERM_A.field())
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
.unwrap();
let mut s = 0u32;

View File

@@ -16,9 +16,10 @@ use schema::FieldEntry;
use schema::FieldType;
use schema::TextIndexingOptions;
fn posting_from_field_entry<'a>(field_entry: &FieldEntry,
heap: &'a Heap)
-> Box<PostingsWriter + 'a> {
fn posting_from_field_entry<'a>(
field_entry: &FieldEntry,
heap: &'a Heap,
) -> Box<PostingsWriter + 'a> {
match *field_entry.field_type() {
FieldType::Str(ref text_options) => {
match text_options.get_indexing_options() {
@@ -51,9 +52,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
let per_field_postings_writers: Vec<_> = schema
.fields()
.iter()
.map(|field_entry| {
posting_from_field_entry(field_entry, heap)
})
.map(|field_entry| posting_from_field_entry(field_entry, heap))
.collect();
MultiFieldPostingsWriter {
@@ -102,7 +101,11 @@ impl<'a> MultiFieldPostingsWriter<'a> {
let (_, stop) = offsets[i + 1];
let postings_writer = &self.per_field_postings_writers[field.0 as usize];
let mut field_serializer = serializer.new_field(field)?;
postings_writer.serialize(&term_offsets[start..stop], &mut field_serializer, self.heap)?;
postings_writer.serialize(
&term_offsets[start..stop],
&mut field_serializer,
self.heap,
)?;
field_serializer.close()?;
}
Ok(())
@@ -127,29 +130,33 @@ pub trait PostingsWriter {
/// * term - the term
/// * heap - heap used to store the postings information as well as the terms
/// in the hashmap.
fn suscribe(&mut self,
term_index: &mut HashMap,
doc: DocId,
pos: u32,
term: &Term,
heap: &Heap);
fn suscribe(
&mut self,
term_index: &mut HashMap,
doc: DocId,
pos: u32,
term: &Term,
heap: &Heap,
);
/// Serializes the postings on disk.
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(&self,
term_addrs: &[(&[u8], u32)],
serializer: &mut FieldSerializer,
heap: &Heap)
-> io::Result<()>;
fn serialize(
&self,
term_addrs: &[(&[u8], u32)],
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()>;
/// Tokenize a text and `suscribe` each of its tokens.
fn index_text<'a>(&mut self,
term_index: &mut HashMap,
doc_id: DocId,
field: Field,
field_values: &[&'a FieldValue],
heap: &Heap)
-> u32 {
fn index_text<'a>(
&mut self,
term_index: &mut HashMap,
doc_id: DocId,
field: Field,
field_values: &[&'a FieldValue],
heap: &Heap,
) -> u32 {
let mut pos = 0u32;
let mut num_tokens: u32 = 0u32;
let mut term = unsafe { Term::with_capacity(100) };
@@ -195,12 +202,14 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
}
impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {
fn suscribe(&mut self,
term_index: &mut HashMap,
doc: DocId,
position: u32,
term: &Term,
heap: &Heap) {
fn suscribe(
&mut self,
term_index: &mut HashMap,
doc: DocId,
position: u32,
term: &Term,
heap: &Heap,
) {
debug_assert!(term.as_slice().len() >= 4);
let recorder: &mut Rec = term_index.get_or_create(term);
let current_doc = recorder.current_doc();
@@ -213,11 +222,12 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
recorder.record_position(position, heap);
}
fn serialize(&self,
term_addrs: &[(&[u8], u32)],
serializer: &mut FieldSerializer,
heap: &Heap)
-> io::Result<()> {
fn serialize(
&self,
term_addrs: &[(&[u8], u32)],
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()> {
for &(term_bytes, addr) in term_addrs {
let recorder: &mut Rec = self.heap.get_mut_ref(addr);
serializer.new_term(term_bytes)?;
@@ -227,4 +237,3 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
Ok(())
}
}

View File

@@ -27,11 +27,12 @@ pub trait Recorder: HeapAllocable {
/// Close the document. It will help record the term frequency.
fn close_doc(&mut self, heap: &Heap);
/// Pushes the postings information to the serializer.
fn serialize(&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap)
-> io::Result<()>;
fn serialize(
&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()>;
}
/// Only records the doc ids
@@ -64,11 +65,12 @@ impl Recorder for NothingRecorder {
fn close_doc(&mut self, _heap: &Heap) {}
fn serialize(&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap)
-> io::Result<()> {
fn serialize(
&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()> {
for doc in self.stack.iter(self_addr, heap) {
serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)?;
}
@@ -116,21 +118,23 @@ impl Recorder for TermFrequencyRecorder {
}
fn serialize(&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap)
-> io::Result<()> {
fn serialize(
&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()> {
// the last document has not been closed...
// its term freq is self.current_tf.
let mut doc_iter = self.stack
.iter(self_addr, heap)
.chain(Some(self.current_tf).into_iter());
let mut doc_iter = self.stack.iter(self_addr, heap).chain(
Some(self.current_tf)
.into_iter(),
);
while let Some(doc) = doc_iter.next() {
let term_freq = doc_iter
.next()
.expect("The IndexWriter recorded a doc without a term freq.");
let term_freq = doc_iter.next().expect(
"The IndexWriter recorded a doc without a term freq.",
);
serializer.write_doc(doc, term_freq, &EMPTY_ARRAY)?;
}
Ok(())
@@ -171,11 +175,12 @@ impl Recorder for TFAndPositionRecorder {
self.stack.push(POSITION_END, heap);
}
fn serialize(&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap)
-> io::Result<()> {
fn serialize(
&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()> {
let mut doc_positions = Vec::with_capacity(100);
let mut positions_iter = self.stack.iter(self_addr, heap);
while let Some(doc) = positions_iter.next() {
@@ -189,7 +194,11 @@ impl Recorder for TFAndPositionRecorder {
prev_position = position;
}
}
serializer.write_doc(doc, doc_positions.len() as u32, &doc_positions)?;
serializer.write_doc(
doc,
doc_positions.len() as u32,
&doc_positions,
)?;
}
Ok(())
}

View File

@@ -25,11 +25,10 @@ struct PositionComputer {
}
impl PositionComputer {
pub fn new(positions_stream: CompressedIntStream) -> PositionComputer {
PositionComputer {
position_to_skip: None,
positions: vec!(),
positions: vec![],
positions_stream: positions_stream,
}
}
@@ -38,9 +37,9 @@ impl PositionComputer {
self.position_to_skip = Some(
self.position_to_skip
.map(|prev_skip| prev_skip + num_skip)
.unwrap_or(0)
);
}
.unwrap_or(0),
);
}
pub fn positions(&mut self, term_freq: usize) -> &[u32] {
if let Some(num_skip) = self.position_to_skip {
@@ -83,13 +82,13 @@ impl SegmentPostings {
/// * `data` - data array. The complete data is not necessarily used.
/// * `freq_handler` - the freq handler is in charge of decoding
/// frequencies and/or positions
pub fn from_block_postings(segment_block_postings: BlockSegmentPostings,
delete_bitset: DeleteBitSet,
positions_stream_opt: Option<CompressedIntStream>)
-> SegmentPostings {
let position_computer = positions_stream_opt.map(|stream| {
UnsafeCell::new(PositionComputer::new(stream))
});
pub fn from_block_postings(
segment_block_postings: BlockSegmentPostings,
delete_bitset: DeleteBitSet,
positions_stream_opt: Option<CompressedIntStream>,
) -> SegmentPostings {
let position_computer =
positions_stream_opt.map(|stream| UnsafeCell::new(PositionComputer::new(stream)));
SegmentPostings {
block_cursor: segment_block_postings,
cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
@@ -110,7 +109,7 @@ impl SegmentPostings {
}
fn position_add_skip<F: FnOnce()->usize>(&self, num_skips_fn: F) {
fn position_add_skip<F: FnOnce() -> usize>(&self, num_skips_fn: F) {
if let Some(ref position_computer) = self.position_computer.as_ref() {
let num_skips = num_skips_fn();
unsafe {
@@ -135,7 +134,7 @@ impl DocSet for SegmentPostings {
return false;
}
}
self.position_add_skip(|| { self.term_freq() as usize });
self.position_add_skip(|| self.term_freq() as usize);
if !self.delete_bitset.is_deleted(self.doc()) {
return true;
}
@@ -257,8 +256,10 @@ impl DocSet for SegmentPostings {
#[inline]
fn doc(&self) -> DocId {
let docs = self.block_cursor.docs();
debug_assert!(self.cur < docs.len(),
"Have you forgotten to call `.advance()` at least once before calling .doc().");
debug_assert!(
self.cur < docs.len(),
"Have you forgotten to call `.advance()` at least once before calling .doc()."
);
docs[self.cur]
}
}
@@ -278,16 +279,11 @@ impl Postings for SegmentPostings {
let term_freq = self.term_freq();
self.position_computer
.as_ref()
.map(|position_computer| {
unsafe {
(&mut *position_computer.get()).positions(term_freq as usize)
}
.map(|position_computer| unsafe {
(&mut *position_computer.get()).positions(term_freq as usize)
})
.unwrap_or(&EMPTY_POSITIONS[..])
}
}
@@ -311,10 +307,11 @@ pub struct BlockSegmentPostings {
}
impl BlockSegmentPostings {
pub(crate) fn from_data(doc_freq: usize,
data: SourceRead,
has_freq: bool)
-> BlockSegmentPostings {
pub(crate) fn from_data(
doc_freq: usize,
data: SourceRead,
has_freq: bool,
) -> BlockSegmentPostings {
let num_binpacked_blocks: usize = (doc_freq as usize) / COMPRESSION_BLOCK_SIZE;
let num_vint_docs = (doc_freq as usize) - COMPRESSION_BLOCK_SIZE * num_binpacked_blocks;
BlockSegmentPostings {
@@ -402,15 +399,16 @@ impl BlockSegmentPostings {
/// Returns false iff there were no remaining blocks.
pub fn advance(&mut self) -> bool {
if self.num_binpacked_blocks > 0 {
// TODO could self.doc_offset be just a local variable?
let num_consumed_bytes = self
.doc_decoder
.uncompress_block_sorted(self.remaining_data.as_ref(), self.doc_offset);
let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
self.remaining_data.as_ref(),
self.doc_offset,
);
self.remaining_data.advance(num_consumed_bytes);
if self.has_freq {
let num_consumed_bytes = self.freq_decoder.uncompress_block_unsorted(self.remaining_data.as_ref());
let num_consumed_bytes = self.freq_decoder.uncompress_block_unsorted(
self.remaining_data.as_ref(),
);
self.remaining_data.advance(num_consumed_bytes);
}
// it will be used as the next offset.
@@ -418,15 +416,17 @@ impl BlockSegmentPostings {
self.num_binpacked_blocks -= 1;
true
} else if self.num_vint_docs > 0 {
let num_compressed_bytes =
self.doc_decoder
.uncompress_vint_sorted(self.remaining_data.as_ref(),
self.doc_offset,
self.num_vint_docs);
let num_compressed_bytes = self.doc_decoder.uncompress_vint_sorted(
self.remaining_data.as_ref(),
self.doc_offset,
self.num_vint_docs,
);
self.remaining_data.advance(num_compressed_bytes);
if self.has_freq {
self.freq_decoder
.uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
self.freq_decoder.uncompress_vint_unsorted(
self.remaining_data.as_ref(),
self.num_vint_docs,
);
}
self.num_vint_docs = 0;
true
@@ -508,12 +508,13 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(int_field).unwrap();
let inverted_index = segment_reader.inverted_index(int_field);
let term = Term::from_field_u64(int_field, 0u64);
let term_info = inverted_index.get_term_info(&term).unwrap();
let mut block_segments =
inverted_index
.read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq);
let mut block_segments = inverted_index.read_block_postings_from_terminfo(
&term_info,
SegmentPostingsOption::NoFreq,
);
let mut offset: u32 = 0u32;
// checking that the block before calling advance is empty
assert!(block_segments.docs().is_empty());
@@ -549,17 +550,18 @@ mod tests {
let mut block_segments;
{
let term = Term::from_field_u64(int_field, 0u64);
let inverted_index = segment_reader.inverted_index(int_field).unwrap();
let inverted_index = segment_reader.inverted_index(int_field);
let term_info = inverted_index.get_term_info(&term).unwrap();
block_segments =
inverted_index
.read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq);
block_segments = inverted_index.read_block_postings_from_terminfo(
&term_info,
SegmentPostingsOption::NoFreq,
);
}
assert!(block_segments.advance());
assert!(block_segments.docs() == &[0, 2, 4]);
{
let term = Term::from_field_u64(int_field, 1u64);
let inverted_index = segment_reader.inverted_index(int_field).unwrap();
let inverted_index = segment_reader.inverted_index(int_field);
let term_info = inverted_index.get_term_info(&term).unwrap();
inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments);
}

View File

@@ -17,7 +17,6 @@ pub enum SegmentPostingsOption {
}
impl SegmentPostingsOption {
/// Returns true iff this option includes encoding
/// term frequencies.
pub fn has_freq(&self) -> bool {

View File

@@ -57,11 +57,12 @@ pub struct InvertedIndexSerializer {
impl InvertedIndexSerializer {
/// Creates a new `InvertedIndexSerializer` from the terms, postings and positions writers and the schema.
fn new(terms_write: CompositeWrite<WritePtr>,
postings_write: CompositeWrite<WritePtr>,
positions_write: CompositeWrite<WritePtr>,
schema: Schema)
-> Result<InvertedIndexSerializer> {
fn new(
terms_write: CompositeWrite<WritePtr>,
postings_write: CompositeWrite<WritePtr>,
positions_write: CompositeWrite<WritePtr>,
schema: Schema,
) -> Result<InvertedIndexSerializer> {
Ok(InvertedIndexSerializer {
terms_write: terms_write,
postings_write: postings_write,
@@ -78,7 +79,8 @@ impl InvertedIndexSerializer {
CompositeWrite::wrap(segment.open_write(TERMS)?),
CompositeWrite::wrap(segment.open_write(POSTINGS)?),
CompositeWrite::wrap(segment.open_write(POSITIONS)?),
segment.schema())
segment.schema(),
)
}
/// Must be called before starting pushing terms of
@@ -94,7 +96,7 @@ impl InvertedIndexSerializer {
field_entry.field_type().clone(),
term_dictionary_write,
postings_write,
positions_write
positions_write,
)
}
@@ -120,7 +122,6 @@ pub struct FieldSerializer<'a> {
impl<'a> FieldSerializer<'a> {
fn new(
field_type: FieldType,
term_dictionary_write: &'a mut CountingWriter<WritePtr>,
@@ -128,25 +129,24 @@ impl<'a> FieldSerializer<'a> {
positions_write: &'a mut CountingWriter<WritePtr>,
) -> io::Result<FieldSerializer<'a>> {
let (term_freq_enabled, position_enabled): (bool, bool) =
match field_type {
FieldType::Str(ref text_options) => {
let text_indexing_options = text_options.get_indexing_options();
(text_indexing_options.is_termfreq_enabled(), text_indexing_options.is_position_enabled())
},
_ => {
(false, false)
}
};
let term_dictionary_builder = TermDictionaryBuilderImpl::new(term_dictionary_write, field_type)?;
let postings_serializer = PostingsSerializer::new(postings_write, term_freq_enabled);
let positions_serializer_opt =
if position_enabled {
Some(PositionSerializer::new(positions_write))
let (term_freq_enabled, position_enabled): (bool, bool) = match field_type {
FieldType::Str(ref text_options) => {
let text_indexing_options = text_options.get_indexing_options();
(
text_indexing_options.is_termfreq_enabled(),
text_indexing_options.is_position_enabled(),
)
}
else {
None
};
_ => (false, false),
};
let term_dictionary_builder =
TermDictionaryBuilderImpl::new(term_dictionary_write, field_type)?;
let postings_serializer = PostingsSerializer::new(postings_write, term_freq_enabled);
let positions_serializer_opt = if position_enabled {
Some(PositionSerializer::new(positions_write))
} else {
None
};
Ok(FieldSerializer {
term_dictionary_builder: term_dictionary_builder,
@@ -159,9 +159,9 @@ impl<'a> FieldSerializer<'a> {
fn current_term_info(&self) -> TermInfo {
let (filepos, offset) = self.positions_serializer_opt
.as_ref()
.map(|positions_serializer| positions_serializer.addr())
.unwrap_or((0u32, 0u8));
.as_ref()
.map(|positions_serializer| positions_serializer.addr())
.unwrap_or((0u32, 0u8));
TermInfo {
doc_freq: 0,
postings_offset: self.postings_serializer.addr(),
@@ -194,11 +194,12 @@ impl<'a> FieldSerializer<'a> {
///
/// Term frequencies and positions may be ignored by the serializer depending
/// on the configuration of the field in the `Schema`.
pub fn write_doc(&mut self,
doc_id: DocId,
term_freq: u32,
position_deltas: &[u32])
-> io::Result<()> {
pub fn write_doc(
&mut self,
doc_id: DocId,
term_freq: u32,
position_deltas: &[u32],
) -> io::Result<()> {
self.current_term_info.doc_freq += 1;
self.postings_serializer.write_doc(doc_id, term_freq)?;
if let Some(ref mut positions_serializer) = self.positions_serializer_opt.as_mut() {
@@ -213,7 +214,9 @@ impl<'a> FieldSerializer<'a> {
/// using `VInt` encoding.
pub fn close_term(&mut self) -> io::Result<()> {
if self.term_open {
self.term_dictionary_builder.insert_value(&self.current_term_info)?;
self.term_dictionary_builder.insert_value(
&self.current_term_info,
)?;
self.postings_serializer.close_term()?;
self.term_open = false;
}
@@ -251,8 +254,8 @@ impl<W: Write> PostingsSerializer<W> {
postings_write: CountingWriter::wrap(write),
block_encoder: BlockEncoder::new(),
doc_ids: vec!(),
term_freqs: vec!(),
doc_ids: vec![],
term_freqs: vec![],
last_doc_id_encoded: 0u32,
termfreq_enabled: termfreq_enabled,
@@ -267,16 +270,17 @@ impl<W: Write> PostingsSerializer<W> {
if self.doc_ids.len() == COMPRESSION_BLOCK_SIZE {
{
// encode the doc ids
let block_encoded: &[u8] =
self.block_encoder
.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
let block_encoded: &[u8] = self.block_encoder.compress_block_sorted(
&self.doc_ids,
self.last_doc_id_encoded,
);
self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1];
self.postings_write.write_all(block_encoded)?;
}
if self.termfreq_enabled {
// encode the term_freqs
let block_encoded: &[u8] = self.block_encoder
.compress_block_unsorted(&self.term_freqs);
let block_encoded: &[u8] =
self.block_encoder.compress_block_unsorted(&self.term_freqs);
self.postings_write.write_all(block_encoded)?;
self.term_freqs.clear();
}
@@ -294,16 +298,18 @@ impl<W: Write> PostingsSerializer<W> {
// In that case, the remaining part is encoded
// using variable int encoding.
{
let block_encoded =
self.block_encoder
.compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
let block_encoded = self.block_encoder.compress_vint_sorted(
&self.doc_ids,
self.last_doc_id_encoded,
);
self.postings_write.write_all(block_encoded)?;
self.doc_ids.clear();
}
// ... Idem for term frequencies
if self.termfreq_enabled {
let block_encoded = self.block_encoder
.compress_vint_unsorted(&self.term_freqs[..]);
let block_encoded = self.block_encoder.compress_vint_unsorted(
&self.term_freqs[..],
);
self.postings_write.write_all(block_encoded)?;
self.term_freqs.clear();
}
@@ -373,4 +379,3 @@ impl<W: Write> PositionSerializer<W> {
self.write.flush()
}
}

View File

@@ -12,7 +12,7 @@ use std::io;
/// * `postings_offset` : an offset in the `.idx` file
/// addressing the start of the posting list associated
/// to this term.
#[derive(Debug,Default,Ord,PartialOrd,Eq,PartialEq,Clone)]
#[derive(Debug, Default, Ord, PartialOrd, Eq, PartialEq, Clone)]
pub struct TermInfo {
/// Number of documents in the segment containing the term
pub doc_freq: u32,

View File

@@ -37,10 +37,12 @@ impl Query for BooleanQuery {
}
fn weight(&self, searcher: &Searcher) -> Result<Box<Weight>> {
let sub_weights = try!(self.subqueries
.iter()
.map(|&(ref _occur, ref subquery)| subquery.weight(searcher))
.collect());
let sub_weights = try!(
self.subqueries
.iter()
.map(|&(ref _occur, ref subquery)| subquery.weight(searcher))
.collect()
);
let occurs: Vec<Occur> = self.subqueries
.iter()
.map(|&(ref occur, ref _subquery)| *occur)
@@ -57,10 +59,9 @@ impl BooleanQuery {
let occur_term_queries: Vec<(Occur, Box<Query>)> = terms
.into_iter()
.map(|term| {
let term_query: Box<Query> = box TermQuery::new(term,
SegmentPostingsOption::Freq);
(Occur::Should, term_query)
})
let term_query: Box<Query> = box TermQuery::new(term, SegmentPostingsOption::Freq);
(Occur::Should, term_query)
})
.collect();
BooleanQuery::from(occur_term_queries)
}

View File

@@ -55,11 +55,11 @@ impl<TScorer: Scorer> BooleanScorer<TScorer> {
.map(|posting| posting.doc())
.enumerate()
.map(|(ord, doc)| {
HeapItem {
doc: doc,
ord: ord as u32,
}
})
HeapItem {
doc: doc,
ord: ord as u32,
}
})
.collect();
BooleanScorer {
scorers: non_empty_scorers,

View File

@@ -22,11 +22,12 @@ impl BooleanWeight {
impl Weight for BooleanWeight {
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
let sub_scorers: Vec<Box<Scorer + 'a>> =
try!(self.weights
.iter()
.map(|weight| weight.scorer(reader))
.collect());
let sub_scorers: Vec<Box<Scorer + 'a>> = try!(
self.weights
.iter()
.map(|weight| weight.scorer(reader))
.collect()
);
let boolean_scorer = BooleanScorer::new(sub_scorers, self.occur_filter);
Ok(box boolean_scorer)
}

View File

@@ -64,8 +64,10 @@ mod tests {
}
let make_term_query = |text: &str| {
let term_query = TermQuery::new(Term::from_field_text(text_field, text),
SegmentPostingsOption::NoFreq);
let term_query = TermQuery::new(
Term::from_field_text(text_field, text),
SegmentPostingsOption::NoFreq,
);
let query: Box<Query> = box term_query;
query
};
@@ -87,19 +89,25 @@ mod tests {
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]);
}
{
let boolean_query = BooleanQuery::from(vec![(Occur::Should, make_term_query("a")),
(Occur::Should, make_term_query("b"))]);
let boolean_query = BooleanQuery::from(vec![
(Occur::Should, make_term_query("a")),
(Occur::Should, make_term_query("b")),
]);
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 2, 3]);
}
{
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")),
(Occur::Should, make_term_query("b"))]);
let boolean_query = BooleanQuery::from(vec![
(Occur::Must, make_term_query("a")),
(Occur::Should, make_term_query("b")),
]);
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]);
}
{
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")),
(Occur::Should, make_term_query("b")),
(Occur::MustNot, make_term_query("d"))]);
let boolean_query = BooleanQuery::from(vec![
(Occur::Must, make_term_query("a")),
(Occur::Should, make_term_query("b")),
(Occur::MustNot, make_term_query("d")),
]);
assert_eq!(matching_docs(&boolean_query), vec![0, 1]);
}
{

View File

@@ -61,9 +61,9 @@ mod tests {
.map(|text| Term::from_field_text(text_field, text))
.collect();
let phrase_query = PhraseQuery::from(terms);
searcher
.search(&phrase_query, &mut test_collector)
.expect("search should succeed");
searcher.search(&phrase_query, &mut test_collector).expect(
"search should succeed",
);
test_collector.docs()
};

View File

@@ -22,7 +22,7 @@ impl Weight for PhraseWeight {
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
let mut term_postings_list = Vec::new();
for term in &self.phrase_terms {
let inverted_index = reader.inverted_index(term.field())?;
let inverted_index = reader.inverted_index(term.field());
let term_postings_option =
inverted_index.read_postings(term, SegmentPostingsOption::FreqAndPositions);
if let Some(term_postings) = term_postings_option {
@@ -31,6 +31,8 @@ impl Weight for PhraseWeight {
return Ok(box EmptyScorer);
}
}
Ok(box PhraseScorer { intersection_docset: IntersectionDocSet::from(term_postings_list) })
Ok(box PhraseScorer {
intersection_docset: IntersectionDocSet::from(term_postings_list),
})
}
}

View File

@@ -66,7 +66,10 @@ pub trait Query: fmt::Debug {
let mut segment_search_timer = search_timer.open("segment_search");
{
let _ = segment_search_timer.open("set_segment");
try!(collector.set_segment(segment_ord as SegmentLocalId, segment_reader));
try!(collector.set_segment(
segment_ord as SegmentLocalId,
segment_reader,
));
}
let mut scorer = try!(weight.scorer(segment_reader));
{

View File

@@ -3,7 +3,8 @@ use combine::char::*;
use super::user_input_ast::*;
fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
where I: Stream<Item = char>
where
I: Stream<Item = char>,
{
let term_val = || {
let word = many1(satisfy(|c: char| c.is_alphanumeric()));
@@ -11,27 +12,29 @@ fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
phrase.or(word)
};
let negative_numbers = (char('-'), many1(satisfy(|c: char| c.is_numeric())))
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
let negative_numbers = (char('-'), many1(satisfy(|c: char| c.is_numeric()))).map(
|(s1, s2): (char, String)| format!("{}{}", s1, s2),
);
let field = (letter(), many(satisfy(|c: char| c.is_alphanumeric() || c == '_')))
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
let field = (
letter(),
many(satisfy(|c: char| c.is_alphanumeric() || c == '_')),
).map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
let term_val_with_field = negative_numbers.or(term_val());
let term_query = (field, char(':'), term_val_with_field).map(|(field_name, _, phrase)| {
UserInputLiteral {
field_name:
Some(field_name),
phrase: phrase,
}
});
UserInputLiteral {
field_name: Some(field_name),
phrase: phrase,
}
});
let term_default_field = term_val().map(|phrase| {
UserInputLiteral {
field_name: None,
phrase: phrase,
}
});
UserInputLiteral {
field_name: None,
phrase: phrase,
}
});
try(term_query)
.or(term_default_field)
.map(UserInputAST::from)
@@ -40,25 +43,29 @@ fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
fn leaf<I>(input: I) -> ParseResult<UserInputAST, I>
where I: Stream<Item = char>
where
I: Stream<Item = char>,
{
(char('-'), parser(literal))
.map(|(_, expr)| UserInputAST::Not(box expr))
.or((char('+'), parser(literal)).map(|(_, expr)| UserInputAST::Must(box expr)))
.or((char('+'), parser(literal)).map(|(_, expr)| {
UserInputAST::Must(box expr)
}))
.or(parser(literal))
.parse_stream(input)
}
pub fn parse_to_ast<I>(input: I) -> ParseResult<UserInputAST, I>
where I: Stream<Item = char>
where
I: Stream<Item = char>,
{
sep_by(parser(leaf), spaces())
.map(|subqueries: Vec<UserInputAST>| if subqueries.len() == 1 {
subqueries.into_iter().next().unwrap()
} else {
UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect())
})
subqueries.into_iter().next().unwrap()
} else {
UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect())
})
.parse_stream(input)
}

View File

@@ -117,20 +117,22 @@ impl QueryParser {
/// Parse the user query into an AST.
fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAST, QueryParserError> {
let (user_input_ast, _remaining) = parse_to_ast(query)
.map_err(|_| QueryParserError::SyntaxError)?;
let (user_input_ast, _remaining) = parse_to_ast(query).map_err(
|_| QueryParserError::SyntaxError,
)?;
self.compute_logical_ast(user_input_ast)
}
fn resolve_field_name(&self, field_name: &str) -> Result<Field, QueryParserError> {
self.schema
.get_field(field_name)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(String::from(field_name)))
self.schema.get_field(field_name).ok_or_else(|| {
QueryParserError::FieldDoesNotExist(String::from(field_name))
})
}
fn compute_logical_ast(&self,
user_input_ast: UserInputAST)
-> Result<LogicalAST, QueryParserError> {
fn compute_logical_ast(
&self,
user_input_ast: UserInputAST,
) -> Result<LogicalAST, QueryParserError> {
let (occur, ast) = self.compute_logical_ast_with_occur(user_input_ast)?;
if occur == Occur::MustNot {
return Err(QueryParserError::AllButQueryForbidden);
@@ -138,10 +140,11 @@ impl QueryParser {
Ok(ast)
}
fn compute_logical_ast_for_leaf(&self,
field: Field,
phrase: &str)
-> Result<Option<LogicalLiteral>, QueryParserError> {
fn compute_logical_ast_for_leaf(
&self,
field: Field,
phrase: &str,
) -> Result<Option<LogicalLiteral>, QueryParserError> {
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
@@ -174,7 +177,9 @@ impl QueryParser {
if terms.is_empty() {
Ok(None)
} else if terms.len() == 1 {
Ok(Some(LogicalLiteral::Term(terms.into_iter().next().unwrap())))
Ok(Some(
LogicalLiteral::Term(terms.into_iter().next().unwrap()),
))
} else {
Ok(Some(LogicalLiteral::Phrase(terms)))
}
@@ -191,18 +196,24 @@ impl QueryParser {
}
}
fn compute_logical_ast_with_occur(&self,
user_input_ast: UserInputAST)
-> Result<(Occur, LogicalAST), QueryParserError> {
fn compute_logical_ast_with_occur(
&self,
user_input_ast: UserInputAST,
) -> Result<(Occur, LogicalAST), QueryParserError> {
match user_input_ast {
UserInputAST::Clause(sub_queries) => {
let default_occur = self.default_occur();
let logical_sub_queries: Vec<(Occur, LogicalAST)> = try!(sub_queries.into_iter()
.map(|sub_query| self.compute_logical_ast_with_occur(*sub_query))
.map(|res| {
res.map(|(occur, sub_ast)| (compose_occur(default_occur, occur), sub_ast))
})
.collect());
let logical_sub_queries: Vec<(Occur, LogicalAST)> = try!(
sub_queries
.into_iter()
.map(|sub_query| self.compute_logical_ast_with_occur(*sub_query))
.map(|res| {
res.map(|(occur, sub_ast)| {
(compose_occur(default_occur, occur), sub_ast)
})
})
.collect()
);
Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries)))
}
UserInputAST::Not(subquery) => {
@@ -320,9 +331,10 @@ mod test {
}
fn parse_query_to_logical_ast(query: &str,
default_conjunction: bool)
-> Result<LogicalAST, QueryParserError> {
fn parse_query_to_logical_ast(
query: &str,
default_conjunction: bool,
) -> Result<LogicalAST, QueryParserError> {
let mut query_parser = make_query_parser();
if default_conjunction {
query_parser.set_conjunction_by_default();
@@ -330,9 +342,11 @@ mod test {
query_parser.parse_query_to_logical_ast(query)
}
fn test_parse_query_to_logical_ast_helper(query: &str,
expected: &str,
default_conjunction: bool) {
fn test_parse_query_to_logical_ast_helper(
query: &str,
expected: &str,
default_conjunction: bool,
) {
let query = parse_query_to_logical_ast(query, default_conjunction).unwrap();
let query_str = format!("{:?}", query);
assert_eq!(query_str, expected);
@@ -358,21 +372,29 @@ mod test {
}
};
assert_eq!(is_not_indexed_err("notindexed_text:titi"),
Some(String::from("notindexed_text")));
assert_eq!(is_not_indexed_err("notindexed_u64:23424"),
Some(String::from("notindexed_u64")));
assert_eq!(is_not_indexed_err("notindexed_i64:-234324"),
Some(String::from("notindexed_i64")));
assert_eq!(
is_not_indexed_err("notindexed_text:titi"),
Some(String::from("notindexed_text"))
);
assert_eq!(
is_not_indexed_err("notindexed_u64:23424"),
Some(String::from("notindexed_u64"))
);
assert_eq!(
is_not_indexed_err("notindexed_i64:-234324"),
Some(String::from("notindexed_i64"))
);
}
#[test]
pub fn test_parse_query_untokenized() {
test_parse_query_to_logical_ast_helper("nottokenized:\"wordone wordtwo\"",
"Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, \
test_parse_query_to_logical_ast_helper(
"nottokenized:\"wordone wordtwo\"",
"Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, \
101, 32, 119, 111, 114, 100, 116, 119, 111])",
false);
false,
);
}
#[test]
@@ -381,82 +403,115 @@ mod test {
assert!(query_parser.parse_query("signed:2324").is_ok());
assert!(query_parser.parse_query("signed:\"22\"").is_ok());
assert!(query_parser.parse_query("signed:\"-2234\"").is_ok());
assert!(query_parser
.parse_query("signed:\"-9999999999999\"")
.is_ok());
assert!(
query_parser
.parse_query("signed:\"-9999999999999\"")
.is_ok()
);
assert!(query_parser.parse_query("signed:\"a\"").is_err());
assert!(query_parser.parse_query("signed:\"2a\"").is_err());
assert!(query_parser
.parse_query("signed:\"18446744073709551615\"")
.is_err());
assert!(
query_parser
.parse_query("signed:\"18446744073709551615\"")
.is_err()
);
assert!(query_parser.parse_query("unsigned:\"2\"").is_ok());
assert!(query_parser.parse_query("unsigned:\"-2\"").is_err());
assert!(query_parser
.parse_query("unsigned:\"18446744073709551615\"")
.is_ok());
test_parse_query_to_logical_ast_helper("unsigned:2324",
"Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",
false);
assert!(
query_parser
.parse_query("unsigned:\"18446744073709551615\"")
.is_ok()
);
test_parse_query_to_logical_ast_helper(
"unsigned:2324",
"Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",
false,
);
test_parse_query_to_logical_ast_helper("signed:-2324",
&format!("{:?}",
Term::from_field_i64(Field(2u32), -2324)),
false);
test_parse_query_to_logical_ast_helper(
"signed:-2324",
&format!("{:?}", Term::from_field_i64(Field(2u32), -2324)),
false,
);
}
#[test]
pub fn test_parse_query_to_ast_disjunction() {
test_parse_query_to_logical_ast_helper("title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
false);
test_parse_query_to_logical_ast_helper("+title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
false);
test_parse_query_to_logical_ast_helper("+title:toto -titi",
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
test_parse_query_to_logical_ast_helper(
"title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
false,
);
test_parse_query_to_logical_ast_helper(
"+title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
false,
);
test_parse_query_to_logical_ast_helper(
"+title:toto -titi",
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
-(Term([0, 0, 0, 0, 116, 105, 116, 105]) \
Term([0, 0, 0, 1, 116, 105, 116, 105])))",
false);
assert_eq!(parse_query_to_logical_ast("-title:toto", false)
.err()
.unwrap(),
QueryParserError::AllButQueryForbidden);
test_parse_query_to_logical_ast_helper("title:a b",
"(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) \
false,
);
assert_eq!(
parse_query_to_logical_ast("-title:toto", false)
.err()
.unwrap(),
QueryParserError::AllButQueryForbidden
);
test_parse_query_to_logical_ast_helper(
"title:a b",
"(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) \
Term([0, 0, 0, 1, 98])))",
false);
test_parse_query_to_logical_ast_helper("title:\"a b\"",
"\"[Term([0, 0, 0, 0, 97]), \
false,
);
test_parse_query_to_logical_ast_helper(
"title:\"a b\"",
"\"[Term([0, 0, 0, 0, 97]), \
Term([0, 0, 0, 0, 98])]\"",
false);
false,
);
}
#[test]
pub fn test_parse_query_to_ast_conjunction() {
test_parse_query_to_logical_ast_helper("title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
true);
test_parse_query_to_logical_ast_helper("+title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
true);
test_parse_query_to_logical_ast_helper("+title:toto -titi",
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
test_parse_query_to_logical_ast_helper(
"title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
true,
);
test_parse_query_to_logical_ast_helper(
"+title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
true,
);
test_parse_query_to_logical_ast_helper(
"+title:toto -titi",
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
-(Term([0, 0, 0, 0, 116, 105, 116, 105]) \
Term([0, 0, 0, 1, 116, 105, 116, 105])))",
true);
assert_eq!(parse_query_to_logical_ast("-title:toto", true)
.err()
.unwrap(),
QueryParserError::AllButQueryForbidden);
test_parse_query_to_logical_ast_helper("title:a b",
"(+Term([0, 0, 0, 0, 97]) \
true,
);
assert_eq!(
parse_query_to_logical_ast("-title:toto", true)
.err()
.unwrap(),
QueryParserError::AllButQueryForbidden
);
test_parse_query_to_logical_ast_helper(
"title:a b",
"(+Term([0, 0, 0, 0, 97]) \
+(Term([0, 0, 0, 0, 98]) \
Term([0, 0, 0, 1, 98])))",
true);
test_parse_query_to_logical_ast_helper("title:\"a b\"",
"\"[Term([0, 0, 0, 0, 97]), \
true,
);
test_parse_query_to_logical_ast_helper(
"title:\"a b\"",
"\"[Term([0, 0, 0, 0, 97]), \
Term([0, 0, 0, 0, 98])]\"",
true);
true,
);
}
}

View File

@@ -44,8 +44,10 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let term_query = TermQuery::new(Term::from_field_text(text_field, "a"),
SegmentPostingsOption::NoFreq);
let term_query = TermQuery::new(
Term::from_field_text(text_field, "a"),
SegmentPostingsOption::NoFreq,
);
let term_weight = term_query.weight(&searcher).unwrap();
let segment_reader = searcher.segment_reader(0);
let mut term_scorer = term_weight.scorer(segment_reader).unwrap();

View File

@@ -7,7 +7,8 @@ use postings::Postings;
use fastfield::FastFieldReader;
pub struct TermScorer<TPostings>
where TPostings: Postings
where
TPostings: Postings,
{
pub idf: Score,
pub fieldnorm_reader_opt: Option<U64FastFieldReader>,
@@ -15,7 +16,8 @@ pub struct TermScorer<TPostings>
}
impl<TPostings> TermScorer<TPostings>
where TPostings: Postings
where
TPostings: Postings,
{
pub fn postings(&self) -> &TPostings {
&self.postings
@@ -23,7 +25,8 @@ impl<TPostings> TermScorer<TPostings>
}
impl<TPostings> DocSet for TermScorer<TPostings>
where TPostings: Postings
where
TPostings: Postings,
{
fn advance(&mut self) -> bool {
self.postings.advance()
@@ -40,7 +43,8 @@ impl<TPostings> DocSet for TermScorer<TPostings>
}
impl<TPostings> Scorer for TermScorer<TPostings>
where TPostings: Postings
where
TPostings: Postings,
{
fn score(&self) -> Score {
let doc = self.postings.doc();

View File

@@ -28,21 +28,22 @@ impl TermWeight {
}
/// If the field is not found, returns an empty `DocSet`.
pub fn specialized_scorer(&self,
reader: &SegmentReader)
-> Result<TermScorer<SegmentPostings>> {
pub fn specialized_scorer(
&self,
reader: &SegmentReader,
) -> Result<TermScorer<SegmentPostings>> {
let field = self.term.field();
let inverted_index = reader.inverted_index(field)?;
let inverted_index = reader.inverted_index(field);
let fieldnorm_reader_opt = reader.get_fieldnorms_reader(field);
let postings_opt: Option<SegmentPostings> = inverted_index.read_postings(&self.term, self.segment_postings_options);
let postings_opt: Option<SegmentPostings> =
inverted_index.read_postings(&self.term, self.segment_postings_options);
if let Some(segment_postings) = postings_opt {
Ok(TermScorer {
idf: self.idf(),
fieldnorm_reader_opt: fieldnorm_reader_opt,
postings: segment_postings,
})
}
else {
} else {
Ok(TermScorer {
idf: 1f32,
fieldnorm_reader_opt: None,

View File

@@ -10,7 +10,7 @@ use common::BinarySerializable;
///
/// Because the field id is a `u8`, tantivy can only have at most `255` fields.
/// Value 255 is reserved.
#[derive(Copy, Clone, Debug, PartialEq,PartialOrd,Eq,Ord,Hash, Serialize, Deserialize)]
#[derive(Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash, Serialize, Deserialize)]
pub struct Field(pub u32);
impl BinarySerializable for Field {

View File

@@ -89,7 +89,8 @@ impl FieldEntry {
impl Serialize for FieldEntry {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer
where
S: Serializer,
{
let mut s = serializer.serialize_struct("field_entry", 3)?;
s.serialize_field("name", &self.name)?;
@@ -115,7 +116,8 @@ impl Serialize for FieldEntry {
impl<'de> Deserialize<'de> for FieldEntry {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de>
where
D: Deserializer<'de>,
{
#[derive(Deserialize)]
#[serde(field_identifier, rename_all = "lowercase")]
@@ -137,7 +139,8 @@ impl<'de> Deserialize<'de> for FieldEntry {
}
fn visit_map<V>(self, mut map: V) -> Result<FieldEntry, V::Error>
where V: MapAccess<'de>
where
V: MapAccess<'de>,
{
let mut name = None;
let mut ty = None;
@@ -187,13 +190,14 @@ impl<'de> Deserialize<'de> for FieldEntry {
let name = name.ok_or_else(|| de::Error::missing_field("name"))?;
ty.ok_or_else(|| de::Error::missing_field("ty"))?;
let field_type = field_type
.ok_or_else(|| de::Error::missing_field("options"))?;
let field_type = field_type.ok_or_else(
|| de::Error::missing_field("options"),
)?;
Ok(FieldEntry {
name: name,
field_type: field_type,
})
name: name,
field_type: field_type,
})
}
}

View File

@@ -80,8 +80,9 @@ impl FieldType {
FieldType::Str(_) => Ok(Value::Str(field_text.clone())),
FieldType::U64(_) |
FieldType::I64(_) => {
Err(ValueParsingError::TypeError(format!("Expected an integer, got {:?}",
json)))
Err(ValueParsingError::TypeError(
format!("Expected an integer, got {:?}", json),
))
}
}
}
@@ -110,9 +111,11 @@ impl FieldType {
}
}
_ => {
let msg = format!("Json value not supported error {:?}. Expected {:?}",
json,
self);
let msg = format!(
"Json value not supported error {:?}. Expected {:?}",
json,
self
);
Err(ValueParsingError::TypeError(msg))
}
}

View File

@@ -105,9 +105,9 @@ impl SchemaBuilder {
/// This will consume your `SchemaBuilder`
pub fn build(self) -> Schema {
Schema(Arc::new(InnerSchema {
fields: self.fields,
fields_map: self.fields_map,
}))
fields: self.fields,
fields_map: self.fields_map,
}))
}
}
@@ -206,15 +206,14 @@ impl Schema {
/// Build a document object from a json-object.
pub fn parse_document(&self, doc_json: &str) -> Result<Document, DocParsingError> {
let json_obj: JsonObject<String, JsonValue> = serde_json::from_str(doc_json)
.map_err(|_| {
let doc_json_sample: String = if doc_json.len() < 20 {
String::from(doc_json)
} else {
format!("{:?}...", &doc_json[0..20])
};
DocParsingError::NotJSON(doc_json_sample)
})?;
let json_obj: JsonObject<String, JsonValue> = serde_json::from_str(doc_json).map_err(|_| {
let doc_json_sample: String = if doc_json.len() < 20 {
String::from(doc_json)
} else {
format!("{:?}...", &doc_json[0..20])
};
DocParsingError::NotJSON(doc_json_sample)
})?;
let mut doc = Document::default();
for (field_name, json_value) in json_obj.iter() {
@@ -225,18 +224,15 @@ impl Schema {
match *json_value {
JsonValue::Array(ref json_items) => {
for json_item in json_items {
let value = try!(field_type
.value_from_json(json_item)
.map_err(|e| {
DocParsingError::ValueError(field_name.clone(), e)
}));
let value =
try!(field_type.value_from_json(json_item).map_err(|e| {
DocParsingError::ValueError(field_name.clone(), e)
}));
doc.add(FieldValue::new(field, value));
}
}
_ => {
let value = try!(field_type
.value_from_json(json_value)
.map_err(|e| {
let value = try!(field_type.value_from_json(json_value).map_err(|e| {
DocParsingError::ValueError(field_name.clone(), e)
}));
doc.add(FieldValue::new(field, value));
@@ -259,7 +255,8 @@ impl fmt::Debug for Schema {
impl Serialize for Schema {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer
where
S: Serializer,
{
let mut seq = serializer.serialize_seq(Some(self.0.fields.len()))?;
for e in &self.0.fields {
@@ -271,7 +268,8 @@ impl Serialize for Schema {
impl<'de> Deserialize<'de> for Schema {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de>
where
D: Deserializer<'de>,
{
struct SchemaVisitor;
@@ -283,7 +281,8 @@ impl<'de> Deserialize<'de> for Schema {
}
fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
where A: SeqAccess<'de>
where
A: SeqAccess<'de>,
{
let mut schema = SchemaBuilder {
fields: Vec::with_capacity(seq.size_hint().unwrap_or(0)),
@@ -430,12 +429,14 @@ mod tests {
}
{
let doc = schema
.parse_document(r#"{
.parse_document(
r#"{
"title": "my title",
"author": "fulmicoton",
"count": 4,
"popularity": 10
}"#)
}"#,
)
.unwrap();
assert_eq!(doc.get_first(title_field).unwrap().text(), "my title");
assert_eq!(doc.get_first(author_field).unwrap().text(), "fulmicoton");
@@ -443,13 +444,15 @@ mod tests {
assert_eq!(doc.get_first(popularity_field).unwrap().i64_value(), 10);
}
{
let json_err = schema.parse_document(r#"{
let json_err = schema.parse_document(
r#"{
"title": "my title",
"author": "fulmicoton",
"count": 4,
"popularity": 10,
"jambon": "bayonne"
}"#);
}"#,
);
match json_err {
Err(DocParsingError::NoSuchFieldInSchema(field_name)) => {
assert_eq!(field_name, "jambon");
@@ -460,13 +463,15 @@ mod tests {
}
}
{
let json_err = schema.parse_document(r#"{
let json_err = schema.parse_document(
r#"{
"title": "my title",
"author": "fulmicoton",
"count": "5",
"popularity": "10",
"jambon": "bayonne"
}"#);
}"#,
);
match json_err {
Err(DocParsingError::ValueError(_, ValueParsingError::TypeError(_))) => {
assert!(true);
@@ -477,12 +482,14 @@ mod tests {
}
}
{
let json_err = schema.parse_document(r#"{
let json_err = schema.parse_document(
r#"{
"title": "my title",
"author": "fulmicoton",
"count": -5,
"popularity": 10
}"#);
}"#,
);
match json_err {
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
assert!(true);
@@ -493,12 +500,14 @@ mod tests {
}
}
{
let json_err = schema.parse_document(r#"{
let json_err = schema.parse_document(
r#"{
"title": "my title",
"author": "fulmicoton",
"count": 9223372036854775808,
"popularity": 10
}"#);
}"#,
);
match json_err {
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
panic!("expected 9223372036854775808 to fit into u64, but it didn't");
@@ -509,12 +518,14 @@ mod tests {
}
}
{
let json_err = schema.parse_document(r#"{
let json_err = schema.parse_document(
r#"{
"title": "my title",
"author": "fulmicoton",
"count": 50,
"popularity": 9223372036854775808
}"#);
}"#,
);
match json_err {
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
assert!(true);
@@ -525,11 +536,13 @@ mod tests {
}
}
{
let json_err = schema.parse_document(r#"{
let json_err = schema.parse_document(
r#"{
"title": "my title",
"author": "fulmicoton",
"count": 50,
}"#);
}"#,
);
match json_err {
Err(NotJSON(_)) => {
assert!(true);

View File

@@ -13,7 +13,9 @@ const INT_TERM_LEN: usize = 4 + 8;
///
/// It actually wraps a `Vec<u8>`.
#[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
pub struct Term<B = Vec<u8>>(B) where B: AsRef<[u8]>;
pub struct Term<B = Vec<u8>>(B)
where
B: AsRef<[u8]>;
impl Term {
/// Builds a term given a field, and a u64-value
@@ -109,7 +111,8 @@ impl Term {
}
impl<B> Term<B>
where B: AsRef<[u8]>
where
B: AsRef<[u8]>,
{
/// Wraps a source of data
pub fn wrap(data: B) -> Term<B> {
@@ -166,7 +169,8 @@ impl<B> Term<B>
}
impl<B> AsRef<[u8]> for Term<B>
where B: AsRef<[u8]>
where
B: AsRef<[u8]>,
{
fn as_ref(&self) -> &[u8] {
self.0.as_ref()

View File

@@ -2,7 +2,7 @@ use std::ops::BitOr;
/// Define how a text field should be handled by tantivy.
#[derive(Clone,Debug,PartialEq,Eq, Serialize, Deserialize)]
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct TextOptions {
indexing: TextIndexingOptions,
stored: bool,
@@ -45,10 +45,10 @@ impl Default for TextOptions {
/// Describe how a field should be indexed
#[derive(Clone,Copy,Debug,PartialEq,PartialOrd,Eq,Hash, Serialize, Deserialize)]
#[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Hash, Serialize, Deserialize)]
pub enum TextIndexingOptions {
/// Unindexed fields will not generate any postings. They will not be searchable either.
#[serde(rename="unindexed")]
#[serde(rename = "unindexed")]
Unindexed,
/// Untokenized means that the field text will not be split into tokens before being indexed.
/// A field with the value "Hello world" will have the document subscribe to one single
@@ -56,23 +56,23 @@ pub enum TextIndexingOptions {
///
/// It will **not** be searchable if the user enters "hello" for instance.
/// This can be useful for tags, or ids for instance.
#[serde(rename="untokenized")]
#[serde(rename = "untokenized")]
Untokenized,
/// TokenizedNoFreq will tokenize the field value, and append the document doc id
/// to the posting lists associated to all of the tokens.
/// The frequency of appearance of the term in the document, however, will be lost.
/// The term frequency used in the TfIdf formula will always be 1.
#[serde(rename="tokenize")]
#[serde(rename = "tokenize")]
TokenizedNoFreq,
/// TokenizedWithFreq will tokenize the field value, and encode both the docid
/// and the term frequency in the posting lists associated to all of the tokens.
#[serde(rename="freq")]
#[serde(rename = "freq")]
TokenizedWithFreq,
/// Like TokenizedWithFreq, but also encodes the positions of the
/// terms in a separate file. This option is required for phrase queries.
/// Don't use this if you are certain you won't need it, the term positions file
/// can be very big.
#[serde(rename="position")]
#[serde(rename = "position")]
TokenizedWithFreqAndPosition,
}

View File

@@ -16,7 +16,8 @@ pub enum Value {
impl Serialize for Value {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer
where
S: Serializer,
{
match *self {
Value::Str(ref v) => serializer.serialize_str(v),
@@ -28,7 +29,8 @@ impl Serialize for Value {
impl<'de> Deserialize<'de> for Value {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de>
where
D: Deserializer<'de>,
{
struct ValueVisitor;
@@ -162,9 +164,13 @@ mod binary_serialize {
Ok(Value::I64(value))
}
_ => {
Err(io::Error::new(io::ErrorKind::InvalidData,
format!("No field type is associated with code {:?}",
type_code)))
Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"No field type is associated with code {:?}",
type_code
),
))
}
}
}

View File

@@ -54,17 +54,19 @@ mod tests {
fn write_lorem_ipsum_store(writer: WritePtr, num_docs: usize) -> Schema {
let mut schema_builder = SchemaBuilder::default();
let field_body = schema_builder.add_text_field("body", TextOptions::default().set_stored());
let field_title = schema_builder
.add_text_field("title", TextOptions::default().set_stored());
let field_title =
schema_builder.add_text_field("title", TextOptions::default().set_stored());
let schema = schema_builder.build();
let lorem = String::from("Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
let lorem = String::from(
"Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
do eiusmod tempor incididunt ut labore et dolore magna aliqua. \
Ut enim ad minim veniam, quis nostrud exercitation ullamco \
laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure \
dolor in reprehenderit in voluptate velit esse cillum dolore eu \
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non \
proident, sunt in culpa qui officia deserunt mollit anim id est \
laborum.");
laborum.",
);
{
let mut store_writer = StoreWriter::new(writer);
for i in 0..num_docs {
@@ -96,8 +98,10 @@ mod tests {
let store_source = directory.open_read(path).unwrap();
let store = StoreReader::from_source(store_source);
for i in 0..1_000 {
assert_eq!(*store.get(i).unwrap().get_first(field_title).unwrap().text(),
format!("Doc {}", i));
assert_eq!(
*store.get(i).unwrap().get_first(field_title).unwrap().text(),
format!("Doc {}", i)
);
}
}
@@ -106,9 +110,9 @@ mod tests {
let mut directory = MmapDirectory::create_from_tempdir().unwrap();
let path = Path::new("store");
b.iter(|| {
write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000);
directory.delete(path).unwrap();
});
write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000);
directory.delete(path).unwrap();
});
}

View File

@@ -49,7 +49,7 @@ impl StoreReader {
let mut cursor = &total_buffer[block_offset..];
let block_length = u32::deserialize(&mut cursor).unwrap();
let block_array: &[u8] = &total_buffer[(block_offset + 4 as usize)..
(block_offset + 4 + block_length as usize)];
(block_offset + 4 + block_length as usize)];
let mut lz4_decoder = try!(lz4::Decoder::new(block_array));
*self.current_block_offset.borrow_mut() = usize::max_value();
try!(lz4_decoder.read_to_end(&mut current_block_mut).map(|_| ()));
@@ -94,5 +94,9 @@ fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId)
let offset = u64::deserialize(&mut serialized_offset_buf).unwrap();
let offset = offset as usize;
let max_doc = u32::deserialize(&mut serialized_offset_buf).unwrap();
(data.slice(0, offset), data.slice(offset, footer_offset), max_doc)
(
data.slice(0, offset),
data.slice(offset, footer_offset),
max_doc,
)
}

View File

@@ -49,12 +49,15 @@ impl StoreWriter {
///
pub fn store<'a>(&mut self, field_values: &[&'a FieldValue]) -> io::Result<()> {
self.intermediary_buffer.clear();
try!((field_values.len() as u32).serialize(&mut self.intermediary_buffer));
try!((field_values.len() as u32).serialize(
&mut self.intermediary_buffer,
));
for field_value in field_values {
try!((*field_value).serialize(&mut self.intermediary_buffer));
}
(self.intermediary_buffer.len() as u32)
.serialize(&mut self.current_block)?;
(self.intermediary_buffer.len() as u32).serialize(
&mut self.current_block,
)?;
self.current_block.write_all(&self.intermediary_buffer[..])?;
self.doc += 1;
if self.current_block.len() > BLOCK_SIZE {
@@ -66,16 +69,22 @@ impl StoreWriter {
fn write_and_compress_block(&mut self) -> io::Result<()> {
self.intermediary_buffer.clear();
{
let mut encoder = try!(lz4::EncoderBuilder::new().build(&mut self.intermediary_buffer));
let mut encoder = try!(lz4::EncoderBuilder::new().build(
&mut self.intermediary_buffer,
));
try!(encoder.write_all(&self.current_block));
let (_, encoder_result) = encoder.finish();
try!(encoder_result);
}
(self.intermediary_buffer.len() as u32)
.serialize(&mut self.writer)?;
(self.intermediary_buffer.len() as u32).serialize(
&mut self.writer,
)?;
self.writer.write_all(&self.intermediary_buffer)?;
self.offset_index_writer
.insert(self.doc, &(self.writer.written_bytes() as u64))?;
self.offset_index_writer.insert(
self.doc,
&(self.writer.written_bytes() as
u64),
)?;
self.current_block.clear();
Ok(())
}
@@ -90,8 +99,7 @@ impl StoreWriter {
try!(self.write_and_compress_block());
}
let header_offset: u64 = self.writer.written_bytes() as u64;
try!(self.offset_index_writer
.write(&mut self.writer));
try!(self.offset_index_writer.write(&mut self.writer));
try!(header_offset.serialize(&mut self.writer));
try!(self.doc.serialize(&mut self.writer));
self.writer.flush()

View File

@@ -5,17 +5,13 @@ use super::TermDictionaryImpl;
use termdict::{TermStreamerBuilder, TermStreamer};
/// See [`TermStreamerBuilder`](./trait.TermStreamerBuilder.html)
pub struct TermStreamerBuilderImpl<'a>
{
pub struct TermStreamerBuilderImpl<'a> {
fst_map: &'a TermDictionaryImpl,
stream_builder: StreamBuilder<'a>,
}
impl<'a> TermStreamerBuilderImpl<'a>
{
pub(crate) fn new(fst_map: &'a TermDictionaryImpl,
stream_builder: StreamBuilder<'a>)
-> Self {
impl<'a> TermStreamerBuilderImpl<'a> {
pub(crate) fn new(fst_map: &'a TermDictionaryImpl, stream_builder: StreamBuilder<'a>) -> Self {
TermStreamerBuilderImpl {
fst_map: fst_map,
stream_builder: stream_builder,
@@ -23,8 +19,7 @@ impl<'a> TermStreamerBuilderImpl<'a>
}
}
impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a>
{
impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> {
type Streamer = TermStreamerImpl<'a>;
fn ge<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
@@ -60,8 +55,7 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a>
/// See [`TermStreamer`](./trait.TermStreamer.html)
pub struct TermStreamerImpl<'a>
{
pub struct TermStreamerImpl<'a> {
fst_map: &'a TermDictionaryImpl,
stream: Stream<'a>,
offset: u64,
@@ -69,17 +63,15 @@ pub struct TermStreamerImpl<'a>
current_value: TermInfo,
}
impl<'a> TermStreamer for TermStreamerImpl<'a>
{
impl<'a> TermStreamer for TermStreamerImpl<'a> {
fn advance(&mut self) -> bool {
if let Some((term, offset)) = self.stream.next() {
self.current_key.clear();
self.current_key.extend_from_slice(term);
self.offset = offset;
self.current_value =
self.fst_map
.read_value(self.offset)
.expect("Fst data is corrupted. Failed to deserialize a value.");
self.current_value = self.fst_map.read_value(self.offset).expect(
"Fst data is corrupted. Failed to deserialize a value.",
);
true
} else {
false

View File

@@ -13,14 +13,14 @@ fn convert_fst_error(e: fst::Error) -> io::Error {
}
/// See [`TermDictionaryBuilder`](./trait.TermDictionaryBuilder.html)
pub struct TermDictionaryBuilderImpl<W>
{
pub struct TermDictionaryBuilderImpl<W> {
fst_builder: fst::MapBuilder<W>,
data: Vec<u8>,
}
impl<W> TermDictionaryBuilderImpl<W>
where W: Write
where
W: Write,
{
/// # Warning
/// Horribly dangerous internal API
@@ -46,14 +46,15 @@ impl<W> TermDictionaryBuilderImpl<W>
}
impl<W> TermDictionaryBuilder<W> for TermDictionaryBuilderImpl<W>
where W: Write
where
W: Write,
{
fn new(w: W, _field_type: FieldType) -> io::Result<Self> {
let fst_builder = fst::MapBuilder::new(w).map_err(convert_fst_error)?;
Ok(TermDictionaryBuilderImpl {
fst_builder: fst_builder,
data: Vec::new(),
})
fst_builder: fst_builder,
data: Vec::new(),
})
}
fn insert<K: AsRef<[u8]>>(&mut self, key_ref: K, value: &TermInfo) -> io::Result<()> {
@@ -75,28 +76,25 @@ impl<W> TermDictionaryBuilder<W> for TermDictionaryBuilderImpl<W>
}
}
fn open_fst_index(source: ReadOnlySource) -> io::Result<fst::Map> {
fn open_fst_index(source: ReadOnlySource) -> fst::Map {
let fst = match source {
ReadOnlySource::Anonymous(data) => {
Fst::from_shared_bytes(data.data, data.start, data.len)
.map_err(convert_fst_error)?
Fst::from_shared_bytes(data.data, data.start, data.len).expect("FST data is corrupted")
}
ReadOnlySource::Mmap(mmap_readonly) => {
Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)?
Fst::from_mmap(mmap_readonly).expect("FST data is corrupted")
}
};
Ok(fst::Map::from(fst))
fst::Map::from(fst)
}
/// See [`TermDictionary`](./trait.TermDictionary.html)
pub struct TermDictionaryImpl
{
pub struct TermDictionaryImpl {
fst_index: fst::Map,
values_mmap: ReadOnlySource,
}
impl TermDictionaryImpl
{
impl TermDictionaryImpl {
/// Deserializes and returns the value at address `offset`
pub(crate) fn read_value(&self, offset: u64) -> io::Result<TermInfo> {
let buffer = self.values_mmap.as_slice();
@@ -106,34 +104,34 @@ impl TermDictionaryImpl
}
impl<'a> TermDictionary<'a> for TermDictionaryImpl
{
impl<'a> TermDictionary<'a> for TermDictionaryImpl {
type Streamer = TermStreamerImpl<'a>;
type StreamBuilder = TermStreamerBuilderImpl<'a>;
fn from_source(source: ReadOnlySource) -> io::Result<Self> {
fn from_source(source: ReadOnlySource) -> Self {
let total_len = source.len();
let length_offset = total_len - 4;
let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..];
let footer_size = u32::deserialize(&mut split_len_buffer)? as usize;
let footer_size = u32::deserialize(&mut split_len_buffer).expect(
"Deserializing 4 bytes should always work",
) as usize;
let split_len = length_offset - footer_size;
let fst_source = source.slice(0, split_len);
let values_source = source.slice(split_len, length_offset);
let fst_index = open_fst_index(fst_source)?;
Ok(TermDictionaryImpl {
fst_index: fst_index,
values_mmap: values_source,
})
let fst_index = open_fst_index(fst_source);
TermDictionaryImpl {
fst_index: fst_index,
values_mmap: values_source,
}
}
fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<TermInfo> {
self.fst_index
.get(key)
.map(|offset| {
self.read_value(offset)
.expect("The fst is corrupted. Failed to deserialize a value.")
})
self.fst_index.get(key).map(|offset| {
self.read_value(offset).expect(
"The fst is corrupted. Failed to deserialize a value.",
)
})
}
fn range(&self) -> TermStreamerBuilderImpl {

View File

@@ -4,30 +4,26 @@ use std::cmp::Ordering;
use termdict::TermStreamer;
use schema::Term;
pub struct HeapItem<'a>
{
pub struct HeapItem<'a> {
pub streamer: TermStreamerImpl<'a>,
pub segment_ord: usize,
}
impl<'a> PartialEq for HeapItem<'a>
{
impl<'a> PartialEq for HeapItem<'a> {
fn eq(&self, other: &Self) -> bool {
self.segment_ord == other.segment_ord
}
}
impl<'a> Eq for HeapItem<'a> {}
impl<'a> Eq for HeapItem<'a> {}
impl<'a> PartialOrd for HeapItem<'a>
{
impl<'a> PartialOrd for HeapItem<'a> {
fn partial_cmp(&self, other: &HeapItem<'a>) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl<'a> Ord for HeapItem<'a>
{
impl<'a> Ord for HeapItem<'a> {
fn cmp(&self, other: &HeapItem<'a>) -> Ordering {
(&other.streamer.key(), &other.segment_ord).cmp(&(&self.streamer.key(), &self.segment_ord))
}
@@ -40,15 +36,12 @@ impl<'a> Ord for HeapItem<'a>
/// - the term
/// - a slice with the ordinal of the segments containing
/// the terms.
pub struct TermMerger<'a>
{
pub struct TermMerger<'a> {
heap: BinaryHeap<HeapItem<'a>>,
current_streamers: Vec<HeapItem<'a>>,
}
impl<'a> TermMerger<'a>
{
impl<'a> TermMerger<'a> {
/// Stream of merged term dictionary
///
///
@@ -59,11 +52,11 @@ impl<'a> TermMerger<'a>
.into_iter()
.enumerate()
.map(|(ord, streamer)| {
HeapItem {
streamer: streamer,
segment_ord: ord,
}
})
HeapItem {
streamer: streamer,
segment_ord: ord,
}
})
.collect(),
}
}
@@ -133,5 +126,3 @@ impl<'a> TermMerger<'a>
}
}
}

View File

@@ -54,16 +54,16 @@ use postings::TermInfo;
pub use self::merger::TermMerger;
#[cfg(not(feature="streamdict"))]
#[cfg(not(feature = "streamdict"))]
mod fstdict;
#[cfg(not(feature="streamdict"))]
#[cfg(not(feature = "streamdict"))]
pub use self::fstdict::{TermDictionaryImpl, TermDictionaryBuilderImpl, TermStreamerImpl,
TermStreamerBuilderImpl};
#[cfg(feature="streamdict")]
#[cfg(feature = "streamdict")]
mod streamdict;
#[cfg(feature="streamdict")]
#[cfg(feature = "streamdict")]
pub use self::streamdict::{TermDictionaryImpl, TermDictionaryBuilderImpl, TermStreamerImpl,
TermStreamerBuilderImpl};
@@ -72,7 +72,9 @@ use std::io;
/// Dictionary associating sorted `&[u8]` to values
pub trait TermDictionary<'a> where Self: Sized
pub trait TermDictionary<'a>
where
Self: Sized,
{
/// Streamer type associated to the term dictionary
type Streamer: TermStreamer + 'a;
@@ -81,7 +83,7 @@ pub trait TermDictionary<'a> where Self: Sized
type StreamBuilder: TermStreamerBuilder<Streamer = Self::Streamer> + 'a;
/// Opens a `TermDictionary` given a data source.
fn from_source(source: ReadOnlySource) -> io::Result<Self>;
fn from_source(source: ReadOnlySource) -> Self;
/// Looks up the value corresponding to the key.
fn get<K: AsRef<[u8]>>(&self, target_key: K) -> Option<TermInfo>;
@@ -110,7 +112,8 @@ pub trait TermDictionary<'a> where Self: Sized
///
/// Inserting must be done in the order of the `keys`.
pub trait TermDictionaryBuilder<W>: Sized
where W: io::Write
where
W: io::Write,
{
/// Creates a new `TermDictionaryBuilder`
fn new(write: W, field_type: FieldType) -> io::Result<Self>;
@@ -170,8 +173,7 @@ pub trait TermStreamer: Sized {
/// `TermStreamerBuilder` is a helper object used to define
/// a range of terms that should be streamed.
pub trait TermStreamerBuilder
{
pub trait TermStreamerBuilder {
/// Associated `TermStreamer` type that this builder is building.
type Streamer: TermStreamer;
@@ -226,7 +228,8 @@ mod tests {
{
let write = directory.open_write(&path).unwrap();
let field_type = FieldType::Str(TEXT);
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(write, field_type).unwrap();
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(write, field_type)
.unwrap();
term_dictionary_builder
.insert("abc".as_bytes(), &make_term_info(34u32))
.unwrap();
@@ -236,7 +239,7 @@ mod tests {
term_dictionary_builder.finish().unwrap();
}
let source = directory.open_read(&path).unwrap();
let term_dict: TermDictionaryImpl = TermDictionaryImpl::from_source(source).unwrap();
let term_dict: TermDictionaryImpl = TermDictionaryImpl::from_source(source);
assert_eq!(term_dict.get("abc").unwrap().doc_freq, 34u32);
assert_eq!(term_dict.get("abcd").unwrap().doc_freq, 346u32);
let mut stream = term_dict.stream();
@@ -296,7 +299,7 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let field_searcher = searcher.field(text_field).unwrap();
let field_searcher = searcher.field(text_field);
let mut term_it = field_searcher.terms();
let mut term_string = String::new();
while term_it.advance() {
@@ -314,15 +317,17 @@ mod tests {
.collect();
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type).unwrap();
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type)
.unwrap();
for &(ref id, ref i) in &ids {
term_dictionary_builder.insert(id.as_bytes(), &make_term_info(*i)).unwrap();
term_dictionary_builder
.insert(id.as_bytes(), &make_term_info(*i))
.unwrap();
}
term_dictionary_builder.finish().unwrap()
};
let source = ReadOnlySource::from(buffer);
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source)
.unwrap();
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source);
{
let mut streamer = term_dictionary.stream();
let mut i = 0;
@@ -343,16 +348,22 @@ mod tests {
fn test_stream_high_range_prefix_suffix() {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type).unwrap();
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type)
.unwrap();
// term requires more than 16bits
term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1)).unwrap();
term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2)).unwrap();
term_dictionary_builder.insert("abr", &make_term_info(2)).unwrap();
term_dictionary_builder
.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))
.unwrap();
term_dictionary_builder
.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2))
.unwrap();
term_dictionary_builder
.insert("abr", &make_term_info(2))
.unwrap();
term_dictionary_builder.finish().unwrap()
};
let source = ReadOnlySource::from(buffer);
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source)
.unwrap();
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source);
let mut kv_stream = term_dictionary.stream();
assert!(kv_stream.advance());
assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxy".as_bytes());
@@ -372,17 +383,19 @@ mod tests {
.collect();
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type).unwrap();
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type)
.unwrap();
for &(ref id, ref i) in &ids {
term_dictionary_builder.insert(id.as_bytes(), &make_term_info(*i)).unwrap();
term_dictionary_builder
.insert(id.as_bytes(), &make_term_info(*i))
.unwrap();
}
term_dictionary_builder.finish().unwrap()
};
let source = ReadOnlySource::from(buffer);
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source)
.unwrap();
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source);
{
for i in (0..20).chain(6000..8_000) {
let &(ref target_key, _) = &ids[i];
@@ -440,16 +453,18 @@ mod tests {
fn test_stream_range_boundaries() {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type).unwrap();
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type)
.unwrap();
for i in 0u8..10u8 {
let number_arr = [i; 1];
term_dictionary_builder.insert(&number_arr, &make_term_info(i as u32)).unwrap();
term_dictionary_builder
.insert(&number_arr, &make_term_info(i as u32))
.unwrap();
}
term_dictionary_builder.finish().unwrap()
};
let source = ReadOnlySource::from(buffer);
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source)
.unwrap();
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source);
let value_list = |mut streamer: TermStreamerImpl| {
let mut res: Vec<u32> = vec![];
@@ -460,12 +475,17 @@ mod tests {
};
{
let range = term_dictionary.range().ge([2u8]).into_stream();
assert_eq!(value_list(range),
vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]);
assert_eq!(
value_list(range),
vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().gt([2u8]).into_stream();
assert_eq!(value_list(range), vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]);
assert_eq!(
value_list(range),
vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().lt([6u8]).into_stream();
@@ -473,7 +493,10 @@ mod tests {
}
{
let range = term_dictionary.range().le([6u8]).into_stream();
assert_eq!(value_list(range), vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32]);
assert_eq!(
value_list(range),
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32]
);
}
{
let range = term_dictionary.range().ge([0u8]).lt([5u8]).into_stream();

View File

@@ -11,7 +11,7 @@ use common::BinarySerializable;
fn common_prefix_len(s1: &[u8], s2: &[u8]) -> usize {
s1.iter()
.zip(s2.iter())
.take_while(|&(a, b)| a==b)
.take_while(|&(a, b)| a == b)
.count()
}
@@ -45,32 +45,28 @@ pub struct TermDeltaDecoder {
impl TermDeltaDecoder {
pub fn with_previous_term(term: Vec<u8>) -> TermDeltaDecoder {
TermDeltaDecoder {
term: Vec::from(term)
}
TermDeltaDecoder { term: Vec::from(term) }
}
#[inline(always)]
pub fn decode<'a>(&mut self, code: u8, mut cursor: &'a [u8]) -> &'a [u8] {
let (prefix_len, suffix_len): (usize, usize) =
if (code & 1u8) == 1u8 {
let b = cursor[0];
cursor = &cursor[1..];
let prefix_len = (b & 15u8) as usize;
let suffix_len = (b >> 4u8) as usize;
(prefix_len, suffix_len)
}
else {
let prefix_len = u32::deserialize(&mut cursor).unwrap();
let suffix_len = u32::deserialize(&mut cursor).unwrap();
(prefix_len as usize, suffix_len as usize)
};
let (prefix_len, suffix_len): (usize, usize) = if (code & 1u8) == 1u8 {
let b = cursor[0];
cursor = &cursor[1..];
let prefix_len = (b & 15u8) as usize;
let suffix_len = (b >> 4u8) as usize;
(prefix_len, suffix_len)
} else {
let prefix_len = u32::deserialize(&mut cursor).unwrap();
let suffix_len = u32::deserialize(&mut cursor).unwrap();
(prefix_len as usize, suffix_len as usize)
};
unsafe { self.term.set_len(prefix_len) };
self.term.extend_from_slice(&(*cursor)[..suffix_len]);
&cursor[suffix_len..]
}
pub fn term(&self) -> &[u8] {
pub fn term(&self) -> &[u8] {
&self.term[..]
}
}
@@ -89,7 +85,6 @@ pub struct TermInfoDeltaEncoder {
}
impl TermInfoDeltaEncoder {
pub fn new(has_positions: bool) -> Self {
TermInfoDeltaEncoder {
term_info: TermInfo::default(),
@@ -109,7 +104,8 @@ impl TermInfoDeltaEncoder {
positions_inner_offset: 0,
};
if self.has_positions {
delta_term_info.delta_positions_offset = term_info.positions_offset - self.term_info.positions_offset;
delta_term_info.delta_positions_offset = term_info.positions_offset -
self.term_info.positions_offset;
delta_term_info.positions_inner_offset = term_info.positions_inner_offset;
}
mem::replace(&mut self.term_info, term_info);
@@ -131,7 +127,6 @@ pub fn make_mask(num_bytes: usize) -> u32 {
}
impl TermInfoDeltaDecoder {
pub fn from_term_info(term_info: TermInfo, has_positions: bool) -> TermInfoDeltaDecoder {
TermInfoDeltaDecoder {
term_info: term_info,
@@ -147,7 +142,7 @@ impl TermInfoDeltaDecoder {
positions_offset: checkpoint.positions_offset,
positions_inner_offset: 0u8,
},
has_positions: has_positions
has_positions: has_positions,
}
}
@@ -164,12 +159,12 @@ impl TermInfoDeltaDecoder {
self.term_info.postings_offset += delta_postings_offset;
if self.has_positions {
let num_bytes_positions_offset = ((code >> 5) & 3) as usize + 1;
let delta_positions_offset: u32 = unsafe { *(cursor.as_ptr() as *const u32) } & make_mask(num_bytes_positions_offset);
let delta_positions_offset: u32 = unsafe { *(cursor.as_ptr() as *const u32) } &
make_mask(num_bytes_positions_offset);
self.term_info.positions_offset += delta_positions_offset;
self.term_info.positions_inner_offset = cursor[num_bytes_positions_offset];
&cursor[num_bytes_positions_offset + 1..]
}
else {
} else {
cursor
}
}

View File

@@ -22,7 +22,6 @@ pub struct CheckPoint {
}
impl BinarySerializable for CheckPoint {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
self.stream_offset.serialize(writer)?;
self.postings_offset.serialize(writer)?;
@@ -40,4 +39,4 @@ impl BinarySerializable for CheckPoint {
positions_offset: positions_offset,
})
}
}
}

View File

@@ -7,11 +7,11 @@ use postings::TermInfo;
use super::delta_encoder::{TermInfoDeltaDecoder, TermDeltaDecoder};
fn stream_before<'a>(term_dictionary: &'a TermDictionaryImpl,
target_key: &[u8],
has_positions: bool)
-> TermStreamerImpl<'a>
{
fn stream_before<'a>(
term_dictionary: &'a TermDictionaryImpl,
target_key: &[u8],
has_positions: bool,
) -> TermStreamerImpl<'a> {
let (prev_key, checkpoint) = term_dictionary.strictly_previous_key(target_key.as_ref());
let stream_data: &'a [u8] = &term_dictionary.stream_data()[checkpoint.stream_offset as usize..];
@@ -24,8 +24,7 @@ fn stream_before<'a>(term_dictionary: &'a TermDictionaryImpl,
/// See [`TermStreamerBuilder`](./trait.TermStreamerBuilder.html)
pub struct TermStreamerBuilderImpl<'a>
{
pub struct TermStreamerBuilderImpl<'a> {
term_dictionary: &'a TermDictionaryImpl,
origin: usize,
offset_from: usize,
@@ -35,14 +34,17 @@ pub struct TermStreamerBuilderImpl<'a>
has_positions: bool,
}
impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a>
{
impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> {
type Streamer = TermStreamerImpl<'a>;
/// Limit the range to terms greater or equal to the bound
fn ge<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
let target_key = bound.as_ref();
let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.has_positions);
let streamer = stream_before(
self.term_dictionary,
target_key.as_ref(),
self.has_positions,
);
let smaller_than = |k: &[u8]| k.lt(target_key);
let (offset_before, current_key, term_info) = get_offset(smaller_than, streamer);
self.current_key = current_key;
@@ -54,7 +56,11 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a>
/// Limit the range to terms strictly greater than the bound
fn gt<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
let target_key = bound.as_ref();
let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.has_positions);
let streamer = stream_before(
self.term_dictionary,
target_key.as_ref(),
self.has_positions,
);
let smaller_than = |k: &[u8]| k.le(target_key);
let (offset_before, current_key, term_info) = get_offset(smaller_than, streamer);
self.current_key = current_key;
@@ -66,7 +72,11 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a>
/// Limit the range to terms strictly lesser than the bound
fn lt<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
let target_key = bound.as_ref();
let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.has_positions);
let streamer = stream_before(
self.term_dictionary,
target_key.as_ref(),
self.has_positions,
);
let smaller_than = |k: &[u8]| k.lt(target_key);
let (offset_before, _, _) = get_offset(smaller_than, streamer);
self.offset_to = offset_before - self.origin;
@@ -76,7 +86,11 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a>
/// Limit the range to terms lesser or equal to the bound
fn le<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
let target_key = bound.as_ref();
let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.has_positions);
let streamer = stream_before(
self.term_dictionary,
target_key.as_ref(),
self.has_positions,
);
let smaller_than = |k: &[u8]| k.le(target_key);
let (offset_before, _, _) = get_offset(smaller_than, streamer);
self.offset_to = offset_before - self.origin;
@@ -88,10 +102,13 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a>
let data: &[u8] = self.term_dictionary.stream_data();
let start = self.offset_from;
let stop = max(self.offset_to, start);
let term_delta_decoder = TermDeltaDecoder::with_previous_term(self.current_key);
let term_info_decoder =
TermInfoDeltaDecoder::from_term_info(self.term_info, self.has_positions);
TermStreamerImpl {
cursor: &data[start..stop],
term_delta_decoder: TermDeltaDecoder::with_previous_term(self.current_key),
term_info_decoder: TermInfoDeltaDecoder::from_term_info(self.term_info, self.has_positions), // TODO checkpoint
term_delta_decoder: term_delta_decoder,
term_info_decoder: term_info_decoder,
}
}
}
@@ -103,10 +120,10 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a>
/// - the block start
/// - the index within this block
/// - the term_buffer state to initialize the block)
fn get_offset<'a, P: Fn(&[u8]) -> bool>(predicate: P,
mut streamer: TermStreamerImpl<'a>)
-> (usize, Vec<u8>, TermInfo)
{
fn get_offset<'a, P: Fn(&[u8]) -> bool>(
predicate: P,
mut streamer: TermStreamerImpl<'a>,
) -> (usize, Vec<u8>, TermInfo) {
let mut prev: &[u8] = streamer.cursor;
let mut term_info = streamer.value().clone();
@@ -124,11 +141,8 @@ fn get_offset<'a, P: Fn(&[u8]) -> bool>(predicate: P,
(prev.as_ptr() as usize, prev_data, term_info)
}
impl<'a> TermStreamerBuilderImpl<'a>
{
pub(crate) fn new(
term_dictionary: &'a TermDictionaryImpl,
has_positions: bool) -> Self {
impl<'a> TermStreamerBuilderImpl<'a> {
pub(crate) fn new(term_dictionary: &'a TermDictionaryImpl, has_positions: bool) -> Self {
let data = term_dictionary.stream_data();
let origin = data.as_ptr() as usize;
TermStreamerBuilderImpl {
@@ -146,8 +160,7 @@ impl<'a> TermStreamerBuilderImpl<'a>
/// See [`TermStreamer`](./trait.TermStreamer.html)
pub struct TermStreamerImpl<'a>
{
pub struct TermStreamerImpl<'a> {
cursor: &'a [u8],
term_delta_decoder: TermDeltaDecoder,
term_info_decoder: TermInfoDeltaDecoder,
@@ -156,8 +169,7 @@ pub struct TermStreamerImpl<'a>
impl<'a> TermStreamer for TermStreamerImpl<'a>
{
impl<'a> TermStreamer for TermStreamerImpl<'a> {
fn advance(&mut self) -> bool {
if self.cursor.is_empty() {
return false;
@@ -178,4 +190,3 @@ impl<'a> TermStreamer for TermStreamerImpl<'a>
&self.term_info_decoder.term_info()
}
}

View File

@@ -30,20 +30,16 @@ fn has_positions(field_type: &FieldType) -> bool {
let indexing_options = text_options.get_indexing_options();
if indexing_options.is_position_enabled() {
true
}
else {
} else {
false
}
}
_ => {
false
}
_ => false,
}
}
/// See [`TermDictionaryBuilder`](./trait.TermDictionaryBuilder.html)
pub struct TermDictionaryBuilderImpl<W>
{
pub struct TermDictionaryBuilderImpl<W> {
write: CountingWriter<W>,
term_delta_encoder: TermDeltaEncoder,
term_info_encoder: TermInfoDeltaEncoder,
@@ -61,7 +57,8 @@ fn fill_last<'a>(fst: &'a Fst, mut node: Node<'a>, buffer: &mut Vec<u8>) {
}
impl<W> TermDictionaryBuilderImpl<W>
where W: Write
where
W: Write,
{
fn add_index_entry(&mut self) {
let stream_offset = self.write.written_bytes() as u32;
@@ -74,10 +71,17 @@ impl<W> TermDictionaryBuilderImpl<W>
positions_offset: positions_offset,
};
self.block_index
.insert(&self.term_delta_encoder.term(), self.checkpoints.len() as u64)
.expect("Serializing fst on a Vec<u8> should never fail. Where your terms not in order maybe?");
checkpoint.serialize(&mut self.checkpoints)
.expect("Serializing checkpoint on a Vec<u8> should never fail.");
.insert(
&self.term_delta_encoder.term(),
self.checkpoints.len() as u64,
)
.expect(
"Serializing fst on a Vec<u8> should never fail. \
Where your terms not in order maybe?",
);
checkpoint.serialize(&mut self.checkpoints).expect(
"Serializing checkpoint on a Vec<u8> should never fail.",
);
}
/// # Warning
@@ -98,7 +102,13 @@ impl<W> TermDictionaryBuilderImpl<W>
pub(crate) fn insert_value(&mut self, term_info: &TermInfo) -> io::Result<()> {
let delta_term_info = self.term_info_encoder.encode(term_info.clone());
let (prefix_len, suffix) = self.term_delta_encoder.prefix_suffix();
write_term_kv(prefix_len, suffix, &delta_term_info, self.term_info_encoder.has_positions, &mut self.write)?;
write_term_kv(
prefix_len,
suffix,
&delta_term_info,
self.term_info_encoder.has_positions,
&mut self.write,
)?;
self.len += 1;
Ok(())
}
@@ -108,19 +118,20 @@ fn num_bytes_required(mut n: u32) -> u8 {
for i in 1u8..5u8 {
if n < 256u32 {
return i;
}
else {
} else {
n /= 256;
}
}
0u8
}
fn write_term_kv<W: Write>(prefix_len: usize,
suffix: &[u8],
delta_term_info: &DeltaTermInfo,
has_positions: bool,
write: &mut W) -> io::Result<()> {
fn write_term_kv<W: Write>(
prefix_len: usize,
suffix: &[u8],
delta_term_info: &DeltaTermInfo,
has_positions: bool,
write: &mut W,
) -> io::Result<()> {
let suffix_len = suffix.len();
let mut code = 0u8;
let num_bytes_docfreq = num_bytes_required(delta_term_info.doc_freq);
@@ -131,9 +142,13 @@ fn write_term_kv<W: Write>(prefix_len: usize,
code |= (num_bytes_positions_offset - 1) << 5u8;
if (prefix_len < 16) && (suffix_len < 16) {
code |= 1u8;
write.write_all(&[code, (prefix_len as u8) | ((suffix_len as u8) << 4u8)])?;
}
else {
write.write_all(
&[
code,
(prefix_len as u8) | ((suffix_len as u8) << 4u8),
],
)?;
} else {
write.write_all(&[code])?;
(prefix_len as u32).serialize(write)?;
(suffix_len as u32).serialize(write)?;
@@ -145,11 +160,15 @@ fn write_term_kv<W: Write>(prefix_len: usize,
}
{
let bytes: [u8; 4] = unsafe { transmute(delta_term_info.delta_postings_offset) };
write.write_all(&bytes[0..num_bytes_postings_offset as usize])?;
write.write_all(
&bytes[0..num_bytes_postings_offset as usize],
)?;
}
if has_positions {
let bytes: [u8; 4] = unsafe { transmute(delta_term_info.delta_positions_offset) };
write.write_all(&bytes[0..num_bytes_positions_offset as usize])?;
write.write_all(
&bytes[0..num_bytes_positions_offset as usize],
)?;
write.write_all(&[delta_term_info.positions_inner_offset])?;
}
Ok(())
@@ -157,7 +176,8 @@ fn write_term_kv<W: Write>(prefix_len: usize,
}
impl<W> TermDictionaryBuilder<W> for TermDictionaryBuilderImpl<W>
where W: Write
where
W: Write,
{
/// Creates a new `TermDictionaryBuilder`
fn new(mut write: W, field_type: FieldType) -> io::Result<Self> {
@@ -169,7 +189,7 @@ impl<W> TermDictionaryBuilder<W> for TermDictionaryBuilderImpl<W>
term_delta_encoder: TermDeltaEncoder::default(),
term_info_encoder: TermInfoDeltaEncoder::new(has_positions),
block_index: fst::MapBuilder::new(vec![]).expect("This cannot fail"),
checkpoints: vec!(),
checkpoints: vec![],
len: 0,
})
}
@@ -206,28 +226,22 @@ impl<W> TermDictionaryBuilder<W> for TermDictionaryBuilderImpl<W>
fn open_fst_index(source: ReadOnlySource) -> io::Result<fst::Map> {
use self::ReadOnlySource::*;
let fst_result = match source {
Anonymous(data) => {
Fst::from_shared_bytes(data.data, data.start, data.len)
}
Mmap(mmap_readonly) => {
Fst::from_mmap(mmap_readonly)
}
Anonymous(data) => Fst::from_shared_bytes(data.data, data.start, data.len),
Mmap(mmap_readonly) => Fst::from_mmap(mmap_readonly),
};
let fst = fst_result.map_err(convert_fst_error)?;
Ok(fst::Map::from(fst))
}
/// See [`TermDictionary`](./trait.TermDictionary.html)
pub struct TermDictionaryImpl
{
pub struct TermDictionaryImpl {
stream_data: ReadOnlySource,
fst_index: fst::Map,
checkpoints_data: ReadOnlySource,
has_positions: bool,
}
impl TermDictionaryImpl
{
impl TermDictionaryImpl {
pub(crate) fn stream_data(&self) -> &[u8] {
self.stream_data.as_slice()
}
@@ -235,8 +249,8 @@ impl TermDictionaryImpl
pub(crate) fn strictly_previous_key(&self, key: &[u8]) -> (Vec<u8>, CheckPoint) {
let (term, checkpoint_offset) = self.strictly_previous_key_checkpoint_offset(key);
let mut checkpoint_data = &self.checkpoints_data.as_slice()[checkpoint_offset..];
let checkpoint = CheckPoint::deserialize(&mut checkpoint_data)
.expect("Checkpoint data is corrupted");
let checkpoint =
CheckPoint::deserialize(&mut checkpoint_data).expect("Checkpoint data is corrupted");
(term, checkpoint)
}
@@ -288,47 +302,47 @@ impl TermDictionaryImpl
impl<'a> TermDictionary<'a> for TermDictionaryImpl
{
impl<'a> TermDictionary<'a> for TermDictionaryImpl {
type Streamer = TermStreamerImpl<'a>;
type StreamBuilder = TermStreamerBuilderImpl<'a>;
/// Opens a `TermDictionary` given a data source.
fn from_source(mut source: ReadOnlySource) -> io::Result<Self> {
let has_positions = source.slice(0, 1).as_ref()[0] == 255u8;
fn from_source(mut source: ReadOnlySource) -> Self {
let has_positions = source.slice(0, 1)[0] == 255u8;
source = source.slice_from(1);
let total_len = source.len();
let (body, footer) = source.split(total_len - 16);
let mut footer_buffer: &[u8] = footer.as_slice();
let fst_addr: usize = u64::deserialize(&mut footer_buffer)? as usize;
let checkpoints_addr: usize = u64::deserialize(&mut footer_buffer)? as usize;
let fst_addr = u64::deserialize(&mut footer_buffer).expect(
"deserializing 8 byte should never fail",
) as usize;
let checkpoints_addr = u64::deserialize(&mut footer_buffer).expect(
"deserializing 8 byte should never fail",
) as usize;
let stream_data = body.slice(0, fst_addr - PADDING_SIZE);
let fst_data = body.slice(fst_addr, checkpoints_addr);
let checkpoints_data = body.slice_from(checkpoints_addr);
let fst_index = open_fst_index(fst_data)?;
let fst_index = open_fst_index(fst_data).expect("Index FST data corrupted");
Ok(TermDictionaryImpl {
TermDictionaryImpl {
has_positions: has_positions,
stream_data: stream_data,
checkpoints_data: checkpoints_data,
fst_index: fst_index,
})
}
}
/// Looks up the value corresponding to the key.
fn get<K: AsRef<[u8]>>(&self, target_key: K) -> Option<TermInfo> {
let mut streamer = self.range()
.ge(&target_key)
.into_stream();
let mut streamer = self.range().ge(&target_key).into_stream();
if streamer.advance() && streamer.key() == target_key.as_ref() {
Some(streamer.value().clone())
}
else {
} else {
None
}
}
@@ -353,4 +367,4 @@ mod tests {
assert_eq!(num_bytes_required(256), 2);
assert_eq!(num_bytes_required(u32::max_value()), 4);
}
}
}