mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-26 13:10:41 +00:00
Format
This commit is contained in:
@@ -91,9 +91,11 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
|
||||
let mut old_man_doc = Document::default();
|
||||
old_man_doc.add_text(title, "The Old Man and the Sea");
|
||||
old_man_doc.add_text(body,
|
||||
"He was an old man who fished alone in a skiff in the Gulf Stream and \
|
||||
he had gone eighty-four days now without taking a fish.");
|
||||
old_man_doc.add_text(
|
||||
body,
|
||||
"He was an old man who fished alone in a skiff in the Gulf Stream and \
|
||||
he had gone eighty-four days now without taking a fish.",
|
||||
);
|
||||
|
||||
// ... and add it to the `IndexWriter`.
|
||||
index_writer.add_document(old_man_doc);
|
||||
|
||||
@@ -38,10 +38,11 @@ impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
|
||||
}
|
||||
|
||||
impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Right> {
|
||||
fn set_segment(&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader)
|
||||
-> Result<()> {
|
||||
fn set_segment(
|
||||
&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader,
|
||||
) -> Result<()> {
|
||||
try!(self.left.set_segment(segment_local_id, segment));
|
||||
try!(self.right.set_segment(segment_local_id, segment));
|
||||
Ok(())
|
||||
|
||||
@@ -45,11 +45,11 @@ mod tests {
|
||||
#[bench]
|
||||
fn build_collector(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
let mut count_collector = CountCollector::default();
|
||||
for doc in 0..1_000_000 {
|
||||
count_collector.collect(doc, 1f32);
|
||||
}
|
||||
count_collector.count()
|
||||
});
|
||||
let mut count_collector = CountCollector::default();
|
||||
for doc in 0..1_000_000 {
|
||||
count_collector.collect(doc, 1f32);
|
||||
}
|
||||
count_collector.count()
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,8 +15,9 @@ use SegmentLocalId;
|
||||
|
||||
/// Facet collector for i64/u64 fast field
|
||||
pub struct FacetCollector<T>
|
||||
where T: FastFieldReader,
|
||||
T::ValueType: Eq + Hash
|
||||
where
|
||||
T: FastFieldReader,
|
||||
T::ValueType: Eq + Hash,
|
||||
{
|
||||
counters: HashMap<T::ValueType, u64>,
|
||||
field: Field,
|
||||
@@ -25,8 +26,9 @@ pub struct FacetCollector<T>
|
||||
|
||||
|
||||
impl<T> FacetCollector<T>
|
||||
where T: FastFieldReader,
|
||||
T::ValueType: Eq + Hash
|
||||
where
|
||||
T: FastFieldReader,
|
||||
T::ValueType: Eq + Hash,
|
||||
{
|
||||
/// Creates a new facet collector for aggregating a given field.
|
||||
pub fn new(field: Field) -> FacetCollector<T> {
|
||||
@@ -40,8 +42,9 @@ impl<T> FacetCollector<T>
|
||||
|
||||
|
||||
impl<T> Collector for FacetCollector<T>
|
||||
where T: FastFieldReader,
|
||||
T::ValueType: Eq + Hash
|
||||
where
|
||||
T: FastFieldReader,
|
||||
T::ValueType: Eq + Hash,
|
||||
{
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||
self.ff_reader = Some(reader.get_fast_field_reader(self.field)?);
|
||||
@@ -51,7 +54,9 @@ impl<T> Collector for FacetCollector<T>
|
||||
fn collect(&mut self, doc: DocId, _: Score) {
|
||||
let val = self.ff_reader
|
||||
.as_ref()
|
||||
.expect("collect() was called before set_segment. This should never happen.")
|
||||
.expect(
|
||||
"collect() was called before set_segment. This should never happen.",
|
||||
)
|
||||
.get(doc);
|
||||
*(self.counters.entry(val).or_insert(0)) += 1;
|
||||
}
|
||||
|
||||
@@ -51,20 +51,22 @@ pub use self::chained_collector::chain;
|
||||
pub trait Collector {
|
||||
/// `set_segment` is called before beginning to enumerate
|
||||
/// on this segment.
|
||||
fn set_segment(&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader)
|
||||
-> Result<()>;
|
||||
fn set_segment(
|
||||
&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader,
|
||||
) -> Result<()>;
|
||||
/// The query pushes the scored document to the collector via this method.
|
||||
fn collect(&mut self, doc: DocId, score: Score);
|
||||
}
|
||||
|
||||
|
||||
impl<'a, C: Collector> Collector for &'a mut C {
|
||||
fn set_segment(&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader)
|
||||
-> Result<()> {
|
||||
fn set_segment(
|
||||
&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader,
|
||||
) -> Result<()> {
|
||||
(*self).set_segment(segment_local_id, segment)
|
||||
}
|
||||
/// The query pushes the scored document to the collector via this method.
|
||||
@@ -169,12 +171,12 @@ pub mod tests {
|
||||
#[bench]
|
||||
fn build_collector(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
let mut count_collector = CountCollector::default();
|
||||
let docs: Vec<u32> = (0..1_000_000).collect();
|
||||
for doc in docs {
|
||||
count_collector.collect(doc, 1f32);
|
||||
}
|
||||
count_collector.count()
|
||||
});
|
||||
let mut count_collector = CountCollector::default();
|
||||
let docs: Vec<u32> = (0..1_000_000).collect();
|
||||
for doc in docs {
|
||||
count_collector.collect(doc, 1f32);
|
||||
}
|
||||
count_collector.count()
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,10 +23,11 @@ impl<'a> MultiCollector<'a> {
|
||||
|
||||
|
||||
impl<'a> Collector for MultiCollector<'a> {
|
||||
fn set_segment(&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader)
|
||||
-> Result<()> {
|
||||
fn set_segment(
|
||||
&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader,
|
||||
) -> Result<()> {
|
||||
for collector in &mut self.collectors {
|
||||
try!(collector.set_segment(segment_local_id, segment));
|
||||
}
|
||||
@@ -53,8 +54,8 @@ mod tests {
|
||||
let mut top_collector = TopCollector::with_limit(2);
|
||||
let mut count_collector = CountCollector::default();
|
||||
{
|
||||
let mut collectors = MultiCollector::from(vec![&mut top_collector,
|
||||
&mut count_collector]);
|
||||
let mut collectors =
|
||||
MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
|
||||
collectors.collect(1, 0.2);
|
||||
collectors.collect(2, 0.1);
|
||||
collectors.collect(3, 0.5);
|
||||
|
||||
@@ -24,10 +24,9 @@ impl PartialOrd for GlobalScoredDoc {
|
||||
impl Ord for GlobalScoredDoc {
|
||||
#[inline]
|
||||
fn cmp(&self, other: &GlobalScoredDoc) -> Ordering {
|
||||
other
|
||||
.score
|
||||
.partial_cmp(&self.score)
|
||||
.unwrap_or_else(|| other.doc_address.cmp(&self.doc_address))
|
||||
other.score.partial_cmp(&self.score).unwrap_or_else(|| {
|
||||
other.doc_address.cmp(&self.doc_address)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -87,7 +86,9 @@ impl TopCollector {
|
||||
scored_docs.sort();
|
||||
scored_docs
|
||||
.into_iter()
|
||||
.map(|GlobalScoredDoc { score, doc_address }| (score, doc_address))
|
||||
.map(|GlobalScoredDoc { score, doc_address }| {
|
||||
(score, doc_address)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
@@ -108,14 +109,13 @@ impl Collector for TopCollector {
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
if self.at_capacity() {
|
||||
// It's ok to unwrap as long as a limit of 0 is forbidden.
|
||||
let limit_doc: GlobalScoredDoc =
|
||||
*self.heap
|
||||
.peek()
|
||||
.expect("Top collector with size 0 is forbidden");
|
||||
let limit_doc: GlobalScoredDoc = *self.heap.peek().expect(
|
||||
"Top collector with size 0 is forbidden",
|
||||
);
|
||||
if limit_doc.score < score {
|
||||
let mut mut_head = self.heap
|
||||
.peek_mut()
|
||||
.expect("Top collector with size 0 is forbidden");
|
||||
let mut mut_head = self.heap.peek_mut().expect(
|
||||
"Top collector with size 0 is forbidden",
|
||||
);
|
||||
mut_head.score = score;
|
||||
mut_head.doc_address = DocAddress(self.segment_id, doc);
|
||||
}
|
||||
|
||||
@@ -88,7 +88,8 @@ impl BitPacker {
|
||||
|
||||
|
||||
pub struct BitUnpacker<Data>
|
||||
where Data: Deref<Target = [u8]>
|
||||
where
|
||||
Data: Deref<Target = [u8]>,
|
||||
{
|
||||
num_bits: usize,
|
||||
mask: u64,
|
||||
@@ -96,7 +97,8 @@ pub struct BitUnpacker<Data>
|
||||
}
|
||||
|
||||
impl<Data> BitUnpacker<Data>
|
||||
where Data: Deref<Target = [u8]>
|
||||
where
|
||||
Data: Deref<Target = [u8]>,
|
||||
{
|
||||
pub fn new(data: Data, num_bits: usize) -> BitUnpacker<Data> {
|
||||
let mask: u64 = if num_bits == 64 {
|
||||
@@ -121,8 +123,10 @@ impl<Data> BitUnpacker<Data>
|
||||
let addr_in_bits = idx * num_bits;
|
||||
let addr = addr_in_bits >> 3;
|
||||
let bit_shift = addr_in_bits & 7;
|
||||
debug_assert!(addr + 8 <= data.len(),
|
||||
"The fast field field should have been padded with 7 bytes.");
|
||||
debug_assert!(
|
||||
addr + 8 <= data.len(),
|
||||
"The fast field field should have been padded with 7 bytes."
|
||||
);
|
||||
let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) };
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
(val_shifted & mask)
|
||||
|
||||
@@ -10,13 +10,12 @@ use common::BinarySerializable;
|
||||
|
||||
|
||||
/// A `CompositeWrite` is used to write a `CompositeFile`.
|
||||
pub struct CompositeWrite<W=WritePtr> {
|
||||
pub struct CompositeWrite<W = WritePtr> {
|
||||
write: CountingWriter<W>,
|
||||
offsets: HashMap<Field, usize>,
|
||||
}
|
||||
|
||||
impl<W: Write> CompositeWrite<W> {
|
||||
|
||||
/// Create a new API writer that writes a composite file
|
||||
/// in a given write.
|
||||
pub fn wrap(w: W) -> CompositeWrite<W> {
|
||||
@@ -43,7 +42,8 @@ impl<W: Write> CompositeWrite<W> {
|
||||
let footer_offset = self.write.written_bytes();
|
||||
VInt(self.offsets.len() as u64).serialize(&mut self.write)?;
|
||||
|
||||
let mut offset_fields: Vec<_> = self.offsets.iter()
|
||||
let mut offset_fields: Vec<_> = self.offsets
|
||||
.iter()
|
||||
.map(|(field, offset)| (offset, field))
|
||||
.collect();
|
||||
|
||||
@@ -51,7 +51,9 @@ impl<W: Write> CompositeWrite<W> {
|
||||
|
||||
let mut prev_offset = 0;
|
||||
for (offset, field) in offset_fields {
|
||||
VInt( (offset - prev_offset) as u64).serialize(&mut self.write)?;
|
||||
VInt((offset - prev_offset) as u64).serialize(
|
||||
&mut self.write,
|
||||
)?;
|
||||
field.serialize(&mut self.write)?;
|
||||
prev_offset = *offset;
|
||||
}
|
||||
@@ -77,7 +79,6 @@ pub struct CompositeFile {
|
||||
}
|
||||
|
||||
impl CompositeFile {
|
||||
|
||||
/// Opens a composite file stored in a given
|
||||
/// `ReadOnlySource`.
|
||||
pub fn open(data: ReadOnlySource) -> io::Result<CompositeFile> {
|
||||
@@ -90,8 +91,8 @@ impl CompositeFile {
|
||||
let mut footer_buffer = footer_data.as_slice();
|
||||
let num_fields = VInt::deserialize(&mut footer_buffer)?.0 as usize;
|
||||
|
||||
let mut fields = vec!();
|
||||
let mut offsets = vec!();
|
||||
let mut fields = vec![];
|
||||
let mut offsets = vec![];
|
||||
|
||||
let mut field_index = HashMap::new();
|
||||
|
||||
@@ -106,7 +107,7 @@ impl CompositeFile {
|
||||
for i in 0..num_fields {
|
||||
let field = fields[i];
|
||||
let start_offset = offsets[i];
|
||||
let end_offset = offsets[i+1];
|
||||
let end_offset = offsets[i + 1];
|
||||
field_index.insert(field, (start_offset, end_offset));
|
||||
}
|
||||
|
||||
@@ -128,11 +129,9 @@ impl CompositeFile {
|
||||
/// Returns the `ReadOnlySource` associated
|
||||
/// with a given `Field` and stored in a `CompositeFile`.
|
||||
pub fn open_read(&self, field: Field) -> Option<ReadOnlySource> {
|
||||
self.offsets_index
|
||||
.get(&field)
|
||||
.map(|&(from, to)| {
|
||||
self.data.slice(from, to)
|
||||
})
|
||||
self.offsets_index.get(&field).map(|&(from, to)| {
|
||||
self.data.slice(from, to)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -189,4 +188,4 @@ mod test {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -101,9 +101,9 @@ impl BinarySerializable for String {
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<String> {
|
||||
let string_length = VInt::deserialize(reader)?.val() as usize;
|
||||
let mut result = String::with_capacity(string_length);
|
||||
reader
|
||||
.take(string_length as u64)
|
||||
.read_to_string(&mut result)?;
|
||||
reader.take(string_length as u64).read_to_string(
|
||||
&mut result,
|
||||
)?;
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,16 +24,14 @@ impl<'a> OpenTimer<'a> {
|
||||
|
||||
impl<'a> Drop for OpenTimer<'a> {
|
||||
fn drop(&mut self) {
|
||||
self.timer_tree
|
||||
.timings
|
||||
.push(Timing {
|
||||
name: self.name,
|
||||
duration: self.start
|
||||
.to(PreciseTime::now())
|
||||
.num_microseconds()
|
||||
.unwrap(),
|
||||
depth: self.depth,
|
||||
});
|
||||
self.timer_tree.timings.push(Timing {
|
||||
name: self.name,
|
||||
duration: self.start
|
||||
.to(PreciseTime::now())
|
||||
.num_microseconds()
|
||||
.unwrap(),
|
||||
depth: self.depth,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -47,7 +47,12 @@ impl BinarySerializable for VInt {
|
||||
}
|
||||
shift += 7;
|
||||
}
|
||||
_ => return Err(io::Error::new(io::ErrorKind::InvalidData, "Reach end of buffer")),
|
||||
_ => {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"Reach end of buffer",
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(VInt(result))
|
||||
|
||||
@@ -5,13 +5,13 @@ mod stream;
|
||||
|
||||
pub use self::stream::CompressedIntStream;
|
||||
|
||||
#[cfg(not(feature="simdcompression"))]
|
||||
#[cfg(not(feature = "simdcompression"))]
|
||||
mod pack {
|
||||
mod compression_pack_nosimd;
|
||||
pub use self::compression_pack_nosimd::{BlockEncoder, BlockDecoder};
|
||||
}
|
||||
|
||||
#[cfg(feature="simdcompression")]
|
||||
#[cfg(feature = "simdcompression")]
|
||||
mod pack {
|
||||
mod compression_pack_simd;
|
||||
pub use self::compression_pack_simd::{BlockEncoder, BlockDecoder};
|
||||
@@ -19,13 +19,13 @@ mod pack {
|
||||
|
||||
pub use self::pack::{BlockEncoder, BlockDecoder};
|
||||
|
||||
#[cfg( any(not(feature="simdcompression"), target_env="msvc") )]
|
||||
#[cfg(any(not(feature = "simdcompression"), target_env = "msvc"))]
|
||||
mod vint {
|
||||
mod compression_vint_nosimd;
|
||||
pub(crate) use self::compression_vint_nosimd::*;
|
||||
}
|
||||
|
||||
#[cfg( all(feature="simdcompression", not(target_env="msvc")) )]
|
||||
#[cfg(all(feature = "simdcompression", not(target_env = "msvc")))]
|
||||
mod vint {
|
||||
mod compression_vint_simd;
|
||||
pub(crate) use self::compression_vint_simd::*;
|
||||
@@ -70,21 +70,19 @@ pub trait VIntDecoder {
|
||||
/// For instance, if the delta-encoded values are `1, 3, 9`, and the
|
||||
/// `offset` is 5, then the output will be:
|
||||
/// `5 + 1 = 6, 6 + 3 = 9, 9 + 9 = 18`
|
||||
fn uncompress_vint_sorted<'a>(&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
offset: u32,
|
||||
num_els: usize)
|
||||
-> usize;
|
||||
fn uncompress_vint_sorted<'a>(
|
||||
&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
offset: u32,
|
||||
num_els: usize,
|
||||
) -> usize;
|
||||
|
||||
/// Uncompress an array of `u32s`, compressed using variable
|
||||
/// byte encoding.
|
||||
///
|
||||
/// The method takes the number of ints to decompress, and returns
|
||||
/// the number of bytes that were read to decompress them.
|
||||
fn uncompress_vint_unsorted<'a>(&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
num_els: usize)
|
||||
-> usize;
|
||||
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize;
|
||||
}
|
||||
|
||||
impl VIntEncoder for BlockEncoder {
|
||||
@@ -98,19 +96,17 @@ impl VIntEncoder for BlockEncoder {
|
||||
}
|
||||
|
||||
impl VIntDecoder for BlockDecoder {
|
||||
fn uncompress_vint_sorted<'a>(&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
offset: u32,
|
||||
num_els: usize)
|
||||
-> usize {
|
||||
fn uncompress_vint_sorted<'a>(
|
||||
&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
offset: u32,
|
||||
num_els: usize,
|
||||
) -> usize {
|
||||
self.output_len = num_els;
|
||||
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
|
||||
}
|
||||
|
||||
fn uncompress_vint_unsorted<'a>(&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
num_els: usize)
|
||||
-> usize {
|
||||
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize {
|
||||
self.output_len = num_els;
|
||||
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
|
||||
}
|
||||
@@ -125,7 +121,6 @@ pub mod tests {
|
||||
use super::*;
|
||||
use tests;
|
||||
use test::Bencher;
|
||||
use std::iter;
|
||||
|
||||
#[test]
|
||||
fn test_encode_sorted_block() {
|
||||
@@ -236,7 +231,7 @@ pub mod tests {
|
||||
#[test]
|
||||
fn test_all_docs_compression_numbits() {
|
||||
for num_bits in 0..33 {
|
||||
let mut data: Vec<u32> = iter::repeat(0u32).take(128).collect();
|
||||
let mut data = [0u32; 128];
|
||||
if num_bits > 0 {
|
||||
data[0] = 1 << (num_bits - 1);
|
||||
}
|
||||
@@ -262,7 +257,9 @@ pub mod tests {
|
||||
let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001);
|
||||
let compressed = encoder.compress_vint_sorted(&data, 0u32);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
b.iter(|| { decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT); });
|
||||
b.iter(|| {
|
||||
decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT);
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -25,9 +25,9 @@ pub fn compress_sorted(vals: &mut [u32], mut output: &mut [u8], offset: u32) ->
|
||||
bit_packer.write(*val, &mut output).unwrap();
|
||||
}
|
||||
1 +
|
||||
bit_packer
|
||||
.close(&mut output)
|
||||
.expect("packing in memory should never fail")
|
||||
bit_packer.close(&mut output).expect(
|
||||
"packing in memory should never fail",
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -56,10 +56,9 @@ impl BlockEncoder {
|
||||
pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] {
|
||||
let compressed_size: usize = {
|
||||
let mut output: &mut [u8] = &mut self.output;
|
||||
let max = vals.iter()
|
||||
.cloned()
|
||||
.max()
|
||||
.expect("compress unsorted called with an empty array");
|
||||
let max = vals.iter().cloned().max().expect(
|
||||
"compress unsorted called with an empty array",
|
||||
);
|
||||
let num_bits = compute_num_bits(max);
|
||||
output.write_all(&[num_bits]).unwrap();
|
||||
let mut bit_packer = BitPacker::new(num_bits as usize);
|
||||
@@ -67,9 +66,9 @@ impl BlockEncoder {
|
||||
bit_packer.write(*val, &mut output).unwrap();
|
||||
}
|
||||
1 +
|
||||
bit_packer
|
||||
.close(&mut output)
|
||||
.expect("packing in memory should never fail")
|
||||
bit_packer.close(&mut output).expect(
|
||||
"packing in memory should never fail",
|
||||
)
|
||||
};
|
||||
&self.output[..compressed_size]
|
||||
}
|
||||
@@ -93,10 +92,11 @@ impl BlockDecoder {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn uncompress_block_sorted<'a>(&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
mut offset: u32)
|
||||
-> &'a [u8] {
|
||||
pub fn uncompress_block_sorted<'a>(
|
||||
&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
mut offset: u32,
|
||||
) -> &'a [u8] {
|
||||
let consumed_size = {
|
||||
let num_bits = compressed_data[0];
|
||||
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize);
|
||||
|
||||
@@ -8,10 +8,11 @@ mod simdcomp {
|
||||
extern "C" {
|
||||
pub fn compress_sorted(data: *const u32, output: *mut u8, offset: u32) -> size_t;
|
||||
|
||||
pub fn uncompress_sorted(compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
offset: u32)
|
||||
-> size_t;
|
||||
pub fn uncompress_sorted(
|
||||
compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
offset: u32,
|
||||
) -> size_t;
|
||||
|
||||
pub fn compress_unsorted(data: *const u32, output: *mut u8) -> size_t;
|
||||
|
||||
@@ -78,10 +79,7 @@ impl BlockDecoder {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn uncompress_block_sorted(&mut self,
|
||||
compressed_data: &[u8],
|
||||
offset: u32)
|
||||
-> usize {
|
||||
pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32) -> usize {
|
||||
let consumed_size = uncompress_sorted(compressed_data, &mut self.output, offset);
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
consumed_size
|
||||
|
||||
@@ -16,7 +16,6 @@ pub struct CompressedIntStream {
|
||||
}
|
||||
|
||||
impl CompressedIntStream {
|
||||
|
||||
/// Opens a compressed int stream.
|
||||
pub(crate) fn wrap(source: ReadOnlySource) -> CompressedIntStream {
|
||||
CompressedIntStream {
|
||||
@@ -35,17 +34,21 @@ impl CompressedIntStream {
|
||||
let available = COMPRESSION_BLOCK_SIZE - self.inner_offset;
|
||||
if num_els >= available {
|
||||
if available > 0 {
|
||||
let uncompressed_block = &self.block_decoder.output_array()[self.inner_offset..];
|
||||
let uncompressed_block = &self.block_decoder.output_array()
|
||||
[self.inner_offset..];
|
||||
&mut output[start..start + available].clone_from_slice(uncompressed_block);
|
||||
}
|
||||
num_els -= available;
|
||||
start += available;
|
||||
let num_consumed_bytes = self.block_decoder.uncompress_block_unsorted(self.buffer.as_ref());
|
||||
let num_consumed_bytes = self.block_decoder.uncompress_block_unsorted(
|
||||
self.buffer.as_ref(),
|
||||
);
|
||||
self.buffer.advance(num_consumed_bytes);
|
||||
self.inner_offset = 0;
|
||||
}
|
||||
else {
|
||||
let uncompressed_block = &self.block_decoder.output_array()[self.inner_offset..self.inner_offset + num_els];
|
||||
} else {
|
||||
let uncompressed_block = &self.block_decoder.output_array()[self.inner_offset..
|
||||
self.inner_offset +
|
||||
num_els];
|
||||
&output[start..start + num_els].clone_from_slice(uncompressed_block);
|
||||
self.inner_offset += num_els;
|
||||
break;
|
||||
@@ -62,8 +65,7 @@ impl CompressedIntStream {
|
||||
let available = COMPRESSION_BLOCK_SIZE - self.inner_offset;
|
||||
if available >= skip_len {
|
||||
self.inner_offset += skip_len;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
skip_len -= available;
|
||||
// entirely skip decompressing some blocks.
|
||||
while skip_len >= COMPRESSION_BLOCK_SIZE {
|
||||
@@ -72,7 +74,9 @@ impl CompressedIntStream {
|
||||
let block_len = compressed_block_size(num_bits);
|
||||
self.buffer.advance(block_len);
|
||||
}
|
||||
let num_consumed_bytes = self.block_decoder.uncompress_block_unsorted(self.buffer.as_ref());
|
||||
let num_consumed_bytes = self.block_decoder.uncompress_block_unsorted(
|
||||
self.buffer.as_ref(),
|
||||
);
|
||||
self.buffer.advance(num_consumed_bytes);
|
||||
self.inner_offset = skip_len;
|
||||
}
|
||||
@@ -90,7 +94,7 @@ pub mod tests {
|
||||
use directory::ReadOnlySource;
|
||||
|
||||
fn create_stream_buffer() -> ReadOnlySource {
|
||||
let mut buffer: Vec<u8> = vec!();
|
||||
let mut buffer: Vec<u8> = vec![];
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let vals: Vec<u32> = (0u32..1_025u32).collect();
|
||||
for chunk in vals.chunks(COMPRESSION_BLOCK_SIZE) {
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32) -> &'a [u8] {
|
||||
pub(crate) fn compress_sorted<'a>(
|
||||
input: &[u32],
|
||||
output: &'a mut [u8],
|
||||
mut offset: u32,
|
||||
) -> &'a [u8] {
|
||||
let mut byte_written = 0;
|
||||
for &v in input {
|
||||
let mut to_encode: u32 = v - offset;
|
||||
@@ -43,10 +47,11 @@ pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) fn uncompress_sorted<'a>(compressed_data: &'a [u8],
|
||||
output: &mut [u32],
|
||||
offset: u32)
|
||||
-> &'a [u8] {
|
||||
pub(crate) fn uncompress_sorted<'a>(
|
||||
compressed_data: &'a [u8],
|
||||
output: &mut [u32],
|
||||
offset: u32,
|
||||
) -> &'a [u8] {
|
||||
let mut read_byte = 0;
|
||||
let mut result = offset;
|
||||
let num_els = output.len();
|
||||
|
||||
@@ -4,24 +4,27 @@ mod streamvbyte {
|
||||
use libc::size_t;
|
||||
|
||||
extern "C" {
|
||||
pub fn streamvbyte_delta_encode(data: *const u32,
|
||||
num_els: u32,
|
||||
output: *mut u8,
|
||||
offset: u32)
|
||||
-> size_t;
|
||||
pub fn streamvbyte_delta_encode(
|
||||
data: *const u32,
|
||||
num_els: u32,
|
||||
output: *mut u8,
|
||||
offset: u32,
|
||||
) -> size_t;
|
||||
|
||||
pub fn streamvbyte_delta_decode(compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
num_els: u32,
|
||||
offset: u32)
|
||||
-> size_t;
|
||||
pub fn streamvbyte_delta_decode(
|
||||
compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
num_els: u32,
|
||||
offset: u32,
|
||||
) -> size_t;
|
||||
|
||||
pub fn streamvbyte_encode(data: *const u32, num_els: u32, output: *mut u8) -> size_t;
|
||||
|
||||
pub fn streamvbyte_decode(compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
num_els: usize)
|
||||
-> size_t;
|
||||
pub fn streamvbyte_decode(
|
||||
compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
num_els: usize,
|
||||
) -> size_t;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -29,10 +32,12 @@ mod streamvbyte {
|
||||
#[inline(always)]
|
||||
pub(crate) fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], offset: u32) -> &'a [u8] {
|
||||
let compress_length = unsafe {
|
||||
streamvbyte::streamvbyte_delta_encode(input.as_ptr(),
|
||||
input.len() as u32,
|
||||
output.as_mut_ptr(),
|
||||
offset)
|
||||
streamvbyte::streamvbyte_delta_encode(
|
||||
input.as_ptr(),
|
||||
input.len() as u32,
|
||||
output.as_mut_ptr(),
|
||||
offset,
|
||||
)
|
||||
};
|
||||
&output[..compress_length]
|
||||
}
|
||||
@@ -47,15 +52,18 @@ pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) fn uncompress_sorted<'a>(compressed_data: &'a [u8],
|
||||
output: &mut [u32],
|
||||
offset: u32)
|
||||
-> usize {
|
||||
pub(crate) fn uncompress_sorted<'a>(
|
||||
compressed_data: &'a [u8],
|
||||
output: &mut [u32],
|
||||
offset: u32,
|
||||
) -> usize {
|
||||
unsafe {
|
||||
streamvbyte::streamvbyte_delta_decode(compressed_data.as_ptr(),
|
||||
output.as_mut_ptr(),
|
||||
output.len() as u32,
|
||||
offset)
|
||||
streamvbyte::streamvbyte_delta_decode(
|
||||
compressed_data.as_ptr(),
|
||||
output.as_mut_ptr(),
|
||||
output.len() as u32,
|
||||
offset,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -48,9 +48,10 @@ impl Index {
|
||||
pub fn create_in_ram(schema: Schema) -> Index {
|
||||
let ram_directory = RAMDirectory::create();
|
||||
// unwrap is ok here
|
||||
let directory = ManagedDirectory::new(ram_directory)
|
||||
.expect("Creating a managed directory from a brand new RAM directory \
|
||||
should never fail.");
|
||||
let directory = ManagedDirectory::new(ram_directory).expect(
|
||||
"Creating a managed directory from a brand new RAM directory \
|
||||
should never fail.",
|
||||
);
|
||||
Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail")
|
||||
}
|
||||
|
||||
@@ -127,10 +128,11 @@ impl Index {
|
||||
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
|
||||
/// # Panics
|
||||
/// If the heap size per thread is too small, panics.
|
||||
pub fn writer_with_num_threads(&self,
|
||||
num_threads: usize,
|
||||
heap_size_in_bytes: usize)
|
||||
-> Result<IndexWriter> {
|
||||
pub fn writer_with_num_threads(
|
||||
&self,
|
||||
num_threads: usize,
|
||||
heap_size_in_bytes: usize,
|
||||
) -> Result<IndexWriter> {
|
||||
open_index_writer(self, num_threads, heap_size_in_bytes)
|
||||
}
|
||||
|
||||
@@ -155,10 +157,12 @@ impl Index {
|
||||
|
||||
/// Returns the list of segments that are searchable
|
||||
pub fn searchable_segments(&self) -> Result<Vec<Segment>> {
|
||||
Ok(self.searchable_segment_metas()?
|
||||
.into_iter()
|
||||
.map(|segment_meta| self.segment(segment_meta))
|
||||
.collect())
|
||||
Ok(
|
||||
self.searchable_segment_metas()?
|
||||
.into_iter()
|
||||
.map(|segment_meta| self.segment(segment_meta))
|
||||
.collect(),
|
||||
)
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
@@ -190,10 +194,12 @@ impl Index {
|
||||
|
||||
/// Returns the list of segment ids that are searchable.
|
||||
pub fn searchable_segment_ids(&self) -> Result<Vec<SegmentId>> {
|
||||
Ok(self.searchable_segment_metas()?
|
||||
.iter()
|
||||
.map(|segment_meta| segment_meta.id())
|
||||
.collect())
|
||||
Ok(
|
||||
self.searchable_segment_metas()?
|
||||
.iter()
|
||||
.map(|segment_meta| segment_meta.id())
|
||||
.collect(),
|
||||
)
|
||||
}
|
||||
|
||||
/// Creates a new generation of searchers after
|
||||
@@ -203,10 +209,12 @@ impl Index {
|
||||
/// published or after a merge.
|
||||
pub fn load_searchers(&self) -> Result<()> {
|
||||
let searchable_segments = self.searchable_segments()?;
|
||||
let segment_readers: Vec<SegmentReader> = try!(searchable_segments
|
||||
.into_iter()
|
||||
.map(SegmentReader::open)
|
||||
.collect());
|
||||
let segment_readers: Vec<SegmentReader> = try!(
|
||||
searchable_segments
|
||||
.into_iter()
|
||||
.map(SegmentReader::open)
|
||||
.collect()
|
||||
);
|
||||
let searchers = (0..NUM_SEARCHERS)
|
||||
.map(|_| Searcher::from(segment_readers.clone()))
|
||||
.collect();
|
||||
|
||||
@@ -9,7 +9,7 @@ use core::SegmentMeta;
|
||||
/// * the index docstamp
|
||||
/// * the schema
|
||||
///
|
||||
#[derive(Clone,Debug,Serialize, Deserialize)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct IndexMeta {
|
||||
pub segments: Vec<SegmentMeta>,
|
||||
pub schema: Schema,
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use directory::{SourceRead, ReadOnlySource};
|
||||
use termdict::{TermDictionary, TermDictionaryImpl};
|
||||
use std::io;
|
||||
use postings::{SegmentPostings, BlockSegmentPostings};
|
||||
use postings::TermInfo;
|
||||
use postings::SegmentPostingsOption;
|
||||
@@ -33,22 +32,21 @@ pub struct InvertedIndexReader {
|
||||
}
|
||||
|
||||
impl InvertedIndexReader {
|
||||
|
||||
pub(crate) fn new(
|
||||
termdict_source: ReadOnlySource,
|
||||
postings_source: ReadOnlySource,
|
||||
positions_source: ReadOnlySource,
|
||||
delete_bitset: DeleteBitSet,
|
||||
schema: Schema,
|
||||
) -> io::Result<InvertedIndexReader> {
|
||||
) -> InvertedIndexReader {
|
||||
|
||||
Ok(InvertedIndexReader {
|
||||
termdict: TermDictionaryImpl::from_source(termdict_source)?,
|
||||
InvertedIndexReader {
|
||||
termdict: TermDictionaryImpl::from_source(termdict_source),
|
||||
postings_source: postings_source,
|
||||
positions_source: positions_source,
|
||||
delete_bitset: delete_bitset,
|
||||
schema: schema,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the term info associated with the term.
|
||||
@@ -72,9 +70,11 @@ impl InvertedIndexReader {
|
||||
/// # Warning
|
||||
///
|
||||
/// This does not reset the positions list.
|
||||
pub fn reset_block_postings_from_terminfo(&self,
|
||||
term_info: &TermInfo,
|
||||
block_postings: &mut BlockSegmentPostings) {
|
||||
pub fn reset_block_postings_from_terminfo(
|
||||
&self,
|
||||
term_info: &TermInfo,
|
||||
block_postings: &mut BlockSegmentPostings,
|
||||
) {
|
||||
let offset = term_info.postings_offset as usize;
|
||||
let end_source = self.postings_source.len();
|
||||
let postings_slice = self.postings_source.slice(offset, end_source);
|
||||
@@ -88,27 +88,30 @@ impl InvertedIndexReader {
|
||||
/// This method is for advanced usage only.
|
||||
///
|
||||
/// Most users should prefer using `read_postings` instead.
|
||||
pub fn read_block_postings_from_terminfo(&self,
|
||||
term_info: &TermInfo,
|
||||
option: SegmentPostingsOption)
|
||||
-> BlockSegmentPostings {
|
||||
pub fn read_block_postings_from_terminfo(
|
||||
&self,
|
||||
term_info: &TermInfo,
|
||||
option: SegmentPostingsOption,
|
||||
) -> BlockSegmentPostings {
|
||||
let offset = term_info.postings_offset as usize;
|
||||
let postings_data = self.postings_source.slice_from(offset);
|
||||
let has_freq = option.has_freq();
|
||||
BlockSegmentPostings::from_data(
|
||||
term_info.doc_freq as usize,
|
||||
SourceRead::from(postings_data),
|
||||
has_freq)
|
||||
has_freq,
|
||||
)
|
||||
}
|
||||
|
||||
/// Returns a posting object given a `term_info`.
|
||||
/// This method is for advanced usage only.
|
||||
///
|
||||
/// Most users should prefer using `read_postings` instead.
|
||||
pub fn read_postings_from_terminfo(&self,
|
||||
term_info: &TermInfo,
|
||||
option: SegmentPostingsOption)
|
||||
-> SegmentPostings {
|
||||
pub fn read_postings_from_terminfo(
|
||||
&self,
|
||||
term_info: &TermInfo,
|
||||
option: SegmentPostingsOption,
|
||||
) -> SegmentPostings {
|
||||
let block_postings = self.read_block_postings_from_terminfo(term_info, option);
|
||||
let delete_bitset = self.delete_bitset.clone();
|
||||
let position_stream = {
|
||||
@@ -118,16 +121,11 @@ impl InvertedIndexReader {
|
||||
let mut stream = CompressedIntStream::wrap(positions_source);
|
||||
stream.skip(term_info.positions_inner_offset as usize);
|
||||
Some(stream)
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
SegmentPostings::from_block_postings(
|
||||
block_postings,
|
||||
delete_bitset,
|
||||
position_stream
|
||||
)
|
||||
SegmentPostings::from_block_postings(block_postings, delete_bitset, position_stream)
|
||||
}
|
||||
|
||||
/// Returns the segment postings associated with the term, and with the given option,
|
||||
@@ -140,16 +138,20 @@ impl InvertedIndexReader {
|
||||
/// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a
|
||||
/// `TextIndexingOptions` that does not index position will return a `SegmentPostings`
|
||||
/// with `DocId`s and frequencies.
|
||||
pub fn read_postings(&self,
|
||||
term: &Term,
|
||||
option: SegmentPostingsOption)
|
||||
-> Option<SegmentPostings> {
|
||||
pub fn read_postings(
|
||||
&self,
|
||||
term: &Term,
|
||||
option: SegmentPostingsOption,
|
||||
) -> Option<SegmentPostings> {
|
||||
let field = term.field();
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let term_info = get!(self.get_term_info(term));
|
||||
let maximum_option = get!(field_entry.field_type().get_segment_postings_option());
|
||||
let best_effort_option = cmp::min(maximum_option, option);
|
||||
Some(self.read_postings_from_terminfo(&term_info, best_effort_option))
|
||||
Some(self.read_postings_from_terminfo(
|
||||
&term_info,
|
||||
best_effort_option,
|
||||
))
|
||||
}
|
||||
|
||||
/// Returns the number of documents containing the term.
|
||||
|
||||
@@ -76,8 +76,11 @@ impl<T> Pool<T> {
|
||||
if former_generation >= generation {
|
||||
break;
|
||||
}
|
||||
self.freshest_generation
|
||||
.compare_and_swap(former_generation, generation, Ordering::SeqCst);
|
||||
self.freshest_generation.compare_and_swap(
|
||||
former_generation,
|
||||
generation,
|
||||
Ordering::SeqCst,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -91,9 +94,9 @@ impl<T> Pool<T> {
|
||||
let gen_item = self.queue.pop();
|
||||
if gen_item.generation >= generation {
|
||||
return LeasedItem {
|
||||
gen_item: Some(gen_item),
|
||||
recycle_queue: self.queue.clone(),
|
||||
};
|
||||
gen_item: Some(gen_item),
|
||||
recycle_queue: self.queue.clone(),
|
||||
};
|
||||
} else {
|
||||
// this searcher is obsolete,
|
||||
// removing it from the pool.
|
||||
@@ -113,25 +116,26 @@ impl<T> Deref for LeasedItem<T> {
|
||||
|
||||
fn deref(&self) -> &T {
|
||||
&self.gen_item
|
||||
.as_ref()
|
||||
.expect("Unwrapping a leased item should never fail")
|
||||
.item // unwrap is safe here
|
||||
.as_ref()
|
||||
.expect("Unwrapping a leased item should never fail")
|
||||
.item // unwrap is safe here
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> DerefMut for LeasedItem<T> {
|
||||
fn deref_mut(&mut self) -> &mut T {
|
||||
&mut self.gen_item
|
||||
.as_mut()
|
||||
.expect("Unwrapping a mut leased item should never fail")
|
||||
.item // unwrap is safe here
|
||||
.as_mut()
|
||||
.expect("Unwrapping a mut leased item should never fail")
|
||||
.item // unwrap is safe here
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Drop for LeasedItem<T> {
|
||||
fn drop(&mut self) {
|
||||
let gen_item: GenerationItem<T> = mem::replace(&mut self.gen_item, None)
|
||||
.expect("Unwrapping a leased item should never fail");
|
||||
let gen_item: GenerationItem<T> = mem::replace(&mut self.gen_item, None).expect(
|
||||
"Unwrapping a leased item should never fail",
|
||||
);
|
||||
self.recycle_queue.push(gen_item);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -47,10 +47,7 @@ impl Searcher {
|
||||
self.segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| {
|
||||
segment_reader
|
||||
.inverted_index(term.field())
|
||||
.unwrap() // TODO error handling
|
||||
.doc_freq(term)
|
||||
segment_reader.inverted_index(term.field()).doc_freq(term)
|
||||
})
|
||||
.fold(0u32, |acc, val| acc + val)
|
||||
}
|
||||
@@ -70,16 +67,13 @@ impl Searcher {
|
||||
query.search(self, collector)
|
||||
}
|
||||
|
||||
|
||||
///
|
||||
pub fn field(&self, field: Field) -> Result<FieldSearcher> {
|
||||
/// Return the field searcher associated to a `Field`.
|
||||
pub fn field(&self, field: Field) -> FieldSearcher {
|
||||
let inv_index_readers = self.segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| {
|
||||
segment_reader.inverted_index(field)
|
||||
})
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
Ok(FieldSearcher::new(inv_index_readers))
|
||||
.map(|segment_reader| segment_reader.inverted_index(field))
|
||||
.collect::<Vec<_>>();
|
||||
FieldSearcher::new(inv_index_readers)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -92,11 +86,8 @@ pub struct FieldSearcher {
|
||||
|
||||
|
||||
impl FieldSearcher {
|
||||
|
||||
fn new(inv_index_readers: Vec<Arc<InvertedIndexReader>>) -> FieldSearcher {
|
||||
FieldSearcher {
|
||||
inv_index_readers: inv_index_readers,
|
||||
}
|
||||
FieldSearcher { inv_index_readers: inv_index_readers }
|
||||
}
|
||||
|
||||
|
||||
@@ -105,9 +96,7 @@ impl FieldSearcher {
|
||||
pub fn terms(&self) -> TermMerger {
|
||||
let term_streamers: Vec<_> = self.inv_index_readers
|
||||
.iter()
|
||||
.map(|inverted_index| {
|
||||
inverted_index.terms().stream()
|
||||
})
|
||||
.map(|inverted_index| inverted_index.terms().stream())
|
||||
.collect();
|
||||
TermMerger::new(term_streamers)
|
||||
}
|
||||
|
||||
@@ -76,18 +76,20 @@ impl Segment {
|
||||
}
|
||||
|
||||
/// Open one of the component files for a *regular* read.
|
||||
pub fn open_read(&self,
|
||||
component: SegmentComponent)
|
||||
-> result::Result<ReadOnlySource, OpenReadError> {
|
||||
pub fn open_read(
|
||||
&self,
|
||||
component: SegmentComponent,
|
||||
) -> result::Result<ReadOnlySource, OpenReadError> {
|
||||
let path = self.relative_path(component);
|
||||
let source = try!(self.index.directory().open_read(&path));
|
||||
Ok(source)
|
||||
}
|
||||
|
||||
/// Open one of the component files for a *regular* write.
|
||||
pub fn open_write(&mut self,
|
||||
component: SegmentComponent)
|
||||
-> result::Result<WritePtr, OpenWriteError> {
|
||||
pub fn open_write(
|
||||
&mut self,
|
||||
component: SegmentComponent,
|
||||
) -> result::Result<WritePtr, OpenWriteError> {
|
||||
let path = self.relative_path(component);
|
||||
let write = try!(self.index.directory_mut().open_write(&path));
|
||||
Ok(write)
|
||||
@@ -125,11 +127,11 @@ mod tests {
|
||||
{
|
||||
let _file_protection = segment.protect_from_delete(SegmentComponent::POSTINGS);
|
||||
assert!(directory.exists(&*path));
|
||||
directory.garbage_collect(|| { living_files.clone() });
|
||||
directory.garbage_collect(|| living_files.clone());
|
||||
assert!(directory.exists(&*path));
|
||||
}
|
||||
|
||||
directory.garbage_collect(|| { living_files });
|
||||
directory.garbage_collect(|| living_files);
|
||||
assert!(!directory.exists(&*path));
|
||||
}
|
||||
|
||||
|
||||
@@ -28,13 +28,15 @@ pub enum SegmentComponent {
|
||||
impl SegmentComponent {
|
||||
/// Iterates through the components.
|
||||
pub fn iterator() -> impl Iterator<Item = &'static SegmentComponent> {
|
||||
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [SegmentComponent::POSTINGS,
|
||||
SegmentComponent::POSITIONS,
|
||||
SegmentComponent::FASTFIELDS,
|
||||
SegmentComponent::FIELDNORMS,
|
||||
SegmentComponent::TERMS,
|
||||
SegmentComponent::STORE,
|
||||
SegmentComponent::DELETE];
|
||||
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [
|
||||
SegmentComponent::POSTINGS,
|
||||
SegmentComponent::POSITIONS,
|
||||
SegmentComponent::FASTFIELDS,
|
||||
SegmentComponent::FIELDNORMS,
|
||||
SegmentComponent::TERMS,
|
||||
SegmentComponent::STORE,
|
||||
SegmentComponent::DELETE,
|
||||
];
|
||||
SEGMENT_COMPONENTS.into_iter()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -64,16 +64,14 @@ impl SegmentMeta {
|
||||
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
|
||||
let mut path = self.id().uuid_string();
|
||||
path.push_str(&*match component {
|
||||
SegmentComponent::POSITIONS => ".pos".to_string(),
|
||||
SegmentComponent::POSTINGS => ".idx".to_string(),
|
||||
SegmentComponent::TERMS => ".term".to_string(),
|
||||
SegmentComponent::STORE => ".store".to_string(),
|
||||
SegmentComponent::FASTFIELDS => ".fast".to_string(),
|
||||
SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
|
||||
SegmentComponent::DELETE => {
|
||||
format!(".{}.del", self.delete_opstamp().unwrap_or(0))
|
||||
}
|
||||
});
|
||||
SegmentComponent::POSITIONS => ".pos".to_string(),
|
||||
SegmentComponent::POSTINGS => ".idx".to_string(),
|
||||
SegmentComponent::TERMS => ".term".to_string(),
|
||||
SegmentComponent::STORE => ".store".to_string(),
|
||||
SegmentComponent::FASTFIELDS => ".fast".to_string(),
|
||||
SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
|
||||
SegmentComponent::DELETE => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
|
||||
});
|
||||
PathBuf::from(path)
|
||||
}
|
||||
|
||||
@@ -111,8 +109,8 @@ impl SegmentMeta {
|
||||
#[doc(hidden)]
|
||||
pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) {
|
||||
self.deletes = Some(DeleteMeta {
|
||||
num_deleted_docs: num_deleted_docs,
|
||||
opstamp: opstamp,
|
||||
});
|
||||
num_deleted_docs: num_deleted_docs,
|
||||
opstamp: opstamp,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,6 @@ use core::SegmentId;
|
||||
use core::SegmentComponent;
|
||||
use std::sync::RwLock;
|
||||
use common::HasLen;
|
||||
use error::ErrorKind;
|
||||
use core::SegmentMeta;
|
||||
use fastfield::{self, FastFieldNotAvailableError};
|
||||
use fastfield::DeleteBitSet;
|
||||
@@ -87,17 +86,17 @@ impl SegmentReader {
|
||||
///
|
||||
/// # Panics
|
||||
/// May panic if the index is corrupted.
|
||||
pub fn get_fast_field_reader<TFastFieldReader: FastFieldReader>
|
||||
(&self, field: Field) -> fastfield::Result<TFastFieldReader> {
|
||||
pub fn get_fast_field_reader<TFastFieldReader: FastFieldReader>(
|
||||
&self,
|
||||
field: Field,
|
||||
) -> fastfield::Result<TFastFieldReader> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if !TFastFieldReader::is_enabled(field_entry.field_type()) {
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
} else {
|
||||
self.fast_fields_composite
|
||||
.open_read(field)
|
||||
.ok_or_else(|| {
|
||||
FastFieldNotAvailableError::new(field_entry)
|
||||
})
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(TFastFieldReader::open)
|
||||
}
|
||||
}
|
||||
@@ -111,9 +110,9 @@ impl SegmentReader {
|
||||
/// They are simply stored as a fast field, serialized in
|
||||
/// the `.fieldnorm` file of the segment.
|
||||
pub fn get_fieldnorms_reader(&self, field: Field) -> Option<U64FastFieldReader> {
|
||||
self.fieldnorms_composite
|
||||
.open_read(field)
|
||||
.map(U64FastFieldReader::open)
|
||||
self.fieldnorms_composite.open_read(field).map(
|
||||
U64FastFieldReader::open,
|
||||
)
|
||||
}
|
||||
|
||||
/// Accessor to the segment's `StoreReader`.
|
||||
@@ -131,13 +130,12 @@ impl SegmentReader {
|
||||
let store_reader = StoreReader::from_source(store_source);
|
||||
|
||||
let postings_source = segment.open_read(SegmentComponent::POSTINGS)?;
|
||||
let postings_composite = CompositeFile::open(postings_source)?;
|
||||
let postings_composite = CompositeFile::open(postings_source)?;
|
||||
|
||||
let positions_composite = {
|
||||
if let Ok(source) = segment.open_read(SegmentComponent::POSITIONS) {
|
||||
CompositeFile::open(source)?
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
CompositeFile::empty()
|
||||
}
|
||||
};
|
||||
@@ -159,17 +157,17 @@ impl SegmentReader {
|
||||
|
||||
let schema = segment.schema();
|
||||
Ok(SegmentReader {
|
||||
inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
|
||||
segment_meta: segment.meta().clone(),
|
||||
termdict_composite: termdict_composite,
|
||||
postings_composite: postings_composite,
|
||||
fast_fields_composite: fast_fields_composite,
|
||||
fieldnorms_composite: fieldnorms_composite,
|
||||
segment_id: segment.id(),
|
||||
store_reader: store_reader,
|
||||
delete_bitset: delete_bitset,
|
||||
positions_composite: positions_composite,
|
||||
schema: schema,
|
||||
inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
|
||||
segment_meta: segment.meta().clone(),
|
||||
termdict_composite: termdict_composite,
|
||||
postings_composite: postings_composite,
|
||||
fast_fields_composite: fast_fields_composite,
|
||||
fieldnorms_composite: fieldnorms_composite,
|
||||
segment_id: segment.id(),
|
||||
store_reader: store_reader,
|
||||
delete_bitset: delete_bitset,
|
||||
positions_composite: positions_composite,
|
||||
schema: schema,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -179,32 +177,27 @@ impl SegmentReader {
|
||||
/// The field reader is in charge of iterating through the
|
||||
/// term dictionary associated to a specific field,
|
||||
/// and opening the posting list associated to any term.
|
||||
pub fn inverted_index(&self, field: Field) -> Result<Arc<InvertedIndexReader>> {
|
||||
if let Some(inv_idx_reader) = self.inv_idx_reader_cache.read()
|
||||
.expect("Lock poisoned. This should never happen")
|
||||
.get(&field) {
|
||||
return Ok(inv_idx_reader.clone());
|
||||
pub fn inverted_index(&self, field: Field) -> Arc<InvertedIndexReader> {
|
||||
if let Some(inv_idx_reader) =
|
||||
self.inv_idx_reader_cache
|
||||
.read()
|
||||
.expect("Lock poisoned. This should never happen")
|
||||
.get(&field)
|
||||
{
|
||||
inv_idx_reader.clone();
|
||||
}
|
||||
|
||||
let termdict_source: ReadOnlySource = self.termdict_composite
|
||||
.open_read(field)
|
||||
.ok_or_else(|| {
|
||||
ErrorKind::SchemaError(
|
||||
format!("Could not find {:?} term dictionary", field)
|
||||
)
|
||||
})?;
|
||||
let termdict_source: ReadOnlySource = self.termdict_composite.open_read(field).expect(
|
||||
"Index corrupted. Failed to open field term dictionary in composite file.",
|
||||
);
|
||||
|
||||
let postings_source = self.postings_composite
|
||||
.open_read(field)
|
||||
.ok_or_else(|| {
|
||||
ErrorKind::SchemaError(format!("Could not find {:?} postings", field))
|
||||
})?;
|
||||
let postings_source = self.postings_composite.open_read(field).expect(
|
||||
"Index corrupted. Failed to open field postings in composite file.",
|
||||
);
|
||||
|
||||
let positions_source = self.positions_composite
|
||||
.open_read(field)
|
||||
.ok_or_else(|| {
|
||||
ErrorKind::SchemaError(format!("Could not find {:?} positions", field))
|
||||
})?;
|
||||
let positions_source = self.positions_composite.open_read(field).expect(
|
||||
"Index corrupted. Failed to open field positions in composite file.",
|
||||
);
|
||||
|
||||
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
|
||||
termdict_source,
|
||||
@@ -212,15 +205,18 @@ impl SegmentReader {
|
||||
positions_source,
|
||||
self.delete_bitset.clone(),
|
||||
self.schema.clone(),
|
||||
)?);
|
||||
));
|
||||
|
||||
// by releasing the lock in between, we may end up opening the inverted index
|
||||
// twice, but this is fine.
|
||||
self.inv_idx_reader_cache
|
||||
.write()
|
||||
.expect("Field reader cache lock poisoned. This should never happen.")
|
||||
.expect(
|
||||
"Field reader cache lock poisoned. This should never happen.",
|
||||
)
|
||||
.insert(field, inv_idx_reader.clone());
|
||||
Ok(inv_idx_reader)
|
||||
|
||||
inv_idx_reader
|
||||
}
|
||||
|
||||
/// Returns the document (or to be accurate, its stored field)
|
||||
|
||||
@@ -39,11 +39,11 @@ impl<T: BinarySerializable> LayerBuilder<T> {
|
||||
doc_id.serialize(&mut self.buffer)?;
|
||||
value.serialize(&mut self.buffer)?;
|
||||
Ok(if self.remaining == 0 {
|
||||
self.remaining = self.period;
|
||||
Some((doc_id, offset))
|
||||
} else {
|
||||
None
|
||||
})
|
||||
self.remaining = self.period;
|
||||
Some((doc_id, offset))
|
||||
} else {
|
||||
None
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,8 +78,10 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
|
||||
loop {
|
||||
skip_pointer = match skip_pointer {
|
||||
Some((skip_doc_id, skip_offset)) => {
|
||||
try!(self.get_skip_layer(layer_id)
|
||||
.insert(skip_doc_id, &skip_offset))
|
||||
try!(self.get_skip_layer(layer_id).insert(
|
||||
skip_doc_id,
|
||||
&skip_offset,
|
||||
))
|
||||
}
|
||||
None => {
|
||||
return Ok(());
|
||||
|
||||
@@ -68,9 +68,14 @@ pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
|
||||
};
|
||||
let table_num_bits: usize = (1..)
|
||||
.into_iter()
|
||||
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
|
||||
.take_while(|num_bits: &usize| {
|
||||
compute_table_size(*num_bits) < table_size_limit
|
||||
})
|
||||
.last()
|
||||
.expect(&format!("Per thread memory is too small: {}", per_thread_memory_budget));
|
||||
.expect(&format!(
|
||||
"Per thread memory is too small: {}",
|
||||
per_thread_memory_budget
|
||||
));
|
||||
let table_size = compute_table_size(table_num_bits);
|
||||
let heap_size = per_thread_memory_budget - table_size;
|
||||
(heap_size, table_num_bits)
|
||||
@@ -174,13 +179,10 @@ impl<'a> HashMap<'a> {
|
||||
}
|
||||
|
||||
pub fn iter<'b: 'a>(&'b self) -> impl Iterator<Item = (&'a [u8], u32)> + 'b {
|
||||
self.occupied
|
||||
.iter()
|
||||
.cloned()
|
||||
.map(move |bucket: usize| {
|
||||
let kv = self.table[bucket];
|
||||
self.get_key_value(kv.key_value_addr)
|
||||
})
|
||||
self.occupied.iter().cloned().map(move |bucket: usize| {
|
||||
let kv = self.table[bucket];
|
||||
self.get_key_value(kv.key_value_addr)
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -282,8 +284,10 @@ mod tests {
|
||||
let s1 = "abcdef";
|
||||
let s2 = "abcdeg";
|
||||
for i in 0..5 {
|
||||
assert_eq!(murmurhash2(&s1[i..5].as_bytes()),
|
||||
murmurhash2(&s2[i..5].as_bytes()));
|
||||
assert_eq!(
|
||||
murmurhash2(&s1[i..5].as_bytes()),
|
||||
murmurhash2(&s2[i..5].as_bytes())
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -303,13 +307,13 @@ mod tests {
|
||||
let keys: Vec<&'static str> =
|
||||
vec!["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
|
||||
b.iter(|| {
|
||||
keys.iter()
|
||||
.map(|&s| s.as_bytes())
|
||||
.map(murmurhash2::murmurhash2)
|
||||
.map(|h| h as u64)
|
||||
.last()
|
||||
.unwrap()
|
||||
});
|
||||
keys.iter()
|
||||
.map(|&s| s.as_bytes())
|
||||
.map(murmurhash2::murmurhash2)
|
||||
.map(|h| h as u64)
|
||||
.last()
|
||||
.unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -144,7 +144,8 @@ impl InnerHeap {
|
||||
addr
|
||||
} else {
|
||||
if self.next_heap.is_none() {
|
||||
info!(r#"Exceeded heap size. The segment will be committed right after indexing this document."#,);
|
||||
info!(r#"Exceeded heap size. The segment will be committed right
|
||||
after indexing this document."#,);
|
||||
self.next_heap = Some(Box::new(InnerHeap::with_capacity(self.buffer_len as usize)));
|
||||
}
|
||||
self.next_heap.as_mut().unwrap().allocate_space(num_bytes) + self.buffer_len
|
||||
@@ -154,10 +155,9 @@ impl InnerHeap {
|
||||
fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
|
||||
let start = bytes_ref.0;
|
||||
if start >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.get_slice(BytesRef(start - self.buffer_len))
|
||||
self.next_heap.as_ref().unwrap().get_slice(BytesRef(
|
||||
start - self.buffer_len,
|
||||
))
|
||||
} else {
|
||||
let start = start as usize;
|
||||
let len = NativeEndian::read_u16(&self.buffer[start..start + 2]) as usize;
|
||||
@@ -167,10 +167,10 @@ impl InnerHeap {
|
||||
|
||||
fn get_mut_slice(&mut self, start: u32, stop: u32) -> &mut [u8] {
|
||||
if start >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.get_mut_slice(start - self.buffer_len, stop - self.buffer_len)
|
||||
self.next_heap.as_mut().unwrap().get_mut_slice(
|
||||
start - self.buffer_len,
|
||||
stop - self.buffer_len,
|
||||
)
|
||||
} else {
|
||||
&mut self.buffer[start as usize..stop as usize]
|
||||
}
|
||||
@@ -188,10 +188,9 @@ impl InnerHeap {
|
||||
|
||||
fn get_mut(&mut self, addr: u32) -> *mut u8 {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.get_mut(addr - self.buffer_len)
|
||||
self.next_heap.as_mut().unwrap().get_mut(
|
||||
addr - self.buffer_len,
|
||||
)
|
||||
} else {
|
||||
let addr_isize = addr as isize;
|
||||
unsafe { self.buffer.as_mut_ptr().offset(addr_isize) }
|
||||
@@ -200,10 +199,9 @@ impl InnerHeap {
|
||||
|
||||
fn get_mut_ref<Item>(&mut self, addr: u32) -> &mut Item {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.get_mut_ref(addr - self.buffer_len)
|
||||
self.next_heap.as_mut().unwrap().get_mut_ref(
|
||||
addr - self.buffer_len,
|
||||
)
|
||||
} else {
|
||||
let v_ptr_u8 = self.get_mut(addr) as *mut u8;
|
||||
let v_ptr = v_ptr_u8 as *mut Item;
|
||||
@@ -213,10 +211,10 @@ impl InnerHeap {
|
||||
|
||||
pub fn set<Item>(&mut self, addr: u32, val: &Item) {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.set(addr - self.buffer_len, val);
|
||||
self.next_heap.as_mut().unwrap().set(
|
||||
addr - self.buffer_len,
|
||||
val,
|
||||
);
|
||||
} else {
|
||||
let v_ptr: *const Item = val as *const Item;
|
||||
let v_ptr_u8: *const u8 = v_ptr as *const u8;
|
||||
|
||||
@@ -103,9 +103,11 @@ impl fmt::Display for OpenWriteError {
|
||||
write!(f, "the file '{:?}' already exists", path)
|
||||
}
|
||||
OpenWriteError::IOError(ref err) => {
|
||||
write!(f,
|
||||
"an io error occurred while opening a file for writing: '{}'",
|
||||
err)
|
||||
write!(
|
||||
f,
|
||||
"an io error occurred while opening a file for writing: '{}'",
|
||||
err
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -147,9 +149,11 @@ impl fmt::Display for OpenReadError {
|
||||
write!(f, "the file '{:?}' does not exist", path)
|
||||
}
|
||||
OpenReadError::IOError(ref err) => {
|
||||
write!(f,
|
||||
"an io error occurred while opening a file for reading: '{}'",
|
||||
err)
|
||||
write!(
|
||||
f,
|
||||
"an io error occurred while opening a file for reading: '{}'",
|
||||
err
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -45,10 +45,9 @@ pub struct FileProtection {
|
||||
}
|
||||
|
||||
fn unprotect_file_from_delete(directory: &ManagedDirectory, path: &Path) {
|
||||
let mut meta_informations_wlock = directory
|
||||
.meta_informations
|
||||
.write()
|
||||
.expect("Managed file lock poisoned");
|
||||
let mut meta_informations_wlock = directory.meta_informations.write().expect(
|
||||
"Managed file lock poisoned",
|
||||
);
|
||||
if let Some(counter_ref_mut) = meta_informations_wlock.protected_files.get_mut(path) {
|
||||
(*counter_ref_mut) -= 1;
|
||||
}
|
||||
@@ -68,9 +67,10 @@ impl Drop for FileProtection {
|
||||
|
||||
/// Saves the file containing the list of existing files
|
||||
/// that were created by tantivy.
|
||||
fn save_managed_paths(directory: &mut Directory,
|
||||
wlock: &RwLockWriteGuard<MetaInformation>)
|
||||
-> io::Result<()> {
|
||||
fn save_managed_paths(
|
||||
directory: &mut Directory,
|
||||
wlock: &RwLockWriteGuard<MetaInformation>,
|
||||
) -> io::Result<()> {
|
||||
let mut w = serde_json::to_vec(&wlock.managed_paths)?;
|
||||
write!(&mut w, "\n")?;
|
||||
directory.atomic_write(&MANAGED_FILEPATH, &w[..])?;
|
||||
@@ -84,22 +84,22 @@ impl ManagedDirectory {
|
||||
Ok(data) => {
|
||||
let managed_files_json = String::from_utf8_lossy(&data);
|
||||
let managed_files: HashSet<PathBuf> =
|
||||
serde_json::from_str(&managed_files_json)
|
||||
.chain_err(|| ErrorKind::CorruptedFile(MANAGED_FILEPATH.clone()))?;
|
||||
serde_json::from_str(&managed_files_json).chain_err(|| {
|
||||
ErrorKind::CorruptedFile(MANAGED_FILEPATH.clone())
|
||||
})?;
|
||||
Ok(ManagedDirectory {
|
||||
directory: box directory,
|
||||
meta_informations: Arc::new(RwLock::new(MetaInformation {
|
||||
managed_paths: managed_files,
|
||||
protected_files:
|
||||
HashMap::default(),
|
||||
})),
|
||||
})
|
||||
directory: box directory,
|
||||
meta_informations: Arc::new(RwLock::new(MetaInformation {
|
||||
managed_paths: managed_files,
|
||||
protected_files: HashMap::default(),
|
||||
})),
|
||||
})
|
||||
}
|
||||
Err(OpenReadError::FileDoesNotExist(_)) => {
|
||||
Ok(ManagedDirectory {
|
||||
directory: box directory,
|
||||
meta_informations: Arc::default(),
|
||||
})
|
||||
directory: box directory,
|
||||
meta_informations: Arc::default(),
|
||||
})
|
||||
}
|
||||
Err(OpenReadError::IOError(e)) => Err(From::from(e)),
|
||||
}
|
||||
@@ -116,15 +116,14 @@ impl ManagedDirectory {
|
||||
/// If a file cannot be deleted (for permission reasons for instance)
|
||||
/// an error is simply logged, and the file remains in the list of managed
|
||||
/// files.
|
||||
pub fn garbage_collect<L: FnOnce()-> HashSet<PathBuf> >(&mut self, get_living_files: L) {
|
||||
pub fn garbage_collect<L: FnOnce() -> HashSet<PathBuf>>(&mut self, get_living_files: L) {
|
||||
info!("Garbage collect");
|
||||
let mut files_to_delete = vec![];
|
||||
{
|
||||
// releasing the lock as .delete() will use it too.
|
||||
let meta_informations_rlock =
|
||||
self.meta_informations
|
||||
.read()
|
||||
.expect("Managed directory rlock poisoned in garbage collect.");
|
||||
let meta_informations_rlock = self.meta_informations.read().expect(
|
||||
"Managed directory rlock poisoned in garbage collect.",
|
||||
);
|
||||
|
||||
// It is crucial to get the living files after acquiring the
|
||||
// read lock of meta informations. That way, we
|
||||
@@ -177,9 +176,9 @@ impl ManagedDirectory {
|
||||
if !deleted_files.is_empty() {
|
||||
// update the list of managed files by removing
|
||||
// the files that were removed.
|
||||
let mut meta_informations_wlock = self.meta_informations
|
||||
.write()
|
||||
.expect("Managed directory wlock poisoned (2).");
|
||||
let mut meta_informations_wlock = self.meta_informations.write().expect(
|
||||
"Managed directory wlock poisoned (2).",
|
||||
);
|
||||
{
|
||||
let managed_paths_write = &mut meta_informations_wlock.managed_paths;
|
||||
for delete_file in &deleted_files {
|
||||
@@ -202,13 +201,13 @@ impl ManagedDirectory {
|
||||
pub fn protect_file_from_delete(&self, path: &Path) -> FileProtection {
|
||||
let pathbuf = path.to_owned();
|
||||
{
|
||||
let mut meta_informations_wlock = self.meta_informations
|
||||
.write()
|
||||
.expect("Managed file lock poisoned on protect");
|
||||
let mut meta_informations_wlock = self.meta_informations.write().expect(
|
||||
"Managed file lock poisoned on protect",
|
||||
);
|
||||
*meta_informations_wlock
|
||||
.protected_files
|
||||
.entry(pathbuf.clone())
|
||||
.or_insert(0) += 1;
|
||||
.protected_files
|
||||
.entry(pathbuf.clone())
|
||||
.or_insert(0) += 1;
|
||||
}
|
||||
FileProtection {
|
||||
directory: self.clone(),
|
||||
@@ -224,9 +223,9 @@ impl ManagedDirectory {
|
||||
/// will not lead to garbage files that will
|
||||
/// never get removed.
|
||||
fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> {
|
||||
let mut meta_wlock = self.meta_informations
|
||||
.write()
|
||||
.expect("Managed file lock poisoned");
|
||||
let mut meta_wlock = self.meta_informations.write().expect(
|
||||
"Managed file lock poisoned",
|
||||
);
|
||||
let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned());
|
||||
if has_changed {
|
||||
save_managed_paths(self.directory.as_mut(), &meta_wlock)?;
|
||||
@@ -241,8 +240,9 @@ impl Directory for ManagedDirectory {
|
||||
}
|
||||
|
||||
fn open_write(&mut self, path: &Path) -> result::Result<WritePtr, OpenWriteError> {
|
||||
self.register_file_as_managed(path)
|
||||
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
|
||||
self.register_file_as_managed(path).map_err(|e| {
|
||||
IOError::with_path(path.to_owned(), e)
|
||||
})?;
|
||||
self.directory.open_write(path)
|
||||
}
|
||||
|
||||
@@ -257,9 +257,9 @@ impl Directory for ManagedDirectory {
|
||||
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
{
|
||||
let metas_rlock = self.meta_informations
|
||||
.read()
|
||||
.expect("poisoned lock in managed directory meta");
|
||||
let metas_rlock = self.meta_informations.read().expect(
|
||||
"poisoned lock in managed directory meta",
|
||||
);
|
||||
if let Some(counter) = metas_rlock.protected_files.get(path) {
|
||||
if *counter > 0 {
|
||||
return Err(DeleteError::FileProtected(path.to_owned()));
|
||||
@@ -327,7 +327,7 @@ mod tests {
|
||||
{
|
||||
let living_files: HashSet<PathBuf> =
|
||||
[TEST_PATH1.to_owned()].into_iter().cloned().collect();
|
||||
managed_directory.garbage_collect(|| { living_files });
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
}
|
||||
{
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
@@ -343,7 +343,7 @@ mod tests {
|
||||
}
|
||||
{
|
||||
let living_files: HashSet<PathBuf> = HashSet::new();
|
||||
managed_directory.garbage_collect(|| { living_files });
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
}
|
||||
{
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
@@ -366,7 +366,7 @@ mod tests {
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
|
||||
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
|
||||
managed_directory.garbage_collect(|| { living_files.clone() });
|
||||
managed_directory.garbage_collect(|| living_files.clone());
|
||||
if cfg!(target_os = "windows") {
|
||||
// On Windows, gc should try and fail to delete the file as it is mmapped.
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
@@ -374,7 +374,7 @@ mod tests {
|
||||
drop(_mmap_read);
|
||||
// The file should still be in the list of managed files and
|
||||
// eventually be deleted once mmap is released.
|
||||
managed_directory.garbage_collect(|| { living_files });
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
} else {
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
@@ -398,11 +398,11 @@ mod tests {
|
||||
|
||||
{
|
||||
let _file_protection = managed_directory.protect_file_from_delete(*TEST_PATH1);
|
||||
managed_directory.garbage_collect(|| { living_files.clone() });
|
||||
managed_directory.garbage_collect(|| living_files.clone());
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
}
|
||||
|
||||
managed_directory.garbage_collect(|| { living_files.clone() });
|
||||
managed_directory.garbage_collect(|| living_files.clone());
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
|
||||
|
||||
|
||||
@@ -24,15 +24,17 @@ use std::sync::Weak;
|
||||
use tempdir::TempDir;
|
||||
|
||||
fn open_mmap(full_path: &PathBuf) -> result::Result<Option<Arc<Mmap>>, OpenReadError> {
|
||||
let file = File::open(&full_path)
|
||||
.map_err(|e| if e.kind() == io::ErrorKind::NotFound {
|
||||
OpenReadError::FileDoesNotExist(full_path.clone())
|
||||
} else {
|
||||
OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e))
|
||||
})?;
|
||||
let file = File::open(&full_path).map_err(|e| if e.kind() ==
|
||||
io::ErrorKind::NotFound
|
||||
{
|
||||
OpenReadError::FileDoesNotExist(full_path.clone())
|
||||
} else {
|
||||
OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e))
|
||||
})?;
|
||||
|
||||
let meta_data = file.metadata()
|
||||
.map_err(|e| IOError::with_path(full_path.to_owned(), e))?;
|
||||
let meta_data = file.metadata().map_err(|e| {
|
||||
IOError::with_path(full_path.to_owned(), e)
|
||||
})?;
|
||||
if meta_data.len() == 0 {
|
||||
// if the file size is 0, it will not be possible
|
||||
// to mmap the file, so we return an anonymous mmap_cache
|
||||
@@ -46,7 +48,7 @@ fn open_mmap(full_path: &PathBuf) -> result::Result<Option<Arc<Mmap>>, OpenReadE
|
||||
|
||||
}
|
||||
|
||||
#[derive(Default,Clone,Debug,Serialize,Deserialize)]
|
||||
#[derive(Default, Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct CacheCounters {
|
||||
// Number of times the cache avoided a call to `mmap`
|
||||
pub hit: usize,
|
||||
@@ -58,7 +60,7 @@ pub struct CacheCounters {
|
||||
pub miss_weak: usize,
|
||||
}
|
||||
|
||||
#[derive(Clone,Debug,Serialize,Deserialize)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct CacheInfo {
|
||||
pub counters: CacheCounters,
|
||||
pub mmapped: Vec<PathBuf>,
|
||||
@@ -113,31 +115,31 @@ impl MmapCache {
|
||||
self.cleanup();
|
||||
}
|
||||
Ok(match self.cache.entry(full_path.clone()) {
|
||||
HashMapEntry::Occupied(mut occupied_entry) => {
|
||||
if let Some(mmap_arc) = occupied_entry.get().upgrade() {
|
||||
self.counters.hit += 1;
|
||||
Some(mmap_arc.clone())
|
||||
} else {
|
||||
// The entry exists but the weak ref has been destroyed.
|
||||
self.counters.miss_weak += 1;
|
||||
if let Some(mmap_arc) = open_mmap(&full_path)? {
|
||||
occupied_entry.insert(Arc::downgrade(&mmap_arc));
|
||||
Some(mmap_arc)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
HashMapEntry::Vacant(vacant_entry) => {
|
||||
self.counters.miss_empty += 1;
|
||||
if let Some(mmap_arc) = open_mmap(&full_path)? {
|
||||
vacant_entry.insert(Arc::downgrade(&mmap_arc));
|
||||
Some(mmap_arc)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
})
|
||||
HashMapEntry::Occupied(mut occupied_entry) => {
|
||||
if let Some(mmap_arc) = occupied_entry.get().upgrade() {
|
||||
self.counters.hit += 1;
|
||||
Some(mmap_arc.clone())
|
||||
} else {
|
||||
// The entry exists but the weak ref has been destroyed.
|
||||
self.counters.miss_weak += 1;
|
||||
if let Some(mmap_arc) = open_mmap(&full_path)? {
|
||||
occupied_entry.insert(Arc::downgrade(&mmap_arc));
|
||||
Some(mmap_arc)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
HashMapEntry::Vacant(vacant_entry) => {
|
||||
self.counters.miss_empty += 1;
|
||||
if let Some(mmap_arc) = open_mmap(&full_path)? {
|
||||
vacant_entry.insert(Arc::downgrade(&mmap_arc));
|
||||
Some(mmap_arc)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -180,15 +182,19 @@ impl MmapDirectory {
|
||||
/// exist or if it is not a directory.
|
||||
pub fn open(directory_path: &Path) -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
if !directory_path.exists() {
|
||||
Err(OpenDirectoryError::DoesNotExist(PathBuf::from(directory_path)))
|
||||
Err(OpenDirectoryError::DoesNotExist(
|
||||
PathBuf::from(directory_path),
|
||||
))
|
||||
} else if !directory_path.is_dir() {
|
||||
Err(OpenDirectoryError::NotADirectory(PathBuf::from(directory_path)))
|
||||
Err(OpenDirectoryError::NotADirectory(
|
||||
PathBuf::from(directory_path),
|
||||
))
|
||||
} else {
|
||||
Ok(MmapDirectory {
|
||||
root_path: PathBuf::from(directory_path),
|
||||
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
|
||||
_temp_directory: Arc::new(None),
|
||||
})
|
||||
root_path: PathBuf::from(directory_path),
|
||||
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
|
||||
_temp_directory: Arc::new(None),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -215,9 +221,9 @@ impl MmapDirectory {
|
||||
use std::os::windows::fs::OpenOptionsExt;
|
||||
use winapi::winbase;
|
||||
|
||||
open_opts
|
||||
.write(true)
|
||||
.custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
|
||||
open_opts.write(true).custom_flags(
|
||||
winbase::FILE_FLAG_BACKUP_SEMANTICS,
|
||||
);
|
||||
}
|
||||
|
||||
let fd = try!(open_opts.open(&self.root_path));
|
||||
@@ -270,46 +276,50 @@ impl Directory for MmapDirectory {
|
||||
debug!("Open Read {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
|
||||
let mut mmap_cache = self.mmap_cache
|
||||
.write()
|
||||
.map_err(|_| {
|
||||
let msg = format!("Failed to acquired write lock \
|
||||
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
|
||||
let msg = format!(
|
||||
"Failed to acquired write lock \
|
||||
on mmap cache while reading {:?}",
|
||||
path);
|
||||
IOError::with_path(path.to_owned(), make_io_err(msg))
|
||||
})?;
|
||||
path
|
||||
);
|
||||
IOError::with_path(path.to_owned(), make_io_err(msg))
|
||||
})?;
|
||||
|
||||
Ok(mmap_cache
|
||||
.get_mmap(full_path)?
|
||||
.map(MmapReadOnly::from)
|
||||
.map(ReadOnlySource::Mmap)
|
||||
.unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty())))
|
||||
Ok(
|
||||
mmap_cache
|
||||
.get_mmap(full_path)?
|
||||
.map(MmapReadOnly::from)
|
||||
.map(ReadOnlySource::Mmap)
|
||||
.unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty())),
|
||||
)
|
||||
}
|
||||
|
||||
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
|
||||
debug!("Open Write {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
|
||||
let open_res = OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true)
|
||||
.open(full_path);
|
||||
let open_res = OpenOptions::new().write(true).create_new(true).open(
|
||||
full_path,
|
||||
);
|
||||
|
||||
let mut file = open_res
|
||||
.map_err(|err| if err.kind() == io::ErrorKind::AlreadyExists {
|
||||
OpenWriteError::FileAlreadyExists(path.to_owned())
|
||||
} else {
|
||||
IOError::with_path(path.to_owned(), err).into()
|
||||
})?;
|
||||
let mut file = open_res.map_err(|err| if err.kind() ==
|
||||
io::ErrorKind::AlreadyExists
|
||||
{
|
||||
OpenWriteError::FileAlreadyExists(path.to_owned())
|
||||
} else {
|
||||
IOError::with_path(path.to_owned(), err).into()
|
||||
})?;
|
||||
|
||||
// making sure the file is created.
|
||||
file.flush()
|
||||
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
|
||||
file.flush().map_err(
|
||||
|e| IOError::with_path(path.to_owned(), e),
|
||||
)?;
|
||||
|
||||
// Apparently, on some filesystems, syncing the parent
|
||||
// directory is required.
|
||||
self.sync_directory()
|
||||
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
|
||||
self.sync_directory().map_err(|e| {
|
||||
IOError::with_path(path.to_owned(), e)
|
||||
})?;
|
||||
|
||||
let writer = SafeFileWriter::new(file);
|
||||
Ok(BufWriter::new(Box::new(writer)))
|
||||
@@ -318,22 +328,23 @@ impl Directory for MmapDirectory {
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
debug!("Deleting file {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
let mut mmap_cache = self.mmap_cache
|
||||
.write()
|
||||
.map_err(|_| {
|
||||
let msg = format!("Failed to acquired write lock \
|
||||
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
|
||||
let msg = format!(
|
||||
"Failed to acquired write lock \
|
||||
on mmap cache while deleting {:?}",
|
||||
path);
|
||||
IOError::with_path(path.to_owned(), make_io_err(msg))
|
||||
})?;
|
||||
path
|
||||
);
|
||||
IOError::with_path(path.to_owned(), make_io_err(msg))
|
||||
})?;
|
||||
// Removing the entry in the MMap cache.
|
||||
// The munmap will appear on Drop,
|
||||
// when the last reference is gone.
|
||||
mmap_cache.cache.remove(&full_path);
|
||||
match fs::remove_file(&full_path) {
|
||||
Ok(_) => {
|
||||
self.sync_directory()
|
||||
.map_err(|e| IOError::with_path(path.to_owned(), e).into())
|
||||
self.sync_directory().map_err(|e| {
|
||||
IOError::with_path(path.to_owned(), e).into()
|
||||
})
|
||||
}
|
||||
Err(e) => {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
@@ -355,8 +366,9 @@ impl Directory for MmapDirectory {
|
||||
let mut buffer = Vec::new();
|
||||
match File::open(&full_path) {
|
||||
Ok(mut file) => {
|
||||
file.read_to_end(&mut buffer)
|
||||
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
|
||||
file.read_to_end(&mut buffer).map_err(|e| {
|
||||
IOError::with_path(path.to_owned(), e)
|
||||
})?;
|
||||
Ok(buffer)
|
||||
}
|
||||
Err(e) => {
|
||||
|
||||
@@ -41,8 +41,10 @@ impl VecWriter {
|
||||
impl Drop for VecWriter {
|
||||
fn drop(&mut self) {
|
||||
if !self.is_flushed {
|
||||
panic!("You forgot to flush {:?} before its writter got Drop. Do not rely on drop.",
|
||||
self.path)
|
||||
panic!(
|
||||
"You forgot to flush {:?} before its writter got Drop. Do not rely on drop.",
|
||||
self.path
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -62,8 +64,10 @@ impl Write for VecWriter {
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
self.is_flushed = true;
|
||||
try!(self.shared_directory
|
||||
.write(self.path.clone(), self.data.get_ref()));
|
||||
try!(self.shared_directory.write(
|
||||
self.path.clone(),
|
||||
self.data.get_ref(),
|
||||
));
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -79,11 +83,11 @@ impl InnerDirectory {
|
||||
}
|
||||
|
||||
fn write(&self, path: PathBuf, data: &[u8]) -> io::Result<bool> {
|
||||
let mut map = try!(self.0
|
||||
.write()
|
||||
.map_err(|_| {
|
||||
make_io_err(format!("Failed to lock the directory, when trying to write {:?}",
|
||||
path))
|
||||
let mut map = try!(self.0.write().map_err(|_| {
|
||||
make_io_err(format!(
|
||||
"Failed to lock the directory, when trying to write {:?}",
|
||||
path
|
||||
))
|
||||
}));
|
||||
let prev_value = map.insert(path, Arc::new(Vec::from(data)));
|
||||
Ok(prev_value.is_some())
|
||||
@@ -93,17 +97,21 @@ impl InnerDirectory {
|
||||
self.0
|
||||
.read()
|
||||
.map_err(|_| {
|
||||
let msg = format!("Failed to acquire read lock for the \
|
||||
let msg = format!(
|
||||
"Failed to acquire read lock for the \
|
||||
directory when trying to read {:?}",
|
||||
path);
|
||||
let io_err = make_io_err(msg);
|
||||
OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err))
|
||||
})
|
||||
path
|
||||
);
|
||||
let io_err = make_io_err(msg);
|
||||
OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err))
|
||||
})
|
||||
.and_then(|readable_map| {
|
||||
readable_map
|
||||
.get(path)
|
||||
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
|
||||
.map(|data| ReadOnlySource::Anonymous(SharedVecSlice::new(data.clone())))
|
||||
.map(|data| {
|
||||
ReadOnlySource::Anonymous(SharedVecSlice::new(data.clone()))
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
@@ -111,16 +119,18 @@ impl InnerDirectory {
|
||||
self.0
|
||||
.write()
|
||||
.map_err(|_| {
|
||||
let msg = format!("Failed to acquire write lock for the \
|
||||
let msg = format!(
|
||||
"Failed to acquire write lock for the \
|
||||
directory when trying to delete {:?}",
|
||||
path);
|
||||
let io_err = make_io_err(msg);
|
||||
DeleteError::IOError(IOError::with_path(path.to_owned(), io_err))
|
||||
})
|
||||
path
|
||||
);
|
||||
let io_err = make_io_err(msg);
|
||||
DeleteError::IOError(IOError::with_path(path.to_owned(), io_err))
|
||||
})
|
||||
.and_then(|mut writable_map| match writable_map.remove(path) {
|
||||
Some(_) => Ok(()),
|
||||
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
|
||||
})
|
||||
Some(_) => Ok(()),
|
||||
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
|
||||
})
|
||||
}
|
||||
|
||||
fn exists(&self, path: &Path) -> bool {
|
||||
@@ -164,9 +174,11 @@ impl Directory for RAMDirectory {
|
||||
let path_buf = PathBuf::from(path);
|
||||
let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
|
||||
|
||||
let exists = self.fs
|
||||
.write(path_buf.clone(), &Vec::new())
|
||||
.map_err(|err| IOError::with_path(path.to_owned(), err))?;
|
||||
let exists = self.fs.write(path_buf.clone(), &Vec::new()).map_err(
|
||||
|err| {
|
||||
IOError::with_path(path.to_owned(), err)
|
||||
},
|
||||
)?;
|
||||
|
||||
// force the creation of the file to mimic the MMap directory.
|
||||
if exists {
|
||||
|
||||
@@ -114,7 +114,7 @@ impl From<Vec<u8>> for ReadOnlySource {
|
||||
/// Acts as an owning cursor over the data backed by a ReadOnlySource
|
||||
pub(crate) struct SourceRead {
|
||||
_data_owner: ReadOnlySource,
|
||||
cursor: &'static [u8]
|
||||
cursor: &'static [u8],
|
||||
}
|
||||
|
||||
impl SourceRead {
|
||||
@@ -131,7 +131,6 @@ impl AsRef<[u8]> for SourceRead {
|
||||
}
|
||||
|
||||
impl From<ReadOnlySource> for SourceRead {
|
||||
|
||||
// Creates a new `SourceRead` from a given `ReadOnlySource`
|
||||
fn from(source: ReadOnlySource) -> SourceRead {
|
||||
let len = source.len();
|
||||
|
||||
@@ -112,12 +112,9 @@ impl From<schema::DocParsingError> for Error {
|
||||
impl From<OpenWriteError> for Error {
|
||||
fn from(error: OpenWriteError) -> Error {
|
||||
match error {
|
||||
OpenWriteError::FileAlreadyExists(filepath) => {
|
||||
ErrorKind::FileAlreadyExists(filepath)
|
||||
}
|
||||
OpenWriteError::IOError(io_error) => ErrorKind::IOError(io_error),
|
||||
}
|
||||
.into()
|
||||
OpenWriteError::FileAlreadyExists(filepath) => ErrorKind::FileAlreadyExists(filepath),
|
||||
OpenWriteError::IOError(io_error) => ErrorKind::IOError(io_error),
|
||||
}.into()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -100,8 +100,7 @@ mod tests {
|
||||
{
|
||||
let composite_file = CompositeFile::open(source).unwrap();
|
||||
let field_source = composite_file.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader =
|
||||
U64FastFieldReader::open(field_source);
|
||||
let fast_field_reader: U64FastFieldReader = U64FastFieldReader::open(field_source);
|
||||
assert_eq!(fast_field_reader.get(0), 13u64);
|
||||
assert_eq!(fast_field_reader.get(1), 14u64);
|
||||
assert_eq!(fast_field_reader.get(2), 2u64);
|
||||
@@ -190,9 +189,11 @@ mod tests {
|
||||
// forcing the amplitude to be high
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 0u64);
|
||||
for i in 0u64..10_000u64 {
|
||||
add_single_field_doc(&mut fast_field_writers,
|
||||
*FIELD,
|
||||
5_000_000_000_000_000_000u64 + i);
|
||||
add_single_field_doc(
|
||||
&mut fast_field_writers,
|
||||
*FIELD,
|
||||
5_000_000_000_000_000_000u64 + i,
|
||||
);
|
||||
}
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
serializer.close().unwrap();
|
||||
@@ -208,8 +209,10 @@ mod tests {
|
||||
|
||||
assert_eq!(fast_field_reader.get(0), 0u64);
|
||||
for doc in 1..10_001 {
|
||||
assert_eq!(fast_field_reader.get(doc),
|
||||
5_000_000_000_000_000_000u64 + doc as u64 - 1u64);
|
||||
assert_eq!(
|
||||
fast_field_reader.get(doc),
|
||||
5_000_000_000_000_000_000u64 + doc as u64 - 1u64
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -339,13 +342,13 @@ mod tests {
|
||||
fn bench_intfastfield_veclookup(b: &mut Bencher) {
|
||||
let permutation = generate_permutation();
|
||||
b.iter(|| {
|
||||
let n = test::black_box(1000u32);
|
||||
let mut a = 0u64;
|
||||
for _ in 0u32..n {
|
||||
a = permutation[a as usize];
|
||||
}
|
||||
a
|
||||
});
|
||||
let n = test::black_box(1000u32);
|
||||
let mut a = 0u64;
|
||||
for _ in 0u32..n {
|
||||
a = permutation[a as usize];
|
||||
}
|
||||
a
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
@@ -403,13 +406,13 @@ mod tests {
|
||||
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
|
||||
|
||||
b.iter(|| {
|
||||
let n = test::black_box(1000u32);
|
||||
let mut a = 0u32;
|
||||
for _ in 0u32..n {
|
||||
a = fast_field_reader.get(a) as u32;
|
||||
}
|
||||
a
|
||||
});
|
||||
let n = test::black_box(1000u32);
|
||||
let mut a = 0u32;
|
||||
for _ in 0u32..n {
|
||||
a = fast_field_reader.get(a) as u32;
|
||||
}
|
||||
a
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@ use directory::ReadOnlySource;
|
||||
use common::{self, BinarySerializable};
|
||||
use common::bitpacker::{compute_num_bits, BitUnpacker};
|
||||
use DocId;
|
||||
use schema::{SchemaBuilder};
|
||||
use schema::SchemaBuilder;
|
||||
use std::path::Path;
|
||||
use schema::FAST;
|
||||
use directory::{WritePtr, RAMDirectory, Directory};
|
||||
@@ -106,10 +106,10 @@ impl FastFieldReader for U64FastFieldReader {
|
||||
let amplitude: u64;
|
||||
{
|
||||
let mut cursor = data.as_slice();
|
||||
min_value = u64::deserialize(&mut cursor)
|
||||
.expect("Failed to read the min_value of fast field.");
|
||||
amplitude = u64::deserialize(&mut cursor)
|
||||
.expect("Failed to read the amplitude of fast field.");
|
||||
min_value =
|
||||
u64::deserialize(&mut cursor).expect("Failed to read the min_value of fast field.");
|
||||
amplitude =
|
||||
u64::deserialize(&mut cursor).expect("Failed to read the amplitude of fast field.");
|
||||
|
||||
}
|
||||
let max_value = min_value + amplitude;
|
||||
@@ -130,15 +130,14 @@ impl From<Vec<u64>> for U64FastFieldReader {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let field = schema_builder.add_u64_field("field", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let path = Path::new("test");
|
||||
let path = Path::new("__dummy__");
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let write: WritePtr = directory.open_write(path).expect("With a RAMDirectory, this should never fail.");
|
||||
let mut serializer = FastFieldSerializer::from_write(write).expect("With a RAMDirectory, this should never fail.");
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
|
||||
// TODO Error not unwrap
|
||||
{
|
||||
let fast_field_writer = fast_field_writers.get_field_writer(field).unwrap();
|
||||
let fast_field_writer = fast_field_writers.get_field_writer(field).expect("With a RAMDirectory, this should never fail.");
|
||||
for val in vals {
|
||||
fast_field_writer.add_val(val);
|
||||
}
|
||||
@@ -147,13 +146,12 @@ impl From<Vec<u64>> for U64FastFieldReader {
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
|
||||
let source = directory
|
||||
.open_read(path)
|
||||
.expect("Failed to open the file");
|
||||
let composite_file = CompositeFile::open(source)
|
||||
.expect("Failed to read the composite file");
|
||||
let source = directory.open_read(path).expect("Failed to open the file");
|
||||
let composite_file =
|
||||
CompositeFile::open(source).expect("Failed to read the composite file");
|
||||
|
||||
let field_source = composite_file.open_read(field)
|
||||
let field_source = composite_file
|
||||
.open_read(field)
|
||||
.expect("File component not found");
|
||||
U64FastFieldReader::open(field_source)
|
||||
}
|
||||
|
||||
@@ -31,30 +31,22 @@ pub struct FastFieldSerializer {
|
||||
}
|
||||
|
||||
impl FastFieldSerializer {
|
||||
|
||||
|
||||
/// Constructor
|
||||
pub fn from_write(write: WritePtr) -> io::Result<FastFieldSerializer> {
|
||||
// just making room for the pointer to header.
|
||||
let composite_write = CompositeWrite::wrap(write);
|
||||
Ok(FastFieldSerializer {
|
||||
composite_write: composite_write,
|
||||
})
|
||||
Ok(FastFieldSerializer { composite_write: composite_write })
|
||||
}
|
||||
|
||||
/// Start serializing a new u64 fast field
|
||||
pub fn new_u64_fast_field(&mut self,
|
||||
field: Field,
|
||||
min_value: u64,
|
||||
max_value: u64)
|
||||
-> io::Result<FastSingleFieldSerializer<CountingWriter<WritePtr>>> {
|
||||
let field_write = self
|
||||
.composite_write
|
||||
.for_field(field);
|
||||
FastSingleFieldSerializer::open(
|
||||
field_write,
|
||||
min_value,
|
||||
max_value)
|
||||
pub fn new_u64_fast_field(
|
||||
&mut self,
|
||||
field: Field,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<FastSingleFieldSerializer<CountingWriter<WritePtr>>> {
|
||||
let field_write = self.composite_write.for_field(field);
|
||||
FastSingleFieldSerializer::open(field_write, min_value, max_value)
|
||||
}
|
||||
|
||||
|
||||
@@ -73,10 +65,11 @@ pub struct FastSingleFieldSerializer<'a, W: Write + 'a> {
|
||||
}
|
||||
|
||||
impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
|
||||
|
||||
fn open(write: &'a mut W,
|
||||
min_value: u64,
|
||||
max_value: u64) -> io::Result<FastSingleFieldSerializer<'a, W>> {
|
||||
fn open(
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<FastSingleFieldSerializer<'a, W>> {
|
||||
min_value.serialize(write)?;
|
||||
let amplitude = max_value - min_value;
|
||||
amplitude.serialize(write)?;
|
||||
|
||||
@@ -58,9 +58,9 @@ impl FastFieldsWriter {
|
||||
/// Get the `FastFieldWriter` associated to a field.
|
||||
pub fn get_field_writer(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
|
||||
// TODO optimize
|
||||
self.field_writers
|
||||
.iter_mut()
|
||||
.find(|field_writer| field_writer.field == field)
|
||||
self.field_writers.iter_mut().find(|field_writer| {
|
||||
field_writer.field == field
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -155,9 +155,9 @@ impl IntFastFieldWriter {
|
||||
/// associated to the document with the `DocId` n.
|
||||
/// (Well, `n-1` actually because of 0-indexing)
|
||||
pub fn add_val(&mut self, val: u64) {
|
||||
VInt(val)
|
||||
.serialize(&mut self.vals)
|
||||
.expect("unable to serialize VInt to Vec");
|
||||
VInt(val).serialize(&mut self.vals).expect(
|
||||
"unable to serialize VInt to Vec",
|
||||
);
|
||||
|
||||
if val > self.val_max {
|
||||
self.val_max = val;
|
||||
|
||||
@@ -40,9 +40,9 @@ impl DeleteQueue {
|
||||
{
|
||||
let mut delete_queue_wlock = delete_queue.inner.write().unwrap();
|
||||
delete_queue_wlock.last_block = Some(Arc::new(Block {
|
||||
operations: Arc::default(),
|
||||
next: next_block,
|
||||
}));
|
||||
operations: Arc::default(),
|
||||
next: next_block,
|
||||
}));
|
||||
}
|
||||
|
||||
delete_queue
|
||||
@@ -59,9 +59,11 @@ impl DeleteQueue {
|
||||
.expect("Read lock poisoned when opening delete queue cursor")
|
||||
.last_block
|
||||
.clone()
|
||||
.expect("Failed to unwrap last_block. This should never happen
|
||||
.expect(
|
||||
"Failed to unwrap last_block. This should never happen
|
||||
as the Option<> is only here to make
|
||||
initialization possible");
|
||||
initialization possible",
|
||||
);
|
||||
let operations_len = last_block.operations.len();
|
||||
DeleteCursor {
|
||||
block: last_block,
|
||||
@@ -92,9 +94,9 @@ impl DeleteQueue {
|
||||
// be some unflushed operations.
|
||||
//
|
||||
fn flush(&self) -> Option<Arc<Block>> {
|
||||
let mut self_wlock = self.inner
|
||||
.write()
|
||||
.expect("Failed to acquire write lock on delete queue writer");
|
||||
let mut self_wlock = self.inner.write().expect(
|
||||
"Failed to acquire write lock on delete queue writer",
|
||||
);
|
||||
|
||||
let delete_operations;
|
||||
{
|
||||
@@ -108,9 +110,9 @@ impl DeleteQueue {
|
||||
let next_block = NextBlock::from(self.clone());
|
||||
{
|
||||
self_wlock.last_block = Some(Arc::new(Block {
|
||||
operations: Arc::new(delete_operations),
|
||||
next: next_block,
|
||||
}));
|
||||
operations: Arc::new(delete_operations),
|
||||
next: next_block,
|
||||
}));
|
||||
}
|
||||
self_wlock.last_block.clone()
|
||||
}
|
||||
@@ -132,18 +134,18 @@ impl From<DeleteQueue> for NextBlock {
|
||||
impl NextBlock {
|
||||
fn next_block(&self) -> Option<Arc<Block>> {
|
||||
{
|
||||
let next_read_lock = self.0
|
||||
.read()
|
||||
.expect("Failed to acquire write lock in delete queue");
|
||||
let next_read_lock = self.0.read().expect(
|
||||
"Failed to acquire write lock in delete queue",
|
||||
);
|
||||
if let InnerNextBlock::Closed(ref block) = *next_read_lock {
|
||||
return Some(block.clone());
|
||||
}
|
||||
}
|
||||
let next_block;
|
||||
{
|
||||
let mut next_write_lock = self.0
|
||||
.write()
|
||||
.expect("Failed to acquire write lock in delete queue");
|
||||
let mut next_write_lock = self.0.write().expect(
|
||||
"Failed to acquire write lock in delete queue",
|
||||
);
|
||||
match *next_write_lock {
|
||||
InnerNextBlock::Closed(ref block) => {
|
||||
return Some(block.clone());
|
||||
|
||||
@@ -56,8 +56,10 @@ mod tests {
|
||||
#[test]
|
||||
fn test_doc_to_opstamp_mapping_none() {
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::None;
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(1),
|
||||
u32::max_value());
|
||||
assert_eq!(
|
||||
doc_to_opstamp_mapping.compute_doc_limit(1),
|
||||
u32::max_value()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -102,14 +102,17 @@ impl !Sync for IndexWriter {}
|
||||
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
|
||||
/// # Panics
|
||||
/// If the heap size per thread is too small, panics.
|
||||
pub fn open_index_writer(index: &Index,
|
||||
num_threads: usize,
|
||||
heap_size_in_bytes_per_thread: usize)
|
||||
-> Result<IndexWriter> {
|
||||
pub fn open_index_writer(
|
||||
index: &Index,
|
||||
num_threads: usize,
|
||||
heap_size_in_bytes_per_thread: usize,
|
||||
) -> Result<IndexWriter> {
|
||||
|
||||
if heap_size_in_bytes_per_thread <= HEAP_SIZE_LIMIT as usize {
|
||||
panic!(format!("The heap size per thread needs to be at least {}.",
|
||||
HEAP_SIZE_LIMIT));
|
||||
panic!(format!(
|
||||
"The heap size per thread needs to be at least {}.",
|
||||
HEAP_SIZE_LIMIT
|
||||
));
|
||||
}
|
||||
|
||||
let directory_lock = DirectoryLock::lock(index.directory().box_clone())?;
|
||||
@@ -156,12 +159,13 @@ pub fn open_index_writer(index: &Index,
|
||||
|
||||
|
||||
|
||||
pub fn compute_deleted_bitset(delete_bitset: &mut BitSet,
|
||||
segment_reader: &SegmentReader,
|
||||
delete_cursor: &mut DeleteCursor,
|
||||
doc_opstamps: &DocToOpstampMapping,
|
||||
target_opstamp: u64)
|
||||
-> Result<bool> {
|
||||
pub fn compute_deleted_bitset(
|
||||
delete_bitset: &mut BitSet,
|
||||
segment_reader: &SegmentReader,
|
||||
delete_cursor: &mut DeleteCursor,
|
||||
doc_opstamps: &DocToOpstampMapping,
|
||||
target_opstamp: u64,
|
||||
) -> Result<bool> {
|
||||
|
||||
let mut might_have_changed = false;
|
||||
|
||||
@@ -177,9 +181,12 @@ pub fn compute_deleted_bitset(delete_bitset: &mut BitSet,
|
||||
// Limit doc helps identify the first document
|
||||
// that may be affected by the delete operation.
|
||||
let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp);
|
||||
let inverted_index = segment_reader.inverted_index(delete_op.term.field())?;
|
||||
if let Some(mut docset) =
|
||||
inverted_index.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) {
|
||||
let inverted_index = segment_reader.inverted_index(delete_op.term.field());
|
||||
if let Some(mut docset) = inverted_index.read_postings(
|
||||
&delete_op.term,
|
||||
SegmentPostingsOption::NoFreq,
|
||||
)
|
||||
{
|
||||
while docset.advance() {
|
||||
let deleted_doc = docset.doc();
|
||||
if deleted_doc < limit_doc {
|
||||
@@ -199,10 +206,11 @@ pub fn compute_deleted_bitset(delete_bitset: &mut BitSet,
|
||||
|
||||
/// Advance delete for the given segment up
|
||||
/// to the target opstamp.
|
||||
pub fn advance_deletes(mut segment: Segment,
|
||||
segment_entry: &mut SegmentEntry,
|
||||
target_opstamp: u64)
|
||||
-> Result<Option<FileProtection>> {
|
||||
pub fn advance_deletes(
|
||||
mut segment: Segment,
|
||||
segment_entry: &mut SegmentEntry,
|
||||
target_opstamp: u64,
|
||||
) -> Result<Option<FileProtection>> {
|
||||
|
||||
let mut file_protect: Option<FileProtection> = None;
|
||||
|
||||
@@ -223,11 +231,13 @@ pub fn advance_deletes(mut segment: Segment,
|
||||
|
||||
let delete_cursor = segment_entry.delete_cursor();
|
||||
|
||||
compute_deleted_bitset(&mut delete_bitset,
|
||||
&segment_reader,
|
||||
delete_cursor,
|
||||
&DocToOpstampMapping::None,
|
||||
target_opstamp)?;
|
||||
compute_deleted_bitset(
|
||||
&mut delete_bitset,
|
||||
&segment_reader,
|
||||
delete_cursor,
|
||||
&DocToOpstampMapping::None,
|
||||
target_opstamp,
|
||||
)?;
|
||||
|
||||
for doc in 0u32..max_doc {
|
||||
if segment_reader.is_deleted(doc) {
|
||||
@@ -248,15 +258,16 @@ pub fn advance_deletes(mut segment: Segment,
|
||||
Ok(file_protect)
|
||||
}
|
||||
|
||||
fn index_documents(heap: &mut Heap,
|
||||
table_size: usize,
|
||||
segment: Segment,
|
||||
schema: &Schema,
|
||||
generation: usize,
|
||||
document_iterator: &mut Iterator<Item = AddOperation>,
|
||||
segment_updater: &mut SegmentUpdater,
|
||||
mut delete_cursor: DeleteCursor)
|
||||
-> Result<bool> {
|
||||
fn index_documents(
|
||||
heap: &mut Heap,
|
||||
table_size: usize,
|
||||
segment: Segment,
|
||||
schema: &Schema,
|
||||
generation: usize,
|
||||
document_iterator: &mut Iterator<Item = AddOperation>,
|
||||
segment_updater: &mut SegmentUpdater,
|
||||
mut delete_cursor: DeleteCursor,
|
||||
) -> Result<bool> {
|
||||
heap.clear();
|
||||
let segment_id = segment.id();
|
||||
let mut segment_writer = SegmentWriter::for_segment(heap, table_size, segment.clone(), schema)?;
|
||||
@@ -266,8 +277,10 @@ fn index_documents(heap: &mut Heap,
|
||||
// One is that the memory arena dedicated to the segment is
|
||||
// getting full.
|
||||
if segment_writer.is_buffer_full() {
|
||||
info!("Buffer limit reached, flushing segment with maxdoc={}.",
|
||||
segment_writer.max_doc());
|
||||
info!(
|
||||
"Buffer limit reached, flushing segment with maxdoc={}.",
|
||||
segment_writer.max_doc()
|
||||
);
|
||||
break;
|
||||
}
|
||||
// The second is the term dictionary hash table
|
||||
@@ -276,8 +289,10 @@ fn index_documents(heap: &mut Heap,
|
||||
// Tantivy does not resize its hashtable. When it reaches
|
||||
// capacity, we just stop indexing new documents.
|
||||
if segment_writer.is_term_saturated() {
|
||||
info!("Term dic saturated, flushing segment with maxdoc={}.",
|
||||
segment_writer.max_doc());
|
||||
info!(
|
||||
"Term dic saturated, flushing segment with maxdoc={}.",
|
||||
segment_writer.max_doc()
|
||||
);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -297,11 +312,13 @@ fn index_documents(heap: &mut Heap,
|
||||
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
|
||||
let segment_reader = SegmentReader::open(segment)?;
|
||||
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
|
||||
let may_have_deletes = compute_deleted_bitset(&mut deleted_bitset,
|
||||
&segment_reader,
|
||||
&mut delete_cursor,
|
||||
&doc_to_opstamps,
|
||||
last_docstamp)?;
|
||||
let may_have_deletes = compute_deleted_bitset(
|
||||
&mut deleted_bitset,
|
||||
&segment_reader,
|
||||
&mut delete_cursor,
|
||||
&doc_to_opstamps,
|
||||
last_docstamp,
|
||||
)?;
|
||||
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, {
|
||||
if may_have_deletes {
|
||||
@@ -328,14 +345,15 @@ impl IndexWriter {
|
||||
join_handle
|
||||
.join()
|
||||
.expect("Indexing Worker thread panicked")
|
||||
.chain_err(|| ErrorKind::ErrorInThread("Error in indexing worker thread.".into()))?;
|
||||
.chain_err(|| {
|
||||
ErrorKind::ErrorInThread("Error in indexing worker thread.".into())
|
||||
})?;
|
||||
}
|
||||
drop(self.workers_join_handle);
|
||||
|
||||
let result =
|
||||
self.segment_updater
|
||||
.wait_merging_thread()
|
||||
.chain_err(|| ErrorKind::ErrorInThread("Failed to join merging thread.".into()));
|
||||
let result = self.segment_updater.wait_merging_thread().chain_err(|| {
|
||||
ErrorKind::ErrorInThread("Failed to join merging thread.".into())
|
||||
});
|
||||
|
||||
if let Err(ref e) = result {
|
||||
error!("Some merging thread failed {:?}", e);
|
||||
@@ -348,8 +366,10 @@ impl IndexWriter {
|
||||
pub fn add_segment(&mut self, segment_meta: SegmentMeta) {
|
||||
let delete_cursor = self.delete_queue.cursor();
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None);
|
||||
self.segment_updater
|
||||
.add_segment(self.generation, segment_entry);
|
||||
self.segment_updater.add_segment(
|
||||
self.generation,
|
||||
segment_entry,
|
||||
);
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
@@ -373,7 +393,11 @@ impl IndexWriter {
|
||||
let mut delete_cursor = self.delete_queue.cursor();
|
||||
|
||||
let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
|
||||
.name(format!("indexing thread {} for gen {}", self.worker_id, generation))
|
||||
.name(format!(
|
||||
"indexing thread {} for gen {}",
|
||||
self.worker_id,
|
||||
generation
|
||||
))
|
||||
.spawn(move || {
|
||||
|
||||
loop {
|
||||
@@ -397,14 +421,16 @@ impl IndexWriter {
|
||||
return Ok(());
|
||||
}
|
||||
let segment = segment_updater.new_segment();
|
||||
index_documents(&mut heap,
|
||||
table_size,
|
||||
segment,
|
||||
&schema,
|
||||
generation,
|
||||
&mut document_iterator,
|
||||
&mut segment_updater,
|
||||
delete_cursor.clone())?;
|
||||
index_documents(
|
||||
&mut heap,
|
||||
table_size,
|
||||
segment,
|
||||
&schema,
|
||||
generation,
|
||||
&mut document_iterator,
|
||||
&mut segment_updater,
|
||||
delete_cursor.clone(),
|
||||
)?;
|
||||
|
||||
}
|
||||
})?;
|
||||
@@ -437,9 +463,10 @@ impl IndexWriter {
|
||||
}
|
||||
|
||||
/// Merges a given list of segments
|
||||
pub fn merge(&mut self,
|
||||
segment_ids: &[SegmentId])
|
||||
-> impl Future<Item = SegmentMeta, Error = Canceled> {
|
||||
pub fn merge(
|
||||
&mut self,
|
||||
segment_ids: &[SegmentId],
|
||||
) -> impl Future<Item = SegmentMeta, Error = Canceled> {
|
||||
self.segment_updater.start_merge(segment_ids)
|
||||
}
|
||||
|
||||
@@ -523,14 +550,15 @@ impl IndexWriter {
|
||||
self.recreate_document_channel();
|
||||
|
||||
let mut former_workers_join_handle = Vec::new();
|
||||
swap(&mut former_workers_join_handle,
|
||||
&mut self.workers_join_handle);
|
||||
swap(
|
||||
&mut former_workers_join_handle,
|
||||
&mut self.workers_join_handle,
|
||||
);
|
||||
|
||||
for worker_handle in former_workers_join_handle {
|
||||
let indexing_worker_result =
|
||||
worker_handle
|
||||
.join()
|
||||
.map_err(|e| Error::from_kind(ErrorKind::ErrorInThread(format!("{:?}", e))))?;
|
||||
let indexing_worker_result = worker_handle.join().map_err(|e| {
|
||||
Error::from_kind(ErrorKind::ErrorInThread(format!("{:?}", e)))
|
||||
})?;
|
||||
|
||||
indexing_worker_result?;
|
||||
// add a new worker for the next generation.
|
||||
@@ -624,13 +652,17 @@ mod tests {
|
||||
let schema_builder = schema::SchemaBuilder::default();
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let index_writer = index.writer(40_000_000).unwrap();
|
||||
assert_eq!(format!("{:?}", index_writer.get_merge_policy()),
|
||||
"LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \
|
||||
level_log_size: 0.75 }");
|
||||
assert_eq!(
|
||||
format!("{:?}", index_writer.get_merge_policy()),
|
||||
"LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \
|
||||
level_log_size: 0.75 }"
|
||||
);
|
||||
let merge_policy = box NoMergePolicy::default();
|
||||
index_writer.set_merge_policy(merge_policy);
|
||||
assert_eq!(format!("{:?}", index_writer.get_merge_policy()),
|
||||
"NoMergePolicy");
|
||||
assert_eq!(
|
||||
format!("{:?}", index_writer.get_merge_policy()),
|
||||
"NoMergePolicy"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -720,9 +752,9 @@ mod tests {
|
||||
}
|
||||
// this should create 8 segments and trigger a merge.
|
||||
index_writer.commit().expect("commit failed");
|
||||
index_writer
|
||||
.wait_merging_threads()
|
||||
.expect("waiting merging thread failed");
|
||||
index_writer.wait_merging_threads().expect(
|
||||
"waiting merging thread failed",
|
||||
);
|
||||
index.load_searchers().unwrap();
|
||||
|
||||
assert_eq!(num_docs_containing("a"), 200);
|
||||
|
||||
@@ -62,7 +62,9 @@ impl MergePolicy for LogMergePolicy {
|
||||
|
||||
let size_sorted_log_tuples: Vec<_> = size_sorted_tuples
|
||||
.into_iter()
|
||||
.map(|(ind, num_docs)| (ind, (self.clip_min_size(num_docs) as f64).log2()))
|
||||
.map(|(ind, num_docs)| {
|
||||
(ind, (self.clip_min_size(num_docs) as f64).log2())
|
||||
})
|
||||
.collect();
|
||||
|
||||
let (first_ind, first_score) = size_sorted_log_tuples[0];
|
||||
@@ -79,7 +81,9 @@ impl MergePolicy for LogMergePolicy {
|
||||
levels
|
||||
.iter()
|
||||
.filter(|level| level.len() >= self.min_merge_size)
|
||||
.map(|ind_vec| MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect()))
|
||||
.map(|ind_vec| {
|
||||
MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect())
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
@@ -134,12 +138,14 @@ mod tests {
|
||||
#[test]
|
||||
fn test_log_merge_policy_levels() {
|
||||
// multiple levels all get merged correctly
|
||||
let test_input = vec![seg_meta(10),
|
||||
seg_meta(10),
|
||||
seg_meta(10),
|
||||
seg_meta(1000),
|
||||
seg_meta(1000),
|
||||
seg_meta(1000)];
|
||||
let test_input = vec![
|
||||
seg_meta(10),
|
||||
seg_meta(10),
|
||||
seg_meta(10),
|
||||
seg_meta(1000),
|
||||
seg_meta(1000),
|
||||
seg_meta(1000),
|
||||
];
|
||||
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
|
||||
assert_eq!(result_list.len(), 2);
|
||||
}
|
||||
@@ -147,24 +153,28 @@ mod tests {
|
||||
#[test]
|
||||
fn test_log_merge_policy_within_levels() {
|
||||
// multiple levels all get merged correctly
|
||||
let test_input = vec![seg_meta(10),
|
||||
seg_meta(11),
|
||||
seg_meta(12),
|
||||
seg_meta(1000),
|
||||
seg_meta(1000),
|
||||
seg_meta(1000)];
|
||||
let test_input = vec![
|
||||
seg_meta(10),
|
||||
seg_meta(11),
|
||||
seg_meta(12),
|
||||
seg_meta(1000),
|
||||
seg_meta(1000),
|
||||
seg_meta(1000),
|
||||
];
|
||||
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
|
||||
assert_eq!(result_list.len(), 2);
|
||||
}
|
||||
#[test]
|
||||
fn test_log_merge_policy_small_segments() {
|
||||
// multiple levels all get merged correctly
|
||||
let test_input = vec![seg_meta(1),
|
||||
seg_meta(1),
|
||||
seg_meta(1),
|
||||
seg_meta(2),
|
||||
seg_meta(2),
|
||||
seg_meta(2)];
|
||||
let test_input = vec![
|
||||
seg_meta(1),
|
||||
seg_meta(1),
|
||||
seg_meta(1),
|
||||
seg_meta(2),
|
||||
seg_meta(2),
|
||||
seg_meta(2),
|
||||
];
|
||||
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
|
||||
assert_eq!(result_list.len(), 1);
|
||||
}
|
||||
|
||||
@@ -28,10 +28,11 @@ pub struct IndexMerger {
|
||||
}
|
||||
|
||||
|
||||
fn compute_min_max_val(u64_reader: &U64FastFieldReader,
|
||||
max_doc: DocId,
|
||||
delete_bitset: &DeleteBitSet)
|
||||
-> Option<(u64, u64)> {
|
||||
fn compute_min_max_val(
|
||||
u64_reader: &U64FastFieldReader,
|
||||
max_doc: DocId,
|
||||
delete_bitset: &DeleteBitSet,
|
||||
) -> Option<(u64, u64)> {
|
||||
if max_doc == 0 {
|
||||
None
|
||||
} else if !delete_bitset.has_deletes() {
|
||||
@@ -49,17 +50,18 @@ fn compute_min_max_val(u64_reader: &U64FastFieldReader,
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_fieldnorm_reader(segment_reader: &SegmentReader,
|
||||
field: Field)
|
||||
-> Option<U64FastFieldReader> {
|
||||
fn extract_fieldnorm_reader(
|
||||
segment_reader: &SegmentReader,
|
||||
field: Field,
|
||||
) -> Option<U64FastFieldReader> {
|
||||
segment_reader.get_fieldnorms_reader(field)
|
||||
}
|
||||
|
||||
fn extract_fast_field_reader(segment_reader: &SegmentReader,
|
||||
field: Field)
|
||||
-> Option<U64FastFieldReader> {
|
||||
segment_reader.get_fast_field_reader(field)
|
||||
.ok()
|
||||
fn extract_fast_field_reader(
|
||||
segment_reader: &SegmentReader,
|
||||
field: Field,
|
||||
) -> Option<U64FastFieldReader> {
|
||||
segment_reader.get_fast_field_reader(field).ok()
|
||||
|
||||
}
|
||||
|
||||
@@ -100,10 +102,10 @@ impl IndexMerger {
|
||||
}
|
||||
}
|
||||
Ok(IndexMerger {
|
||||
schema: schema,
|
||||
readers: readers,
|
||||
max_doc: max_doc,
|
||||
})
|
||||
schema: schema,
|
||||
readers: readers,
|
||||
max_doc: max_doc,
|
||||
})
|
||||
}
|
||||
|
||||
fn write_fieldnorms(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
|
||||
@@ -114,9 +116,11 @@ impl IndexMerger {
|
||||
.filter(|&(_, field_entry)| field_entry.is_indexed())
|
||||
.map(|(field_id, _)| Field(field_id as u32))
|
||||
.collect();
|
||||
self.generic_write_fast_field(fieldnorm_fastfields,
|
||||
&extract_fieldnorm_reader,
|
||||
fast_field_serializer)
|
||||
self.generic_write_fast_field(
|
||||
fieldnorm_fastfields,
|
||||
&extract_fieldnorm_reader,
|
||||
fast_field_serializer,
|
||||
)
|
||||
}
|
||||
|
||||
fn write_fast_fields(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
|
||||
@@ -127,19 +131,21 @@ impl IndexMerger {
|
||||
.filter(|&(_, field_entry)| field_entry.is_int_fast())
|
||||
.map(|(field_id, _)| Field(field_id as u32))
|
||||
.collect();
|
||||
self.generic_write_fast_field(fast_fields,
|
||||
&extract_fast_field_reader,
|
||||
fast_field_serializer)
|
||||
self.generic_write_fast_field(
|
||||
fast_fields,
|
||||
&extract_fast_field_reader,
|
||||
fast_field_serializer,
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
// used both to merge field norms and regular u64 fast fields.
|
||||
fn generic_write_fast_field(&self,
|
||||
fields: Vec<Field>,
|
||||
field_reader_extractor: &Fn(&SegmentReader, Field)
|
||||
-> Option<U64FastFieldReader>,
|
||||
fast_field_serializer: &mut FastFieldSerializer)
|
||||
-> Result<()> {
|
||||
fn generic_write_fast_field(
|
||||
&self,
|
||||
fields: Vec<Field>,
|
||||
field_reader_extractor: &Fn(&SegmentReader, Field) -> Option<U64FastFieldReader>,
|
||||
fast_field_serializer: &mut FastFieldSerializer,
|
||||
) -> Result<()> {
|
||||
|
||||
for field in fields {
|
||||
|
||||
@@ -151,19 +157,25 @@ impl IndexMerger {
|
||||
match field_reader_extractor(reader, field) {
|
||||
Some(u64_reader) => {
|
||||
if let Some((seg_min_val, seg_max_val)) =
|
||||
compute_min_max_val(&u64_reader,
|
||||
reader.max_doc(),
|
||||
reader.delete_bitset()) {
|
||||
compute_min_max_val(
|
||||
&u64_reader,
|
||||
reader.max_doc(),
|
||||
reader.delete_bitset(),
|
||||
)
|
||||
{
|
||||
// the segment has some non-deleted documents
|
||||
min_val = min(min_val, seg_min_val);
|
||||
max_val = max(max_val, seg_max_val);
|
||||
u64_readers
|
||||
.push((reader.max_doc(), u64_reader, reader.delete_bitset()));
|
||||
u64_readers.push((
|
||||
reader.max_doc(),
|
||||
u64_reader,
|
||||
reader.delete_bitset(),
|
||||
));
|
||||
}
|
||||
}
|
||||
None => {
|
||||
let error_msg = format!("Failed to find a u64_reader for field {:?}",
|
||||
field);
|
||||
let error_msg =
|
||||
format!("Failed to find a u64_reader for field {:?}", field);
|
||||
error!("{}", error_msg);
|
||||
bail!(ErrorKind::SchemaError(error_msg));
|
||||
}
|
||||
@@ -179,8 +191,11 @@ impl IndexMerger {
|
||||
assert!(min_val <= max_val);
|
||||
|
||||
|
||||
let mut fast_single_field_serializer = fast_field_serializer
|
||||
.new_u64_fast_field(field, min_val, max_val)?;
|
||||
let mut fast_single_field_serializer = fast_field_serializer.new_u64_fast_field(
|
||||
field,
|
||||
min_val,
|
||||
max_val,
|
||||
)?;
|
||||
for (max_doc, u64_reader, delete_bitset) in u64_readers {
|
||||
for doc_id in 0..max_doc {
|
||||
if !delete_bitset.is_deleted(doc_id) {
|
||||
@@ -199,9 +214,8 @@ impl IndexMerger {
|
||||
|
||||
let mut delta_computer = DeltaComputer::new();
|
||||
|
||||
let mut indexed_fields = vec!();
|
||||
let mut indexed_fields = vec![];
|
||||
for (field_ord, field_entry) in self.schema.fields().iter().enumerate() {
|
||||
// if field_entry
|
||||
if field_entry.is_indexed() {
|
||||
indexed_fields.push(Field(field_ord as u32));
|
||||
}
|
||||
@@ -211,9 +225,8 @@ impl IndexMerger {
|
||||
|
||||
let field_readers = self.readers
|
||||
.iter()
|
||||
.map(|reader|
|
||||
reader.inverted_index(indexed_field))
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
.map(|reader| reader.inverted_index(indexed_field))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let field_term_streams = field_readers
|
||||
.iter()
|
||||
@@ -224,7 +237,8 @@ impl IndexMerger {
|
||||
let mut max_doc = 0;
|
||||
|
||||
// map from segment doc ids to the resulting merged segment doc id.
|
||||
let mut merged_doc_id_map: Vec<Vec<Option<DocId>>> = Vec::with_capacity(self.readers.len());
|
||||
let mut merged_doc_id_map: Vec<Vec<Option<DocId>>> =
|
||||
Vec::with_capacity(self.readers.len());
|
||||
|
||||
for reader in &self.readers {
|
||||
let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize);
|
||||
@@ -258,8 +272,10 @@ impl IndexMerger {
|
||||
let segment_postings_option = field_entry
|
||||
.field_type()
|
||||
.get_segment_postings_option()
|
||||
.expect("Encountered a field that is not supposed to be
|
||||
indexed. Have you modified the schema?");
|
||||
.expect(
|
||||
"Encountered a field that is not supposed to be
|
||||
indexed. Have you modified the schema?",
|
||||
);
|
||||
|
||||
while merged_terms.advance() {
|
||||
|
||||
@@ -273,9 +289,11 @@ impl IndexMerger {
|
||||
let segment_ord = heap_item.segment_ord;
|
||||
let term_info = heap_item.streamer.value();
|
||||
let segment_reader = &self.readers[heap_item.segment_ord];
|
||||
let inverted_index = segment_reader.inverted_index(term.field()).unwrap(); // TODO fix unwrap
|
||||
let mut segment_postings = inverted_index
|
||||
.read_postings_from_terminfo(term_info, segment_postings_option);
|
||||
let inverted_index = segment_reader.inverted_index(term.field());
|
||||
let mut segment_postings = inverted_index.read_postings_from_terminfo(
|
||||
term_info,
|
||||
segment_postings_option,
|
||||
);
|
||||
if segment_postings.advance() {
|
||||
Some((segment_ord, segment_postings))
|
||||
} else {
|
||||
@@ -304,14 +322,18 @@ impl IndexMerger {
|
||||
// `.advance()` has been called once before the loop.
|
||||
// Hence we cannot use a `while segment_postings.advance()` loop.
|
||||
if let Some(remapped_doc_id) =
|
||||
old_to_new_doc_id[segment_postings.doc() as usize] {
|
||||
old_to_new_doc_id[segment_postings.doc() as usize]
|
||||
{
|
||||
// we make sure to only write the term iff
|
||||
// there is at least one document.
|
||||
let positions: &[u32] = segment_postings.positions();
|
||||
let term_freq = segment_postings.term_freq();
|
||||
let delta_positions = delta_computer.compute_delta(positions);
|
||||
field_serializer
|
||||
.write_doc(remapped_doc_id, term_freq, delta_positions)?;
|
||||
field_serializer.write_doc(
|
||||
remapped_doc_id,
|
||||
term_freq,
|
||||
delta_positions,
|
||||
)?;
|
||||
}
|
||||
if !segment_postings.advance() {
|
||||
break;
|
||||
@@ -349,8 +371,12 @@ impl IndexMerger {
|
||||
impl SerializableSegment for IndexMerger {
|
||||
fn write(&self, mut serializer: SegmentSerializer) -> Result<u32> {
|
||||
self.write_postings(serializer.get_postings_serializer())?;
|
||||
self.write_fieldnorms(serializer.get_fieldnorms_serializer())?;
|
||||
self.write_fast_fields(serializer.get_fast_field_serializer())?;
|
||||
self.write_fieldnorms(
|
||||
serializer.get_fieldnorms_serializer(),
|
||||
)?;
|
||||
self.write_fast_fields(
|
||||
serializer.get_fast_field_serializer(),
|
||||
)?;
|
||||
self.write_storable_fields(serializer.get_store_writer())?;
|
||||
serializer.close()?;
|
||||
Ok(self.max_doc)
|
||||
@@ -429,14 +455,13 @@ mod tests {
|
||||
}
|
||||
}
|
||||
{
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
let segment_ids = index.searchable_segment_ids().expect(
|
||||
"Searchable segments failed.",
|
||||
);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
index_writer
|
||||
.merge(&segment_ids)
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
index_writer.merge(&segment_ids).wait().expect(
|
||||
"Merging failed",
|
||||
);
|
||||
index_writer.wait_merging_threads().unwrap();
|
||||
}
|
||||
{
|
||||
@@ -449,14 +474,22 @@ mod tests {
|
||||
collector.docs()
|
||||
};
|
||||
{
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
|
||||
vec![1, 2, 4]);
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
|
||||
vec![0, 3]);
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "g")]),
|
||||
vec![4]);
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
|
||||
vec![0, 1, 2, 3, 4]);
|
||||
assert_eq!(
|
||||
get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
|
||||
vec![1, 2, 4]
|
||||
);
|
||||
assert_eq!(
|
||||
get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
|
||||
vec![0, 3]
|
||||
);
|
||||
assert_eq!(
|
||||
get_doc_ids(vec![Term::from_field_text(text_field, "g")]),
|
||||
vec![4]
|
||||
);
|
||||
assert_eq!(
|
||||
get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
|
||||
vec![0, 1, 2, 3, 4]
|
||||
);
|
||||
}
|
||||
{
|
||||
let doc = searcher.doc(&DocAddress(0, 0)).unwrap();
|
||||
@@ -485,8 +518,10 @@ mod tests {
|
||||
assert!(searcher.search(&query, &mut collector).is_ok());
|
||||
collector.vals()
|
||||
};
|
||||
assert_eq!(get_fast_vals(vec![Term::from_field_text(text_field, "a")]),
|
||||
vec![5, 7, 13]);
|
||||
assert_eq!(
|
||||
get_fast_vals(vec![Term::from_field_text(text_field, "a")]),
|
||||
vec![5, 7, 13]
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -533,14 +568,22 @@ mod tests {
|
||||
assert_eq!(searcher.num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
vec![1]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
vec![1]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
vec![3]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
vec![1, 3]);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
vec![1]
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
vec![1]
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
vec![3]
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
vec![1, 3]
|
||||
);
|
||||
}
|
||||
{
|
||||
// a second commit
|
||||
@@ -572,20 +615,34 @@ mod tests {
|
||||
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
|
||||
assert_eq!(searcher.segment_readers()[1].num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[1].max_doc(), 4);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
vec![3]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
vec![3]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")),
|
||||
vec![6_000]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")),
|
||||
vec![6_000, 7_000]);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
vec![3]
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
vec![3]
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "e")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "f")),
|
||||
vec![6_000]
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "g")),
|
||||
vec![6_000, 7_000]
|
||||
);
|
||||
|
||||
let score_field_reader: U64FastFieldReader = searcher
|
||||
.segment_reader(0)
|
||||
@@ -603,33 +660,46 @@ mod tests {
|
||||
}
|
||||
{
|
||||
// merging the segments
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
index_writer
|
||||
.merge(&segment_ids)
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
let segment_ids = index.searchable_segment_ids().expect(
|
||||
"Searchable segments failed.",
|
||||
);
|
||||
index_writer.merge(&segment_ids).wait().expect(
|
||||
"Merging failed",
|
||||
);
|
||||
index.load_searchers().unwrap();
|
||||
let ref searcher = *index.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
assert_eq!(searcher.num_docs(), 3);
|
||||
assert_eq!(searcher.segment_readers()[0].num_docs(), 3);
|
||||
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
vec![3]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
vec![3]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")),
|
||||
vec![6_000]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")),
|
||||
vec![6_000, 7_000]);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
vec![3]
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
vec![3]
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "e")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "f")),
|
||||
vec![6_000]
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "g")),
|
||||
vec![6_000, 7_000]
|
||||
);
|
||||
let score_field_reader: U64FastFieldReader = searcher
|
||||
.segment_reader(0)
|
||||
.get_fast_field_reader(score_field)
|
||||
@@ -648,20 +718,34 @@ mod tests {
|
||||
assert_eq!(searcher.num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")),
|
||||
vec![6_000]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")),
|
||||
vec![6_000, 7_000]);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "e")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "f")),
|
||||
vec![6_000]
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "g")),
|
||||
vec![6_000, 7_000]
|
||||
);
|
||||
let score_field_reader: U64FastFieldReader = searcher
|
||||
.segment_reader(0)
|
||||
.get_fast_field_reader(score_field)
|
||||
@@ -671,13 +755,12 @@ mod tests {
|
||||
}
|
||||
{
|
||||
// Test merging a single segment in order to remove deletes.
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
index_writer
|
||||
.merge(&segment_ids)
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
let segment_ids = index.searchable_segment_ids().expect(
|
||||
"Searchable segments failed.",
|
||||
);
|
||||
index_writer.merge(&segment_ids).wait().expect(
|
||||
"Merging failed",
|
||||
);
|
||||
index.load_searchers().unwrap();
|
||||
|
||||
let ref searcher = *index.searcher();
|
||||
@@ -685,20 +768,34 @@ mod tests {
|
||||
assert_eq!(searcher.num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].max_doc(), 2);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")),
|
||||
vec![6_000]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")),
|
||||
vec![6_000, 7_000]);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "e")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "f")),
|
||||
vec![6_000]
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "g")),
|
||||
vec![6_000, 7_000]
|
||||
);
|
||||
let score_field_reader: U64FastFieldReader = searcher
|
||||
.segment_reader(0)
|
||||
.get_fast_field_reader(score_field)
|
||||
@@ -710,13 +807,12 @@ mod tests {
|
||||
{
|
||||
// Test removing all docs
|
||||
index_writer.delete_term(Term::from_field_text(text_field, "g"));
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
index_writer
|
||||
.merge(&segment_ids)
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
let segment_ids = index.searchable_segment_ids().expect(
|
||||
"Searchable segments failed.",
|
||||
);
|
||||
index_writer.merge(&segment_ids).wait().expect(
|
||||
"Merging failed",
|
||||
);
|
||||
index.load_searchers().unwrap();
|
||||
|
||||
let ref searcher = *index.searcher();
|
||||
|
||||
@@ -44,10 +44,11 @@ pub struct SegmentEntry {
|
||||
|
||||
impl SegmentEntry {
|
||||
/// Create a new `SegmentEntry`
|
||||
pub fn new(segment_meta: SegmentMeta,
|
||||
delete_cursor: DeleteCursor,
|
||||
delete_bitset: Option<BitSet>)
|
||||
-> SegmentEntry {
|
||||
pub fn new(
|
||||
segment_meta: SegmentMeta,
|
||||
delete_cursor: DeleteCursor,
|
||||
delete_bitset: Option<BitSet>,
|
||||
) -> SegmentEntry {
|
||||
SegmentEntry {
|
||||
meta: segment_meta,
|
||||
state: SegmentState::Ready,
|
||||
|
||||
@@ -32,31 +32,36 @@ pub struct SegmentManager {
|
||||
impl Debug for SegmentManager {
|
||||
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
|
||||
let lock = self.read();
|
||||
write!(f,
|
||||
"{{ uncommitted: {:?}, committed: {:?} }}",
|
||||
lock.uncommitted,
|
||||
lock.committed)
|
||||
write!(
|
||||
f,
|
||||
"{{ uncommitted: {:?}, committed: {:?} }}",
|
||||
lock.uncommitted,
|
||||
lock.committed
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_mergeable_segments(segment_manager: &SegmentManager)
|
||||
-> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
|
||||
pub fn get_mergeable_segments(
|
||||
segment_manager: &SegmentManager,
|
||||
) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
|
||||
let registers_lock = segment_manager.read();
|
||||
(registers_lock.committed.get_mergeable_segments(),
|
||||
registers_lock.uncommitted.get_mergeable_segments())
|
||||
(
|
||||
registers_lock.committed.get_mergeable_segments(),
|
||||
registers_lock.uncommitted.get_mergeable_segments(),
|
||||
)
|
||||
}
|
||||
|
||||
impl SegmentManager {
|
||||
pub fn from_segments(segment_metas: Vec<SegmentMeta>,
|
||||
delete_cursor: DeleteCursor)
|
||||
-> SegmentManager {
|
||||
pub fn from_segments(
|
||||
segment_metas: Vec<SegmentMeta>,
|
||||
delete_cursor: DeleteCursor,
|
||||
) -> SegmentManager {
|
||||
SegmentManager {
|
||||
registers: RwLock::new(SegmentRegisters {
|
||||
uncommitted: SegmentRegister::default(),
|
||||
committed: SegmentRegister::new(segment_metas,
|
||||
delete_cursor),
|
||||
writing: HashSet::new(),
|
||||
}),
|
||||
uncommitted: SegmentRegister::default(),
|
||||
committed: SegmentRegister::new(segment_metas, delete_cursor),
|
||||
writing: HashSet::new(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -94,25 +99,24 @@ impl SegmentManager {
|
||||
|
||||
pub fn segment_entry(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
|
||||
let registers = self.read();
|
||||
registers
|
||||
.committed
|
||||
.segment_entry(segment_id)
|
||||
.or_else(|| registers.uncommitted.segment_entry(segment_id))
|
||||
registers.committed.segment_entry(segment_id).or_else(|| {
|
||||
registers.uncommitted.segment_entry(segment_id)
|
||||
})
|
||||
}
|
||||
|
||||
// Lock poisoning should never happen :
|
||||
// The lock is acquired and released within this class,
|
||||
// and the operations cannot panic.
|
||||
fn read(&self) -> RwLockReadGuard<SegmentRegisters> {
|
||||
self.registers
|
||||
.read()
|
||||
.expect("Failed to acquire read lock on SegmentManager.")
|
||||
self.registers.read().expect(
|
||||
"Failed to acquire read lock on SegmentManager.",
|
||||
)
|
||||
}
|
||||
|
||||
fn write(&self) -> RwLockWriteGuard<SegmentRegisters> {
|
||||
self.registers
|
||||
.write()
|
||||
.expect("Failed to acquire write lock on SegmentManager.")
|
||||
self.registers.write().expect(
|
||||
"Failed to acquire write lock on SegmentManager.",
|
||||
)
|
||||
}
|
||||
|
||||
pub fn commit(&self, segment_entries: Vec<SegmentEntry>) {
|
||||
@@ -140,9 +144,11 @@ impl SegmentManager {
|
||||
}
|
||||
|
||||
|
||||
pub fn cancel_merge(&self,
|
||||
before_merge_segment_ids: &[SegmentId],
|
||||
after_merge_segment_id: SegmentId) {
|
||||
pub fn cancel_merge(
|
||||
&self,
|
||||
before_merge_segment_ids: &[SegmentId],
|
||||
after_merge_segment_id: SegmentId,
|
||||
) {
|
||||
|
||||
let mut registers_lock = self.write();
|
||||
|
||||
@@ -150,13 +156,15 @@ impl SegmentManager {
|
||||
{
|
||||
let target_segment_register: &mut SegmentRegister;
|
||||
target_segment_register = {
|
||||
if registers_lock
|
||||
.uncommitted
|
||||
.contains_all(before_merge_segment_ids) {
|
||||
if registers_lock.uncommitted.contains_all(
|
||||
before_merge_segment_ids,
|
||||
)
|
||||
{
|
||||
&mut registers_lock.uncommitted
|
||||
} else if registers_lock
|
||||
.committed
|
||||
.contains_all(before_merge_segment_ids) {
|
||||
} else if registers_lock.committed.contains_all(
|
||||
before_merge_segment_ids,
|
||||
)
|
||||
{
|
||||
&mut registers_lock.committed
|
||||
} else {
|
||||
warn!("couldn't find segment in SegmentManager");
|
||||
@@ -185,23 +193,26 @@ impl SegmentManager {
|
||||
registers_lock.uncommitted.add_segment_entry(segment_entry);
|
||||
}
|
||||
|
||||
pub fn end_merge(&self,
|
||||
before_merge_segment_ids: &[SegmentId],
|
||||
after_merge_segment_entry: SegmentEntry) {
|
||||
pub fn end_merge(
|
||||
&self,
|
||||
before_merge_segment_ids: &[SegmentId],
|
||||
after_merge_segment_entry: SegmentEntry,
|
||||
) {
|
||||
|
||||
let mut registers_lock = self.write();
|
||||
registers_lock
|
||||
.writing
|
||||
.remove(&after_merge_segment_entry.segment_id());
|
||||
registers_lock.writing.remove(&after_merge_segment_entry
|
||||
.segment_id());
|
||||
|
||||
let target_register: &mut SegmentRegister = {
|
||||
if registers_lock
|
||||
.uncommitted
|
||||
.contains_all(before_merge_segment_ids) {
|
||||
if registers_lock.uncommitted.contains_all(
|
||||
before_merge_segment_ids,
|
||||
)
|
||||
{
|
||||
&mut registers_lock.uncommitted
|
||||
} else if registers_lock
|
||||
.committed
|
||||
.contains_all(before_merge_segment_ids) {
|
||||
} else if registers_lock.committed.contains_all(
|
||||
before_merge_segment_ids,
|
||||
)
|
||||
{
|
||||
&mut registers_lock.committed
|
||||
} else {
|
||||
warn!("couldn't find segment in SegmentManager");
|
||||
|
||||
@@ -24,7 +24,12 @@ impl Debug for SegmentRegister {
|
||||
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
|
||||
try!(write!(f, "SegmentRegister("));
|
||||
for (k, v) in &self.segment_states {
|
||||
try!(write!(f, "{}:{}, ", k.short_uuid_string(), v.state().letter_code()));
|
||||
try!(write!(
|
||||
f,
|
||||
"{}:{}, ",
|
||||
k.short_uuid_string(),
|
||||
v.state().letter_code()
|
||||
));
|
||||
}
|
||||
try!(write!(f, ")"));
|
||||
Ok(())
|
||||
@@ -74,9 +79,9 @@ impl SegmentRegister {
|
||||
}
|
||||
|
||||
pub fn contains_all(&mut self, segment_ids: &[SegmentId]) -> bool {
|
||||
segment_ids
|
||||
.iter()
|
||||
.all(|segment_id| self.segment_states.contains_key(segment_id))
|
||||
segment_ids.iter().all(|segment_id| {
|
||||
self.segment_states.contains_key(segment_id)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn add_segment_entry(&mut self, segment_entry: SegmentEntry) {
|
||||
@@ -91,14 +96,18 @@ impl SegmentRegister {
|
||||
pub fn cancel_merge(&mut self, segment_id: &SegmentId) {
|
||||
self.segment_states
|
||||
.get_mut(segment_id)
|
||||
.expect("Received a merge notification for a segment that is not registered")
|
||||
.expect(
|
||||
"Received a merge notification for a segment that is not registered",
|
||||
)
|
||||
.cancel_merge();
|
||||
}
|
||||
|
||||
pub fn start_merge(&mut self, segment_id: &SegmentId) {
|
||||
self.segment_states
|
||||
.get_mut(segment_id)
|
||||
.expect("Received a merge notification for a segment that is not registered")
|
||||
.expect(
|
||||
"Received a merge notification for a segment that is not registered",
|
||||
)
|
||||
.start_merge();
|
||||
}
|
||||
|
||||
@@ -144,34 +153,42 @@ mod tests {
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
|
||||
segment_register.add_segment_entry(segment_entry);
|
||||
}
|
||||
assert_eq!(segment_register
|
||||
.segment_entry(&segment_id_a)
|
||||
.unwrap()
|
||||
.state(),
|
||||
SegmentState::Ready);
|
||||
assert_eq!(
|
||||
segment_register
|
||||
.segment_entry(&segment_id_a)
|
||||
.unwrap()
|
||||
.state(),
|
||||
SegmentState::Ready
|
||||
);
|
||||
assert_eq!(segment_ids(&segment_register), vec![segment_id_a]);
|
||||
{
|
||||
let segment_meta = SegmentMeta::new(segment_id_b);
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
|
||||
segment_register.add_segment_entry(segment_entry);
|
||||
}
|
||||
assert_eq!(segment_register
|
||||
.segment_entry(&segment_id_b)
|
||||
.unwrap()
|
||||
.state(),
|
||||
SegmentState::Ready);
|
||||
assert_eq!(
|
||||
segment_register
|
||||
.segment_entry(&segment_id_b)
|
||||
.unwrap()
|
||||
.state(),
|
||||
SegmentState::Ready
|
||||
);
|
||||
segment_register.start_merge(&segment_id_a);
|
||||
segment_register.start_merge(&segment_id_b);
|
||||
assert_eq!(segment_register
|
||||
.segment_entry(&segment_id_a)
|
||||
.unwrap()
|
||||
.state(),
|
||||
SegmentState::InMerge);
|
||||
assert_eq!(segment_register
|
||||
.segment_entry(&segment_id_b)
|
||||
.unwrap()
|
||||
.state(),
|
||||
SegmentState::InMerge);
|
||||
assert_eq!(
|
||||
segment_register
|
||||
.segment_entry(&segment_id_a)
|
||||
.unwrap()
|
||||
.state(),
|
||||
SegmentState::InMerge
|
||||
);
|
||||
assert_eq!(
|
||||
segment_register
|
||||
.segment_entry(&segment_id_b)
|
||||
.unwrap()
|
||||
.state(),
|
||||
SegmentState::InMerge
|
||||
);
|
||||
segment_register.remove_segment(&segment_id_a);
|
||||
segment_register.remove_segment(&segment_id_b);
|
||||
{
|
||||
|
||||
@@ -28,11 +28,11 @@ impl SegmentSerializer {
|
||||
|
||||
let postings_serializer = try!(InvertedIndexSerializer::open(segment));
|
||||
Ok(SegmentSerializer {
|
||||
postings_serializer: postings_serializer,
|
||||
store_writer: StoreWriter::new(store_write),
|
||||
fast_field_serializer: fast_field_serializer,
|
||||
fieldnorms_serializer: fieldnorms_serializer,
|
||||
})
|
||||
postings_serializer: postings_serializer,
|
||||
store_writer: StoreWriter::new(store_write),
|
||||
fast_field_serializer: fast_field_serializer,
|
||||
fieldnorms_serializer: fieldnorms_serializer,
|
||||
})
|
||||
}
|
||||
|
||||
/// Accessor to the `PostingsSerializer`.
|
||||
|
||||
@@ -61,11 +61,12 @@ pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) -
|
||||
/// and flushed.
|
||||
///
|
||||
/// This method is not part of tantivy's public API
|
||||
pub fn save_metas(segment_metas: Vec<SegmentMeta>,
|
||||
schema: Schema,
|
||||
opstamp: u64,
|
||||
directory: &mut Directory)
|
||||
-> Result<()> {
|
||||
pub fn save_metas(
|
||||
segment_metas: Vec<SegmentMeta>,
|
||||
schema: Schema,
|
||||
opstamp: u64,
|
||||
directory: &mut Directory,
|
||||
) -> Result<()> {
|
||||
let metas = IndexMeta {
|
||||
segments: segment_metas,
|
||||
schema: schema,
|
||||
@@ -89,11 +90,12 @@ pub struct SegmentUpdater(Arc<InnerSegmentUpdater>);
|
||||
|
||||
|
||||
|
||||
fn perform_merge(segment_ids: &[SegmentId],
|
||||
segment_updater: &SegmentUpdater,
|
||||
mut merged_segment: Segment,
|
||||
target_opstamp: u64)
|
||||
-> Result<SegmentEntry> {
|
||||
fn perform_merge(
|
||||
segment_ids: &[SegmentId],
|
||||
segment_updater: &SegmentUpdater,
|
||||
mut merged_segment: Segment,
|
||||
target_opstamp: u64,
|
||||
) -> Result<SegmentEntry> {
|
||||
// first we need to apply deletes to our segment.
|
||||
info!("Start merge: {:?}", segment_ids);
|
||||
|
||||
@@ -105,17 +107,21 @@ fn perform_merge(segment_ids: &[SegmentId],
|
||||
|
||||
for segment_id in segment_ids {
|
||||
if let Some(mut segment_entry) =
|
||||
segment_updater.0.segment_manager.segment_entry(segment_id) {
|
||||
segment_updater.0.segment_manager.segment_entry(segment_id)
|
||||
{
|
||||
let segment = index.segment(segment_entry.meta().clone());
|
||||
if let Some(file_protection) =
|
||||
advance_deletes(segment, &mut segment_entry, target_opstamp)? {
|
||||
advance_deletes(segment, &mut segment_entry, target_opstamp)?
|
||||
{
|
||||
file_protections.push(file_protection);
|
||||
}
|
||||
segment_entries.push(segment_entry);
|
||||
} else {
|
||||
error!("Error, had to abort merge as some of the segment is not managed anymore.");
|
||||
let msg = format!("Segment {:?} requested for merge is not managed.",
|
||||
segment_id);
|
||||
let msg = format!(
|
||||
"Segment {:?} requested for merge is not managed.",
|
||||
segment_id
|
||||
);
|
||||
bail!(ErrorKind::InvalidArgument(msg));
|
||||
}
|
||||
}
|
||||
@@ -134,12 +140,13 @@ fn perform_merge(segment_ids: &[SegmentId],
|
||||
// ... we just serialize this index merger in our new segment
|
||||
// to merge the two segments.
|
||||
|
||||
let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment)
|
||||
.expect("Creating index serializer failed");
|
||||
let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment).expect(
|
||||
"Creating index serializer failed",
|
||||
);
|
||||
|
||||
let num_docs = merger
|
||||
.write(segment_serializer)
|
||||
.expect("Serializing merged index failed");
|
||||
let num_docs = merger.write(segment_serializer).expect(
|
||||
"Serializing merged index failed",
|
||||
);
|
||||
let mut segment_meta = SegmentMeta::new(merged_segment.id());
|
||||
segment_meta.set_max_doc(num_docs);
|
||||
|
||||
@@ -161,23 +168,24 @@ struct InnerSegmentUpdater {
|
||||
}
|
||||
|
||||
impl SegmentUpdater {
|
||||
pub fn new(index: Index,
|
||||
stamper: Stamper,
|
||||
delete_cursor: DeleteCursor)
|
||||
-> Result<SegmentUpdater> {
|
||||
pub fn new(
|
||||
index: Index,
|
||||
stamper: Stamper,
|
||||
delete_cursor: DeleteCursor,
|
||||
) -> Result<SegmentUpdater> {
|
||||
let segments = index.searchable_segment_metas()?;
|
||||
let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
|
||||
Ok(SegmentUpdater(Arc::new(InnerSegmentUpdater {
|
||||
pool: CpuPool::new(1),
|
||||
index: index,
|
||||
segment_manager: segment_manager,
|
||||
merge_policy: RwLock::new(box DefaultMergePolicy::default()),
|
||||
merging_thread_id: AtomicUsize::default(),
|
||||
merging_threads: RwLock::new(HashMap::new()),
|
||||
generation: AtomicUsize::default(),
|
||||
killed: AtomicBool::new(false),
|
||||
stamper: stamper,
|
||||
})))
|
||||
pool: CpuPool::new(1),
|
||||
index: index,
|
||||
segment_manager: segment_manager,
|
||||
merge_policy: RwLock::new(box DefaultMergePolicy::default()),
|
||||
merging_thread_id: AtomicUsize::default(),
|
||||
merging_threads: RwLock::new(HashMap::new()),
|
||||
generation: AtomicUsize::default(),
|
||||
killed: AtomicBool::new(false),
|
||||
stamper: stamper,
|
||||
})))
|
||||
}
|
||||
|
||||
pub fn new_segment(&self) -> Segment {
|
||||
@@ -199,10 +207,10 @@ impl SegmentUpdater {
|
||||
self.0.merging_thread_id.fetch_add(1, Ordering::SeqCst)
|
||||
}
|
||||
|
||||
fn run_async<T: 'static + Send, F: 'static + Send + FnOnce(SegmentUpdater) -> T>
|
||||
(&self,
|
||||
f: F)
|
||||
-> CpuFuture<T, Error> {
|
||||
fn run_async<T: 'static + Send, F: 'static + Send + FnOnce(SegmentUpdater) -> T>(
|
||||
&self,
|
||||
f: F,
|
||||
) -> CpuFuture<T, Error> {
|
||||
let me_clone = self.clone();
|
||||
self.0.pool.spawn_fn(move || Ok(f(me_clone)))
|
||||
}
|
||||
@@ -211,11 +219,10 @@ impl SegmentUpdater {
|
||||
pub fn add_segment(&self, generation: usize, segment_entry: SegmentEntry) -> bool {
|
||||
if generation >= self.0.generation.load(Ordering::Acquire) {
|
||||
self.run_async(|segment_updater| {
|
||||
segment_updater.0.segment_manager.add_segment(segment_entry);
|
||||
segment_updater.consider_merge_options();
|
||||
true
|
||||
})
|
||||
.forget();
|
||||
segment_updater.0.segment_manager.add_segment(segment_entry);
|
||||
segment_updater.consider_merge_options();
|
||||
true
|
||||
}).forget();
|
||||
true
|
||||
} else {
|
||||
false
|
||||
@@ -249,46 +256,46 @@ impl SegmentUpdater {
|
||||
if self.is_alive() {
|
||||
let index = &self.0.index;
|
||||
let directory = index.directory();
|
||||
save_metas(self.0.segment_manager.committed_segment_metas(),
|
||||
index.schema(),
|
||||
opstamp,
|
||||
directory.box_clone().borrow_mut())
|
||||
.expect("Could not save metas.");
|
||||
save_metas(
|
||||
self.0.segment_manager.committed_segment_metas(),
|
||||
index.schema(),
|
||||
opstamp,
|
||||
directory.box_clone().borrow_mut(),
|
||||
).expect("Could not save metas.");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn garbage_collect_files(&self) -> Result<()> {
|
||||
self.run_async(move |segment_updater| {
|
||||
segment_updater.garbage_collect_files_exec();
|
||||
})
|
||||
.wait()
|
||||
}).wait()
|
||||
}
|
||||
|
||||
fn garbage_collect_files_exec(&self) {
|
||||
info!("Running garbage collection");
|
||||
let mut index = self.0.index.clone();
|
||||
index.directory_mut().garbage_collect(|| {
|
||||
self.0.segment_manager.list_files()
|
||||
});
|
||||
index.directory_mut().garbage_collect(
|
||||
|| self.0.segment_manager.list_files(),
|
||||
);
|
||||
}
|
||||
|
||||
pub fn commit(&self, opstamp: u64) -> Result<()> {
|
||||
self.run_async(move |segment_updater| if segment_updater.is_alive() {
|
||||
let segment_entries = segment_updater
|
||||
.purge_deletes(opstamp)
|
||||
.expect("Failed purge deletes");
|
||||
segment_updater.0.segment_manager.commit(segment_entries);
|
||||
segment_updater.save_metas(opstamp);
|
||||
segment_updater.garbage_collect_files_exec();
|
||||
segment_updater.consider_merge_options();
|
||||
})
|
||||
.wait()
|
||||
let segment_entries = segment_updater.purge_deletes(opstamp).expect(
|
||||
"Failed purge deletes",
|
||||
);
|
||||
segment_updater.0.segment_manager.commit(segment_entries);
|
||||
segment_updater.save_metas(opstamp);
|
||||
segment_updater.garbage_collect_files_exec();
|
||||
segment_updater.consider_merge_options();
|
||||
}).wait()
|
||||
}
|
||||
|
||||
|
||||
pub fn start_merge(&self,
|
||||
segment_ids: &[SegmentId])
|
||||
-> impl Future<Item = SegmentMeta, Error = Canceled> {
|
||||
pub fn start_merge(
|
||||
&self,
|
||||
segment_ids: &[SegmentId],
|
||||
) -> impl Future<Item = SegmentMeta, Error = Canceled> {
|
||||
|
||||
self.0.segment_manager.start_merge(segment_ids);
|
||||
let segment_updater_clone = self.clone();
|
||||
@@ -308,10 +315,12 @@ impl SegmentUpdater {
|
||||
// first we need to apply deletes to our segment.
|
||||
let merged_segment = segment_updater_clone.new_segment();
|
||||
let merged_segment_id = merged_segment.id();
|
||||
let merge_result = perform_merge(&segment_ids_vec,
|
||||
&segment_updater_clone,
|
||||
merged_segment,
|
||||
target_opstamp);
|
||||
let merge_result = perform_merge(
|
||||
&segment_ids_vec,
|
||||
&segment_updater_clone,
|
||||
merged_segment,
|
||||
target_opstamp,
|
||||
);
|
||||
|
||||
match merge_result {
|
||||
Ok(after_merge_segment_entry) => {
|
||||
@@ -345,11 +354,10 @@ impl SegmentUpdater {
|
||||
.remove(&merging_thread_id);
|
||||
Ok(())
|
||||
});
|
||||
self.0
|
||||
.merging_threads
|
||||
.write()
|
||||
.unwrap()
|
||||
.insert(merging_thread_id, merging_join_handle);
|
||||
self.0.merging_threads.write().unwrap().insert(
|
||||
merging_thread_id,
|
||||
merging_join_handle,
|
||||
);
|
||||
merging_future_recv
|
||||
}
|
||||
|
||||
@@ -368,19 +376,23 @@ impl SegmentUpdater {
|
||||
}
|
||||
}
|
||||
|
||||
fn cancel_merge(&self,
|
||||
before_merge_segment_ids: &[SegmentId],
|
||||
after_merge_segment_entry: SegmentId) {
|
||||
self.0
|
||||
.segment_manager
|
||||
.cancel_merge(before_merge_segment_ids, after_merge_segment_entry);
|
||||
fn cancel_merge(
|
||||
&self,
|
||||
before_merge_segment_ids: &[SegmentId],
|
||||
after_merge_segment_entry: SegmentId,
|
||||
) {
|
||||
self.0.segment_manager.cancel_merge(
|
||||
before_merge_segment_ids,
|
||||
after_merge_segment_entry,
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
fn end_merge(&self,
|
||||
before_merge_segment_ids: Vec<SegmentId>,
|
||||
mut after_merge_segment_entry: SegmentEntry)
|
||||
-> Result<()> {
|
||||
fn end_merge(
|
||||
&self,
|
||||
before_merge_segment_ids: Vec<SegmentId>,
|
||||
mut after_merge_segment_entry: SegmentEntry,
|
||||
) -> Result<()> {
|
||||
|
||||
self.run_async(move |segment_updater| {
|
||||
info!("End merge {:?}", after_merge_segment_entry.meta());
|
||||
@@ -391,28 +403,37 @@ impl SegmentUpdater {
|
||||
if delete_operation.opstamp < committed_opstamp {
|
||||
let index = &segment_updater.0.index;
|
||||
let segment = index.segment(after_merge_segment_entry.meta().clone());
|
||||
match advance_deletes(segment,
|
||||
&mut after_merge_segment_entry,
|
||||
committed_opstamp) {
|
||||
match advance_deletes(
|
||||
segment,
|
||||
&mut after_merge_segment_entry,
|
||||
committed_opstamp,
|
||||
) {
|
||||
Ok(file_protection_opt_res) => {
|
||||
_file_protection_opt = file_protection_opt_res;
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Merge of {:?} was cancelled (advancing deletes failed): {:?}",
|
||||
before_merge_segment_ids, e);
|
||||
error!(
|
||||
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
|
||||
before_merge_segment_ids,
|
||||
e
|
||||
);
|
||||
// ... cancel merge
|
||||
if cfg!(test) {
|
||||
panic!("Merge failed.");
|
||||
}
|
||||
segment_updater.cancel_merge(&before_merge_segment_ids,
|
||||
after_merge_segment_entry.segment_id());
|
||||
segment_updater.cancel_merge(
|
||||
&before_merge_segment_ids,
|
||||
after_merge_segment_entry.segment_id(),
|
||||
);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
segment_updater.0.segment_manager.end_merge(&before_merge_segment_ids,
|
||||
after_merge_segment_entry);
|
||||
segment_updater.0.segment_manager.end_merge(
|
||||
&before_merge_segment_ids,
|
||||
after_merge_segment_entry,
|
||||
);
|
||||
segment_updater.consider_merge_options();
|
||||
info!("save metas");
|
||||
segment_updater.save_metas(segment_updater.0.index.opstamp());
|
||||
@@ -450,10 +471,9 @@ impl SegmentUpdater {
|
||||
}
|
||||
debug!("wait merging thread {}", new_merging_threads.len());
|
||||
for (_, merging_thread_handle) in new_merging_threads {
|
||||
merging_thread_handle
|
||||
.join()
|
||||
.map(|_| ())
|
||||
.map_err(|_| ErrorKind::ErrorInThread("Merging thread failed.".into()))?;
|
||||
merging_thread_handle.join().map(|_| ()).map_err(|_| {
|
||||
ErrorKind::ErrorInThread("Merging thread failed.".into())
|
||||
})?;
|
||||
}
|
||||
// Our merging thread may have queued their completed
|
||||
self.run_async(move |_| {}).wait()?;
|
||||
@@ -522,9 +542,9 @@ mod tests {
|
||||
assert_eq!(index.searcher().num_docs(), 302);
|
||||
|
||||
{
|
||||
index_writer
|
||||
.wait_merging_threads()
|
||||
.expect("waiting for merging threads");
|
||||
index_writer.wait_merging_threads().expect(
|
||||
"waiting for merging threads",
|
||||
);
|
||||
}
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
|
||||
@@ -54,22 +54,23 @@ impl<'a> SegmentWriter<'a> {
|
||||
/// the flushing behavior as a buffer limit
|
||||
/// - segment: The segment being written
|
||||
/// - schema
|
||||
pub fn for_segment(heap: &'a Heap,
|
||||
table_bits: usize,
|
||||
mut segment: Segment,
|
||||
schema: &Schema)
|
||||
-> Result<SegmentWriter<'a>> {
|
||||
pub fn for_segment(
|
||||
heap: &'a Heap,
|
||||
table_bits: usize,
|
||||
mut segment: Segment,
|
||||
schema: &Schema,
|
||||
) -> Result<SegmentWriter<'a>> {
|
||||
let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
|
||||
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits, heap);
|
||||
Ok(SegmentWriter {
|
||||
heap: heap,
|
||||
max_doc: 0,
|
||||
multifield_postings: multifield_postings,
|
||||
fieldnorms_writer: create_fieldnorms_writer(schema),
|
||||
segment_serializer: segment_serializer,
|
||||
fast_field_writers: FastFieldsWriter::from_schema(schema),
|
||||
doc_opstamps: Vec::with_capacity(1_000),
|
||||
})
|
||||
heap: heap,
|
||||
max_doc: 0,
|
||||
multifield_postings: multifield_postings,
|
||||
fieldnorms_writer: create_fieldnorms_writer(schema),
|
||||
segment_serializer: segment_serializer,
|
||||
fast_field_writers: FastFieldsWriter::from_schema(schema),
|
||||
doc_opstamps: Vec::with_capacity(1_000),
|
||||
})
|
||||
}
|
||||
|
||||
/// Lay on disk the current content of the `SegmentWriter`
|
||||
@@ -77,10 +78,12 @@ impl<'a> SegmentWriter<'a> {
|
||||
/// Finalize consumes the `SegmentWriter`, so that it cannot
|
||||
/// be used afterwards.
|
||||
pub fn finalize(self) -> Result<Vec<u64>> {
|
||||
write(&self.multifield_postings,
|
||||
&self.fast_field_writers,
|
||||
&self.fieldnorms_writer,
|
||||
self.segment_serializer)?;
|
||||
write(
|
||||
&self.multifield_postings,
|
||||
&self.fast_field_writers,
|
||||
&self.fieldnorms_writer,
|
||||
self.segment_serializer,
|
||||
)?;
|
||||
Ok(self.doc_opstamps)
|
||||
}
|
||||
|
||||
@@ -107,10 +110,11 @@ impl<'a> SegmentWriter<'a> {
|
||||
/// Indexes a new document
|
||||
///
|
||||
/// As a user, you should rather use `IndexWriter`'s add_document.
|
||||
pub fn add_document(&mut self,
|
||||
add_operation: &AddOperation,
|
||||
schema: &Schema)
|
||||
-> io::Result<()> {
|
||||
pub fn add_document(
|
||||
&mut self,
|
||||
add_operation: &AddOperation,
|
||||
schema: &Schema,
|
||||
) -> io::Result<()> {
|
||||
let doc_id = self.max_doc;
|
||||
let doc = &add_operation.document;
|
||||
self.doc_opstamps.push(add_operation.opstamp);
|
||||
@@ -122,8 +126,11 @@ impl<'a> SegmentWriter<'a> {
|
||||
match *field_options.field_type() {
|
||||
FieldType::Str(ref text_options) => {
|
||||
let num_tokens: u32 = if text_options.get_indexing_options().is_tokenized() {
|
||||
self.multifield_postings
|
||||
.index_text(doc_id, field, &field_values)
|
||||
self.multifield_postings.index_text(
|
||||
doc_id,
|
||||
field,
|
||||
&field_values,
|
||||
)
|
||||
} else {
|
||||
let num_field_values = field_values.len() as u32;
|
||||
for field_value in field_values {
|
||||
@@ -132,15 +139,17 @@ impl<'a> SegmentWriter<'a> {
|
||||
}
|
||||
num_field_values
|
||||
};
|
||||
self.fieldnorms_writer
|
||||
.get_field_writer(field)
|
||||
.map(|field_norms_writer| field_norms_writer.add_val(num_tokens as u64));
|
||||
self.fieldnorms_writer.get_field_writer(field).map(
|
||||
|field_norms_writer| field_norms_writer.add_val(num_tokens as u64),
|
||||
);
|
||||
}
|
||||
FieldType::U64(ref int_option) => {
|
||||
if int_option.is_indexed() {
|
||||
for field_value in field_values {
|
||||
let term = Term::from_field_u64(field_value.field(),
|
||||
field_value.value().u64_value());
|
||||
let term = Term::from_field_u64(
|
||||
field_value.field(),
|
||||
field_value.value().u64_value(),
|
||||
);
|
||||
self.multifield_postings.suscribe(doc_id, &term);
|
||||
}
|
||||
}
|
||||
@@ -148,8 +157,10 @@ impl<'a> SegmentWriter<'a> {
|
||||
FieldType::I64(ref int_option) => {
|
||||
if int_option.is_indexed() {
|
||||
for field_value in field_values {
|
||||
let term = Term::from_field_i64(field_value.field(),
|
||||
field_value.value().i64_value());
|
||||
let term = Term::from_field_i64(
|
||||
field_value.field(),
|
||||
field_value.value().i64_value(),
|
||||
);
|
||||
self.multifield_postings.suscribe(doc_id, &term);
|
||||
}
|
||||
}
|
||||
@@ -160,7 +171,9 @@ impl<'a> SegmentWriter<'a> {
|
||||
self.fast_field_writers.add_document(doc);
|
||||
let stored_fieldvalues: Vec<&FieldValue> = doc.field_values()
|
||||
.iter()
|
||||
.filter(|field_value| schema.get_field_entry(field_value.field()).is_stored())
|
||||
.filter(|field_value| {
|
||||
schema.get_field_entry(field_value.field()).is_stored()
|
||||
})
|
||||
.collect();
|
||||
let doc_writer = self.segment_serializer.get_store_writer();
|
||||
try!(doc_writer.store(&stored_fieldvalues));
|
||||
@@ -191,15 +204,22 @@ impl<'a> SegmentWriter<'a> {
|
||||
}
|
||||
|
||||
// This method is used as a trick to workaround the borrow checker
|
||||
fn write(multifield_postings: &MultiFieldPostingsWriter,
|
||||
fast_field_writers: &FastFieldsWriter,
|
||||
fieldnorms_writer: &FastFieldsWriter,
|
||||
mut serializer: SegmentSerializer)
|
||||
-> Result<()> {
|
||||
fn write(
|
||||
multifield_postings: &MultiFieldPostingsWriter,
|
||||
fast_field_writers: &FastFieldsWriter,
|
||||
fieldnorms_writer: &FastFieldsWriter,
|
||||
mut serializer: SegmentSerializer,
|
||||
) -> Result<()> {
|
||||
|
||||
try!(multifield_postings.serialize(serializer.get_postings_serializer()));
|
||||
try!(fast_field_writers.serialize(serializer.get_fast_field_serializer()));
|
||||
try!(fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer()));
|
||||
try!(multifield_postings.serialize(
|
||||
serializer.get_postings_serializer(),
|
||||
));
|
||||
try!(fast_field_writers.serialize(
|
||||
serializer.get_fast_field_serializer(),
|
||||
));
|
||||
try!(fieldnorms_writer.serialize(
|
||||
serializer.get_fieldnorms_serializer(),
|
||||
));
|
||||
try!(serializer.close());
|
||||
|
||||
Ok(())
|
||||
@@ -208,10 +228,12 @@ fn write(multifield_postings: &MultiFieldPostingsWriter,
|
||||
impl<'a> SerializableSegment for SegmentWriter<'a> {
|
||||
fn write(&self, serializer: SegmentSerializer) -> Result<u32> {
|
||||
let max_doc = self.max_doc;
|
||||
write(&self.multifield_postings,
|
||||
&self.fast_field_writers,
|
||||
&self.fieldnorms_writer,
|
||||
serializer)?;
|
||||
write(
|
||||
&self.multifield_postings,
|
||||
&self.fast_field_writers,
|
||||
&self.fieldnorms_writer,
|
||||
serializer,
|
||||
)?;
|
||||
Ok(max_doc)
|
||||
}
|
||||
}
|
||||
|
||||
113
src/lib.rs
113
src/lib.rs
@@ -68,7 +68,7 @@ extern crate stable_deref_trait;
|
||||
#[cfg(test)]
|
||||
extern crate env_logger;
|
||||
|
||||
#[cfg(feature="simdcompression")]
|
||||
#[cfg(feature = "simdcompression")]
|
||||
extern crate libc;
|
||||
|
||||
#[cfg(windows)]
|
||||
@@ -391,16 +391,24 @@ mod tests {
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let inverted_index = reader.inverted_index(text_field).unwrap();
|
||||
assert!(inverted_index.read_postings(&term_abcd, FreqAndPositions).is_none());
|
||||
let inverted_index = reader.inverted_index(text_field);
|
||||
assert!(
|
||||
inverted_index
|
||||
.read_postings(&term_abcd, FreqAndPositions)
|
||||
.is_none()
|
||||
);
|
||||
{
|
||||
let mut postings = inverted_index.read_postings(&term_a, FreqAndPositions).unwrap();
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_a, FreqAndPositions)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 5);
|
||||
assert!(!postings.advance());
|
||||
}
|
||||
{
|
||||
let mut postings = inverted_index.read_postings(&term_b, FreqAndPositions).unwrap();
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_b, FreqAndPositions)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 3);
|
||||
assert!(postings.advance());
|
||||
@@ -426,17 +434,25 @@ mod tests {
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let inverted_index = reader.inverted_index(term_abcd.field()).unwrap();
|
||||
let inverted_index = reader.inverted_index(term_abcd.field());
|
||||
|
||||
assert!(inverted_index.read_postings(&term_abcd, FreqAndPositions).is_none());
|
||||
assert!(
|
||||
inverted_index
|
||||
.read_postings(&term_abcd, FreqAndPositions)
|
||||
.is_none()
|
||||
);
|
||||
{
|
||||
let mut postings = inverted_index.read_postings(&term_a, FreqAndPositions).unwrap();
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_a, FreqAndPositions)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 5);
|
||||
assert!(!postings.advance());
|
||||
}
|
||||
{
|
||||
let mut postings = inverted_index.read_postings(&term_b, FreqAndPositions).unwrap();
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_b, FreqAndPositions)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 3);
|
||||
assert!(postings.advance());
|
||||
@@ -462,14 +478,22 @@ mod tests {
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let inverted_index = reader.inverted_index(term_abcd.field()).unwrap();
|
||||
assert!(inverted_index.read_postings(&term_abcd, FreqAndPositions).is_none());
|
||||
let inverted_index = reader.inverted_index(term_abcd.field());
|
||||
assert!(
|
||||
inverted_index
|
||||
.read_postings(&term_abcd, FreqAndPositions)
|
||||
.is_none()
|
||||
);
|
||||
{
|
||||
let mut postings = inverted_index.read_postings(&term_a, FreqAndPositions).unwrap();
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_a, FreqAndPositions)
|
||||
.unwrap();
|
||||
assert!(!postings.advance());
|
||||
}
|
||||
{
|
||||
let mut postings = inverted_index.read_postings(&term_b, FreqAndPositions).unwrap();
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_b, FreqAndPositions)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 3);
|
||||
assert!(postings.advance());
|
||||
@@ -477,7 +501,9 @@ mod tests {
|
||||
assert!(!postings.advance());
|
||||
}
|
||||
{
|
||||
let mut postings = inverted_index.read_postings(&term_c, FreqAndPositions).unwrap();
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_c, FreqAndPositions)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 4);
|
||||
assert!(!postings.advance());
|
||||
@@ -501,7 +527,7 @@ mod tests {
|
||||
let term = Term::from_field_u64(field, 1u64);
|
||||
let mut postings = searcher
|
||||
.segment_reader(0)
|
||||
.inverted_index(term.field()).unwrap()
|
||||
.inverted_index(term.field())
|
||||
.read_postings(&term, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
@@ -525,7 +551,7 @@ mod tests {
|
||||
let term = Term::from_field_i64(value_field, negative_val);
|
||||
let mut postings = searcher
|
||||
.segment_reader(0)
|
||||
.inverted_index(term.field()).unwrap()
|
||||
.inverted_index(term.field())
|
||||
.read_postings(&term, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
@@ -588,11 +614,17 @@ mod tests {
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let inverted_index = reader.inverted_index(text_field).unwrap();
|
||||
let inverted_index = reader.inverted_index(text_field);
|
||||
let term_abcd = Term::from_field_text(text_field, "abcd");
|
||||
assert!(inverted_index.read_postings(&term_abcd, FreqAndPositions).is_none());
|
||||
assert!(
|
||||
inverted_index
|
||||
.read_postings(&term_abcd, FreqAndPositions)
|
||||
.is_none()
|
||||
);
|
||||
let term_af = Term::from_field_text(text_field, "af");
|
||||
let mut postings = inverted_index.read_postings(&term_af, FreqAndPositions).unwrap();
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_af, FreqAndPositions)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 0);
|
||||
assert_eq!(postings.term_freq(), 3);
|
||||
@@ -634,29 +666,43 @@ mod tests {
|
||||
collector.docs()
|
||||
};
|
||||
{
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
|
||||
vec![1, 2]);
|
||||
assert_eq!(
|
||||
get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
|
||||
vec![1, 2]
|
||||
);
|
||||
}
|
||||
{
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
|
||||
vec![0]);
|
||||
assert_eq!(
|
||||
get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
|
||||
vec![0]
|
||||
);
|
||||
}
|
||||
{
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
|
||||
vec![0, 1, 2]);
|
||||
assert_eq!(
|
||||
get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
|
||||
vec![0, 1, 2]
|
||||
);
|
||||
}
|
||||
{
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "c")]),
|
||||
vec![1, 2]);
|
||||
assert_eq!(
|
||||
get_doc_ids(vec![Term::from_field_text(text_field, "c")]),
|
||||
vec![1, 2]
|
||||
);
|
||||
}
|
||||
{
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "d")]),
|
||||
vec![2]);
|
||||
assert_eq!(
|
||||
get_doc_ids(vec![Term::from_field_text(text_field, "d")]),
|
||||
vec![2]
|
||||
);
|
||||
}
|
||||
{
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b"),
|
||||
Term::from_field_text(text_field, "a")]),
|
||||
vec![0, 1, 2]);
|
||||
assert_eq!(
|
||||
get_doc_ids(vec![
|
||||
Term::from_field_text(text_field, "b"),
|
||||
Term::from_field_text(text_field, "a"),
|
||||
]),
|
||||
vec![0, 1, 2]
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -693,7 +739,8 @@ mod tests {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let other_text_field = schema_builder.add_text_field("text2", TEXT);
|
||||
let document = doc!(text_field => "tantivy",
|
||||
let document =
|
||||
doc!(text_field => "tantivy",
|
||||
text_field => "some other value",
|
||||
other_text_field => "short");
|
||||
assert_eq!(document.len(), 3);
|
||||
|
||||
@@ -72,8 +72,7 @@ pub trait DocSet {
|
||||
for (i, buffer_val) in buffer.iter_mut().enumerate() {
|
||||
if self.advance() {
|
||||
*buffer_val = self.doc();
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -65,7 +65,9 @@ mod tests {
|
||||
field_serializer.new_term("abc".as_bytes()).unwrap();
|
||||
for doc_id in 0u32..120u32 {
|
||||
let delta_positions = vec![1, 2, 3, 2];
|
||||
field_serializer.write_doc(doc_id, 2, &delta_positions).unwrap();
|
||||
field_serializer
|
||||
.write_doc(doc_id, 2, &delta_positions)
|
||||
.unwrap();
|
||||
}
|
||||
field_serializer.close_term().unwrap();
|
||||
}
|
||||
@@ -84,8 +86,8 @@ mod tests {
|
||||
|
||||
let heap = Heap::with_capacity(10_000_000);
|
||||
{
|
||||
let mut segment_writer = SegmentWriter::for_segment(&heap, 18, segment.clone(), &schema)
|
||||
.unwrap();
|
||||
let mut segment_writer =
|
||||
SegmentWriter::for_segment(&heap, 18, segment.clone(), &schema).unwrap();
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
// checking that position works if the field has two values
|
||||
@@ -131,15 +133,17 @@ mod tests {
|
||||
}
|
||||
{
|
||||
let term_a = Term::from_field_text(text_field, "abcdef");
|
||||
assert!(segment_reader
|
||||
.inverted_index(term_a.field()).unwrap()
|
||||
assert!(
|
||||
segment_reader
|
||||
.inverted_index(term_a.field())
|
||||
.read_postings(&term_a, FreqAndPositions)
|
||||
.is_none());
|
||||
.is_none()
|
||||
);
|
||||
}
|
||||
{
|
||||
let term_a = Term::from_field_text(text_field, "a");
|
||||
let mut postings_a = segment_reader
|
||||
.inverted_index(term_a.field()).unwrap()
|
||||
.inverted_index(term_a.field())
|
||||
.read_postings(&term_a, FreqAndPositions)
|
||||
.unwrap();
|
||||
assert_eq!(postings_a.len(), 1000);
|
||||
@@ -162,7 +166,7 @@ mod tests {
|
||||
{
|
||||
let term_e = Term::from_field_text(text_field, "e");
|
||||
let mut postings_e = segment_reader
|
||||
.inverted_index(term_e.field()).unwrap()
|
||||
.inverted_index(term_e.field())
|
||||
.read_postings(&term_e, FreqAndPositions)
|
||||
.unwrap();
|
||||
assert_eq!(postings_e.len(), 1000 - 2);
|
||||
@@ -202,8 +206,10 @@ mod tests {
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let term_query = TermQuery::new(Term::from_field_text(text_field, "a"),
|
||||
SegmentPostingsOption::NoFreq);
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, "a"),
|
||||
SegmentPostingsOption::NoFreq,
|
||||
);
|
||||
let searcher = index.searcher();
|
||||
let mut term_weight = term_query.specialized_weight(&*searcher);
|
||||
term_weight.segment_postings_options = SegmentPostingsOption::FreqAndPositions;
|
||||
@@ -250,7 +256,7 @@ mod tests {
|
||||
for i in 0..num_docs - 1 {
|
||||
for j in i + 1..num_docs {
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(term_2.field()).unwrap()
|
||||
.inverted_index(term_2.field())
|
||||
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
|
||||
@@ -264,7 +270,7 @@ mod tests {
|
||||
|
||||
{
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(term_2.field()).unwrap()
|
||||
.inverted_index(term_2.field())
|
||||
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
|
||||
@@ -285,7 +291,7 @@ mod tests {
|
||||
// check that filtering works
|
||||
{
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(term_0.field()).unwrap()
|
||||
.inverted_index(term_0.field())
|
||||
.read_postings(&term_0, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
|
||||
@@ -295,7 +301,7 @@ mod tests {
|
||||
}
|
||||
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(term_0.field()).unwrap()
|
||||
.inverted_index(term_0.field())
|
||||
.read_postings(&term_0, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
|
||||
@@ -320,7 +326,7 @@ mod tests {
|
||||
// make sure seeking still works
|
||||
for i in 0..num_docs {
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(term_2.field()).unwrap()
|
||||
.inverted_index(term_2.field())
|
||||
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
|
||||
@@ -336,7 +342,7 @@ mod tests {
|
||||
// now try with a longer sequence
|
||||
{
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(term_2.field()).unwrap()
|
||||
.inverted_index(term_2.field())
|
||||
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
|
||||
@@ -372,14 +378,14 @@ mod tests {
|
||||
// finally, check that it's empty
|
||||
{
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(term_2.field()).unwrap()
|
||||
.inverted_index(term_2.field())
|
||||
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(segment_postings.skip_next(0), SkipResult::End);
|
||||
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(term_2.field()).unwrap()
|
||||
.inverted_index(term_2.field())
|
||||
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
|
||||
@@ -446,12 +452,12 @@ mod tests {
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
|
||||
b.iter(|| {
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(TERM_A.field()).unwrap()
|
||||
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
while segment_postings.advance() {}
|
||||
});
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(TERM_A.field())
|
||||
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
while segment_postings.advance() {}
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
@@ -460,25 +466,27 @@ mod tests {
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
b.iter(|| {
|
||||
let segment_postings_a = segment_reader
|
||||
.inverted_index(TERM_A.field()).unwrap()
|
||||
.inverted_index(TERM_A.field())
|
||||
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
let segment_postings_b = segment_reader
|
||||
.inverted_index(TERM_B.field()).unwrap()
|
||||
.inverted_index(TERM_B.field())
|
||||
.read_postings(&*TERM_B, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
let segment_postings_c = segment_reader
|
||||
.inverted_index(TERM_C.field()).unwrap()
|
||||
.inverted_index(TERM_C.field())
|
||||
.read_postings(&*TERM_C, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
let segment_postings_d = segment_reader
|
||||
.inverted_index(TERM_D.field()).unwrap()
|
||||
.inverted_index(TERM_D.field())
|
||||
.read_postings(&*TERM_D, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
let mut intersection = IntersectionDocSet::from(vec![segment_postings_a,
|
||||
segment_postings_b,
|
||||
segment_postings_c,
|
||||
segment_postings_d]);
|
||||
let mut intersection = IntersectionDocSet::from(vec![
|
||||
segment_postings_a,
|
||||
segment_postings_b,
|
||||
segment_postings_c,
|
||||
segment_postings_d,
|
||||
]);
|
||||
while intersection.advance() {}
|
||||
});
|
||||
}
|
||||
@@ -489,7 +497,7 @@ mod tests {
|
||||
let docs = tests::sample(segment_reader.num_docs(), p);
|
||||
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(TERM_A.field()).unwrap()
|
||||
.inverted_index(TERM_A.field())
|
||||
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
|
||||
@@ -506,7 +514,7 @@ mod tests {
|
||||
|
||||
b.iter(|| {
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(TERM_A.field()).unwrap()
|
||||
.inverted_index(TERM_A.field())
|
||||
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
for doc in &existing_docs {
|
||||
@@ -544,7 +552,7 @@ mod tests {
|
||||
b.iter(|| {
|
||||
let n: u32 = test::black_box(17);
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(TERM_A.field()).unwrap()
|
||||
.inverted_index(TERM_A.field())
|
||||
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
let mut s = 0u32;
|
||||
|
||||
@@ -16,9 +16,10 @@ use schema::FieldEntry;
|
||||
use schema::FieldType;
|
||||
use schema::TextIndexingOptions;
|
||||
|
||||
fn posting_from_field_entry<'a>(field_entry: &FieldEntry,
|
||||
heap: &'a Heap)
|
||||
-> Box<PostingsWriter + 'a> {
|
||||
fn posting_from_field_entry<'a>(
|
||||
field_entry: &FieldEntry,
|
||||
heap: &'a Heap,
|
||||
) -> Box<PostingsWriter + 'a> {
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => {
|
||||
match text_options.get_indexing_options() {
|
||||
@@ -51,9 +52,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
|
||||
let per_field_postings_writers: Vec<_> = schema
|
||||
.fields()
|
||||
.iter()
|
||||
.map(|field_entry| {
|
||||
posting_from_field_entry(field_entry, heap)
|
||||
})
|
||||
.map(|field_entry| posting_from_field_entry(field_entry, heap))
|
||||
.collect();
|
||||
|
||||
MultiFieldPostingsWriter {
|
||||
@@ -102,7 +101,11 @@ impl<'a> MultiFieldPostingsWriter<'a> {
|
||||
let (_, stop) = offsets[i + 1];
|
||||
let postings_writer = &self.per_field_postings_writers[field.0 as usize];
|
||||
let mut field_serializer = serializer.new_field(field)?;
|
||||
postings_writer.serialize(&term_offsets[start..stop], &mut field_serializer, self.heap)?;
|
||||
postings_writer.serialize(
|
||||
&term_offsets[start..stop],
|
||||
&mut field_serializer,
|
||||
self.heap,
|
||||
)?;
|
||||
field_serializer.close()?;
|
||||
}
|
||||
Ok(())
|
||||
@@ -127,29 +130,33 @@ pub trait PostingsWriter {
|
||||
/// * term - the term
|
||||
/// * heap - heap used to store the postings informations as well as the terms
|
||||
/// in the hashmap.
|
||||
fn suscribe(&mut self,
|
||||
term_index: &mut HashMap,
|
||||
doc: DocId,
|
||||
pos: u32,
|
||||
term: &Term,
|
||||
heap: &Heap);
|
||||
fn suscribe(
|
||||
&mut self,
|
||||
term_index: &mut HashMap,
|
||||
doc: DocId,
|
||||
pos: u32,
|
||||
term: &Term,
|
||||
heap: &Heap,
|
||||
);
|
||||
|
||||
/// Serializes the postings on disk.
|
||||
/// The actual serialization format is handled by the `PostingsSerializer`.
|
||||
fn serialize(&self,
|
||||
term_addrs: &[(&[u8], u32)],
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap)
|
||||
-> io::Result<()>;
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(&[u8], u32)],
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()>;
|
||||
|
||||
/// Tokenize a text and suscribe all of its token.
|
||||
fn index_text<'a>(&mut self,
|
||||
term_index: &mut HashMap,
|
||||
doc_id: DocId,
|
||||
field: Field,
|
||||
field_values: &[&'a FieldValue],
|
||||
heap: &Heap)
|
||||
-> u32 {
|
||||
fn index_text<'a>(
|
||||
&mut self,
|
||||
term_index: &mut HashMap,
|
||||
doc_id: DocId,
|
||||
field: Field,
|
||||
field_values: &[&'a FieldValue],
|
||||
heap: &Heap,
|
||||
) -> u32 {
|
||||
let mut pos = 0u32;
|
||||
let mut num_tokens: u32 = 0u32;
|
||||
let mut term = unsafe { Term::with_capacity(100) };
|
||||
@@ -195,12 +202,14 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
|
||||
}
|
||||
|
||||
impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {
|
||||
fn suscribe(&mut self,
|
||||
term_index: &mut HashMap,
|
||||
doc: DocId,
|
||||
position: u32,
|
||||
term: &Term,
|
||||
heap: &Heap) {
|
||||
fn suscribe(
|
||||
&mut self,
|
||||
term_index: &mut HashMap,
|
||||
doc: DocId,
|
||||
position: u32,
|
||||
term: &Term,
|
||||
heap: &Heap,
|
||||
) {
|
||||
debug_assert!(term.as_slice().len() >= 4);
|
||||
let recorder: &mut Rec = term_index.get_or_create(term);
|
||||
let current_doc = recorder.current_doc();
|
||||
@@ -213,11 +222,12 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
|
||||
recorder.record_position(position, heap);
|
||||
}
|
||||
|
||||
fn serialize(&self,
|
||||
term_addrs: &[(&[u8], u32)],
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap)
|
||||
-> io::Result<()> {
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(&[u8], u32)],
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()> {
|
||||
for &(term_bytes, addr) in term_addrs {
|
||||
let recorder: &mut Rec = self.heap.get_mut_ref(addr);
|
||||
serializer.new_term(term_bytes)?;
|
||||
@@ -227,4 +237,3 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -27,11 +27,12 @@ pub trait Recorder: HeapAllocable {
|
||||
/// Close the document. It will help record the term frequency.
|
||||
fn close_doc(&mut self, heap: &Heap);
|
||||
/// Pushes the postings information to the serializer.
|
||||
fn serialize(&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap)
|
||||
-> io::Result<()>;
|
||||
fn serialize(
|
||||
&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()>;
|
||||
}
|
||||
|
||||
/// Only records the doc ids
|
||||
@@ -64,11 +65,12 @@ impl Recorder for NothingRecorder {
|
||||
|
||||
fn close_doc(&mut self, _heap: &Heap) {}
|
||||
|
||||
fn serialize(&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap)
|
||||
-> io::Result<()> {
|
||||
fn serialize(
|
||||
&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()> {
|
||||
for doc in self.stack.iter(self_addr, heap) {
|
||||
serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)?;
|
||||
}
|
||||
@@ -116,21 +118,23 @@ impl Recorder for TermFrequencyRecorder {
|
||||
}
|
||||
|
||||
|
||||
fn serialize(&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap)
|
||||
-> io::Result<()> {
|
||||
fn serialize(
|
||||
&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()> {
|
||||
// the last document has not been closed...
|
||||
// its term freq is self.current_tf.
|
||||
let mut doc_iter = self.stack
|
||||
.iter(self_addr, heap)
|
||||
.chain(Some(self.current_tf).into_iter());
|
||||
let mut doc_iter = self.stack.iter(self_addr, heap).chain(
|
||||
Some(self.current_tf)
|
||||
.into_iter(),
|
||||
);
|
||||
|
||||
while let Some(doc) = doc_iter.next() {
|
||||
let term_freq = doc_iter
|
||||
.next()
|
||||
.expect("The IndexWriter recorded a doc without a term freq.");
|
||||
let term_freq = doc_iter.next().expect(
|
||||
"The IndexWriter recorded a doc without a term freq.",
|
||||
);
|
||||
serializer.write_doc(doc, term_freq, &EMPTY_ARRAY)?;
|
||||
}
|
||||
Ok(())
|
||||
@@ -171,11 +175,12 @@ impl Recorder for TFAndPositionRecorder {
|
||||
self.stack.push(POSITION_END, heap);
|
||||
}
|
||||
|
||||
fn serialize(&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap)
|
||||
-> io::Result<()> {
|
||||
fn serialize(
|
||||
&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()> {
|
||||
let mut doc_positions = Vec::with_capacity(100);
|
||||
let mut positions_iter = self.stack.iter(self_addr, heap);
|
||||
while let Some(doc) = positions_iter.next() {
|
||||
@@ -189,7 +194,11 @@ impl Recorder for TFAndPositionRecorder {
|
||||
prev_position = position;
|
||||
}
|
||||
}
|
||||
serializer.write_doc(doc, doc_positions.len() as u32, &doc_positions)?;
|
||||
serializer.write_doc(
|
||||
doc,
|
||||
doc_positions.len() as u32,
|
||||
&doc_positions,
|
||||
)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -25,11 +25,10 @@ struct PositionComputer {
|
||||
}
|
||||
|
||||
impl PositionComputer {
|
||||
|
||||
pub fn new(positions_stream: CompressedIntStream) -> PositionComputer {
|
||||
PositionComputer {
|
||||
position_to_skip: None,
|
||||
positions: vec!(),
|
||||
positions: vec![],
|
||||
positions_stream: positions_stream,
|
||||
}
|
||||
}
|
||||
@@ -38,9 +37,9 @@ impl PositionComputer {
|
||||
self.position_to_skip = Some(
|
||||
self.position_to_skip
|
||||
.map(|prev_skip| prev_skip + num_skip)
|
||||
.unwrap_or(0)
|
||||
);
|
||||
}
|
||||
.unwrap_or(0),
|
||||
);
|
||||
}
|
||||
|
||||
pub fn positions(&mut self, term_freq: usize) -> &[u32] {
|
||||
if let Some(num_skip) = self.position_to_skip {
|
||||
@@ -83,13 +82,13 @@ impl SegmentPostings {
|
||||
/// * `data` - data array. The complete data is not necessarily used.
|
||||
/// * `freq_handler` - the freq handler is in charge of decoding
|
||||
/// frequencies and/or positions
|
||||
pub fn from_block_postings(segment_block_postings: BlockSegmentPostings,
|
||||
delete_bitset: DeleteBitSet,
|
||||
positions_stream_opt: Option<CompressedIntStream>)
|
||||
-> SegmentPostings {
|
||||
let position_computer = positions_stream_opt.map(|stream| {
|
||||
UnsafeCell::new(PositionComputer::new(stream))
|
||||
});
|
||||
pub fn from_block_postings(
|
||||
segment_block_postings: BlockSegmentPostings,
|
||||
delete_bitset: DeleteBitSet,
|
||||
positions_stream_opt: Option<CompressedIntStream>,
|
||||
) -> SegmentPostings {
|
||||
let position_computer =
|
||||
positions_stream_opt.map(|stream| UnsafeCell::new(PositionComputer::new(stream)));
|
||||
SegmentPostings {
|
||||
block_cursor: segment_block_postings,
|
||||
cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
|
||||
@@ -110,7 +109,7 @@ impl SegmentPostings {
|
||||
}
|
||||
|
||||
|
||||
fn position_add_skip<F: FnOnce()->usize>(&self, num_skips_fn: F) {
|
||||
fn position_add_skip<F: FnOnce() -> usize>(&self, num_skips_fn: F) {
|
||||
if let Some(ref position_computer) = self.position_computer.as_ref() {
|
||||
let num_skips = num_skips_fn();
|
||||
unsafe {
|
||||
@@ -135,7 +134,7 @@ impl DocSet for SegmentPostings {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
self.position_add_skip(|| { self.term_freq() as usize });
|
||||
self.position_add_skip(|| self.term_freq() as usize);
|
||||
if !self.delete_bitset.is_deleted(self.doc()) {
|
||||
return true;
|
||||
}
|
||||
@@ -257,8 +256,10 @@ impl DocSet for SegmentPostings {
|
||||
#[inline]
|
||||
fn doc(&self) -> DocId {
|
||||
let docs = self.block_cursor.docs();
|
||||
debug_assert!(self.cur < docs.len(),
|
||||
"Have you forgotten to call `.advance()` at least once before calling .doc().");
|
||||
debug_assert!(
|
||||
self.cur < docs.len(),
|
||||
"Have you forgotten to call `.advance()` at least once before calling .doc()."
|
||||
);
|
||||
docs[self.cur]
|
||||
}
|
||||
}
|
||||
@@ -278,16 +279,11 @@ impl Postings for SegmentPostings {
|
||||
let term_freq = self.term_freq();
|
||||
self.position_computer
|
||||
.as_ref()
|
||||
.map(|position_computer| {
|
||||
unsafe {
|
||||
(&mut *position_computer.get()).positions(term_freq as usize)
|
||||
}
|
||||
.map(|position_computer| unsafe {
|
||||
(&mut *position_computer.get()).positions(term_freq as usize)
|
||||
})
|
||||
.unwrap_or(&EMPTY_POSITIONS[..])
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -311,10 +307,11 @@ pub struct BlockSegmentPostings {
|
||||
}
|
||||
|
||||
impl BlockSegmentPostings {
|
||||
pub(crate) fn from_data(doc_freq: usize,
|
||||
data: SourceRead,
|
||||
has_freq: bool)
|
||||
-> BlockSegmentPostings {
|
||||
pub(crate) fn from_data(
|
||||
doc_freq: usize,
|
||||
data: SourceRead,
|
||||
has_freq: bool,
|
||||
) -> BlockSegmentPostings {
|
||||
let num_binpacked_blocks: usize = (doc_freq as usize) / COMPRESSION_BLOCK_SIZE;
|
||||
let num_vint_docs = (doc_freq as usize) - COMPRESSION_BLOCK_SIZE * num_binpacked_blocks;
|
||||
BlockSegmentPostings {
|
||||
@@ -402,15 +399,16 @@ impl BlockSegmentPostings {
|
||||
/// Returns false iff there was no remaining blocks.
|
||||
pub fn advance(&mut self) -> bool {
|
||||
if self.num_binpacked_blocks > 0 {
|
||||
// TODO could self.doc_offset be just a local variable?
|
||||
|
||||
let num_consumed_bytes = self
|
||||
.doc_decoder
|
||||
.uncompress_block_sorted(self.remaining_data.as_ref(), self.doc_offset);
|
||||
let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
|
||||
if self.has_freq {
|
||||
let num_consumed_bytes = self.freq_decoder.uncompress_block_unsorted(self.remaining_data.as_ref());
|
||||
let num_consumed_bytes = self.freq_decoder.uncompress_block_unsorted(
|
||||
self.remaining_data.as_ref(),
|
||||
);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
}
|
||||
// it will be used as the next offset.
|
||||
@@ -418,15 +416,17 @@ impl BlockSegmentPostings {
|
||||
self.num_binpacked_blocks -= 1;
|
||||
true
|
||||
} else if self.num_vint_docs > 0 {
|
||||
let num_compressed_bytes =
|
||||
self.doc_decoder
|
||||
.uncompress_vint_sorted(self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
self.num_vint_docs);
|
||||
let num_compressed_bytes = self.doc_decoder.uncompress_vint_sorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
self.num_vint_docs,
|
||||
);
|
||||
self.remaining_data.advance(num_compressed_bytes);
|
||||
if self.has_freq {
|
||||
self.freq_decoder
|
||||
.uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
|
||||
self.freq_decoder.uncompress_vint_unsorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.num_vint_docs,
|
||||
);
|
||||
}
|
||||
self.num_vint_docs = 0;
|
||||
true
|
||||
@@ -508,12 +508,13 @@ mod tests {
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let inverted_index = segment_reader.inverted_index(int_field).unwrap();
|
||||
let inverted_index = segment_reader.inverted_index(int_field);
|
||||
let term = Term::from_field_u64(int_field, 0u64);
|
||||
let term_info = inverted_index.get_term_info(&term).unwrap();
|
||||
let mut block_segments =
|
||||
inverted_index
|
||||
.read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq);
|
||||
let mut block_segments = inverted_index.read_block_postings_from_terminfo(
|
||||
&term_info,
|
||||
SegmentPostingsOption::NoFreq,
|
||||
);
|
||||
let mut offset: u32 = 0u32;
|
||||
// checking that the block before calling advance is empty
|
||||
assert!(block_segments.docs().is_empty());
|
||||
@@ -549,17 +550,18 @@ mod tests {
|
||||
let mut block_segments;
|
||||
{
|
||||
let term = Term::from_field_u64(int_field, 0u64);
|
||||
let inverted_index = segment_reader.inverted_index(int_field).unwrap();
|
||||
let inverted_index = segment_reader.inverted_index(int_field);
|
||||
let term_info = inverted_index.get_term_info(&term).unwrap();
|
||||
block_segments =
|
||||
inverted_index
|
||||
.read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq);
|
||||
block_segments = inverted_index.read_block_postings_from_terminfo(
|
||||
&term_info,
|
||||
SegmentPostingsOption::NoFreq,
|
||||
);
|
||||
}
|
||||
assert!(block_segments.advance());
|
||||
assert!(block_segments.docs() == &[0, 2, 4]);
|
||||
{
|
||||
let term = Term::from_field_u64(int_field, 1u64);
|
||||
let inverted_index = segment_reader.inverted_index(int_field).unwrap();
|
||||
let inverted_index = segment_reader.inverted_index(int_field);
|
||||
let term_info = inverted_index.get_term_info(&term).unwrap();
|
||||
inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments);
|
||||
}
|
||||
|
||||
@@ -17,7 +17,6 @@ pub enum SegmentPostingsOption {
|
||||
}
|
||||
|
||||
impl SegmentPostingsOption {
|
||||
|
||||
/// Returns true iff this option includes encoding
|
||||
/// term frequencies.
|
||||
pub fn has_freq(&self) -> bool {
|
||||
|
||||
@@ -57,11 +57,12 @@ pub struct InvertedIndexSerializer {
|
||||
|
||||
impl InvertedIndexSerializer {
|
||||
/// Open a new `PostingsSerializer` for the given segment
|
||||
fn new(terms_write: CompositeWrite<WritePtr>,
|
||||
postings_write: CompositeWrite<WritePtr>,
|
||||
positions_write: CompositeWrite<WritePtr>,
|
||||
schema: Schema)
|
||||
-> Result<InvertedIndexSerializer> {
|
||||
fn new(
|
||||
terms_write: CompositeWrite<WritePtr>,
|
||||
postings_write: CompositeWrite<WritePtr>,
|
||||
positions_write: CompositeWrite<WritePtr>,
|
||||
schema: Schema,
|
||||
) -> Result<InvertedIndexSerializer> {
|
||||
Ok(InvertedIndexSerializer {
|
||||
terms_write: terms_write,
|
||||
postings_write: postings_write,
|
||||
@@ -78,7 +79,8 @@ impl InvertedIndexSerializer {
|
||||
CompositeWrite::wrap(segment.open_write(TERMS)?),
|
||||
CompositeWrite::wrap(segment.open_write(POSTINGS)?),
|
||||
CompositeWrite::wrap(segment.open_write(POSITIONS)?),
|
||||
segment.schema())
|
||||
segment.schema(),
|
||||
)
|
||||
}
|
||||
|
||||
/// Must be called before starting pushing terms of
|
||||
@@ -94,7 +96,7 @@ impl InvertedIndexSerializer {
|
||||
field_entry.field_type().clone(),
|
||||
term_dictionary_write,
|
||||
postings_write,
|
||||
positions_write
|
||||
positions_write,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -120,7 +122,6 @@ pub struct FieldSerializer<'a> {
|
||||
|
||||
|
||||
impl<'a> FieldSerializer<'a> {
|
||||
|
||||
fn new(
|
||||
field_type: FieldType,
|
||||
term_dictionary_write: &'a mut CountingWriter<WritePtr>,
|
||||
@@ -128,25 +129,24 @@ impl<'a> FieldSerializer<'a> {
|
||||
positions_write: &'a mut CountingWriter<WritePtr>,
|
||||
) -> io::Result<FieldSerializer<'a>> {
|
||||
|
||||
let (term_freq_enabled, position_enabled): (bool, bool) =
|
||||
match field_type {
|
||||
FieldType::Str(ref text_options) => {
|
||||
let text_indexing_options = text_options.get_indexing_options();
|
||||
(text_indexing_options.is_termfreq_enabled(), text_indexing_options.is_position_enabled())
|
||||
},
|
||||
_ => {
|
||||
(false, false)
|
||||
}
|
||||
};
|
||||
let term_dictionary_builder = TermDictionaryBuilderImpl::new(term_dictionary_write, field_type)?;
|
||||
let postings_serializer = PostingsSerializer::new(postings_write, term_freq_enabled);
|
||||
let positions_serializer_opt =
|
||||
if position_enabled {
|
||||
Some(PositionSerializer::new(positions_write))
|
||||
let (term_freq_enabled, position_enabled): (bool, bool) = match field_type {
|
||||
FieldType::Str(ref text_options) => {
|
||||
let text_indexing_options = text_options.get_indexing_options();
|
||||
(
|
||||
text_indexing_options.is_termfreq_enabled(),
|
||||
text_indexing_options.is_position_enabled(),
|
||||
)
|
||||
}
|
||||
else {
|
||||
None
|
||||
};
|
||||
_ => (false, false),
|
||||
};
|
||||
let term_dictionary_builder =
|
||||
TermDictionaryBuilderImpl::new(term_dictionary_write, field_type)?;
|
||||
let postings_serializer = PostingsSerializer::new(postings_write, term_freq_enabled);
|
||||
let positions_serializer_opt = if position_enabled {
|
||||
Some(PositionSerializer::new(positions_write))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(FieldSerializer {
|
||||
term_dictionary_builder: term_dictionary_builder,
|
||||
@@ -159,9 +159,9 @@ impl<'a> FieldSerializer<'a> {
|
||||
|
||||
fn current_term_info(&self) -> TermInfo {
|
||||
let (filepos, offset) = self.positions_serializer_opt
|
||||
.as_ref()
|
||||
.map(|positions_serializer| positions_serializer.addr())
|
||||
.unwrap_or((0u32, 0u8));
|
||||
.as_ref()
|
||||
.map(|positions_serializer| positions_serializer.addr())
|
||||
.unwrap_or((0u32, 0u8));
|
||||
TermInfo {
|
||||
doc_freq: 0,
|
||||
postings_offset: self.postings_serializer.addr(),
|
||||
@@ -194,11 +194,12 @@ impl<'a> FieldSerializer<'a> {
|
||||
///
|
||||
/// Term frequencies and positions may be ignored by the serializer depending
|
||||
/// on the configuration of the field in the `Schema`.
|
||||
pub fn write_doc(&mut self,
|
||||
doc_id: DocId,
|
||||
term_freq: u32,
|
||||
position_deltas: &[u32])
|
||||
-> io::Result<()> {
|
||||
pub fn write_doc(
|
||||
&mut self,
|
||||
doc_id: DocId,
|
||||
term_freq: u32,
|
||||
position_deltas: &[u32],
|
||||
) -> io::Result<()> {
|
||||
self.current_term_info.doc_freq += 1;
|
||||
self.postings_serializer.write_doc(doc_id, term_freq)?;
|
||||
if let Some(ref mut positions_serializer) = self.positions_serializer_opt.as_mut() {
|
||||
@@ -213,7 +214,9 @@ impl<'a> FieldSerializer<'a> {
|
||||
/// using `VInt` encoding.
|
||||
pub fn close_term(&mut self) -> io::Result<()> {
|
||||
if self.term_open {
|
||||
self.term_dictionary_builder.insert_value(&self.current_term_info)?;
|
||||
self.term_dictionary_builder.insert_value(
|
||||
&self.current_term_info,
|
||||
)?;
|
||||
self.postings_serializer.close_term()?;
|
||||
self.term_open = false;
|
||||
}
|
||||
@@ -251,8 +254,8 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
postings_write: CountingWriter::wrap(write),
|
||||
|
||||
block_encoder: BlockEncoder::new(),
|
||||
doc_ids: vec!(),
|
||||
term_freqs: vec!(),
|
||||
doc_ids: vec![],
|
||||
term_freqs: vec![],
|
||||
|
||||
last_doc_id_encoded: 0u32,
|
||||
termfreq_enabled: termfreq_enabled,
|
||||
@@ -267,16 +270,17 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
if self.doc_ids.len() == COMPRESSION_BLOCK_SIZE {
|
||||
{
|
||||
// encode the doc ids
|
||||
let block_encoded: &[u8] =
|
||||
self.block_encoder
|
||||
.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
let block_encoded: &[u8] = self.block_encoder.compress_block_sorted(
|
||||
&self.doc_ids,
|
||||
self.last_doc_id_encoded,
|
||||
);
|
||||
self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1];
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
}
|
||||
if self.termfreq_enabled {
|
||||
// encode the term_freqs
|
||||
let block_encoded: &[u8] = self.block_encoder
|
||||
.compress_block_unsorted(&self.term_freqs);
|
||||
let block_encoded: &[u8] =
|
||||
self.block_encoder.compress_block_unsorted(&self.term_freqs);
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
self.term_freqs.clear();
|
||||
}
|
||||
@@ -294,16 +298,18 @@ impl<W: Write> PostingsSerializer<W> {
|
||||
// In that case, the remaining part is encoded
|
||||
// using variable int encoding.
|
||||
{
|
||||
let block_encoded =
|
||||
self.block_encoder
|
||||
.compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
let block_encoded = self.block_encoder.compress_vint_sorted(
|
||||
&self.doc_ids,
|
||||
self.last_doc_id_encoded,
|
||||
);
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
self.doc_ids.clear();
|
||||
}
|
||||
// ... Idem for term frequencies
|
||||
if self.termfreq_enabled {
|
||||
let block_encoded = self.block_encoder
|
||||
.compress_vint_unsorted(&self.term_freqs[..]);
|
||||
let block_encoded = self.block_encoder.compress_vint_unsorted(
|
||||
&self.term_freqs[..],
|
||||
);
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
self.term_freqs.clear();
|
||||
}
|
||||
@@ -373,4 +379,3 @@ impl<W: Write> PositionSerializer<W> {
|
||||
self.write.flush()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ use std::io;
|
||||
/// * `postings_offset` : an offset in the `.idx` file
|
||||
/// addressing the start of the posting list associated
|
||||
/// to this term.
|
||||
#[derive(Debug,Default,Ord,PartialOrd,Eq,PartialEq,Clone)]
|
||||
#[derive(Debug, Default, Ord, PartialOrd, Eq, PartialEq, Clone)]
|
||||
pub struct TermInfo {
|
||||
/// Number of documents in the segment containing the term
|
||||
pub doc_freq: u32,
|
||||
|
||||
@@ -37,10 +37,12 @@ impl Query for BooleanQuery {
|
||||
}
|
||||
|
||||
fn weight(&self, searcher: &Searcher) -> Result<Box<Weight>> {
|
||||
let sub_weights = try!(self.subqueries
|
||||
.iter()
|
||||
.map(|&(ref _occur, ref subquery)| subquery.weight(searcher))
|
||||
.collect());
|
||||
let sub_weights = try!(
|
||||
self.subqueries
|
||||
.iter()
|
||||
.map(|&(ref _occur, ref subquery)| subquery.weight(searcher))
|
||||
.collect()
|
||||
);
|
||||
let occurs: Vec<Occur> = self.subqueries
|
||||
.iter()
|
||||
.map(|&(ref occur, ref _subquery)| *occur)
|
||||
@@ -57,10 +59,9 @@ impl BooleanQuery {
|
||||
let occur_term_queries: Vec<(Occur, Box<Query>)> = terms
|
||||
.into_iter()
|
||||
.map(|term| {
|
||||
let term_query: Box<Query> = box TermQuery::new(term,
|
||||
SegmentPostingsOption::Freq);
|
||||
(Occur::Should, term_query)
|
||||
})
|
||||
let term_query: Box<Query> = box TermQuery::new(term, SegmentPostingsOption::Freq);
|
||||
(Occur::Should, term_query)
|
||||
})
|
||||
.collect();
|
||||
BooleanQuery::from(occur_term_queries)
|
||||
}
|
||||
|
||||
@@ -55,11 +55,11 @@ impl<TScorer: Scorer> BooleanScorer<TScorer> {
|
||||
.map(|posting| posting.doc())
|
||||
.enumerate()
|
||||
.map(|(ord, doc)| {
|
||||
HeapItem {
|
||||
doc: doc,
|
||||
ord: ord as u32,
|
||||
}
|
||||
})
|
||||
HeapItem {
|
||||
doc: doc,
|
||||
ord: ord as u32,
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
BooleanScorer {
|
||||
scorers: non_empty_scorers,
|
||||
|
||||
@@ -22,11 +22,12 @@ impl BooleanWeight {
|
||||
|
||||
impl Weight for BooleanWeight {
|
||||
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
|
||||
let sub_scorers: Vec<Box<Scorer + 'a>> =
|
||||
try!(self.weights
|
||||
.iter()
|
||||
.map(|weight| weight.scorer(reader))
|
||||
.collect());
|
||||
let sub_scorers: Vec<Box<Scorer + 'a>> = try!(
|
||||
self.weights
|
||||
.iter()
|
||||
.map(|weight| weight.scorer(reader))
|
||||
.collect()
|
||||
);
|
||||
let boolean_scorer = BooleanScorer::new(sub_scorers, self.occur_filter);
|
||||
Ok(box boolean_scorer)
|
||||
}
|
||||
|
||||
@@ -64,8 +64,10 @@ mod tests {
|
||||
}
|
||||
|
||||
let make_term_query = |text: &str| {
|
||||
let term_query = TermQuery::new(Term::from_field_text(text_field, text),
|
||||
SegmentPostingsOption::NoFreq);
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, text),
|
||||
SegmentPostingsOption::NoFreq,
|
||||
);
|
||||
let query: Box<Query> = box term_query;
|
||||
query
|
||||
};
|
||||
@@ -87,19 +89,25 @@ mod tests {
|
||||
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]);
|
||||
}
|
||||
{
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::Should, make_term_query("a")),
|
||||
(Occur::Should, make_term_query("b"))]);
|
||||
let boolean_query = BooleanQuery::from(vec![
|
||||
(Occur::Should, make_term_query("a")),
|
||||
(Occur::Should, make_term_query("b")),
|
||||
]);
|
||||
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 2, 3]);
|
||||
}
|
||||
{
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")),
|
||||
(Occur::Should, make_term_query("b"))]);
|
||||
let boolean_query = BooleanQuery::from(vec![
|
||||
(Occur::Must, make_term_query("a")),
|
||||
(Occur::Should, make_term_query("b")),
|
||||
]);
|
||||
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]);
|
||||
}
|
||||
{
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")),
|
||||
(Occur::Should, make_term_query("b")),
|
||||
(Occur::MustNot, make_term_query("d"))]);
|
||||
let boolean_query = BooleanQuery::from(vec![
|
||||
(Occur::Must, make_term_query("a")),
|
||||
(Occur::Should, make_term_query("b")),
|
||||
(Occur::MustNot, make_term_query("d")),
|
||||
]);
|
||||
assert_eq!(matching_docs(&boolean_query), vec![0, 1]);
|
||||
}
|
||||
{
|
||||
|
||||
@@ -61,9 +61,9 @@ mod tests {
|
||||
.map(|text| Term::from_field_text(text_field, text))
|
||||
.collect();
|
||||
let phrase_query = PhraseQuery::from(terms);
|
||||
searcher
|
||||
.search(&phrase_query, &mut test_collector)
|
||||
.expect("search should succeed");
|
||||
searcher.search(&phrase_query, &mut test_collector).expect(
|
||||
"search should succeed",
|
||||
);
|
||||
test_collector.docs()
|
||||
};
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ impl Weight for PhraseWeight {
|
||||
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
|
||||
let mut term_postings_list = Vec::new();
|
||||
for term in &self.phrase_terms {
|
||||
let inverted_index = reader.inverted_index(term.field())?;
|
||||
let inverted_index = reader.inverted_index(term.field());
|
||||
let term_postings_option =
|
||||
inverted_index.read_postings(term, SegmentPostingsOption::FreqAndPositions);
|
||||
if let Some(term_postings) = term_postings_option {
|
||||
@@ -31,6 +31,8 @@ impl Weight for PhraseWeight {
|
||||
return Ok(box EmptyScorer);
|
||||
}
|
||||
}
|
||||
Ok(box PhraseScorer { intersection_docset: IntersectionDocSet::from(term_postings_list) })
|
||||
Ok(box PhraseScorer {
|
||||
intersection_docset: IntersectionDocSet::from(term_postings_list),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -66,7 +66,10 @@ pub trait Query: fmt::Debug {
|
||||
let mut segment_search_timer = search_timer.open("segment_search");
|
||||
{
|
||||
let _ = segment_search_timer.open("set_segment");
|
||||
try!(collector.set_segment(segment_ord as SegmentLocalId, segment_reader));
|
||||
try!(collector.set_segment(
|
||||
segment_ord as SegmentLocalId,
|
||||
segment_reader,
|
||||
));
|
||||
}
|
||||
let mut scorer = try!(weight.scorer(segment_reader));
|
||||
{
|
||||
|
||||
@@ -3,7 +3,8 @@ use combine::char::*;
|
||||
use super::user_input_ast::*;
|
||||
|
||||
fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
|
||||
where I: Stream<Item = char>
|
||||
where
|
||||
I: Stream<Item = char>,
|
||||
{
|
||||
let term_val = || {
|
||||
let word = many1(satisfy(|c: char| c.is_alphanumeric()));
|
||||
@@ -11,27 +12,29 @@ fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
|
||||
phrase.or(word)
|
||||
};
|
||||
|
||||
let negative_numbers = (char('-'), many1(satisfy(|c: char| c.is_numeric())))
|
||||
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
|
||||
let negative_numbers = (char('-'), many1(satisfy(|c: char| c.is_numeric()))).map(
|
||||
|(s1, s2): (char, String)| format!("{}{}", s1, s2),
|
||||
);
|
||||
|
||||
let field = (letter(), many(satisfy(|c: char| c.is_alphanumeric() || c == '_')))
|
||||
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
|
||||
let field = (
|
||||
letter(),
|
||||
many(satisfy(|c: char| c.is_alphanumeric() || c == '_')),
|
||||
).map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
|
||||
|
||||
let term_val_with_field = negative_numbers.or(term_val());
|
||||
|
||||
let term_query = (field, char(':'), term_val_with_field).map(|(field_name, _, phrase)| {
|
||||
UserInputLiteral {
|
||||
field_name:
|
||||
Some(field_name),
|
||||
phrase: phrase,
|
||||
}
|
||||
});
|
||||
UserInputLiteral {
|
||||
field_name: Some(field_name),
|
||||
phrase: phrase,
|
||||
}
|
||||
});
|
||||
let term_default_field = term_val().map(|phrase| {
|
||||
UserInputLiteral {
|
||||
field_name: None,
|
||||
phrase: phrase,
|
||||
}
|
||||
});
|
||||
UserInputLiteral {
|
||||
field_name: None,
|
||||
phrase: phrase,
|
||||
}
|
||||
});
|
||||
try(term_query)
|
||||
.or(term_default_field)
|
||||
.map(UserInputAST::from)
|
||||
@@ -40,25 +43,29 @@ fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
|
||||
|
||||
|
||||
fn leaf<I>(input: I) -> ParseResult<UserInputAST, I>
|
||||
where I: Stream<Item = char>
|
||||
where
|
||||
I: Stream<Item = char>,
|
||||
{
|
||||
(char('-'), parser(literal))
|
||||
.map(|(_, expr)| UserInputAST::Not(box expr))
|
||||
.or((char('+'), parser(literal)).map(|(_, expr)| UserInputAST::Must(box expr)))
|
||||
.or((char('+'), parser(literal)).map(|(_, expr)| {
|
||||
UserInputAST::Must(box expr)
|
||||
}))
|
||||
.or(parser(literal))
|
||||
.parse_stream(input)
|
||||
}
|
||||
|
||||
|
||||
pub fn parse_to_ast<I>(input: I) -> ParseResult<UserInputAST, I>
|
||||
where I: Stream<Item = char>
|
||||
where
|
||||
I: Stream<Item = char>,
|
||||
{
|
||||
sep_by(parser(leaf), spaces())
|
||||
.map(|subqueries: Vec<UserInputAST>| if subqueries.len() == 1 {
|
||||
subqueries.into_iter().next().unwrap()
|
||||
} else {
|
||||
UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect())
|
||||
})
|
||||
subqueries.into_iter().next().unwrap()
|
||||
} else {
|
||||
UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect())
|
||||
})
|
||||
.parse_stream(input)
|
||||
}
|
||||
|
||||
|
||||
@@ -117,20 +117,22 @@ impl QueryParser {
|
||||
|
||||
/// Parse the user query into an AST.
|
||||
fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAST, QueryParserError> {
|
||||
let (user_input_ast, _remaining) = parse_to_ast(query)
|
||||
.map_err(|_| QueryParserError::SyntaxError)?;
|
||||
let (user_input_ast, _remaining) = parse_to_ast(query).map_err(
|
||||
|_| QueryParserError::SyntaxError,
|
||||
)?;
|
||||
self.compute_logical_ast(user_input_ast)
|
||||
}
|
||||
|
||||
fn resolve_field_name(&self, field_name: &str) -> Result<Field, QueryParserError> {
|
||||
self.schema
|
||||
.get_field(field_name)
|
||||
.ok_or_else(|| QueryParserError::FieldDoesNotExist(String::from(field_name)))
|
||||
self.schema.get_field(field_name).ok_or_else(|| {
|
||||
QueryParserError::FieldDoesNotExist(String::from(field_name))
|
||||
})
|
||||
}
|
||||
|
||||
fn compute_logical_ast(&self,
|
||||
user_input_ast: UserInputAST)
|
||||
-> Result<LogicalAST, QueryParserError> {
|
||||
fn compute_logical_ast(
|
||||
&self,
|
||||
user_input_ast: UserInputAST,
|
||||
) -> Result<LogicalAST, QueryParserError> {
|
||||
let (occur, ast) = self.compute_logical_ast_with_occur(user_input_ast)?;
|
||||
if occur == Occur::MustNot {
|
||||
return Err(QueryParserError::AllButQueryForbidden);
|
||||
@@ -138,10 +140,11 @@ impl QueryParser {
|
||||
Ok(ast)
|
||||
}
|
||||
|
||||
fn compute_logical_ast_for_leaf(&self,
|
||||
field: Field,
|
||||
phrase: &str)
|
||||
-> Result<Option<LogicalLiteral>, QueryParserError> {
|
||||
fn compute_logical_ast_for_leaf(
|
||||
&self,
|
||||
field: Field,
|
||||
phrase: &str,
|
||||
) -> Result<Option<LogicalLiteral>, QueryParserError> {
|
||||
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
@@ -174,7 +177,9 @@ impl QueryParser {
|
||||
if terms.is_empty() {
|
||||
Ok(None)
|
||||
} else if terms.len() == 1 {
|
||||
Ok(Some(LogicalLiteral::Term(terms.into_iter().next().unwrap())))
|
||||
Ok(Some(
|
||||
LogicalLiteral::Term(terms.into_iter().next().unwrap()),
|
||||
))
|
||||
} else {
|
||||
Ok(Some(LogicalLiteral::Phrase(terms)))
|
||||
}
|
||||
@@ -191,18 +196,24 @@ impl QueryParser {
|
||||
}
|
||||
}
|
||||
|
||||
fn compute_logical_ast_with_occur(&self,
|
||||
user_input_ast: UserInputAST)
|
||||
-> Result<(Occur, LogicalAST), QueryParserError> {
|
||||
fn compute_logical_ast_with_occur(
|
||||
&self,
|
||||
user_input_ast: UserInputAST,
|
||||
) -> Result<(Occur, LogicalAST), QueryParserError> {
|
||||
match user_input_ast {
|
||||
UserInputAST::Clause(sub_queries) => {
|
||||
let default_occur = self.default_occur();
|
||||
let logical_sub_queries: Vec<(Occur, LogicalAST)> = try!(sub_queries.into_iter()
|
||||
.map(|sub_query| self.compute_logical_ast_with_occur(*sub_query))
|
||||
.map(|res| {
|
||||
res.map(|(occur, sub_ast)| (compose_occur(default_occur, occur), sub_ast))
|
||||
})
|
||||
.collect());
|
||||
let logical_sub_queries: Vec<(Occur, LogicalAST)> = try!(
|
||||
sub_queries
|
||||
.into_iter()
|
||||
.map(|sub_query| self.compute_logical_ast_with_occur(*sub_query))
|
||||
.map(|res| {
|
||||
res.map(|(occur, sub_ast)| {
|
||||
(compose_occur(default_occur, occur), sub_ast)
|
||||
})
|
||||
})
|
||||
.collect()
|
||||
);
|
||||
Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries)))
|
||||
}
|
||||
UserInputAST::Not(subquery) => {
|
||||
@@ -320,9 +331,10 @@ mod test {
|
||||
}
|
||||
|
||||
|
||||
fn parse_query_to_logical_ast(query: &str,
|
||||
default_conjunction: bool)
|
||||
-> Result<LogicalAST, QueryParserError> {
|
||||
fn parse_query_to_logical_ast(
|
||||
query: &str,
|
||||
default_conjunction: bool,
|
||||
) -> Result<LogicalAST, QueryParserError> {
|
||||
let mut query_parser = make_query_parser();
|
||||
if default_conjunction {
|
||||
query_parser.set_conjunction_by_default();
|
||||
@@ -330,9 +342,11 @@ mod test {
|
||||
query_parser.parse_query_to_logical_ast(query)
|
||||
}
|
||||
|
||||
fn test_parse_query_to_logical_ast_helper(query: &str,
|
||||
expected: &str,
|
||||
default_conjunction: bool) {
|
||||
fn test_parse_query_to_logical_ast_helper(
|
||||
query: &str,
|
||||
expected: &str,
|
||||
default_conjunction: bool,
|
||||
) {
|
||||
let query = parse_query_to_logical_ast(query, default_conjunction).unwrap();
|
||||
let query_str = format!("{:?}", query);
|
||||
assert_eq!(query_str, expected);
|
||||
@@ -358,21 +372,29 @@ mod test {
|
||||
}
|
||||
};
|
||||
|
||||
assert_eq!(is_not_indexed_err("notindexed_text:titi"),
|
||||
Some(String::from("notindexed_text")));
|
||||
assert_eq!(is_not_indexed_err("notindexed_u64:23424"),
|
||||
Some(String::from("notindexed_u64")));
|
||||
assert_eq!(is_not_indexed_err("notindexed_i64:-234324"),
|
||||
Some(String::from("notindexed_i64")));
|
||||
assert_eq!(
|
||||
is_not_indexed_err("notindexed_text:titi"),
|
||||
Some(String::from("notindexed_text"))
|
||||
);
|
||||
assert_eq!(
|
||||
is_not_indexed_err("notindexed_u64:23424"),
|
||||
Some(String::from("notindexed_u64"))
|
||||
);
|
||||
assert_eq!(
|
||||
is_not_indexed_err("notindexed_i64:-234324"),
|
||||
Some(String::from("notindexed_i64"))
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
pub fn test_parse_query_untokenized() {
|
||||
test_parse_query_to_logical_ast_helper("nottokenized:\"wordone wordtwo\"",
|
||||
"Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, \
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"nottokenized:\"wordone wordtwo\"",
|
||||
"Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, \
|
||||
101, 32, 119, 111, 114, 100, 116, 119, 111])",
|
||||
false);
|
||||
false,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -381,82 +403,115 @@ mod test {
|
||||
assert!(query_parser.parse_query("signed:2324").is_ok());
|
||||
assert!(query_parser.parse_query("signed:\"22\"").is_ok());
|
||||
assert!(query_parser.parse_query("signed:\"-2234\"").is_ok());
|
||||
assert!(query_parser
|
||||
.parse_query("signed:\"-9999999999999\"")
|
||||
.is_ok());
|
||||
assert!(
|
||||
query_parser
|
||||
.parse_query("signed:\"-9999999999999\"")
|
||||
.is_ok()
|
||||
);
|
||||
assert!(query_parser.parse_query("signed:\"a\"").is_err());
|
||||
assert!(query_parser.parse_query("signed:\"2a\"").is_err());
|
||||
assert!(query_parser
|
||||
.parse_query("signed:\"18446744073709551615\"")
|
||||
.is_err());
|
||||
assert!(
|
||||
query_parser
|
||||
.parse_query("signed:\"18446744073709551615\"")
|
||||
.is_err()
|
||||
);
|
||||
assert!(query_parser.parse_query("unsigned:\"2\"").is_ok());
|
||||
assert!(query_parser.parse_query("unsigned:\"-2\"").is_err());
|
||||
assert!(query_parser
|
||||
.parse_query("unsigned:\"18446744073709551615\"")
|
||||
.is_ok());
|
||||
test_parse_query_to_logical_ast_helper("unsigned:2324",
|
||||
"Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",
|
||||
false);
|
||||
assert!(
|
||||
query_parser
|
||||
.parse_query("unsigned:\"18446744073709551615\"")
|
||||
.is_ok()
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"unsigned:2324",
|
||||
"Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",
|
||||
false,
|
||||
);
|
||||
|
||||
test_parse_query_to_logical_ast_helper("signed:-2324",
|
||||
&format!("{:?}",
|
||||
Term::from_field_i64(Field(2u32), -2324)),
|
||||
false);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"signed:-2324",
|
||||
&format!("{:?}", Term::from_field_i64(Field(2u32), -2324)),
|
||||
false,
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
pub fn test_parse_query_to_ast_disjunction() {
|
||||
test_parse_query_to_logical_ast_helper("title:toto",
|
||||
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
|
||||
false);
|
||||
test_parse_query_to_logical_ast_helper("+title:toto",
|
||||
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
|
||||
false);
|
||||
test_parse_query_to_logical_ast_helper("+title:toto -titi",
|
||||
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:toto",
|
||||
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"+title:toto",
|
||||
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"+title:toto -titi",
|
||||
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
|
||||
-(Term([0, 0, 0, 0, 116, 105, 116, 105]) \
|
||||
Term([0, 0, 0, 1, 116, 105, 116, 105])))",
|
||||
false);
|
||||
assert_eq!(parse_query_to_logical_ast("-title:toto", false)
|
||||
.err()
|
||||
.unwrap(),
|
||||
QueryParserError::AllButQueryForbidden);
|
||||
test_parse_query_to_logical_ast_helper("title:a b",
|
||||
"(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) \
|
||||
false,
|
||||
);
|
||||
assert_eq!(
|
||||
parse_query_to_logical_ast("-title:toto", false)
|
||||
.err()
|
||||
.unwrap(),
|
||||
QueryParserError::AllButQueryForbidden
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:a b",
|
||||
"(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) \
|
||||
Term([0, 0, 0, 1, 98])))",
|
||||
false);
|
||||
test_parse_query_to_logical_ast_helper("title:\"a b\"",
|
||||
"\"[Term([0, 0, 0, 0, 97]), \
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:\"a b\"",
|
||||
"\"[Term([0, 0, 0, 0, 97]), \
|
||||
Term([0, 0, 0, 0, 98])]\"",
|
||||
false);
|
||||
false,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_parse_query_to_ast_conjunction() {
|
||||
test_parse_query_to_logical_ast_helper("title:toto",
|
||||
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
|
||||
true);
|
||||
test_parse_query_to_logical_ast_helper("+title:toto",
|
||||
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
|
||||
true);
|
||||
test_parse_query_to_logical_ast_helper("+title:toto -titi",
|
||||
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:toto",
|
||||
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
|
||||
true,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"+title:toto",
|
||||
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
|
||||
true,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"+title:toto -titi",
|
||||
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
|
||||
-(Term([0, 0, 0, 0, 116, 105, 116, 105]) \
|
||||
Term([0, 0, 0, 1, 116, 105, 116, 105])))",
|
||||
true);
|
||||
assert_eq!(parse_query_to_logical_ast("-title:toto", true)
|
||||
.err()
|
||||
.unwrap(),
|
||||
QueryParserError::AllButQueryForbidden);
|
||||
test_parse_query_to_logical_ast_helper("title:a b",
|
||||
"(+Term([0, 0, 0, 0, 97]) \
|
||||
true,
|
||||
);
|
||||
assert_eq!(
|
||||
parse_query_to_logical_ast("-title:toto", true)
|
||||
.err()
|
||||
.unwrap(),
|
||||
QueryParserError::AllButQueryForbidden
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:a b",
|
||||
"(+Term([0, 0, 0, 0, 97]) \
|
||||
+(Term([0, 0, 0, 0, 98]) \
|
||||
Term([0, 0, 0, 1, 98])))",
|
||||
true);
|
||||
test_parse_query_to_logical_ast_helper("title:\"a b\"",
|
||||
"\"[Term([0, 0, 0, 0, 97]), \
|
||||
true,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:\"a b\"",
|
||||
"\"[Term([0, 0, 0, 0, 97]), \
|
||||
Term([0, 0, 0, 0, 98])]\"",
|
||||
true);
|
||||
true,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,8 +44,10 @@ mod tests {
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let term_query = TermQuery::new(Term::from_field_text(text_field, "a"),
|
||||
SegmentPostingsOption::NoFreq);
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, "a"),
|
||||
SegmentPostingsOption::NoFreq,
|
||||
);
|
||||
let term_weight = term_query.weight(&searcher).unwrap();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let mut term_scorer = term_weight.scorer(segment_reader).unwrap();
|
||||
|
||||
@@ -7,7 +7,8 @@ use postings::Postings;
|
||||
use fastfield::FastFieldReader;
|
||||
|
||||
pub struct TermScorer<TPostings>
|
||||
where TPostings: Postings
|
||||
where
|
||||
TPostings: Postings,
|
||||
{
|
||||
pub idf: Score,
|
||||
pub fieldnorm_reader_opt: Option<U64FastFieldReader>,
|
||||
@@ -15,7 +16,8 @@ pub struct TermScorer<TPostings>
|
||||
}
|
||||
|
||||
impl<TPostings> TermScorer<TPostings>
|
||||
where TPostings: Postings
|
||||
where
|
||||
TPostings: Postings,
|
||||
{
|
||||
pub fn postings(&self) -> &TPostings {
|
||||
&self.postings
|
||||
@@ -23,7 +25,8 @@ impl<TPostings> TermScorer<TPostings>
|
||||
}
|
||||
|
||||
impl<TPostings> DocSet for TermScorer<TPostings>
|
||||
where TPostings: Postings
|
||||
where
|
||||
TPostings: Postings,
|
||||
{
|
||||
fn advance(&mut self) -> bool {
|
||||
self.postings.advance()
|
||||
@@ -40,7 +43,8 @@ impl<TPostings> DocSet for TermScorer<TPostings>
|
||||
}
|
||||
|
||||
impl<TPostings> Scorer for TermScorer<TPostings>
|
||||
where TPostings: Postings
|
||||
where
|
||||
TPostings: Postings,
|
||||
{
|
||||
fn score(&self) -> Score {
|
||||
let doc = self.postings.doc();
|
||||
|
||||
@@ -28,21 +28,22 @@ impl TermWeight {
|
||||
}
|
||||
|
||||
/// If the field is not found, returns an empty `DocSet`.
|
||||
pub fn specialized_scorer(&self,
|
||||
reader: &SegmentReader)
|
||||
-> Result<TermScorer<SegmentPostings>> {
|
||||
pub fn specialized_scorer(
|
||||
&self,
|
||||
reader: &SegmentReader,
|
||||
) -> Result<TermScorer<SegmentPostings>> {
|
||||
let field = self.term.field();
|
||||
let inverted_index = reader.inverted_index(field)?;
|
||||
let inverted_index = reader.inverted_index(field);
|
||||
let fieldnorm_reader_opt = reader.get_fieldnorms_reader(field);
|
||||
let postings_opt: Option<SegmentPostings> = inverted_index.read_postings(&self.term, self.segment_postings_options);
|
||||
let postings_opt: Option<SegmentPostings> =
|
||||
inverted_index.read_postings(&self.term, self.segment_postings_options);
|
||||
if let Some(segment_postings) = postings_opt {
|
||||
Ok(TermScorer {
|
||||
idf: self.idf(),
|
||||
fieldnorm_reader_opt: fieldnorm_reader_opt,
|
||||
postings: segment_postings,
|
||||
})
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
Ok(TermScorer {
|
||||
idf: 1f32,
|
||||
fieldnorm_reader_opt: None,
|
||||
|
||||
@@ -10,7 +10,7 @@ use common::BinarySerializable;
|
||||
///
|
||||
/// Because the field id is a `u8`, tantivy can only have at most `255` fields.
|
||||
/// Value 255 is reserved.
|
||||
#[derive(Copy, Clone, Debug, PartialEq,PartialOrd,Eq,Ord,Hash, Serialize, Deserialize)]
|
||||
#[derive(Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash, Serialize, Deserialize)]
|
||||
pub struct Field(pub u32);
|
||||
|
||||
impl BinarySerializable for Field {
|
||||
|
||||
@@ -89,7 +89,8 @@ impl FieldEntry {
|
||||
|
||||
impl Serialize for FieldEntry {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where S: Serializer
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
let mut s = serializer.serialize_struct("field_entry", 3)?;
|
||||
s.serialize_field("name", &self.name)?;
|
||||
@@ -115,7 +116,8 @@ impl Serialize for FieldEntry {
|
||||
|
||||
impl<'de> Deserialize<'de> for FieldEntry {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where D: Deserializer<'de>
|
||||
where
|
||||
D: Deserializer<'de>,
|
||||
{
|
||||
#[derive(Deserialize)]
|
||||
#[serde(field_identifier, rename_all = "lowercase")]
|
||||
@@ -137,7 +139,8 @@ impl<'de> Deserialize<'de> for FieldEntry {
|
||||
}
|
||||
|
||||
fn visit_map<V>(self, mut map: V) -> Result<FieldEntry, V::Error>
|
||||
where V: MapAccess<'de>
|
||||
where
|
||||
V: MapAccess<'de>,
|
||||
{
|
||||
let mut name = None;
|
||||
let mut ty = None;
|
||||
@@ -187,13 +190,14 @@ impl<'de> Deserialize<'de> for FieldEntry {
|
||||
|
||||
let name = name.ok_or_else(|| de::Error::missing_field("name"))?;
|
||||
ty.ok_or_else(|| de::Error::missing_field("ty"))?;
|
||||
let field_type = field_type
|
||||
.ok_or_else(|| de::Error::missing_field("options"))?;
|
||||
let field_type = field_type.ok_or_else(
|
||||
|| de::Error::missing_field("options"),
|
||||
)?;
|
||||
|
||||
Ok(FieldEntry {
|
||||
name: name,
|
||||
field_type: field_type,
|
||||
})
|
||||
name: name,
|
||||
field_type: field_type,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -80,8 +80,9 @@ impl FieldType {
|
||||
FieldType::Str(_) => Ok(Value::Str(field_text.clone())),
|
||||
FieldType::U64(_) |
|
||||
FieldType::I64(_) => {
|
||||
Err(ValueParsingError::TypeError(format!("Expected an integer, got {:?}",
|
||||
json)))
|
||||
Err(ValueParsingError::TypeError(
|
||||
format!("Expected an integer, got {:?}", json),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -110,9 +111,11 @@ impl FieldType {
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
let msg = format!("Json value not supported error {:?}. Expected {:?}",
|
||||
json,
|
||||
self);
|
||||
let msg = format!(
|
||||
"Json value not supported error {:?}. Expected {:?}",
|
||||
json,
|
||||
self
|
||||
);
|
||||
Err(ValueParsingError::TypeError(msg))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -105,9 +105,9 @@ impl SchemaBuilder {
|
||||
/// This will consume your `SchemaBuilder`
|
||||
pub fn build(self) -> Schema {
|
||||
Schema(Arc::new(InnerSchema {
|
||||
fields: self.fields,
|
||||
fields_map: self.fields_map,
|
||||
}))
|
||||
fields: self.fields,
|
||||
fields_map: self.fields_map,
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -206,15 +206,14 @@ impl Schema {
|
||||
|
||||
/// Build a document object from a json-object.
|
||||
pub fn parse_document(&self, doc_json: &str) -> Result<Document, DocParsingError> {
|
||||
let json_obj: JsonObject<String, JsonValue> = serde_json::from_str(doc_json)
|
||||
.map_err(|_| {
|
||||
let doc_json_sample: String = if doc_json.len() < 20 {
|
||||
String::from(doc_json)
|
||||
} else {
|
||||
format!("{:?}...", &doc_json[0..20])
|
||||
};
|
||||
DocParsingError::NotJSON(doc_json_sample)
|
||||
})?;
|
||||
let json_obj: JsonObject<String, JsonValue> = serde_json::from_str(doc_json).map_err(|_| {
|
||||
let doc_json_sample: String = if doc_json.len() < 20 {
|
||||
String::from(doc_json)
|
||||
} else {
|
||||
format!("{:?}...", &doc_json[0..20])
|
||||
};
|
||||
DocParsingError::NotJSON(doc_json_sample)
|
||||
})?;
|
||||
|
||||
let mut doc = Document::default();
|
||||
for (field_name, json_value) in json_obj.iter() {
|
||||
@@ -225,18 +224,15 @@ impl Schema {
|
||||
match *json_value {
|
||||
JsonValue::Array(ref json_items) => {
|
||||
for json_item in json_items {
|
||||
let value = try!(field_type
|
||||
.value_from_json(json_item)
|
||||
.map_err(|e| {
|
||||
DocParsingError::ValueError(field_name.clone(), e)
|
||||
}));
|
||||
let value =
|
||||
try!(field_type.value_from_json(json_item).map_err(|e| {
|
||||
DocParsingError::ValueError(field_name.clone(), e)
|
||||
}));
|
||||
doc.add(FieldValue::new(field, value));
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
let value = try!(field_type
|
||||
.value_from_json(json_value)
|
||||
.map_err(|e| {
|
||||
let value = try!(field_type.value_from_json(json_value).map_err(|e| {
|
||||
DocParsingError::ValueError(field_name.clone(), e)
|
||||
}));
|
||||
doc.add(FieldValue::new(field, value));
|
||||
@@ -259,7 +255,8 @@ impl fmt::Debug for Schema {
|
||||
|
||||
impl Serialize for Schema {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where S: Serializer
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
let mut seq = serializer.serialize_seq(Some(self.0.fields.len()))?;
|
||||
for e in &self.0.fields {
|
||||
@@ -271,7 +268,8 @@ impl Serialize for Schema {
|
||||
|
||||
impl<'de> Deserialize<'de> for Schema {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where D: Deserializer<'de>
|
||||
where
|
||||
D: Deserializer<'de>,
|
||||
{
|
||||
struct SchemaVisitor;
|
||||
|
||||
@@ -283,7 +281,8 @@ impl<'de> Deserialize<'de> for Schema {
|
||||
}
|
||||
|
||||
fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
|
||||
where A: SeqAccess<'de>
|
||||
where
|
||||
A: SeqAccess<'de>,
|
||||
{
|
||||
let mut schema = SchemaBuilder {
|
||||
fields: Vec::with_capacity(seq.size_hint().unwrap_or(0)),
|
||||
@@ -430,12 +429,14 @@ mod tests {
|
||||
}
|
||||
{
|
||||
let doc = schema
|
||||
.parse_document(r#"{
|
||||
.parse_document(
|
||||
r#"{
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": 4,
|
||||
"popularity": 10
|
||||
}"#)
|
||||
}"#,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(doc.get_first(title_field).unwrap().text(), "my title");
|
||||
assert_eq!(doc.get_first(author_field).unwrap().text(), "fulmicoton");
|
||||
@@ -443,13 +444,15 @@ mod tests {
|
||||
assert_eq!(doc.get_first(popularity_field).unwrap().i64_value(), 10);
|
||||
}
|
||||
{
|
||||
let json_err = schema.parse_document(r#"{
|
||||
let json_err = schema.parse_document(
|
||||
r#"{
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": 4,
|
||||
"popularity": 10,
|
||||
"jambon": "bayonne"
|
||||
}"#);
|
||||
}"#,
|
||||
);
|
||||
match json_err {
|
||||
Err(DocParsingError::NoSuchFieldInSchema(field_name)) => {
|
||||
assert_eq!(field_name, "jambon");
|
||||
@@ -460,13 +463,15 @@ mod tests {
|
||||
}
|
||||
}
|
||||
{
|
||||
let json_err = schema.parse_document(r#"{
|
||||
let json_err = schema.parse_document(
|
||||
r#"{
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": "5",
|
||||
"popularity": "10",
|
||||
"jambon": "bayonne"
|
||||
}"#);
|
||||
}"#,
|
||||
);
|
||||
match json_err {
|
||||
Err(DocParsingError::ValueError(_, ValueParsingError::TypeError(_))) => {
|
||||
assert!(true);
|
||||
@@ -477,12 +482,14 @@ mod tests {
|
||||
}
|
||||
}
|
||||
{
|
||||
let json_err = schema.parse_document(r#"{
|
||||
let json_err = schema.parse_document(
|
||||
r#"{
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": -5,
|
||||
"popularity": 10
|
||||
}"#);
|
||||
}"#,
|
||||
);
|
||||
match json_err {
|
||||
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
|
||||
assert!(true);
|
||||
@@ -493,12 +500,14 @@ mod tests {
|
||||
}
|
||||
}
|
||||
{
|
||||
let json_err = schema.parse_document(r#"{
|
||||
let json_err = schema.parse_document(
|
||||
r#"{
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": 9223372036854775808,
|
||||
"popularity": 10
|
||||
}"#);
|
||||
}"#,
|
||||
);
|
||||
match json_err {
|
||||
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
|
||||
panic!("expected 9223372036854775808 to fit into u64, but it didn't");
|
||||
@@ -509,12 +518,14 @@ mod tests {
|
||||
}
|
||||
}
|
||||
{
|
||||
let json_err = schema.parse_document(r#"{
|
||||
let json_err = schema.parse_document(
|
||||
r#"{
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": 50,
|
||||
"popularity": 9223372036854775808
|
||||
}"#);
|
||||
}"#,
|
||||
);
|
||||
match json_err {
|
||||
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
|
||||
assert!(true);
|
||||
@@ -525,11 +536,13 @@ mod tests {
|
||||
}
|
||||
}
|
||||
{
|
||||
let json_err = schema.parse_document(r#"{
|
||||
let json_err = schema.parse_document(
|
||||
r#"{
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": 50,
|
||||
}"#);
|
||||
}"#,
|
||||
);
|
||||
match json_err {
|
||||
Err(NotJSON(_)) => {
|
||||
assert!(true);
|
||||
|
||||
@@ -13,7 +13,9 @@ const INT_TERM_LEN: usize = 4 + 8;
|
||||
///
|
||||
/// It actually wraps a `Vec<u8>`.
|
||||
#[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
|
||||
pub struct Term<B = Vec<u8>>(B) where B: AsRef<[u8]>;
|
||||
pub struct Term<B = Vec<u8>>(B)
|
||||
where
|
||||
B: AsRef<[u8]>;
|
||||
|
||||
impl Term {
|
||||
/// Builds a term given a field, and a u64-value
|
||||
@@ -109,7 +111,8 @@ impl Term {
|
||||
}
|
||||
|
||||
impl<B> Term<B>
|
||||
where B: AsRef<[u8]>
|
||||
where
|
||||
B: AsRef<[u8]>,
|
||||
{
|
||||
/// Wraps a source of data
|
||||
pub fn wrap(data: B) -> Term<B> {
|
||||
@@ -166,7 +169,8 @@ impl<B> Term<B>
|
||||
}
|
||||
|
||||
impl<B> AsRef<[u8]> for Term<B>
|
||||
where B: AsRef<[u8]>
|
||||
where
|
||||
B: AsRef<[u8]>,
|
||||
{
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
self.0.as_ref()
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::ops::BitOr;
|
||||
|
||||
|
||||
/// Define how a text field should be handled by tantivy.
|
||||
#[derive(Clone,Debug,PartialEq,Eq, Serialize, Deserialize)]
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct TextOptions {
|
||||
indexing: TextIndexingOptions,
|
||||
stored: bool,
|
||||
@@ -45,10 +45,10 @@ impl Default for TextOptions {
|
||||
|
||||
|
||||
/// Describe how a field should be indexed
|
||||
#[derive(Clone,Copy,Debug,PartialEq,PartialOrd,Eq,Hash, Serialize, Deserialize)]
|
||||
#[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Hash, Serialize, Deserialize)]
|
||||
pub enum TextIndexingOptions {
|
||||
/// Unindexed fields will not generate any postings. They will not be searchable either.
|
||||
#[serde(rename="unindexed")]
|
||||
#[serde(rename = "unindexed")]
|
||||
Unindexed,
|
||||
/// Untokenized means that the field text will not be split into tokens before being indexed.
|
||||
/// A field with the value "Hello world" will have the document subscribe to one single
|
||||
@@ -56,23 +56,23 @@ pub enum TextIndexingOptions {
|
||||
///
|
||||
/// It will **not** be searchable if the user enters "hello" for instance.
|
||||
/// This can be useful for tags, or ids for instance.
|
||||
#[serde(rename="untokenized")]
|
||||
#[serde(rename = "untokenized")]
|
||||
Untokenized,
|
||||
/// TokenizedNoFreq will tokenize the field value, and append the document doc id
|
||||
/// to the posting lists associated to all of the tokens.
|
||||
/// The frequency of appearance of the term in the document, however, will be lost.
|
||||
/// The term frequency used in the TfIdf formula will always be 1.
|
||||
#[serde(rename="tokenize")]
|
||||
#[serde(rename = "tokenize")]
|
||||
TokenizedNoFreq,
|
||||
/// TokenizedWithFreq will tokenize the field value, and encode
|
||||
/// both the docid and the term frequency in the posting lists associated to all
|
||||
#[serde(rename="freq")]
|
||||
#[serde(rename = "freq")]
|
||||
TokenizedWithFreq,
|
||||
/// Like TokenizedWithFreq, but also encodes the positions of the
|
||||
/// terms in a separate file. This option is required for phrase queries.
|
||||
/// Don't use this if you are certain you won't need it, the term positions file
|
||||
/// can be very big.
|
||||
#[serde(rename="position")]
|
||||
#[serde(rename = "position")]
|
||||
TokenizedWithFreqAndPosition,
|
||||
}
|
||||
|
||||
|
||||
@@ -16,7 +16,8 @@ pub enum Value {
|
||||
|
||||
impl Serialize for Value {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where S: Serializer
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
match *self {
|
||||
Value::Str(ref v) => serializer.serialize_str(v),
|
||||
@@ -28,7 +29,8 @@ impl Serialize for Value {
|
||||
|
||||
impl<'de> Deserialize<'de> for Value {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where D: Deserializer<'de>
|
||||
where
|
||||
D: Deserializer<'de>,
|
||||
{
|
||||
struct ValueVisitor;
|
||||
|
||||
@@ -162,9 +164,13 @@ mod binary_serialize {
|
||||
Ok(Value::I64(value))
|
||||
}
|
||||
_ => {
|
||||
Err(io::Error::new(io::ErrorKind::InvalidData,
|
||||
format!("No field type is associated with code {:?}",
|
||||
type_code)))
|
||||
Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
format!(
|
||||
"No field type is associated with code {:?}",
|
||||
type_code
|
||||
),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -54,17 +54,19 @@ mod tests {
|
||||
fn write_lorem_ipsum_store(writer: WritePtr, num_docs: usize) -> Schema {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let field_body = schema_builder.add_text_field("body", TextOptions::default().set_stored());
|
||||
let field_title = schema_builder
|
||||
.add_text_field("title", TextOptions::default().set_stored());
|
||||
let field_title =
|
||||
schema_builder.add_text_field("title", TextOptions::default().set_stored());
|
||||
let schema = schema_builder.build();
|
||||
let lorem = String::from("Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
|
||||
let lorem = String::from(
|
||||
"Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
|
||||
do eiusmod tempor incididunt ut labore et dolore magna aliqua. \
|
||||
Ut enim ad minim veniam, quis nostrud exercitation ullamco \
|
||||
laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure \
|
||||
dolor in reprehenderit in voluptate velit esse cillum dolore eu \
|
||||
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non \
|
||||
proident, sunt in culpa qui officia deserunt mollit anim id est \
|
||||
laborum.");
|
||||
laborum.",
|
||||
);
|
||||
{
|
||||
let mut store_writer = StoreWriter::new(writer);
|
||||
for i in 0..num_docs {
|
||||
@@ -96,8 +98,10 @@ mod tests {
|
||||
let store_source = directory.open_read(path).unwrap();
|
||||
let store = StoreReader::from_source(store_source);
|
||||
for i in 0..1_000 {
|
||||
assert_eq!(*store.get(i).unwrap().get_first(field_title).unwrap().text(),
|
||||
format!("Doc {}", i));
|
||||
assert_eq!(
|
||||
*store.get(i).unwrap().get_first(field_title).unwrap().text(),
|
||||
format!("Doc {}", i)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -106,9 +110,9 @@ mod tests {
|
||||
let mut directory = MmapDirectory::create_from_tempdir().unwrap();
|
||||
let path = Path::new("store");
|
||||
b.iter(|| {
|
||||
write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000);
|
||||
directory.delete(path).unwrap();
|
||||
});
|
||||
write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000);
|
||||
directory.delete(path).unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -49,7 +49,7 @@ impl StoreReader {
|
||||
let mut cursor = &total_buffer[block_offset..];
|
||||
let block_length = u32::deserialize(&mut cursor).unwrap();
|
||||
let block_array: &[u8] = &total_buffer[(block_offset + 4 as usize)..
|
||||
(block_offset + 4 + block_length as usize)];
|
||||
(block_offset + 4 + block_length as usize)];
|
||||
let mut lz4_decoder = try!(lz4::Decoder::new(block_array));
|
||||
*self.current_block_offset.borrow_mut() = usize::max_value();
|
||||
try!(lz4_decoder.read_to_end(&mut current_block_mut).map(|_| ()));
|
||||
@@ -94,5 +94,9 @@ fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId)
|
||||
let offset = u64::deserialize(&mut serialized_offset_buf).unwrap();
|
||||
let offset = offset as usize;
|
||||
let max_doc = u32::deserialize(&mut serialized_offset_buf).unwrap();
|
||||
(data.slice(0, offset), data.slice(offset, footer_offset), max_doc)
|
||||
(
|
||||
data.slice(0, offset),
|
||||
data.slice(offset, footer_offset),
|
||||
max_doc,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -49,12 +49,15 @@ impl StoreWriter {
|
||||
///
|
||||
pub fn store<'a>(&mut self, field_values: &[&'a FieldValue]) -> io::Result<()> {
|
||||
self.intermediary_buffer.clear();
|
||||
try!((field_values.len() as u32).serialize(&mut self.intermediary_buffer));
|
||||
try!((field_values.len() as u32).serialize(
|
||||
&mut self.intermediary_buffer,
|
||||
));
|
||||
for field_value in field_values {
|
||||
try!((*field_value).serialize(&mut self.intermediary_buffer));
|
||||
}
|
||||
(self.intermediary_buffer.len() as u32)
|
||||
.serialize(&mut self.current_block)?;
|
||||
(self.intermediary_buffer.len() as u32).serialize(
|
||||
&mut self.current_block,
|
||||
)?;
|
||||
self.current_block.write_all(&self.intermediary_buffer[..])?;
|
||||
self.doc += 1;
|
||||
if self.current_block.len() > BLOCK_SIZE {
|
||||
@@ -66,16 +69,22 @@ impl StoreWriter {
|
||||
fn write_and_compress_block(&mut self) -> io::Result<()> {
|
||||
self.intermediary_buffer.clear();
|
||||
{
|
||||
let mut encoder = try!(lz4::EncoderBuilder::new().build(&mut self.intermediary_buffer));
|
||||
let mut encoder = try!(lz4::EncoderBuilder::new().build(
|
||||
&mut self.intermediary_buffer,
|
||||
));
|
||||
try!(encoder.write_all(&self.current_block));
|
||||
let (_, encoder_result) = encoder.finish();
|
||||
try!(encoder_result);
|
||||
}
|
||||
(self.intermediary_buffer.len() as u32)
|
||||
.serialize(&mut self.writer)?;
|
||||
(self.intermediary_buffer.len() as u32).serialize(
|
||||
&mut self.writer,
|
||||
)?;
|
||||
self.writer.write_all(&self.intermediary_buffer)?;
|
||||
self.offset_index_writer
|
||||
.insert(self.doc, &(self.writer.written_bytes() as u64))?;
|
||||
self.offset_index_writer.insert(
|
||||
self.doc,
|
||||
&(self.writer.written_bytes() as
|
||||
u64),
|
||||
)?;
|
||||
self.current_block.clear();
|
||||
Ok(())
|
||||
}
|
||||
@@ -90,8 +99,7 @@ impl StoreWriter {
|
||||
try!(self.write_and_compress_block());
|
||||
}
|
||||
let header_offset: u64 = self.writer.written_bytes() as u64;
|
||||
try!(self.offset_index_writer
|
||||
.write(&mut self.writer));
|
||||
try!(self.offset_index_writer.write(&mut self.writer));
|
||||
try!(header_offset.serialize(&mut self.writer));
|
||||
try!(self.doc.serialize(&mut self.writer));
|
||||
self.writer.flush()
|
||||
|
||||
@@ -5,17 +5,13 @@ use super::TermDictionaryImpl;
|
||||
use termdict::{TermStreamerBuilder, TermStreamer};
|
||||
|
||||
/// See [`TermStreamerBuilder`](./trait.TermStreamerBuilder.html)
|
||||
pub struct TermStreamerBuilderImpl<'a>
|
||||
{
|
||||
pub struct TermStreamerBuilderImpl<'a> {
|
||||
fst_map: &'a TermDictionaryImpl,
|
||||
stream_builder: StreamBuilder<'a>,
|
||||
}
|
||||
|
||||
impl<'a> TermStreamerBuilderImpl<'a>
|
||||
{
|
||||
pub(crate) fn new(fst_map: &'a TermDictionaryImpl,
|
||||
stream_builder: StreamBuilder<'a>)
|
||||
-> Self {
|
||||
impl<'a> TermStreamerBuilderImpl<'a> {
|
||||
pub(crate) fn new(fst_map: &'a TermDictionaryImpl, stream_builder: StreamBuilder<'a>) -> Self {
|
||||
TermStreamerBuilderImpl {
|
||||
fst_map: fst_map,
|
||||
stream_builder: stream_builder,
|
||||
@@ -23,8 +19,7 @@ impl<'a> TermStreamerBuilderImpl<'a>
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a>
|
||||
{
|
||||
impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> {
|
||||
type Streamer = TermStreamerImpl<'a>;
|
||||
|
||||
fn ge<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
|
||||
@@ -60,8 +55,7 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a>
|
||||
|
||||
|
||||
/// See [`TermStreamer`](./trait.TermStreamer.html)
|
||||
pub struct TermStreamerImpl<'a>
|
||||
{
|
||||
pub struct TermStreamerImpl<'a> {
|
||||
fst_map: &'a TermDictionaryImpl,
|
||||
stream: Stream<'a>,
|
||||
offset: u64,
|
||||
@@ -69,17 +63,15 @@ pub struct TermStreamerImpl<'a>
|
||||
current_value: TermInfo,
|
||||
}
|
||||
|
||||
impl<'a> TermStreamer for TermStreamerImpl<'a>
|
||||
{
|
||||
impl<'a> TermStreamer for TermStreamerImpl<'a> {
|
||||
fn advance(&mut self) -> bool {
|
||||
if let Some((term, offset)) = self.stream.next() {
|
||||
self.current_key.clear();
|
||||
self.current_key.extend_from_slice(term);
|
||||
self.offset = offset;
|
||||
self.current_value =
|
||||
self.fst_map
|
||||
.read_value(self.offset)
|
||||
.expect("Fst data is corrupted. Failed to deserialize a value.");
|
||||
self.current_value = self.fst_map.read_value(self.offset).expect(
|
||||
"Fst data is corrupted. Failed to deserialize a value.",
|
||||
);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
|
||||
@@ -13,14 +13,14 @@ fn convert_fst_error(e: fst::Error) -> io::Error {
|
||||
}
|
||||
|
||||
/// See [`TermDictionaryBuilder`](./trait.TermDictionaryBuilder.html)
|
||||
pub struct TermDictionaryBuilderImpl<W>
|
||||
{
|
||||
pub struct TermDictionaryBuilderImpl<W> {
|
||||
fst_builder: fst::MapBuilder<W>,
|
||||
data: Vec<u8>,
|
||||
}
|
||||
|
||||
impl<W> TermDictionaryBuilderImpl<W>
|
||||
where W: Write
|
||||
where
|
||||
W: Write,
|
||||
{
|
||||
/// # Warning
|
||||
/// Horribly dangerous internal API
|
||||
@@ -46,14 +46,15 @@ impl<W> TermDictionaryBuilderImpl<W>
|
||||
}
|
||||
|
||||
impl<W> TermDictionaryBuilder<W> for TermDictionaryBuilderImpl<W>
|
||||
where W: Write
|
||||
where
|
||||
W: Write,
|
||||
{
|
||||
fn new(w: W, _field_type: FieldType) -> io::Result<Self> {
|
||||
let fst_builder = fst::MapBuilder::new(w).map_err(convert_fst_error)?;
|
||||
Ok(TermDictionaryBuilderImpl {
|
||||
fst_builder: fst_builder,
|
||||
data: Vec::new(),
|
||||
})
|
||||
fst_builder: fst_builder,
|
||||
data: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
fn insert<K: AsRef<[u8]>>(&mut self, key_ref: K, value: &TermInfo) -> io::Result<()> {
|
||||
@@ -75,28 +76,25 @@ impl<W> TermDictionaryBuilder<W> for TermDictionaryBuilderImpl<W>
|
||||
}
|
||||
}
|
||||
|
||||
fn open_fst_index(source: ReadOnlySource) -> io::Result<fst::Map> {
|
||||
fn open_fst_index(source: ReadOnlySource) -> fst::Map {
|
||||
let fst = match source {
|
||||
ReadOnlySource::Anonymous(data) => {
|
||||
Fst::from_shared_bytes(data.data, data.start, data.len)
|
||||
.map_err(convert_fst_error)?
|
||||
Fst::from_shared_bytes(data.data, data.start, data.len).expect("FST data is corrupted")
|
||||
}
|
||||
ReadOnlySource::Mmap(mmap_readonly) => {
|
||||
Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)?
|
||||
Fst::from_mmap(mmap_readonly).expect("FST data is corrupted")
|
||||
}
|
||||
};
|
||||
Ok(fst::Map::from(fst))
|
||||
fst::Map::from(fst)
|
||||
}
|
||||
|
||||
/// See [`TermDictionary`](./trait.TermDictionary.html)
|
||||
pub struct TermDictionaryImpl
|
||||
{
|
||||
pub struct TermDictionaryImpl {
|
||||
fst_index: fst::Map,
|
||||
values_mmap: ReadOnlySource,
|
||||
}
|
||||
|
||||
impl TermDictionaryImpl
|
||||
{
|
||||
impl TermDictionaryImpl {
|
||||
/// Deserializes and returns the value at address `offset`
|
||||
pub(crate) fn read_value(&self, offset: u64) -> io::Result<TermInfo> {
|
||||
let buffer = self.values_mmap.as_slice();
|
||||
@@ -106,34 +104,34 @@ impl TermDictionaryImpl
|
||||
}
|
||||
|
||||
|
||||
impl<'a> TermDictionary<'a> for TermDictionaryImpl
|
||||
{
|
||||
impl<'a> TermDictionary<'a> for TermDictionaryImpl {
|
||||
type Streamer = TermStreamerImpl<'a>;
|
||||
|
||||
type StreamBuilder = TermStreamerBuilderImpl<'a>;
|
||||
|
||||
fn from_source(source: ReadOnlySource) -> io::Result<Self> {
|
||||
fn from_source(source: ReadOnlySource) -> Self {
|
||||
let total_len = source.len();
|
||||
let length_offset = total_len - 4;
|
||||
let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..];
|
||||
let footer_size = u32::deserialize(&mut split_len_buffer)? as usize;
|
||||
let footer_size = u32::deserialize(&mut split_len_buffer).expect(
|
||||
"Deserializing 4 bytes should always work",
|
||||
) as usize;
|
||||
let split_len = length_offset - footer_size;
|
||||
let fst_source = source.slice(0, split_len);
|
||||
let values_source = source.slice(split_len, length_offset);
|
||||
let fst_index = open_fst_index(fst_source)?;
|
||||
Ok(TermDictionaryImpl {
|
||||
fst_index: fst_index,
|
||||
values_mmap: values_source,
|
||||
})
|
||||
let fst_index = open_fst_index(fst_source);
|
||||
TermDictionaryImpl {
|
||||
fst_index: fst_index,
|
||||
values_mmap: values_source,
|
||||
}
|
||||
}
|
||||
|
||||
fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<TermInfo> {
|
||||
self.fst_index
|
||||
.get(key)
|
||||
.map(|offset| {
|
||||
self.read_value(offset)
|
||||
.expect("The fst is corrupted. Failed to deserialize a value.")
|
||||
})
|
||||
self.fst_index.get(key).map(|offset| {
|
||||
self.read_value(offset).expect(
|
||||
"The fst is corrupted. Failed to deserialize a value.",
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
fn range(&self) -> TermStreamerBuilderImpl {
|
||||
|
||||
@@ -4,30 +4,26 @@ use std::cmp::Ordering;
|
||||
use termdict::TermStreamer;
|
||||
use schema::Term;
|
||||
|
||||
pub struct HeapItem<'a>
|
||||
{
|
||||
pub struct HeapItem<'a> {
|
||||
pub streamer: TermStreamerImpl<'a>,
|
||||
pub segment_ord: usize,
|
||||
}
|
||||
|
||||
impl<'a> PartialEq for HeapItem<'a>
|
||||
{
|
||||
impl<'a> PartialEq for HeapItem<'a> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.segment_ord == other.segment_ord
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Eq for HeapItem<'a> {}
|
||||
impl<'a> Eq for HeapItem<'a> {}
|
||||
|
||||
impl<'a> PartialOrd for HeapItem<'a>
|
||||
{
|
||||
impl<'a> PartialOrd for HeapItem<'a> {
|
||||
fn partial_cmp(&self, other: &HeapItem<'a>) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Ord for HeapItem<'a>
|
||||
{
|
||||
impl<'a> Ord for HeapItem<'a> {
|
||||
fn cmp(&self, other: &HeapItem<'a>) -> Ordering {
|
||||
(&other.streamer.key(), &other.segment_ord).cmp(&(&self.streamer.key(), &self.segment_ord))
|
||||
}
|
||||
@@ -40,15 +36,12 @@ impl<'a> Ord for HeapItem<'a>
|
||||
/// - the term
|
||||
/// - a slice with the ordinal of the segments containing
|
||||
/// the terms.
|
||||
pub struct TermMerger<'a>
|
||||
{
|
||||
pub struct TermMerger<'a> {
|
||||
heap: BinaryHeap<HeapItem<'a>>,
|
||||
current_streamers: Vec<HeapItem<'a>>,
|
||||
}
|
||||
|
||||
impl<'a> TermMerger<'a>
|
||||
{
|
||||
|
||||
impl<'a> TermMerger<'a> {
|
||||
/// Stream of merged term dictionary
|
||||
///
|
||||
///
|
||||
@@ -59,11 +52,11 @@ impl<'a> TermMerger<'a>
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.map(|(ord, streamer)| {
|
||||
HeapItem {
|
||||
streamer: streamer,
|
||||
segment_ord: ord,
|
||||
}
|
||||
})
|
||||
HeapItem {
|
||||
streamer: streamer,
|
||||
segment_ord: ord,
|
||||
}
|
||||
})
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
@@ -133,5 +126,3 @@ impl<'a> TermMerger<'a>
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -54,16 +54,16 @@ use postings::TermInfo;
|
||||
|
||||
pub use self::merger::TermMerger;
|
||||
|
||||
#[cfg(not(feature="streamdict"))]
|
||||
#[cfg(not(feature = "streamdict"))]
|
||||
mod fstdict;
|
||||
#[cfg(not(feature="streamdict"))]
|
||||
#[cfg(not(feature = "streamdict"))]
|
||||
pub use self::fstdict::{TermDictionaryImpl, TermDictionaryBuilderImpl, TermStreamerImpl,
|
||||
TermStreamerBuilderImpl};
|
||||
|
||||
|
||||
#[cfg(feature="streamdict")]
|
||||
#[cfg(feature = "streamdict")]
|
||||
mod streamdict;
|
||||
#[cfg(feature="streamdict")]
|
||||
#[cfg(feature = "streamdict")]
|
||||
pub use self::streamdict::{TermDictionaryImpl, TermDictionaryBuilderImpl, TermStreamerImpl,
|
||||
TermStreamerBuilderImpl};
|
||||
|
||||
@@ -72,7 +72,9 @@ use std::io;
|
||||
|
||||
|
||||
/// Dictionary associating sorted `&[u8]` to values
|
||||
pub trait TermDictionary<'a> where Self: Sized
|
||||
pub trait TermDictionary<'a>
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
/// Streamer type associated to the term dictionary
|
||||
type Streamer: TermStreamer + 'a;
|
||||
@@ -81,7 +83,7 @@ pub trait TermDictionary<'a> where Self: Sized
|
||||
type StreamBuilder: TermStreamerBuilder<Streamer = Self::Streamer> + 'a;
|
||||
|
||||
/// Opens a `TermDictionary` given a data source.
|
||||
fn from_source(source: ReadOnlySource) -> io::Result<Self>;
|
||||
fn from_source(source: ReadOnlySource) -> Self;
|
||||
|
||||
/// Looks up the value corresponding to the key.
|
||||
fn get<K: AsRef<[u8]>>(&self, target_key: K) -> Option<TermInfo>;
|
||||
@@ -110,7 +112,8 @@ pub trait TermDictionary<'a> where Self: Sized
|
||||
///
|
||||
/// Inserting must be done in the order of the `keys`.
|
||||
pub trait TermDictionaryBuilder<W>: Sized
|
||||
where W: io::Write
|
||||
where
|
||||
W: io::Write,
|
||||
{
|
||||
/// Creates a new `TermDictionaryBuilder`
|
||||
fn new(write: W, field_type: FieldType) -> io::Result<Self>;
|
||||
@@ -170,8 +173,7 @@ pub trait TermStreamer: Sized {
|
||||
|
||||
/// `TermStreamerBuilder` is a helper object used to define
|
||||
/// a range of terms that should be streamed.
|
||||
pub trait TermStreamerBuilder
|
||||
{
|
||||
pub trait TermStreamerBuilder {
|
||||
/// Associated `TermStreamer` type that this builder is building.
|
||||
type Streamer: TermStreamer;
|
||||
|
||||
@@ -226,7 +228,8 @@ mod tests {
|
||||
{
|
||||
let write = directory.open_write(&path).unwrap();
|
||||
let field_type = FieldType::Str(TEXT);
|
||||
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(write, field_type).unwrap();
|
||||
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(write, field_type)
|
||||
.unwrap();
|
||||
term_dictionary_builder
|
||||
.insert("abc".as_bytes(), &make_term_info(34u32))
|
||||
.unwrap();
|
||||
@@ -236,7 +239,7 @@ mod tests {
|
||||
term_dictionary_builder.finish().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
let term_dict: TermDictionaryImpl = TermDictionaryImpl::from_source(source).unwrap();
|
||||
let term_dict: TermDictionaryImpl = TermDictionaryImpl::from_source(source);
|
||||
assert_eq!(term_dict.get("abc").unwrap().doc_freq, 34u32);
|
||||
assert_eq!(term_dict.get("abcd").unwrap().doc_freq, 346u32);
|
||||
let mut stream = term_dict.stream();
|
||||
@@ -296,7 +299,7 @@ mod tests {
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
|
||||
let field_searcher = searcher.field(text_field).unwrap();
|
||||
let field_searcher = searcher.field(text_field);
|
||||
let mut term_it = field_searcher.terms();
|
||||
let mut term_string = String::new();
|
||||
while term_it.advance() {
|
||||
@@ -314,15 +317,17 @@ mod tests {
|
||||
.collect();
|
||||
let field_type = FieldType::Str(TEXT);
|
||||
let buffer: Vec<u8> = {
|
||||
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type).unwrap();
|
||||
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type)
|
||||
.unwrap();
|
||||
for &(ref id, ref i) in &ids {
|
||||
term_dictionary_builder.insert(id.as_bytes(), &make_term_info(*i)).unwrap();
|
||||
term_dictionary_builder
|
||||
.insert(id.as_bytes(), &make_term_info(*i))
|
||||
.unwrap();
|
||||
}
|
||||
term_dictionary_builder.finish().unwrap()
|
||||
};
|
||||
let source = ReadOnlySource::from(buffer);
|
||||
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source)
|
||||
.unwrap();
|
||||
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source);
|
||||
{
|
||||
let mut streamer = term_dictionary.stream();
|
||||
let mut i = 0;
|
||||
@@ -343,16 +348,22 @@ mod tests {
|
||||
fn test_stream_high_range_prefix_suffix() {
|
||||
let field_type = FieldType::Str(TEXT);
|
||||
let buffer: Vec<u8> = {
|
||||
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type).unwrap();
|
||||
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type)
|
||||
.unwrap();
|
||||
// term requires more than 16bits
|
||||
term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1)).unwrap();
|
||||
term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2)).unwrap();
|
||||
term_dictionary_builder.insert("abr", &make_term_info(2)).unwrap();
|
||||
term_dictionary_builder
|
||||
.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))
|
||||
.unwrap();
|
||||
term_dictionary_builder
|
||||
.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2))
|
||||
.unwrap();
|
||||
term_dictionary_builder
|
||||
.insert("abr", &make_term_info(2))
|
||||
.unwrap();
|
||||
term_dictionary_builder.finish().unwrap()
|
||||
};
|
||||
let source = ReadOnlySource::from(buffer);
|
||||
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source)
|
||||
.unwrap();
|
||||
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source);
|
||||
let mut kv_stream = term_dictionary.stream();
|
||||
assert!(kv_stream.advance());
|
||||
assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxy".as_bytes());
|
||||
@@ -372,17 +383,19 @@ mod tests {
|
||||
.collect();
|
||||
let field_type = FieldType::Str(TEXT);
|
||||
let buffer: Vec<u8> = {
|
||||
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type).unwrap();
|
||||
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type)
|
||||
.unwrap();
|
||||
for &(ref id, ref i) in &ids {
|
||||
term_dictionary_builder.insert(id.as_bytes(), &make_term_info(*i)).unwrap();
|
||||
term_dictionary_builder
|
||||
.insert(id.as_bytes(), &make_term_info(*i))
|
||||
.unwrap();
|
||||
}
|
||||
term_dictionary_builder.finish().unwrap()
|
||||
};
|
||||
|
||||
let source = ReadOnlySource::from(buffer);
|
||||
|
||||
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source)
|
||||
.unwrap();
|
||||
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source);
|
||||
{
|
||||
for i in (0..20).chain(6000..8_000) {
|
||||
let &(ref target_key, _) = &ids[i];
|
||||
@@ -440,16 +453,18 @@ mod tests {
|
||||
fn test_stream_range_boundaries() {
|
||||
let field_type = FieldType::Str(TEXT);
|
||||
let buffer: Vec<u8> = {
|
||||
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type).unwrap();
|
||||
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type)
|
||||
.unwrap();
|
||||
for i in 0u8..10u8 {
|
||||
let number_arr = [i; 1];
|
||||
term_dictionary_builder.insert(&number_arr, &make_term_info(i as u32)).unwrap();
|
||||
term_dictionary_builder
|
||||
.insert(&number_arr, &make_term_info(i as u32))
|
||||
.unwrap();
|
||||
}
|
||||
term_dictionary_builder.finish().unwrap()
|
||||
};
|
||||
let source = ReadOnlySource::from(buffer);
|
||||
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source)
|
||||
.unwrap();
|
||||
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source);
|
||||
|
||||
let value_list = |mut streamer: TermStreamerImpl| {
|
||||
let mut res: Vec<u32> = vec![];
|
||||
@@ -460,12 +475,17 @@ mod tests {
|
||||
};
|
||||
{
|
||||
let range = term_dictionary.range().ge([2u8]).into_stream();
|
||||
assert_eq!(value_list(range),
|
||||
vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]);
|
||||
assert_eq!(
|
||||
value_list(range),
|
||||
vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
|
||||
);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().gt([2u8]).into_stream();
|
||||
assert_eq!(value_list(range), vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]);
|
||||
assert_eq!(
|
||||
value_list(range),
|
||||
vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
|
||||
);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().lt([6u8]).into_stream();
|
||||
@@ -473,7 +493,10 @@ mod tests {
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().le([6u8]).into_stream();
|
||||
assert_eq!(value_list(range), vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32]);
|
||||
assert_eq!(
|
||||
value_list(range),
|
||||
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32]
|
||||
);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().ge([0u8]).lt([5u8]).into_stream();
|
||||
|
||||
@@ -11,7 +11,7 @@ use common::BinarySerializable;
|
||||
fn common_prefix_len(s1: &[u8], s2: &[u8]) -> usize {
|
||||
s1.iter()
|
||||
.zip(s2.iter())
|
||||
.take_while(|&(a, b)| a==b)
|
||||
.take_while(|&(a, b)| a == b)
|
||||
.count()
|
||||
}
|
||||
|
||||
@@ -45,32 +45,28 @@ pub struct TermDeltaDecoder {
|
||||
|
||||
impl TermDeltaDecoder {
|
||||
pub fn with_previous_term(term: Vec<u8>) -> TermDeltaDecoder {
|
||||
TermDeltaDecoder {
|
||||
term: Vec::from(term)
|
||||
}
|
||||
TermDeltaDecoder { term: Vec::from(term) }
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn decode<'a>(&mut self, code: u8, mut cursor: &'a [u8]) -> &'a [u8] {
|
||||
let (prefix_len, suffix_len): (usize, usize) =
|
||||
if (code & 1u8) == 1u8 {
|
||||
let b = cursor[0];
|
||||
cursor = &cursor[1..];
|
||||
let prefix_len = (b & 15u8) as usize;
|
||||
let suffix_len = (b >> 4u8) as usize;
|
||||
(prefix_len, suffix_len)
|
||||
}
|
||||
else {
|
||||
let prefix_len = u32::deserialize(&mut cursor).unwrap();
|
||||
let suffix_len = u32::deserialize(&mut cursor).unwrap();
|
||||
(prefix_len as usize, suffix_len as usize)
|
||||
};
|
||||
let (prefix_len, suffix_len): (usize, usize) = if (code & 1u8) == 1u8 {
|
||||
let b = cursor[0];
|
||||
cursor = &cursor[1..];
|
||||
let prefix_len = (b & 15u8) as usize;
|
||||
let suffix_len = (b >> 4u8) as usize;
|
||||
(prefix_len, suffix_len)
|
||||
} else {
|
||||
let prefix_len = u32::deserialize(&mut cursor).unwrap();
|
||||
let suffix_len = u32::deserialize(&mut cursor).unwrap();
|
||||
(prefix_len as usize, suffix_len as usize)
|
||||
};
|
||||
unsafe { self.term.set_len(prefix_len) };
|
||||
self.term.extend_from_slice(&(*cursor)[..suffix_len]);
|
||||
&cursor[suffix_len..]
|
||||
}
|
||||
|
||||
pub fn term(&self) -> &[u8] {
|
||||
pub fn term(&self) -> &[u8] {
|
||||
&self.term[..]
|
||||
}
|
||||
}
|
||||
@@ -89,7 +85,6 @@ pub struct TermInfoDeltaEncoder {
|
||||
}
|
||||
|
||||
impl TermInfoDeltaEncoder {
|
||||
|
||||
pub fn new(has_positions: bool) -> Self {
|
||||
TermInfoDeltaEncoder {
|
||||
term_info: TermInfo::default(),
|
||||
@@ -109,7 +104,8 @@ impl TermInfoDeltaEncoder {
|
||||
positions_inner_offset: 0,
|
||||
};
|
||||
if self.has_positions {
|
||||
delta_term_info.delta_positions_offset = term_info.positions_offset - self.term_info.positions_offset;
|
||||
delta_term_info.delta_positions_offset = term_info.positions_offset -
|
||||
self.term_info.positions_offset;
|
||||
delta_term_info.positions_inner_offset = term_info.positions_inner_offset;
|
||||
}
|
||||
mem::replace(&mut self.term_info, term_info);
|
||||
@@ -131,7 +127,6 @@ pub fn make_mask(num_bytes: usize) -> u32 {
|
||||
}
|
||||
|
||||
impl TermInfoDeltaDecoder {
|
||||
|
||||
pub fn from_term_info(term_info: TermInfo, has_positions: bool) -> TermInfoDeltaDecoder {
|
||||
TermInfoDeltaDecoder {
|
||||
term_info: term_info,
|
||||
@@ -147,7 +142,7 @@ impl TermInfoDeltaDecoder {
|
||||
positions_offset: checkpoint.positions_offset,
|
||||
positions_inner_offset: 0u8,
|
||||
},
|
||||
has_positions: has_positions
|
||||
has_positions: has_positions,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -164,12 +159,12 @@ impl TermInfoDeltaDecoder {
|
||||
self.term_info.postings_offset += delta_postings_offset;
|
||||
if self.has_positions {
|
||||
let num_bytes_positions_offset = ((code >> 5) & 3) as usize + 1;
|
||||
let delta_positions_offset: u32 = unsafe { *(cursor.as_ptr() as *const u32) } & make_mask(num_bytes_positions_offset);
|
||||
let delta_positions_offset: u32 = unsafe { *(cursor.as_ptr() as *const u32) } &
|
||||
make_mask(num_bytes_positions_offset);
|
||||
self.term_info.positions_offset += delta_positions_offset;
|
||||
self.term_info.positions_inner_offset = cursor[num_bytes_positions_offset];
|
||||
&cursor[num_bytes_positions_offset + 1..]
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
cursor
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,7 +22,6 @@ pub struct CheckPoint {
|
||||
}
|
||||
|
||||
impl BinarySerializable for CheckPoint {
|
||||
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
self.stream_offset.serialize(writer)?;
|
||||
self.postings_offset.serialize(writer)?;
|
||||
@@ -40,4 +39,4 @@ impl BinarySerializable for CheckPoint {
|
||||
positions_offset: positions_offset,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,11 +7,11 @@ use postings::TermInfo;
|
||||
use super::delta_encoder::{TermInfoDeltaDecoder, TermDeltaDecoder};
|
||||
|
||||
|
||||
fn stream_before<'a>(term_dictionary: &'a TermDictionaryImpl,
|
||||
target_key: &[u8],
|
||||
has_positions: bool)
|
||||
-> TermStreamerImpl<'a>
|
||||
{
|
||||
fn stream_before<'a>(
|
||||
term_dictionary: &'a TermDictionaryImpl,
|
||||
target_key: &[u8],
|
||||
has_positions: bool,
|
||||
) -> TermStreamerImpl<'a> {
|
||||
|
||||
let (prev_key, checkpoint) = term_dictionary.strictly_previous_key(target_key.as_ref());
|
||||
let stream_data: &'a [u8] = &term_dictionary.stream_data()[checkpoint.stream_offset as usize..];
|
||||
@@ -24,8 +24,7 @@ fn stream_before<'a>(term_dictionary: &'a TermDictionaryImpl,
|
||||
|
||||
|
||||
/// See [`TermStreamerBuilder`](./trait.TermStreamerBuilder.html)
|
||||
pub struct TermStreamerBuilderImpl<'a>
|
||||
{
|
||||
pub struct TermStreamerBuilderImpl<'a> {
|
||||
term_dictionary: &'a TermDictionaryImpl,
|
||||
origin: usize,
|
||||
offset_from: usize,
|
||||
@@ -35,14 +34,17 @@ pub struct TermStreamerBuilderImpl<'a>
|
||||
has_positions: bool,
|
||||
}
|
||||
|
||||
impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a>
|
||||
{
|
||||
impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> {
|
||||
type Streamer = TermStreamerImpl<'a>;
|
||||
|
||||
/// Limit the range to terms greater or equal to the bound
|
||||
fn ge<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
|
||||
let target_key = bound.as_ref();
|
||||
let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.has_positions);
|
||||
let streamer = stream_before(
|
||||
self.term_dictionary,
|
||||
target_key.as_ref(),
|
||||
self.has_positions,
|
||||
);
|
||||
let smaller_than = |k: &[u8]| k.lt(target_key);
|
||||
let (offset_before, current_key, term_info) = get_offset(smaller_than, streamer);
|
||||
self.current_key = current_key;
|
||||
@@ -54,7 +56,11 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a>
|
||||
/// Limit the range to terms strictly greater than the bound
|
||||
fn gt<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
|
||||
let target_key = bound.as_ref();
|
||||
let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.has_positions);
|
||||
let streamer = stream_before(
|
||||
self.term_dictionary,
|
||||
target_key.as_ref(),
|
||||
self.has_positions,
|
||||
);
|
||||
let smaller_than = |k: &[u8]| k.le(target_key);
|
||||
let (offset_before, current_key, term_info) = get_offset(smaller_than, streamer);
|
||||
self.current_key = current_key;
|
||||
@@ -66,7 +72,11 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a>
|
||||
/// Limit the range to terms lesser or equal to the bound
|
||||
fn lt<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
|
||||
let target_key = bound.as_ref();
|
||||
let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.has_positions);
|
||||
let streamer = stream_before(
|
||||
self.term_dictionary,
|
||||
target_key.as_ref(),
|
||||
self.has_positions,
|
||||
);
|
||||
let smaller_than = |k: &[u8]| k.lt(target_key);
|
||||
let (offset_before, _, _) = get_offset(smaller_than, streamer);
|
||||
self.offset_to = offset_before - self.origin;
|
||||
@@ -76,7 +86,11 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a>
|
||||
/// Limit the range to terms lesser or equal to the bound
|
||||
fn le<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
|
||||
let target_key = bound.as_ref();
|
||||
let streamer = stream_before(self.term_dictionary, target_key.as_ref(), self.has_positions);
|
||||
let streamer = stream_before(
|
||||
self.term_dictionary,
|
||||
target_key.as_ref(),
|
||||
self.has_positions,
|
||||
);
|
||||
let smaller_than = |k: &[u8]| k.le(target_key);
|
||||
let (offset_before, _, _) = get_offset(smaller_than, streamer);
|
||||
self.offset_to = offset_before - self.origin;
|
||||
@@ -88,10 +102,13 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a>
|
||||
let data: &[u8] = self.term_dictionary.stream_data();
|
||||
let start = self.offset_from;
|
||||
let stop = max(self.offset_to, start);
|
||||
let term_delta_decoder = TermDeltaDecoder::with_previous_term(self.current_key);
|
||||
let term_info_decoder =
|
||||
TermInfoDeltaDecoder::from_term_info(self.term_info, self.has_positions);
|
||||
TermStreamerImpl {
|
||||
cursor: &data[start..stop],
|
||||
term_delta_decoder: TermDeltaDecoder::with_previous_term(self.current_key),
|
||||
term_info_decoder: TermInfoDeltaDecoder::from_term_info(self.term_info, self.has_positions), // TODO checkpoint
|
||||
term_delta_decoder: term_delta_decoder,
|
||||
term_info_decoder: term_info_decoder,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -103,10 +120,10 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a>
|
||||
/// - the block start
|
||||
/// - the index within this block
|
||||
/// - the term_buffer state to initialize the block)
|
||||
fn get_offset<'a, P: Fn(&[u8]) -> bool>(predicate: P,
|
||||
mut streamer: TermStreamerImpl<'a>)
|
||||
-> (usize, Vec<u8>, TermInfo)
|
||||
{
|
||||
fn get_offset<'a, P: Fn(&[u8]) -> bool>(
|
||||
predicate: P,
|
||||
mut streamer: TermStreamerImpl<'a>,
|
||||
) -> (usize, Vec<u8>, TermInfo) {
|
||||
let mut prev: &[u8] = streamer.cursor;
|
||||
|
||||
let mut term_info = streamer.value().clone();
|
||||
@@ -124,11 +141,8 @@ fn get_offset<'a, P: Fn(&[u8]) -> bool>(predicate: P,
|
||||
(prev.as_ptr() as usize, prev_data, term_info)
|
||||
}
|
||||
|
||||
impl<'a> TermStreamerBuilderImpl<'a>
|
||||
{
|
||||
pub(crate) fn new(
|
||||
term_dictionary: &'a TermDictionaryImpl,
|
||||
has_positions: bool) -> Self {
|
||||
impl<'a> TermStreamerBuilderImpl<'a> {
|
||||
pub(crate) fn new(term_dictionary: &'a TermDictionaryImpl, has_positions: bool) -> Self {
|
||||
let data = term_dictionary.stream_data();
|
||||
let origin = data.as_ptr() as usize;
|
||||
TermStreamerBuilderImpl {
|
||||
@@ -146,8 +160,7 @@ impl<'a> TermStreamerBuilderImpl<'a>
|
||||
|
||||
|
||||
/// See [`TermStreamer`](./trait.TermStreamer.html)
|
||||
pub struct TermStreamerImpl<'a>
|
||||
{
|
||||
pub struct TermStreamerImpl<'a> {
|
||||
cursor: &'a [u8],
|
||||
term_delta_decoder: TermDeltaDecoder,
|
||||
term_info_decoder: TermInfoDeltaDecoder,
|
||||
@@ -156,8 +169,7 @@ pub struct TermStreamerImpl<'a>
|
||||
|
||||
|
||||
|
||||
impl<'a> TermStreamer for TermStreamerImpl<'a>
|
||||
{
|
||||
impl<'a> TermStreamer for TermStreamerImpl<'a> {
|
||||
fn advance(&mut self) -> bool {
|
||||
if self.cursor.is_empty() {
|
||||
return false;
|
||||
@@ -178,4 +190,3 @@ impl<'a> TermStreamer for TermStreamerImpl<'a>
|
||||
&self.term_info_decoder.term_info()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -30,20 +30,16 @@ fn has_positions(field_type: &FieldType) -> bool {
|
||||
let indexing_options = text_options.get_indexing_options();
|
||||
if indexing_options.is_position_enabled() {
|
||||
true
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
false
|
||||
}
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`TermDictionaryBuilder`](./trait.TermDictionaryBuilder.html)
|
||||
pub struct TermDictionaryBuilderImpl<W>
|
||||
{
|
||||
pub struct TermDictionaryBuilderImpl<W> {
|
||||
write: CountingWriter<W>,
|
||||
term_delta_encoder: TermDeltaEncoder,
|
||||
term_info_encoder: TermInfoDeltaEncoder,
|
||||
@@ -61,7 +57,8 @@ fn fill_last<'a>(fst: &'a Fst, mut node: Node<'a>, buffer: &mut Vec<u8>) {
|
||||
}
|
||||
|
||||
impl<W> TermDictionaryBuilderImpl<W>
|
||||
where W: Write
|
||||
where
|
||||
W: Write,
|
||||
{
|
||||
fn add_index_entry(&mut self) {
|
||||
let stream_offset = self.write.written_bytes() as u32;
|
||||
@@ -74,10 +71,17 @@ impl<W> TermDictionaryBuilderImpl<W>
|
||||
positions_offset: positions_offset,
|
||||
};
|
||||
self.block_index
|
||||
.insert(&self.term_delta_encoder.term(), self.checkpoints.len() as u64)
|
||||
.expect("Serializing fst on a Vec<u8> should never fail. Where your terms not in order maybe?");
|
||||
checkpoint.serialize(&mut self.checkpoints)
|
||||
.expect("Serializing checkpoint on a Vec<u8> should never fail.");
|
||||
.insert(
|
||||
&self.term_delta_encoder.term(),
|
||||
self.checkpoints.len() as u64,
|
||||
)
|
||||
.expect(
|
||||
"Serializing fst on a Vec<u8> should never fail. \
|
||||
Where your terms not in order maybe?",
|
||||
);
|
||||
checkpoint.serialize(&mut self.checkpoints).expect(
|
||||
"Serializing checkpoint on a Vec<u8> should never fail.",
|
||||
);
|
||||
}
|
||||
|
||||
/// # Warning
|
||||
@@ -98,7 +102,13 @@ impl<W> TermDictionaryBuilderImpl<W>
|
||||
pub(crate) fn insert_value(&mut self, term_info: &TermInfo) -> io::Result<()> {
|
||||
let delta_term_info = self.term_info_encoder.encode(term_info.clone());
|
||||
let (prefix_len, suffix) = self.term_delta_encoder.prefix_suffix();
|
||||
write_term_kv(prefix_len, suffix, &delta_term_info, self.term_info_encoder.has_positions, &mut self.write)?;
|
||||
write_term_kv(
|
||||
prefix_len,
|
||||
suffix,
|
||||
&delta_term_info,
|
||||
self.term_info_encoder.has_positions,
|
||||
&mut self.write,
|
||||
)?;
|
||||
self.len += 1;
|
||||
Ok(())
|
||||
}
|
||||
@@ -108,19 +118,20 @@ fn num_bytes_required(mut n: u32) -> u8 {
|
||||
for i in 1u8..5u8 {
|
||||
if n < 256u32 {
|
||||
return i;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
n /= 256;
|
||||
}
|
||||
}
|
||||
0u8
|
||||
}
|
||||
|
||||
fn write_term_kv<W: Write>(prefix_len: usize,
|
||||
suffix: &[u8],
|
||||
delta_term_info: &DeltaTermInfo,
|
||||
has_positions: bool,
|
||||
write: &mut W) -> io::Result<()> {
|
||||
fn write_term_kv<W: Write>(
|
||||
prefix_len: usize,
|
||||
suffix: &[u8],
|
||||
delta_term_info: &DeltaTermInfo,
|
||||
has_positions: bool,
|
||||
write: &mut W,
|
||||
) -> io::Result<()> {
|
||||
let suffix_len = suffix.len();
|
||||
let mut code = 0u8;
|
||||
let num_bytes_docfreq = num_bytes_required(delta_term_info.doc_freq);
|
||||
@@ -131,9 +142,13 @@ fn write_term_kv<W: Write>(prefix_len: usize,
|
||||
code |= (num_bytes_positions_offset - 1) << 5u8;
|
||||
if (prefix_len < 16) && (suffix_len < 16) {
|
||||
code |= 1u8;
|
||||
write.write_all(&[code, (prefix_len as u8) | ((suffix_len as u8) << 4u8)])?;
|
||||
}
|
||||
else {
|
||||
write.write_all(
|
||||
&[
|
||||
code,
|
||||
(prefix_len as u8) | ((suffix_len as u8) << 4u8),
|
||||
],
|
||||
)?;
|
||||
} else {
|
||||
write.write_all(&[code])?;
|
||||
(prefix_len as u32).serialize(write)?;
|
||||
(suffix_len as u32).serialize(write)?;
|
||||
@@ -145,11 +160,15 @@ fn write_term_kv<W: Write>(prefix_len: usize,
|
||||
}
|
||||
{
|
||||
let bytes: [u8; 4] = unsafe { transmute(delta_term_info.delta_postings_offset) };
|
||||
write.write_all(&bytes[0..num_bytes_postings_offset as usize])?;
|
||||
write.write_all(
|
||||
&bytes[0..num_bytes_postings_offset as usize],
|
||||
)?;
|
||||
}
|
||||
if has_positions {
|
||||
let bytes: [u8; 4] = unsafe { transmute(delta_term_info.delta_positions_offset) };
|
||||
write.write_all(&bytes[0..num_bytes_positions_offset as usize])?;
|
||||
write.write_all(
|
||||
&bytes[0..num_bytes_positions_offset as usize],
|
||||
)?;
|
||||
write.write_all(&[delta_term_info.positions_inner_offset])?;
|
||||
}
|
||||
Ok(())
|
||||
@@ -157,7 +176,8 @@ fn write_term_kv<W: Write>(prefix_len: usize,
|
||||
}
|
||||
|
||||
impl<W> TermDictionaryBuilder<W> for TermDictionaryBuilderImpl<W>
|
||||
where W: Write
|
||||
where
|
||||
W: Write,
|
||||
{
|
||||
/// Creates a new `TermDictionaryBuilder`
|
||||
fn new(mut write: W, field_type: FieldType) -> io::Result<Self> {
|
||||
@@ -169,7 +189,7 @@ impl<W> TermDictionaryBuilder<W> for TermDictionaryBuilderImpl<W>
|
||||
term_delta_encoder: TermDeltaEncoder::default(),
|
||||
term_info_encoder: TermInfoDeltaEncoder::new(has_positions),
|
||||
block_index: fst::MapBuilder::new(vec![]).expect("This cannot fail"),
|
||||
checkpoints: vec!(),
|
||||
checkpoints: vec![],
|
||||
len: 0,
|
||||
})
|
||||
}
|
||||
@@ -206,28 +226,22 @@ impl<W> TermDictionaryBuilder<W> for TermDictionaryBuilderImpl<W>
|
||||
fn open_fst_index(source: ReadOnlySource) -> io::Result<fst::Map> {
|
||||
use self::ReadOnlySource::*;
|
||||
let fst_result = match source {
|
||||
Anonymous(data) => {
|
||||
Fst::from_shared_bytes(data.data, data.start, data.len)
|
||||
}
|
||||
Mmap(mmap_readonly) => {
|
||||
Fst::from_mmap(mmap_readonly)
|
||||
}
|
||||
Anonymous(data) => Fst::from_shared_bytes(data.data, data.start, data.len),
|
||||
Mmap(mmap_readonly) => Fst::from_mmap(mmap_readonly),
|
||||
};
|
||||
let fst = fst_result.map_err(convert_fst_error)?;
|
||||
Ok(fst::Map::from(fst))
|
||||
}
|
||||
|
||||
/// See [`TermDictionary`](./trait.TermDictionary.html)
|
||||
pub struct TermDictionaryImpl
|
||||
{
|
||||
pub struct TermDictionaryImpl {
|
||||
stream_data: ReadOnlySource,
|
||||
fst_index: fst::Map,
|
||||
checkpoints_data: ReadOnlySource,
|
||||
has_positions: bool,
|
||||
}
|
||||
|
||||
impl TermDictionaryImpl
|
||||
{
|
||||
impl TermDictionaryImpl {
|
||||
pub(crate) fn stream_data(&self) -> &[u8] {
|
||||
self.stream_data.as_slice()
|
||||
}
|
||||
@@ -235,8 +249,8 @@ impl TermDictionaryImpl
|
||||
pub(crate) fn strictly_previous_key(&self, key: &[u8]) -> (Vec<u8>, CheckPoint) {
|
||||
let (term, checkpoint_offset) = self.strictly_previous_key_checkpoint_offset(key);
|
||||
let mut checkpoint_data = &self.checkpoints_data.as_slice()[checkpoint_offset..];
|
||||
let checkpoint = CheckPoint::deserialize(&mut checkpoint_data)
|
||||
.expect("Checkpoint data is corrupted");
|
||||
let checkpoint =
|
||||
CheckPoint::deserialize(&mut checkpoint_data).expect("Checkpoint data is corrupted");
|
||||
(term, checkpoint)
|
||||
}
|
||||
|
||||
@@ -288,47 +302,47 @@ impl TermDictionaryImpl
|
||||
|
||||
|
||||
|
||||
impl<'a> TermDictionary<'a> for TermDictionaryImpl
|
||||
{
|
||||
impl<'a> TermDictionary<'a> for TermDictionaryImpl {
|
||||
type Streamer = TermStreamerImpl<'a>;
|
||||
|
||||
type StreamBuilder = TermStreamerBuilderImpl<'a>;
|
||||
|
||||
/// Opens a `TermDictionary` given a data source.
|
||||
fn from_source(mut source: ReadOnlySource) -> io::Result<Self> {
|
||||
let has_positions = source.slice(0, 1).as_ref()[0] == 255u8;
|
||||
fn from_source(mut source: ReadOnlySource) -> Self {
|
||||
let has_positions = source.slice(0, 1)[0] == 255u8;
|
||||
source = source.slice_from(1);
|
||||
|
||||
let total_len = source.len();
|
||||
let (body, footer) = source.split(total_len - 16);
|
||||
|
||||
let mut footer_buffer: &[u8] = footer.as_slice();
|
||||
let fst_addr: usize = u64::deserialize(&mut footer_buffer)? as usize;
|
||||
let checkpoints_addr: usize = u64::deserialize(&mut footer_buffer)? as usize;
|
||||
let fst_addr = u64::deserialize(&mut footer_buffer).expect(
|
||||
"deserializing 8 byte should never fail",
|
||||
) as usize;
|
||||
let checkpoints_addr = u64::deserialize(&mut footer_buffer).expect(
|
||||
"deserializing 8 byte should never fail",
|
||||
) as usize;
|
||||
|
||||
let stream_data = body.slice(0, fst_addr - PADDING_SIZE);
|
||||
let fst_data = body.slice(fst_addr, checkpoints_addr);
|
||||
let checkpoints_data = body.slice_from(checkpoints_addr);
|
||||
|
||||
let fst_index = open_fst_index(fst_data)?;
|
||||
let fst_index = open_fst_index(fst_data).expect("Index FST data corrupted");
|
||||
|
||||
Ok(TermDictionaryImpl {
|
||||
TermDictionaryImpl {
|
||||
has_positions: has_positions,
|
||||
stream_data: stream_data,
|
||||
checkpoints_data: checkpoints_data,
|
||||
fst_index: fst_index,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Lookups the value corresponding to the key.
|
||||
fn get<K: AsRef<[u8]>>(&self, target_key: K) -> Option<TermInfo> {
|
||||
let mut streamer = self.range()
|
||||
.ge(&target_key)
|
||||
.into_stream();
|
||||
let mut streamer = self.range().ge(&target_key).into_stream();
|
||||
if streamer.advance() && streamer.key() == target_key.as_ref() {
|
||||
Some(streamer.value().clone())
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
@@ -353,4 +367,4 @@ mod tests {
|
||||
assert_eq!(num_bytes_required(256), 2);
|
||||
assert_eq!(num_bytes_required(u32::max_value()), 4);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user