Mirror of https://github.com/quickwit-oss/tantivy.git
Merge branch 'imhotep-new-codec'

Conflicts:
    src/common/bitpacker.rs
    src/compression/pack/compression_pack_nosimd.rs
    src/indexer/log_merge_policy.rs
@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.4.3"
version = "0.5.0-dev"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
build = "build.rs"
license = "MIT"

@@ -91,9 +91,11 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {

let mut old_man_doc = Document::default();
old_man_doc.add_text(title, "The Old Man and the Sea");
old_man_doc.add_text(body,
"He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish.");
old_man_doc.add_text(
body,
"He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish.",
);

// ... and add it to the `IndexWriter`.
index_writer.add_document(old_man_doc);

@@ -38,10 +38,11 @@ impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
|
||||
}
|
||||
|
||||
impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Right> {
|
||||
fn set_segment(&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader)
|
||||
-> Result<()> {
|
||||
fn set_segment(
|
||||
&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader,
|
||||
) -> Result<()> {
|
||||
try!(self.left.set_segment(segment_local_id, segment));
|
||||
try!(self.right.set_segment(segment_local_id, segment));
|
||||
Ok(())
|
||||
|
||||
@@ -45,11 +45,11 @@ mod tests {
|
||||
#[bench]
|
||||
fn build_collector(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
let mut count_collector = CountCollector::default();
|
||||
for doc in 0..1_000_000 {
|
||||
count_collector.collect(doc, 1f32);
|
||||
}
|
||||
count_collector.count()
|
||||
});
|
||||
let mut count_collector = CountCollector::default();
|
||||
for doc in 0..1_000_000 {
|
||||
count_collector.collect(doc, 1f32);
|
||||
}
|
||||
count_collector.count()
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,8 +15,9 @@ use SegmentLocalId;
|
||||
|
||||
/// Facet collector for i64/u64 fast field
|
||||
pub struct FacetCollector<T>
|
||||
where T: FastFieldReader,
|
||||
T::ValueType: Eq + Hash
|
||||
where
|
||||
T: FastFieldReader,
|
||||
T::ValueType: Eq + Hash,
|
||||
{
|
||||
counters: HashMap<T::ValueType, u64>,
|
||||
field: Field,
|
||||
@@ -25,8 +26,9 @@ pub struct FacetCollector<T>
|
||||
|
||||
|
||||
impl<T> FacetCollector<T>
|
||||
where T: FastFieldReader,
|
||||
T::ValueType: Eq + Hash
|
||||
where
|
||||
T: FastFieldReader,
|
||||
T::ValueType: Eq + Hash,
|
||||
{
|
||||
/// Creates a new facet collector for aggregating a given field.
|
||||
pub fn new(field: Field) -> FacetCollector<T> {
|
||||
@@ -40,8 +42,9 @@ impl<T> FacetCollector<T>
|
||||
|
||||
|
||||
impl<T> Collector for FacetCollector<T>
|
||||
where T: FastFieldReader,
|
||||
T::ValueType: Eq + Hash
|
||||
where
|
||||
T: FastFieldReader,
|
||||
T::ValueType: Eq + Hash,
|
||||
{
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||
self.ff_reader = Some(reader.get_fast_field_reader(self.field)?);
|
||||
@@ -51,7 +54,9 @@ impl<T> Collector for FacetCollector<T>
|
||||
fn collect(&mut self, doc: DocId, _: Score) {
|
||||
let val = self.ff_reader
|
||||
.as_ref()
|
||||
.expect("collect() was called before set_segment. This should never happen.")
|
||||
.expect(
|
||||
"collect() was called before set_segment. This should never happen.",
|
||||
)
|
||||
.get(doc);
|
||||
*(self.counters.entry(val).or_insert(0)) += 1;
|
||||
}
|
||||
|
||||
@@ -54,20 +54,22 @@ pub use self::chained_collector::chain;
pub trait Collector {
/// `set_segment` is called before beginning to enumerate
/// on this segment.
fn set_segment(&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader)
-> Result<()>;
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()>;
/// The query pushes the scored document to the collector via this method.
fn collect(&mut self, doc: DocId, score: Score);
}
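For readers skimming this hunk, here is a minimal sketch of a user-defined collector written against the trait as reformatted above. It assumes `set_segment` and `collect` are the only required methods (only those appear in this hunk) and that `DocId`, `Score`, `SegmentLocalId`, `SegmentReader` and `Result` are the crate-level types used throughout the diff; it is illustrative, not part of the commit.

use tantivy::collector::Collector;
use tantivy::{DocId, Result, Score, SegmentLocalId, SegmentReader};

/// Counts every document passed to it, much like the `CountCollector`
/// exercised in the benchmarks above.
#[derive(Default)]
struct DocCounter {
    count: usize,
}

impl Collector for DocCounter {
    fn set_segment(
        &mut self,
        _segment_local_id: SegmentLocalId,
        _segment: &SegmentReader,
    ) -> Result<()> {
        // A plain counter needs no per-segment state.
        Ok(())
    }

    fn collect(&mut self, _doc: DocId, _score: Score) {
        self.count += 1;
    }
}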
|
||||
|
||||
|
||||
impl<'a, C: Collector> Collector for &'a mut C {
|
||||
fn set_segment(&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader)
|
||||
-> Result<()> {
|
||||
fn set_segment(
|
||||
&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader,
|
||||
) -> Result<()> {
|
||||
(*self).set_segment(segment_local_id, segment)
|
||||
}
|
||||
/// The query pushes the scored document to the collector via this method.
|
||||
@@ -172,12 +174,12 @@ pub mod tests {
|
||||
#[bench]
|
||||
fn build_collector(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
let mut count_collector = CountCollector::default();
|
||||
let docs: Vec<u32> = (0..1_000_000).collect();
|
||||
for doc in docs {
|
||||
count_collector.collect(doc, 1f32);
|
||||
}
|
||||
count_collector.count()
|
||||
});
|
||||
let mut count_collector = CountCollector::default();
|
||||
let docs: Vec<u32> = (0..1_000_000).collect();
|
||||
for doc in docs {
|
||||
count_collector.collect(doc, 1f32);
|
||||
}
|
||||
count_collector.count()
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,10 +23,11 @@ impl<'a> MultiCollector<'a> {
|
||||
|
||||
|
||||
impl<'a> Collector for MultiCollector<'a> {
|
||||
fn set_segment(&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader)
|
||||
-> Result<()> {
|
||||
fn set_segment(
|
||||
&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader,
|
||||
) -> Result<()> {
|
||||
for collector in &mut self.collectors {
|
||||
try!(collector.set_segment(segment_local_id, segment));
|
||||
}
|
||||
@@ -53,8 +54,8 @@ mod tests {
|
||||
let mut top_collector = TopCollector::with_limit(2);
|
||||
let mut count_collector = CountCollector::default();
|
||||
{
|
||||
let mut collectors = MultiCollector::from(vec![&mut top_collector,
|
||||
&mut count_collector]);
|
||||
let mut collectors =
|
||||
MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
|
||||
collectors.collect(1, 0.2);
|
||||
collectors.collect(2, 0.1);
|
||||
collectors.collect(3, 0.5);
|
||||
|
||||
@@ -24,10 +24,9 @@ impl PartialOrd for GlobalScoredDoc {
|
||||
impl Ord for GlobalScoredDoc {
|
||||
#[inline]
|
||||
fn cmp(&self, other: &GlobalScoredDoc) -> Ordering {
|
||||
other
|
||||
.score
|
||||
.partial_cmp(&self.score)
|
||||
.unwrap_or_else(|| other.doc_address.cmp(&self.doc_address))
|
||||
other.score.partial_cmp(&self.score).unwrap_or_else(|| {
|
||||
other.doc_address.cmp(&self.doc_address)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -87,7 +86,9 @@ impl TopCollector {
|
||||
scored_docs.sort();
|
||||
scored_docs
|
||||
.into_iter()
|
||||
.map(|GlobalScoredDoc { score, doc_address }| (score, doc_address))
|
||||
.map(|GlobalScoredDoc { score, doc_address }| {
|
||||
(score, doc_address)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
@@ -108,14 +109,13 @@ impl Collector for TopCollector {
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
if self.at_capacity() {
|
||||
// It's ok to unwrap as long as a limit of 0 is forbidden.
|
||||
let limit_doc: GlobalScoredDoc =
|
||||
*self.heap
|
||||
.peek()
|
||||
.expect("Top collector with size 0 is forbidden");
|
||||
let limit_doc: GlobalScoredDoc = *self.heap.peek().expect(
|
||||
"Top collector with size 0 is forbidden",
|
||||
);
|
||||
if limit_doc.score < score {
|
||||
let mut mut_head = self.heap
|
||||
.peek_mut()
|
||||
.expect("Top collector with size 0 is forbidden");
|
||||
let mut mut_head = self.heap.peek_mut().expect(
|
||||
"Top collector with size 0 is forbidden",
|
||||
);
|
||||
mut_head.score = score;
|
||||
mut_head.doc_address = DocAddress(self.segment_id, doc);
|
||||
}
|
||||
|
||||
@@ -15,7 +15,7 @@ use std::ops::Deref;
/// reasons, we want to ensure that a value spawns over at most 8 bytes
/// of aligns bytes.
///
/// Spawning over 9 bytes is possible for instance, if we do
/// Spanning over 9 bytes is possible for instance, if we do
/// bitpacking with an amplitude of 63 bits.
/// In this case, the second int will start on bit
/// 63 (which belongs to byte 7) and ends at byte 15;
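A quick worked example of the arithmetic behind this comment (an editor's sketch, not part of the commit): `BitUnpacker::get`, shown further down in this file, locates the value at `idx` with the computation below.

// Byte address and in-byte shift of the bit-packed value at `idx`,
// mirroring the arithmetic in `BitUnpacker::get`.
fn bit_address(idx: usize, num_bits: usize) -> (usize, usize) {
    let addr_in_bits = idx * num_bits;
    (addr_in_bits >> 3, addr_in_bits & 7) // (byte address, bit shift)
}

// With num_bits = 63, bit_address(1, 63) == (7, 7): the value starts in byte 7
// and its last bit is bit 125, i.e. byte 15, so it spans 9 bytes and cannot be
// read with a single aligned u64 load -- hence the 7 bytes of padding that the
// debug_assert! below relies on.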
@@ -88,7 +88,8 @@ impl BitPacker {
|
||||
|
||||
|
||||
pub struct BitUnpacker<Data>
|
||||
where Data: Deref<Target = [u8]>
|
||||
where
|
||||
Data: Deref<Target = [u8]>,
|
||||
{
|
||||
num_bits: usize,
|
||||
mask: u64,
|
||||
@@ -96,7 +97,8 @@ pub struct BitUnpacker<Data>
|
||||
}
|
||||
|
||||
impl<Data> BitUnpacker<Data>
|
||||
where Data: Deref<Target = [u8]>
|
||||
where
|
||||
Data: Deref<Target = [u8]>,
|
||||
{
|
||||
pub fn new(data: Data, num_bits: usize) -> BitUnpacker<Data> {
|
||||
let mask: u64 = if num_bits == 64 {
|
||||
@@ -121,33 +123,13 @@ impl<Data> BitUnpacker<Data>
|
||||
let addr_in_bits = idx * num_bits;
|
||||
let addr = addr_in_bits >> 3;
|
||||
let bit_shift = addr_in_bits & 7;
|
||||
if cfg!(feature = "simdcompression") {
|
||||
// for simdcompression,
|
||||
// the bitpacker is only used for fastfields,
|
||||
// and we expect them to be always padded.
|
||||
debug_assert!(
|
||||
addr + 8 <= data.len(),
|
||||
"The fast field field should have been padded with 7 bytes."
|
||||
);
|
||||
let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) };
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
(val_shifted & mask)
|
||||
}
|
||||
else {
|
||||
let val_unshifted_unmasked: u64;
|
||||
if addr + 8 <= data.len() {
|
||||
val_unshifted_unmasked = unsafe { *(data[addr..].as_ptr() as *const u64) };
|
||||
}
|
||||
else {
|
||||
let mut buffer = [0u8; 8];
|
||||
for i in addr..data.len() {
|
||||
buffer[i - addr] += data[i];
|
||||
}
|
||||
val_unshifted_unmasked = unsafe { *(buffer[..].as_ptr() as *const u64) };
|
||||
}
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
(val_shifted & mask)
|
||||
}
|
||||
debug_assert!(
|
||||
addr + 8 <= data.len(),
|
||||
"The fast field field should have been padded with 7 bytes."
|
||||
);
|
||||
let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) };
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
(val_shifted & mask)
|
||||
}
|
||||
|
||||
pub fn get_range(&self, start: u32, output: &mut [u64]) {
|
||||
|
||||
src/common/composite_file.rs (new file, 191 lines)
@@ -0,0 +1,191 @@
|
||||
use std::io::Write;
|
||||
use common::CountingWriter;
|
||||
use std::collections::HashMap;
|
||||
use schema::Field;
|
||||
use common::VInt;
|
||||
use directory::WritePtr;
|
||||
use std::io;
|
||||
use directory::ReadOnlySource;
|
||||
use common::BinarySerializable;
|
||||
|
||||
|
||||
/// A `CompositeWrite` is used to write a `CompositeFile`.
|
||||
pub struct CompositeWrite<W = WritePtr> {
|
||||
write: CountingWriter<W>,
|
||||
offsets: HashMap<Field, usize>,
|
||||
}
|
||||
|
||||
impl<W: Write> CompositeWrite<W> {
|
||||
/// Crate a new API writer that writes a composite file
|
||||
/// in a given write.
|
||||
pub fn wrap(w: W) -> CompositeWrite<W> {
|
||||
CompositeWrite {
|
||||
write: CountingWriter::wrap(w),
|
||||
offsets: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Start writing a new field.
|
||||
pub fn for_field(&mut self, field: Field) -> &mut CountingWriter<W> {
|
||||
let offset = self.write.written_bytes();
|
||||
assert!(!self.offsets.contains_key(&field));
|
||||
self.offsets.insert(field, offset);
|
||||
&mut self.write
|
||||
}
|
||||
|
||||
|
||||
/// Close the composite file.
|
||||
///
|
||||
/// An index of the different field offsets
|
||||
/// will be written as a footer.
|
||||
pub fn close(mut self) -> io::Result<()> {
|
||||
let footer_offset = self.write.written_bytes();
|
||||
VInt(self.offsets.len() as u64).serialize(&mut self.write)?;
|
||||
|
||||
let mut offset_fields: Vec<_> = self.offsets
|
||||
.iter()
|
||||
.map(|(field, offset)| (offset, field))
|
||||
.collect();
|
||||
|
||||
offset_fields.sort();
|
||||
|
||||
let mut prev_offset = 0;
|
||||
for (offset, field) in offset_fields {
|
||||
VInt((offset - prev_offset) as u64).serialize(
|
||||
&mut self.write,
|
||||
)?;
|
||||
field.serialize(&mut self.write)?;
|
||||
prev_offset = *offset;
|
||||
}
|
||||
|
||||
let footer_len = (self.write.written_bytes() - footer_offset) as u32;
|
||||
footer_len.serialize(&mut self.write)?;
|
||||
self.write.flush()?;
|
||||
Ok(())
|
||||
}
|
||||
}
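Editor's note on the format implied by `close()` above and parsed back by `CompositeFile::open` below -- an illustration reconstructed from the serialization calls in this new file, not a normative spec:

// On-disk layout of a composite file:
//
//   [ bytes written through for_field(field_0) ]
//   [ bytes written through for_field(field_1) ]
//   ...
//   [ footer: VInt(num_fields),
//             (VInt(offset_delta), Field) for each field, ordered by offset ]
//   [ footer_len: u32  -- byte length of the footer block above ]
//
// `open()` reads the trailing u32, slices out the footer, replays the offset
// deltas, and derives a (start, end) byte range for every field.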
|
||||
|
||||
|
||||
/// A composite file is an abstraction to store a
|
||||
/// file partitioned by field.
|
||||
///
|
||||
/// The file needs to be written field by field.
|
||||
/// A footer describes the start and stop offsets
|
||||
/// for each field.
|
||||
#[derive(Clone)]
|
||||
pub struct CompositeFile {
|
||||
data: ReadOnlySource,
|
||||
offsets_index: HashMap<Field, (usize, usize)>,
|
||||
}
|
||||
|
||||
impl CompositeFile {
|
||||
/// Opens a composite file stored in a given
|
||||
/// `ReadOnlySource`.
|
||||
pub fn open(data: ReadOnlySource) -> io::Result<CompositeFile> {
|
||||
let end = data.len();
|
||||
let footer_len_data = data.slice_from(end - 4);
|
||||
let footer_len = u32::deserialize(&mut footer_len_data.as_slice())? as usize;
|
||||
|
||||
let footer_start = end - 4 - footer_len;
|
||||
let footer_data = data.slice(footer_start, footer_start + footer_len);
|
||||
let mut footer_buffer = footer_data.as_slice();
|
||||
let num_fields = VInt::deserialize(&mut footer_buffer)?.0 as usize;
|
||||
|
||||
let mut fields = vec![];
|
||||
let mut offsets = vec![];
|
||||
|
||||
let mut field_index = HashMap::new();
|
||||
|
||||
let mut offset = 0;
|
||||
for _ in 0..num_fields {
|
||||
offset += VInt::deserialize(&mut footer_buffer)?.0 as usize;
|
||||
let field = Field::deserialize(&mut footer_buffer)?;
|
||||
offsets.push(offset);
|
||||
fields.push(field);
|
||||
}
|
||||
offsets.push(footer_start);
|
||||
for i in 0..num_fields {
|
||||
let field = fields[i];
|
||||
let start_offset = offsets[i];
|
||||
let end_offset = offsets[i + 1];
|
||||
field_index.insert(field, (start_offset, end_offset));
|
||||
}
|
||||
|
||||
Ok(CompositeFile {
|
||||
data: data.slice_to(footer_start),
|
||||
offsets_index: field_index,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns a composite file that stores
|
||||
/// no fields.
|
||||
pub fn empty() -> CompositeFile {
|
||||
CompositeFile {
|
||||
offsets_index: HashMap::new(),
|
||||
data: ReadOnlySource::empty(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the `ReadOnlySource` associated
|
||||
/// to a given `Field` and stored in a `CompositeFile`.
|
||||
pub fn open_read(&self, field: Field) -> Option<ReadOnlySource> {
|
||||
self.offsets_index.get(&field).map(|&(from, to)| {
|
||||
self.data.slice(from, to)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
use std::io::Write;
|
||||
use super::{CompositeWrite, CompositeFile};
|
||||
use directory::{RAMDirectory, Directory};
|
||||
use schema::Field;
|
||||
use common::VInt;
|
||||
use common::BinarySerializable;
|
||||
use std::path::Path;
|
||||
|
||||
#[test]
|
||||
fn test_composite_file() {
|
||||
let path = Path::new("test_path");
|
||||
let mut directory = RAMDirectory::create();
|
||||
{
|
||||
let w = directory.open_write(path).unwrap();
|
||||
let mut composite_write = CompositeWrite::wrap(w);
|
||||
{
|
||||
let mut write_0 = composite_write.for_field(Field(0u32));
|
||||
VInt(32431123u64).serialize(&mut write_0).unwrap();
|
||||
write_0.flush().unwrap();
|
||||
}
|
||||
|
||||
{
|
||||
let mut write_4 = composite_write.for_field(Field(4u32));
|
||||
VInt(2).serialize(&mut write_4).unwrap();
|
||||
write_4.flush().unwrap();
|
||||
}
|
||||
composite_write.close().unwrap();
|
||||
}
|
||||
{
|
||||
let r = directory.open_read(path).unwrap();
|
||||
let composite_file = CompositeFile::open(r).unwrap();
|
||||
{
|
||||
let file0 = composite_file.open_read(Field(0u32)).unwrap();
|
||||
let mut file0_buf = file0.as_slice();
|
||||
let payload_0 = VInt::deserialize(&mut file0_buf).unwrap().0;
|
||||
assert_eq!(file0_buf.len(), 0);
|
||||
assert_eq!(payload_0, 32431123u64);
|
||||
}
|
||||
{
|
||||
let file4 = composite_file.open_read(Field(4u32)).unwrap();
|
||||
let mut file4_buf = file4.as_slice();
|
||||
let payload_4 = VInt::deserialize(&mut file4_buf).unwrap().0;
|
||||
assert_eq!(file4_buf.len(), 0);
|
||||
assert_eq!(payload_4, 2u64);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@@ -2,7 +2,7 @@ use std::io::Write;
|
||||
use std::io;
|
||||
|
||||
|
||||
pub struct CountingWriter<W: Write> {
|
||||
pub struct CountingWriter<W> {
|
||||
underlying: W,
|
||||
written_bytes: usize,
|
||||
}
|
||||
|
||||
@@ -1,9 +1,13 @@
|
||||
|
||||
mod serialize;
|
||||
mod timer;
|
||||
mod vint;
|
||||
mod counting_writer;
|
||||
mod composite_file;
|
||||
pub mod bitpacker;
|
||||
|
||||
|
||||
pub(crate) use self::composite_file::{CompositeWrite, CompositeFile};
|
||||
pub use self::serialize::BinarySerializable;
|
||||
pub use self::timer::Timing;
|
||||
pub use self::timer::TimerTree;
|
||||
|
||||
@@ -101,9 +101,9 @@ impl BinarySerializable for String {
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<String> {
|
||||
let string_length = VInt::deserialize(reader)?.val() as usize;
|
||||
let mut result = String::with_capacity(string_length);
|
||||
reader
|
||||
.take(string_length as u64)
|
||||
.read_to_string(&mut result)?;
|
||||
reader.take(string_length as u64).read_to_string(
|
||||
&mut result,
|
||||
)?;
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,16 +24,14 @@ impl<'a> OpenTimer<'a> {
|
||||
|
||||
impl<'a> Drop for OpenTimer<'a> {
|
||||
fn drop(&mut self) {
|
||||
self.timer_tree
|
||||
.timings
|
||||
.push(Timing {
|
||||
name: self.name,
|
||||
duration: self.start
|
||||
.to(PreciseTime::now())
|
||||
.num_microseconds()
|
||||
.unwrap(),
|
||||
depth: self.depth,
|
||||
});
|
||||
self.timer_tree.timings.push(Timing {
|
||||
name: self.name,
|
||||
duration: self.start
|
||||
.to(PreciseTime::now())
|
||||
.num_microseconds()
|
||||
.unwrap(),
|
||||
depth: self.depth,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -47,7 +47,12 @@ impl BinarySerializable for VInt {
|
||||
}
|
||||
shift += 7;
|
||||
}
|
||||
_ => return Err(io::Error::new(io::ErrorKind::InvalidData, "Reach end of buffer")),
|
||||
_ => {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"Reach end of buffer",
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(VInt(result))
|
||||
|
||||
@@ -1,170 +0,0 @@
|
||||
use super::{BlockEncoder, BlockDecoder};
|
||||
use super::NUM_DOCS_PER_BLOCK;
|
||||
use compression::{VIntEncoder, VIntDecoder};
|
||||
|
||||
pub struct CompositeEncoder {
|
||||
block_encoder: BlockEncoder,
|
||||
output: Vec<u8>,
|
||||
}
|
||||
|
||||
impl CompositeEncoder {
|
||||
pub fn new() -> CompositeEncoder {
|
||||
CompositeEncoder {
|
||||
block_encoder: BlockEncoder::new(),
|
||||
output: Vec::with_capacity(500_000),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn compress_sorted(&mut self, vals: &[u32]) -> &[u8] {
|
||||
self.output.clear();
|
||||
let num_blocks = vals.len() / NUM_DOCS_PER_BLOCK;
|
||||
let mut offset = 0u32;
|
||||
for i in 0..num_blocks {
|
||||
let vals_slice = &vals[i * NUM_DOCS_PER_BLOCK..(i + 1) * NUM_DOCS_PER_BLOCK];
|
||||
let block_compressed = self.block_encoder.compress_block_sorted(vals_slice, offset);
|
||||
offset = vals_slice[NUM_DOCS_PER_BLOCK - 1];
|
||||
self.output.extend_from_slice(block_compressed);
|
||||
}
|
||||
let vint_compressed =
|
||||
self.block_encoder
|
||||
.compress_vint_sorted(&vals[num_blocks * NUM_DOCS_PER_BLOCK..], offset);
|
||||
self.output.extend_from_slice(vint_compressed);
|
||||
&self.output
|
||||
}
|
||||
|
||||
pub fn compress_unsorted(&mut self, vals: &[u32]) -> &[u8] {
|
||||
self.output.clear();
|
||||
let num_blocks = vals.len() / NUM_DOCS_PER_BLOCK;
|
||||
for i in 0..num_blocks {
|
||||
let vals_slice = &vals[i * NUM_DOCS_PER_BLOCK..(i + 1) * NUM_DOCS_PER_BLOCK];
|
||||
let block_compressed = self.block_encoder.compress_block_unsorted(vals_slice);
|
||||
self.output.extend_from_slice(block_compressed);
|
||||
}
|
||||
let vint_compressed = self.block_encoder
|
||||
.compress_vint_unsorted(&vals[num_blocks * NUM_DOCS_PER_BLOCK..]);
|
||||
self.output.extend_from_slice(vint_compressed);
|
||||
&self.output
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub struct CompositeDecoder {
|
||||
block_decoder: BlockDecoder,
|
||||
vals: Vec<u32>,
|
||||
}
|
||||
|
||||
|
||||
impl CompositeDecoder {
|
||||
pub fn new() -> CompositeDecoder {
|
||||
CompositeDecoder {
|
||||
block_decoder: BlockDecoder::new(),
|
||||
vals: Vec::with_capacity(500_000),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn uncompress_sorted(&mut self,
|
||||
mut compressed_data: &[u8],
|
||||
uncompressed_len: usize)
|
||||
-> &[u32] {
|
||||
if uncompressed_len > self.vals.capacity() {
|
||||
let extra_capacity = uncompressed_len - self.vals.capacity();
|
||||
self.vals.reserve(extra_capacity);
|
||||
}
|
||||
let mut offset = 0u32;
|
||||
self.vals.clear();
|
||||
let num_blocks = uncompressed_len / NUM_DOCS_PER_BLOCK;
|
||||
for _ in 0..num_blocks {
|
||||
compressed_data = self.block_decoder
|
||||
.uncompress_block_sorted(compressed_data, offset);
|
||||
offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1);
|
||||
self.vals
|
||||
.extend_from_slice(self.block_decoder.output_array());
|
||||
}
|
||||
self.block_decoder
|
||||
.uncompress_vint_sorted(compressed_data,
|
||||
offset,
|
||||
uncompressed_len % NUM_DOCS_PER_BLOCK);
|
||||
self.vals
|
||||
.extend_from_slice(self.block_decoder.output_array());
|
||||
&self.vals
|
||||
}
|
||||
|
||||
pub fn uncompress_unsorted(&mut self,
|
||||
mut compressed_data: &[u8],
|
||||
uncompressed_len: usize)
|
||||
-> &[u32] {
|
||||
self.vals.clear();
|
||||
let num_blocks = uncompressed_len / NUM_DOCS_PER_BLOCK;
|
||||
for _ in 0..num_blocks {
|
||||
compressed_data = self.block_decoder
|
||||
.uncompress_block_unsorted(compressed_data);
|
||||
self.vals
|
||||
.extend_from_slice(self.block_decoder.output_array());
|
||||
}
|
||||
self.block_decoder
|
||||
.uncompress_vint_unsorted(compressed_data, uncompressed_len % NUM_DOCS_PER_BLOCK);
|
||||
self.vals
|
||||
.extend_from_slice(self.block_decoder.output_array());
|
||||
&self.vals
|
||||
}
|
||||
}
|
||||
|
||||
impl Into<Vec<u32>> for CompositeDecoder {
|
||||
fn into(self) -> Vec<u32> {
|
||||
self.vals
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
use test::Bencher;
|
||||
use super::*;
|
||||
use tests;
|
||||
|
||||
#[test]
|
||||
fn test_composite_unsorted() {
|
||||
let data = tests::generate_array(10_000, 0.1);
|
||||
let mut encoder = CompositeEncoder::new();
|
||||
let compressed = encoder.compress_unsorted(&data);
|
||||
assert!(compressed.len() <= 19_794);
|
||||
let mut decoder = CompositeDecoder::new();
|
||||
let result = decoder.uncompress_unsorted(&compressed, data.len());
|
||||
for i in 0..data.len() {
|
||||
assert_eq!(data[i], result[i]);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_composite_sorted() {
|
||||
let data = tests::generate_array(10_000, 0.1);
|
||||
let mut encoder = CompositeEncoder::new();
|
||||
let compressed = encoder.compress_sorted(&data);
|
||||
assert!(compressed.len() <= 7_826);
|
||||
let mut decoder = CompositeDecoder::new();
|
||||
let result = decoder.uncompress_sorted(&compressed, data.len());
|
||||
for i in 0..data.len() {
|
||||
assert_eq!(data[i], result[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
const BENCH_NUM_INTS: usize = 99_968;
|
||||
|
||||
#[bench]
|
||||
fn bench_compress(b: &mut Bencher) {
|
||||
let mut encoder = CompositeEncoder::new();
|
||||
let data = tests::generate_array(BENCH_NUM_INTS, 0.1);
|
||||
b.iter(|| { encoder.compress_sorted(&data); });
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_uncompress(b: &mut Bencher) {
|
||||
let mut encoder = CompositeEncoder::new();
|
||||
let data = tests::generate_array(BENCH_NUM_INTS, 0.1);
|
||||
let compressed = encoder.compress_sorted(&data);
|
||||
let mut decoder = CompositeDecoder::new();
|
||||
b.iter(|| { decoder.uncompress_sorted(compressed, BENCH_NUM_INTS); });
|
||||
}
|
||||
}
|
||||
@@ -1,52 +1,88 @@
|
||||
#![allow(dead_code)]
|
||||
|
||||
|
||||
mod composite;
|
||||
pub use self::composite::{CompositeEncoder, CompositeDecoder};
|
||||
mod stream;
|
||||
|
||||
pub use self::stream::CompressedIntStream;
|
||||
|
||||
#[cfg(not(feature="simdcompression"))]
|
||||
#[cfg(not(feature = "simdcompression"))]
|
||||
mod pack {
|
||||
mod compression_pack_nosimd;
|
||||
pub use self::compression_pack_nosimd::*;
|
||||
pub use self::compression_pack_nosimd::{BlockEncoder, BlockDecoder};
|
||||
}
|
||||
|
||||
#[cfg(feature="simdcompression")]
|
||||
#[cfg(feature = "simdcompression")]
|
||||
mod pack {
|
||||
mod compression_pack_simd;
|
||||
pub use self::compression_pack_simd::*;
|
||||
pub use self::compression_pack_simd::{BlockEncoder, BlockDecoder};
|
||||
}
|
||||
|
||||
pub use self::pack::{BlockEncoder, BlockDecoder};
|
||||
|
||||
#[cfg( any(not(feature="simdcompression"), target_env="msvc") )]
|
||||
#[cfg(any(not(feature = "simdcompression"), target_env = "msvc"))]
|
||||
mod vint {
|
||||
mod compression_vint_nosimd;
|
||||
pub use self::compression_vint_nosimd::*;
|
||||
pub(crate) use self::compression_vint_nosimd::*;
|
||||
}
|
||||
|
||||
#[cfg( all(feature="simdcompression", not(target_env="msvc")) )]
|
||||
#[cfg(all(feature = "simdcompression", not(target_env = "msvc")))]
|
||||
mod vint {
|
||||
mod compression_vint_simd;
|
||||
pub use self::compression_vint_simd::*;
|
||||
pub(crate) use self::compression_vint_simd::*;
|
||||
}
|
||||
|
||||
/// Returns the size in bytes of a compressed block, given num_bits.
pub fn compressed_block_size(num_bits: u8) -> usize {
    1 + (num_bits as usize) * 16
}
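The formula follows directly from the fixed block size (an editor's note, not part of the commit): a block packs `COMPRESSION_BLOCK_SIZE = 128` integers at `num_bits` bits each, i.e. `128 * num_bits / 8 = num_bits * 16` payload bytes, preceded by one byte that stores `num_bits` itself. For example:

// 5-bit block: 1 header byte + 5 * 16 = 80 payload bytes.
assert_eq!(compressed_block_size(5), 81);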
|
||||
|
||||
pub trait VIntEncoder {
|
||||
/// Compresses an array of `u32` integers,
|
||||
/// using [delta-encoding](https://en.wikipedia.org/wiki/Delta_encoding)
|
||||
/// and variable bytes encoding.
|
||||
///
|
||||
/// The method takes an array of ints to compress, and returns
|
||||
/// a `&[u8]` representing the compressed data.
|
||||
///
|
||||
/// The method also takes an offset to give the value of the
|
||||
/// hypothetical previous element in the delta-encoding.
|
||||
fn compress_vint_sorted(&mut self, input: &[u32], offset: u32) -> &[u8];
|
||||
|
||||
/// Compresses an array of `u32` integers,
|
||||
/// using variable bytes encoding.
|
||||
///
|
||||
/// The method takes an array of ints to compress, and returns
|
||||
/// a `&[u8]` representing the compressed data.
|
||||
fn compress_vint_unsorted(&mut self, input: &[u32]) -> &[u8];
|
||||
}
|
||||
|
||||
pub trait VIntDecoder {
|
||||
fn uncompress_vint_sorted<'a>(&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
offset: u32,
|
||||
num_els: usize)
|
||||
-> &'a [u8];
|
||||
fn uncompress_vint_unsorted<'a>(&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
num_els: usize)
|
||||
-> &'a [u8];
|
||||
/// Uncompress an array of `u32` integers,
|
||||
/// that were compressed using [delta-encoding](https://en.wikipedia.org/wiki/Delta_encoding)
|
||||
/// and variable bytes encoding.
|
||||
///
|
||||
/// The method takes a number of int to decompress, and returns
|
||||
/// the amount of bytes that were read to decompress them.
|
||||
///
|
||||
/// The method also takes an offset to give the value of the
|
||||
/// hypothetical previous element in the delta-encoding.
|
||||
///
|
||||
/// For instance, if delta encoded are `1, 3, 9`, and the
|
||||
/// `offset` is 5, then the output will be:
|
||||
/// `5 + 1 = 6, 6 + 3= 9, 9 + 9 = 18`
|
||||
fn uncompress_vint_sorted<'a>(
|
||||
&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
offset: u32,
|
||||
num_els: usize,
|
||||
) -> usize;
|
||||
|
||||
/// Uncompress an array of `u32s`, compressed using variable
|
||||
/// byte encoding.
|
||||
///
|
||||
/// The method takes a number of int to decompress, and returns
|
||||
/// the amount of bytes that were read to decompress them.
|
||||
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize;
|
||||
}
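The delta + variable-byte scheme described in these doc comments is easiest to see on a tiny example. The sketch below is an editor's illustration, not the crate's implementation (which lives in `compression_vint_nosimd.rs` / `compression_vint_simd.rs`); the stop-bit convention is an assumption, but encode and decode agree with each other and with the `offset = 5`, deltas `1, 3, 9` -> `6, 9, 18` example from the doc comment above.

// Illustrative delta + variable-byte codec for sorted u32 values.
fn vint_encode_sorted(vals: &[u32], offset: u32, out: &mut Vec<u8>) {
    let mut prev = offset;
    for &v in vals {
        let mut delta = v - prev; // delta against the previous value (or the offset)
        prev = v;
        loop {
            let byte = (delta & 127) as u8;
            delta >>= 7;
            if delta == 0 {
                out.push(byte | 128); // assumed convention: high bit marks the last byte
                break;
            }
            out.push(byte);
        }
    }
}

// Returns the decoded values and the number of bytes consumed
// (the role played by the new `usize` return value in the trait above).
fn vint_decode_sorted(data: &[u8], offset: u32, num_els: usize) -> (Vec<u32>, usize) {
    let mut vals = Vec::with_capacity(num_els);
    let mut prev = offset;
    let mut read = 0usize;
    for _ in 0..num_els {
        let mut delta = 0u32;
        let mut shift = 0;
        loop {
            let byte = data[read];
            read += 1;
            delta |= ((byte & 127) as u32) << shift;
            shift += 7;
            if byte & 128 != 0 {
                break;
            }
        }
        prev += delta;
        vals.push(prev);
    }
    (vals, read)
}

// Round trip: encoding [6, 9, 18] with offset 5 stores the deltas 1, 3, 9;
// decoding them back with offset 5 yields 6, 9, 18, matching the doc comment.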
|
||||
|
||||
impl VIntEncoder for BlockEncoder {
|
||||
@@ -60,26 +96,24 @@ impl VIntEncoder for BlockEncoder {
|
||||
}
|
||||
|
||||
impl VIntDecoder for BlockDecoder {
|
||||
fn uncompress_vint_sorted<'a>(&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
offset: u32,
|
||||
num_els: usize)
|
||||
-> &'a [u8] {
|
||||
fn uncompress_vint_sorted<'a>(
|
||||
&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
offset: u32,
|
||||
num_els: usize,
|
||||
) -> usize {
|
||||
self.output_len = num_els;
|
||||
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
|
||||
}
|
||||
|
||||
fn uncompress_vint_unsorted<'a>(&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
num_els: usize)
|
||||
-> &'a [u8] {
|
||||
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize {
|
||||
self.output_len = num_els;
|
||||
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub const NUM_DOCS_PER_BLOCK: usize = 128; //< should be a power of 2 to let the compiler optimize.
|
||||
pub const COMPRESSION_BLOCK_SIZE: usize = 128;
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
@@ -95,8 +129,8 @@ pub mod tests {
|
||||
let compressed_data = encoder.compress_block_sorted(&vals, 0);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
{
|
||||
let remaining_data = decoder.uncompress_block_sorted(compressed_data, 0);
|
||||
assert_eq!(remaining_data.len(), 0);
|
||||
let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 0);
|
||||
assert_eq!(consumed_num_bytes, compressed_data.len());
|
||||
}
|
||||
for i in 0..128 {
|
||||
assert_eq!(vals[i], decoder.output(i));
|
||||
@@ -110,8 +144,8 @@ pub mod tests {
|
||||
let compressed_data = encoder.compress_block_sorted(&vals, 10);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
{
|
||||
let remaining_data = decoder.uncompress_block_sorted(compressed_data, 10);
|
||||
assert_eq!(remaining_data.len(), 0);
|
||||
let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 10);
|
||||
assert_eq!(consumed_num_bytes, compressed_data.len());
|
||||
}
|
||||
for i in 0..128 {
|
||||
assert_eq!(vals[i], decoder.output(i));
|
||||
@@ -129,9 +163,9 @@ pub mod tests {
|
||||
compressed.push(173u8);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
{
|
||||
let remaining_data = decoder.uncompress_block_sorted(&compressed, 10);
|
||||
assert_eq!(remaining_data.len(), 1);
|
||||
assert_eq!(remaining_data[0], 173u8);
|
||||
let consumed_num_bytes = decoder.uncompress_block_sorted(&compressed, 10);
|
||||
assert_eq!(consumed_num_bytes, compressed.len() - 1);
|
||||
assert_eq!(compressed[consumed_num_bytes], 173u8);
|
||||
}
|
||||
for i in 0..n {
|
||||
assert_eq!(vals[i], decoder.output(i));
|
||||
@@ -149,9 +183,9 @@ pub mod tests {
|
||||
compressed.push(173u8);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
{
|
||||
let remaining_data = decoder.uncompress_block_unsorted(&compressed);
|
||||
assert_eq!(remaining_data.len(), 1);
|
||||
assert_eq!(remaining_data[0], 173u8);
|
||||
let consumed_num_bytes = decoder.uncompress_block_unsorted(&compressed);
|
||||
assert_eq!(consumed_num_bytes + 1, compressed.len());
|
||||
assert_eq!(compressed[consumed_num_bytes], 173u8);
|
||||
}
|
||||
for i in 0..n {
|
||||
assert_eq!(vals[i], decoder.output(i));
|
||||
@@ -169,9 +203,9 @@ pub mod tests {
|
||||
let encoded_data = encoder.compress_vint_sorted(&input, *offset);
|
||||
assert!(encoded_data.len() <= expected_length);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
let remaining_data =
|
||||
let consumed_num_bytes =
|
||||
decoder.uncompress_vint_sorted(&encoded_data, *offset, input.len());
|
||||
assert_eq!(0, remaining_data.len());
|
||||
assert_eq!(consumed_num_bytes, encoded_data.len());
|
||||
assert_eq!(input, decoder.output_array());
|
||||
}
|
||||
}
|
||||
@@ -181,19 +215,32 @@ pub mod tests {
|
||||
#[bench]
|
||||
fn bench_compress(b: &mut Bencher) {
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let data = tests::generate_array(NUM_DOCS_PER_BLOCK, 0.1);
|
||||
let data = tests::generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
|
||||
b.iter(|| { encoder.compress_block_sorted(&data, 0u32); });
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_uncompress(b: &mut Bencher) {
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let data = tests::generate_array(NUM_DOCS_PER_BLOCK, 0.1);
|
||||
let data = tests::generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
|
||||
let compressed = encoder.compress_block_sorted(&data, 0u32);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
b.iter(|| { decoder.uncompress_block_sorted(compressed, 0u32); });
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_all_docs_compression_numbits() {
|
||||
for num_bits in 0..33 {
|
||||
let mut data = [0u32; 128];
|
||||
if num_bits > 0 {
|
||||
data[0] = 1 << (num_bits - 1);
|
||||
}
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let compressed = encoder.compress_block_unsorted(&data);
|
||||
assert_eq!(compressed[0] as usize, num_bits);
|
||||
assert_eq!(compressed.len(), compressed_block_size(compressed[0]));
|
||||
}
|
||||
}
|
||||
|
||||
const NUM_INTS_BENCH_VINT: usize = 10;
|
||||
|
||||
@@ -210,7 +257,9 @@ pub mod tests {
|
||||
let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001);
|
||||
let compressed = encoder.compress_vint_sorted(&data, 0u32);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
b.iter(|| { decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT); });
|
||||
b.iter(|| {
|
||||
decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT);
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -3,15 +3,15 @@ use common::bitpacker::{BitPacker, BitUnpacker};
|
||||
use common::CountingWriter;
|
||||
use std::cmp;
|
||||
use std::io::Write;
|
||||
use super::super::NUM_DOCS_PER_BLOCK;
|
||||
use super::super::COMPRESSION_BLOCK_SIZE;
|
||||
|
||||
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;
|
||||
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
|
||||
|
||||
pub fn compress_sorted(vals: &mut [u32], output: &mut [u8], offset: u32) -> usize {
|
||||
let mut max_delta = 0;
|
||||
{
|
||||
let mut local_offset = offset;
|
||||
for i in 0..NUM_DOCS_PER_BLOCK {
|
||||
for i in 0..COMPRESSION_BLOCK_SIZE {
|
||||
let val = vals[i];
|
||||
let delta = val - local_offset;
|
||||
max_delta = cmp::max(max_delta, delta);
|
||||
@@ -22,6 +22,7 @@ pub fn compress_sorted(vals: &mut [u32], output: &mut [u8], offset: u32) -> usiz
|
||||
let mut counting_writer = CountingWriter::wrap(output);
|
||||
let num_bits = compute_num_bits(max_delta as u64);
|
||||
counting_writer.write_all(&[num_bits]).unwrap();
|
||||
|
||||
let mut bit_packer = BitPacker::new(num_bits as usize);
|
||||
for val in vals {
|
||||
bit_packer.write(*val as u64, &mut counting_writer).unwrap();
|
||||
@@ -34,7 +35,7 @@ pub fn compress_sorted(vals: &mut [u32], output: &mut [u8], offset: u32) -> usiz
|
||||
pub struct BlockEncoder {
|
||||
pub output: [u8; COMPRESSED_BLOCK_MAX_SIZE],
|
||||
pub output_len: usize,
|
||||
input_buffer: [u32; NUM_DOCS_PER_BLOCK],
|
||||
input_buffer: [u32; COMPRESSION_BLOCK_SIZE],
|
||||
}
|
||||
|
||||
impl BlockEncoder {
|
||||
@@ -42,7 +43,7 @@ impl BlockEncoder {
|
||||
BlockEncoder {
|
||||
output: [0u8; COMPRESSED_BLOCK_MAX_SIZE],
|
||||
output_len: 0,
|
||||
input_buffer: [0u32; NUM_DOCS_PER_BLOCK],
|
||||
input_buffer: [0u32; COMPRESSION_BLOCK_SIZE],
|
||||
}
|
||||
}
|
||||
|
||||
@@ -55,10 +56,9 @@ impl BlockEncoder {
|
||||
pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] {
|
||||
let compressed_size = {
|
||||
let output: &mut [u8] = &mut self.output;
|
||||
let max = vals.iter()
|
||||
.cloned()
|
||||
.max()
|
||||
.expect("compress unsorted called with an empty array");
|
||||
let max = vals.iter().cloned().max().expect(
|
||||
"compress unsorted called with an empty array",
|
||||
);
|
||||
let num_bits = compute_num_bits(max as u64);
|
||||
let mut counting_writer = CountingWriter::wrap(output);
|
||||
counting_writer.write_all(&[num_bits]).unwrap();
|
||||
@@ -66,8 +66,16 @@ impl BlockEncoder {
|
||||
for val in vals {
|
||||
bit_packer.write(*val as u64, &mut counting_writer).unwrap();
|
||||
}
|
||||
bit_packer.flush(&mut counting_writer);
|
||||
// we voluntarility avoid writing "closing", because we
|
||||
for _ in vals.len()..COMPRESSION_BLOCK_SIZE {
|
||||
bit_packer
|
||||
.write(vals[0] as u64, &mut counting_writer)
|
||||
.unwrap();
|
||||
}
|
||||
bit_packer.flush(&mut counting_writer).expect(
|
||||
"Flushing the bitpacking \
|
||||
in an in RAM buffer should never fail",
|
||||
);
|
||||
// we avoid writing "closing", because we
|
||||
// do not want 7 bytes of padding here.
|
||||
counting_writer.written_bytes()
|
||||
};
|
||||
@@ -93,34 +101,35 @@ impl BlockDecoder {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn uncompress_block_sorted<'a>(&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
mut offset: u32)
|
||||
-> &'a [u8] {
|
||||
pub fn uncompress_block_sorted<'a>(
|
||||
&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
mut offset: u32,
|
||||
) -> usize {
|
||||
let consumed_size = {
|
||||
let num_bits = compressed_data[0];
|
||||
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize);
|
||||
for i in 0..NUM_DOCS_PER_BLOCK {
|
||||
for i in 0..COMPRESSION_BLOCK_SIZE {
|
||||
let delta = bit_unpacker.get(i);
|
||||
let val = offset + delta as u32;
|
||||
self.output[i] = val;
|
||||
offset = val;
|
||||
}
|
||||
1 + (num_bits as usize * NUM_DOCS_PER_BLOCK + 7) / 8
|
||||
1 + (num_bits as usize * COMPRESSION_BLOCK_SIZE + 7) / 8
|
||||
};
|
||||
self.output_len = NUM_DOCS_PER_BLOCK;
|
||||
&compressed_data[consumed_size..]
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
consumed_size
|
||||
}
|
||||
|
||||
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> &'a [u8] {
|
||||
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
|
||||
let num_bits = compressed_data[0];
|
||||
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize);
|
||||
for i in 0..NUM_DOCS_PER_BLOCK {
|
||||
for i in 0..COMPRESSION_BLOCK_SIZE {
|
||||
self.output[i] = bit_unpacker.get(i) as u32;
|
||||
}
|
||||
let consumed_size = 1 + (num_bits as usize * NUM_DOCS_PER_BLOCK + 7) / 8;
|
||||
self.output_len = NUM_DOCS_PER_BLOCK;
|
||||
&compressed_data[consumed_size..]
|
||||
let consumed_size = 1 + (num_bits as usize * COMPRESSION_BLOCK_SIZE + 7) / 8;
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
consumed_size
|
||||
}
|
||||
|
||||
#[inline]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use super::super::NUM_DOCS_PER_BLOCK;
|
||||
use super::super::COMPRESSION_BLOCK_SIZE;
|
||||
|
||||
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;
|
||||
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
|
||||
|
||||
mod simdcomp {
|
||||
use libc::size_t;
|
||||
@@ -8,10 +8,11 @@ mod simdcomp {
|
||||
extern "C" {
|
||||
pub fn compress_sorted(data: *const u32, output: *mut u8, offset: u32) -> size_t;
|
||||
|
||||
pub fn uncompress_sorted(compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
offset: u32)
|
||||
-> size_t;
|
||||
pub fn uncompress_sorted(
|
||||
compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
offset: u32,
|
||||
) -> size_t;
|
||||
|
||||
pub fn compress_unsorted(data: *const u32, output: *mut u8) -> size_t;
|
||||
|
||||
@@ -78,19 +79,16 @@ impl BlockDecoder {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn uncompress_block_sorted<'a>(&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
offset: u32)
|
||||
-> &'a [u8] {
|
||||
pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32) -> usize {
|
||||
let consumed_size = uncompress_sorted(compressed_data, &mut self.output, offset);
|
||||
self.output_len = NUM_DOCS_PER_BLOCK;
|
||||
&compressed_data[consumed_size..]
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
consumed_size
|
||||
}
|
||||
|
||||
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> &'a [u8] {
|
||||
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
|
||||
let consumed_size = uncompress_unsorted(compressed_data, &mut self.output);
|
||||
self.output_len = NUM_DOCS_PER_BLOCK;
|
||||
&compressed_data[consumed_size..]
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
consumed_size
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -117,4 +115,5 @@ mod tests {
|
||||
let compressed = encoder.compress_block_sorted(&data, 0u32);
|
||||
assert_eq!(compressed.len(), 17);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
src/compression/stream.rs (new file, 135 lines)
@@ -0,0 +1,135 @@
|
||||
use compression::BlockDecoder;
|
||||
use compression::COMPRESSION_BLOCK_SIZE;
|
||||
use compression::compressed_block_size;
|
||||
use directory::{ReadOnlySource, SourceRead};
|
||||
|
||||
/// Reads a stream of compressed ints.
|
||||
///
|
||||
/// Tantivy uses `CompressedIntStream` to read
|
||||
/// the position file.
|
||||
/// The `.skip(...)` makes it possible to avoid
|
||||
/// decompressing blocks that are not required.
|
||||
pub struct CompressedIntStream {
|
||||
buffer: SourceRead,
|
||||
block_decoder: BlockDecoder,
|
||||
inner_offset: usize,
|
||||
}
|
||||
|
||||
impl CompressedIntStream {
|
||||
/// Opens a compressed int stream.
|
||||
pub(crate) fn wrap(source: ReadOnlySource) -> CompressedIntStream {
|
||||
CompressedIntStream {
|
||||
buffer: SourceRead::from(source),
|
||||
block_decoder: BlockDecoder::new(),
|
||||
inner_offset: COMPRESSION_BLOCK_SIZE,
|
||||
}
|
||||
}
|
||||
|
||||
/// Fills a buffer with the next `output.len()` integers,
|
||||
/// and advance the stream by that many els.
|
||||
pub fn read(&mut self, output: &mut [u32]) {
|
||||
let mut num_els: usize = output.len();
|
||||
let mut start: usize = 0;
|
||||
loop {
|
||||
let available = COMPRESSION_BLOCK_SIZE - self.inner_offset;
|
||||
if num_els >= available {
|
||||
if available > 0 {
|
||||
let uncompressed_block = &self.block_decoder.output_array()
|
||||
[self.inner_offset..];
|
||||
&mut output[start..start + available].clone_from_slice(uncompressed_block);
|
||||
}
|
||||
num_els -= available;
|
||||
start += available;
|
||||
let num_consumed_bytes = self.block_decoder.uncompress_block_unsorted(
|
||||
self.buffer.as_ref(),
|
||||
);
|
||||
self.buffer.advance(num_consumed_bytes);
|
||||
self.inner_offset = 0;
|
||||
} else {
|
||||
let uncompressed_block = &self.block_decoder.output_array()[self.inner_offset..
|
||||
self.inner_offset +
|
||||
num_els];
|
||||
&output[start..start + num_els].clone_from_slice(uncompressed_block);
|
||||
self.inner_offset += num_els;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Skip the next `skip_len` integer.
|
||||
///
|
||||
/// If a full block is skipped, calling
|
||||
/// `.skip(...)` will avoid decompressing it.
|
||||
pub fn skip(&mut self, mut skip_len: usize) {
|
||||
let available = COMPRESSION_BLOCK_SIZE - self.inner_offset;
|
||||
if available >= skip_len {
|
||||
self.inner_offset += skip_len;
|
||||
} else {
|
||||
skip_len -= available;
|
||||
// entirely skip decompressing some blocks.
|
||||
while skip_len >= COMPRESSION_BLOCK_SIZE {
|
||||
skip_len -= COMPRESSION_BLOCK_SIZE;
|
||||
let num_bits: u8 = self.buffer.as_ref()[0];
|
||||
let block_len = compressed_block_size(num_bits);
|
||||
self.buffer.advance(block_len);
|
||||
}
|
||||
let num_consumed_bytes = self.block_decoder.uncompress_block_unsorted(
|
||||
self.buffer.as_ref(),
|
||||
);
|
||||
self.buffer.advance(num_consumed_bytes);
|
||||
self.inner_offset = skip_len;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
use super::CompressedIntStream;
|
||||
use compression::compressed_block_size;
|
||||
use compression::COMPRESSION_BLOCK_SIZE;
|
||||
use compression::BlockEncoder;
|
||||
use directory::ReadOnlySource;
|
||||
|
||||
fn create_stream_buffer() -> ReadOnlySource {
|
||||
let mut buffer: Vec<u8> = vec![];
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let vals: Vec<u32> = (0u32..1_025u32).collect();
|
||||
for chunk in vals.chunks(COMPRESSION_BLOCK_SIZE) {
|
||||
let compressed_block = encoder.compress_block_unsorted(chunk);
|
||||
let num_bits = compressed_block[0];
|
||||
assert_eq!(compressed_block_size(num_bits), compressed_block.len());
|
||||
buffer.extend_from_slice(compressed_block);
|
||||
}
|
||||
if cfg!(simd) {
|
||||
buffer.extend_from_slice(&[0u8; 7]);
|
||||
}
|
||||
ReadOnlySource::from(buffer)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_compressed_int_stream() {
|
||||
let buffer = create_stream_buffer();
|
||||
let mut stream = CompressedIntStream::wrap(buffer);
|
||||
let mut block: [u32; COMPRESSION_BLOCK_SIZE] = [0u32; COMPRESSION_BLOCK_SIZE];
|
||||
|
||||
stream.read(&mut block[0..2]);
|
||||
assert_eq!(block[0], 0);
|
||||
assert_eq!(block[1], 1);
|
||||
stream.skip(5);
|
||||
stream.read(&mut block[0..3]);
|
||||
assert_eq!(block[0], 7);
|
||||
assert_eq!(block[1], 8);
|
||||
assert_eq!(block[2], 9);
|
||||
stream.skip(500);
|
||||
stream.read(&mut block[0..3]);
|
||||
assert_eq!(block[0], 510);
|
||||
assert_eq!(block[1], 511);
|
||||
assert_eq!(block[2], 512);
|
||||
stream.skip(511);
|
||||
stream.read(&mut block[..1]);
|
||||
assert_eq!(block[0], 1024);
|
||||
}
|
||||
}
|
||||
@@ -1,6 +1,10 @@
|
||||
|
||||
#[inline(always)]
|
||||
pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32) -> &'a [u8] {
|
||||
pub(crate) fn compress_sorted<'a>(
|
||||
input: &[u32],
|
||||
output: &'a mut [u8],
|
||||
mut offset: u32,
|
||||
) -> &'a [u8] {
|
||||
let mut byte_written = 0;
|
||||
for &v in input {
|
||||
let mut to_encode: u32 = v - offset;
|
||||
@@ -22,7 +26,7 @@ pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
|
||||
pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
|
||||
let mut byte_written = 0;
|
||||
for &v in input {
|
||||
let mut to_encode: u32 = v;
|
||||
@@ -43,10 +47,11 @@ pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn uncompress_sorted<'a>(compressed_data: &'a [u8],
|
||||
output: &mut [u32],
|
||||
offset: u32)
|
||||
-> &'a [u8] {
|
||||
pub(crate) fn uncompress_sorted<'a>(
|
||||
compressed_data: &'a [u8],
|
||||
output: &mut [u32],
|
||||
offset: u32,
|
||||
) -> usize {
|
||||
let mut read_byte = 0;
|
||||
let mut result = offset;
|
||||
let num_els = output.len();
|
||||
@@ -63,11 +68,11 @@ pub fn uncompress_sorted<'a>(compressed_data: &'a [u8],
|
||||
}
|
||||
output[i] = result;
|
||||
}
|
||||
&compressed_data[read_byte..]
|
||||
read_byte
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> &'a [u8] {
|
||||
pub(crate) fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> usize {
|
||||
let mut read_byte = 0;
|
||||
let num_els = output.len();
|
||||
for i in 0..num_els {
|
||||
@@ -84,5 +89,5 @@ pub fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) ->
|
||||
}
|
||||
output[i] = result;
|
||||
}
|
||||
&compressed_data[read_byte..]
|
||||
read_byte
|
||||
}
|
||||
|
||||
@@ -4,41 +4,47 @@ mod streamvbyte {
|
||||
use libc::size_t;
|
||||
|
||||
extern "C" {
|
||||
pub fn streamvbyte_delta_encode(data: *const u32,
|
||||
num_els: u32,
|
||||
output: *mut u8,
|
||||
offset: u32)
|
||||
-> size_t;
|
||||
pub fn streamvbyte_delta_encode(
|
||||
data: *const u32,
|
||||
num_els: u32,
|
||||
output: *mut u8,
|
||||
offset: u32,
|
||||
) -> size_t;
|
||||
|
||||
pub fn streamvbyte_delta_decode(compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
num_els: u32,
|
||||
offset: u32)
|
||||
-> size_t;
|
||||
pub fn streamvbyte_delta_decode(
|
||||
compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
num_els: u32,
|
||||
offset: u32,
|
||||
) -> size_t;
|
||||
|
||||
pub fn streamvbyte_encode(data: *const u32, num_els: u32, output: *mut u8) -> size_t;
|
||||
|
||||
pub fn streamvbyte_decode(compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
num_els: usize)
|
||||
-> size_t;
|
||||
pub fn streamvbyte_decode(
|
||||
compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
num_els: usize,
|
||||
) -> size_t;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[inline(always)]
|
||||
pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], offset: u32) -> &'a [u8] {
|
||||
pub(crate) fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], offset: u32) -> &'a [u8] {
|
||||
let compress_length = unsafe {
|
||||
streamvbyte::streamvbyte_delta_encode(input.as_ptr(),
|
||||
input.len() as u32,
|
||||
output.as_mut_ptr(),
|
||||
offset)
|
||||
streamvbyte::streamvbyte_delta_encode(
|
||||
input.as_ptr(),
|
||||
input.len() as u32,
|
||||
output.as_mut_ptr(),
|
||||
offset,
|
||||
)
|
||||
};
|
||||
&output[..compress_length]
|
||||
}
|
||||
|
||||
|
||||
#[inline(always)]
|
||||
pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
|
||||
pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
|
||||
let compress_length = unsafe {
|
||||
streamvbyte::streamvbyte_encode(input.as_ptr(), input.len() as u32, output.as_mut_ptr())
|
||||
};
|
||||
@@ -46,23 +52,24 @@ pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn uncompress_sorted<'a>(compressed_data: &'a [u8],
|
||||
output: &mut [u32],
|
||||
offset: u32)
|
||||
-> &'a [u8] {
|
||||
let consumed_bytes = unsafe {
|
||||
streamvbyte::streamvbyte_delta_decode(compressed_data.as_ptr(),
|
||||
output.as_mut_ptr(),
|
||||
output.len() as u32,
|
||||
offset)
|
||||
};
|
||||
&compressed_data[consumed_bytes..]
|
||||
pub(crate) fn uncompress_sorted<'a>(
|
||||
compressed_data: &'a [u8],
|
||||
output: &mut [u32],
|
||||
offset: u32,
|
||||
) -> usize {
|
||||
unsafe {
|
||||
streamvbyte::streamvbyte_delta_decode(
|
||||
compressed_data.as_ptr(),
|
||||
output.as_mut_ptr(),
|
||||
output.len() as u32,
|
||||
offset,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> &'a [u8] {
|
||||
let consumed_bytes = unsafe {
|
||||
pub(crate) fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> usize {
|
||||
unsafe {
|
||||
streamvbyte::streamvbyte_decode(compressed_data.as_ptr(), output.as_mut_ptr(), output.len())
|
||||
};
|
||||
&compressed_data[consumed_bytes..]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -48,9 +48,10 @@ impl Index {
|
||||
pub fn create_in_ram(schema: Schema) -> Index {
|
||||
let ram_directory = RAMDirectory::create();
|
||||
// unwrap is ok here
|
||||
let directory = ManagedDirectory::new(ram_directory)
|
||||
.expect("Creating a managed directory from a brand new RAM directory \
|
||||
should never fail.");
|
||||
let directory = ManagedDirectory::new(ram_directory).expect(
|
||||
"Creating a managed directory from a brand new RAM directory \
|
||||
should never fail.",
|
||||
);
|
||||
Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail")
|
||||
}
|
||||
|
||||
@@ -127,10 +128,11 @@ impl Index {
|
||||
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
|
||||
/// # Panics
|
||||
/// If the heap size per thread is too small, panics.
|
||||
pub fn writer_with_num_threads(&self,
|
||||
num_threads: usize,
|
||||
heap_size_in_bytes: usize)
|
||||
-> Result<IndexWriter> {
|
||||
pub fn writer_with_num_threads(
|
||||
&self,
|
||||
num_threads: usize,
|
||||
heap_size_in_bytes: usize,
|
||||
) -> Result<IndexWriter> {
|
||||
open_index_writer(self, num_threads, heap_size_in_bytes)
|
||||
}
|
||||
|
||||
@@ -155,10 +157,12 @@ impl Index {
|
||||
|
||||
/// Returns the list of segments that are searchable
|
||||
pub fn searchable_segments(&self) -> Result<Vec<Segment>> {
|
||||
Ok(self.searchable_segment_metas()?
|
||||
.into_iter()
|
||||
.map(|segment_meta| self.segment(segment_meta))
|
||||
.collect())
|
||||
Ok(
|
||||
self.searchable_segment_metas()?
|
||||
.into_iter()
|
||||
.map(|segment_meta| self.segment(segment_meta))
|
||||
.collect(),
|
||||
)
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
@@ -190,10 +194,12 @@ impl Index {
|
||||
|
||||
/// Returns the list of segment ids that are searchable.
|
||||
pub fn searchable_segment_ids(&self) -> Result<Vec<SegmentId>> {
|
||||
Ok(self.searchable_segment_metas()?
|
||||
.iter()
|
||||
.map(|segment_meta| segment_meta.id())
|
||||
.collect())
|
||||
Ok(
|
||||
self.searchable_segment_metas()?
|
||||
.iter()
|
||||
.map(|segment_meta| segment_meta.id())
|
||||
.collect(),
|
||||
)
|
||||
}
|
||||
|
||||
/// Creates a new generation of searchers after
|
||||
@@ -203,10 +209,12 @@ impl Index {
|
||||
/// published or after a merge.
|
||||
pub fn load_searchers(&self) -> Result<()> {
|
||||
let searchable_segments = self.searchable_segments()?;
|
||||
let segment_readers: Vec<SegmentReader> = try!(searchable_segments
|
||||
.into_iter()
|
||||
.map(SegmentReader::open)
|
||||
.collect());
|
||||
let segment_readers: Vec<SegmentReader> = try!(
|
||||
searchable_segments
|
||||
.into_iter()
|
||||
.map(SegmentReader::open)
|
||||
.collect()
|
||||
);
|
||||
let searchers = (0..NUM_SEARCHERS)
|
||||
.map(|_| Searcher::from(segment_readers.clone()))
|
||||
.collect();
|
||||
|
||||
@@ -9,7 +9,7 @@ use core::SegmentMeta;
|
||||
/// * the index docstamp
|
||||
/// * the schema
|
||||
///
|
||||
#[derive(Clone,Debug,Serialize, Deserialize)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct IndexMeta {
|
||||
pub segments: Vec<SegmentMeta>,
|
||||
pub schema: Schema,
|
||||
|
||||
src/core/inverted_index_reader.rs (new file, 164 lines)
@@ -0,0 +1,164 @@
|
||||
use directory::{SourceRead, ReadOnlySource};
|
||||
use termdict::{TermDictionary, TermDictionaryImpl};
|
||||
use postings::{SegmentPostings, BlockSegmentPostings};
|
||||
use postings::TermInfo;
|
||||
use postings::SegmentPostingsOption;
|
||||
use schema::Term;
|
||||
use std::cmp;
|
||||
use fastfield::DeleteBitSet;
|
||||
use schema::Schema;
|
||||
use compression::CompressedIntStream;
|
||||
|
||||
|
||||
/// The inverted index reader is in charge of accessing
|
||||
/// the inverted index associated to a specific field.
|
||||
///
|
||||
/// # Note
|
||||
///
|
||||
/// It is safe to delete the segment associated to
|
||||
/// an `InvertedIndexReader`. As long as it is open,
|
||||
/// the `ReadOnlySource` it is relying on should
|
||||
/// stay available.
|
||||
///
|
||||
///
|
||||
/// `InvertedIndexReader` are created by calling
|
||||
/// the `SegmentReader`'s [`.inverted_index(...)`] method
|
||||
pub struct InvertedIndexReader {
|
||||
termdict: TermDictionaryImpl,
|
||||
postings_source: ReadOnlySource,
|
||||
positions_source: ReadOnlySource,
|
||||
delete_bitset: DeleteBitSet,
|
||||
schema: Schema,
|
||||
}
|
||||
|
||||
impl InvertedIndexReader {
|
||||
pub(crate) fn new(
|
||||
termdict_source: ReadOnlySource,
|
||||
postings_source: ReadOnlySource,
|
||||
positions_source: ReadOnlySource,
|
||||
delete_bitset: DeleteBitSet,
|
||||
schema: Schema,
|
||||
) -> InvertedIndexReader {
|
||||
|
||||
InvertedIndexReader {
|
||||
termdict: TermDictionaryImpl::from_source(termdict_source),
|
||||
postings_source: postings_source,
|
||||
positions_source: positions_source,
|
||||
delete_bitset: delete_bitset,
|
||||
schema: schema,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the term info associated with the term.
|
||||
pub fn get_term_info(&self, term: &Term) -> Option<TermInfo> {
|
||||
self.termdict.get(term.as_slice())
|
||||
}
|
||||
|
||||
|
||||
/// Returns the term dictionary data structure.
|
||||
pub fn terms(&self) -> &TermDictionaryImpl {
|
||||
&self.termdict
|
||||
}
|
||||
|
||||
/// Resets the block segment to another position of the postings
|
||||
/// file.
|
||||
///
|
||||
/// This is useful for enumerating through a list of terms,
|
||||
/// and consuming the associated posting lists while avoiding
|
||||
/// reallocating a `BlockSegmentPostings`.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// This does not reset the positions list.
|
||||
pub fn reset_block_postings_from_terminfo(
|
||||
&self,
|
||||
term_info: &TermInfo,
|
||||
block_postings: &mut BlockSegmentPostings,
|
||||
) {
|
||||
let offset = term_info.postings_offset as usize;
|
||||
let end_source = self.postings_source.len();
|
||||
let postings_slice = self.postings_source.slice(offset, end_source);
|
||||
let postings_reader = SourceRead::from(postings_slice);
|
||||
block_postings.reset(term_info.doc_freq as usize, postings_reader);
|
||||
}
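// Illustrative sketch (not part of this commit): reusing one `BlockSegmentPostings`
// while walking the term dictionary. It assumes the term dictionary streamer exposes
// fst-style `.advance()` / `.value()` methods, with `.value()` yielding a `&TermInfo`.
//
// let mut stream = inv_index.terms().stream();
// if stream.advance() {
//     let mut block_postings = inv_index
//         .read_block_postings_from_terminfo(stream.value(), SegmentPostingsOption::NoFreq);
//     // ... consume `block_postings` for the first term here ...
//     while stream.advance() {
//         // Reuse the same `BlockSegmentPostings` for every following term.
//         inv_index.reset_block_postings_from_terminfo(stream.value(), &mut block_postings);
//     }
// }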
|
||||
|
||||
|
||||
|
||||
/// Returns a block postings given a `term_info`.
/// This method is for advanced usage only.
///
/// Most users should prefer using `read_postings` instead.
|
||||
pub fn read_block_postings_from_terminfo(
|
||||
&self,
|
||||
term_info: &TermInfo,
|
||||
option: SegmentPostingsOption,
|
||||
) -> BlockSegmentPostings {
|
||||
let offset = term_info.postings_offset as usize;
|
||||
let postings_data = self.postings_source.slice_from(offset);
|
||||
let has_freq = option.has_freq();
|
||||
BlockSegmentPostings::from_data(
|
||||
term_info.doc_freq as usize,
|
||||
SourceRead::from(postings_data),
|
||||
has_freq,
|
||||
)
|
||||
}
|
||||
|
||||
/// Returns a posting object given a `term_info`.
/// This method is for advanced usage only.
///
/// Most users should prefer using `read_postings` instead.
|
||||
pub fn read_postings_from_terminfo(
|
||||
&self,
|
||||
term_info: &TermInfo,
|
||||
option: SegmentPostingsOption,
|
||||
) -> SegmentPostings {
|
||||
let block_postings = self.read_block_postings_from_terminfo(term_info, option);
|
||||
let delete_bitset = self.delete_bitset.clone();
|
||||
let position_stream = {
|
||||
if option.has_positions() {
|
||||
let position_offset = term_info.positions_offset;
|
||||
let positions_source = self.positions_source.slice_from(position_offset as usize);
|
||||
let mut stream = CompressedIntStream::wrap(positions_source);
|
||||
stream.skip(term_info.positions_inner_offset as usize);
|
||||
Some(stream)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
SegmentPostings::from_block_postings(block_postings, delete_bitset, position_stream)
|
||||
}
|
||||
|
||||
/// Returns the segment postings associated with the term, and with the given option,
|
||||
/// or `None` if the term has never been encountered and indexed.
|
||||
///
|
||||
/// If the field was not indexed with the indexing options that cover
/// the requested options, the method does not fail
/// and returns a `SegmentPostings` with as much information as possible.
///
/// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a
/// `TextIndexingOptions` that does not index positions will return a `SegmentPostings`
|
||||
/// with `DocId`s and frequencies.
|
||||
pub fn read_postings(
|
||||
&self,
|
||||
term: &Term,
|
||||
option: SegmentPostingsOption,
|
||||
) -> Option<SegmentPostings> {
|
||||
let field = term.field();
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let term_info = get!(self.get_term_info(term));
|
||||
let maximum_option = get!(field_entry.field_type().get_segment_postings_option());
|
||||
let best_effort_option = cmp::min(maximum_option, option);
|
||||
Some(self.read_postings_from_terminfo(
|
||||
&term_info,
|
||||
best_effort_option,
|
||||
))
|
||||
}
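// Hypothetical example (not in the original commit) of the best-effort behaviour
// described above, assuming `inv_index` is an `InvertedIndexReader` and `body`
// is a TEXT field indexed with term frequencies but without positions:
//
// let term = Term::from_field_text(body, "fish");
// // Positions are requested, but only doc ids and frequencies are available,
// // so the returned postings silently degrade to `SegmentPostingsOption::Freq`.
// if let Some(postings) = inv_index.read_postings(&term, SegmentPostingsOption::FreqAndPositions) {
//     // ... iterate over the matching documents ...
// }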
|
||||
|
||||
/// Returns the number of documents containing the term.
|
||||
pub fn doc_freq(&self, term: &Term) -> u32 {
|
||||
match self.get_term_info(term) {
|
||||
Some(term_info) => term_info.doc_freq,
|
||||
None => 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -7,7 +7,9 @@ mod segment;
|
||||
mod index_meta;
|
||||
mod pool;
|
||||
mod segment_meta;
|
||||
mod inverted_index_reader;
|
||||
|
||||
pub use self::inverted_index_reader::InvertedIndexReader;
|
||||
pub use self::searcher::Searcher;
|
||||
pub use self::segment_component::SegmentComponent;
|
||||
pub use self::segment_id::SegmentId;
|
||||
@@ -18,7 +20,6 @@ pub use self::index::Index;
|
||||
pub use self::segment_meta::SegmentMeta;
|
||||
pub use self::index_meta::IndexMeta;
|
||||
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
lazy_static! {
|
||||
|
||||
@@ -76,8 +76,11 @@ impl<T> Pool<T> {
|
||||
if former_generation >= generation {
|
||||
break;
|
||||
}
|
||||
self.freshest_generation
|
||||
.compare_and_swap(former_generation, generation, Ordering::SeqCst);
|
||||
self.freshest_generation.compare_and_swap(
|
||||
former_generation,
|
||||
generation,
|
||||
Ordering::SeqCst,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -91,9 +94,9 @@ impl<T> Pool<T> {
|
||||
let gen_item = self.queue.pop();
|
||||
if gen_item.generation >= generation {
|
||||
return LeasedItem {
|
||||
gen_item: Some(gen_item),
|
||||
recycle_queue: self.queue.clone(),
|
||||
};
|
||||
gen_item: Some(gen_item),
|
||||
recycle_queue: self.queue.clone(),
|
||||
};
|
||||
} else {
|
||||
// this searcher is obsolete,
|
||||
// removing it from the pool.
|
||||
@@ -113,25 +116,26 @@ impl<T> Deref for LeasedItem<T> {
|
||||
|
||||
fn deref(&self) -> &T {
|
||||
&self.gen_item
|
||||
.as_ref()
|
||||
.expect("Unwrapping a leased item should never fail")
|
||||
.item // unwrap is safe here
|
||||
.as_ref()
|
||||
.expect("Unwrapping a leased item should never fail")
|
||||
.item // unwrap is safe here
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> DerefMut for LeasedItem<T> {
|
||||
fn deref_mut(&mut self) -> &mut T {
|
||||
&mut self.gen_item
|
||||
.as_mut()
|
||||
.expect("Unwrapping a mut leased item should never fail")
|
||||
.item // unwrap is safe here
|
||||
.as_mut()
|
||||
.expect("Unwrapping a mut leased item should never fail")
|
||||
.item // unwrap is safe here
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Drop for LeasedItem<T> {
|
||||
fn drop(&mut self) {
|
||||
let gen_item: GenerationItem<T> = mem::replace(&mut self.gen_item, None)
|
||||
.expect("Unwrapping a leased item should never fail");
|
||||
let gen_item: GenerationItem<T> = mem::replace(&mut self.gen_item, None).expect(
|
||||
"Unwrapping a leased item should never fail",
|
||||
);
|
||||
self.recycle_queue.push(gen_item);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,10 +6,11 @@ use common::TimerTree;
|
||||
use query::Query;
|
||||
use DocId;
|
||||
use DocAddress;
|
||||
use schema::Term;
|
||||
use termdict::TermMerger;
|
||||
use schema::{Term, Field};
|
||||
use termdict::{TermMerger, TermDictionary};
|
||||
use std::sync::Arc;
|
||||
use std::fmt;
|
||||
use postings::TermInfo;
|
||||
use core::InvertedIndexReader;
|
||||
|
||||
|
||||
/// Holds a list of `SegmentReader`s ready for search.
|
||||
@@ -21,7 +22,6 @@ pub struct Searcher {
|
||||
segment_readers: Vec<SegmentReader>,
|
||||
}
|
||||
|
||||
|
||||
impl Searcher {
|
||||
/// Fetches a document from tantivy's store given a `DocAddress`.
|
||||
///
|
||||
@@ -46,7 +46,9 @@ impl Searcher {
|
||||
pub fn doc_freq(&self, term: &Term) -> u32 {
|
||||
self.segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.doc_freq(term))
|
||||
.map(|segment_reader| {
|
||||
segment_reader.inverted_index(term.field()).doc_freq(term)
|
||||
})
|
||||
.fold(0u32, |acc, val| acc + val)
|
||||
}
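// For instance (illustrative, assuming a `title` text field):
//
// let term = Term::from_field_text(title, "sea");
// // Sums the per-segment document frequencies over all segment readers.
// let n_docs: u32 = searcher.doc_freq(&term);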
|
||||
|
||||
@@ -65,20 +67,41 @@ impl Searcher {
|
||||
query.search(self, collector)
|
||||
}
|
||||
|
||||
/// Returns a Stream over all of the sorted unique terms of
|
||||
/// the searcher.
|
||||
///
|
||||
/// This includes all of the fields from all of the segment_readers.
|
||||
/// See [`TermIterator`](struct.TermIterator.html).
|
||||
///
|
||||
/// # Warning
|
||||
/// This API is very likely to change in the future.
|
||||
pub fn terms(&self) -> TermMerger<TermInfo> {
|
||||
TermMerger::from(self.segment_readers())
|
||||
/// Returns the field searcher associated with a `Field`.
|
||||
pub fn field(&self, field: Field) -> FieldSearcher {
|
||||
let inv_index_readers = self.segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.inverted_index(field))
|
||||
.collect::<Vec<_>>();
|
||||
FieldSearcher::new(inv_index_readers)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
pub struct FieldSearcher {
|
||||
inv_index_readers: Vec<Arc<InvertedIndexReader>>,
|
||||
}
|
||||
|
||||
|
||||
impl FieldSearcher {
|
||||
fn new(inv_index_readers: Vec<Arc<InvertedIndexReader>>) -> FieldSearcher {
|
||||
FieldSearcher { inv_index_readers: inv_index_readers }
|
||||
}
|
||||
|
||||
|
||||
/// Returns a Stream over all of the sorted unique terms
/// for the given field.
|
||||
pub fn terms(&self) -> TermMerger {
|
||||
let term_streamers: Vec<_> = self.inv_index_readers
|
||||
.iter()
|
||||
.map(|inverted_index| inverted_index.terms().stream())
|
||||
.collect();
|
||||
TermMerger::new(term_streamers)
|
||||
}
|
||||
}
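// Illustrative sketch (not part of this commit), assuming the merged term stream
// exposes fst-style `.advance()` / `.key()` methods:
//
// let field_searcher = searcher.field(title);
// let mut terms = field_searcher.terms();
// while terms.advance() {
//     // `key()` yields the raw bytes of each unique term, in sorted order,
//     // merged across every segment of the searcher.
//     let term_bytes: &[u8] = terms.key();
// }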
|
||||
|
||||
impl From<Vec<SegmentReader>> for Searcher {
|
||||
fn from(segment_readers: Vec<SegmentReader>) -> Searcher {
|
||||
Searcher { segment_readers: segment_readers }
|
||||
|
||||
@@ -76,18 +76,20 @@ impl Segment {
|
||||
}
|
||||
|
||||
/// Open one of the component files for a *regular* read.
|
||||
pub fn open_read(&self,
|
||||
component: SegmentComponent)
|
||||
-> result::Result<ReadOnlySource, OpenReadError> {
|
||||
pub fn open_read(
|
||||
&self,
|
||||
component: SegmentComponent,
|
||||
) -> result::Result<ReadOnlySource, OpenReadError> {
|
||||
let path = self.relative_path(component);
|
||||
let source = try!(self.index.directory().open_read(&path));
|
||||
Ok(source)
|
||||
}
|
||||
|
||||
/// Open one of the component files for a *regular* write.
|
||||
pub fn open_write(&mut self,
|
||||
component: SegmentComponent)
|
||||
-> result::Result<WritePtr, OpenWriteError> {
|
||||
pub fn open_write(
|
||||
&mut self,
|
||||
component: SegmentComponent,
|
||||
) -> result::Result<WritePtr, OpenWriteError> {
|
||||
let path = self.relative_path(component);
|
||||
let write = try!(self.index.directory_mut().open_write(&path));
|
||||
Ok(write)
|
||||
@@ -125,11 +127,11 @@ mod tests {
|
||||
{
|
||||
let _file_protection = segment.protect_from_delete(SegmentComponent::POSTINGS);
|
||||
assert!(directory.exists(&*path));
|
||||
directory.garbage_collect(|| { living_files.clone() });
|
||||
directory.garbage_collect(|| living_files.clone());
|
||||
assert!(directory.exists(&*path));
|
||||
}
|
||||
|
||||
directory.garbage_collect(|| { living_files });
|
||||
directory.garbage_collect(|| living_files);
|
||||
assert!(!directory.exists(&*path));
|
||||
}
|
||||
|
||||
|
||||
@@ -28,13 +28,15 @@ pub enum SegmentComponent {
|
||||
impl SegmentComponent {
|
||||
/// Iterates through the components.
|
||||
pub fn iterator() -> impl Iterator<Item = &'static SegmentComponent> {
|
||||
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [SegmentComponent::POSTINGS,
|
||||
SegmentComponent::POSITIONS,
|
||||
SegmentComponent::FASTFIELDS,
|
||||
SegmentComponent::FIELDNORMS,
|
||||
SegmentComponent::TERMS,
|
||||
SegmentComponent::STORE,
|
||||
SegmentComponent::DELETE];
|
||||
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [
|
||||
SegmentComponent::POSTINGS,
|
||||
SegmentComponent::POSITIONS,
|
||||
SegmentComponent::FASTFIELDS,
|
||||
SegmentComponent::FIELDNORMS,
|
||||
SegmentComponent::TERMS,
|
||||
SegmentComponent::STORE,
|
||||
SegmentComponent::DELETE,
|
||||
];
|
||||
SEGMENT_COMPONENTS.into_iter()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -64,16 +64,14 @@ impl SegmentMeta {
|
||||
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
|
||||
let mut path = self.id().uuid_string();
|
||||
path.push_str(&*match component {
|
||||
SegmentComponent::POSITIONS => ".pos".to_string(),
|
||||
SegmentComponent::POSTINGS => ".idx".to_string(),
|
||||
SegmentComponent::TERMS => ".term".to_string(),
|
||||
SegmentComponent::STORE => ".store".to_string(),
|
||||
SegmentComponent::FASTFIELDS => ".fast".to_string(),
|
||||
SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
|
||||
SegmentComponent::DELETE => {
|
||||
format!(".{}.del", self.delete_opstamp().unwrap_or(0))
|
||||
}
|
||||
});
|
||||
SegmentComponent::POSITIONS => ".pos".to_string(),
|
||||
SegmentComponent::POSTINGS => ".idx".to_string(),
|
||||
SegmentComponent::TERMS => ".term".to_string(),
|
||||
SegmentComponent::STORE => ".store".to_string(),
|
||||
SegmentComponent::FASTFIELDS => ".fast".to_string(),
|
||||
SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
|
||||
SegmentComponent::DELETE => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
|
||||
});
|
||||
PathBuf::from(path)
|
||||
}
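// For example (illustrative), for a segment whose uuid string is "6c3cfa2b...",
// the postings component resolves to "6c3cfa2b....idx", and a delete component
// carries the opstamp of the deletes, e.g. "6c3cfa2b....42.del".
//
// let idx_path = segment_meta.relative_path(SegmentComponent::POSTINGS);
// assert_eq!(
//     idx_path,
//     PathBuf::from(format!("{}.idx", segment_meta.id().uuid_string()))
// );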
|
||||
|
||||
@@ -111,8 +109,8 @@ impl SegmentMeta {
|
||||
#[doc(hidden)]
|
||||
pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) {
|
||||
self.deletes = Some(DeleteMeta {
|
||||
num_deleted_docs: num_deleted_docs,
|
||||
opstamp: opstamp,
|
||||
});
|
||||
num_deleted_docs: num_deleted_docs,
|
||||
opstamp: opstamp,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,28 +2,24 @@ use Result;
|
||||
use core::Segment;
|
||||
use core::SegmentId;
|
||||
use core::SegmentComponent;
|
||||
use schema::Term;
|
||||
use std::sync::RwLock;
|
||||
use common::HasLen;
|
||||
use core::SegmentMeta;
|
||||
use fastfield::{self, FastFieldNotAvailableError};
|
||||
use fastfield::DeleteBitSet;
|
||||
use store::StoreReader;
|
||||
use schema::Document;
|
||||
use directory::ReadOnlySource;
|
||||
use schema::Document;
|
||||
use DocId;
|
||||
use std::str;
|
||||
use termdict::TermDictionary;
|
||||
use std::cmp;
|
||||
use postings::TermInfo;
|
||||
use termdict::TermDictionaryImpl;
|
||||
use std::sync::Arc;
|
||||
use std::collections::HashMap;
|
||||
use common::CompositeFile;
|
||||
use std::fmt;
|
||||
use core::InvertedIndexReader;
|
||||
use schema::Field;
|
||||
use postings::SegmentPostingsOption;
|
||||
use postings::{SegmentPostings, BlockSegmentPostings};
|
||||
use fastfield::{FastFieldsReader, FastFieldReader, U64FastFieldReader};
|
||||
use fastfield::{FastFieldReader, U64FastFieldReader};
|
||||
use schema::Schema;
|
||||
use postings::FreqHandler;
|
||||
|
||||
|
||||
|
||||
@@ -40,15 +36,19 @@ use postings::FreqHandler;
|
||||
///
|
||||
#[derive(Clone)]
|
||||
pub struct SegmentReader {
|
||||
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,
|
||||
|
||||
segment_id: SegmentId,
|
||||
segment_meta: SegmentMeta,
|
||||
terms: Arc<TermDictionaryImpl>,
|
||||
postings_data: ReadOnlySource,
|
||||
|
||||
termdict_composite: CompositeFile,
|
||||
postings_composite: CompositeFile,
|
||||
positions_composite: CompositeFile,
|
||||
fast_fields_composite: CompositeFile,
|
||||
fieldnorms_composite: CompositeFile,
|
||||
|
||||
store_reader: StoreReader,
|
||||
fast_fields_reader: Arc<FastFieldsReader>,
|
||||
fieldnorms_reader: Arc<FastFieldsReader>,
|
||||
delete_bitset: DeleteBitSet,
|
||||
positions_data: ReadOnlySource,
|
||||
schema: Schema,
|
||||
}
|
||||
|
||||
@@ -76,11 +76,6 @@ impl SegmentReader {
|
||||
self.delete_bitset.len() as DocId
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn fast_fields_reader(&self) -> &FastFieldsReader {
|
||||
&*self.fast_fields_reader
|
||||
}
|
||||
|
||||
/// Accessor to a segment's fast field reader given a field.
|
||||
///
|
||||
/// Returns the u64 fast value reader if the field
|
||||
@@ -91,17 +86,18 @@ impl SegmentReader {
|
||||
///
|
||||
/// # Panics
|
||||
/// May panic if the index is corrupted.
|
||||
pub fn get_fast_field_reader<TFastFieldReader: FastFieldReader>
|
||||
(&self,
|
||||
field: Field)
|
||||
-> fastfield::Result<TFastFieldReader> {
|
||||
pub fn get_fast_field_reader<TFastFieldReader: FastFieldReader>(
|
||||
&self,
|
||||
field: Field,
|
||||
) -> fastfield::Result<TFastFieldReader> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if !TFastFieldReader::is_enabled(field_entry.field_type()) {
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
} else {
|
||||
Ok(self.fast_fields_reader
|
||||
.open_reader(field)
|
||||
.expect("Fast field file corrupted."))
|
||||
self.fast_fields_composite
|
||||
.open_read(field)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(TFastFieldReader::open)
|
||||
}
|
||||
}
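// Illustrative usage (not part of this commit), assuming `rating` is a u64
// fast field declared in the schema and `FastFieldReader::get` reads one value:
//
// let ff_reader: U64FastFieldReader = segment_reader.get_fast_field_reader(rating)?;
// let value: u64 = ff_reader.get(doc_id);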
|
||||
|
||||
@@ -114,15 +110,9 @@ impl SegmentReader {
|
||||
/// They are simply stored as a fast field, serialized in
|
||||
/// the `.fieldnorm` file of the segment.
|
||||
pub fn get_fieldnorms_reader(&self, field: Field) -> Option<U64FastFieldReader> {
|
||||
self.fieldnorms_reader.open_reader(field)
|
||||
}
|
||||
|
||||
/// Returns the number of documents containing the term.
|
||||
pub fn doc_freq(&self, term: &Term) -> u32 {
|
||||
match self.get_term_info(term) {
|
||||
Some(term_info) => term_info.doc_freq,
|
||||
None => 0,
|
||||
}
|
||||
self.fieldnorms_composite.open_read(field).map(
|
||||
U64FastFieldReader::open,
|
||||
)
|
||||
}
|
||||
|
||||
/// Accessor to the segment's `StoreReader`.
|
||||
@@ -133,23 +123,30 @@ impl SegmentReader {
|
||||
/// Open a new segment for reading.
|
||||
pub fn open(segment: Segment) -> Result<SegmentReader> {
|
||||
|
||||
let source = segment.open_read(SegmentComponent::TERMS)?;
|
||||
let terms = TermDictionaryImpl::from_source(source)?;
|
||||
let termdict_source = segment.open_read(SegmentComponent::TERMS)?;
|
||||
let termdict_composite = CompositeFile::open(termdict_source)?;
|
||||
|
||||
let store_source = segment.open_read(SegmentComponent::STORE)?;
|
||||
let store_reader = StoreReader::from_source(store_source);
|
||||
|
||||
let postings_shared_mmap = segment.open_read(SegmentComponent::POSTINGS)?;
|
||||
let postings_source = segment.open_read(SegmentComponent::POSTINGS)?;
|
||||
let postings_composite = CompositeFile::open(postings_source)?;
|
||||
|
||||
let fast_field_data = segment.open_read(SegmentComponent::FASTFIELDS)?;
|
||||
let fast_fields_reader = FastFieldsReader::from_source(fast_field_data)?;
|
||||
let positions_composite = {
|
||||
if let Ok(source) = segment.open_read(SegmentComponent::POSITIONS) {
|
||||
CompositeFile::open(source)?
|
||||
} else {
|
||||
CompositeFile::empty()
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
let fast_fields_data = segment.open_read(SegmentComponent::FASTFIELDS)?;
|
||||
let fast_fields_composite = CompositeFile::open(fast_fields_data)?;
|
||||
|
||||
let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
|
||||
let fieldnorms_reader = FastFieldsReader::from_source(fieldnorms_data)?;
|
||||
let fieldnorms_composite = CompositeFile::open(fieldnorms_data)?;
|
||||
|
||||
let positions_data = segment
|
||||
.open_read(SegmentComponent::POSITIONS)
|
||||
.unwrap_or_else(|_| ReadOnlySource::empty());
|
||||
|
||||
let delete_bitset = if segment.meta().has_deletes() {
|
||||
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
|
||||
@@ -160,22 +157,66 @@ impl SegmentReader {
|
||||
|
||||
let schema = segment.schema();
|
||||
Ok(SegmentReader {
|
||||
segment_meta: segment.meta().clone(),
|
||||
postings_data: postings_shared_mmap,
|
||||
terms: Arc::new(terms),
|
||||
segment_id: segment.id(),
|
||||
store_reader: store_reader,
|
||||
fast_fields_reader: Arc::new(fast_fields_reader),
|
||||
fieldnorms_reader: Arc::new(fieldnorms_reader),
|
||||
delete_bitset: delete_bitset,
|
||||
positions_data: positions_data,
|
||||
schema: schema,
|
||||
})
|
||||
inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
|
||||
segment_meta: segment.meta().clone(),
|
||||
termdict_composite: termdict_composite,
|
||||
postings_composite: postings_composite,
|
||||
fast_fields_composite: fast_fields_composite,
|
||||
fieldnorms_composite: fieldnorms_composite,
|
||||
segment_id: segment.id(),
|
||||
store_reader: store_reader,
|
||||
delete_bitset: delete_bitset,
|
||||
positions_composite: positions_composite,
|
||||
schema: schema,
|
||||
})
|
||||
}
|
||||
|
||||
/// Return the term dictionary datastructure.
|
||||
pub fn terms(&self) -> &TermDictionaryImpl {
|
||||
&self.terms
|
||||
|
||||
/// Returns the inverted index reader associated with the field given as an argument.
///
/// The inverted index reader is in charge of iterating through the
/// term dictionary associated with a specific field,
/// and of opening the posting list associated with any term.
|
||||
pub fn inverted_index(&self, field: Field) -> Arc<InvertedIndexReader> {
|
||||
if let Some(inv_idx_reader) =
|
||||
self.inv_idx_reader_cache
|
||||
.read()
|
||||
.expect("Lock poisoned. This should never happen")
|
||||
.get(&field)
|
||||
{
|
||||
// cache hit: return the already opened reader.
return inv_idx_reader.clone();
|
||||
}
|
||||
|
||||
let termdict_source: ReadOnlySource = self.termdict_composite.open_read(field).expect(
|
||||
"Index corrupted. Failed to open field term dictionary in composite file.",
|
||||
);
|
||||
|
||||
let postings_source = self.postings_composite.open_read(field).expect(
|
||||
"Index corrupted. Failed to open field postings in composite file.",
|
||||
);
|
||||
|
||||
let positions_source = self.positions_composite.open_read(field).expect(
|
||||
"Index corrupted. Failed to open field positions in composite file.",
|
||||
);
|
||||
|
||||
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
|
||||
termdict_source,
|
||||
postings_source,
|
||||
positions_source,
|
||||
self.delete_bitset.clone(),
|
||||
self.schema.clone(),
|
||||
));
|
||||
|
||||
// by releasing the lock in between, we may end up opening the inverted index
// twice, but this is fine.
|
||||
self.inv_idx_reader_cache
|
||||
.write()
|
||||
.expect(
|
||||
"Field reader cache lock poisoned. This should never happen.",
|
||||
)
|
||||
.insert(field, inv_idx_reader.clone());
|
||||
|
||||
inv_idx_reader
|
||||
}
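// Illustrative usage (not part of this commit): the reader is opened lazily,
// cached per field, and shared through an `Arc`, so repeated calls are cheap.
//
// let inv_index: Arc<InvertedIndexReader> = segment_reader.inverted_index(body);
// let doc_freq = inv_index.doc_freq(&Term::from_field_text(body, "fish"));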
|
||||
|
||||
/// Returns the document (or to be accurate, its stored field)
|
||||
@@ -187,89 +228,6 @@ impl SegmentReader {
|
||||
}
|
||||
|
||||
|
||||
/// Returns the segment postings associated with the term, and with the given option,
|
||||
/// or `None` if the term has never been encountered and indexed.
|
||||
///
|
||||
/// If the field was not indexed with the indexing options that cover
|
||||
/// the requested options, the returned `SegmentPostings` the method does not fail
|
||||
/// and returns a `SegmentPostings` with as much information as possible.
|
||||
///
|
||||
/// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a
|
||||
/// `TextIndexingOptions` that does not index position will return a `SegmentPostings`
|
||||
/// with `DocId`s and frequencies.
|
||||
pub fn read_postings(&self,
|
||||
term: &Term,
|
||||
option: SegmentPostingsOption)
|
||||
-> Option<SegmentPostings> {
|
||||
let field = term.field();
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let term_info = get!(self.get_term_info(term));
|
||||
let maximum_option = get!(field_entry.field_type().get_segment_postings_option());
|
||||
let best_effort_option = cmp::min(maximum_option, option);
|
||||
Some(self.read_postings_from_terminfo(&term_info, best_effort_option))
|
||||
}
|
||||
|
||||
|
||||
/// Returns a posting object given a `term_info`.
|
||||
/// This method is for an advanced usage only.
|
||||
///
|
||||
/// Most user should prefer using `read_postings` instead.
|
||||
pub fn read_postings_from_terminfo(&self,
|
||||
term_info: &TermInfo,
|
||||
option: SegmentPostingsOption)
|
||||
-> SegmentPostings {
|
||||
let block_postings = self.read_block_postings_from_terminfo(term_info, option);
|
||||
let delete_bitset = self.delete_bitset.clone();
|
||||
SegmentPostings::from_block_postings(block_postings, delete_bitset)
|
||||
}
|
||||
|
||||
|
||||
/// Returns a block postings given a `term_info`.
|
||||
/// This method is for an advanced usage only.
|
||||
///
|
||||
/// Most user should prefer using `read_postings` instead.
|
||||
pub fn read_block_postings_from_terminfo(&self,
|
||||
term_info: &TermInfo,
|
||||
option: SegmentPostingsOption)
|
||||
-> BlockSegmentPostings {
|
||||
let offset = term_info.postings_offset as usize;
|
||||
let postings_data = &self.postings_data[offset..];
|
||||
let freq_handler = match option {
|
||||
SegmentPostingsOption::NoFreq => FreqHandler::new_without_freq(),
|
||||
SegmentPostingsOption::Freq => FreqHandler::new_with_freq(),
|
||||
SegmentPostingsOption::FreqAndPositions => {
|
||||
let offset = term_info.positions_offset as usize;
|
||||
let offseted_position_data = &self.positions_data[offset..];
|
||||
FreqHandler::new_with_freq_and_position(offseted_position_data)
|
||||
}
|
||||
};
|
||||
BlockSegmentPostings::from_data(term_info.doc_freq as usize, postings_data, freq_handler)
|
||||
}
|
||||
|
||||
|
||||
/// Resets the block segment to another position of the postings
|
||||
/// file.
|
||||
///
|
||||
/// This is useful for enumerating through a list of terms,
|
||||
/// and consuming the associated posting lists while avoiding
|
||||
/// reallocating a `BlockSegmentPostings`.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// This does not reset the positions list.
|
||||
pub fn reset_block_postings_from_terminfo<'a>(&'a self,
|
||||
term_info: &TermInfo,
|
||||
block_postings: &mut BlockSegmentPostings<'a>) {
|
||||
let offset = term_info.postings_offset as usize;
|
||||
let postings_data: &'a [u8] = &self.postings_data[offset..];
|
||||
block_postings.reset(term_info.doc_freq as usize, postings_data);
|
||||
}
|
||||
|
||||
/// Returns the term info associated with the term.
|
||||
pub fn get_term_info(&self, term: &Term) -> Option<TermInfo> {
|
||||
self.terms.get(term.as_slice())
|
||||
}
|
||||
|
||||
/// Returns the segment id
|
||||
pub fn segment_id(&self) -> SegmentId {
|
||||
self.segment_id
|
||||
|
||||
@@ -39,11 +39,11 @@ impl<T: BinarySerializable> LayerBuilder<T> {
|
||||
doc_id.serialize(&mut self.buffer)?;
|
||||
value.serialize(&mut self.buffer)?;
|
||||
Ok(if self.remaining == 0 {
|
||||
self.remaining = self.period;
|
||||
Some((doc_id, offset))
|
||||
} else {
|
||||
None
|
||||
})
|
||||
self.remaining = self.period;
|
||||
Some((doc_id, offset))
|
||||
} else {
|
||||
None
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,8 +78,10 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
|
||||
loop {
|
||||
skip_pointer = match skip_pointer {
|
||||
Some((skip_doc_id, skip_offset)) => {
|
||||
try!(self.get_skip_layer(layer_id)
|
||||
.insert(skip_doc_id, &skip_offset))
|
||||
try!(self.get_skip_layer(layer_id).insert(
|
||||
skip_doc_id,
|
||||
&skip_offset,
|
||||
))
|
||||
}
|
||||
None => {
|
||||
return Ok(());
|
||||
|
||||
@@ -68,9 +68,14 @@ pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
|
||||
};
|
||||
let table_num_bits: usize = (1..)
|
||||
.into_iter()
|
||||
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
|
||||
.take_while(|num_bits: &usize| {
|
||||
compute_table_size(*num_bits) < table_size_limit
|
||||
})
|
||||
.last()
|
||||
.expect(&format!("Per thread memory is too small: {}", per_thread_memory_budget));
|
||||
.expect(&format!(
|
||||
"Per thread memory is too small: {}",
|
||||
per_thread_memory_budget
|
||||
));
|
||||
let table_size = compute_table_size(table_num_bits);
|
||||
let heap_size = per_thread_memory_budget - table_size;
|
||||
(heap_size, table_num_bits)
|
||||
@@ -174,13 +179,10 @@ impl<'a> HashMap<'a> {
|
||||
}
|
||||
|
||||
pub fn iter<'b: 'a>(&'b self) -> impl Iterator<Item = (&'a [u8], u32)> + 'b {
|
||||
self.occupied
|
||||
.iter()
|
||||
.cloned()
|
||||
.map(move |bucket: usize| {
|
||||
let kv = self.table[bucket];
|
||||
self.get_key_value(kv.key_value_addr)
|
||||
})
|
||||
self.occupied.iter().cloned().map(move |bucket: usize| {
|
||||
let kv = self.table[bucket];
|
||||
self.get_key_value(kv.key_value_addr)
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -282,8 +284,10 @@ mod tests {
|
||||
let s1 = "abcdef";
|
||||
let s2 = "abcdeg";
|
||||
for i in 0..5 {
|
||||
assert_eq!(murmurhash2(&s1[i..5].as_bytes()),
|
||||
murmurhash2(&s2[i..5].as_bytes()));
|
||||
assert_eq!(
|
||||
murmurhash2(&s1[i..5].as_bytes()),
|
||||
murmurhash2(&s2[i..5].as_bytes())
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -303,13 +307,13 @@ mod tests {
|
||||
let keys: Vec<&'static str> =
|
||||
vec!["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
|
||||
b.iter(|| {
|
||||
keys.iter()
|
||||
.map(|&s| s.as_bytes())
|
||||
.map(murmurhash2::murmurhash2)
|
||||
.map(|h| h as u64)
|
||||
.last()
|
||||
.unwrap()
|
||||
});
|
||||
keys.iter()
|
||||
.map(|&s| s.as_bytes())
|
||||
.map(murmurhash2::murmurhash2)
|
||||
.map(|h| h as u64)
|
||||
.last()
|
||||
.unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -144,7 +144,8 @@ impl InnerHeap {
|
||||
addr
|
||||
} else {
|
||||
if self.next_heap.is_none() {
|
||||
info!(r#"Exceeded heap size. The segment will be committed right after indexing this document."#,);
|
||||
info!(r#"Exceeded heap size. The segment will be committed right
|
||||
after indexing this document."#,);
|
||||
self.next_heap = Some(Box::new(InnerHeap::with_capacity(self.buffer_len as usize)));
|
||||
}
|
||||
self.next_heap.as_mut().unwrap().allocate_space(num_bytes) + self.buffer_len
|
||||
@@ -154,10 +155,9 @@ impl InnerHeap {
|
||||
fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
|
||||
let start = bytes_ref.0;
|
||||
if start >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.get_slice(BytesRef(start - self.buffer_len))
|
||||
self.next_heap.as_ref().unwrap().get_slice(BytesRef(
|
||||
start - self.buffer_len,
|
||||
))
|
||||
} else {
|
||||
let start = start as usize;
|
||||
let len = NativeEndian::read_u16(&self.buffer[start..start + 2]) as usize;
|
||||
@@ -167,10 +167,10 @@ impl InnerHeap {
|
||||
|
||||
fn get_mut_slice(&mut self, start: u32, stop: u32) -> &mut [u8] {
|
||||
if start >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.get_mut_slice(start - self.buffer_len, stop - self.buffer_len)
|
||||
self.next_heap.as_mut().unwrap().get_mut_slice(
|
||||
start - self.buffer_len,
|
||||
stop - self.buffer_len,
|
||||
)
|
||||
} else {
|
||||
&mut self.buffer[start as usize..stop as usize]
|
||||
}
|
||||
@@ -188,10 +188,9 @@ impl InnerHeap {
|
||||
|
||||
fn get_mut(&mut self, addr: u32) -> *mut u8 {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.get_mut(addr - self.buffer_len)
|
||||
self.next_heap.as_mut().unwrap().get_mut(
|
||||
addr - self.buffer_len,
|
||||
)
|
||||
} else {
|
||||
let addr_isize = addr as isize;
|
||||
unsafe { self.buffer.as_mut_ptr().offset(addr_isize) }
|
||||
@@ -200,10 +199,9 @@ impl InnerHeap {
|
||||
|
||||
fn get_mut_ref<Item>(&mut self, addr: u32) -> &mut Item {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.get_mut_ref(addr - self.buffer_len)
|
||||
self.next_heap.as_mut().unwrap().get_mut_ref(
|
||||
addr - self.buffer_len,
|
||||
)
|
||||
} else {
|
||||
let v_ptr_u8 = self.get_mut(addr) as *mut u8;
|
||||
let v_ptr = v_ptr_u8 as *mut Item;
|
||||
@@ -213,10 +211,10 @@ impl InnerHeap {
|
||||
|
||||
pub fn set<Item>(&mut self, addr: u32, val: &Item) {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.set(addr - self.buffer_len, val);
|
||||
self.next_heap.as_mut().unwrap().set(
|
||||
addr - self.buffer_len,
|
||||
val,
|
||||
);
|
||||
} else {
|
||||
let v_ptr: *const Item = val as *const Item;
|
||||
let v_ptr_u8: *const u8 = v_ptr as *const u8;
|
||||
|
||||
@@ -103,9 +103,11 @@ impl fmt::Display for OpenWriteError {
|
||||
write!(f, "the file '{:?}' already exists", path)
|
||||
}
|
||||
OpenWriteError::IOError(ref err) => {
|
||||
write!(f,
|
||||
"an io error occurred while opening a file for writing: '{}'",
|
||||
err)
|
||||
write!(
|
||||
f,
|
||||
"an io error occurred while opening a file for writing: '{}'",
|
||||
err
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -147,9 +149,11 @@ impl fmt::Display for OpenReadError {
|
||||
write!(f, "the file '{:?}' does not exist", path)
|
||||
}
|
||||
OpenReadError::IOError(ref err) => {
|
||||
write!(f,
|
||||
"an io error occurred while opening a file for reading: '{}'",
|
||||
err)
|
||||
write!(
|
||||
f,
|
||||
"an io error occurred while opening a file for reading: '{}'",
|
||||
err
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -45,10 +45,9 @@ pub struct FileProtection {
|
||||
}
|
||||
|
||||
fn unprotect_file_from_delete(directory: &ManagedDirectory, path: &Path) {
|
||||
let mut meta_informations_wlock = directory
|
||||
.meta_informations
|
||||
.write()
|
||||
.expect("Managed file lock poisoned");
|
||||
let mut meta_informations_wlock = directory.meta_informations.write().expect(
|
||||
"Managed file lock poisoned",
|
||||
);
|
||||
if let Some(counter_ref_mut) = meta_informations_wlock.protected_files.get_mut(path) {
|
||||
(*counter_ref_mut) -= 1;
|
||||
}
|
||||
@@ -68,9 +67,10 @@ impl Drop for FileProtection {
|
||||
|
||||
/// Saves the file containing the list of existing files
|
||||
/// that were created by tantivy.
|
||||
fn save_managed_paths(directory: &mut Directory,
|
||||
wlock: &RwLockWriteGuard<MetaInformation>)
|
||||
-> io::Result<()> {
|
||||
fn save_managed_paths(
|
||||
directory: &mut Directory,
|
||||
wlock: &RwLockWriteGuard<MetaInformation>,
|
||||
) -> io::Result<()> {
|
||||
let mut w = serde_json::to_vec(&wlock.managed_paths)?;
|
||||
write!(&mut w, "\n")?;
|
||||
directory.atomic_write(&MANAGED_FILEPATH, &w[..])?;
|
||||
@@ -84,22 +84,22 @@ impl ManagedDirectory {
|
||||
Ok(data) => {
|
||||
let managed_files_json = String::from_utf8_lossy(&data);
|
||||
let managed_files: HashSet<PathBuf> =
|
||||
serde_json::from_str(&managed_files_json)
|
||||
.chain_err(|| ErrorKind::CorruptedFile(MANAGED_FILEPATH.clone()))?;
|
||||
serde_json::from_str(&managed_files_json).chain_err(|| {
|
||||
ErrorKind::CorruptedFile(MANAGED_FILEPATH.clone())
|
||||
})?;
|
||||
Ok(ManagedDirectory {
|
||||
directory: box directory,
|
||||
meta_informations: Arc::new(RwLock::new(MetaInformation {
|
||||
managed_paths: managed_files,
|
||||
protected_files:
|
||||
HashMap::default(),
|
||||
})),
|
||||
})
|
||||
directory: box directory,
|
||||
meta_informations: Arc::new(RwLock::new(MetaInformation {
|
||||
managed_paths: managed_files,
|
||||
protected_files: HashMap::default(),
|
||||
})),
|
||||
})
|
||||
}
|
||||
Err(OpenReadError::FileDoesNotExist(_)) => {
|
||||
Ok(ManagedDirectory {
|
||||
directory: box directory,
|
||||
meta_informations: Arc::default(),
|
||||
})
|
||||
directory: box directory,
|
||||
meta_informations: Arc::default(),
|
||||
})
|
||||
}
|
||||
Err(OpenReadError::IOError(e)) => Err(From::from(e)),
|
||||
}
|
||||
@@ -116,15 +116,14 @@ impl ManagedDirectory {
|
||||
/// If a file cannot be deleted (for permission reasons for instance)
|
||||
/// an error is simply logged, and the file remains in the list of managed
|
||||
/// files.
|
||||
pub fn garbage_collect<L: FnOnce()-> HashSet<PathBuf> >(&mut self, get_living_files: L) {
|
||||
pub fn garbage_collect<L: FnOnce() -> HashSet<PathBuf>>(&mut self, get_living_files: L) {
|
||||
info!("Garbage collect");
|
||||
let mut files_to_delete = vec![];
|
||||
{
|
||||
// releasing the lock as .delete() will use it too.
|
||||
let meta_informations_rlock =
|
||||
self.meta_informations
|
||||
.read()
|
||||
.expect("Managed directory rlock poisoned in garbage collect.");
|
||||
let meta_informations_rlock = self.meta_informations.read().expect(
|
||||
"Managed directory rlock poisoned in garbage collect.",
|
||||
);
|
||||
|
||||
// It is crucial to get the living files after acquiring the
|
||||
// read lock of meta informations. That way, we
|
||||
@@ -177,9 +176,9 @@ impl ManagedDirectory {
|
||||
if !deleted_files.is_empty() {
|
||||
// update the list of managed files by removing
|
||||
// the file that were removed.
|
||||
let mut meta_informations_wlock = self.meta_informations
|
||||
.write()
|
||||
.expect("Managed directory wlock poisoned (2).");
|
||||
let mut meta_informations_wlock = self.meta_informations.write().expect(
|
||||
"Managed directory wlock poisoned (2).",
|
||||
);
|
||||
{
|
||||
let managed_paths_write = &mut meta_informations_wlock.managed_paths;
|
||||
for delete_file in &deleted_files {
|
||||
@@ -202,13 +201,13 @@ impl ManagedDirectory {
|
||||
pub fn protect_file_from_delete(&self, path: &Path) -> FileProtection {
|
||||
let pathbuf = path.to_owned();
|
||||
{
|
||||
let mut meta_informations_wlock = self.meta_informations
|
||||
.write()
|
||||
.expect("Managed file lock poisoned on protect");
|
||||
let mut meta_informations_wlock = self.meta_informations.write().expect(
|
||||
"Managed file lock poisoned on protect",
|
||||
);
|
||||
*meta_informations_wlock
|
||||
.protected_files
|
||||
.entry(pathbuf.clone())
|
||||
.or_insert(0) += 1;
|
||||
.protected_files
|
||||
.entry(pathbuf.clone())
|
||||
.or_insert(0) += 1;
|
||||
}
|
||||
FileProtection {
|
||||
directory: self.clone(),
|
||||
@@ -224,9 +223,9 @@ impl ManagedDirectory {
|
||||
/// will not lead to garbage files that will
|
||||
/// never get removed.
|
||||
fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> {
|
||||
let mut meta_wlock = self.meta_informations
|
||||
.write()
|
||||
.expect("Managed file lock poisoned");
|
||||
let mut meta_wlock = self.meta_informations.write().expect(
|
||||
"Managed file lock poisoned",
|
||||
);
|
||||
let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned());
|
||||
if has_changed {
|
||||
save_managed_paths(self.directory.as_mut(), &meta_wlock)?;
|
||||
@@ -241,8 +240,9 @@ impl Directory for ManagedDirectory {
|
||||
}
|
||||
|
||||
fn open_write(&mut self, path: &Path) -> result::Result<WritePtr, OpenWriteError> {
|
||||
self.register_file_as_managed(path)
|
||||
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
|
||||
self.register_file_as_managed(path).map_err(|e| {
|
||||
IOError::with_path(path.to_owned(), e)
|
||||
})?;
|
||||
self.directory.open_write(path)
|
||||
}
|
||||
|
||||
@@ -257,9 +257,9 @@ impl Directory for ManagedDirectory {
|
||||
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
{
|
||||
let metas_rlock = self.meta_informations
|
||||
.read()
|
||||
.expect("poisoned lock in managed directory meta");
|
||||
let metas_rlock = self.meta_informations.read().expect(
|
||||
"poisoned lock in managed directory meta",
|
||||
);
|
||||
if let Some(counter) = metas_rlock.protected_files.get(path) {
|
||||
if *counter > 0 {
|
||||
return Err(DeleteError::FileProtected(path.to_owned()));
|
||||
@@ -327,7 +327,7 @@ mod tests {
|
||||
{
|
||||
let living_files: HashSet<PathBuf> =
|
||||
[TEST_PATH1.to_owned()].into_iter().cloned().collect();
|
||||
managed_directory.garbage_collect(|| { living_files });
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
}
|
||||
{
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
@@ -343,7 +343,7 @@ mod tests {
|
||||
}
|
||||
{
|
||||
let living_files: HashSet<PathBuf> = HashSet::new();
|
||||
managed_directory.garbage_collect(|| { living_files });
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
}
|
||||
{
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
@@ -366,7 +366,7 @@ mod tests {
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
|
||||
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
|
||||
managed_directory.garbage_collect(|| { living_files.clone() });
|
||||
managed_directory.garbage_collect(|| living_files.clone());
|
||||
if cfg!(target_os = "windows") {
|
||||
// On Windows, the gc should try and fail to delete the file, as it is mmapped.
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
@@ -374,7 +374,7 @@ mod tests {
|
||||
drop(_mmap_read);
|
||||
// The file should still be in the list of managed files and
|
||||
// eventually be deleted once mmap is released.
|
||||
managed_directory.garbage_collect(|| { living_files });
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
} else {
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
@@ -398,11 +398,11 @@ mod tests {
|
||||
|
||||
{
|
||||
let _file_protection = managed_directory.protect_file_from_delete(*TEST_PATH1);
|
||||
managed_directory.garbage_collect(|| { living_files.clone() });
|
||||
managed_directory.garbage_collect(|| living_files.clone());
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
}
|
||||
|
||||
managed_directory.garbage_collect(|| { living_files.clone() });
|
||||
managed_directory.garbage_collect(|| living_files.clone());
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
|
||||
|
||||
|
||||
@@ -24,15 +24,17 @@ use std::sync::Weak;
|
||||
use tempdir::TempDir;
|
||||
|
||||
fn open_mmap(full_path: &PathBuf) -> result::Result<Option<Arc<Mmap>>, OpenReadError> {
|
||||
let file = File::open(&full_path)
|
||||
.map_err(|e| if e.kind() == io::ErrorKind::NotFound {
|
||||
OpenReadError::FileDoesNotExist(full_path.clone())
|
||||
} else {
|
||||
OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e))
|
||||
})?;
|
||||
let file = File::open(&full_path).map_err(|e| if e.kind() ==
|
||||
io::ErrorKind::NotFound
|
||||
{
|
||||
OpenReadError::FileDoesNotExist(full_path.clone())
|
||||
} else {
|
||||
OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e))
|
||||
})?;
|
||||
|
||||
let meta_data = file.metadata()
|
||||
.map_err(|e| IOError::with_path(full_path.to_owned(), e))?;
|
||||
let meta_data = file.metadata().map_err(|e| {
|
||||
IOError::with_path(full_path.to_owned(), e)
|
||||
})?;
|
||||
if meta_data.len() == 0 {
|
||||
// if the file size is 0, it will not be possible
|
||||
// to mmap the file, so we return an anonymous mmap_cache
|
||||
@@ -46,7 +48,7 @@ fn open_mmap(full_path: &PathBuf) -> result::Result<Option<Arc<Mmap>>, OpenReadE
|
||||
|
||||
}
|
||||
|
||||
#[derive(Default,Clone,Debug,Serialize,Deserialize)]
|
||||
#[derive(Default, Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct CacheCounters {
|
||||
// Number of times the cache prevented a call to `mmap`
|
||||
pub hit: usize,
|
||||
@@ -58,7 +60,7 @@ pub struct CacheCounters {
|
||||
pub miss_weak: usize,
|
||||
}
|
||||
|
||||
#[derive(Clone,Debug,Serialize,Deserialize)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct CacheInfo {
|
||||
pub counters: CacheCounters,
|
||||
pub mmapped: Vec<PathBuf>,
|
||||
@@ -113,31 +115,31 @@ impl MmapCache {
|
||||
self.cleanup();
|
||||
}
|
||||
Ok(match self.cache.entry(full_path.clone()) {
|
||||
HashMapEntry::Occupied(mut occupied_entry) => {
|
||||
if let Some(mmap_arc) = occupied_entry.get().upgrade() {
|
||||
self.counters.hit += 1;
|
||||
Some(mmap_arc.clone())
|
||||
} else {
|
||||
// The entry exists but the weak ref has been destroyed.
|
||||
self.counters.miss_weak += 1;
|
||||
if let Some(mmap_arc) = open_mmap(&full_path)? {
|
||||
occupied_entry.insert(Arc::downgrade(&mmap_arc));
|
||||
Some(mmap_arc)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
HashMapEntry::Vacant(vacant_entry) => {
|
||||
self.counters.miss_empty += 1;
|
||||
if let Some(mmap_arc) = open_mmap(&full_path)? {
|
||||
vacant_entry.insert(Arc::downgrade(&mmap_arc));
|
||||
Some(mmap_arc)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
})
|
||||
HashMapEntry::Occupied(mut occupied_entry) => {
|
||||
if let Some(mmap_arc) = occupied_entry.get().upgrade() {
|
||||
self.counters.hit += 1;
|
||||
Some(mmap_arc.clone())
|
||||
} else {
|
||||
// The entry exists but the weak ref has been destroyed.
|
||||
self.counters.miss_weak += 1;
|
||||
if let Some(mmap_arc) = open_mmap(&full_path)? {
|
||||
occupied_entry.insert(Arc::downgrade(&mmap_arc));
|
||||
Some(mmap_arc)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
HashMapEntry::Vacant(vacant_entry) => {
|
||||
self.counters.miss_empty += 1;
|
||||
if let Some(mmap_arc) = open_mmap(&full_path)? {
|
||||
vacant_entry.insert(Arc::downgrade(&mmap_arc));
|
||||
Some(mmap_arc)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -180,15 +182,19 @@ impl MmapDirectory {
|
||||
/// exist or if it is not a directory.
|
||||
pub fn open(directory_path: &Path) -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
if !directory_path.exists() {
|
||||
Err(OpenDirectoryError::DoesNotExist(PathBuf::from(directory_path)))
|
||||
Err(OpenDirectoryError::DoesNotExist(
|
||||
PathBuf::from(directory_path),
|
||||
))
|
||||
} else if !directory_path.is_dir() {
|
||||
Err(OpenDirectoryError::NotADirectory(PathBuf::from(directory_path)))
|
||||
Err(OpenDirectoryError::NotADirectory(
|
||||
PathBuf::from(directory_path),
|
||||
))
|
||||
} else {
|
||||
Ok(MmapDirectory {
|
||||
root_path: PathBuf::from(directory_path),
|
||||
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
|
||||
_temp_directory: Arc::new(None),
|
||||
})
|
||||
root_path: PathBuf::from(directory_path),
|
||||
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
|
||||
_temp_directory: Arc::new(None),
|
||||
})
|
||||
}
|
||||
}
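// Illustrative usage (not part of this commit), with a hypothetical path:
//
// let mmap_directory = MmapDirectory::open(Path::new("/tmp/my-index"))?;
// // Fails with `OpenDirectoryError::DoesNotExist` or `::NotADirectory` otherwise.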
|
||||
|
||||
@@ -215,9 +221,9 @@ impl MmapDirectory {
|
||||
use std::os::windows::fs::OpenOptionsExt;
|
||||
use winapi::winbase;
|
||||
|
||||
open_opts
|
||||
.write(true)
|
||||
.custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
|
||||
open_opts.write(true).custom_flags(
|
||||
winbase::FILE_FLAG_BACKUP_SEMANTICS,
|
||||
);
|
||||
}
|
||||
|
||||
let fd = try!(open_opts.open(&self.root_path));
|
||||
@@ -270,46 +276,50 @@ impl Directory for MmapDirectory {
|
||||
debug!("Open Read {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
|
||||
let mut mmap_cache = self.mmap_cache
|
||||
.write()
|
||||
.map_err(|_| {
|
||||
let msg = format!("Failed to acquired write lock \
|
||||
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
|
||||
let msg = format!(
|
||||
"Failed to acquired write lock \
|
||||
on mmap cache while reading {:?}",
|
||||
path);
|
||||
IOError::with_path(path.to_owned(), make_io_err(msg))
|
||||
})?;
|
||||
path
|
||||
);
|
||||
IOError::with_path(path.to_owned(), make_io_err(msg))
|
||||
})?;
|
||||
|
||||
Ok(mmap_cache
|
||||
.get_mmap(full_path)?
|
||||
.map(MmapReadOnly::from)
|
||||
.map(ReadOnlySource::Mmap)
|
||||
.unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty())))
|
||||
Ok(
|
||||
mmap_cache
|
||||
.get_mmap(full_path)?
|
||||
.map(MmapReadOnly::from)
|
||||
.map(ReadOnlySource::Mmap)
|
||||
.unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty())),
|
||||
)
|
||||
}
|
||||
|
||||
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
|
||||
debug!("Open Write {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
|
||||
let open_res = OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true)
|
||||
.open(full_path);
|
||||
let open_res = OpenOptions::new().write(true).create_new(true).open(
|
||||
full_path,
|
||||
);
|
||||
|
||||
let mut file = open_res
|
||||
.map_err(|err| if err.kind() == io::ErrorKind::AlreadyExists {
|
||||
OpenWriteError::FileAlreadyExists(path.to_owned())
|
||||
} else {
|
||||
IOError::with_path(path.to_owned(), err).into()
|
||||
})?;
|
||||
let mut file = open_res.map_err(|err| if err.kind() ==
|
||||
io::ErrorKind::AlreadyExists
|
||||
{
|
||||
OpenWriteError::FileAlreadyExists(path.to_owned())
|
||||
} else {
|
||||
IOError::with_path(path.to_owned(), err).into()
|
||||
})?;
|
||||
|
||||
// making sure the file is created.
|
||||
file.flush()
|
||||
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
|
||||
file.flush().map_err(
|
||||
|e| IOError::with_path(path.to_owned(), e),
|
||||
)?;
|
||||
|
||||
// Apparently, on some filesystems, syncing the parent
// directory is required.
|
||||
self.sync_directory()
|
||||
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
|
||||
self.sync_directory().map_err(|e| {
|
||||
IOError::with_path(path.to_owned(), e)
|
||||
})?;
|
||||
|
||||
let writer = SafeFileWriter::new(file);
|
||||
Ok(BufWriter::new(Box::new(writer)))
|
||||
@@ -318,22 +328,23 @@ impl Directory for MmapDirectory {
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
debug!("Deleting file {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
let mut mmap_cache = self.mmap_cache
|
||||
.write()
|
||||
.map_err(|_| {
|
||||
let msg = format!("Failed to acquired write lock \
|
||||
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
|
||||
let msg = format!(
|
||||
"Failed to acquired write lock \
|
||||
on mmap cache while deleting {:?}",
|
||||
path);
|
||||
IOError::with_path(path.to_owned(), make_io_err(msg))
|
||||
})?;
|
||||
path
|
||||
);
|
||||
IOError::with_path(path.to_owned(), make_io_err(msg))
|
||||
})?;
|
||||
// Removing the entry in the MMap cache.
|
||||
// The munmap will appear on Drop,
|
||||
// when the last reference is gone.
|
||||
mmap_cache.cache.remove(&full_path);
|
||||
match fs::remove_file(&full_path) {
|
||||
Ok(_) => {
|
||||
self.sync_directory()
|
||||
.map_err(|e| IOError::with_path(path.to_owned(), e).into())
|
||||
self.sync_directory().map_err(|e| {
|
||||
IOError::with_path(path.to_owned(), e).into()
|
||||
})
|
||||
}
|
||||
Err(e) => {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
@@ -355,8 +366,9 @@ impl Directory for MmapDirectory {
|
||||
let mut buffer = Vec::new();
|
||||
match File::open(&full_path) {
|
||||
Ok(mut file) => {
|
||||
file.read_to_end(&mut buffer)
|
||||
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
|
||||
file.read_to_end(&mut buffer).map_err(|e| {
|
||||
IOError::with_path(path.to_owned(), e)
|
||||
})?;
|
||||
Ok(buffer)
|
||||
}
|
||||
Err(e) => {
|
||||
|
||||
@@ -13,14 +13,15 @@ mod managed_directory;
|
||||
/// Errors specific to the directory module.
|
||||
pub mod error;
|
||||
|
||||
use std::io::{Write, Seek};
|
||||
use std::io::{Write, Seek, BufWriter};
|
||||
|
||||
use std::io::BufWriter;
|
||||
pub use self::read_only_source::ReadOnlySource;
|
||||
pub use self::directory::Directory;
|
||||
pub use self::ram_directory::RAMDirectory;
|
||||
pub use self::mmap_directory::MmapDirectory;
|
||||
pub use self::managed_directory::{ManagedDirectory, FileProtection};
|
||||
|
||||
pub(crate) use self::read_only_source::SourceRead;
|
||||
pub(crate) use self::managed_directory::{ManagedDirectory, FileProtection};
|
||||
|
||||
/// Synonym of Seek + Write
|
||||
pub trait SeekableWrite: Seek + Write {}
|
||||
|
||||
@@ -41,8 +41,10 @@ impl VecWriter {
|
||||
impl Drop for VecWriter {
|
||||
fn drop(&mut self) {
|
||||
if !self.is_flushed {
|
||||
panic!("You forgot to flush {:?} before its writter got Drop. Do not rely on drop.",
|
||||
self.path)
|
||||
panic!(
|
||||
"You forgot to flush {:?} before its writter got Drop. Do not rely on drop.",
|
||||
self.path
|
||||
)
|
||||
}
|
||||
}
|
||||
}
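// In other words (illustrative sketch, hypothetical path): a writer obtained
// through `RAMDirectory::open_write` must be flushed explicitly before it is dropped,
// since `BufWriter`'s own `Drop` does not flush the underlying `VecWriter`.
//
// use std::io::Write;
// let mut writer = ram_directory.open_write(Path::new("segment.idx"))?;
// writer.write_all(b"...")?;
// writer.flush()?; // without this, dropping the writer panics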
|
||||
@@ -62,8 +64,10 @@ impl Write for VecWriter {
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
self.is_flushed = true;
|
||||
try!(self.shared_directory
|
||||
.write(self.path.clone(), self.data.get_ref()));
|
||||
try!(self.shared_directory.write(
|
||||
self.path.clone(),
|
||||
self.data.get_ref(),
|
||||
));
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -79,11 +83,11 @@ impl InnerDirectory {
|
||||
}
|
||||
|
||||
fn write(&self, path: PathBuf, data: &[u8]) -> io::Result<bool> {
|
||||
let mut map = try!(self.0
|
||||
.write()
|
||||
.map_err(|_| {
|
||||
make_io_err(format!("Failed to lock the directory, when trying to write {:?}",
|
||||
path))
|
||||
let mut map = try!(self.0.write().map_err(|_| {
|
||||
make_io_err(format!(
|
||||
"Failed to lock the directory, when trying to write {:?}",
|
||||
path
|
||||
))
|
||||
}));
|
||||
let prev_value = map.insert(path, Arc::new(Vec::from(data)));
|
||||
Ok(prev_value.is_some())
|
||||
@@ -93,17 +97,21 @@ impl InnerDirectory {
|
||||
self.0
|
||||
.read()
|
||||
.map_err(|_| {
|
||||
let msg = format!("Failed to acquire read lock for the \
|
||||
let msg = format!(
|
||||
"Failed to acquire read lock for the \
|
||||
directory when trying to read {:?}",
|
||||
path);
|
||||
let io_err = make_io_err(msg);
|
||||
OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err))
|
||||
})
|
||||
path
|
||||
);
|
||||
let io_err = make_io_err(msg);
|
||||
OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err))
|
||||
})
|
||||
.and_then(|readable_map| {
|
||||
readable_map
|
||||
.get(path)
|
||||
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
|
||||
.map(|data| ReadOnlySource::Anonymous(SharedVecSlice::new(data.clone())))
|
||||
.map(|data| {
|
||||
ReadOnlySource::Anonymous(SharedVecSlice::new(data.clone()))
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
@@ -111,16 +119,18 @@ impl InnerDirectory {
|
||||
self.0
|
||||
.write()
|
||||
.map_err(|_| {
|
||||
let msg = format!("Failed to acquire write lock for the \
|
||||
let msg = format!(
|
||||
"Failed to acquire write lock for the \
|
||||
directory when trying to delete {:?}",
|
||||
path);
|
||||
let io_err = make_io_err(msg);
|
||||
DeleteError::IOError(IOError::with_path(path.to_owned(), io_err))
|
||||
})
|
||||
path
|
||||
);
|
||||
let io_err = make_io_err(msg);
|
||||
DeleteError::IOError(IOError::with_path(path.to_owned(), io_err))
|
||||
})
|
||||
.and_then(|mut writable_map| match writable_map.remove(path) {
|
||||
Some(_) => Ok(()),
|
||||
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
|
||||
})
|
||||
Some(_) => Ok(()),
|
||||
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
|
||||
})
|
||||
}
|
||||
|
||||
fn exists(&self, path: &Path) -> bool {
|
||||
@@ -164,9 +174,11 @@ impl Directory for RAMDirectory {
|
||||
let path_buf = PathBuf::from(path);
|
||||
let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
|
||||
|
||||
let exists = self.fs
|
||||
.write(path_buf.clone(), &Vec::new())
|
||||
.map_err(|err| IOError::with_path(path.to_owned(), err))?;
|
||||
let exists = self.fs.write(path_buf.clone(), &Vec::new()).map_err(
|
||||
|err| {
|
||||
IOError::with_path(path.to_owned(), err)
|
||||
},
|
||||
)?;
|
||||
|
||||
// force the creation of the file to mimic the MMap directory.
|
||||
if exists {
|
||||
|
||||
@@ -2,6 +2,8 @@ use fst::raw::MmapReadOnly;
|
||||
use std::ops::Deref;
|
||||
use super::shared_vec_slice::SharedVecSlice;
|
||||
use common::HasLen;
|
||||
use std::slice;
|
||||
use std::io::{self, Read};
|
||||
use stable_deref_trait::StableDeref;
|
||||
|
||||
/// Read object that represents files in tantivy.
|
||||
@@ -41,6 +43,14 @@ impl ReadOnlySource {
|
||||
}
|
||||
}
|
||||
|
||||
/// Splits into two `ReadOnlySource`s, at the offset given
/// as an argument.
|
||||
pub fn split(self, addr: usize) -> (ReadOnlySource, ReadOnlySource) {
|
||||
let left = self.slice(0, addr);
|
||||
let right = self.slice_from(addr);
|
||||
(left, right)
|
||||
}
|
||||
|
||||
/// Creates a ReadOnlySource that is just a
|
||||
/// view over a slice of the data.
|
||||
///
|
||||
@@ -62,6 +72,23 @@ impl ReadOnlySource {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Like `.slice(...)` but enforcing only the `from`
|
||||
/// boundary.
|
||||
///
|
||||
/// Equivalent to `.slice(from_offset, self.len())`
|
||||
pub fn slice_from(&self, from_offset: usize) -> ReadOnlySource {
|
||||
let len = self.len();
|
||||
self.slice(from_offset, len)
|
||||
}
|
||||
|
||||
/// Like `.slice(...)` but enforcing only the `to`
|
||||
/// boundary.
|
||||
///
|
||||
/// Equivalent to `.slice(0, to_offset)`
|
||||
pub fn slice_to(&self, to_offset: usize) -> ReadOnlySource {
|
||||
self.slice(0, to_offset)
|
||||
}
|
||||
}
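// Illustrative sketch (not part of this commit) of the slicing API, using the
// `From<Vec<u8>>` conversion defined below:
//
// let source = ReadOnlySource::from(vec![0u8, 1, 2, 3, 4, 5, 6, 7]);
// let (header, payload) = source.split(4);
// assert_eq!(header.len(), 4);
// assert_eq!(payload.as_slice(), &[4u8, 5, 6, 7]);
// // `slice_from(n)` and `slice_to(n)` are shorthands for
// // `slice(n, len)` and `slice(0, n)` respectively.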
|
||||
|
||||
impl HasLen for ReadOnlySource {
|
||||
@@ -82,3 +109,42 @@ impl From<Vec<u8>> for ReadOnlySource {
|
||||
ReadOnlySource::Anonymous(shared_data)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Acts as an owning cursor over the data backed by a `ReadOnlySource`.
|
||||
pub(crate) struct SourceRead {
|
||||
_data_owner: ReadOnlySource,
|
||||
cursor: &'static [u8],
|
||||
}
|
||||
|
||||
impl SourceRead {
|
||||
// Advance the cursor by a given number of bytes.
|
||||
pub fn advance(&mut self, len: usize) {
|
||||
self.cursor = &self.cursor[len..];
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<[u8]> for SourceRead {
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
self.cursor
|
||||
}
|
||||
}
|
||||
|
||||
impl From<ReadOnlySource> for SourceRead {
|
||||
// Creates a new `SourceRead` from a given `ReadOnlySource`
|
||||
fn from(source: ReadOnlySource) -> SourceRead {
|
||||
let len = source.len();
|
||||
let slice_ptr = source.as_slice().as_ptr();
|
||||
let static_slice = unsafe { slice::from_raw_parts(slice_ptr, len) };
|
||||
SourceRead {
|
||||
_data_owner: source,
|
||||
cursor: static_slice,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Read for SourceRead {
|
||||
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
||||
self.cursor.read(buf)
|
||||
}
|
||||
}
|
||||
|
||||
10
src/error.rs
10
src/error.rs
@@ -10,6 +10,7 @@ use schema;
|
||||
use fastfield::FastFieldNotAvailableError;
|
||||
use serde_json;
|
||||
|
||||
|
||||
error_chain!(
|
||||
errors {
|
||||
/// Path does not exist.
|
||||
@@ -111,12 +112,9 @@ impl From<schema::DocParsingError> for Error {
|
||||
impl From<OpenWriteError> for Error {
|
||||
fn from(error: OpenWriteError) -> Error {
|
||||
match error {
|
||||
OpenWriteError::FileAlreadyExists(filepath) => {
|
||||
ErrorKind::FileAlreadyExists(filepath)
|
||||
}
|
||||
OpenWriteError::IOError(io_error) => ErrorKind::IOError(io_error),
|
||||
}
|
||||
.into()
|
||||
OpenWriteError::FileAlreadyExists(filepath) => ErrorKind::FileAlreadyExists(filepath),
|
||||
OpenWriteError::IOError(io_error) => ErrorKind::IOError(io_error),
|
||||
}.into()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -32,7 +32,7 @@ mod delete;
|
||||
pub use self::delete::write_delete_bitset;
|
||||
pub use self::delete::DeleteBitSet;
|
||||
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
|
||||
pub use self::reader::{FastFieldsReader, U64FastFieldReader, I64FastFieldReader};
|
||||
pub use self::reader::{U64FastFieldReader, I64FastFieldReader};
|
||||
pub use self::reader::FastFieldReader;
|
||||
pub use self::serializer::FastFieldSerializer;
|
||||
pub use self::error::{Result, FastFieldNotAvailableError};
|
||||
@@ -51,6 +51,7 @@ mod tests {
|
||||
use fastfield::FastFieldReader;
|
||||
use rand::Rng;
|
||||
use rand::SeedableRng;
|
||||
use common::CompositeFile;
|
||||
use rand::XorShiftRng;
|
||||
|
||||
lazy_static! {
|
||||
@@ -84,7 +85,7 @@ mod tests {
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 13u64);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 14u64);
|
||||
@@ -94,12 +95,12 @@ mod tests {
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
assert_eq!(source.len(), 38 as usize);
|
||||
assert_eq!(source.len(), 35 as usize);
|
||||
}
|
||||
{
|
||||
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader =
|
||||
fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
let composite_file = CompositeFile::open(source).unwrap();
|
||||
let field_source = composite_file.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader = U64FastFieldReader::open(field_source);
|
||||
assert_eq!(fast_field_reader.get(0), 13u64);
|
||||
assert_eq!(fast_field_reader.get(1), 14u64);
|
||||
assert_eq!(fast_field_reader.get(2), 2u64);
|
||||
@@ -112,7 +113,7 @@ mod tests {
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 4u64);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 14_082_001u64);
|
||||
@@ -128,12 +129,12 @@ mod tests {
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
assert_eq!(source.len(), 63 as usize);
|
||||
assert_eq!(source.len(), 60 as usize);
|
||||
}
|
||||
{
|
||||
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
|
||||
let fast_fields_composite = CompositeFile::open(source).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader =
|
||||
fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
|
||||
assert_eq!(fast_field_reader.get(0), 4u64);
|
||||
assert_eq!(fast_field_reader.get(1), 14_082_001u64);
|
||||
assert_eq!(fast_field_reader.get(2), 3_052u64);
|
||||
@@ -154,7 +155,7 @@ mod tests {
|
||||
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for _ in 0..10_000 {
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 100_000u64);
|
||||
@@ -164,12 +165,12 @@ mod tests {
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
assert_eq!(source.len(), 36 as usize);
|
||||
assert_eq!(source.len(), 33 as usize);
|
||||
}
|
||||
{
|
||||
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
|
||||
let fast_fields_composite = CompositeFile::open(source).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader =
|
||||
fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
|
||||
for doc in 0..10_000 {
|
||||
assert_eq!(fast_field_reader.get(doc), 100_000u64);
|
||||
}
|
||||
@@ -183,30 +184,35 @@ mod tests {
|
||||
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
// forcing the amplitude to be high
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 0u64);
|
||||
for i in 0u64..10_000u64 {
|
||||
add_single_field_doc(&mut fast_field_writers,
|
||||
*FIELD,
|
||||
5_000_000_000_000_000_000u64 + i);
|
||||
add_single_field_doc(
|
||||
&mut fast_field_writers,
|
||||
*FIELD,
|
||||
5_000_000_000_000_000_000u64 + i,
|
||||
);
|
||||
}
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
assert_eq!(source.len(), 80044 as usize);
|
||||
assert_eq!(source.len(), 80041 as usize);
|
||||
}
|
||||
{
|
||||
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
|
||||
let fast_fields_composite = CompositeFile::open(source).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader =
|
||||
fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
|
||||
|
||||
assert_eq!(fast_field_reader.get(0), 0u64);
|
||||
for doc in 1..10_001 {
|
||||
assert_eq!(fast_field_reader.get(doc),
|
||||
5_000_000_000_000_000_000u64 + doc as u64 - 1u64);
|
||||
assert_eq!(
|
||||
fast_field_reader.get(doc),
|
||||
5_000_000_000_000_000_000u64 + doc as u64 - 1u64
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -221,7 +227,7 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
|
||||
for i in -100i64..10_000i64 {
|
||||
let mut doc = Document::default();
|
||||
@@ -233,12 +239,13 @@ mod tests {
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
assert_eq!(source.len(), 17711 as usize);
|
||||
assert_eq!(source.len(), 17708 as usize);
|
||||
}
|
||||
{
|
||||
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
|
||||
let fast_fields_composite = CompositeFile::open(source).unwrap();
|
||||
let fast_field_reader: I64FastFieldReader =
|
||||
fast_field_readers.open_reader(i64_field).unwrap();
|
||||
I64FastFieldReader::open(fast_fields_composite.open_read(i64_field).unwrap());
|
||||
|
||||
assert_eq!(fast_field_reader.min_value(), -100i64);
|
||||
assert_eq!(fast_field_reader.max_value(), 9_999i64);
|
||||
for (doc, i) in (-100i64..10_000i64).enumerate() {
|
||||
@@ -262,7 +269,7 @@ mod tests {
|
||||
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
|
||||
let doc = Document::default();
|
||||
fast_field_writers.add_document(&doc);
|
||||
@@ -272,9 +279,10 @@ mod tests {
|
||||
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
|
||||
|
||||
let fast_fields_composite = CompositeFile::open(source).unwrap();
|
||||
let fast_field_reader: I64FastFieldReader =
|
||||
fast_field_readers.open_reader(i64_field).unwrap();
|
||||
I64FastFieldReader::open(fast_fields_composite.open_read(i64_field).unwrap());
|
||||
assert_eq!(fast_field_reader.get(0u32), 0i64);
|
||||
}
|
||||
}
|
||||
@@ -295,7 +303,7 @@ mod tests {
|
||||
let mut directory = RAMDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for x in &permutation {
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
|
||||
@@ -305,9 +313,10 @@ mod tests {
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
|
||||
let fast_fields_composite = CompositeFile::open(source).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader =
|
||||
fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
|
||||
|
||||
let mut a = 0u64;
|
||||
for _ in 0..n {
|
||||
assert_eq!(fast_field_reader.get(a as u32), permutation[a as usize]);
|
||||
@@ -333,13 +342,13 @@ mod tests {
|
||||
fn bench_intfastfield_veclookup(b: &mut Bencher) {
|
||||
let permutation = generate_permutation();
|
||||
b.iter(|| {
|
||||
let n = test::black_box(1000u32);
|
||||
let mut a = 0u64;
|
||||
for _ in 0u32..n {
|
||||
a = permutation[a as usize];
|
||||
}
|
||||
a
|
||||
});
|
||||
let n = test::black_box(1000u32);
|
||||
let mut a = 0u64;
|
||||
for _ in 0u32..n {
|
||||
a = permutation[a as usize];
|
||||
}
|
||||
a
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
@@ -349,7 +358,7 @@ mod tests {
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for x in &permutation {
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
|
||||
@@ -359,9 +368,11 @@ mod tests {
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
|
||||
let fast_fields_composite = CompositeFile::open(source).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader =
|
||||
fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
|
||||
|
||||
|
||||
b.iter(|| {
|
||||
let n = test::black_box(7000u32);
|
||||
let mut a = 0u64;
|
||||
@@ -380,7 +391,7 @@ mod tests {
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for x in &permutation {
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
|
||||
@@ -390,17 +401,18 @@ mod tests {
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
let fast_field_readers = FastFieldsReader::from_source(source).unwrap();
|
||||
let fast_fields_composite = CompositeFile::open(source).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader =
|
||||
fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
|
||||
|
||||
b.iter(|| {
|
||||
let n = test::black_box(1000u32);
|
||||
let mut a = 0u32;
|
||||
for _ in 0u32..n {
|
||||
a = fast_field_reader.get(a) as u32;
|
||||
}
|
||||
a
|
||||
});
|
||||
let n = test::black_box(1000u32);
|
||||
let mut a = 0u32;
|
||||
for _ in 0u32..n {
|
||||
a = fast_field_reader.get(a) as u32;
|
||||
}
|
||||
a
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,20 +1,15 @@
|
||||
use std::io;
|
||||
use std::collections::HashMap;
|
||||
use directory::ReadOnlySource;
|
||||
use common::BinarySerializable;
|
||||
use common::{self, BinarySerializable};
|
||||
use common::bitpacker::{compute_num_bits, BitUnpacker};
|
||||
use DocId;
|
||||
use schema::{Field, SchemaBuilder};
|
||||
use schema::SchemaBuilder;
|
||||
use std::path::Path;
|
||||
use schema::FAST;
|
||||
use directory::{WritePtr, RAMDirectory, Directory};
|
||||
use fastfield::FastFieldSerializer;
|
||||
use fastfield::FastFieldsWriter;
|
||||
use common::bitpacker::compute_num_bits;
|
||||
use common::bitpacker::BitUnpacker;
|
||||
use fastfield::{FastFieldSerializer, FastFieldsWriter};
|
||||
use schema::FieldType;
|
||||
use error::ResultExt;
|
||||
use std::mem;
|
||||
use common;
|
||||
use common::CompositeFile;
|
||||
use owning_ref::OwningRef;
|
||||
|
||||
/// Trait for accessing a fastfield.
|
||||
@@ -111,10 +106,10 @@ impl FastFieldReader for U64FastFieldReader {
|
||||
let amplitude: u64;
|
||||
{
|
||||
let mut cursor = data.as_slice();
|
||||
min_value = u64::deserialize(&mut cursor)
|
||||
.expect("Failed to read the min_value of fast field.");
|
||||
amplitude = u64::deserialize(&mut cursor)
|
||||
.expect("Failed to read the amplitude of fast field.");
|
||||
min_value =
|
||||
u64::deserialize(&mut cursor).expect("Failed to read the min_value of fast field.");
|
||||
amplitude =
|
||||
u64::deserialize(&mut cursor).expect("Failed to read the amplitude of fast field.");
|
||||
|
||||
}
|
||||
let max_value = min_value + amplitude;
|
||||
@@ -135,33 +130,36 @@ impl From<Vec<u64>> for U64FastFieldReader {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let field = schema_builder.add_u64_field("field", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let path = Path::new("test");
|
||||
let path = Path::new("__dummy__");
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let write: WritePtr = directory.open_write(path).expect(
|
||||
"With a RAMDirectory, this should never fail.",
|
||||
);
|
||||
let mut serializer = FastFieldSerializer::from_write(write).expect(
|
||||
"With a RAMDirectory, this should never fail.",
|
||||
);
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
|
||||
for val in vals {
|
||||
let mut fast_field_writer = fast_field_writers.get_field_writer(field).unwrap();
|
||||
fast_field_writer.add_val(val);
|
||||
{
|
||||
let fast_field_writer = fast_field_writers.get_field_writer(field).expect(
|
||||
"With a RAMDirectory, this should never fail.",
|
||||
);
|
||||
for val in vals {
|
||||
fast_field_writer.add_val(val);
|
||||
}
|
||||
}
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
directory
|
||||
.open_read(path)
|
||||
.chain_err(|| "Failed to open the file")
|
||||
.and_then(|source| {
|
||||
FastFieldsReader::from_source(source)
|
||||
.chain_err(|| "Failed to read the file.")
|
||||
})
|
||||
.and_then(|ff_readers| {
|
||||
ff_readers
|
||||
.open_reader(field)
|
||||
.ok_or_else(|| "Failed to find the requested field".into())
|
||||
})
|
||||
.expect("This should never happen, please report.")
|
||||
|
||||
let source = directory.open_read(path).expect("Failed to open the file");
|
||||
let composite_file =
|
||||
CompositeFile::open(source).expect("Failed to read the composite file");
|
||||
|
||||
let field_source = composite_file.open_read(field).expect(
|
||||
"File component not found",
|
||||
);
|
||||
U64FastFieldReader::open(field_source)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -212,7 +210,7 @@ impl FastFieldReader for I64FastFieldReader {
|
||||
let output_u64: &mut [u64] = unsafe { mem::transmute(output) };
|
||||
self.underlying.get_range(start, output_u64);
|
||||
for mut_val in output_u64.iter_mut() {
|
||||
*mut_val ^= 1 << 63;
|
||||
*mut_val = common::u64_to_i64(*mut_val as u64) as u64;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -231,67 +229,3 @@ impl FastFieldReader for I64FastFieldReader {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// The `FastFieldsReader` is the datastructure containing
|
||||
/// all of the fast fields' data.
|
||||
///
|
||||
/// It contains a mapping that associated these fields to
|
||||
/// the proper slice in the fastfield reader file.
|
||||
pub struct FastFieldsReader {
|
||||
source: ReadOnlySource,
|
||||
field_offsets: HashMap<Field, (u32, u32)>,
|
||||
}
|
||||
|
||||
impl FastFieldsReader {
|
||||
/// Opens a `FastFieldsReader`
|
||||
///
|
||||
/// When opening the fast field reader, the
|
||||
/// the list of the offset is read (as a footer of the
|
||||
/// data file).
|
||||
pub fn from_source(source: ReadOnlySource) -> io::Result<FastFieldsReader> {
|
||||
let header_offset;
|
||||
let field_offsets: Vec<(Field, u32)>;
|
||||
{
|
||||
let buffer = source.as_slice();
|
||||
{
|
||||
let mut cursor = buffer;
|
||||
header_offset = u32::deserialize(&mut cursor)?;
|
||||
}
|
||||
{
|
||||
let mut cursor = &buffer[header_offset as usize..];
|
||||
field_offsets = Vec::deserialize(&mut cursor)?;
|
||||
}
|
||||
}
|
||||
let mut end_offsets: Vec<u32> = field_offsets.iter().map(|&(_, offset)| offset).collect();
|
||||
end_offsets.push(header_offset);
|
||||
let mut field_offsets_map: HashMap<Field, (u32, u32)> = HashMap::new();
|
||||
for (field_start_offsets, stop_offset) in
|
||||
field_offsets.iter().zip(end_offsets.iter().skip(1)) {
|
||||
let (field, start_offset) = *field_start_offsets;
|
||||
field_offsets_map.insert(field, (start_offset, *stop_offset));
|
||||
}
|
||||
Ok(FastFieldsReader {
|
||||
field_offsets: field_offsets_map,
|
||||
source: source,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns the u64 fast value reader if the field
|
||||
/// is a u64 field indexed as "fast".
|
||||
///
|
||||
/// Return None if the field is not a u64 field
|
||||
/// indexed with the fast option.
|
||||
///
|
||||
/// # Panics
|
||||
/// May panic if the index is corrupted.
|
||||
pub fn open_reader<FFReader: FastFieldReader>(&self, field: Field) -> Option<FFReader> {
|
||||
self.field_offsets
|
||||
.get(&field)
|
||||
.map(|&(start, stop)| {
|
||||
let field_source = self.source.slice(start as usize, stop as usize);
|
||||
FFReader::open(field_source)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,8 @@ use directory::WritePtr;
|
||||
use schema::Field;
|
||||
use common::bitpacker::{compute_num_bits, BitPacker};
|
||||
use common::CountingWriter;
|
||||
use std::io::{self, Write, Seek, SeekFrom};
|
||||
use common::CompositeWrite;
|
||||
use std::io::{self, Write};
|
||||
|
||||
/// `FastFieldSerializer` is in charge of serializing
|
||||
/// fastfields on disk.
|
||||
@@ -26,51 +27,61 @@ use std::io::{self, Write, Seek, SeekFrom};
|
||||
/// * `close_field()`
|
||||
/// * `close()`
|
||||
pub struct FastFieldSerializer {
|
||||
write: CountingWriter<WritePtr>,
|
||||
fields: Vec<(Field, u32)>,
|
||||
min_value: u64,
|
||||
field_open: bool,
|
||||
bit_packer: BitPacker,
|
||||
composite_write: CompositeWrite<WritePtr>,
|
||||
}
|
||||
|
||||
|
||||
impl FastFieldSerializer {
|
||||
/// Constructor
|
||||
pub fn new(write: WritePtr) -> io::Result<FastFieldSerializer> {
|
||||
pub fn from_write(write: WritePtr) -> io::Result<FastFieldSerializer> {
|
||||
// just making room for the pointer to header.
|
||||
let mut counting_writer = CountingWriter::wrap(write);
|
||||
0u32.serialize(&mut counting_writer)?;
|
||||
Ok(FastFieldSerializer {
|
||||
write: counting_writer,
|
||||
fields: Vec::new(),
|
||||
min_value: 0,
|
||||
field_open: false,
|
||||
bit_packer: BitPacker::new(0),
|
||||
})
|
||||
let composite_write = CompositeWrite::wrap(write);
|
||||
Ok(FastFieldSerializer { composite_write: composite_write })
|
||||
}
|
||||
|
||||
/// Start serializing a new u64 fast field
|
||||
pub fn new_u64_fast_field(&mut self,
|
||||
field: Field,
|
||||
min_value: u64,
|
||||
max_value: u64)
|
||||
-> io::Result<()> {
|
||||
if self.field_open {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "Previous field not closed"));
|
||||
}
|
||||
self.min_value = min_value;
|
||||
self.field_open = true;
|
||||
self.fields.push((field, self.write.written_bytes() as u32));
|
||||
let write = &mut self.write;
|
||||
pub fn new_u64_fast_field(
|
||||
&mut self,
|
||||
field: Field,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<FastSingleFieldSerializer<CountingWriter<WritePtr>>> {
|
||||
let field_write = self.composite_write.for_field(field);
|
||||
FastSingleFieldSerializer::open(field_write, min_value, max_value)
|
||||
}
|
||||
|
||||
|
||||
/// Closes the serializer
|
||||
///
|
||||
/// After this call the data must be persistently save on disk.
|
||||
pub fn close(self) -> io::Result<()> {
|
||||
self.composite_write.close()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FastSingleFieldSerializer<'a, W: Write + 'a> {
|
||||
bit_packer: BitPacker,
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
}
|
||||
|
||||
impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
|
||||
fn open(
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<FastSingleFieldSerializer<'a, W>> {
|
||||
min_value.serialize(write)?;
|
||||
let amplitude = max_value - min_value;
|
||||
amplitude.serialize(write)?;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
self.bit_packer = BitPacker::new(num_bits as usize);
|
||||
Ok(())
|
||||
let bit_packer = BitPacker::new(num_bits as usize);
|
||||
Ok(FastSingleFieldSerializer {
|
||||
write: write,
|
||||
bit_packer: bit_packer,
|
||||
min_value: min_value,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
/// Pushes a new value to the currently open u64 fast field.
|
||||
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
|
||||
let val_to_write: u64 = val - self.min_value;
|
||||
@@ -78,33 +89,7 @@ impl FastFieldSerializer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Close the u64 fast field.
|
||||
pub fn close_field(&mut self) -> io::Result<()> {
|
||||
if !self.field_open {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "Current field is already closed"));
|
||||
}
|
||||
self.field_open = false;
|
||||
// adding some padding to make sure we
|
||||
// can read the last elements with our u64
|
||||
// cursor
|
||||
self.bit_packer.close(&mut self.write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Closes the serializer
|
||||
///
|
||||
/// After this call the data must be persistently save on disk.
|
||||
pub fn close(self) -> io::Result<usize> {
|
||||
if self.field_open {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "Last field not closed"));
|
||||
}
|
||||
let header_offset: usize = self.write.written_bytes() as usize;
|
||||
let (mut write, written_size) = self.write.finish()?;
|
||||
self.fields.serialize(&mut write)?;
|
||||
write.seek(SeekFrom::Start(0))?;
|
||||
(header_offset as u32).serialize(&mut write)?;
|
||||
write.flush()?;
|
||||
Ok(written_size)
|
||||
pub fn close_field(mut self) -> io::Result<()> {
|
||||
self.bit_packer.close(&mut self.write)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -58,9 +58,9 @@ impl FastFieldsWriter {
|
||||
/// Get the `FastFieldWriter` associated to a field.
|
||||
pub fn get_field_writer(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
|
||||
// TODO optimize
|
||||
self.field_writers
|
||||
.iter_mut()
|
||||
.find(|field_writer| field_writer.field == field)
|
||||
self.field_writers.iter_mut().find(|field_writer| {
|
||||
field_writer.field == field
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -155,9 +155,9 @@ impl IntFastFieldWriter {
|
||||
/// associated to the document with the `DocId` n.
|
||||
/// (Well, `n-1` actually because of 0-indexing)
|
||||
pub fn add_val(&mut self, val: u64) {
|
||||
VInt(val)
|
||||
.serialize(&mut self.vals)
|
||||
.expect("unable to serialize VInt to Vec");
|
||||
VInt(val).serialize(&mut self.vals).expect(
|
||||
"unable to serialize VInt to Vec",
|
||||
);
|
||||
|
||||
if val > self.val_max {
|
||||
self.val_max = val;
|
||||
@@ -208,13 +208,14 @@ impl IntFastFieldWriter {
|
||||
(self.val_min, self.val_max)
|
||||
};
|
||||
|
||||
serializer.new_u64_fast_field(self.field, min, max)?;
|
||||
|
||||
let mut single_field_serializer = serializer.new_u64_fast_field(self.field, min, max)?;
|
||||
|
||||
let mut cursor = self.vals.as_slice();
|
||||
while let Ok(VInt(val)) = VInt::deserialize(&mut cursor) {
|
||||
serializer.add_val(val)?;
|
||||
single_field_serializer.add_val(val)?;
|
||||
}
|
||||
|
||||
serializer.close_field()
|
||||
single_field_serializer.close_field()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -40,9 +40,9 @@ impl DeleteQueue {
|
||||
{
|
||||
let mut delete_queue_wlock = delete_queue.inner.write().unwrap();
|
||||
delete_queue_wlock.last_block = Some(Arc::new(Block {
|
||||
operations: Arc::default(),
|
||||
next: next_block,
|
||||
}));
|
||||
operations: Arc::default(),
|
||||
next: next_block,
|
||||
}));
|
||||
}
|
||||
|
||||
delete_queue
|
||||
@@ -59,9 +59,11 @@ impl DeleteQueue {
|
||||
.expect("Read lock poisoned when opening delete queue cursor")
|
||||
.last_block
|
||||
.clone()
|
||||
.expect("Failed to unwrap last_block. This should never happen
|
||||
.expect(
|
||||
"Failed to unwrap last_block. This should never happen
|
||||
as the Option<> is only here to make
|
||||
initialization possible");
|
||||
initialization possible",
|
||||
);
|
||||
let operations_len = last_block.operations.len();
|
||||
DeleteCursor {
|
||||
block: last_block,
|
||||
@@ -92,9 +94,9 @@ impl DeleteQueue {
|
||||
// be some unflushed operations.
|
||||
//
|
||||
fn flush(&self) -> Option<Arc<Block>> {
|
||||
let mut self_wlock = self.inner
|
||||
.write()
|
||||
.expect("Failed to acquire write lock on delete queue writer");
|
||||
let mut self_wlock = self.inner.write().expect(
|
||||
"Failed to acquire write lock on delete queue writer",
|
||||
);
|
||||
|
||||
let delete_operations;
|
||||
{
|
||||
@@ -108,9 +110,9 @@ impl DeleteQueue {
|
||||
let next_block = NextBlock::from(self.clone());
|
||||
{
|
||||
self_wlock.last_block = Some(Arc::new(Block {
|
||||
operations: Arc::new(delete_operations),
|
||||
next: next_block,
|
||||
}));
|
||||
operations: Arc::new(delete_operations),
|
||||
next: next_block,
|
||||
}));
|
||||
}
|
||||
self_wlock.last_block.clone()
|
||||
}
|
||||
@@ -132,18 +134,18 @@ impl From<DeleteQueue> for NextBlock {
|
||||
impl NextBlock {
|
||||
fn next_block(&self) -> Option<Arc<Block>> {
|
||||
{
|
||||
let next_read_lock = self.0
|
||||
.read()
|
||||
.expect("Failed to acquire write lock in delete queue");
|
||||
let next_read_lock = self.0.read().expect(
|
||||
"Failed to acquire write lock in delete queue",
|
||||
);
|
||||
if let InnerNextBlock::Closed(ref block) = *next_read_lock {
|
||||
return Some(block.clone());
|
||||
}
|
||||
}
|
||||
let next_block;
|
||||
{
|
||||
let mut next_write_lock = self.0
|
||||
.write()
|
||||
.expect("Failed to acquire write lock in delete queue");
|
||||
let mut next_write_lock = self.0.write().expect(
|
||||
"Failed to acquire write lock in delete queue",
|
||||
);
|
||||
match *next_write_lock {
|
||||
InnerNextBlock::Closed(ref block) => {
|
||||
return Some(block.clone());
|
||||
|
||||
@@ -56,8 +56,10 @@ mod tests {
|
||||
#[test]
|
||||
fn test_doc_to_opstamp_mapping_none() {
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::None;
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(1),
|
||||
u32::max_value());
|
||||
assert_eq!(
|
||||
doc_to_opstamp_mapping.compute_doc_limit(1),
|
||||
u32::max_value()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -102,14 +102,17 @@ impl !Sync for IndexWriter {}
|
||||
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
|
||||
/// # Panics
|
||||
/// If the heap size per thread is too small, panics.
|
||||
pub fn open_index_writer(index: &Index,
|
||||
num_threads: usize,
|
||||
heap_size_in_bytes_per_thread: usize)
|
||||
-> Result<IndexWriter> {
|
||||
pub fn open_index_writer(
|
||||
index: &Index,
|
||||
num_threads: usize,
|
||||
heap_size_in_bytes_per_thread: usize,
|
||||
) -> Result<IndexWriter> {
|
||||
|
||||
if heap_size_in_bytes_per_thread <= HEAP_SIZE_LIMIT as usize {
|
||||
panic!(format!("The heap size per thread needs to be at least {}.",
|
||||
HEAP_SIZE_LIMIT));
|
||||
panic!(format!(
|
||||
"The heap size per thread needs to be at least {}.",
|
||||
HEAP_SIZE_LIMIT
|
||||
));
|
||||
}
|
||||
|
||||
let directory_lock = DirectoryLock::lock(index.directory().box_clone())?;
|
||||
@@ -156,12 +159,13 @@ pub fn open_index_writer(index: &Index,
|
||||
|
||||
|
||||
|
||||
pub fn compute_deleted_bitset(delete_bitset: &mut BitSet,
|
||||
segment_reader: &SegmentReader,
|
||||
delete_cursor: &mut DeleteCursor,
|
||||
doc_opstamps: &DocToOpstampMapping,
|
||||
target_opstamp: u64)
|
||||
-> Result<bool> {
|
||||
pub fn compute_deleted_bitset(
|
||||
delete_bitset: &mut BitSet,
|
||||
segment_reader: &SegmentReader,
|
||||
delete_cursor: &mut DeleteCursor,
|
||||
doc_opstamps: &DocToOpstampMapping,
|
||||
target_opstamp: u64,
|
||||
) -> Result<bool> {
|
||||
|
||||
let mut might_have_changed = false;
|
||||
|
||||
@@ -177,8 +181,12 @@ pub fn compute_deleted_bitset(delete_bitset: &mut BitSet,
|
||||
// Limit doc helps identify the first document
|
||||
// that may be affected by the delete operation.
|
||||
let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp);
|
||||
if let Some(mut docset) =
|
||||
segment_reader.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) {
|
||||
let inverted_index = segment_reader.inverted_index(delete_op.term.field());
|
||||
if let Some(mut docset) = inverted_index.read_postings(
|
||||
&delete_op.term,
|
||||
SegmentPostingsOption::NoFreq,
|
||||
)
|
||||
{
|
||||
while docset.advance() {
|
||||
let deleted_doc = docset.doc();
|
||||
if deleted_doc < limit_doc {
|
||||
@@ -198,10 +206,11 @@ pub fn compute_deleted_bitset(delete_bitset: &mut BitSet,
|
||||
|
||||
/// Advance delete for the given segment up
|
||||
/// to the target opstamp.
|
||||
pub fn advance_deletes(mut segment: Segment,
|
||||
segment_entry: &mut SegmentEntry,
|
||||
target_opstamp: u64)
|
||||
-> Result<Option<FileProtection>> {
|
||||
pub fn advance_deletes(
|
||||
mut segment: Segment,
|
||||
segment_entry: &mut SegmentEntry,
|
||||
target_opstamp: u64,
|
||||
) -> Result<Option<FileProtection>> {
|
||||
|
||||
let mut file_protect: Option<FileProtection> = None;
|
||||
|
||||
@@ -222,11 +231,13 @@ pub fn advance_deletes(mut segment: Segment,
|
||||
|
||||
let delete_cursor = segment_entry.delete_cursor();
|
||||
|
||||
compute_deleted_bitset(&mut delete_bitset,
|
||||
&segment_reader,
|
||||
delete_cursor,
|
||||
&DocToOpstampMapping::None,
|
||||
target_opstamp)?;
|
||||
compute_deleted_bitset(
|
||||
&mut delete_bitset,
|
||||
&segment_reader,
|
||||
delete_cursor,
|
||||
&DocToOpstampMapping::None,
|
||||
target_opstamp,
|
||||
)?;
|
||||
|
||||
for doc in 0u32..max_doc {
|
||||
if segment_reader.is_deleted(doc) {
|
||||
@@ -247,15 +258,16 @@ pub fn advance_deletes(mut segment: Segment,
|
||||
Ok(file_protect)
|
||||
}
|
||||
|
||||
fn index_documents(heap: &mut Heap,
|
||||
table_size: usize,
|
||||
segment: Segment,
|
||||
schema: &Schema,
|
||||
generation: usize,
|
||||
document_iterator: &mut Iterator<Item = AddOperation>,
|
||||
segment_updater: &mut SegmentUpdater,
|
||||
mut delete_cursor: DeleteCursor)
|
||||
-> Result<bool> {
|
||||
fn index_documents(
|
||||
heap: &mut Heap,
|
||||
table_size: usize,
|
||||
segment: Segment,
|
||||
schema: &Schema,
|
||||
generation: usize,
|
||||
document_iterator: &mut Iterator<Item = AddOperation>,
|
||||
segment_updater: &mut SegmentUpdater,
|
||||
mut delete_cursor: DeleteCursor,
|
||||
) -> Result<bool> {
|
||||
heap.clear();
|
||||
let segment_id = segment.id();
|
||||
let mut segment_writer = SegmentWriter::for_segment(heap, table_size, segment.clone(), schema)?;
|
||||
@@ -265,8 +277,10 @@ fn index_documents(heap: &mut Heap,
|
||||
// One is the memory arena dedicated to the segment is
|
||||
// getting full.
|
||||
if segment_writer.is_buffer_full() {
|
||||
info!("Buffer limit reached, flushing segment with maxdoc={}.",
|
||||
segment_writer.max_doc());
|
||||
info!(
|
||||
"Buffer limit reached, flushing segment with maxdoc={}.",
|
||||
segment_writer.max_doc()
|
||||
);
|
||||
break;
|
||||
}
|
||||
// The second is the term dictionary hash table
|
||||
@@ -275,8 +289,10 @@ fn index_documents(heap: &mut Heap,
|
||||
// Tantivy does not resize its hashtable. When it reaches
|
||||
// capacity, we just stop indexing new document.
|
||||
if segment_writer.is_term_saturated() {
|
||||
info!("Term dic saturated, flushing segment with maxdoc={}.",
|
||||
segment_writer.max_doc());
|
||||
info!(
|
||||
"Term dic saturated, flushing segment with maxdoc={}.",
|
||||
segment_writer.max_doc()
|
||||
);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -296,11 +312,13 @@ fn index_documents(heap: &mut Heap,
|
||||
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
|
||||
let segment_reader = SegmentReader::open(segment)?;
|
||||
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
|
||||
let may_have_deletes = compute_deleted_bitset(&mut deleted_bitset,
|
||||
&segment_reader,
|
||||
&mut delete_cursor,
|
||||
&doc_to_opstamps,
|
||||
last_docstamp)?;
|
||||
let may_have_deletes = compute_deleted_bitset(
|
||||
&mut deleted_bitset,
|
||||
&segment_reader,
|
||||
&mut delete_cursor,
|
||||
&doc_to_opstamps,
|
||||
last_docstamp,
|
||||
)?;
|
||||
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, {
|
||||
if may_have_deletes {
|
||||
@@ -327,14 +345,15 @@ impl IndexWriter {
|
||||
join_handle
|
||||
.join()
|
||||
.expect("Indexing Worker thread panicked")
|
||||
.chain_err(|| ErrorKind::ErrorInThread("Error in indexing worker thread.".into()))?;
|
||||
.chain_err(|| {
|
||||
ErrorKind::ErrorInThread("Error in indexing worker thread.".into())
|
||||
})?;
|
||||
}
|
||||
drop(self.workers_join_handle);
|
||||
|
||||
let result =
|
||||
self.segment_updater
|
||||
.wait_merging_thread()
|
||||
.chain_err(|| ErrorKind::ErrorInThread("Failed to join merging thread.".into()));
|
||||
let result = self.segment_updater.wait_merging_thread().chain_err(|| {
|
||||
ErrorKind::ErrorInThread("Failed to join merging thread.".into())
|
||||
});
|
||||
|
||||
if let Err(ref e) = result {
|
||||
error!("Some merging thread failed {:?}", e);
|
||||
@@ -347,8 +366,10 @@ impl IndexWriter {
|
||||
pub fn add_segment(&mut self, segment_meta: SegmentMeta) {
|
||||
let delete_cursor = self.delete_queue.cursor();
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None);
|
||||
self.segment_updater
|
||||
.add_segment(self.generation, segment_entry);
|
||||
self.segment_updater.add_segment(
|
||||
self.generation,
|
||||
segment_entry,
|
||||
);
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
@@ -372,7 +393,11 @@ impl IndexWriter {
|
||||
let mut delete_cursor = self.delete_queue.cursor();
|
||||
|
||||
let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
|
||||
.name(format!("indexing thread {} for gen {}", self.worker_id, generation))
|
||||
.name(format!(
|
||||
"indexing thread {} for gen {}",
|
||||
self.worker_id,
|
||||
generation
|
||||
))
|
||||
.spawn(move || {
|
||||
|
||||
loop {
|
||||
@@ -396,14 +421,16 @@ impl IndexWriter {
|
||||
return Ok(());
|
||||
}
|
||||
let segment = segment_updater.new_segment();
|
||||
index_documents(&mut heap,
|
||||
table_size,
|
||||
segment,
|
||||
&schema,
|
||||
generation,
|
||||
&mut document_iterator,
|
||||
&mut segment_updater,
|
||||
delete_cursor.clone())?;
|
||||
index_documents(
|
||||
&mut heap,
|
||||
table_size,
|
||||
segment,
|
||||
&schema,
|
||||
generation,
|
||||
&mut document_iterator,
|
||||
&mut segment_updater,
|
||||
delete_cursor.clone(),
|
||||
)?;
|
||||
|
||||
}
|
||||
})?;
|
||||
@@ -436,9 +463,10 @@ impl IndexWriter {
|
||||
}
|
||||
|
||||
/// Merges a given list of segments
|
||||
pub fn merge(&mut self,
|
||||
segment_ids: &[SegmentId])
|
||||
-> impl Future<Item = SegmentMeta, Error = Canceled> {
|
||||
pub fn merge(
|
||||
&mut self,
|
||||
segment_ids: &[SegmentId],
|
||||
) -> impl Future<Item = SegmentMeta, Error = Canceled> {
|
||||
self.segment_updater.start_merge(segment_ids)
|
||||
}
|
||||
|
||||
@@ -522,14 +550,15 @@ impl IndexWriter {
|
||||
self.recreate_document_channel();
|
||||
|
||||
let mut former_workers_join_handle = Vec::new();
|
||||
swap(&mut former_workers_join_handle,
|
||||
&mut self.workers_join_handle);
|
||||
swap(
|
||||
&mut former_workers_join_handle,
|
||||
&mut self.workers_join_handle,
|
||||
);
|
||||
|
||||
for worker_handle in former_workers_join_handle {
|
||||
let indexing_worker_result =
|
||||
worker_handle
|
||||
.join()
|
||||
.map_err(|e| Error::from_kind(ErrorKind::ErrorInThread(format!("{:?}", e))))?;
|
||||
let indexing_worker_result = worker_handle.join().map_err(|e| {
|
||||
Error::from_kind(ErrorKind::ErrorInThread(format!("{:?}", e)))
|
||||
})?;
|
||||
|
||||
indexing_worker_result?;
|
||||
// add a new worker for the next generation.
|
||||
@@ -623,13 +652,17 @@ mod tests {
|
||||
let schema_builder = schema::SchemaBuilder::default();
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let index_writer = index.writer(40_000_000).unwrap();
|
||||
assert_eq!(format!("{:?}", index_writer.get_merge_policy()),
|
||||
"LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \
|
||||
level_log_size: 0.75 }");
|
||||
assert_eq!(
|
||||
format!("{:?}", index_writer.get_merge_policy()),
|
||||
"LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \
|
||||
level_log_size: 0.75 }"
|
||||
);
|
||||
let merge_policy = box NoMergePolicy::default();
|
||||
index_writer.set_merge_policy(merge_policy);
|
||||
assert_eq!(format!("{:?}", index_writer.get_merge_policy()),
|
||||
"NoMergePolicy");
|
||||
assert_eq!(
|
||||
format!("{:?}", index_writer.get_merge_policy()),
|
||||
"NoMergePolicy"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -719,9 +752,9 @@ mod tests {
|
||||
}
|
||||
// this should create 8 segments and trigger a merge.
|
||||
index_writer.commit().expect("commit failed");
|
||||
index_writer
|
||||
.wait_merging_threads()
|
||||
.expect("waiting merging thread failed");
|
||||
index_writer.wait_merging_threads().expect(
|
||||
"waiting merging thread failed",
|
||||
);
|
||||
index.load_searchers().unwrap();
|
||||
|
||||
assert_eq!(num_docs_containing("a"), 200);
|
||||
|
||||
@@ -62,7 +62,9 @@ impl MergePolicy for LogMergePolicy {
|
||||
|
||||
let size_sorted_log_tuples: Vec<_> = size_sorted_tuples
|
||||
.into_iter()
|
||||
.map(|(ind, num_docs)| (ind, (self.clip_min_size(num_docs) as f64).log2()))
|
||||
.map(|(ind, num_docs)| {
|
||||
(ind, (self.clip_min_size(num_docs) as f64).log2())
|
||||
})
|
||||
.collect();
|
||||
|
||||
let (first_ind, first_score) = size_sorted_log_tuples[0];
|
||||
@@ -79,7 +81,9 @@ impl MergePolicy for LogMergePolicy {
|
||||
levels
|
||||
.iter()
|
||||
.filter(|level| level.len() >= self.min_merge_size)
|
||||
.map(|ind_vec| MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect()))
|
||||
.map(|ind_vec| {
|
||||
MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect())
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
@@ -138,17 +142,19 @@ mod tests {
|
||||
// * one with the 6 * 10-docs segments
|
||||
// * one with the 3 * 1000-docs segments
|
||||
// no MergeCandidate expected for the 2 * 10_000-docs segments as min_merge_size=3
|
||||
let test_input = vec![seg_meta(10),
|
||||
seg_meta(10),
|
||||
seg_meta(10),
|
||||
seg_meta(1000),
|
||||
seg_meta(1000),
|
||||
seg_meta(1000),
|
||||
seg_meta(10000),
|
||||
seg_meta(10000),
|
||||
seg_meta(10),
|
||||
seg_meta(10),
|
||||
seg_meta(10)];
|
||||
let test_input = vec![
|
||||
seg_meta(10),
|
||||
seg_meta(10),
|
||||
seg_meta(10),
|
||||
seg_meta(1000),
|
||||
seg_meta(1000),
|
||||
seg_meta(1000),
|
||||
seg_meta(10000),
|
||||
seg_meta(10000),
|
||||
seg_meta(10),
|
||||
seg_meta(10),
|
||||
seg_meta(10),
|
||||
];
|
||||
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
|
||||
assert_eq!(result_list.len(), 2);
|
||||
}
|
||||
@@ -156,24 +162,28 @@ mod tests {
|
||||
#[test]
|
||||
fn test_log_merge_policy_within_levels() {
|
||||
// multiple levels all get merged correctly
|
||||
let test_input = vec![seg_meta(10), // log2(10) = ~3.32 (> 3.58 - 0.75)
|
||||
seg_meta(11), // log2(11) = ~3.46
|
||||
seg_meta(12), // log2(12) = ~3.58
|
||||
seg_meta(800), // log2(800) = ~9.64 (> 9.97 - 0.75)
|
||||
seg_meta(1000), // log2(1000) = ~9.97
|
||||
seg_meta(1000)]; // log2(1000) = ~9.97
|
||||
let test_input = vec![
|
||||
seg_meta(10), // log2(10) = ~3.32 (> 3.58 - 0.75)
|
||||
seg_meta(11), // log2(11) = ~3.46
|
||||
seg_meta(12), // log2(12) = ~3.58
|
||||
seg_meta(800), // log2(800) = ~9.64 (> 9.97 - 0.75)
|
||||
seg_meta(1000), // log2(1000) = ~9.97
|
||||
seg_meta(1000),
|
||||
]; // log2(1000) = ~9.97
|
||||
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
|
||||
assert_eq!(result_list.len(), 2);
|
||||
}
|
||||
#[test]
|
||||
fn test_log_merge_policy_small_segments() {
|
||||
// segments under min_layer_size are merged together
|
||||
let test_input = vec![seg_meta(1),
|
||||
seg_meta(1),
|
||||
seg_meta(1),
|
||||
seg_meta(2),
|
||||
seg_meta(2),
|
||||
seg_meta(2)];
|
||||
let test_input = vec![
|
||||
seg_meta(1),
|
||||
seg_meta(1),
|
||||
seg_meta(1),
|
||||
seg_meta(2),
|
||||
seg_meta(2),
|
||||
seg_meta(2),
|
||||
];
|
||||
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
|
||||
assert_eq!(result_list.len(), 1);
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ use DocId;
|
||||
use core::SerializableSegment;
|
||||
use schema::FieldValue;
|
||||
use indexer::SegmentSerializer;
|
||||
use postings::PostingsSerializer;
|
||||
use postings::InvertedIndexSerializer;
|
||||
use fastfield::U64FastFieldReader;
|
||||
use itertools::Itertools;
|
||||
use postings::Postings;
|
||||
@@ -17,9 +17,9 @@ use fastfield::FastFieldSerializer;
|
||||
use fastfield::FastFieldReader;
|
||||
use store::StoreWriter;
|
||||
use std::cmp::{min, max};
|
||||
use termdict::TermDictionary;
|
||||
use schema::Term;
|
||||
use termdict::TermStreamer;
|
||||
use postings::SegmentPostingsOption;
|
||||
|
||||
pub struct IndexMerger {
|
||||
schema: Schema,
|
||||
@@ -28,33 +28,11 @@ pub struct IndexMerger {
|
||||
}
|
||||
|
||||
|
||||
struct DeltaPositionComputer {
|
||||
buffer: Vec<u32>,
|
||||
}
|
||||
|
||||
impl DeltaPositionComputer {
|
||||
fn new() -> DeltaPositionComputer {
|
||||
DeltaPositionComputer { buffer: vec![0u32; 512] }
|
||||
}
|
||||
|
||||
fn compute_delta_positions(&mut self, positions: &[u32]) -> &[u32] {
|
||||
if positions.len() > self.buffer.len() {
|
||||
self.buffer.resize(positions.len(), 0u32);
|
||||
}
|
||||
let mut last_pos = 0u32;
|
||||
for (i, position) in positions.iter().cloned().enumerate() {
|
||||
self.buffer[i] = position - last_pos;
|
||||
last_pos = position;
|
||||
}
|
||||
&self.buffer[..positions.len()]
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn compute_min_max_val(u64_reader: &U64FastFieldReader,
|
||||
max_doc: DocId,
|
||||
delete_bitset: &DeleteBitSet)
|
||||
-> Option<(u64, u64)> {
|
||||
fn compute_min_max_val(
|
||||
u64_reader: &U64FastFieldReader,
|
||||
max_doc: DocId,
|
||||
delete_bitset: &DeleteBitSet,
|
||||
) -> Option<(u64, u64)> {
|
||||
if max_doc == 0 {
|
||||
None
|
||||
} else if !delete_bitset.has_deletes() {
|
||||
@@ -72,18 +50,46 @@ fn compute_min_max_val(u64_reader: &U64FastFieldReader,
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_fieldnorm_reader(segment_reader: &SegmentReader,
|
||||
field: Field)
|
||||
-> Option<U64FastFieldReader> {
|
||||
fn extract_fieldnorm_reader(
|
||||
segment_reader: &SegmentReader,
|
||||
field: Field,
|
||||
) -> Option<U64FastFieldReader> {
|
||||
segment_reader.get_fieldnorms_reader(field)
|
||||
}
|
||||
|
||||
fn extract_fast_field_reader(segment_reader: &SegmentReader,
|
||||
field: Field)
|
||||
-> Option<U64FastFieldReader> {
|
||||
segment_reader.fast_fields_reader().open_reader(field)
|
||||
fn extract_fast_field_reader(
|
||||
segment_reader: &SegmentReader,
|
||||
field: Field,
|
||||
) -> Option<U64FastFieldReader> {
|
||||
segment_reader.get_fast_field_reader(field).ok()
|
||||
|
||||
}
|
||||
|
||||
struct DeltaComputer {
|
||||
buffer: Vec<u32>,
|
||||
}
|
||||
|
||||
impl DeltaComputer {
|
||||
fn new() -> DeltaComputer {
|
||||
DeltaComputer { buffer: vec![0u32; 512] }
|
||||
}
|
||||
|
||||
fn compute_delta(&mut self, positions: &[u32]) -> &[u32] {
|
||||
if positions.len() > self.buffer.len() {
|
||||
self.buffer.resize(positions.len(), 0u32);
|
||||
}
|
||||
let mut last_pos = 0u32;
|
||||
let num_positions = positions.len();
|
||||
for i in 0..num_positions {
|
||||
let cur_pos = positions[i];
|
||||
self.buffer[i] = cur_pos - last_pos;
|
||||
last_pos = cur_pos;
|
||||
}
|
||||
&self.buffer[..positions.len()]
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl IndexMerger {
|
||||
pub fn open(schema: Schema, segments: &[Segment]) -> Result<IndexMerger> {
|
||||
let mut readers = vec![];
|
||||
@@ -96,10 +102,10 @@ impl IndexMerger {
|
||||
}
|
||||
}
|
||||
Ok(IndexMerger {
|
||||
schema: schema,
|
||||
readers: readers,
|
||||
max_doc: max_doc,
|
||||
})
|
||||
schema: schema,
|
||||
readers: readers,
|
||||
max_doc: max_doc,
|
||||
})
|
||||
}
|
||||
|
||||
fn write_fieldnorms(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
|
||||
@@ -110,9 +116,11 @@ impl IndexMerger {
|
||||
.filter(|&(_, field_entry)| field_entry.is_indexed())
|
||||
.map(|(field_id, _)| Field(field_id as u32))
|
||||
.collect();
|
||||
self.generic_write_fast_field(fieldnorm_fastfields,
|
||||
&extract_fieldnorm_reader,
|
||||
fast_field_serializer)
|
||||
self.generic_write_fast_field(
|
||||
fieldnorm_fastfields,
|
||||
&extract_fieldnorm_reader,
|
||||
fast_field_serializer,
|
||||
)
|
||||
}
|
||||
|
||||
fn write_fast_fields(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
|
||||
@@ -123,19 +131,21 @@ impl IndexMerger {
|
||||
.filter(|&(_, field_entry)| field_entry.is_int_fast())
|
||||
.map(|(field_id, _)| Field(field_id as u32))
|
||||
.collect();
|
||||
self.generic_write_fast_field(fast_fields,
|
||||
&extract_fast_field_reader,
|
||||
fast_field_serializer)
|
||||
self.generic_write_fast_field(
|
||||
fast_fields,
|
||||
&extract_fast_field_reader,
|
||||
fast_field_serializer,
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
// used both to merge field norms and regular u64 fast fields.
|
||||
fn generic_write_fast_field(&self,
|
||||
fields: Vec<Field>,
|
||||
field_reader_extractor: &Fn(&SegmentReader, Field)
|
||||
-> Option<U64FastFieldReader>,
|
||||
fast_field_serializer: &mut FastFieldSerializer)
|
||||
-> Result<()> {
|
||||
fn generic_write_fast_field(
|
||||
&self,
|
||||
fields: Vec<Field>,
|
||||
field_reader_extractor: &Fn(&SegmentReader, Field) -> Option<U64FastFieldReader>,
|
||||
fast_field_serializer: &mut FastFieldSerializer,
|
||||
) -> Result<()> {
|
||||
|
||||
for field in fields {
|
||||
|
||||
@@ -147,19 +157,25 @@ impl IndexMerger {
|
||||
match field_reader_extractor(reader, field) {
|
||||
Some(u64_reader) => {
|
||||
if let Some((seg_min_val, seg_max_val)) =
|
||||
compute_min_max_val(&u64_reader,
|
||||
reader.max_doc(),
|
||||
reader.delete_bitset()) {
|
||||
compute_min_max_val(
|
||||
&u64_reader,
|
||||
reader.max_doc(),
|
||||
reader.delete_bitset(),
|
||||
)
|
||||
{
|
||||
// the segment has some non-deleted documents
|
||||
min_val = min(min_val, seg_min_val);
|
||||
max_val = max(max_val, seg_max_val);
|
||||
u64_readers
|
||||
.push((reader.max_doc(), u64_reader, reader.delete_bitset()));
|
||||
u64_readers.push((
|
||||
reader.max_doc(),
|
||||
u64_reader,
|
||||
reader.delete_bitset(),
|
||||
));
|
||||
}
|
||||
}
|
||||
None => {
|
||||
let error_msg = format!("Failed to find a u64_reader for field {:?}",
|
||||
field);
|
||||
let error_msg =
|
||||
format!("Failed to find a u64_reader for field {:?}", field);
|
||||
error!("{}", error_msg);
|
||||
bail!(ErrorKind::SchemaError(error_msg));
|
||||
}
|
||||
@@ -174,50 +190,68 @@ impl IndexMerger {
|
||||
|
||||
assert!(min_val <= max_val);
|
||||
|
||||
fast_field_serializer
|
||||
.new_u64_fast_field(field, min_val, max_val)?;
|
||||
|
||||
let mut fast_single_field_serializer = fast_field_serializer.new_u64_fast_field(
|
||||
field,
|
||||
min_val,
|
||||
max_val,
|
||||
)?;
|
||||
for (max_doc, u64_reader, delete_bitset) in u64_readers {
|
||||
for doc_id in 0..max_doc {
|
||||
if !delete_bitset.is_deleted(doc_id) {
|
||||
let val = u64_reader.get(doc_id);
|
||||
fast_field_serializer.add_val(val)?;
|
||||
fast_single_field_serializer.add_val(val)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fast_field_serializer.close_field()?;
|
||||
fast_single_field_serializer.close_field()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_postings(&self, serializer: &mut PostingsSerializer) -> Result<()> {
|
||||
fn write_postings(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> {
|
||||
|
||||
let mut merged_terms = TermMerger::from(&self.readers[..]);
|
||||
let mut delta_position_computer = DeltaPositionComputer::new();
|
||||
let mut delta_computer = DeltaComputer::new();
|
||||
|
||||
let mut max_doc = 0;
|
||||
|
||||
// map from segment doc ids to the resulting merged segment doc id.
|
||||
let mut merged_doc_id_map: Vec<Vec<Option<DocId>>> = Vec::with_capacity(self.readers.len());
|
||||
|
||||
for reader in &self.readers {
|
||||
let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize);
|
||||
for doc_id in 0..reader.max_doc() {
|
||||
if reader.is_deleted(doc_id) {
|
||||
segment_local_map.push(None);
|
||||
} else {
|
||||
segment_local_map.push(Some(max_doc));
|
||||
max_doc += 1u32;
|
||||
}
|
||||
let mut indexed_fields = vec![];
|
||||
for (field_ord, field_entry) in self.schema.fields().iter().enumerate() {
|
||||
if field_entry.is_indexed() {
|
||||
indexed_fields.push(Field(field_ord as u32));
|
||||
}
|
||||
merged_doc_id_map.push(segment_local_map);
|
||||
}
|
||||
|
||||
let mut last_field: Option<Field> = None;
|
||||
for indexed_field in indexed_fields {
|
||||
|
||||
let mut segment_postings_option = SegmentPostingsOption::FreqAndPositions;
|
||||
let field_readers = self.readers
|
||||
.iter()
|
||||
.map(|reader| reader.inverted_index(indexed_field))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
while merged_terms.advance() {
|
||||
let field_term_streams = field_readers
|
||||
.iter()
|
||||
.map(|field_reader| field_reader.terms().stream())
|
||||
.collect();
|
||||
|
||||
let mut merged_terms = TermMerger::new(field_term_streams);
|
||||
let mut max_doc = 0;
|
||||
|
||||
// map from segment doc ids to the resulting merged segment doc id.
|
||||
let mut merged_doc_id_map: Vec<Vec<Option<DocId>>> =
|
||||
Vec::with_capacity(self.readers.len());
|
||||
|
||||
for reader in &self.readers {
|
||||
let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize);
|
||||
for doc_id in 0..reader.max_doc() {
|
||||
if reader.is_deleted(doc_id) {
|
||||
segment_local_map.push(None);
|
||||
} else {
|
||||
segment_local_map.push(Some(max_doc));
|
||||
max_doc += 1u32;
|
||||
}
|
||||
}
|
||||
merged_doc_id_map.push(segment_local_map);
|
||||
}
|
||||
|
||||
// Create the total list of doc ids
|
||||
// by stacking the doc ids from the different segment.
|
||||
@@ -229,86 +263,92 @@ impl IndexMerger {
|
||||
// - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc,
|
||||
// seg0.max_doc + seg1.max_doc + seg2.max_doc]
|
||||
// ...
|
||||
let term = Term::wrap(merged_terms.key());
|
||||
let current_field = term.field();
|
||||
|
||||
if last_field != Some(current_field) {
|
||||
// we reached a new field.
|
||||
let field_entry = self.schema.get_field_entry(current_field);
|
||||
// ... set segment postings option the new field.
|
||||
segment_postings_option = field_entry
|
||||
.field_type()
|
||||
.get_segment_postings_option()
|
||||
.expect("Encountered a field that is not supposed to be
|
||||
indexed. Have you modified the schema?");
|
||||
let mut field_serializer = serializer.new_field(indexed_field)?;
|
||||
|
||||
last_field = Some(current_field);
|
||||
let field_entry = self.schema.get_field_entry(indexed_field);
|
||||
|
||||
// it is perfectly safe to call `.new_field`
|
||||
// even if there is no postings associated.
|
||||
serializer.new_field(current_field);
|
||||
}
|
||||
// ... set segment postings option the new field.
|
||||
let segment_postings_option = field_entry
|
||||
.field_type()
|
||||
.get_segment_postings_option()
|
||||
.expect(
|
||||
"Encountered a field that is not supposed to be
|
||||
indexed. Have you modified the schema?",
|
||||
);
|
||||
|
||||
// Let's compute the list of non-empty posting lists
|
||||
let segment_postings: Vec<_> = merged_terms
|
||||
.current_kvs()
|
||||
.iter()
|
||||
.flat_map(|heap_item| {
|
||||
let segment_ord = heap_item.segment_ord;
|
||||
let term_info = heap_item.streamer.value();
|
||||
let segment_reader = &self.readers[heap_item.segment_ord];
|
||||
let mut segment_postings =
|
||||
segment_reader
|
||||
.read_postings_from_terminfo(term_info, segment_postings_option);
|
||||
if segment_postings.advance() {
|
||||
Some((segment_ord, segment_postings))
|
||||
} else {
|
||||
None
|
||||
while merged_terms.advance() {
|
||||
|
||||
let term = Term::wrap(merged_terms.key());
|
||||
|
||||
// Let's compute the list of non-empty posting lists
|
||||
let segment_postings: Vec<_> = merged_terms
|
||||
.current_kvs()
|
||||
.iter()
|
||||
.flat_map(|heap_item| {
|
||||
let segment_ord = heap_item.segment_ord;
|
||||
let term_info = heap_item.streamer.value();
|
||||
let segment_reader = &self.readers[heap_item.segment_ord];
|
||||
let inverted_index = segment_reader.inverted_index(term.field());
|
||||
let mut segment_postings = inverted_index.read_postings_from_terminfo(
|
||||
term_info,
|
||||
segment_postings_option,
|
||||
);
|
||||
if segment_postings.advance() {
|
||||
Some((segment_ord, segment_postings))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
// At this point, `segment_postings` contains the posting list
|
||||
// of all of the segments containing the given term.
|
||||
//
|
||||
// These segments are non-empty and advance has already been called.
|
||||
|
||||
if !segment_postings.is_empty() {
|
||||
// If not, the `term` will be entirely removed.
|
||||
|
||||
// We know that there is at least one document containing
|
||||
// the term, so we add it.
|
||||
field_serializer.new_term(term.as_ref())?;
|
||||
|
||||
// We can now serialize this postings, by pushing each document to the
|
||||
// postings serializer.
|
||||
for (segment_ord, mut segment_postings) in segment_postings {
|
||||
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
|
||||
loop {
|
||||
// `.advance()` has been called once before the loop.
|
||||
// Hence we cannot use a `while segment_postings.advance()` loop.
|
||||
if let Some(remapped_doc_id) =
|
||||
old_to_new_doc_id[segment_postings.doc() as usize]
|
||||
{
|
||||
// we make sure to only write the term iff
|
||||
// there is at least one document.
|
||||
let positions: &[u32] = segment_postings.positions();
|
||||
let term_freq = segment_postings.term_freq();
|
||||
let delta_positions = delta_computer.compute_delta(positions);
|
||||
field_serializer.write_doc(
|
||||
remapped_doc_id,
|
||||
term_freq,
|
||||
delta_positions,
|
||||
)?;
|
||||
}
|
||||
if !segment_postings.advance() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
// At this point, `segment_postings` contains the posting list
|
||||
// of all of the segments containing the given term.
|
||||
//
|
||||
// These segments are non-empty and advance has already been called.
|
||||
|
||||
if segment_postings.is_empty() {
|
||||
// by continuing here, the `term` will be entirely removed.
|
||||
continue;
|
||||
}
|
||||
|
||||
// We know that there is at least one document containing
|
||||
// the term, so we add it.
|
||||
serializer.new_term(term.as_ref())?;
|
||||
|
||||
// We can now serialize this postings, by pushing each document to the
|
||||
// postings serializer.
|
||||
|
||||
for (segment_ord, mut segment_postings) in segment_postings {
|
||||
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
|
||||
loop {
|
||||
// `.advance()` has been called once before the loop.
|
||||
// Hence we cannot use a `while segment_postings.advance()` loop.
|
||||
if let Some(remapped_doc_id) =
|
||||
old_to_new_doc_id[segment_postings.doc() as usize] {
|
||||
// we make sure to only write the term iff
|
||||
// there is at least one document.
|
||||
let delta_positions: &[u32] =
|
||||
delta_position_computer
|
||||
.compute_delta_positions(segment_postings.positions());
|
||||
let term_freq = segment_postings.term_freq();
|
||||
serializer
|
||||
.write_doc(remapped_doc_id, term_freq, delta_positions)?;
|
||||
}
|
||||
if !segment_postings.advance() {
|
||||
break;
|
||||
}
|
||||
// closing the term.
|
||||
field_serializer.close_term()?;
|
||||
}

}

// closing the term.
serializer.close_term()?;
field_serializer.close()?;

}
Ok(())
}
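// Standalone sketch (not tantivy's `TermMerger`) of the k-way merge idea used
// above: each segment exposes its terms in sorted order, and a min-heap yields
// the globally smallest term next, together with the segments that contain it.
use std::cmp::Reverse;
use std::collections::BinaryHeap;

fn merge_sorted_terms(segment_terms: &[Vec<String>]) -> Vec<(String, Vec<usize>)> {
    let mut cursors = vec![0usize; segment_terms.len()];
    let mut heap = BinaryHeap::new();
    for (ord, terms) in segment_terms.iter().enumerate() {
        if let Some(first) = terms.first() {
            heap.push(Reverse((first.clone(), ord)));
        }
    }
    let mut merged: Vec<(String, Vec<usize>)> = Vec::new();
    while let Some(Reverse((term, ord))) = heap.pop() {
        // refill the heap from the segment we just consumed a term from
        cursors[ord] += 1;
        if let Some(next) = segment_terms[ord].get(cursors[ord]) {
            heap.push(Reverse((next.clone(), ord)));
        }
        // group consecutive identical terms coming from different segments
        match merged.last_mut() {
            Some(&mut (ref last, ref mut ords)) if *last == term => ords.push(ord),
            _ => merged.push((term, vec![ord])),
        }
    }
    merged
}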
@@ -318,9 +358,9 @@ impl IndexMerger {
|
||||
let store_reader = reader.get_store_reader();
|
||||
for doc_id in 0..reader.max_doc() {
|
||||
if !reader.is_deleted(doc_id) {
|
||||
let doc = try!(store_reader.get(doc_id));
|
||||
let doc = store_reader.get(doc_id)?;
|
||||
let field_values: Vec<&FieldValue> = doc.field_values().iter().collect();
|
||||
try!(store_writer.store(&field_values));
|
||||
store_writer.store(&field_values)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -330,11 +370,15 @@ impl IndexMerger {
|
||||
|
||||
impl SerializableSegment for IndexMerger {
|
||||
fn write(&self, mut serializer: SegmentSerializer) -> Result<u32> {
|
||||
try!(self.write_postings(serializer.get_postings_serializer()));
|
||||
try!(self.write_fieldnorms(serializer.get_fieldnorms_serializer()));
|
||||
try!(self.write_fast_fields(serializer.get_fast_field_serializer()));
|
||||
try!(self.write_storable_fields(serializer.get_store_writer()));
|
||||
try!(serializer.close());
|
||||
self.write_postings(serializer.get_postings_serializer())?;
|
||||
self.write_fieldnorms(
|
||||
serializer.get_fieldnorms_serializer(),
|
||||
)?;
|
||||
self.write_fast_fields(
|
||||
serializer.get_fast_field_serializer(),
|
||||
)?;
|
||||
self.write_storable_fields(serializer.get_store_writer())?;
|
||||
serializer.close()?;
|
||||
Ok(self.max_doc)
|
||||
}
|
||||
}
|
||||
@@ -411,14 +455,13 @@ mod tests {
|
||||
}
|
||||
}
|
||||
{
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
let segment_ids = index.searchable_segment_ids().expect(
|
||||
"Searchable segments failed.",
|
||||
);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
index_writer
|
||||
.merge(&segment_ids)
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
index_writer.merge(&segment_ids).wait().expect(
|
||||
"Merging failed",
|
||||
);
|
||||
index_writer.wait_merging_threads().unwrap();
|
||||
}
|
||||
{
|
||||
@@ -431,14 +474,22 @@ mod tests {
|
||||
collector.docs()
|
||||
};
|
||||
{
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
|
||||
vec![1, 2, 4]);
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
|
||||
vec![0, 3]);
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "g")]),
|
||||
vec![4]);
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
|
||||
vec![0, 1, 2, 3, 4]);
|
||||
assert_eq!(
|
||||
get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
|
||||
vec![1, 2, 4]
|
||||
);
|
||||
assert_eq!(
|
||||
get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
|
||||
vec![0, 3]
|
||||
);
|
||||
assert_eq!(
|
||||
get_doc_ids(vec![Term::from_field_text(text_field, "g")]),
|
||||
vec![4]
|
||||
);
|
||||
assert_eq!(
|
||||
get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
|
||||
vec![0, 1, 2, 3, 4]
|
||||
);
|
||||
}
|
||||
{
|
||||
let doc = searcher.doc(&DocAddress(0, 0)).unwrap();
|
||||
@@ -467,8 +518,10 @@ mod tests {
|
||||
assert!(searcher.search(&query, &mut collector).is_ok());
|
||||
collector.vals()
|
||||
};
|
||||
assert_eq!(get_fast_vals(vec![Term::from_field_text(text_field, "a")]),
|
||||
vec![5, 7, 13]);
|
||||
assert_eq!(
|
||||
get_fast_vals(vec![Term::from_field_text(text_field, "a")]),
|
||||
vec![5, 7, 13]
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -515,14 +568,22 @@ mod tests {
|
||||
assert_eq!(searcher.num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
vec![1]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
vec![1]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
vec![3]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
vec![1, 3]);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
vec![1]
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
vec![1]
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
vec![3]
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
vec![1, 3]
|
||||
);
|
||||
}
|
||||
{
|
||||
// a second commit
|
||||
@@ -554,20 +615,34 @@ mod tests {
|
||||
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
|
||||
assert_eq!(searcher.segment_readers()[1].num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[1].max_doc(), 4);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
vec![3]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
vec![3]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")),
|
||||
vec![6_000]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")),
|
||||
vec![6_000, 7_000]);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
vec![3]
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
vec![3]
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "e")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "f")),
|
||||
vec![6_000]
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "g")),
|
||||
vec![6_000, 7_000]
|
||||
);
|
||||
|
||||
let score_field_reader: U64FastFieldReader = searcher
|
||||
.segment_reader(0)
|
||||
@@ -585,33 +660,46 @@ mod tests {
|
||||
}
|
||||
{
|
||||
// merging the segments
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
index_writer
|
||||
.merge(&segment_ids)
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
let segment_ids = index.searchable_segment_ids().expect(
|
||||
"Searchable segments failed.",
|
||||
);
|
||||
index_writer.merge(&segment_ids).wait().expect(
|
||||
"Merging failed",
|
||||
);
|
||||
index.load_searchers().unwrap();
|
||||
let ref searcher = *index.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
assert_eq!(searcher.num_docs(), 3);
|
||||
assert_eq!(searcher.segment_readers()[0].num_docs(), 3);
|
||||
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
vec![3]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
vec![3]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")),
|
||||
vec![6_000]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")),
|
||||
vec![6_000, 7_000]);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
vec![3]
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
vec![3]
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "e")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "f")),
|
||||
vec![6_000]
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "g")),
|
||||
vec![6_000, 7_000]
|
||||
);
|
||||
let score_field_reader: U64FastFieldReader = searcher
|
||||
.segment_reader(0)
|
||||
.get_fast_field_reader(score_field)
|
||||
@@ -630,20 +718,34 @@ mod tests {
|
||||
assert_eq!(searcher.num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")),
|
||||
vec![6_000]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")),
|
||||
vec![6_000, 7_000]);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "e")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "f")),
|
||||
vec![6_000]
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "g")),
|
||||
vec![6_000, 7_000]
|
||||
);
|
||||
let score_field_reader: U64FastFieldReader = searcher
|
||||
.segment_reader(0)
|
||||
.get_fast_field_reader(score_field)
|
||||
@@ -653,13 +755,12 @@ mod tests {
|
||||
}
|
||||
{
|
||||
// Test merging a single segment in order to remove deletes.
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
index_writer
|
||||
.merge(&segment_ids)
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
let segment_ids = index.searchable_segment_ids().expect(
|
||||
"Searchable segments failed.",
|
||||
);
|
||||
index_writer.merge(&segment_ids).wait().expect(
|
||||
"Merging failed",
|
||||
);
|
||||
index.load_searchers().unwrap();
|
||||
|
||||
let ref searcher = *index.searcher();
|
||||
@@ -667,20 +768,34 @@ mod tests {
|
||||
assert_eq!(searcher.num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].max_doc(), 2);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")),
|
||||
vec![6_000]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")),
|
||||
vec![6_000, 7_000]);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "e")),
|
||||
empty_vec
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "f")),
|
||||
vec![6_000]
|
||||
);
|
||||
assert_eq!(
|
||||
search_term(&searcher, Term::from_field_text(text_field, "g")),
|
||||
vec![6_000, 7_000]
|
||||
);
|
||||
let score_field_reader: U64FastFieldReader = searcher
|
||||
.segment_reader(0)
|
||||
.get_fast_field_reader(score_field)
|
||||
@@ -692,13 +807,12 @@ mod tests {
|
||||
{
|
||||
// Test removing all docs
|
||||
index_writer.delete_term(Term::from_field_text(text_field, "g"));
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
index_writer
|
||||
.merge(&segment_ids)
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
let segment_ids = index.searchable_segment_ids().expect(
|
||||
"Searchable segments failed.",
|
||||
);
|
||||
index_writer.merge(&segment_ids).wait().expect(
|
||||
"Merging failed",
|
||||
);
|
||||
index.load_searchers().unwrap();
|
||||
|
||||
let ref searcher = *index.searcher();
|
||||
|
||||
@@ -44,10 +44,11 @@ pub struct SegmentEntry {
|
||||
|
||||
impl SegmentEntry {
|
||||
/// Create a new `SegmentEntry`
|
||||
pub fn new(segment_meta: SegmentMeta,
|
||||
delete_cursor: DeleteCursor,
|
||||
delete_bitset: Option<BitSet>)
|
||||
-> SegmentEntry {
|
||||
pub fn new(
|
||||
segment_meta: SegmentMeta,
|
||||
delete_cursor: DeleteCursor,
|
||||
delete_bitset: Option<BitSet>,
|
||||
) -> SegmentEntry {
|
||||
SegmentEntry {
|
||||
meta: segment_meta,
|
||||
state: SegmentState::Ready,
|
||||
|
||||
@@ -32,31 +32,36 @@ pub struct SegmentManager {
|
||||
impl Debug for SegmentManager {
|
||||
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
|
||||
let lock = self.read();
|
||||
write!(f,
|
||||
"{{ uncommitted: {:?}, committed: {:?} }}",
|
||||
lock.uncommitted,
|
||||
lock.committed)
|
||||
write!(
|
||||
f,
|
||||
"{{ uncommitted: {:?}, committed: {:?} }}",
|
||||
lock.uncommitted,
|
||||
lock.committed
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_mergeable_segments(segment_manager: &SegmentManager)
|
||||
-> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
|
||||
pub fn get_mergeable_segments(
|
||||
segment_manager: &SegmentManager,
|
||||
) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
|
||||
let registers_lock = segment_manager.read();
|
||||
(registers_lock.committed.get_mergeable_segments(),
|
||||
registers_lock.uncommitted.get_mergeable_segments())
|
||||
(
|
||||
registers_lock.committed.get_mergeable_segments(),
|
||||
registers_lock.uncommitted.get_mergeable_segments(),
|
||||
)
|
||||
}
|
||||
|
||||
impl SegmentManager {
|
||||
pub fn from_segments(segment_metas: Vec<SegmentMeta>,
|
||||
delete_cursor: DeleteCursor)
|
||||
-> SegmentManager {
|
||||
pub fn from_segments(
|
||||
segment_metas: Vec<SegmentMeta>,
|
||||
delete_cursor: DeleteCursor,
|
||||
) -> SegmentManager {
|
||||
SegmentManager {
|
||||
registers: RwLock::new(SegmentRegisters {
|
||||
uncommitted: SegmentRegister::default(),
|
||||
committed: SegmentRegister::new(segment_metas,
|
||||
delete_cursor),
|
||||
writing: HashSet::new(),
|
||||
}),
|
||||
uncommitted: SegmentRegister::default(),
|
||||
committed: SegmentRegister::new(segment_metas, delete_cursor),
|
||||
writing: HashSet::new(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -94,25 +99,24 @@ impl SegmentManager {
|
||||
|
||||
pub fn segment_entry(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
|
||||
let registers = self.read();
|
||||
registers
|
||||
.committed
|
||||
.segment_entry(segment_id)
|
||||
.or_else(|| registers.uncommitted.segment_entry(segment_id))
|
||||
registers.committed.segment_entry(segment_id).or_else(|| {
|
||||
registers.uncommitted.segment_entry(segment_id)
|
||||
})
|
||||
}
|
||||
|
||||
// Lock poisoning should never happen :
|
||||
// The lock is acquired and released within this class,
|
||||
// and the operations cannot panic.
|
||||
fn read(&self) -> RwLockReadGuard<SegmentRegisters> {
|
||||
self.registers
|
||||
.read()
|
||||
.expect("Failed to acquire read lock on SegmentManager.")
|
||||
self.registers.read().expect(
|
||||
"Failed to acquire read lock on SegmentManager.",
|
||||
)
|
||||
}
|
||||
|
||||
fn write(&self) -> RwLockWriteGuard<SegmentRegisters> {
|
||||
self.registers
|
||||
.write()
|
||||
.expect("Failed to acquire write lock on SegmentManager.")
|
||||
self.registers.write().expect(
|
||||
"Failed to acquire write lock on SegmentManager.",
|
||||
)
|
||||
}
|
||||
|
||||
pub fn commit(&self, segment_entries: Vec<SegmentEntry>) {
|
||||
@@ -140,9 +144,11 @@ impl SegmentManager {
|
||||
}
|
||||
|
||||
|
||||
pub fn cancel_merge(&self,
|
||||
before_merge_segment_ids: &[SegmentId],
|
||||
after_merge_segment_id: SegmentId) {
|
||||
pub fn cancel_merge(
|
||||
&self,
|
||||
before_merge_segment_ids: &[SegmentId],
|
||||
after_merge_segment_id: SegmentId,
|
||||
) {
|
||||
|
||||
let mut registers_lock = self.write();
|
||||
|
||||
@@ -150,13 +156,15 @@ impl SegmentManager {
|
||||
{
|
||||
let target_segment_register: &mut SegmentRegister;
|
||||
target_segment_register = {
|
||||
if registers_lock
|
||||
.uncommitted
|
||||
.contains_all(before_merge_segment_ids) {
|
||||
if registers_lock.uncommitted.contains_all(
|
||||
before_merge_segment_ids,
|
||||
)
|
||||
{
|
||||
&mut registers_lock.uncommitted
|
||||
} else if registers_lock
|
||||
.committed
|
||||
.contains_all(before_merge_segment_ids) {
|
||||
} else if registers_lock.committed.contains_all(
|
||||
before_merge_segment_ids,
|
||||
)
|
||||
{
|
||||
&mut registers_lock.committed
|
||||
} else {
|
||||
warn!("couldn't find segment in SegmentManager");
|
||||
@@ -185,23 +193,26 @@ impl SegmentManager {
|
||||
registers_lock.uncommitted.add_segment_entry(segment_entry);
|
||||
}
|
||||
|
||||
pub fn end_merge(&self,
|
||||
before_merge_segment_ids: &[SegmentId],
|
||||
after_merge_segment_entry: SegmentEntry) {
|
||||
pub fn end_merge(
|
||||
&self,
|
||||
before_merge_segment_ids: &[SegmentId],
|
||||
after_merge_segment_entry: SegmentEntry,
|
||||
) {
|
||||
|
||||
let mut registers_lock = self.write();
|
||||
registers_lock
|
||||
.writing
|
||||
.remove(&after_merge_segment_entry.segment_id());
|
||||
registers_lock.writing.remove(&after_merge_segment_entry
|
||||
.segment_id());
|
||||
|
||||
let mut target_register: &mut SegmentRegister = {
|
||||
if registers_lock
|
||||
.uncommitted
|
||||
.contains_all(before_merge_segment_ids) {
|
||||
let target_register: &mut SegmentRegister = {
|
||||
if registers_lock.uncommitted.contains_all(
|
||||
before_merge_segment_ids,
|
||||
)
|
||||
{
|
||||
&mut registers_lock.uncommitted
|
||||
} else if registers_lock
|
||||
.committed
|
||||
.contains_all(before_merge_segment_ids) {
|
||||
} else if registers_lock.committed.contains_all(
|
||||
before_merge_segment_ids,
|
||||
)
|
||||
{
|
||||
&mut registers_lock.committed
|
||||
} else {
|
||||
warn!("couldn't find segment in SegmentManager");
|
||||
|
||||
@@ -24,7 +24,12 @@ impl Debug for SegmentRegister {
|
||||
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
|
||||
try!(write!(f, "SegmentRegister("));
|
||||
for (k, v) in &self.segment_states {
|
||||
try!(write!(f, "{}:{}, ", k.short_uuid_string(), v.state().letter_code()));
|
||||
try!(write!(
|
||||
f,
|
||||
"{}:{}, ",
|
||||
k.short_uuid_string(),
|
||||
v.state().letter_code()
|
||||
));
|
||||
}
|
||||
try!(write!(f, ")"));
|
||||
Ok(())
|
||||
@@ -74,9 +79,9 @@ impl SegmentRegister {
|
||||
}
|
||||
|
||||
pub fn contains_all(&mut self, segment_ids: &[SegmentId]) -> bool {
|
||||
segment_ids
|
||||
.iter()
|
||||
.all(|segment_id| self.segment_states.contains_key(segment_id))
|
||||
segment_ids.iter().all(|segment_id| {
|
||||
self.segment_states.contains_key(segment_id)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn add_segment_entry(&mut self, segment_entry: SegmentEntry) {
|
||||
@@ -91,14 +96,18 @@ impl SegmentRegister {
|
||||
pub fn cancel_merge(&mut self, segment_id: &SegmentId) {
|
||||
self.segment_states
|
||||
.get_mut(segment_id)
|
||||
.expect("Received a merge notification for a segment that is not registered")
|
||||
.expect(
|
||||
"Received a merge notification for a segment that is not registered",
|
||||
)
|
||||
.cancel_merge();
|
||||
}
|
||||
|
||||
pub fn start_merge(&mut self, segment_id: &SegmentId) {
|
||||
self.segment_states
|
||||
.get_mut(segment_id)
|
||||
.expect("Received a merge notification for a segment that is not registered")
|
||||
.expect(
|
||||
"Received a merge notification for a segment that is not registered",
|
||||
)
|
||||
.start_merge();
|
||||
}
|
||||
|
||||
@@ -144,34 +153,42 @@ mod tests {
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
|
||||
segment_register.add_segment_entry(segment_entry);
|
||||
}
|
||||
assert_eq!(segment_register
|
||||
.segment_entry(&segment_id_a)
|
||||
.unwrap()
|
||||
.state(),
|
||||
SegmentState::Ready);
|
||||
assert_eq!(
|
||||
segment_register
|
||||
.segment_entry(&segment_id_a)
|
||||
.unwrap()
|
||||
.state(),
|
||||
SegmentState::Ready
|
||||
);
|
||||
assert_eq!(segment_ids(&segment_register), vec![segment_id_a]);
|
||||
{
|
||||
let segment_meta = SegmentMeta::new(segment_id_b);
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
|
||||
segment_register.add_segment_entry(segment_entry);
|
||||
}
|
||||
assert_eq!(segment_register
|
||||
.segment_entry(&segment_id_b)
|
||||
.unwrap()
|
||||
.state(),
|
||||
SegmentState::Ready);
|
||||
assert_eq!(
|
||||
segment_register
|
||||
.segment_entry(&segment_id_b)
|
||||
.unwrap()
|
||||
.state(),
|
||||
SegmentState::Ready
|
||||
);
|
||||
segment_register.start_merge(&segment_id_a);
|
||||
segment_register.start_merge(&segment_id_b);
|
||||
assert_eq!(segment_register
|
||||
.segment_entry(&segment_id_a)
|
||||
.unwrap()
|
||||
.state(),
|
||||
SegmentState::InMerge);
|
||||
assert_eq!(segment_register
|
||||
.segment_entry(&segment_id_b)
|
||||
.unwrap()
|
||||
.state(),
|
||||
SegmentState::InMerge);
|
||||
assert_eq!(
|
||||
segment_register
|
||||
.segment_entry(&segment_id_a)
|
||||
.unwrap()
|
||||
.state(),
|
||||
SegmentState::InMerge
|
||||
);
|
||||
assert_eq!(
|
||||
segment_register
|
||||
.segment_entry(&segment_id_b)
|
||||
.unwrap()
|
||||
.state(),
|
||||
SegmentState::InMerge
|
||||
);
|
||||
segment_register.remove_segment(&segment_id_a);
|
||||
segment_register.remove_segment(&segment_id_b);
|
||||
{
|
||||
|
||||
@@ -4,8 +4,7 @@ use core::Segment;
|
||||
use core::SegmentComponent;
|
||||
use fastfield::FastFieldSerializer;
|
||||
use store::StoreWriter;
|
||||
use postings::PostingsSerializer;
|
||||
|
||||
use postings::InvertedIndexSerializer;
|
||||
|
||||
/// Segment serializer is in charge of laying out on disk
|
||||
/// the data accumulated and sorted by the `SegmentWriter`.
|
||||
@@ -13,7 +12,7 @@ pub struct SegmentSerializer {
|
||||
store_writer: StoreWriter,
|
||||
fast_field_serializer: FastFieldSerializer,
|
||||
fieldnorms_serializer: FastFieldSerializer,
|
||||
postings_serializer: PostingsSerializer,
|
||||
postings_serializer: InvertedIndexSerializer,
|
||||
}
|
||||
|
||||
impl SegmentSerializer {
|
||||
@@ -22,22 +21,22 @@ impl SegmentSerializer {
|
||||
let store_write = try!(segment.open_write(SegmentComponent::STORE));
|
||||
|
||||
let fast_field_write = try!(segment.open_write(SegmentComponent::FASTFIELDS));
|
||||
let fast_field_serializer = try!(FastFieldSerializer::new(fast_field_write));
|
||||
let fast_field_serializer = try!(FastFieldSerializer::from_write(fast_field_write));
|
||||
|
||||
let fieldnorms_write = try!(segment.open_write(SegmentComponent::FIELDNORMS));
|
||||
let fieldnorms_serializer = try!(FastFieldSerializer::new(fieldnorms_write));
|
||||
let fieldnorms_serializer = try!(FastFieldSerializer::from_write(fieldnorms_write));
|
||||
|
||||
let postings_serializer = try!(PostingsSerializer::open(segment));
|
||||
let postings_serializer = try!(InvertedIndexSerializer::open(segment));
|
||||
Ok(SegmentSerializer {
|
||||
postings_serializer: postings_serializer,
|
||||
store_writer: StoreWriter::new(store_write),
|
||||
fast_field_serializer: fast_field_serializer,
|
||||
fieldnorms_serializer: fieldnorms_serializer,
|
||||
})
|
||||
postings_serializer: postings_serializer,
|
||||
store_writer: StoreWriter::new(store_write),
|
||||
fast_field_serializer: fast_field_serializer,
|
||||
fieldnorms_serializer: fieldnorms_serializer,
|
||||
})
|
||||
}
|
||||
|
||||
/// Accessor to the `PostingsSerializer`.
|
||||
pub fn get_postings_serializer(&mut self) -> &mut PostingsSerializer {
|
||||
pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer {
|
||||
&mut self.postings_serializer
|
||||
}
|
||||
|
||||
|
||||
@@ -61,11 +61,12 @@ pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) -
|
||||
/// and flushed.
|
||||
///
|
||||
/// This method is not part of tantivy's public API
|
||||
pub fn save_metas(segment_metas: Vec<SegmentMeta>,
|
||||
schema: Schema,
|
||||
opstamp: u64,
|
||||
directory: &mut Directory)
|
||||
-> Result<()> {
|
||||
pub fn save_metas(
|
||||
segment_metas: Vec<SegmentMeta>,
|
||||
schema: Schema,
|
||||
opstamp: u64,
|
||||
directory: &mut Directory,
|
||||
) -> Result<()> {
|
||||
let metas = IndexMeta {
|
||||
segments: segment_metas,
|
||||
schema: schema,
|
||||
@@ -89,11 +90,12 @@ pub struct SegmentUpdater(Arc<InnerSegmentUpdater>);
|
||||
|
||||
|
||||
|
||||
fn perform_merge(segment_ids: &[SegmentId],
|
||||
segment_updater: &SegmentUpdater,
|
||||
mut merged_segment: Segment,
|
||||
target_opstamp: u64)
|
||||
-> Result<SegmentEntry> {
|
||||
fn perform_merge(
|
||||
segment_ids: &[SegmentId],
|
||||
segment_updater: &SegmentUpdater,
|
||||
mut merged_segment: Segment,
|
||||
target_opstamp: u64,
|
||||
) -> Result<SegmentEntry> {
|
||||
// first we need to apply deletes to our segment.
|
||||
info!("Start merge: {:?}", segment_ids);
|
||||
|
||||
@@ -105,17 +107,21 @@ fn perform_merge(segment_ids: &[SegmentId],
|
||||
|
||||
for segment_id in segment_ids {
|
||||
if let Some(mut segment_entry) =
|
||||
segment_updater.0.segment_manager.segment_entry(segment_id) {
|
||||
segment_updater.0.segment_manager.segment_entry(segment_id)
|
||||
{
|
||||
let segment = index.segment(segment_entry.meta().clone());
|
||||
if let Some(file_protection) =
|
||||
advance_deletes(segment, &mut segment_entry, target_opstamp)? {
|
||||
advance_deletes(segment, &mut segment_entry, target_opstamp)?
|
||||
{
|
||||
file_protections.push(file_protection);
|
||||
}
|
||||
segment_entries.push(segment_entry);
|
||||
} else {
|
||||
error!("Error, had to abort merge as some of the segments are not managed anymore.");
|
||||
let msg = format!("Segment {:?} requested for merge is not managed.",
|
||||
segment_id);
|
||||
let msg = format!(
|
||||
"Segment {:?} requested for merge is not managed.",
|
||||
segment_id
|
||||
);
|
||||
bail!(ErrorKind::InvalidArgument(msg));
|
||||
}
|
||||
}
|
||||
@@ -134,12 +140,13 @@ fn perform_merge(segment_ids: &[SegmentId],
|
||||
// ... we just serialize this index merger in our new segment
|
||||
// to merge the two segments.
|
||||
|
||||
let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment)
|
||||
.expect("Creating index serializer failed");
|
||||
let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment).expect(
|
||||
"Creating index serializer failed",
|
||||
);
|
||||
|
||||
let num_docs = merger
|
||||
.write(segment_serializer)
|
||||
.expect("Serializing merged index failed");
|
||||
let num_docs = merger.write(segment_serializer).expect(
|
||||
"Serializing merged index failed",
|
||||
);
|
||||
let mut segment_meta = SegmentMeta::new(merged_segment.id());
|
||||
segment_meta.set_max_doc(num_docs);
|
||||
|
||||
@@ -161,23 +168,24 @@ struct InnerSegmentUpdater {
|
||||
}
|
||||
|
||||
impl SegmentUpdater {
|
||||
pub fn new(index: Index,
|
||||
stamper: Stamper,
|
||||
delete_cursor: DeleteCursor)
|
||||
-> Result<SegmentUpdater> {
|
||||
pub fn new(
|
||||
index: Index,
|
||||
stamper: Stamper,
|
||||
delete_cursor: DeleteCursor,
|
||||
) -> Result<SegmentUpdater> {
|
||||
let segments = index.searchable_segment_metas()?;
|
||||
let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
|
||||
Ok(SegmentUpdater(Arc::new(InnerSegmentUpdater {
|
||||
pool: CpuPool::new(1),
|
||||
index: index,
|
||||
segment_manager: segment_manager,
|
||||
merge_policy: RwLock::new(box DefaultMergePolicy::default()),
|
||||
merging_thread_id: AtomicUsize::default(),
|
||||
merging_threads: RwLock::new(HashMap::new()),
|
||||
generation: AtomicUsize::default(),
|
||||
killed: AtomicBool::new(false),
|
||||
stamper: stamper,
|
||||
})))
|
||||
pool: CpuPool::new(1),
|
||||
index: index,
|
||||
segment_manager: segment_manager,
|
||||
merge_policy: RwLock::new(box DefaultMergePolicy::default()),
|
||||
merging_thread_id: AtomicUsize::default(),
|
||||
merging_threads: RwLock::new(HashMap::new()),
|
||||
generation: AtomicUsize::default(),
|
||||
killed: AtomicBool::new(false),
|
||||
stamper: stamper,
|
||||
})))
|
||||
}
|
||||
|
||||
pub fn new_segment(&self) -> Segment {
|
||||
@@ -199,10 +207,10 @@ impl SegmentUpdater {
|
||||
self.0.merging_thread_id.fetch_add(1, Ordering::SeqCst)
|
||||
}
|
||||
|
||||
fn run_async<T: 'static + Send, F: 'static + Send + FnOnce(SegmentUpdater) -> T>
|
||||
(&self,
|
||||
f: F)
|
||||
-> CpuFuture<T, Error> {
|
||||
fn run_async<T: 'static + Send, F: 'static + Send + FnOnce(SegmentUpdater) -> T>(
|
||||
&self,
|
||||
f: F,
|
||||
) -> CpuFuture<T, Error> {
|
||||
let me_clone = self.clone();
|
||||
self.0.pool.spawn_fn(move || Ok(f(me_clone)))
|
||||
}
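// Minimal sketch (not tantivy code) of the futures-cpupool pattern that
// `run_async` relies on, assuming the futures 0.1 / futures-cpupool 0.1 crates
// of that era: the closure is queued on the pool and the caller gets a
// `CpuFuture` it can `.wait()` on or `.forget()`.
use futures::Future;
use futures_cpupool::CpuPool;

fn cpupool_demo() {
    let pool = CpuPool::new(1);
    let task = pool.spawn_fn(|| -> Result<u32, ()> { Ok(41 + 1) });
    assert_eq!(task.wait(), Ok(42));
}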
@@ -211,11 +219,10 @@ impl SegmentUpdater {
|
||||
pub fn add_segment(&self, generation: usize, segment_entry: SegmentEntry) -> bool {
|
||||
if generation >= self.0.generation.load(Ordering::Acquire) {
|
||||
self.run_async(|segment_updater| {
|
||||
segment_updater.0.segment_manager.add_segment(segment_entry);
|
||||
segment_updater.consider_merge_options();
|
||||
true
|
||||
})
|
||||
.forget();
|
||||
segment_updater.0.segment_manager.add_segment(segment_entry);
|
||||
segment_updater.consider_merge_options();
|
||||
true
|
||||
}).forget();
|
||||
true
|
||||
} else {
|
||||
false
|
||||
@@ -249,46 +256,46 @@ impl SegmentUpdater {
|
||||
if self.is_alive() {
|
||||
let index = &self.0.index;
|
||||
let directory = index.directory();
|
||||
save_metas(self.0.segment_manager.committed_segment_metas(),
|
||||
index.schema(),
|
||||
opstamp,
|
||||
directory.box_clone().borrow_mut())
|
||||
.expect("Could not save metas.");
|
||||
save_metas(
|
||||
self.0.segment_manager.committed_segment_metas(),
|
||||
index.schema(),
|
||||
opstamp,
|
||||
directory.box_clone().borrow_mut(),
|
||||
).expect("Could not save metas.");
|
||||
}
|
||||
}

pub fn garbage_collect_files(&self) -> Result<()> {
self.run_async(move |segment_updater| {
segment_updater.garbage_collect_files_exec();
})
.wait()
}).wait()
}

fn garbage_collect_files_exec(&self) {
info!("Running garbage collection");
let mut index = self.0.index.clone();
index.directory_mut().garbage_collect(|| {
self.0.segment_manager.list_files()
});
index.directory_mut().garbage_collect(
|| self.0.segment_manager.list_files(),
);
}
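// Illustrative sketch (hypothetical helper, not the `Directory` API) of the
// "list the living files, delete everything else" garbage collection performed
// above with `segment_manager.list_files()`.
use std::collections::HashSet;
use std::fs;
use std::io;
use std::path::{Path, PathBuf};

fn sweep_dead_files(dir: &Path, living_files: &HashSet<PathBuf>) -> io::Result<()> {
    for entry in fs::read_dir(dir)? {
        let path = entry?.path();
        if path.is_file() && !living_files.contains(&path) {
            // best-effort removal; a real implementation would log failures
            let _ = fs::remove_file(&path);
        }
    }
    Ok(())
}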
pub fn commit(&self, opstamp: u64) -> Result<()> {
|
||||
self.run_async(move |segment_updater| if segment_updater.is_alive() {
|
||||
let segment_entries = segment_updater
|
||||
.purge_deletes(opstamp)
|
||||
.expect("Failed purge deletes");
|
||||
segment_updater.0.segment_manager.commit(segment_entries);
|
||||
segment_updater.save_metas(opstamp);
|
||||
segment_updater.garbage_collect_files_exec();
|
||||
segment_updater.consider_merge_options();
|
||||
})
|
||||
.wait()
|
||||
let segment_entries = segment_updater.purge_deletes(opstamp).expect(
|
||||
"Failed purge deletes",
|
||||
);
|
||||
segment_updater.0.segment_manager.commit(segment_entries);
|
||||
segment_updater.save_metas(opstamp);
|
||||
segment_updater.garbage_collect_files_exec();
|
||||
segment_updater.consider_merge_options();
|
||||
}).wait()
|
||||
}
|
||||
|
||||
|
||||
pub fn start_merge(&self,
|
||||
segment_ids: &[SegmentId])
|
||||
-> impl Future<Item = SegmentMeta, Error = Canceled> {
|
||||
pub fn start_merge(
|
||||
&self,
|
||||
segment_ids: &[SegmentId],
|
||||
) -> impl Future<Item = SegmentMeta, Error = Canceled> {
|
||||
|
||||
self.0.segment_manager.start_merge(segment_ids);
|
||||
let segment_updater_clone = self.clone();
|
||||
@@ -308,10 +315,12 @@ impl SegmentUpdater {
|
||||
// first we need to apply deletes to our segment.
|
||||
let merged_segment = segment_updater_clone.new_segment();
|
||||
let merged_segment_id = merged_segment.id();
|
||||
let merge_result = perform_merge(&segment_ids_vec,
|
||||
&segment_updater_clone,
|
||||
merged_segment,
|
||||
target_opstamp);
|
||||
let merge_result = perform_merge(
|
||||
&segment_ids_vec,
|
||||
&segment_updater_clone,
|
||||
merged_segment,
|
||||
target_opstamp,
|
||||
);
|
||||
|
||||
match merge_result {
|
||||
Ok(after_merge_segment_entry) => {
|
||||
@@ -345,11 +354,10 @@ impl SegmentUpdater {
|
||||
.remove(&merging_thread_id);
|
||||
Ok(())
|
||||
});
|
||||
self.0
|
||||
.merging_threads
|
||||
.write()
|
||||
.unwrap()
|
||||
.insert(merging_thread_id, merging_join_handle);
|
||||
self.0.merging_threads.write().unwrap().insert(
|
||||
merging_thread_id,
|
||||
merging_join_handle,
|
||||
);
|
||||
merging_future_recv
|
||||
}
|
||||
|
||||
@@ -368,19 +376,23 @@ impl SegmentUpdater {
|
||||
}
|
||||
}
|
||||
|
||||
fn cancel_merge(&self,
|
||||
before_merge_segment_ids: &[SegmentId],
|
||||
after_merge_segment_entry: SegmentId) {
|
||||
self.0
|
||||
.segment_manager
|
||||
.cancel_merge(before_merge_segment_ids, after_merge_segment_entry);
|
||||
fn cancel_merge(
|
||||
&self,
|
||||
before_merge_segment_ids: &[SegmentId],
|
||||
after_merge_segment_entry: SegmentId,
|
||||
) {
|
||||
self.0.segment_manager.cancel_merge(
|
||||
before_merge_segment_ids,
|
||||
after_merge_segment_entry,
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
fn end_merge(&self,
|
||||
before_merge_segment_ids: Vec<SegmentId>,
|
||||
mut after_merge_segment_entry: SegmentEntry)
|
||||
-> Result<()> {
|
||||
fn end_merge(
|
||||
&self,
|
||||
before_merge_segment_ids: Vec<SegmentId>,
|
||||
mut after_merge_segment_entry: SegmentEntry,
|
||||
) -> Result<()> {
|
||||
|
||||
self.run_async(move |segment_updater| {
|
||||
info!("End merge {:?}", after_merge_segment_entry.meta());
|
||||
@@ -391,28 +403,37 @@ impl SegmentUpdater {
|
||||
if delete_operation.opstamp < committed_opstamp {
|
||||
let index = &segment_updater.0.index;
|
||||
let segment = index.segment(after_merge_segment_entry.meta().clone());
|
||||
match advance_deletes(segment,
|
||||
&mut after_merge_segment_entry,
|
||||
committed_opstamp) {
|
||||
match advance_deletes(
|
||||
segment,
|
||||
&mut after_merge_segment_entry,
|
||||
committed_opstamp,
|
||||
) {
|
||||
Ok(file_protection_opt_res) => {
|
||||
_file_protection_opt = file_protection_opt_res;
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Merge of {:?} was cancelled (advancing deletes failed): {:?}",
|
||||
before_merge_segment_ids, e);
|
||||
error!(
|
||||
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
|
||||
before_merge_segment_ids,
|
||||
e
|
||||
);
|
||||
// ... cancel merge
|
||||
if cfg!(test) {
|
||||
panic!("Merge failed.");
|
||||
}
|
||||
segment_updater.cancel_merge(&before_merge_segment_ids,
|
||||
after_merge_segment_entry.segment_id());
|
||||
segment_updater.cancel_merge(
|
||||
&before_merge_segment_ids,
|
||||
after_merge_segment_entry.segment_id(),
|
||||
);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
segment_updater.0.segment_manager.end_merge(&before_merge_segment_ids,
|
||||
after_merge_segment_entry);
|
||||
segment_updater.0.segment_manager.end_merge(
|
||||
&before_merge_segment_ids,
|
||||
after_merge_segment_entry,
|
||||
);
|
||||
segment_updater.consider_merge_options();
|
||||
info!("save metas");
|
||||
segment_updater.save_metas(segment_updater.0.index.opstamp());
|
||||
@@ -450,10 +471,9 @@ impl SegmentUpdater {
|
||||
}
|
||||
debug!("wait merging thread {}", new_merging_threads.len());
|
||||
for (_, merging_thread_handle) in new_merging_threads {
|
||||
merging_thread_handle
|
||||
.join()
|
||||
.map(|_| ())
|
||||
.map_err(|_| ErrorKind::ErrorInThread("Merging thread failed.".into()))?;
|
||||
merging_thread_handle.join().map(|_| ()).map_err(|_| {
|
||||
ErrorKind::ErrorInThread("Merging thread failed.".into())
|
||||
})?;
|
||||
}
|
||||
// Our merging thread may have queued their completed
|
||||
self.run_async(move |_| {}).wait()?;
|
||||
@@ -522,9 +542,9 @@ mod tests {
|
||||
assert_eq!(index.searcher().num_docs(), 302);
|
||||
|
||||
{
|
||||
index_writer
|
||||
.wait_merging_threads()
|
||||
.expect("waiting for merging threads");
|
||||
index_writer.wait_merging_threads().expect(
|
||||
"waiting for merging threads",
|
||||
);
|
||||
}
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
|
||||
@@ -54,22 +54,23 @@ impl<'a> SegmentWriter<'a> {
|
||||
/// the flushing behavior as a buffer limit
|
||||
/// - segment: The segment being written
|
||||
/// - schema
|
||||
pub fn for_segment(heap: &'a Heap,
|
||||
table_bits: usize,
|
||||
mut segment: Segment,
|
||||
schema: &Schema)
|
||||
-> Result<SegmentWriter<'a>> {
|
||||
pub fn for_segment(
|
||||
heap: &'a Heap,
|
||||
table_bits: usize,
|
||||
mut segment: Segment,
|
||||
schema: &Schema,
|
||||
) -> Result<SegmentWriter<'a>> {
|
||||
let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
|
||||
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits, heap);
|
||||
Ok(SegmentWriter {
|
||||
heap: heap,
|
||||
max_doc: 0,
|
||||
multifield_postings: multifield_postings,
|
||||
fieldnorms_writer: create_fieldnorms_writer(schema),
|
||||
segment_serializer: segment_serializer,
|
||||
fast_field_writers: FastFieldsWriter::from_schema(schema),
|
||||
doc_opstamps: Vec::with_capacity(1_000),
|
||||
})
|
||||
heap: heap,
|
||||
max_doc: 0,
|
||||
multifield_postings: multifield_postings,
|
||||
fieldnorms_writer: create_fieldnorms_writer(schema),
|
||||
segment_serializer: segment_serializer,
|
||||
fast_field_writers: FastFieldsWriter::from_schema(schema),
|
||||
doc_opstamps: Vec::with_capacity(1_000),
|
||||
})
|
||||
}
|
||||
|
||||
/// Lay on disk the current content of the `SegmentWriter`
|
||||
@@ -77,10 +78,12 @@ impl<'a> SegmentWriter<'a> {
|
||||
/// Finalize consumes the `SegmentWriter`, so that it cannot
|
||||
/// be used afterwards.
|
||||
pub fn finalize(self) -> Result<Vec<u64>> {
|
||||
write(&self.multifield_postings,
|
||||
&self.fast_field_writers,
|
||||
&self.fieldnorms_writer,
|
||||
self.segment_serializer)?;
|
||||
write(
|
||||
&self.multifield_postings,
|
||||
&self.fast_field_writers,
|
||||
&self.fieldnorms_writer,
|
||||
self.segment_serializer,
|
||||
)?;
|
||||
Ok(self.doc_opstamps)
|
||||
}
|
||||
|
||||
@@ -107,10 +110,11 @@ impl<'a> SegmentWriter<'a> {
|
||||
/// Indexes a new document
|
||||
///
|
||||
/// As a user, you should rather use `IndexWriter`'s add_document.
|
||||
pub fn add_document(&mut self,
|
||||
add_operation: &AddOperation,
|
||||
schema: &Schema)
|
||||
-> io::Result<()> {
|
||||
pub fn add_document(
|
||||
&mut self,
|
||||
add_operation: &AddOperation,
|
||||
schema: &Schema,
|
||||
) -> io::Result<()> {
|
||||
let doc_id = self.max_doc;
|
||||
let doc = &add_operation.document;
|
||||
self.doc_opstamps.push(add_operation.opstamp);
|
||||
@@ -122,8 +126,11 @@ impl<'a> SegmentWriter<'a> {
|
||||
match *field_options.field_type() {
|
||||
FieldType::Str(ref text_options) => {
|
||||
let num_tokens: u32 = if text_options.get_indexing_options().is_tokenized() {
|
||||
self.multifield_postings
|
||||
.index_text(doc_id, field, &field_values)
|
||||
self.multifield_postings.index_text(
|
||||
doc_id,
|
||||
field,
|
||||
&field_values,
|
||||
)
|
||||
} else {
|
||||
let num_field_values = field_values.len() as u32;
|
||||
for field_value in field_values {
|
||||
@@ -132,15 +139,17 @@ impl<'a> SegmentWriter<'a> {
|
||||
}
|
||||
num_field_values
|
||||
};
|
||||
self.fieldnorms_writer
|
||||
.get_field_writer(field)
|
||||
.map(|field_norms_writer| field_norms_writer.add_val(num_tokens as u64));
|
||||
self.fieldnorms_writer.get_field_writer(field).map(
|
||||
|field_norms_writer| field_norms_writer.add_val(num_tokens as u64),
|
||||
);
|
||||
}
|
||||
FieldType::U64(ref int_option) => {
|
||||
if int_option.is_indexed() {
|
||||
for field_value in field_values {
|
||||
let term = Term::from_field_u64(field_value.field(),
|
||||
field_value.value().u64_value());
|
||||
let term = Term::from_field_u64(
|
||||
field_value.field(),
|
||||
field_value.value().u64_value(),
|
||||
);
|
||||
self.multifield_postings.suscribe(doc_id, &term);
|
||||
}
|
||||
}
|
||||
@@ -148,8 +157,10 @@ impl<'a> SegmentWriter<'a> {
|
||||
FieldType::I64(ref int_option) => {
|
||||
if int_option.is_indexed() {
|
||||
for field_value in field_values {
|
||||
let term = Term::from_field_i64(field_value.field(),
|
||||
field_value.value().i64_value());
|
||||
let term = Term::from_field_i64(
|
||||
field_value.field(),
|
||||
field_value.value().i64_value(),
|
||||
);
|
||||
self.multifield_postings.suscribe(doc_id, &term);
|
||||
}
|
||||
}
|
||||
@@ -160,7 +171,9 @@ impl<'a> SegmentWriter<'a> {
|
||||
self.fast_field_writers.add_document(doc);
|
||||
let stored_fieldvalues: Vec<&FieldValue> = doc.field_values()
|
||||
.iter()
|
||||
.filter(|field_value| schema.get_field_entry(field_value.field()).is_stored())
|
||||
.filter(|field_value| {
|
||||
schema.get_field_entry(field_value.field()).is_stored()
|
||||
})
|
||||
.collect();
|
||||
let doc_writer = self.segment_serializer.get_store_writer();
|
||||
try!(doc_writer.store(&stored_fieldvalues));
|
||||
@@ -191,15 +204,22 @@ impl<'a> SegmentWriter<'a> {
|
||||
}
|
||||
|
||||
// This method is used as a trick to workaround the borrow checker
|
||||
fn write(multifield_postings: &MultiFieldPostingsWriter,
|
||||
fast_field_writers: &FastFieldsWriter,
|
||||
fieldnorms_writer: &FastFieldsWriter,
|
||||
mut serializer: SegmentSerializer)
|
||||
-> Result<()> {
|
||||
fn write(
|
||||
multifield_postings: &MultiFieldPostingsWriter,
|
||||
fast_field_writers: &FastFieldsWriter,
|
||||
fieldnorms_writer: &FastFieldsWriter,
|
||||
mut serializer: SegmentSerializer,
|
||||
) -> Result<()> {
|
||||
|
||||
try!(multifield_postings.serialize(serializer.get_postings_serializer()));
|
||||
try!(fast_field_writers.serialize(serializer.get_fast_field_serializer()));
|
||||
try!(fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer()));
|
||||
try!(multifield_postings.serialize(
|
||||
serializer.get_postings_serializer(),
|
||||
));
|
||||
try!(fast_field_writers.serialize(
|
||||
serializer.get_fast_field_serializer(),
|
||||
));
|
||||
try!(fieldnorms_writer.serialize(
|
||||
serializer.get_fieldnorms_serializer(),
|
||||
));
|
||||
try!(serializer.close());
|
||||
|
||||
Ok(())
|
||||
@@ -208,10 +228,12 @@ fn write(multifield_postings: &MultiFieldPostingsWriter,
|
||||
impl<'a> SerializableSegment for SegmentWriter<'a> {
|
||||
fn write(&self, serializer: SegmentSerializer) -> Result<u32> {
|
||||
let max_doc = self.max_doc;
|
||||
write(&self.multifield_postings,
|
||||
&self.fast_field_writers,
|
||||
&self.fieldnorms_writer,
|
||||
serializer)?;
|
||||
write(
|
||||
&self.multifield_postings,
|
||||
&self.fast_field_writers,
|
||||
&self.fieldnorms_writer,
|
||||
serializer,
|
||||
)?;
|
||||
Ok(max_doc)
|
||||
}
|
||||
}
|
||||
|
||||
114
src/lib.rs
@@ -68,7 +68,7 @@ extern crate stable_deref_trait;
|
||||
#[cfg(test)]
|
||||
extern crate env_logger;
|
||||
|
||||
#[cfg(feature="simdcompression")]
|
||||
#[cfg(feature = "simdcompression")]
|
||||
extern crate libc;
|
||||
|
||||
#[cfg(windows)]
|
||||
@@ -98,6 +98,8 @@ mod core;
|
||||
mod compression;
|
||||
mod indexer;
|
||||
mod common;
|
||||
|
||||
#[allow(unused_doc_comment)]
|
||||
mod error;
|
||||
mod analyzer;
|
||||
mod datastruct;
|
||||
@@ -116,7 +118,7 @@ pub use directory::Directory;
|
||||
pub use core::{Index, Segment, SegmentId, SegmentMeta, Searcher};
|
||||
pub use indexer::IndexWriter;
|
||||
pub use schema::{Term, Document};
|
||||
pub use core::SegmentReader;
|
||||
pub use core::{SegmentReader, InvertedIndexReader};
|
||||
pub use self::common::TimerTree;
|
||||
|
||||
pub use postings::DocSet;
|
||||
@@ -254,7 +256,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_docfreq() {
|
||||
fn test_docfreq1() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
@@ -293,7 +295,6 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_fieldnorm() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
@@ -382,15 +383,24 @@ mod tests {
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none());
|
||||
let inverted_index = reader.inverted_index(text_field);
|
||||
assert!(
|
||||
inverted_index
|
||||
.read_postings(&term_abcd, FreqAndPositions)
|
||||
.is_none()
|
||||
);
|
||||
{
|
||||
let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap();
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_a, FreqAndPositions)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 5);
|
||||
assert!(!postings.advance());
|
||||
}
|
||||
{
|
||||
let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap();
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_b, FreqAndPositions)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 3);
|
||||
assert!(postings.advance());
|
||||
@@ -416,16 +426,25 @@ mod tests {
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let inverted_index = reader.inverted_index(term_abcd.field());
|
||||
|
||||
assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none());
|
||||
assert!(
|
||||
inverted_index
|
||||
.read_postings(&term_abcd, FreqAndPositions)
|
||||
.is_none()
|
||||
);
|
||||
{
|
||||
let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap();
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_a, FreqAndPositions)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 5);
|
||||
assert!(!postings.advance());
|
||||
}
|
||||
{
|
||||
let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap();
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_b, FreqAndPositions)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 3);
|
||||
assert!(postings.advance());
|
||||
@@ -451,13 +470,22 @@ mod tests {
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none());
|
||||
let inverted_index = reader.inverted_index(term_abcd.field());
|
||||
assert!(
|
||||
inverted_index
|
||||
.read_postings(&term_abcd, FreqAndPositions)
|
||||
.is_none()
|
||||
);
|
||||
{
|
||||
let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap();
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_a, FreqAndPositions)
|
||||
.unwrap();
|
||||
assert!(!postings.advance());
|
||||
}
|
||||
{
|
||||
let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap();
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_b, FreqAndPositions)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 3);
|
||||
assert!(postings.advance());
|
||||
@@ -465,7 +493,9 @@ mod tests {
|
||||
assert!(!postings.advance());
|
||||
}
|
||||
{
|
||||
let mut postings = reader.read_postings(&term_c, FreqAndPositions).unwrap();
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_c, FreqAndPositions)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 4);
|
||||
assert!(!postings.advance());
|
||||
@@ -489,6 +519,7 @@ mod tests {
|
||||
let term = Term::from_field_u64(field, 1u64);
|
||||
let mut postings = searcher
|
||||
.segment_reader(0)
|
||||
.inverted_index(term.field())
|
||||
.read_postings(&term, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
@@ -512,6 +543,7 @@ mod tests {
|
||||
let term = Term::from_field_i64(value_field, negative_val);
|
||||
let mut postings = searcher
|
||||
.segment_reader(0)
|
||||
.inverted_index(term.field())
|
||||
.read_postings(&term, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
@@ -574,10 +606,17 @@ mod tests {
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let inverted_index = reader.inverted_index(text_field);
|
||||
let term_abcd = Term::from_field_text(text_field, "abcd");
|
||||
assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none());
|
||||
assert!(
|
||||
inverted_index
|
||||
.read_postings(&term_abcd, FreqAndPositions)
|
||||
.is_none()
|
||||
);
|
||||
let term_af = Term::from_field_text(text_field, "af");
|
||||
let mut postings = reader.read_postings(&term_af, FreqAndPositions).unwrap();
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_af, FreqAndPositions)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 0);
|
||||
assert_eq!(postings.term_freq(), 3);
|
||||
@@ -619,29 +658,43 @@ mod tests {
|
||||
collector.docs()
|
||||
};
|
||||
{
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
|
||||
vec![1, 2]);
|
||||
assert_eq!(
|
||||
get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
|
||||
vec![1, 2]
|
||||
);
|
||||
}
|
||||
{
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
|
||||
vec![0]);
|
||||
assert_eq!(
|
||||
get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
|
||||
vec![0]
|
||||
);
|
||||
}
|
||||
{
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
|
||||
vec![0, 1, 2]);
|
||||
assert_eq!(
|
||||
get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
|
||||
vec![0, 1, 2]
|
||||
);
|
||||
}
|
||||
{
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "c")]),
|
||||
vec![1, 2]);
|
||||
assert_eq!(
|
||||
get_doc_ids(vec![Term::from_field_text(text_field, "c")]),
|
||||
vec![1, 2]
|
||||
);
|
||||
}
|
||||
{
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "d")]),
|
||||
vec![2]);
|
||||
assert_eq!(
|
||||
get_doc_ids(vec![Term::from_field_text(text_field, "d")]),
|
||||
vec![2]
|
||||
);
|
||||
}
|
||||
{
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b"),
|
||||
Term::from_field_text(text_field, "a")]),
|
||||
vec![0, 1, 2]);
|
||||
assert_eq!(
|
||||
get_doc_ids(vec![
|
||||
Term::from_field_text(text_field, "b"),
|
||||
Term::from_field_text(text_field, "a"),
|
||||
]),
|
||||
vec![0, 1, 2]
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -678,7 +731,8 @@ mod tests {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let other_text_field = schema_builder.add_text_field("text2", TEXT);
|
||||
let document = doc!(text_field => "tantivy",
|
||||
let document =
|
||||
doc!(text_field => "tantivy",
|
||||
text_field => "some other value",
|
||||
other_text_field => "short");
|
||||
assert_eq!(document.len(), 3);
|
||||
|
||||
@@ -52,6 +52,33 @@ pub trait DocSet {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Fills a given mutable buffer with the next doc ids from the
/// `DocSet`.
///
/// If that many `DocId`s are available, the method should
/// fill the entire buffer and return the length of the buffer.
///
/// If we reach the end of the `DocSet` before filling
/// it entirely, then the buffer is filled up to this point, and
/// the return value is the number of elements that were filled.
///
/// # Warning
///
/// This method is only here for specific high-performance
/// use cases where batching matters. The normal way to
/// iterate through the `DocId`s is to call `.advance()`.
fn fill_buffer(&mut self, buffer: &mut [DocId]) -> usize {
|
||||
for (i, buffer_val) in buffer.iter_mut().enumerate() {
|
||||
if self.advance() {
|
||||
*buffer_val = self.doc();
|
||||
} else {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return buffer.len();
|
||||
}
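A minimal usage sketch for `fill_buffer` (not part of the diff; the `drain_in_batches` helper, the buffer size of 128, and the `process` closure are illustrative assumptions, and `DocId` is assumed to be a plain integer alias as elsewhere in tantivy):

fn drain_in_batches<D: DocSet>(docset: &mut D, mut process: impl FnMut(&[DocId])) {
    // Keep pulling full buffers until fill_buffer reports a short read,
    // which signals that the DocSet is exhausted.
    let mut buffer = [0 as DocId; 128];
    loop {
        let filled = docset.fill_buffer(&mut buffer);
        process(&buffer[..filled]);
        if filled < buffer.len() {
            break;
        }
    }
}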
|
||||
/// Returns the current document
|
||||
fn doc(&self) -> DocId;
|
||||
|
||||
|
||||
@@ -1,125 +0,0 @@
|
||||
use compression::BlockDecoder;
|
||||
use common::VInt;
|
||||
use common::BinarySerializable;
|
||||
use compression::{CompositeDecoder, VIntDecoder};
|
||||
use postings::SegmentPostingsOption;
|
||||
use compression::NUM_DOCS_PER_BLOCK;
|
||||
|
||||
|
||||
/// `FreqHandler` is in charge of decompressing
|
||||
/// frequencies and/or positions.
|
||||
pub struct FreqHandler {
|
||||
freq_decoder: BlockDecoder,
|
||||
positions: Vec<u32>,
|
||||
option: SegmentPostingsOption,
|
||||
positions_offsets: [usize; NUM_DOCS_PER_BLOCK + 1],
|
||||
}
|
||||
|
||||
|
||||
fn read_positions(data: &[u8]) -> Vec<u32> {
|
||||
let mut composite_reader = CompositeDecoder::new();
|
||||
let mut readable: &[u8] = data;
|
||||
let uncompressed_len = VInt::deserialize(&mut readable).unwrap().0 as usize;
|
||||
composite_reader.uncompress_unsorted(readable, uncompressed_len);
|
||||
composite_reader.into()
|
||||
}
|
||||
|
||||
|
||||
|
||||
impl FreqHandler {
|
||||
/// Returns a `FreqHandler` that just decodes `DocId`s.
|
||||
pub fn new_without_freq() -> FreqHandler {
|
||||
FreqHandler {
|
||||
freq_decoder: BlockDecoder::with_val(1u32),
|
||||
positions: Vec::new(),
|
||||
option: SegmentPostingsOption::NoFreq,
|
||||
positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1],
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a `FreqHandler` that decodes `DocId`s and term frequencies.
|
||||
pub fn new_with_freq() -> FreqHandler {
|
||||
FreqHandler {
|
||||
freq_decoder: BlockDecoder::new(),
|
||||
positions: Vec::new(),
|
||||
option: SegmentPostingsOption::Freq,
|
||||
positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1],
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a `FreqHandler` that decodes `DocId`s, term frequencies, and term positions.
|
||||
pub fn new_with_freq_and_position(position_data: &[u8]) -> FreqHandler {
|
||||
let positions = read_positions(position_data);
|
||||
FreqHandler {
|
||||
freq_decoder: BlockDecoder::new(),
|
||||
positions: positions,
|
||||
option: SegmentPostingsOption::FreqAndPositions,
|
||||
positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1],
|
||||
}
|
||||
}
|
||||
|
||||
fn fill_positions_offset(&mut self) {
|
||||
let mut cur_position: usize = self.positions_offsets[NUM_DOCS_PER_BLOCK];
|
||||
let mut i: usize = 0;
|
||||
self.positions_offsets[i] = cur_position;
|
||||
let mut last_cur_position = cur_position;
|
||||
for &doc_freq in self.freq_decoder.output_array() {
|
||||
i += 1;
|
||||
let mut cumulated_pos = 0u32;
|
||||
// this next loop decodes delta positions into normal positions.
|
||||
for j in last_cur_position..(last_cur_position + (doc_freq as usize)) {
|
||||
cumulated_pos += self.positions[j];
|
||||
self.positions[j] = cumulated_pos;
|
||||
}
|
||||
cur_position += doc_freq as usize;
|
||||
self.positions_offsets[i] = cur_position;
|
||||
last_cur_position = cur_position;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Accessor to term frequency
|
||||
///
|
||||
/// idx is the offset of the current doc in the block.
|
||||
/// It takes value between 0 and 128.
|
||||
pub fn freq(&self, idx: usize) -> u32 {
|
||||
self.freq_decoder.output(idx)
|
||||
}
|
||||
|
||||
/// Accessor to the positions
|
||||
///
|
||||
/// idx is the offset of the current doc in the block.
|
||||
/// It takes value between 0 and 128.
|
||||
pub fn positions(&self, idx: usize) -> &[u32] {
|
||||
let start = self.positions_offsets[idx];
|
||||
let stop = self.positions_offsets[idx + 1];
|
||||
&self.positions[start..stop]
|
||||
}
|
||||
|
||||
/// Decompresses a complete frequency block
|
||||
pub fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
|
||||
match self.option {
|
||||
SegmentPostingsOption::NoFreq => data,
|
||||
SegmentPostingsOption::Freq => self.freq_decoder.uncompress_block_unsorted(data),
|
||||
SegmentPostingsOption::FreqAndPositions => {
|
||||
let remaining: &'a [u8] = self.freq_decoder.uncompress_block_unsorted(data);
|
||||
self.fill_positions_offset();
|
||||
remaining
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Decompresses an incomplete frequency block
|
||||
pub fn read_freq_vint(&mut self, data: &[u8], num_els: usize) {
|
||||
match self.option {
|
||||
SegmentPostingsOption::NoFreq => {}
|
||||
SegmentPostingsOption::Freq => {
|
||||
self.freq_decoder.uncompress_vint_unsorted(data, num_els);
|
||||
}
|
||||
SegmentPostingsOption::FreqAndPositions => {
|
||||
self.freq_decoder.uncompress_vint_unsorted(data, num_els);
|
||||
self.fill_positions_offset();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -16,14 +16,14 @@ mod term_info;
|
||||
mod vec_postings;
|
||||
mod segment_postings;
|
||||
mod intersection;
|
||||
mod freq_handler;
|
||||
mod docset;
|
||||
mod segment_postings_option;
|
||||
|
||||
pub use self::docset::{SkipResult, DocSet};
|
||||
use self::recorder::{Recorder, NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
|
||||
pub use self::serializer::PostingsSerializer;
|
||||
pub use self::serializer::{InvertedIndexSerializer, FieldSerializer};
|
||||
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
|
||||
|
||||
pub use self::term_info::TermInfo;
|
||||
pub use self::postings::Postings;
|
||||
|
||||
@@ -32,7 +32,6 @@ pub use self::vec_postings::VecPostings;
|
||||
|
||||
pub use self::segment_postings::{SegmentPostings, BlockSegmentPostings};
|
||||
pub use self::intersection::IntersectionDocSet;
|
||||
pub use self::freq_handler::FreqHandler;
|
||||
pub use self::segment_postings_option::SegmentPostingsOption;
|
||||
pub use common::HasLen;
|
||||
|
||||
@@ -64,21 +63,25 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut segment = index.new_segment();
|
||||
let mut posting_serializer = PostingsSerializer::open(&mut segment).unwrap();
|
||||
posting_serializer.new_field(text_field);
|
||||
posting_serializer.new_term("abc".as_bytes()).unwrap();
|
||||
for doc_id in 0u32..3u32 {
|
||||
let positions = vec![1, 2, 3, 2];
|
||||
posting_serializer.write_doc(doc_id, 2, &positions).unwrap();
|
||||
let mut posting_serializer = InvertedIndexSerializer::open(&mut segment).unwrap();
|
||||
{
|
||||
let mut field_serializer = posting_serializer.new_field(text_field).unwrap();
|
||||
field_serializer.new_term("abc".as_bytes()).unwrap();
|
||||
for doc_id in 0u32..120u32 {
|
||||
let delta_positions = vec![1, 2, 3, 2];
|
||||
field_serializer
|
||||
.write_doc(doc_id, 2, &delta_positions)
|
||||
.unwrap();
|
||||
}
|
||||
field_serializer.close_term().unwrap();
|
||||
}
|
||||
posting_serializer.close_term().unwrap();
|
||||
posting_serializer.close().unwrap();
|
||||
let read = segment.open_read(SegmentComponent::POSITIONS).unwrap();
|
||||
assert!(read.len() <= 16);
|
||||
assert!(read.len() <= 140);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_position_and_fieldnorm() {
|
||||
pub fn test_position_and_fieldnorm1() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
@@ -87,8 +90,8 @@ mod tests {
|
||||
|
||||
let heap = Heap::with_capacity(10_000_000);
|
||||
{
|
||||
let mut segment_writer = SegmentWriter::for_segment(&heap, 18, segment.clone(), &schema)
|
||||
.unwrap();
|
||||
let mut segment_writer =
|
||||
SegmentWriter::for_segment(&heap, 18, segment.clone(), &schema).unwrap();
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
// checking that position works if the field has two values
|
||||
@@ -134,13 +137,17 @@ mod tests {
|
||||
}
|
||||
{
|
||||
let term_a = Term::from_field_text(text_field, "abcdef");
|
||||
assert!(segment_reader
|
||||
.read_postings(&term_a, FreqAndPositions)
|
||||
.is_none());
|
||||
assert!(
|
||||
segment_reader
|
||||
.inverted_index(term_a.field())
|
||||
.read_postings(&term_a, FreqAndPositions)
|
||||
.is_none()
|
||||
);
|
||||
}
|
||||
{
|
||||
let term_a = Term::from_field_text(text_field, "a");
|
||||
let mut postings_a = segment_reader
|
||||
.inverted_index(term_a.field())
|
||||
.read_postings(&term_a, FreqAndPositions)
|
||||
.unwrap();
|
||||
assert_eq!(postings_a.len(), 1000);
|
||||
@@ -148,6 +155,7 @@ mod tests {
|
||||
assert_eq!(postings_a.doc(), 0);
|
||||
assert_eq!(postings_a.term_freq(), 6);
|
||||
assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]);
|
||||
assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]);
|
||||
assert!(postings_a.advance());
|
||||
assert_eq!(postings_a.doc(), 1u32);
|
||||
assert_eq!(postings_a.term_freq(), 1);
|
||||
@@ -162,6 +170,7 @@ mod tests {
|
||||
{
|
||||
let term_e = Term::from_field_text(text_field, "e");
|
||||
let mut postings_e = segment_reader
|
||||
.inverted_index(term_e.field())
|
||||
.read_postings(&term_e, FreqAndPositions)
|
||||
.unwrap();
|
||||
assert_eq!(postings_e.len(), 1000 - 2);
|
||||
@@ -201,8 +210,10 @@ mod tests {
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let term_query = TermQuery::new(Term::from_field_text(text_field, "a"),
|
||||
SegmentPostingsOption::NoFreq);
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, "a"),
|
||||
SegmentPostingsOption::NoFreq,
|
||||
);
|
||||
let searcher = index.searcher();
|
||||
let mut term_weight = term_query.specialized_weight(&*searcher);
|
||||
term_weight.segment_postings_options = SegmentPostingsOption::FreqAndPositions;
|
||||
@@ -249,6 +260,7 @@ mod tests {
|
||||
for i in 0..num_docs - 1 {
|
||||
for j in i + 1..num_docs {
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(term_2.field())
|
||||
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
|
||||
@@ -262,6 +274,7 @@ mod tests {
|
||||
|
||||
{
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(term_2.field())
|
||||
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
|
||||
@@ -282,6 +295,7 @@ mod tests {
|
||||
// check that filtering works
|
||||
{
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(term_0.field())
|
||||
.read_postings(&term_0, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
|
||||
@@ -291,6 +305,7 @@ mod tests {
|
||||
}
|
||||
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(term_0.field())
|
||||
.read_postings(&term_0, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
|
||||
@@ -315,6 +330,7 @@ mod tests {
|
||||
// make sure seeking still works
|
||||
for i in 0..num_docs {
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(term_2.field())
|
||||
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
|
||||
@@ -330,6 +346,7 @@ mod tests {
|
||||
// now try with a longer sequence
|
||||
{
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(term_2.field())
|
||||
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
|
||||
@@ -365,12 +382,14 @@ mod tests {
|
||||
// finally, check that it's empty
|
||||
{
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(term_2.field())
|
||||
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(segment_postings.skip_next(0), SkipResult::End);
|
||||
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(term_2.field())
|
||||
.read_postings(&term_2, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
|
||||
@@ -437,11 +456,12 @@ mod tests {
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
|
||||
b.iter(|| {
|
||||
let mut segment_postings = segment_reader
|
||||
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
while segment_postings.advance() {}
|
||||
});
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(TERM_A.field())
|
||||
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
while segment_postings.advance() {}
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
@@ -450,21 +470,27 @@ mod tests {
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
b.iter(|| {
|
||||
let segment_postings_a = segment_reader
|
||||
.inverted_index(TERM_A.field())
|
||||
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
let segment_postings_b = segment_reader
|
||||
.inverted_index(TERM_B.field())
|
||||
.read_postings(&*TERM_B, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
let segment_postings_c = segment_reader
|
||||
.inverted_index(TERM_C.field())
|
||||
.read_postings(&*TERM_C, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
let segment_postings_d = segment_reader
|
||||
.inverted_index(TERM_D.field())
|
||||
.read_postings(&*TERM_D, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
let mut intersection = IntersectionDocSet::from(vec![segment_postings_a,
|
||||
segment_postings_b,
|
||||
segment_postings_c,
|
||||
segment_postings_d]);
|
||||
let mut intersection = IntersectionDocSet::from(vec![
|
||||
segment_postings_a,
|
||||
segment_postings_b,
|
||||
segment_postings_c,
|
||||
segment_postings_d,
|
||||
]);
|
||||
while intersection.advance() {}
|
||||
});
|
||||
}
|
||||
@@ -475,6 +501,7 @@ mod tests {
|
||||
let docs = tests::sample(segment_reader.num_docs(), p);
|
||||
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(TERM_A.field())
|
||||
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
|
||||
@@ -491,6 +518,7 @@ mod tests {
|
||||
|
||||
b.iter(|| {
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(TERM_A.field())
|
||||
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
for doc in &existing_docs {
|
||||
@@ -528,6 +556,7 @@ mod tests {
|
||||
b.iter(|| {
|
||||
let n: u32 = test::black_box(17);
|
||||
let mut segment_postings = segment_reader
|
||||
.inverted_index(TERM_A.field())
|
||||
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
let mut s = 0u32;
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use DocId;
|
||||
use schema::Term;
|
||||
use schema::FieldValue;
|
||||
use postings::PostingsSerializer;
|
||||
use postings::{InvertedIndexSerializer, FieldSerializer};
|
||||
use std::io;
|
||||
use postings::Recorder;
|
||||
use analyzer::SimpleTokenizer;
|
||||
@@ -16,9 +16,10 @@ use schema::FieldEntry;
|
||||
use schema::FieldType;
|
||||
use schema::TextIndexingOptions;
|
||||
|
||||
fn posting_from_field_entry<'a>(field_entry: &FieldEntry,
|
||||
heap: &'a Heap)
|
||||
-> Box<PostingsWriter + 'a> {
|
||||
fn posting_from_field_entry<'a>(
|
||||
field_entry: &FieldEntry,
|
||||
heap: &'a Heap,
|
||||
) -> Box<PostingsWriter + 'a> {
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => {
|
||||
match text_options.get_indexing_options() {
|
||||
@@ -51,9 +52,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
|
||||
let per_field_postings_writers: Vec<_> = schema
|
||||
.fields()
|
||||
.iter()
|
||||
.map(|field_entry| {
|
||||
posting_from_field_entry(field_entry, heap)
|
||||
})
|
||||
.map(|field_entry| posting_from_field_entry(field_entry, heap))
|
||||
.collect();
|
||||
|
||||
MultiFieldPostingsWriter {
|
||||
@@ -78,7 +77,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
|
||||
/// It pushes all terms, one field at a time, towards the
/// postings serializer.
|
||||
#[allow(needless_range_loop)]
|
||||
pub fn serialize(&self, serializer: &mut PostingsSerializer) -> Result<()> {
|
||||
pub fn serialize(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> {
|
||||
let mut term_offsets: Vec<(&[u8], u32)> = self.term_index.iter().collect();
|
||||
term_offsets.sort_by_key(|&(k, _v)| k);
|
||||
|
||||
@@ -101,8 +100,13 @@ impl<'a> MultiFieldPostingsWriter<'a> {
|
||||
let (field, start) = offsets[i];
|
||||
let (_, stop) = offsets[i + 1];
|
||||
let postings_writer = &self.per_field_postings_writers[field.0 as usize];
|
||||
postings_writer
|
||||
.serialize(field, &term_offsets[start..stop], serializer, self.heap)?;
|
||||
let mut field_serializer = serializer.new_field(field)?;
|
||||
postings_writer.serialize(
|
||||
&term_offsets[start..stop],
|
||||
&mut field_serializer,
|
||||
self.heap,
|
||||
)?;
|
||||
field_serializer.close()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -126,30 +130,33 @@ pub trait PostingsWriter {
|
||||
/// * term - the term
|
||||
/// * heap - heap used to store the postings information as well as the terms
/// in the hashmap.
|
||||
fn suscribe(&mut self,
|
||||
term_index: &mut HashMap,
|
||||
doc: DocId,
|
||||
pos: u32,
|
||||
term: &Term,
|
||||
heap: &Heap);
|
||||
fn suscribe(
|
||||
&mut self,
|
||||
term_index: &mut HashMap,
|
||||
doc: DocId,
|
||||
pos: u32,
|
||||
term: &Term,
|
||||
heap: &Heap,
|
||||
);
|
||||
|
||||
/// Serializes the postings on disk.
|
||||
/// The actual serialization format is handled by the `PostingsSerializer`.
|
||||
fn serialize(&self,
|
||||
field: Field,
|
||||
term_addrs: &[(&[u8], u32)],
|
||||
serializer: &mut PostingsSerializer,
|
||||
heap: &Heap)
|
||||
-> io::Result<()>;
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(&[u8], u32)],
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()>;
|
||||
|
||||
/// Tokenize a text and suscribe all of its tokens.
fn index_text<'a>(&mut self,
|
||||
term_index: &mut HashMap,
|
||||
doc_id: DocId,
|
||||
field: Field,
|
||||
field_values: &[&'a FieldValue],
|
||||
heap: &Heap)
|
||||
-> u32 {
|
||||
fn index_text<'a>(
|
||||
&mut self,
|
||||
term_index: &mut HashMap,
|
||||
doc_id: DocId,
|
||||
field: Field,
|
||||
field_values: &[&'a FieldValue],
|
||||
heap: &Heap,
|
||||
) -> u32 {
|
||||
let mut pos = 0u32;
|
||||
let mut num_tokens: u32 = 0u32;
|
||||
let mut term = unsafe { Term::with_capacity(100) };
|
||||
@@ -195,12 +202,14 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
|
||||
}
|
||||
|
||||
impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {
|
||||
fn suscribe(&mut self,
|
||||
term_index: &mut HashMap,
|
||||
doc: DocId,
|
||||
position: u32,
|
||||
term: &Term,
|
||||
heap: &Heap) {
|
||||
fn suscribe(
|
||||
&mut self,
|
||||
term_index: &mut HashMap,
|
||||
doc: DocId,
|
||||
position: u32,
|
||||
term: &Term,
|
||||
heap: &Heap,
|
||||
) {
|
||||
debug_assert!(term.as_slice().len() >= 4);
|
||||
let recorder: &mut Rec = term_index.get_or_create(term);
|
||||
let current_doc = recorder.current_doc();
|
||||
@@ -213,20 +222,18 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
|
||||
recorder.record_position(position, heap);
|
||||
}
|
||||
|
||||
fn serialize(&self,
|
||||
field: Field,
|
||||
term_addrs: &[(&[u8], u32)],
|
||||
serializer: &mut PostingsSerializer,
|
||||
heap: &Heap)
|
||||
-> io::Result<()> {
|
||||
serializer.new_field(field);
|
||||
fn serialize(
|
||||
&self,
|
||||
term_addrs: &[(&[u8], u32)],
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()> {
|
||||
for &(term_bytes, addr) in term_addrs {
|
||||
let recorder: &mut Rec = self.heap.get_mut_ref(addr);
|
||||
try!(serializer.new_term(term_bytes));
|
||||
try!(recorder.serialize(addr, serializer, heap));
|
||||
try!(serializer.close_term());
|
||||
serializer.new_term(term_bytes)?;
|
||||
recorder.serialize(addr, serializer, heap)?;
|
||||
serializer.close_term()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use DocId;
|
||||
use std::io;
|
||||
use postings::PostingsSerializer;
|
||||
use postings::FieldSerializer;
|
||||
use datastruct::stacker::{ExpUnrolledLinkedList, Heap, HeapAllocable};
|
||||
|
||||
const EMPTY_ARRAY: [u32; 0] = [0u32; 0];
|
||||
@@ -27,11 +27,12 @@ pub trait Recorder: HeapAllocable {
|
||||
/// Closes the document. It allows the term frequency to be recorded.
fn close_doc(&mut self, heap: &Heap);
|
||||
/// Pushes the postings information to the serializer.
|
||||
fn serialize(&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut PostingsSerializer,
|
||||
heap: &Heap)
|
||||
-> io::Result<()>;
|
||||
fn serialize(
|
||||
&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()>;
|
||||
}
|
||||
|
||||
/// Only records the doc ids
|
||||
@@ -64,13 +65,14 @@ impl Recorder for NothingRecorder {
|
||||
|
||||
fn close_doc(&mut self, _heap: &Heap) {}
|
||||
|
||||
fn serialize(&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut PostingsSerializer,
|
||||
heap: &Heap)
|
||||
-> io::Result<()> {
|
||||
fn serialize(
|
||||
&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()> {
|
||||
for doc in self.stack.iter(self_addr, heap) {
|
||||
try!(serializer.write_doc(doc, 0u32, &EMPTY_ARRAY));
|
||||
serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -116,21 +118,23 @@ impl Recorder for TermFrequencyRecorder {
|
||||
}
|
||||
|
||||
|
||||
fn serialize(&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut PostingsSerializer,
|
||||
heap: &Heap)
|
||||
-> io::Result<()> {
|
||||
fn serialize(
|
||||
&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()> {
|
||||
// the last document has not been closed...
|
||||
// its term freq is self.current_tf.
|
||||
let mut doc_iter = self.stack
|
||||
.iter(self_addr, heap)
|
||||
.chain(Some(self.current_tf).into_iter());
|
||||
let mut doc_iter = self.stack.iter(self_addr, heap).chain(
|
||||
Some(self.current_tf)
|
||||
.into_iter(),
|
||||
);
|
||||
|
||||
while let Some(doc) = doc_iter.next() {
|
||||
let term_freq = doc_iter
|
||||
.next()
|
||||
.expect("The IndexWriter recorded a doc without a term freq.");
|
||||
let term_freq = doc_iter.next().expect(
|
||||
"The IndexWriter recorded a doc without a term freq.",
|
||||
);
|
||||
serializer.write_doc(doc, term_freq, &EMPTY_ARRAY)?;
|
||||
}
|
||||
Ok(())
|
||||
@@ -171,11 +175,12 @@ impl Recorder for TFAndPositionRecorder {
|
||||
self.stack.push(POSITION_END, heap);
|
||||
}
|
||||
|
||||
fn serialize(&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut PostingsSerializer,
|
||||
heap: &Heap)
|
||||
-> io::Result<()> {
|
||||
fn serialize(
|
||||
&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut FieldSerializer,
|
||||
heap: &Heap,
|
||||
) -> io::Result<()> {
|
||||
let mut doc_positions = Vec::with_capacity(100);
|
||||
let mut positions_iter = self.stack.iter(self_addr, heap);
|
||||
while let Some(doc) = positions_iter.next() {
|
||||
@@ -189,7 +194,11 @@ impl Recorder for TFAndPositionRecorder {
|
||||
prev_position = position;
|
||||
}
|
||||
}
|
||||
try!(serializer.write_doc(doc, doc_positions.len() as u32, &doc_positions));
|
||||
serializer.write_doc(
|
||||
doc,
|
||||
doc_positions.len() as u32,
|
||||
&doc_positions,
|
||||
)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,12 +1,65 @@
|
||||
use compression::{NUM_DOCS_PER_BLOCK, BlockDecoder, VIntDecoder};
|
||||
use compression::{COMPRESSION_BLOCK_SIZE, BlockDecoder, VIntDecoder, CompressedIntStream};
|
||||
use DocId;
|
||||
use postings::{Postings, FreqHandler, DocSet, HasLen, SkipResult};
|
||||
use postings::{Postings, DocSet, HasLen, SkipResult};
|
||||
use std::cmp;
|
||||
use fastfield::DeleteBitSet;
|
||||
use fst::Streamer;
|
||||
use fastfield::DeleteBitSet;
|
||||
use std::cell::UnsafeCell;
|
||||
use directory::{SourceRead, ReadOnlySource};
|
||||
|
||||
|
||||
const EMPTY_DATA: [u8; 0] = [0u8; 0];
|
||||
const EMPTY_POSITIONS: [u32; 0] = [0u32; 0];
|
||||
|
||||
|
||||
|
||||
|
||||
struct PositionComputer {
|
||||
// stores the number of position integers to skip
// before reading positions.
//
// if None, positions are already loaded in
// the positions vec.
position_to_skip: Option<usize>,
|
||||
positions: Vec<u32>,
|
||||
positions_stream: CompressedIntStream,
|
||||
}
|
||||
|
||||
impl PositionComputer {
|
||||
pub fn new(positions_stream: CompressedIntStream) -> PositionComputer {
|
||||
PositionComputer {
|
||||
position_to_skip: None,
|
||||
positions: vec![],
|
||||
positions_stream: positions_stream,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_skip(&mut self, num_skip: usize) {
|
||||
self.position_to_skip = Some(
|
||||
self.position_to_skip
|
||||
.map(|prev_skip| prev_skip + num_skip)
|
||||
.unwrap_or(0),
|
||||
);
|
||||
}
|
||||
|
||||
pub fn positions(&mut self, term_freq: usize) -> &[u32] {
|
||||
if let Some(num_skip) = self.position_to_skip {
|
||||
|
||||
self.positions.resize(term_freq, 0u32);
|
||||
|
||||
self.positions_stream.skip(num_skip);
|
||||
self.positions_stream.read(&mut self.positions[..term_freq]);
|
||||
|
||||
let mut cum = 0u32;
|
||||
for i in 0..term_freq as usize {
|
||||
cum += self.positions[i];
|
||||
self.positions[i] = cum;
|
||||
}
|
||||
self.position_to_skip = None;
|
||||
}
|
||||
&self.positions[..term_freq]
|
||||
}
|
||||
}
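A small illustration of the cumulative-sum decoding performed in `positions()` above (a sketch only; the delta values and the helper name are made up):

// Delta-encoded positions [0, 2, 2, 1] decode to absolute
// positions [0, 2, 4, 5] by running a cumulative sum.
fn decode_position_deltas(deltas: &[u32]) -> Vec<u32> {
    let mut cum = 0u32;
    deltas
        .iter()
        .map(|&delta| {
            cum += delta;
            cum
        })
        .collect()
}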
|
||||
|
||||
|
||||
/// `SegmentPostings` represents the inverted list or postings associated to
|
||||
@@ -14,42 +67,60 @@ const EMPTY_DATA: [u8; 0] = [0u8; 0];
|
||||
///
|
||||
/// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded.
|
||||
/// Positions on the other hand, are optionally entirely decoded upfront.
|
||||
pub struct SegmentPostings<'a> {
|
||||
block_cursor: BlockSegmentPostings<'a>,
|
||||
pub struct SegmentPostings {
|
||||
block_cursor: BlockSegmentPostings,
|
||||
cur: usize,
|
||||
delete_bitset: DeleteBitSet,
|
||||
position_computer: Option<UnsafeCell<PositionComputer>>,
|
||||
}
|
||||
|
||||
impl<'a> SegmentPostings<'a> {
|
||||
|
||||
impl SegmentPostings {
|
||||
/// Reads a Segment postings from an &[u8]
|
||||
///
|
||||
/// * `len` - number of documents in the posting list.
/// * `data` - data array. The complete data is not necessarily used.
|
||||
/// * `freq_handler` - the freq handler is in charge of decoding
|
||||
/// frequencies and/or positions
|
||||
pub fn from_block_postings(segment_block_postings: BlockSegmentPostings<'a>,
|
||||
delete_bitset: DeleteBitSet)
|
||||
-> SegmentPostings<'a> {
|
||||
pub fn from_block_postings(
|
||||
segment_block_postings: BlockSegmentPostings,
|
||||
delete_bitset: DeleteBitSet,
|
||||
positions_stream_opt: Option<CompressedIntStream>,
|
||||
) -> SegmentPostings {
|
||||
let position_computer =
|
||||
positions_stream_opt.map(|stream| UnsafeCell::new(PositionComputer::new(stream)));
|
||||
SegmentPostings {
|
||||
block_cursor: segment_block_postings,
|
||||
cur: NUM_DOCS_PER_BLOCK, // cursor within the block
|
||||
cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
|
||||
delete_bitset: delete_bitset,
|
||||
position_computer: position_computer,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an empty segment postings object
|
||||
pub fn empty() -> SegmentPostings<'static> {
|
||||
pub fn empty() -> SegmentPostings {
|
||||
let empty_block_cursor = BlockSegmentPostings::empty();
|
||||
SegmentPostings {
|
||||
block_cursor: empty_block_cursor,
|
||||
delete_bitset: DeleteBitSet::empty(),
|
||||
cur: NUM_DOCS_PER_BLOCK,
|
||||
cur: COMPRESSION_BLOCK_SIZE,
|
||||
position_computer: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn position_add_skip<F: FnOnce() -> usize>(&self, num_skips_fn: F) {
|
||||
if let Some(ref position_computer) = self.position_computer.as_ref() {
|
||||
let num_skips = num_skips_fn();
|
||||
unsafe {
|
||||
(*position_computer.get()).add_skip(num_skips);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl<'a> DocSet for SegmentPostings<'a> {
|
||||
impl DocSet for SegmentPostings {
|
||||
// goes to the next element.
|
||||
// next needs to be called a first time to point to the correct element.
|
||||
#[inline]
|
||||
@@ -59,10 +130,11 @@ impl<'a> DocSet for SegmentPostings<'a> {
|
||||
if self.cur >= self.block_cursor.block_len() {
|
||||
self.cur = 0;
|
||||
if !self.block_cursor.advance() {
|
||||
self.cur = NUM_DOCS_PER_BLOCK;
|
||||
self.cur = COMPRESSION_BLOCK_SIZE;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
self.position_add_skip(|| self.term_freq() as usize);
|
||||
if !self.delete_bitset.is_deleted(self.doc()) {
|
||||
return true;
|
||||
}
|
||||
@@ -75,6 +147,10 @@ impl<'a> DocSet for SegmentPostings<'a> {
|
||||
return SkipResult::End;
|
||||
}
|
||||
|
||||
// In the following, thanks to the call to advance above,
// we know that the positions are not loaded yet, and we need
// to skip over the term frequencies of every doc we cross.
|
||||
// skip blocks until one that might contain the target
|
||||
loop {
|
||||
// check if we need to go to the next block
|
||||
@@ -83,13 +159,26 @@ impl<'a> DocSet for SegmentPostings<'a> {
|
||||
(block_docs[self.cur], block_docs[block_docs.len() - 1])
|
||||
};
|
||||
if target > last_doc_in_block {
|
||||
|
||||
// we add skip for the current term independently,
// so that position_add_skip will decide if it should
|
||||
// just set itself to Some(0) or effectively
|
||||
// add the term freq.
|
||||
//let num_skips: u32 = ;
|
||||
self.position_add_skip(|| {
|
||||
let freqs_skipped = &self.block_cursor.freqs()[self.cur..];
|
||||
let sum_freq: u32 = freqs_skipped.iter().cloned().sum();
|
||||
sum_freq as usize
|
||||
});
|
||||
|
||||
if !self.block_cursor.advance() {
|
||||
return SkipResult::End;
|
||||
}
|
||||
|
||||
self.cur = 0;
|
||||
} else {
|
||||
if target < current_doc {
|
||||
// We've overpassed the target after the first `advance` call
|
||||
// We've passed the target after the first `advance` call
|
||||
// or we're at the beginning of a block.
|
||||
// Either way, we're on the first `DocId` greater than `target`
|
||||
return SkipResult::OverStep;
|
||||
@@ -135,6 +224,13 @@ impl<'a> DocSet for SegmentPostings<'a> {
|
||||
|
||||
// `doc` is now >= `target`
|
||||
let doc = block_docs[start];
|
||||
|
||||
self.position_add_skip(|| {
|
||||
let freqs_skipped = &self.block_cursor.freqs()[self.cur..start];
|
||||
let sum_freqs: u32 = freqs_skipped.iter().sum();
|
||||
sum_freqs as usize
|
||||
});
|
||||
|
||||
self.cur = start;
|
||||
|
||||
if !self.delete_bitset.is_deleted(doc) {
|
||||
@@ -156,31 +252,41 @@ impl<'a> DocSet for SegmentPostings<'a> {
|
||||
self.len()
|
||||
}
|
||||
|
||||
/// Return the current document's `DocId`.
|
||||
#[inline]
|
||||
fn doc(&self) -> DocId {
|
||||
let docs = self.block_cursor.docs();
|
||||
assert!(self.cur < docs.len(),
|
||||
"Have you forgotten to call `.advance()` at least once before calling .doc().");
|
||||
debug_assert!(
|
||||
self.cur < docs.len(),
|
||||
"Have you forgotten to call `.advance()` at least once before calling .doc()."
|
||||
);
|
||||
docs[self.cur]
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> HasLen for SegmentPostings<'a> {
|
||||
impl HasLen for SegmentPostings {
|
||||
fn len(&self) -> usize {
|
||||
self.block_cursor.doc_freq()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Postings for SegmentPostings<'a> {
|
||||
impl Postings for SegmentPostings {
|
||||
fn term_freq(&self) -> u32 {
|
||||
self.block_cursor.freq_handler().freq(self.cur)
|
||||
self.block_cursor.freq(self.cur)
|
||||
}
|
||||
|
||||
fn positions(&self) -> &[u32] {
|
||||
self.block_cursor.freq_handler().positions(self.cur)
|
||||
let term_freq = self.term_freq();
|
||||
self.position_computer
|
||||
.as_ref()
|
||||
.map(|position_computer| unsafe {
|
||||
(&mut *position_computer.get()).positions(term_freq as usize)
|
||||
})
|
||||
.unwrap_or(&EMPTY_POSITIONS[..])
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// `BlockSegmentPostings` is a cursor iterating over blocks
|
||||
/// of documents.
|
||||
///
|
||||
@@ -188,28 +294,35 @@ impl<'a> Postings for SegmentPostings<'a> {
|
||||
///
|
||||
/// While it is useful for some very specific high-performance
|
||||
/// use cases, you should prefer using `SegmentPostings` for most usage.
|
||||
pub struct BlockSegmentPostings<'a> {
|
||||
block_decoder: BlockDecoder,
|
||||
pub struct BlockSegmentPostings {
|
||||
doc_decoder: BlockDecoder,
|
||||
freq_decoder: BlockDecoder,
|
||||
has_freq: bool,
|
||||
|
||||
doc_freq: usize,
|
||||
doc_offset: DocId,
|
||||
num_binpacked_blocks: usize,
|
||||
num_vint_docs: usize,
|
||||
remaining_data: &'a [u8],
|
||||
freq_handler: FreqHandler,
|
||||
remaining_data: SourceRead,
|
||||
}
|
||||
|
||||
impl<'a> BlockSegmentPostings<'a> {
|
||||
pub(crate) fn from_data(doc_freq: usize,
|
||||
data: &'a [u8],
|
||||
freq_handler: FreqHandler)
|
||||
-> BlockSegmentPostings<'a> {
|
||||
let num_binpacked_blocks: usize = (doc_freq as usize) / NUM_DOCS_PER_BLOCK;
|
||||
let num_vint_docs = (doc_freq as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks;
|
||||
impl BlockSegmentPostings {
|
||||
pub(crate) fn from_data(
|
||||
doc_freq: usize,
|
||||
data: SourceRead,
|
||||
has_freq: bool,
|
||||
) -> BlockSegmentPostings {
|
||||
let num_binpacked_blocks: usize = (doc_freq as usize) / COMPRESSION_BLOCK_SIZE;
|
||||
let num_vint_docs = (doc_freq as usize) - COMPRESSION_BLOCK_SIZE * num_binpacked_blocks;
|
||||
BlockSegmentPostings {
|
||||
num_binpacked_blocks: num_binpacked_blocks,
|
||||
num_vint_docs: num_vint_docs,
|
||||
block_decoder: BlockDecoder::new(),
|
||||
freq_handler: freq_handler,
|
||||
|
||||
doc_decoder: BlockDecoder::new(),
|
||||
freq_decoder: BlockDecoder::with_val(1),
|
||||
|
||||
has_freq: has_freq,
|
||||
|
||||
remaining_data: data,
|
||||
doc_offset: 0,
|
||||
doc_freq: doc_freq,
|
||||
@@ -226,9 +339,9 @@ impl<'a> BlockSegmentPostings<'a> {
|
||||
// # Warning
|
||||
//
|
||||
// This does not reset the positions list.
|
||||
pub(crate) fn reset(&mut self, doc_freq: usize, postings_data: &'a [u8]) {
|
||||
let num_binpacked_blocks: usize = doc_freq / NUM_DOCS_PER_BLOCK;
|
||||
let num_vint_docs = doc_freq & (NUM_DOCS_PER_BLOCK - 1);
|
||||
pub(crate) fn reset(&mut self, doc_freq: usize, postings_data: SourceRead) {
|
||||
let num_binpacked_blocks: usize = doc_freq / COMPRESSION_BLOCK_SIZE;
|
||||
let num_vint_docs = doc_freq & (COMPRESSION_BLOCK_SIZE - 1);
|
||||
self.num_binpacked_blocks = num_binpacked_blocks;
|
||||
self.num_vint_docs = num_vint_docs;
|
||||
self.remaining_data = postings_data;
|
||||
@@ -250,7 +363,25 @@ impl<'a> BlockSegmentPostings<'a> {
|
||||
/// returned by `.docs()` is empty.
|
||||
#[inline]
|
||||
pub fn docs(&self) -> &[DocId] {
|
||||
self.block_decoder.output_array()
|
||||
self.doc_decoder.output_array()
|
||||
}
|
||||
|
||||
/// Return the document at index `idx` of the block.
|
||||
#[inline]
|
||||
pub fn doc(&self, idx: usize) -> u32 {
|
||||
self.doc_decoder.output(idx)
|
||||
}
|
||||
|
||||
/// Return the array of `term freq` in the block.
|
||||
#[inline]
|
||||
pub fn freqs(&self) -> &[u32] {
|
||||
self.freq_decoder.output_array()
|
||||
}
|
||||
|
||||
/// Return the frequency at index `idx` of the block.
|
||||
#[inline]
|
||||
pub fn freq(&self, idx: usize) -> u32 {
|
||||
self.freq_decoder.output(idx)
|
||||
}
|
||||
|
||||
/// Returns the length of the current block.
|
||||
@@ -260,13 +391,7 @@ impl<'a> BlockSegmentPostings<'a> {
|
||||
/// of any number between 1 and `NUM_DOCS_PER_BLOCK - 1`
|
||||
#[inline]
|
||||
fn block_len(&self) -> usize {
|
||||
self.block_decoder.output_len
|
||||
}
|
||||
|
||||
|
||||
/// Returns a reference to the frequency handler.
|
||||
pub fn freq_handler(&self) -> &FreqHandler {
|
||||
&self.freq_handler
|
||||
self.doc_decoder.output_len
|
||||
}
|
||||
|
||||
/// Advance to the next block.
|
||||
@@ -274,21 +399,35 @@ impl<'a> BlockSegmentPostings<'a> {
|
||||
/// Returns false iff there was no remaining blocks.
|
||||
pub fn advance(&mut self) -> bool {
|
||||
if self.num_binpacked_blocks > 0 {
|
||||
self.remaining_data =
|
||||
self.block_decoder
|
||||
.uncompress_block_sorted(self.remaining_data, self.doc_offset);
|
||||
self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data);
|
||||
self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1);
|
||||
let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
|
||||
if self.has_freq {
|
||||
let num_consumed_bytes = self.freq_decoder.uncompress_block_unsorted(
|
||||
self.remaining_data.as_ref(),
|
||||
);
|
||||
self.remaining_data.advance(num_consumed_bytes);
|
||||
}
|
||||
// it will be used as the next offset.
|
||||
self.doc_offset = self.doc_decoder.output(COMPRESSION_BLOCK_SIZE - 1);
|
||||
self.num_binpacked_blocks -= 1;
|
||||
true
|
||||
} else if self.num_vint_docs > 0 {
|
||||
self.remaining_data =
|
||||
self.block_decoder
|
||||
.uncompress_vint_sorted(self.remaining_data,
|
||||
self.doc_offset,
|
||||
self.num_vint_docs);
|
||||
self.freq_handler
|
||||
.read_freq_vint(self.remaining_data, self.num_vint_docs);
|
||||
let num_compressed_bytes = self.doc_decoder.uncompress_vint_sorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.doc_offset,
|
||||
self.num_vint_docs,
|
||||
);
|
||||
self.remaining_data.advance(num_compressed_bytes);
|
||||
if self.has_freq {
|
||||
self.freq_decoder.uncompress_vint_unsorted(
|
||||
self.remaining_data.as_ref(),
|
||||
self.num_vint_docs,
|
||||
);
|
||||
}
|
||||
self.num_vint_docs = 0;
|
||||
true
|
||||
} else {
|
||||
@@ -297,20 +436,23 @@ impl<'a> BlockSegmentPostings<'a> {
|
||||
}
|
||||
|
||||
/// Returns an empty `BlockSegmentPostings` object
pub fn empty() -> BlockSegmentPostings<'static> {
|
||||
pub fn empty() -> BlockSegmentPostings {
|
||||
BlockSegmentPostings {
|
||||
num_binpacked_blocks: 0,
|
||||
num_vint_docs: 0,
|
||||
block_decoder: BlockDecoder::new(),
|
||||
freq_handler: FreqHandler::new_without_freq(),
|
||||
remaining_data: &EMPTY_DATA,
|
||||
|
||||
doc_decoder: BlockDecoder::new(),
|
||||
freq_decoder: BlockDecoder::with_val(1),
|
||||
has_freq: false,
|
||||
|
||||
remaining_data: From::from(ReadOnlySource::empty()),
|
||||
doc_offset: 0,
|
||||
doc_freq: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'b> Streamer<'b> for BlockSegmentPostings<'a> {
|
||||
impl<'b> Streamer<'b> for BlockSegmentPostings {
|
||||
type Item = &'b [DocId];
|
||||
|
||||
fn next(&'b mut self) -> Option<&'b [DocId]> {
|
||||
@@ -366,11 +508,13 @@ mod tests {
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let inverted_index = segment_reader.inverted_index(int_field);
|
||||
let term = Term::from_field_u64(int_field, 0u64);
|
||||
let term_info = segment_reader.get_term_info(&term).unwrap();
|
||||
let mut block_segments =
|
||||
segment_reader
|
||||
.read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq);
|
||||
let term_info = inverted_index.get_term_info(&term).unwrap();
|
||||
let mut block_segments = inverted_index.read_block_postings_from_terminfo(
|
||||
&term_info,
|
||||
SegmentPostingsOption::NoFreq,
|
||||
);
|
||||
let mut offset: u32 = 0u32;
|
||||
// checking that the block before calling advance is empty
|
||||
assert!(block_segments.docs().is_empty());
|
||||
@@ -406,17 +550,20 @@ mod tests {
|
||||
let mut block_segments;
|
||||
{
|
||||
let term = Term::from_field_u64(int_field, 0u64);
|
||||
let term_info = segment_reader.get_term_info(&term).unwrap();
|
||||
block_segments =
|
||||
segment_reader
|
||||
.read_block_postings_from_terminfo(&term_info, SegmentPostingsOption::NoFreq);
|
||||
let inverted_index = segment_reader.inverted_index(int_field);
|
||||
let term_info = inverted_index.get_term_info(&term).unwrap();
|
||||
block_segments = inverted_index.read_block_postings_from_terminfo(
|
||||
&term_info,
|
||||
SegmentPostingsOption::NoFreq,
|
||||
);
|
||||
}
|
||||
assert!(block_segments.advance());
|
||||
assert!(block_segments.docs() == &[0, 2, 4]);
|
||||
{
|
||||
let term = Term::from_field_u64(int_field, 1u64);
|
||||
let term_info = segment_reader.get_term_info(&term).unwrap();
|
||||
segment_reader.reset_block_postings_from_terminfo(&term_info, &mut block_segments);
|
||||
let inverted_index = segment_reader.inverted_index(int_field);
|
||||
let term_info = inverted_index.get_term_info(&term).unwrap();
|
||||
inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments);
|
||||
}
|
||||
assert!(block_segments.advance());
|
||||
assert!(block_segments.docs() == &[1, 3, 5]);
|
||||
|
||||
@@ -16,6 +16,26 @@ pub enum SegmentPostingsOption {
|
||||
FreqAndPositions,
|
||||
}
|
||||
|
||||
impl SegmentPostingsOption {
|
||||
/// Returns true iff this option includes encoding
|
||||
/// term frequencies.
|
||||
pub fn has_freq(&self) -> bool {
|
||||
match *self {
|
||||
SegmentPostingsOption::NoFreq => false,
|
||||
_ => true,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true iff this option includes encoding
/// term positions.
|
||||
pub fn has_positions(&self) -> bool {
|
||||
match *self {
|
||||
SegmentPostingsOption::FreqAndPositions => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
}
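As a quick sanity check of the flag mapping (a sketch assuming only the three variants shown in this diff):

// NoFreq           -> no freqs, no positions
// Freq             -> freqs only
// FreqAndPositions -> freqs and positions
fn option_flags(option: SegmentPostingsOption) -> (bool, bool) {
    (option.has_freq(), option.has_positions())
}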
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
|
||||
@@ -5,16 +5,14 @@ use schema::Field;
|
||||
use schema::FieldEntry;
|
||||
use schema::FieldType;
|
||||
use schema::Schema;
|
||||
use schema::TextIndexingOptions;
|
||||
use directory::WritePtr;
|
||||
use compression::{NUM_DOCS_PER_BLOCK, BlockEncoder, CompositeEncoder};
|
||||
use compression::{COMPRESSION_BLOCK_SIZE, BlockEncoder};
|
||||
use DocId;
|
||||
use core::Segment;
|
||||
use std::io::{self, Write};
|
||||
use compression::VIntEncoder;
|
||||
use common::VInt;
|
||||
use common::BinarySerializable;
|
||||
use common::CountingWriter;
|
||||
use common::CompositeWrite;
|
||||
use termdict::TermDictionaryBuilder;
|
||||
|
||||
|
||||
@@ -49,74 +47,127 @@ use termdict::TermDictionaryBuilder;
|
||||
///
|
||||
/// A description of the serialization format is
|
||||
/// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html).
|
||||
pub struct PostingsSerializer {
|
||||
terms_fst_builder: TermDictionaryBuilderImpl<WritePtr, TermInfo>,
|
||||
postings_write: CountingWriter<WritePtr>,
|
||||
positions_write: CountingWriter<WritePtr>,
|
||||
last_doc_id_encoded: u32,
|
||||
positions_encoder: CompositeEncoder,
|
||||
block_encoder: BlockEncoder,
|
||||
doc_ids: Vec<DocId>,
|
||||
term_freqs: Vec<u32>,
|
||||
position_deltas: Vec<u32>,
|
||||
pub struct InvertedIndexSerializer {
|
||||
terms_write: CompositeWrite<WritePtr>,
|
||||
postings_write: CompositeWrite<WritePtr>,
|
||||
positions_write: CompositeWrite<WritePtr>,
|
||||
schema: Schema,
|
||||
text_indexing_options: TextIndexingOptions,
|
||||
term_open: bool,
|
||||
current_term_info: TermInfo,
|
||||
}
|
||||
|
||||
impl PostingsSerializer {
|
||||
|
||||
impl InvertedIndexSerializer {
|
||||
/// Open a new `PostingsSerializer` for the given segment
|
||||
pub fn new(terms_write: WritePtr,
|
||||
postings_write: WritePtr,
|
||||
positions_write: WritePtr,
|
||||
schema: Schema)
|
||||
-> Result<PostingsSerializer> {
|
||||
let terms_fst_builder = try!(TermDictionaryBuilderImpl::new(terms_write));
|
||||
Ok(PostingsSerializer {
|
||||
terms_fst_builder: terms_fst_builder,
|
||||
postings_write: CountingWriter::wrap(postings_write),
|
||||
positions_write: CountingWriter::wrap(positions_write),
|
||||
last_doc_id_encoded: 0u32,
|
||||
positions_encoder: CompositeEncoder::new(),
|
||||
block_encoder: BlockEncoder::new(),
|
||||
doc_ids: Vec::new(),
|
||||
term_freqs: Vec::new(),
|
||||
position_deltas: Vec::new(),
|
||||
schema: schema,
|
||||
text_indexing_options: TextIndexingOptions::Unindexed,
|
||||
term_open: false,
|
||||
current_term_info: TermInfo::default(),
|
||||
})
|
||||
fn new(
|
||||
terms_write: CompositeWrite<WritePtr>,
|
||||
postings_write: CompositeWrite<WritePtr>,
|
||||
positions_write: CompositeWrite<WritePtr>,
|
||||
schema: Schema,
|
||||
) -> Result<InvertedIndexSerializer> {
|
||||
Ok(InvertedIndexSerializer {
|
||||
terms_write: terms_write,
|
||||
postings_write: postings_write,
|
||||
positions_write: positions_write,
|
||||
schema: schema,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
/// Open a new `PostingsSerializer` for the given segment
|
||||
pub fn open(segment: &mut Segment) -> Result<PostingsSerializer> {
|
||||
pub fn open(segment: &mut Segment) -> Result<InvertedIndexSerializer> {
|
||||
use SegmentComponent::{TERMS, POSTINGS, POSITIONS};
|
||||
PostingsSerializer::new(segment.open_write(TERMS)?,
|
||||
segment.open_write(POSTINGS)?,
|
||||
segment.open_write(POSITIONS)?,
|
||||
segment.schema())
|
||||
InvertedIndexSerializer::new(
|
||||
CompositeWrite::wrap(segment.open_write(TERMS)?),
|
||||
CompositeWrite::wrap(segment.open_write(POSTINGS)?),
|
||||
CompositeWrite::wrap(segment.open_write(POSITIONS)?),
|
||||
segment.schema(),
|
||||
)
|
||||
}
|
||||
|
||||
/// Must be called before starting to push the terms of
/// a given field.
|
||||
///
|
||||
/// Loads the indexing options for the given field.
|
||||
pub fn new_field(&mut self, field: Field) {
|
||||
pub fn new_field(&mut self, field: Field) -> io::Result<FieldSerializer> {
|
||||
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
|
||||
self.text_indexing_options = match *field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => text_options.get_indexing_options(),
|
||||
FieldType::U64(ref int_options) |
|
||||
FieldType::I64(ref int_options) => {
|
||||
if int_options.is_indexed() {
|
||||
TextIndexingOptions::Unindexed
|
||||
} else {
|
||||
TextIndexingOptions::Untokenized
|
||||
}
|
||||
let term_dictionary_write = self.terms_write.for_field(field);
|
||||
let postings_write = self.postings_write.for_field(field);
|
||||
let positions_write = self.positions_write.for_field(field);
|
||||
FieldSerializer::new(
|
||||
field_entry.field_type().clone(),
|
||||
term_dictionary_write,
|
||||
postings_write,
|
||||
positions_write,
|
||||
)
|
||||
}
|
||||
|
||||
/// Closes the serializer.
|
||||
pub fn close(self) -> io::Result<()> {
|
||||
self.terms_write.close()?;
|
||||
self.postings_write.close()?;
|
||||
self.positions_write.close()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// The field serializer is in charge of
|
||||
/// the serialization of a specific field.
|
||||
pub struct FieldSerializer<'a> {
|
||||
term_dictionary_builder: TermDictionaryBuilderImpl<&'a mut CountingWriter<WritePtr>>,
|
||||
postings_serializer: PostingsSerializer<&'a mut CountingWriter<WritePtr>>,
|
||||
positions_serializer_opt: Option<PositionSerializer<&'a mut CountingWriter<WritePtr>>>,
|
||||
current_term_info: TermInfo,
|
||||
term_open: bool,
|
||||
}
|
||||
|
||||
|
||||
impl<'a> FieldSerializer<'a> {
|
||||
fn new(
|
||||
field_type: FieldType,
|
||||
term_dictionary_write: &'a mut CountingWriter<WritePtr>,
|
||||
postings_write: &'a mut CountingWriter<WritePtr>,
|
||||
positions_write: &'a mut CountingWriter<WritePtr>,
|
||||
) -> io::Result<FieldSerializer<'a>> {
|
||||
|
||||
let (term_freq_enabled, position_enabled): (bool, bool) = match field_type {
|
||||
FieldType::Str(ref text_options) => {
|
||||
let text_indexing_options = text_options.get_indexing_options();
|
||||
(
|
||||
text_indexing_options.is_termfreq_enabled(),
|
||||
text_indexing_options.is_position_enabled(),
|
||||
)
|
||||
}
|
||||
_ => (false, false),
|
||||
};
|
||||
let term_dictionary_builder =
|
||||
TermDictionaryBuilderImpl::new(term_dictionary_write, field_type)?;
|
||||
let postings_serializer = PostingsSerializer::new(postings_write, term_freq_enabled);
|
||||
let positions_serializer_opt = if position_enabled {
|
||||
Some(PositionSerializer::new(positions_write))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(FieldSerializer {
|
||||
term_dictionary_builder: term_dictionary_builder,
|
||||
postings_serializer: postings_serializer,
|
||||
positions_serializer_opt: positions_serializer_opt,
|
||||
current_term_info: TermInfo::default(),
|
||||
term_open: false,
|
||||
})
|
||||
}
|
||||
|
||||
fn current_term_info(&self) -> TermInfo {
|
||||
let (filepos, offset) = self.positions_serializer_opt
|
||||
.as_ref()
|
||||
.map(|positions_serializer| positions_serializer.addr())
|
||||
.unwrap_or((0u32, 0u8));
|
||||
TermInfo {
|
||||
doc_freq: 0,
|
||||
postings_offset: self.postings_serializer.addr(),
|
||||
positions_offset: filepos,
|
||||
positions_inner_offset: offset,
|
||||
}
|
||||
}
|
||||
|
||||
/// Starts the postings for a new term.
|
||||
@@ -124,70 +175,16 @@ impl PostingsSerializer {
|
||||
/// to the lexicographical order.
|
||||
/// * doc_freq - return the number of documents containing the term.
pub fn new_term(&mut self, term: &[u8]) -> io::Result<()> {
|
||||
if self.term_open {
|
||||
panic!("Called new_term, while the previous term was not closed.");
|
||||
}
|
||||
assert!(
|
||||
!self.term_open,
|
||||
"Called new_term, while the previous term was not closed."
|
||||
);
|
||||
self.term_open = true;
|
||||
self.doc_ids.clear();
|
||||
self.last_doc_id_encoded = 0;
|
||||
self.term_freqs.clear();
|
||||
self.position_deltas.clear();
|
||||
self.current_term_info = TermInfo {
|
||||
doc_freq: 0,
|
||||
postings_offset: self.postings_write.written_bytes() as u32,
|
||||
positions_offset: self.positions_write.written_bytes() as u32,
|
||||
};
|
||||
self.terms_fst_builder.insert_key(term)
|
||||
self.postings_serializer.clear();
|
||||
self.current_term_info = self.current_term_info();
|
||||
self.term_dictionary_builder.insert_key(term)
|
||||
}
|
||||
|
||||
/// Finishes the serialization for this term's postings.
///
/// If the current block is incomplete, it needs to be encoded
/// using `VInt` encoding.
|
||||
pub fn close_term(&mut self) -> io::Result<()> {
|
||||
if self.term_open {
|
||||
|
||||
self.terms_fst_builder
|
||||
.insert_value(&self.current_term_info)?;
|
||||
|
||||
if !self.doc_ids.is_empty() {
|
||||
// we have doc ids waiting to be written
|
||||
// this happens when the number of doc ids is
|
||||
// not a perfect multiple of our block size.
|
||||
//
|
||||
// In that case, the remaining part is encoded
|
||||
// using variable int encoding.
|
||||
{
|
||||
let block_encoded =
|
||||
self.block_encoder
|
||||
.compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
self.doc_ids.clear();
|
||||
}
|
||||
// ... Idem for term frequencies
|
||||
if self.text_indexing_options.is_termfreq_enabled() {
|
||||
let block_encoded = self.block_encoder
|
||||
.compress_vint_unsorted(&self.term_freqs[..]);
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
self.term_freqs.clear();
|
||||
}
|
||||
}
|
||||
// On the other hand, positions are entirely buffered until the
|
||||
// end of the term, at which point they are compressed and written.
|
||||
if self.text_indexing_options.is_position_enabled() {
|
||||
let posdelta_len = VInt(self.position_deltas.len() as u64);
|
||||
posdelta_len.serialize(&mut self.positions_write)?;
|
||||
let positions_encoded: &[u8] = self.positions_encoder
|
||||
.compress_unsorted(&self.position_deltas[..]);
|
||||
self.positions_write.write_all(positions_encoded)?;
|
||||
self.position_deltas.clear();
|
||||
}
|
||||
self.term_open = false;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Serialize the information that a document contains the current term,
|
||||
/// its term frequency, and the position deltas.
|
||||
///
|
||||
@@ -197,32 +194,93 @@ impl PostingsSerializer {
|
||||
///
|
||||
/// Term frequencies and positions may be ignored by the serializer depending
|
||||
/// on the configuration of the field in the `Schema`.
|
||||
pub fn write_doc(&mut self,
|
||||
doc_id: DocId,
|
||||
term_freq: u32,
|
||||
position_deltas: &[u32])
|
||||
-> io::Result<()> {
|
||||
pub fn write_doc(
|
||||
&mut self,
|
||||
doc_id: DocId,
|
||||
term_freq: u32,
|
||||
position_deltas: &[u32],
|
||||
) -> io::Result<()> {
|
||||
self.current_term_info.doc_freq += 1;
|
||||
self.postings_serializer.write_doc(doc_id, term_freq)?;
|
||||
if let Some(ref mut positions_serializer) = self.positions_serializer_opt.as_mut() {
|
||||
positions_serializer.write(position_deltas)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Finishes the serialization for this term's postings.
///
/// If the current block is incomplete, it needs to be encoded
/// using `VInt` encoding.
|
||||
pub fn close_term(&mut self) -> io::Result<()> {
|
||||
if self.term_open {
|
||||
self.term_dictionary_builder.insert_value(
|
||||
&self.current_term_info,
|
||||
)?;
|
||||
self.postings_serializer.close_term()?;
|
||||
self.term_open = false;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Closes the current field.
pub fn close(mut self) -> io::Result<()> {
|
||||
self.close_term()?;
|
||||
if let Some(positions_serializer) = self.positions_serializer_opt {
|
||||
positions_serializer.close()?;
|
||||
}
|
||||
self.postings_serializer.close()?;
|
||||
self.term_dictionary_builder.finish()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
struct PostingsSerializer<W: Write> {
|
||||
postings_write: CountingWriter<W>,
|
||||
last_doc_id_encoded: u32,
|
||||
|
||||
block_encoder: BlockEncoder,
|
||||
doc_ids: Vec<DocId>,
|
||||
term_freqs: Vec<u32>,
|
||||
|
||||
termfreq_enabled: bool,
|
||||
}
|
||||
|
||||
impl<W: Write> PostingsSerializer<W> {
|
||||
fn new(write: W, termfreq_enabled: bool) -> PostingsSerializer<W> {
|
||||
PostingsSerializer {
|
||||
postings_write: CountingWriter::wrap(write),
|
||||
|
||||
block_encoder: BlockEncoder::new(),
|
||||
doc_ids: vec![],
|
||||
term_freqs: vec![],
|
||||
|
||||
last_doc_id_encoded: 0u32,
|
||||
termfreq_enabled: termfreq_enabled,
|
||||
}
|
||||
}
|
||||
|
||||
fn write_doc(&mut self, doc_id: DocId, term_freq: u32) -> io::Result<()> {
|
||||
self.doc_ids.push(doc_id);
|
||||
if self.text_indexing_options.is_termfreq_enabled() {
|
||||
if self.termfreq_enabled {
|
||||
self.term_freqs.push(term_freq as u32);
|
||||
}
|
||||
if self.text_indexing_options.is_position_enabled() {
|
||||
self.position_deltas.extend_from_slice(position_deltas);
|
||||
}
|
||||
if self.doc_ids.len() == NUM_DOCS_PER_BLOCK {
|
||||
if self.doc_ids.len() == COMPRESSION_BLOCK_SIZE {
|
||||
{
|
||||
// encode the doc ids
|
||||
let block_encoded: &[u8] =
|
||||
self.block_encoder
|
||||
.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
let block_encoded: &[u8] = self.block_encoder.compress_block_sorted(
|
||||
&self.doc_ids,
|
||||
self.last_doc_id_encoded,
|
||||
);
|
||||
self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1];
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
}
|
||||
if self.text_indexing_options.is_termfreq_enabled() {
|
||||
if self.termfreq_enabled {
|
||||
// encode the term_freqs
|
||||
let block_encoded: &[u8] = self.block_encoder
|
||||
.compress_block_unsorted(&self.term_freqs);
|
||||
let block_encoded: &[u8] =
|
||||
self.block_encoder.compress_block_unsorted(&self.term_freqs);
|
||||
self.postings_write.write_all(block_encoded)?;
|
||||
self.term_freqs.clear();
|
||||
}
|
||||
@@ -231,12 +289,93 @@ impl PostingsSerializer {
        Ok(())
    }

    /// Closes the serializer.
    pub fn close(mut self) -> io::Result<()> {
        try!(self.close_term());
        try!(self.terms_fst_builder.finish());
        try!(self.postings_write.flush());
        try!(self.positions_write.flush());
    fn close_term(&mut self) -> io::Result<()> {
        if !self.doc_ids.is_empty() {
            // we have doc ids waiting to be written
            // this happens when the number of doc ids is
            // not a perfect multiple of our block size.
            //
            // In that case, the remaining part is encoded
            // using variable int encoding.
            {
                let block_encoded = self.block_encoder.compress_vint_sorted(
                    &self.doc_ids,
                    self.last_doc_id_encoded,
                );
                self.postings_write.write_all(block_encoded)?;
                self.doc_ids.clear();
            }
            // ... Idem for term frequencies
            if self.termfreq_enabled {
                let block_encoded = self.block_encoder.compress_vint_unsorted(
                    &self.term_freqs[..],
                );
                self.postings_write.write_all(block_encoded)?;
                self.term_freqs.clear();
            }
        }
        Ok(())
    }

    fn close(mut self) -> io::Result<()> {
        self.postings_write.flush()
    }

    fn addr(&self) -> u32 {
        self.postings_write.written_bytes() as u32
    }

    fn clear(&mut self) {
        self.doc_ids.clear();
        self.term_freqs.clear();
        self.last_doc_id_encoded = 0;
    }
}

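Aside: `close_term` above only ever has less than a full compression block of doc ids left unwritten, and that tail is delta-encoded and emitted as variable-length integers. The snippet below is a minimal, self-contained sketch of that tail encoding; the `write_vint` and `encode_tail` helpers are hypothetical stand-ins for illustration, not tantivy's `compress_vint_sorted` API.

```rust
// Minimal sketch of the "tail" encoding used when a posting list does not
// fill a complete compression block (hypothetical helpers, not tantivy's API).
fn write_vint(mut value: u32, out: &mut Vec<u8>) {
    // LEB128-style varint: 7 bits per byte, high bit set while more bytes follow.
    loop {
        let byte = (value & 0x7F) as u8;
        value >>= 7;
        if value == 0 {
            out.push(byte);
            break;
        }
        out.push(byte | 0x80);
    }
}

/// Delta-encode a sorted tail of doc ids against the last doc id flushed in a
/// full block, then varint-encode the deltas.
fn encode_tail(doc_ids: &[u32], last_doc_id_encoded: u32, out: &mut Vec<u8>) {
    let mut previous = last_doc_id_encoded;
    for &doc in doc_ids {
        write_vint(doc - previous, out);
        previous = doc;
    }
}

fn main() {
    let mut out = Vec::new();
    // e.g. 3 doc ids left over after the last full block ended at doc 1_000.
    encode_tail(&[1_004, 1_130, 1_135], 1_000, &mut out);
    assert_eq!(out, vec![4, 126, 5]);
    println!("{:?}", out);
}
```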
struct PositionSerializer<W: Write> {
    buffer: Vec<u32>,
    write: CountingWriter<W>, // See if we can offset the original counting writer.
    block_encoder: BlockEncoder,
}

impl<W: Write> PositionSerializer<W> {
    fn new(write: W) -> PositionSerializer<W> {
        PositionSerializer {
            buffer: Vec::with_capacity(COMPRESSION_BLOCK_SIZE),
            write: CountingWriter::wrap(write),
            block_encoder: BlockEncoder::new(),
        }
    }

    fn addr(&self) -> (u32, u8) {
        (self.write.written_bytes() as u32, self.buffer.len() as u8)
    }

    fn write_block(&mut self) -> io::Result<()> {
        assert_eq!(self.buffer.len(), COMPRESSION_BLOCK_SIZE);
        let block_compressed: &[u8] = self.block_encoder.compress_block_unsorted(&self.buffer);
        self.write.write_all(block_compressed)?;
        self.buffer.clear();
        Ok(())
    }

    fn write(&mut self, mut vals: &[u32]) -> io::Result<()> {
        let mut buffer_len = self.buffer.len();
        while vals.len() + buffer_len >= COMPRESSION_BLOCK_SIZE {
            let len_to_completion = COMPRESSION_BLOCK_SIZE - buffer_len;
            self.buffer.extend_from_slice(&vals[..len_to_completion]);
            self.write_block()?;
            vals = &vals[len_to_completion..];
            buffer_len = self.buffer.len();
        }
        self.buffer.extend_from_slice(&vals);
        Ok(())
    }

    fn close(mut self) -> io::Result<()> {
        self.buffer.resize(COMPRESSION_BLOCK_SIZE, 0u32);
        self.write_block()?;
        self.write.flush()
    }
}

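Aside: `PositionSerializer` only ever flushes complete `COMPRESSION_BLOCK_SIZE` blocks, so a term's positions are located by the byte offset of a block plus an offset inside the still-buffered block, i.e. the `(u32, u8)` pair returned by `addr()` and mirrored by the new `positions_inner_offset` field in the next hunk. Below is a toy sketch of that buffering scheme, assuming a hypothetical block size of 4 and no real compression.

```rust
// Toy version of the block-buffered position writer (assumed block size of 4,
// raw values instead of bit-packed compression).
const BLOCK_SIZE: usize = 4;

struct ToyPositionWriter {
    buffer: Vec<u32>,
    flushed: Vec<u32>, // stands in for the bytes already written to the .pos file
}

impl ToyPositionWriter {
    fn new() -> Self {
        ToyPositionWriter {
            buffer: Vec::with_capacity(BLOCK_SIZE),
            flushed: Vec::new(),
        }
    }

    /// (offset of the next block in the output, offset inside the pending block)
    fn addr(&self) -> (usize, u8) {
        (self.flushed.len(), self.buffer.len() as u8)
    }

    fn write(&mut self, mut vals: &[u32]) {
        let mut buffer_len = self.buffer.len();
        while vals.len() + buffer_len >= BLOCK_SIZE {
            let len_to_completion = BLOCK_SIZE - buffer_len;
            self.buffer.extend_from_slice(&vals[..len_to_completion]);
            self.flushed.extend_from_slice(&self.buffer); // "compress" and flush the full block
            self.buffer.clear();
            vals = &vals[len_to_completion..];
            buffer_len = 0;
        }
        self.buffer.extend_from_slice(vals);
    }
}

fn main() {
    let mut writer = ToyPositionWriter::new();
    writer.write(&[1, 2, 3]);
    assert_eq!(writer.addr(), (0, 3)); // nothing flushed yet, 3 values pending
    writer.write(&[4, 5, 6]);
    assert_eq!(writer.addr(), (4, 2)); // one full block flushed, 2 values pending
}
```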
@@ -12,7 +12,7 @@ use std::io;
/// * `postings_offset` : an offset in the `.idx` file
/// addressing the start of the posting list associated
/// to this term.
#[derive(Debug,Default,Ord,PartialOrd,Eq,PartialEq,Clone)]
#[derive(Debug, Default, Ord, PartialOrd, Eq, PartialEq, Clone)]
pub struct TermInfo {
    /// Number of documents in the segment containing the term
    pub doc_freq: u32,
@@ -20,6 +20,8 @@ pub struct TermInfo {
    pub postings_offset: u32,
    /// Offset within the position (`.pos`) file.
    pub positions_offset: u32,
    /// Offset within the position block.
    pub positions_inner_offset: u8,
}

@@ -27,17 +29,20 @@ impl BinarySerializable for TermInfo {
    fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
        self.doc_freq.serialize(writer)?;
        self.postings_offset.serialize(writer)?;
        self.positions_offset.serialize(writer)
        self.positions_offset.serialize(writer)?;
        self.positions_inner_offset.serialize(writer)
    }

    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
        let doc_freq = try!(u32::deserialize(reader));
        let postings_offset = try!(u32::deserialize(reader));
        let positions_offset = try!(u32::deserialize(reader));
        let doc_freq = u32::deserialize(reader)?;
        let postings_offset = u32::deserialize(reader)?;
        let positions_offset = u32::deserialize(reader)?;
        let positions_inner_offset = u8::deserialize(reader)?;
        Ok(TermInfo {
            doc_freq: doc_freq,
            postings_offset: postings_offset,
            positions_offset: positions_offset,
        })
            doc_freq: doc_freq,
            postings_offset: postings_offset,
            positions_offset: positions_offset,
            positions_inner_offset: positions_inner_offset,
        })
    }
}

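Aside: with `positions_inner_offset` added, each serialized `TermInfo` grows from 12 to 13 bytes. The standalone sketch below round-trips that layout with plain `std::io`; the big-endian byte order and the free `serialize`/`deserialize` functions are assumptions for illustration, not tantivy's `BinarySerializable` implementation.

```rust
use std::io::{self, Read, Write};

// Stand-in for the `TermInfo` shown in the diff above.
#[derive(Debug, Default, PartialEq)]
struct TermInfo {
    doc_freq: u32,
    postings_offset: u32,
    positions_offset: u32,
    positions_inner_offset: u8,
}

// Assumed fixed-width big-endian layout: 4 + 4 + 4 + 1 = 13 bytes per entry.
fn serialize<W: Write>(info: &TermInfo, writer: &mut W) -> io::Result<()> {
    writer.write_all(&info.doc_freq.to_be_bytes())?;
    writer.write_all(&info.postings_offset.to_be_bytes())?;
    writer.write_all(&info.positions_offset.to_be_bytes())?;
    writer.write_all(&[info.positions_inner_offset])
}

fn read_u32_be(bytes: &[u8]) -> u32 {
    ((bytes[0] as u32) << 24) | ((bytes[1] as u32) << 16) | ((bytes[2] as u32) << 8) | (bytes[3] as u32)
}

fn deserialize<R: Read>(reader: &mut R) -> io::Result<TermInfo> {
    let mut buf = [0u8; 13];
    reader.read_exact(&mut buf)?;
    Ok(TermInfo {
        doc_freq: read_u32_be(&buf[0..4]),
        postings_offset: read_u32_be(&buf[4..8]),
        positions_offset: read_u32_be(&buf[8..12]),
        positions_inner_offset: buf[12],
    })
}

fn main() -> io::Result<()> {
    let info = TermInfo {
        doc_freq: 42,
        postings_offset: 128,
        positions_offset: 4096,
        positions_inner_offset: 3,
    };
    let mut bytes = Vec::new();
    serialize(&info, &mut bytes)?;
    assert_eq!(bytes.len(), 13);
    assert_eq!(deserialize(&mut bytes.as_slice())?, info);
    Ok(())
}
```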
@@ -37,10 +37,12 @@ impl Query for BooleanQuery {
|
||||
}
|
||||
|
||||
fn weight(&self, searcher: &Searcher) -> Result<Box<Weight>> {
|
||||
let sub_weights = try!(self.subqueries
|
||||
.iter()
|
||||
.map(|&(ref _occur, ref subquery)| subquery.weight(searcher))
|
||||
.collect());
|
||||
let sub_weights = try!(
|
||||
self.subqueries
|
||||
.iter()
|
||||
.map(|&(ref _occur, ref subquery)| subquery.weight(searcher))
|
||||
.collect()
|
||||
);
|
||||
let occurs: Vec<Occur> = self.subqueries
|
||||
.iter()
|
||||
.map(|&(ref occur, ref _subquery)| *occur)
|
||||
@@ -57,10 +59,9 @@ impl BooleanQuery {
|
||||
let occur_term_queries: Vec<(Occur, Box<Query>)> = terms
|
||||
.into_iter()
|
||||
.map(|term| {
|
||||
let term_query: Box<Query> = box TermQuery::new(term,
|
||||
SegmentPostingsOption::Freq);
|
||||
(Occur::Should, term_query)
|
||||
})
|
||||
let term_query: Box<Query> = box TermQuery::new(term, SegmentPostingsOption::Freq);
|
||||
(Occur::Should, term_query)
|
||||
})
|
||||
.collect();
|
||||
BooleanQuery::from(occur_term_queries)
|
||||
}
|
||||
|
||||
@@ -55,11 +55,11 @@ impl<TScorer: Scorer> BooleanScorer<TScorer> {
|
||||
.map(|posting| posting.doc())
|
||||
.enumerate()
|
||||
.map(|(ord, doc)| {
|
||||
HeapItem {
|
||||
doc: doc,
|
||||
ord: ord as u32,
|
||||
}
|
||||
})
|
||||
HeapItem {
|
||||
doc: doc,
|
||||
ord: ord as u32,
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
BooleanScorer {
|
||||
scorers: non_empty_scorers,
|
||||
|
||||
@@ -22,11 +22,12 @@ impl BooleanWeight {
|
||||
|
||||
impl Weight for BooleanWeight {
|
||||
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
|
||||
let sub_scorers: Vec<Box<Scorer + 'a>> =
|
||||
try!(self.weights
|
||||
.iter()
|
||||
.map(|weight| weight.scorer(reader))
|
||||
.collect());
|
||||
let sub_scorers: Vec<Box<Scorer + 'a>> = try!(
|
||||
self.weights
|
||||
.iter()
|
||||
.map(|weight| weight.scorer(reader))
|
||||
.collect()
|
||||
);
|
||||
let boolean_scorer = BooleanScorer::new(sub_scorers, self.occur_filter);
|
||||
Ok(box boolean_scorer)
|
||||
}
|
||||
|
||||
@@ -64,8 +64,10 @@ mod tests {
|
||||
}
|
||||
|
||||
let make_term_query = |text: &str| {
|
||||
let term_query = TermQuery::new(Term::from_field_text(text_field, text),
|
||||
SegmentPostingsOption::NoFreq);
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, text),
|
||||
SegmentPostingsOption::NoFreq,
|
||||
);
|
||||
let query: Box<Query> = box term_query;
|
||||
query
|
||||
};
|
||||
@@ -87,19 +89,25 @@ mod tests {
|
||||
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]);
|
||||
}
|
||||
{
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::Should, make_term_query("a")),
|
||||
(Occur::Should, make_term_query("b"))]);
|
||||
let boolean_query = BooleanQuery::from(vec![
|
||||
(Occur::Should, make_term_query("a")),
|
||||
(Occur::Should, make_term_query("b")),
|
||||
]);
|
||||
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 2, 3]);
|
||||
}
|
||||
{
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")),
|
||||
(Occur::Should, make_term_query("b"))]);
|
||||
let boolean_query = BooleanQuery::from(vec![
|
||||
(Occur::Must, make_term_query("a")),
|
||||
(Occur::Should, make_term_query("b")),
|
||||
]);
|
||||
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]);
|
||||
}
|
||||
{
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")),
|
||||
(Occur::Should, make_term_query("b")),
|
||||
(Occur::MustNot, make_term_query("d"))]);
|
||||
let boolean_query = BooleanQuery::from(vec![
|
||||
(Occur::Must, make_term_query("a")),
|
||||
(Occur::Should, make_term_query("b")),
|
||||
(Occur::MustNot, make_term_query("d")),
|
||||
]);
|
||||
assert_eq!(matching_docs(&boolean_query), vec![0, 1]);
|
||||
}
|
||||
{
|
||||
|
||||
@@ -61,9 +61,9 @@ mod tests {
|
||||
.map(|text| Term::from_field_text(text_field, text))
|
||||
.collect();
|
||||
let phrase_query = PhraseQuery::from(terms);
|
||||
searcher
|
||||
.search(&phrase_query, &mut test_collector)
|
||||
.expect("search should succeed");
|
||||
searcher.search(&phrase_query, &mut test_collector).expect(
|
||||
"search should succeed",
|
||||
);
|
||||
test_collector.docs()
|
||||
};
|
||||
|
||||
|
||||
@@ -5,12 +5,12 @@ use postings::Postings;
|
||||
use postings::IntersectionDocSet;
|
||||
use DocId;
|
||||
|
||||
pub struct PhraseScorer<'a> {
|
||||
pub intersection_docset: IntersectionDocSet<SegmentPostings<'a>>,
|
||||
pub struct PhraseScorer {
|
||||
pub intersection_docset: IntersectionDocSet<SegmentPostings>,
|
||||
}
|
||||
|
||||
|
||||
impl<'a> PhraseScorer<'a> {
|
||||
impl PhraseScorer {
|
||||
fn phrase_match(&self) -> bool {
|
||||
let mut positions_arr: Vec<&[u32]> = self.intersection_docset
|
||||
.docsets()
|
||||
@@ -54,7 +54,7 @@ impl<'a> PhraseScorer<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> DocSet for PhraseScorer<'a> {
|
||||
impl DocSet for PhraseScorer {
|
||||
fn advance(&mut self) -> bool {
|
||||
while self.intersection_docset.advance() {
|
||||
if self.phrase_match() {
|
||||
@@ -74,7 +74,7 @@ impl<'a> DocSet for PhraseScorer<'a> {
|
||||
}
|
||||
|
||||
|
||||
impl<'a> Scorer for PhraseScorer<'a> {
|
||||
impl Scorer for PhraseScorer {
|
||||
fn score(&self) -> f32 {
|
||||
1f32
|
||||
}
|
||||
|
||||
@@ -22,14 +22,17 @@ impl Weight for PhraseWeight {
|
||||
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
|
||||
let mut term_postings_list = Vec::new();
|
||||
for term in &self.phrase_terms {
|
||||
let inverted_index = reader.inverted_index(term.field());
|
||||
let term_postings_option =
|
||||
reader.read_postings(term, SegmentPostingsOption::FreqAndPositions);
|
||||
inverted_index.read_postings(term, SegmentPostingsOption::FreqAndPositions);
|
||||
if let Some(term_postings) = term_postings_option {
|
||||
term_postings_list.push(term_postings);
|
||||
} else {
|
||||
return Ok(box EmptyScorer);
|
||||
}
|
||||
}
|
||||
Ok(box PhraseScorer { intersection_docset: IntersectionDocSet::from(term_postings_list) })
|
||||
Ok(box PhraseScorer {
|
||||
intersection_docset: IntersectionDocSet::from(term_postings_list),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -61,10 +61,8 @@ pub trait Query: fmt::Debug {
/// - iterate through the matched documents and push them to the collector.
///
fn search(&self, searcher: &Searcher, collector: &mut Collector) -> Result<TimerTree> {

    let mut timer_tree = TimerTree::default();
    let weight = try!(self.weight(searcher));

    {
        let mut search_timer = timer_tree.open("search");
        for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() {

@@ -3,7 +3,8 @@ use combine::char::*;
|
||||
use super::user_input_ast::*;
|
||||
|
||||
fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
|
||||
where I: Stream<Item = char>
|
||||
where
|
||||
I: Stream<Item = char>,
|
||||
{
|
||||
let term_val = || {
|
||||
let word = many1(satisfy(|c: char| c.is_alphanumeric()));
|
||||
@@ -11,27 +12,29 @@ fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
|
||||
phrase.or(word)
|
||||
};
|
||||
|
||||
let negative_numbers = (char('-'), many1(satisfy(|c: char| c.is_numeric())))
|
||||
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
|
||||
let negative_numbers = (char('-'), many1(satisfy(|c: char| c.is_numeric()))).map(
|
||||
|(s1, s2): (char, String)| format!("{}{}", s1, s2),
|
||||
);
|
||||
|
||||
let field = (letter(), many(satisfy(|c: char| c.is_alphanumeric() || c == '_')))
|
||||
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
|
||||
let field = (
|
||||
letter(),
|
||||
many(satisfy(|c: char| c.is_alphanumeric() || c == '_')),
|
||||
).map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
|
||||
|
||||
let term_val_with_field = negative_numbers.or(term_val());
|
||||
|
||||
let term_query = (field, char(':'), term_val_with_field).map(|(field_name, _, phrase)| {
|
||||
UserInputLiteral {
|
||||
field_name:
|
||||
Some(field_name),
|
||||
phrase: phrase,
|
||||
}
|
||||
});
|
||||
UserInputLiteral {
|
||||
field_name: Some(field_name),
|
||||
phrase: phrase,
|
||||
}
|
||||
});
|
||||
let term_default_field = term_val().map(|phrase| {
|
||||
UserInputLiteral {
|
||||
field_name: None,
|
||||
phrase: phrase,
|
||||
}
|
||||
});
|
||||
UserInputLiteral {
|
||||
field_name: None,
|
||||
phrase: phrase,
|
||||
}
|
||||
});
|
||||
try(term_query)
|
||||
.or(term_default_field)
|
||||
.map(UserInputAST::from)
|
||||
@@ -40,25 +43,29 @@ fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
|
||||
|
||||
|
||||
fn leaf<I>(input: I) -> ParseResult<UserInputAST, I>
|
||||
where I: Stream<Item = char>
|
||||
where
|
||||
I: Stream<Item = char>,
|
||||
{
|
||||
(char('-'), parser(literal))
|
||||
.map(|(_, expr)| UserInputAST::Not(box expr))
|
||||
.or((char('+'), parser(literal)).map(|(_, expr)| UserInputAST::Must(box expr)))
|
||||
.or((char('+'), parser(literal)).map(|(_, expr)| {
|
||||
UserInputAST::Must(box expr)
|
||||
}))
|
||||
.or(parser(literal))
|
||||
.parse_stream(input)
|
||||
}
|
||||
|
||||
|
||||
pub fn parse_to_ast<I>(input: I) -> ParseResult<UserInputAST, I>
|
||||
where I: Stream<Item = char>
|
||||
where
|
||||
I: Stream<Item = char>,
|
||||
{
|
||||
sep_by(parser(leaf), spaces())
|
||||
.map(|subqueries: Vec<UserInputAST>| if subqueries.len() == 1 {
|
||||
subqueries.into_iter().next().unwrap()
|
||||
} else {
|
||||
UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect())
|
||||
})
|
||||
subqueries.into_iter().next().unwrap()
|
||||
} else {
|
||||
UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect())
|
||||
})
|
||||
.parse_stream(input)
|
||||
}
|
||||
|
||||
|
||||
@@ -117,20 +117,22 @@ impl QueryParser {
|
||||
|
||||
/// Parse the user query into an AST.
|
||||
fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAST, QueryParserError> {
|
||||
let (user_input_ast, _remaining) = parse_to_ast(query)
|
||||
.map_err(|_| QueryParserError::SyntaxError)?;
|
||||
let (user_input_ast, _remaining) = parse_to_ast(query).map_err(
|
||||
|_| QueryParserError::SyntaxError,
|
||||
)?;
|
||||
self.compute_logical_ast(user_input_ast)
|
||||
}
|
||||
|
||||
fn resolve_field_name(&self, field_name: &str) -> Result<Field, QueryParserError> {
|
||||
self.schema
|
||||
.get_field(field_name)
|
||||
.ok_or_else(|| QueryParserError::FieldDoesNotExist(String::from(field_name)))
|
||||
self.schema.get_field(field_name).ok_or_else(|| {
|
||||
QueryParserError::FieldDoesNotExist(String::from(field_name))
|
||||
})
|
||||
}
|
||||
|
||||
fn compute_logical_ast(&self,
|
||||
user_input_ast: UserInputAST)
|
||||
-> Result<LogicalAST, QueryParserError> {
|
||||
fn compute_logical_ast(
|
||||
&self,
|
||||
user_input_ast: UserInputAST,
|
||||
) -> Result<LogicalAST, QueryParserError> {
|
||||
let (occur, ast) = self.compute_logical_ast_with_occur(user_input_ast)?;
|
||||
if occur == Occur::MustNot {
|
||||
return Err(QueryParserError::AllButQueryForbidden);
|
||||
@@ -138,10 +140,11 @@ impl QueryParser {
|
||||
Ok(ast)
|
||||
}
|
||||
|
||||
fn compute_logical_ast_for_leaf(&self,
|
||||
field: Field,
|
||||
phrase: &str)
|
||||
-> Result<Option<LogicalLiteral>, QueryParserError> {
|
||||
fn compute_logical_ast_for_leaf(
|
||||
&self,
|
||||
field: Field,
|
||||
phrase: &str,
|
||||
) -> Result<Option<LogicalLiteral>, QueryParserError> {
|
||||
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
@@ -174,7 +177,9 @@ impl QueryParser {
|
||||
if terms.is_empty() {
|
||||
Ok(None)
|
||||
} else if terms.len() == 1 {
|
||||
Ok(Some(LogicalLiteral::Term(terms.into_iter().next().unwrap())))
|
||||
Ok(Some(
|
||||
LogicalLiteral::Term(terms.into_iter().next().unwrap()),
|
||||
))
|
||||
} else {
|
||||
Ok(Some(LogicalLiteral::Phrase(terms)))
|
||||
}
|
||||
@@ -191,18 +196,24 @@ impl QueryParser {
|
||||
}
|
||||
}
|
||||
|
||||
fn compute_logical_ast_with_occur(&self,
|
||||
user_input_ast: UserInputAST)
|
||||
-> Result<(Occur, LogicalAST), QueryParserError> {
|
||||
fn compute_logical_ast_with_occur(
|
||||
&self,
|
||||
user_input_ast: UserInputAST,
|
||||
) -> Result<(Occur, LogicalAST), QueryParserError> {
|
||||
match user_input_ast {
|
||||
UserInputAST::Clause(sub_queries) => {
|
||||
let default_occur = self.default_occur();
|
||||
let logical_sub_queries: Vec<(Occur, LogicalAST)> = try!(sub_queries.into_iter()
|
||||
.map(|sub_query| self.compute_logical_ast_with_occur(*sub_query))
|
||||
.map(|res| {
|
||||
res.map(|(occur, sub_ast)| (compose_occur(default_occur, occur), sub_ast))
|
||||
})
|
||||
.collect());
|
||||
let logical_sub_queries: Vec<(Occur, LogicalAST)> = try!(
|
||||
sub_queries
|
||||
.into_iter()
|
||||
.map(|sub_query| self.compute_logical_ast_with_occur(*sub_query))
|
||||
.map(|res| {
|
||||
res.map(|(occur, sub_ast)| {
|
||||
(compose_occur(default_occur, occur), sub_ast)
|
||||
})
|
||||
})
|
||||
.collect()
|
||||
);
|
||||
Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries)))
|
||||
}
|
||||
UserInputAST::Not(subquery) => {
|
||||
@@ -320,9 +331,10 @@ mod test {
|
||||
}
|
||||
|
||||
|
||||
fn parse_query_to_logical_ast(query: &str,
|
||||
default_conjunction: bool)
|
||||
-> Result<LogicalAST, QueryParserError> {
|
||||
fn parse_query_to_logical_ast(
|
||||
query: &str,
|
||||
default_conjunction: bool,
|
||||
) -> Result<LogicalAST, QueryParserError> {
|
||||
let mut query_parser = make_query_parser();
|
||||
if default_conjunction {
|
||||
query_parser.set_conjunction_by_default();
|
||||
@@ -330,9 +342,11 @@ mod test {
|
||||
query_parser.parse_query_to_logical_ast(query)
|
||||
}
|
||||
|
||||
fn test_parse_query_to_logical_ast_helper(query: &str,
|
||||
expected: &str,
|
||||
default_conjunction: bool) {
|
||||
fn test_parse_query_to_logical_ast_helper(
|
||||
query: &str,
|
||||
expected: &str,
|
||||
default_conjunction: bool,
|
||||
) {
|
||||
let query = parse_query_to_logical_ast(query, default_conjunction).unwrap();
|
||||
let query_str = format!("{:?}", query);
|
||||
assert_eq!(query_str, expected);
|
||||
@@ -358,21 +372,29 @@ mod test {
|
||||
}
|
||||
};
|
||||
|
||||
assert_eq!(is_not_indexed_err("notindexed_text:titi"),
|
||||
Some(String::from("notindexed_text")));
|
||||
assert_eq!(is_not_indexed_err("notindexed_u64:23424"),
|
||||
Some(String::from("notindexed_u64")));
|
||||
assert_eq!(is_not_indexed_err("notindexed_i64:-234324"),
|
||||
Some(String::from("notindexed_i64")));
|
||||
assert_eq!(
|
||||
is_not_indexed_err("notindexed_text:titi"),
|
||||
Some(String::from("notindexed_text"))
|
||||
);
|
||||
assert_eq!(
|
||||
is_not_indexed_err("notindexed_u64:23424"),
|
||||
Some(String::from("notindexed_u64"))
|
||||
);
|
||||
assert_eq!(
|
||||
is_not_indexed_err("notindexed_i64:-234324"),
|
||||
Some(String::from("notindexed_i64"))
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
pub fn test_parse_query_untokenized() {
|
||||
test_parse_query_to_logical_ast_helper("nottokenized:\"wordone wordtwo\"",
|
||||
"Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, \
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"nottokenized:\"wordone wordtwo\"",
|
||||
"Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, \
|
||||
101, 32, 119, 111, 114, 100, 116, 119, 111])",
|
||||
false);
|
||||
false,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -381,82 +403,115 @@ mod test {
|
||||
assert!(query_parser.parse_query("signed:2324").is_ok());
|
||||
assert!(query_parser.parse_query("signed:\"22\"").is_ok());
|
||||
assert!(query_parser.parse_query("signed:\"-2234\"").is_ok());
|
||||
assert!(query_parser
|
||||
.parse_query("signed:\"-9999999999999\"")
|
||||
.is_ok());
|
||||
assert!(
|
||||
query_parser
|
||||
.parse_query("signed:\"-9999999999999\"")
|
||||
.is_ok()
|
||||
);
|
||||
assert!(query_parser.parse_query("signed:\"a\"").is_err());
|
||||
assert!(query_parser.parse_query("signed:\"2a\"").is_err());
|
||||
assert!(query_parser
|
||||
.parse_query("signed:\"18446744073709551615\"")
|
||||
.is_err());
|
||||
assert!(
|
||||
query_parser
|
||||
.parse_query("signed:\"18446744073709551615\"")
|
||||
.is_err()
|
||||
);
|
||||
assert!(query_parser.parse_query("unsigned:\"2\"").is_ok());
|
||||
assert!(query_parser.parse_query("unsigned:\"-2\"").is_err());
|
||||
assert!(query_parser
|
||||
.parse_query("unsigned:\"18446744073709551615\"")
|
||||
.is_ok());
|
||||
test_parse_query_to_logical_ast_helper("unsigned:2324",
|
||||
"Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",
|
||||
false);
|
||||
assert!(
|
||||
query_parser
|
||||
.parse_query("unsigned:\"18446744073709551615\"")
|
||||
.is_ok()
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"unsigned:2324",
|
||||
"Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",
|
||||
false,
|
||||
);
|
||||
|
||||
test_parse_query_to_logical_ast_helper("signed:-2324",
|
||||
&format!("{:?}",
|
||||
Term::from_field_i64(Field(2u32), -2324)),
|
||||
false);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"signed:-2324",
|
||||
&format!("{:?}", Term::from_field_i64(Field(2u32), -2324)),
|
||||
false,
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
pub fn test_parse_query_to_ast_disjunction() {
|
||||
test_parse_query_to_logical_ast_helper("title:toto",
|
||||
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
|
||||
false);
|
||||
test_parse_query_to_logical_ast_helper("+title:toto",
|
||||
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
|
||||
false);
|
||||
test_parse_query_to_logical_ast_helper("+title:toto -titi",
|
||||
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:toto",
|
||||
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"+title:toto",
|
||||
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"+title:toto -titi",
|
||||
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
|
||||
-(Term([0, 0, 0, 0, 116, 105, 116, 105]) \
|
||||
Term([0, 0, 0, 1, 116, 105, 116, 105])))",
|
||||
false);
|
||||
assert_eq!(parse_query_to_logical_ast("-title:toto", false)
|
||||
.err()
|
||||
.unwrap(),
|
||||
QueryParserError::AllButQueryForbidden);
|
||||
test_parse_query_to_logical_ast_helper("title:a b",
|
||||
"(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) \
|
||||
false,
|
||||
);
|
||||
assert_eq!(
|
||||
parse_query_to_logical_ast("-title:toto", false)
|
||||
.err()
|
||||
.unwrap(),
|
||||
QueryParserError::AllButQueryForbidden
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:a b",
|
||||
"(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) \
|
||||
Term([0, 0, 0, 1, 98])))",
|
||||
false);
|
||||
test_parse_query_to_logical_ast_helper("title:\"a b\"",
|
||||
"\"[Term([0, 0, 0, 0, 97]), \
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:\"a b\"",
|
||||
"\"[Term([0, 0, 0, 0, 97]), \
|
||||
Term([0, 0, 0, 0, 98])]\"",
|
||||
false);
|
||||
false,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_parse_query_to_ast_conjunction() {
|
||||
test_parse_query_to_logical_ast_helper("title:toto",
|
||||
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
|
||||
true);
|
||||
test_parse_query_to_logical_ast_helper("+title:toto",
|
||||
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
|
||||
true);
|
||||
test_parse_query_to_logical_ast_helper("+title:toto -titi",
|
||||
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:toto",
|
||||
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
|
||||
true,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"+title:toto",
|
||||
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
|
||||
true,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"+title:toto -titi",
|
||||
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
|
||||
-(Term([0, 0, 0, 0, 116, 105, 116, 105]) \
|
||||
Term([0, 0, 0, 1, 116, 105, 116, 105])))",
|
||||
true);
|
||||
assert_eq!(parse_query_to_logical_ast("-title:toto", true)
|
||||
.err()
|
||||
.unwrap(),
|
||||
QueryParserError::AllButQueryForbidden);
|
||||
test_parse_query_to_logical_ast_helper("title:a b",
|
||||
"(+Term([0, 0, 0, 0, 97]) \
|
||||
true,
|
||||
);
|
||||
assert_eq!(
|
||||
parse_query_to_logical_ast("-title:toto", true)
|
||||
.err()
|
||||
.unwrap(),
|
||||
QueryParserError::AllButQueryForbidden
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:a b",
|
||||
"(+Term([0, 0, 0, 0, 97]) \
|
||||
+(Term([0, 0, 0, 0, 98]) \
|
||||
Term([0, 0, 0, 1, 98])))",
|
||||
true);
|
||||
test_parse_query_to_logical_ast_helper("title:\"a b\"",
|
||||
"\"[Term([0, 0, 0, 0, 97]), \
|
||||
true,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:\"a b\"",
|
||||
"\"[Term([0, 0, 0, 0, 97]), \
|
||||
Term([0, 0, 0, 0, 98])]\"",
|
||||
true);
|
||||
true,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,8 +44,10 @@ mod tests {
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let term_query = TermQuery::new(Term::from_field_text(text_field, "a"),
|
||||
SegmentPostingsOption::NoFreq);
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, "a"),
|
||||
SegmentPostingsOption::NoFreq,
|
||||
);
|
||||
let term_weight = term_query.weight(&searcher).unwrap();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let mut term_scorer = term_weight.scorer(segment_reader).unwrap();
|
||||
|
||||
@@ -7,7 +7,8 @@ use postings::Postings;
|
||||
use fastfield::FastFieldReader;
|
||||
|
||||
pub struct TermScorer<TPostings>
|
||||
where TPostings: Postings
|
||||
where
|
||||
TPostings: Postings,
|
||||
{
|
||||
pub idf: Score,
|
||||
pub fieldnorm_reader_opt: Option<U64FastFieldReader>,
|
||||
@@ -15,7 +16,8 @@ pub struct TermScorer<TPostings>
|
||||
}
|
||||
|
||||
impl<TPostings> TermScorer<TPostings>
|
||||
where TPostings: Postings
|
||||
where
|
||||
TPostings: Postings,
|
||||
{
|
||||
pub fn postings(&self) -> &TPostings {
|
||||
&self.postings
|
||||
@@ -23,7 +25,8 @@ impl<TPostings> TermScorer<TPostings>
|
||||
}
|
||||
|
||||
impl<TPostings> DocSet for TermScorer<TPostings>
|
||||
where TPostings: Postings
|
||||
where
|
||||
TPostings: Postings,
|
||||
{
|
||||
fn advance(&mut self) -> bool {
|
||||
self.postings.advance()
|
||||
@@ -40,7 +43,8 @@ impl<TPostings> DocSet for TermScorer<TPostings>
|
||||
}
|
||||
|
||||
impl<TPostings> Scorer for TermScorer<TPostings>
|
||||
where TPostings: Postings
|
||||
where
|
||||
TPostings: Postings,
|
||||
{
|
||||
fn score(&self) -> Score {
|
||||
let doc = self.postings.doc();
|
||||
|
||||
@@ -27,24 +27,28 @@ impl TermWeight {
|
||||
1.0 + (self.num_docs as f32 / (self.doc_freq as f32 + 1.0)).ln()
|
||||
}
|
||||
|
||||
pub fn specialized_scorer<'a>(&'a self,
|
||||
reader: &'a SegmentReader)
|
||||
-> Result<TermScorer<SegmentPostings<'a>>> {
|
||||
/// If the field is not found, returns an empty `DocSet`.
|
||||
pub fn specialized_scorer(
|
||||
&self,
|
||||
reader: &SegmentReader,
|
||||
) -> Result<TermScorer<SegmentPostings>> {
|
||||
let field = self.term.field();
|
||||
let inverted_index = reader.inverted_index(field);
|
||||
let fieldnorm_reader_opt = reader.get_fieldnorms_reader(field);
|
||||
Ok(reader
|
||||
.read_postings(&self.term, self.segment_postings_options)
|
||||
.map(|segment_postings| {
|
||||
TermScorer {
|
||||
idf: self.idf(),
|
||||
fieldnorm_reader_opt: fieldnorm_reader_opt,
|
||||
postings: segment_postings,
|
||||
}
|
||||
})
|
||||
.unwrap_or(TermScorer {
|
||||
idf: 1f32,
|
||||
fieldnorm_reader_opt: None,
|
||||
postings: SegmentPostings::empty(),
|
||||
}))
|
||||
let postings_opt: Option<SegmentPostings> =
|
||||
inverted_index.read_postings(&self.term, self.segment_postings_options);
|
||||
if let Some(segment_postings) = postings_opt {
|
||||
Ok(TermScorer {
|
||||
idf: self.idf(),
|
||||
fieldnorm_reader_opt: fieldnorm_reader_opt,
|
||||
postings: segment_postings,
|
||||
})
|
||||
} else {
|
||||
Ok(TermScorer {
|
||||
idf: 1f32,
|
||||
fieldnorm_reader_opt: None,
|
||||
postings: SegmentPostings::empty(),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ use common::BinarySerializable;
|
||||
///
|
||||
/// Because the field id is a `u8`, tantivy can only have at most `255` fields.
|
||||
/// Value 255 is reserved.
|
||||
#[derive(Copy, Clone, Debug, PartialEq,PartialOrd,Eq,Ord,Hash, Serialize, Deserialize)]
|
||||
#[derive(Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash, Serialize, Deserialize)]
|
||||
pub struct Field(pub u32);
|
||||
|
||||
impl BinarySerializable for Field {
|
||||
|
||||
@@ -89,7 +89,8 @@ impl FieldEntry {
|
||||
|
||||
impl Serialize for FieldEntry {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where S: Serializer
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
let mut s = serializer.serialize_struct("field_entry", 3)?;
|
||||
s.serialize_field("name", &self.name)?;
|
||||
@@ -115,7 +116,8 @@ impl Serialize for FieldEntry {
|
||||
|
||||
impl<'de> Deserialize<'de> for FieldEntry {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where D: Deserializer<'de>
|
||||
where
|
||||
D: Deserializer<'de>,
|
||||
{
|
||||
#[derive(Deserialize)]
|
||||
#[serde(field_identifier, rename_all = "lowercase")]
|
||||
@@ -137,7 +139,8 @@ impl<'de> Deserialize<'de> for FieldEntry {
|
||||
}
|
||||
|
||||
fn visit_map<V>(self, mut map: V) -> Result<FieldEntry, V::Error>
|
||||
where V: MapAccess<'de>
|
||||
where
|
||||
V: MapAccess<'de>,
|
||||
{
|
||||
let mut name = None;
|
||||
let mut ty = None;
|
||||
@@ -187,13 +190,14 @@ impl<'de> Deserialize<'de> for FieldEntry {
|
||||
|
||||
let name = name.ok_or_else(|| de::Error::missing_field("name"))?;
|
||||
ty.ok_or_else(|| de::Error::missing_field("ty"))?;
|
||||
let field_type = field_type
|
||||
.ok_or_else(|| de::Error::missing_field("options"))?;
|
||||
let field_type = field_type.ok_or_else(
|
||||
|| de::Error::missing_field("options"),
|
||||
)?;
|
||||
|
||||
Ok(FieldEntry {
|
||||
name: name,
|
||||
field_type: field_type,
|
||||
})
|
||||
name: name,
|
||||
field_type: field_type,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -80,8 +80,9 @@ impl FieldType {
|
||||
FieldType::Str(_) => Ok(Value::Str(field_text.clone())),
|
||||
FieldType::U64(_) |
|
||||
FieldType::I64(_) => {
|
||||
Err(ValueParsingError::TypeError(format!("Expected an integer, got {:?}",
|
||||
json)))
|
||||
Err(ValueParsingError::TypeError(
|
||||
format!("Expected an integer, got {:?}", json),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -110,9 +111,11 @@ impl FieldType {
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
let msg = format!("Json value not supported error {:?}. Expected {:?}",
|
||||
json,
|
||||
self);
|
||||
let msg = format!(
|
||||
"Json value not supported error {:?}. Expected {:?}",
|
||||
json,
|
||||
self
|
||||
);
|
||||
Err(ValueParsingError::TypeError(msg))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -105,9 +105,9 @@ impl SchemaBuilder {
|
||||
/// This will consume your `SchemaBuilder`
|
||||
pub fn build(self) -> Schema {
|
||||
Schema(Arc::new(InnerSchema {
|
||||
fields: self.fields,
|
||||
fields_map: self.fields_map,
|
||||
}))
|
||||
fields: self.fields,
|
||||
fields_map: self.fields_map,
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -206,15 +206,14 @@ impl Schema {
|
||||
|
||||
/// Build a document object from a json-object.
|
||||
pub fn parse_document(&self, doc_json: &str) -> Result<Document, DocParsingError> {
|
||||
let json_obj: JsonObject<String, JsonValue> = serde_json::from_str(doc_json)
|
||||
.map_err(|_| {
|
||||
let doc_json_sample: String = if doc_json.len() < 20 {
|
||||
String::from(doc_json)
|
||||
} else {
|
||||
format!("{:?}...", &doc_json[0..20])
|
||||
};
|
||||
DocParsingError::NotJSON(doc_json_sample)
|
||||
})?;
|
||||
let json_obj: JsonObject<String, JsonValue> = serde_json::from_str(doc_json).map_err(|_| {
|
||||
let doc_json_sample: String = if doc_json.len() < 20 {
|
||||
String::from(doc_json)
|
||||
} else {
|
||||
format!("{:?}...", &doc_json[0..20])
|
||||
};
|
||||
DocParsingError::NotJSON(doc_json_sample)
|
||||
})?;
|
||||
|
||||
let mut doc = Document::default();
|
||||
for (field_name, json_value) in json_obj.iter() {
|
||||
@@ -225,18 +224,15 @@ impl Schema {
|
||||
match *json_value {
|
||||
JsonValue::Array(ref json_items) => {
|
||||
for json_item in json_items {
|
||||
let value = try!(field_type
|
||||
.value_from_json(json_item)
|
||||
.map_err(|e| {
|
||||
DocParsingError::ValueError(field_name.clone(), e)
|
||||
}));
|
||||
let value =
|
||||
try!(field_type.value_from_json(json_item).map_err(|e| {
|
||||
DocParsingError::ValueError(field_name.clone(), e)
|
||||
}));
|
||||
doc.add(FieldValue::new(field, value));
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
let value = try!(field_type
|
||||
.value_from_json(json_value)
|
||||
.map_err(|e| {
|
||||
let value = try!(field_type.value_from_json(json_value).map_err(|e| {
|
||||
DocParsingError::ValueError(field_name.clone(), e)
|
||||
}));
|
||||
doc.add(FieldValue::new(field, value));
|
||||
@@ -259,7 +255,8 @@ impl fmt::Debug for Schema {
|
||||
|
||||
impl Serialize for Schema {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where S: Serializer
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
let mut seq = serializer.serialize_seq(Some(self.0.fields.len()))?;
|
||||
for e in &self.0.fields {
|
||||
@@ -271,7 +268,8 @@ impl Serialize for Schema {
|
||||
|
||||
impl<'de> Deserialize<'de> for Schema {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where D: Deserializer<'de>
|
||||
where
|
||||
D: Deserializer<'de>,
|
||||
{
|
||||
struct SchemaVisitor;
|
||||
|
||||
@@ -283,7 +281,8 @@ impl<'de> Deserialize<'de> for Schema {
|
||||
}
|
||||
|
||||
fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
|
||||
where A: SeqAccess<'de>
|
||||
where
|
||||
A: SeqAccess<'de>,
|
||||
{
|
||||
let mut schema = SchemaBuilder {
|
||||
fields: Vec::with_capacity(seq.size_hint().unwrap_or(0)),
|
||||
@@ -430,12 +429,14 @@ mod tests {
|
||||
}
|
||||
{
|
||||
let doc = schema
|
||||
.parse_document(r#"{
|
||||
.parse_document(
|
||||
r#"{
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": 4,
|
||||
"popularity": 10
|
||||
}"#)
|
||||
}"#,
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(doc.get_first(title_field).unwrap().text(), "my title");
|
||||
assert_eq!(doc.get_first(author_field).unwrap().text(), "fulmicoton");
|
||||
@@ -443,13 +444,15 @@ mod tests {
|
||||
assert_eq!(doc.get_first(popularity_field).unwrap().i64_value(), 10);
|
||||
}
|
||||
{
|
||||
let json_err = schema.parse_document(r#"{
|
||||
let json_err = schema.parse_document(
|
||||
r#"{
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": 4,
|
||||
"popularity": 10,
|
||||
"jambon": "bayonne"
|
||||
}"#);
|
||||
}"#,
|
||||
);
|
||||
match json_err {
|
||||
Err(DocParsingError::NoSuchFieldInSchema(field_name)) => {
|
||||
assert_eq!(field_name, "jambon");
|
||||
@@ -460,13 +463,15 @@ mod tests {
|
||||
}
|
||||
}
|
||||
{
|
||||
let json_err = schema.parse_document(r#"{
|
||||
let json_err = schema.parse_document(
|
||||
r#"{
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": "5",
|
||||
"popularity": "10",
|
||||
"jambon": "bayonne"
|
||||
}"#);
|
||||
}"#,
|
||||
);
|
||||
match json_err {
|
||||
Err(DocParsingError::ValueError(_, ValueParsingError::TypeError(_))) => {
|
||||
assert!(true);
|
||||
@@ -477,12 +482,14 @@ mod tests {
|
||||
}
|
||||
}
|
||||
{
|
||||
let json_err = schema.parse_document(r#"{
|
||||
let json_err = schema.parse_document(
|
||||
r#"{
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": -5,
|
||||
"popularity": 10
|
||||
}"#);
|
||||
}"#,
|
||||
);
|
||||
match json_err {
|
||||
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
|
||||
assert!(true);
|
||||
@@ -493,12 +500,14 @@ mod tests {
|
||||
}
|
||||
}
|
||||
{
|
||||
let json_err = schema.parse_document(r#"{
|
||||
let json_err = schema.parse_document(
|
||||
r#"{
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": 9223372036854775808,
|
||||
"popularity": 10
|
||||
}"#);
|
||||
}"#,
|
||||
);
|
||||
match json_err {
|
||||
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
|
||||
panic!("expected 9223372036854775808 to fit into u64, but it didn't");
|
||||
@@ -509,12 +518,14 @@ mod tests {
|
||||
}
|
||||
}
|
||||
{
|
||||
let json_err = schema.parse_document(r#"{
|
||||
let json_err = schema.parse_document(
|
||||
r#"{
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": 50,
|
||||
"popularity": 9223372036854775808
|
||||
}"#);
|
||||
}"#,
|
||||
);
|
||||
match json_err {
|
||||
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
|
||||
assert!(true);
|
||||
@@ -525,11 +536,13 @@ mod tests {
|
||||
}
|
||||
}
|
||||
{
|
||||
let json_err = schema.parse_document(r#"{
|
||||
let json_err = schema.parse_document(
|
||||
r#"{
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": 50,
|
||||
}"#);
|
||||
}"#,
|
||||
);
|
||||
match json_err {
|
||||
Err(NotJSON(_)) => {
|
||||
assert!(true);
|
||||
|
||||
@@ -13,7 +13,9 @@ const INT_TERM_LEN: usize = 4 + 8;
|
||||
///
|
||||
/// It actually wraps a `Vec<u8>`.
|
||||
#[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
|
||||
pub struct Term<B = Vec<u8>>(B) where B: AsRef<[u8]>;
|
||||
pub struct Term<B = Vec<u8>>(B)
|
||||
where
|
||||
B: AsRef<[u8]>;
|
||||
|
||||
impl Term {
|
||||
/// Builds a term given a field, and a u64-value
|
||||
@@ -109,7 +111,8 @@ impl Term {
|
||||
}
|
||||
|
||||
impl<B> Term<B>
|
||||
where B: AsRef<[u8]>
|
||||
where
|
||||
B: AsRef<[u8]>,
|
||||
{
|
||||
/// Wraps a source of data
|
||||
pub fn wrap(data: B) -> Term<B> {
|
||||
@@ -166,7 +169,8 @@ impl<B> Term<B>
|
||||
}
|
||||
|
||||
impl<B> AsRef<[u8]> for Term<B>
|
||||
where B: AsRef<[u8]>
|
||||
where
|
||||
B: AsRef<[u8]>,
|
||||
{
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
self.0.as_ref()
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::ops::BitOr;
|
||||
|
||||
|
||||
/// Define how a text field should be handled by tantivy.
|
||||
#[derive(Clone,Debug,PartialEq,Eq, Serialize, Deserialize)]
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct TextOptions {
|
||||
indexing: TextIndexingOptions,
|
||||
stored: bool,
|
||||
@@ -45,10 +45,10 @@ impl Default for TextOptions {
|
||||
|
||||
|
||||
/// Describe how a field should be indexed
|
||||
#[derive(Clone,Copy,Debug,PartialEq,PartialOrd,Eq,Hash, Serialize, Deserialize)]
|
||||
#[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Hash, Serialize, Deserialize)]
|
||||
pub enum TextIndexingOptions {
|
||||
/// Unindexed fields will not generate any postings. They will not be searchable either.
|
||||
#[serde(rename="unindexed")]
|
||||
#[serde(rename = "unindexed")]
|
||||
Unindexed,
|
||||
/// Untokenized means that the field text will not be split into tokens before being indexed.
/// A field with the value "Hello world" will have the document subscribe to one single
@@ -56,23 +56,23 @@ pub enum TextIndexingOptions {
///
/// It will **not** be searchable if the user enters "hello" for instance.
/// This can be useful for tags, or ids for instance.
#[serde(rename="untokenized")]
#[serde(rename = "untokenized")]
Untokenized,
/// TokenizedNoFreq will tokenize the field value, and append the document doc id
/// to the posting lists associated to all of the tokens.
/// The frequency of appearance of the term in the document, however, will be lost.
/// The term frequency used in the TfIdf formula will always be 1.
#[serde(rename="tokenize")]
|
||||
#[serde(rename = "tokenize")]
|
||||
TokenizedNoFreq,
|
||||
/// TokenizedWithFreq will tokenize the field value, and encode
|
||||
/// both the docid and the term frequency in the posting lists associated to all
|
||||
#[serde(rename="freq")]
|
||||
#[serde(rename = "freq")]
|
||||
TokenizedWithFreq,
|
||||
/// Like TokenizedWithFreq, but also encodes the positions of the
|
||||
/// terms in a separate file. This option is required for phrase queries.
|
||||
/// Don't use this if you are certain you won't need it, the term positions file
|
||||
/// can be very big.
|
||||
#[serde(rename="position")]
|
||||
#[serde(rename = "position")]
|
||||
TokenizedWithFreqAndPosition,
|
||||
}
|
||||
|
||||
|
||||
@@ -16,7 +16,8 @@ pub enum Value {
|
||||
|
||||
impl Serialize for Value {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where S: Serializer
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
match *self {
|
||||
Value::Str(ref v) => serializer.serialize_str(v),
|
||||
@@ -28,7 +29,8 @@ impl Serialize for Value {
|
||||
|
||||
impl<'de> Deserialize<'de> for Value {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where D: Deserializer<'de>
|
||||
where
|
||||
D: Deserializer<'de>,
|
||||
{
|
||||
struct ValueVisitor;
|
||||
|
||||
@@ -162,9 +164,13 @@ mod binary_serialize {
|
||||
Ok(Value::I64(value))
|
||||
}
|
||||
_ => {
|
||||
Err(io::Error::new(io::ErrorKind::InvalidData,
|
||||
format!("No field type is associated with code {:?}",
|
||||
type_code)))
|
||||
Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
format!(
|
||||
"No field type is associated with code {:?}",
|
||||
type_code
|
||||
),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -54,17 +54,19 @@ mod tests {
|
||||
fn write_lorem_ipsum_store(writer: WritePtr, num_docs: usize) -> Schema {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let field_body = schema_builder.add_text_field("body", TextOptions::default().set_stored());
|
||||
let field_title = schema_builder
|
||||
.add_text_field("title", TextOptions::default().set_stored());
|
||||
let field_title =
|
||||
schema_builder.add_text_field("title", TextOptions::default().set_stored());
|
||||
let schema = schema_builder.build();
|
||||
let lorem = String::from("Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
|
||||
let lorem = String::from(
|
||||
"Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
|
||||
do eiusmod tempor incididunt ut labore et dolore magna aliqua. \
|
||||
Ut enim ad minim veniam, quis nostrud exercitation ullamco \
|
||||
laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure \
|
||||
dolor in reprehenderit in voluptate velit esse cillum dolore eu \
|
||||
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non \
|
||||
proident, sunt in culpa qui officia deserunt mollit anim id est \
|
||||
laborum.");
|
||||
laborum.",
|
||||
);
|
||||
{
|
||||
let mut store_writer = StoreWriter::new(writer);
|
||||
for i in 0..num_docs {
|
||||
@@ -96,8 +98,10 @@ mod tests {
|
||||
let store_source = directory.open_read(path).unwrap();
|
||||
let store = StoreReader::from_source(store_source);
|
||||
for i in 0..1_000 {
|
||||
assert_eq!(*store.get(i).unwrap().get_first(field_title).unwrap().text(),
|
||||
format!("Doc {}", i));
|
||||
assert_eq!(
|
||||
*store.get(i).unwrap().get_first(field_title).unwrap().text(),
|
||||
format!("Doc {}", i)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -106,9 +110,9 @@ mod tests {
|
||||
let mut directory = MmapDirectory::create_from_tempdir().unwrap();
|
||||
let path = Path::new("store");
|
||||
b.iter(|| {
|
||||
write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000);
|
||||
directory.delete(path).unwrap();
|
||||
});
|
||||
write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000);
|
||||
directory.delete(path).unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -49,7 +49,7 @@ impl StoreReader {
|
||||
let mut cursor = &total_buffer[block_offset..];
|
||||
let block_length = u32::deserialize(&mut cursor).unwrap();
|
||||
let block_array: &[u8] = &total_buffer[(block_offset + 4 as usize)..
|
||||
(block_offset + 4 + block_length as usize)];
|
||||
(block_offset + 4 + block_length as usize)];
|
||||
let mut lz4_decoder = try!(lz4::Decoder::new(block_array));
|
||||
*self.current_block_offset.borrow_mut() = usize::max_value();
|
||||
try!(lz4_decoder.read_to_end(&mut current_block_mut).map(|_| ()));
|
||||
@@ -94,5 +94,9 @@ fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId)
|
||||
let offset = u64::deserialize(&mut serialized_offset_buf).unwrap();
|
||||
let offset = offset as usize;
|
||||
let max_doc = u32::deserialize(&mut serialized_offset_buf).unwrap();
|
||||
(data.slice(0, offset), data.slice(offset, footer_offset), max_doc)
|
||||
(
|
||||
data.slice(0, offset),
|
||||
data.slice(offset, footer_offset),
|
||||
max_doc,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -49,12 +49,15 @@ impl StoreWriter {
|
||||
///
|
||||
pub fn store<'a>(&mut self, field_values: &[&'a FieldValue]) -> io::Result<()> {
|
||||
self.intermediary_buffer.clear();
|
||||
try!((field_values.len() as u32).serialize(&mut self.intermediary_buffer));
|
||||
try!((field_values.len() as u32).serialize(
|
||||
&mut self.intermediary_buffer,
|
||||
));
|
||||
for field_value in field_values {
|
||||
try!((*field_value).serialize(&mut self.intermediary_buffer));
|
||||
}
|
||||
(self.intermediary_buffer.len() as u32)
|
||||
.serialize(&mut self.current_block)?;
|
||||
(self.intermediary_buffer.len() as u32).serialize(
|
||||
&mut self.current_block,
|
||||
)?;
|
||||
self.current_block.write_all(&self.intermediary_buffer[..])?;
|
||||
self.doc += 1;
|
||||
if self.current_block.len() > BLOCK_SIZE {
|
||||
@@ -66,16 +69,22 @@ impl StoreWriter {
|
||||
fn write_and_compress_block(&mut self) -> io::Result<()> {
|
||||
self.intermediary_buffer.clear();
|
||||
{
|
||||
let mut encoder = try!(lz4::EncoderBuilder::new().build(&mut self.intermediary_buffer));
|
||||
let mut encoder = try!(lz4::EncoderBuilder::new().build(
|
||||
&mut self.intermediary_buffer,
|
||||
));
|
||||
try!(encoder.write_all(&self.current_block));
|
||||
let (_, encoder_result) = encoder.finish();
|
||||
try!(encoder_result);
|
||||
}
|
||||
(self.intermediary_buffer.len() as u32)
|
||||
.serialize(&mut self.writer)?;
|
||||
(self.intermediary_buffer.len() as u32).serialize(
|
||||
&mut self.writer,
|
||||
)?;
|
||||
self.writer.write_all(&self.intermediary_buffer)?;
|
||||
self.offset_index_writer
|
||||
.insert(self.doc, &(self.writer.written_bytes() as u64))?;
|
||||
self.offset_index_writer.insert(
|
||||
self.doc,
|
||||
&(self.writer.written_bytes() as
|
||||
u64),
|
||||
)?;
|
||||
self.current_block.clear();
|
||||
Ok(())
|
||||
}
|
||||
@@ -90,8 +99,7 @@ impl StoreWriter {
|
||||
try!(self.write_and_compress_block());
|
||||
}
|
||||
let header_offset: u64 = self.writer.written_bytes() as u64;
|
||||
try!(self.offset_index_writer
|
||||
.write(&mut self.writer));
|
||||
try!(self.offset_index_writer.write(&mut self.writer));
|
||||
try!(header_offset.serialize(&mut self.writer));
|
||||
try!(self.doc.serialize(&mut self.writer));
|
||||
self.writer.flush()
|
||||
|
||||
@@ -1,23 +1,17 @@
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use fst::map::{StreamBuilder, Stream};
|
||||
use common::BinarySerializable;
|
||||
use postings::TermInfo;
|
||||
use super::TermDictionaryImpl;
|
||||
use termdict::{TermStreamerBuilder, TermStreamer};
|
||||
|
||||
/// See [`TermStreamerBuilder`](./trait.TermStreamerBuilder.html)
|
||||
pub struct TermStreamerBuilderImpl<'a, V>
|
||||
where V: 'a + BinarySerializable + Default
|
||||
{
|
||||
fst_map: &'a TermDictionaryImpl<V>,
|
||||
pub struct TermStreamerBuilderImpl<'a> {
|
||||
fst_map: &'a TermDictionaryImpl,
|
||||
stream_builder: StreamBuilder<'a>,
|
||||
}
|
||||
|
||||
impl<'a, V> TermStreamerBuilderImpl<'a, V>
|
||||
where V: 'a + BinarySerializable + Default
|
||||
{
|
||||
pub(crate) fn new(fst_map: &'a TermDictionaryImpl<V>,
|
||||
stream_builder: StreamBuilder<'a>)
|
||||
-> Self {
|
||||
impl<'a> TermStreamerBuilderImpl<'a> {
|
||||
pub(crate) fn new(fst_map: &'a TermDictionaryImpl, stream_builder: StreamBuilder<'a>) -> Self {
|
||||
TermStreamerBuilderImpl {
|
||||
fst_map: fst_map,
|
||||
stream_builder: stream_builder,
|
||||
@@ -25,10 +19,8 @@ impl<'a, V> TermStreamerBuilderImpl<'a, V>
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, V> TermStreamerBuilder<V> for TermStreamerBuilderImpl<'a, V>
|
||||
where V: 'a + BinarySerializable + Default
|
||||
{
|
||||
type Streamer = TermStreamerImpl<'a, V>;
|
||||
impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> {
|
||||
type Streamer = TermStreamerImpl<'a>;
|
||||
|
||||
fn ge<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
|
||||
self.stream_builder = self.stream_builder.ge(bound);
|
||||
@@ -56,35 +48,30 @@ impl<'a, V> TermStreamerBuilder<V> for TermStreamerBuilderImpl<'a, V>
|
||||
stream: self.stream_builder.into_stream(),
|
||||
offset: 0u64,
|
||||
current_key: Vec::with_capacity(100),
|
||||
current_value: V::default(),
|
||||
current_value: TermInfo::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// See [`TermStreamer`](./trait.TermStreamer.html)
|
||||
pub struct TermStreamerImpl<'a, V>
|
||||
where V: 'a + BinarySerializable + Default
|
||||
{
|
||||
fst_map: &'a TermDictionaryImpl<V>,
|
||||
pub struct TermStreamerImpl<'a> {
|
||||
fst_map: &'a TermDictionaryImpl,
|
||||
stream: Stream<'a>,
|
||||
offset: u64,
|
||||
current_key: Vec<u8>,
|
||||
current_value: V,
|
||||
current_value: TermInfo,
|
||||
}
|
||||
|
||||
impl<'a, V> TermStreamer<V> for TermStreamerImpl<'a, V>
|
||||
where V: BinarySerializable + Default
|
||||
{
|
||||
impl<'a> TermStreamer for TermStreamerImpl<'a> {
|
||||
fn advance(&mut self) -> bool {
|
||||
if let Some((term, offset)) = self.stream.next() {
|
||||
self.current_key.clear();
|
||||
self.current_key.extend_from_slice(term);
|
||||
self.offset = offset;
|
||||
self.current_value =
|
||||
self.fst_map
|
||||
.read_value(self.offset)
|
||||
.expect("Fst data is corrupted. Failed to deserialize a value.");
|
||||
self.current_value = self.fst_map.read_value(self.offset).expect(
|
||||
"Fst data is corrupted. Failed to deserialize a value.",
|
||||
);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
@@ -95,7 +82,7 @@ impl<'a, V> TermStreamer<V> for TermStreamerImpl<'a, V>
|
||||
&self.current_key
|
||||
}
|
||||
|
||||
fn value(&self) -> &V {
|
||||
fn value(&self) -> &TermInfo {
|
||||
&self.current_value
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use fst;
|
||||
use fst::raw::Fst;
|
||||
use directory::ReadOnlySource;
|
||||
use common::BinarySerializable;
|
||||
use std::marker::PhantomData;
|
||||
use schema::FieldType;
|
||||
use postings::TermInfo;
|
||||
use termdict::{TermDictionary, TermDictionaryBuilder};
|
||||
use super::{TermStreamerImpl, TermStreamerBuilderImpl};
|
||||
@@ -13,18 +13,14 @@ fn convert_fst_error(e: fst::Error) -> io::Error {
|
||||
}
|
||||
|
||||
/// See [`TermDictionaryBuilder`](./trait.TermDictionaryBuilder.html)
|
||||
pub struct TermDictionaryBuilderImpl<W, V = TermInfo>
|
||||
where W: Write,
|
||||
V: BinarySerializable + Default
|
||||
{
|
||||
pub struct TermDictionaryBuilderImpl<W> {
|
||||
fst_builder: fst::MapBuilder<W>,
|
||||
data: Vec<u8>,
|
||||
_phantom_: PhantomData<V>,
|
||||
}
|
||||
|
||||
impl<W, V> TermDictionaryBuilderImpl<W, V>
|
||||
where W: Write,
|
||||
V: BinarySerializable + Default
|
||||
impl<W> TermDictionaryBuilderImpl<W>
|
||||
where
|
||||
W: Write,
|
||||
{
|
||||
/// # Warning
|
||||
/// Horribly dangerous internal API
|
||||
@@ -43,26 +39,25 @@ impl<W, V> TermDictionaryBuilderImpl<W, V>
|
||||
/// # Warning
|
||||
///
|
||||
/// Horribly dangerous internal API. See `.insert_key(...)`.
|
||||
pub(crate) fn insert_value(&mut self, value: &V) -> io::Result<()> {
|
||||
pub(crate) fn insert_value(&mut self, value: &TermInfo) -> io::Result<()> {
|
||||
value.serialize(&mut self.data)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<W, V> TermDictionaryBuilder<W, V> for TermDictionaryBuilderImpl<W, V>
|
||||
where W: Write,
|
||||
V: BinarySerializable + Default
|
||||
impl<W> TermDictionaryBuilder<W> for TermDictionaryBuilderImpl<W>
|
||||
where
|
||||
W: Write,
|
||||
{
|
||||
fn new(w: W) -> io::Result<Self> {
|
||||
fn new(w: W, _field_type: FieldType) -> io::Result<Self> {
|
||||
let fst_builder = fst::MapBuilder::new(w).map_err(convert_fst_error)?;
|
||||
Ok(TermDictionaryBuilderImpl {
|
||||
fst_builder: fst_builder,
|
||||
data: Vec::new(),
|
||||
_phantom_: PhantomData,
|
||||
})
|
||||
fst_builder: fst_builder,
|
||||
data: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
fn insert<K: AsRef<[u8]>>(&mut self, key_ref: K, value: &V) -> io::Result<()> {
|
||||
fn insert<K: AsRef<[u8]>>(&mut self, key_ref: K, value: &TermInfo) -> io::Result<()> {
|
||||
let key = key_ref.as_ref();
|
||||
self.fst_builder
|
||||
.insert(key, self.data.len() as u64)
|
||||
@@ -81,73 +76,65 @@ impl<W, V> TermDictionaryBuilder<W, V> for TermDictionaryBuilderImpl<W, V>
|
||||
}
|
||||
}
|
||||
|
||||
fn open_fst_index(source: ReadOnlySource) -> io::Result<fst::Map> {
|
||||
fn open_fst_index(source: ReadOnlySource) -> fst::Map {
|
||||
let fst = match source {
|
||||
ReadOnlySource::Anonymous(data) => {
|
||||
Fst::from_shared_bytes(data.data, data.start, data.len)
|
||||
.map_err(convert_fst_error)?
|
||||
Fst::from_shared_bytes(data.data, data.start, data.len).expect("FST data is corrupted")
|
||||
}
|
||||
ReadOnlySource::Mmap(mmap_readonly) => {
|
||||
Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)?
|
||||
Fst::from_mmap(mmap_readonly).expect("FST data is corrupted")
|
||||
}
|
||||
};
|
||||
Ok(fst::Map::from(fst))
|
||||
fst::Map::from(fst)
|
||||
}
|
||||
|
||||
/// See [`TermDictionary`](./trait.TermDictionary.html)
|
||||
pub struct TermDictionaryImpl<V = TermInfo>
|
||||
where V: BinarySerializable + Default
|
||||
{
|
||||
pub struct TermDictionaryImpl {
|
||||
fst_index: fst::Map,
|
||||
values_mmap: ReadOnlySource,
|
||||
_phantom_: PhantomData<V>,
|
||||
}
|
||||
|
||||
impl<V> TermDictionaryImpl<V>
|
||||
where V: BinarySerializable + Default
|
||||
{
|
||||
impl TermDictionaryImpl {
|
||||
/// Deserialize and returns the value at address `offset`
|
||||
pub(crate) fn read_value(&self, offset: u64) -> io::Result<V> {
|
||||
pub(crate) fn read_value(&self, offset: u64) -> io::Result<TermInfo> {
|
||||
let buffer = self.values_mmap.as_slice();
|
||||
let mut cursor = &buffer[(offset as usize)..];
|
||||
V::deserialize(&mut cursor)
|
||||
TermInfo::deserialize(&mut cursor)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl<'a, V> TermDictionary<'a, V> for TermDictionaryImpl<V>
|
||||
where V: BinarySerializable + Default + 'a
|
||||
{
|
||||
type Streamer = TermStreamerImpl<'a, V>;
|
||||
impl<'a> TermDictionary<'a> for TermDictionaryImpl {
|
||||
type Streamer = TermStreamerImpl<'a>;
|
||||
|
||||
type StreamBuilder = TermStreamerBuilderImpl<'a, V>;
|
||||
type StreamBuilder = TermStreamerBuilderImpl<'a>;
|
||||
|
||||
fn from_source(source: ReadOnlySource) -> io::Result<Self> {
|
||||
fn from_source(source: ReadOnlySource) -> Self {
|
||||
let total_len = source.len();
|
||||
let length_offset = total_len - 4;
|
||||
let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..];
|
||||
let footer_size = u32::deserialize(&mut split_len_buffer)? as usize;
|
||||
let footer_size = u32::deserialize(&mut split_len_buffer).expect(
|
||||
"Deserializing 4 bytes should always work",
|
||||
) as usize;
|
||||
let split_len = length_offset - footer_size;
|
||||
let fst_source = source.slice(0, split_len);
|
||||
let values_source = source.slice(split_len, length_offset);
|
||||
let fst_index = open_fst_index(fst_source)?;
|
||||
Ok(TermDictionaryImpl {
|
||||
fst_index: fst_index,
|
||||
values_mmap: values_source,
|
||||
_phantom_: PhantomData,
|
||||
})
|
||||
let fst_index = open_fst_index(fst_source);
|
||||
TermDictionaryImpl {
|
||||
fst_index: fst_index,
|
||||
values_mmap: values_source,
|
||||
}
|
||||
}
|
||||
|
||||
fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<V> {
|
||||
self.fst_index
|
||||
.get(key)
|
||||
.map(|offset| {
|
||||
self.read_value(offset)
|
||||
.expect("The fst is corrupted. Failed to deserialize a value.")
|
||||
})
|
||||
fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<TermInfo> {
|
||||
self.fst_index.get(key).map(|offset| {
|
||||
self.read_value(offset).expect(
|
||||
"The fst is corrupted. Failed to deserialize a value.",
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
fn range(&self) -> TermStreamerBuilderImpl<V> {
|
||||
fn range(&self) -> TermStreamerBuilderImpl {
|
||||
TermStreamerBuilderImpl::new(self, self.fst_index.range())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,42 +1,30 @@
|
||||
use std::collections::BinaryHeap;
|
||||
use core::SegmentReader;
|
||||
use termdict::TermStreamerImpl;
|
||||
use common::BinarySerializable;
|
||||
use postings::TermInfo;
|
||||
use std::cmp::Ordering;
|
||||
use termdict::TermStreamer;
|
||||
use termdict::TermDictionary;
|
||||
use schema::Term;
|
||||
|
||||
pub struct HeapItem<'a, V>
|
||||
where V: 'a + BinarySerializable + Default
|
||||
{
|
||||
pub streamer: TermStreamerImpl<'a, V>,
|
||||
pub struct HeapItem<'a> {
|
||||
pub streamer: TermStreamerImpl<'a>,
|
||||
pub segment_ord: usize,
|
||||
}
|
||||
|
||||
impl<'a, V> PartialEq for HeapItem<'a, V>
|
||||
where V: 'a + BinarySerializable + Default
|
||||
{
|
||||
impl<'a> PartialEq for HeapItem<'a> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.segment_ord == other.segment_ord
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, V> Eq for HeapItem<'a, V> where V: 'a + BinarySerializable + Default {}
|
||||
impl<'a> Eq for HeapItem<'a> {}
|
||||
|
||||
impl<'a, V> PartialOrd for HeapItem<'a, V>
|
||||
where V: 'a + BinarySerializable + Default
|
||||
{
|
||||
fn partial_cmp(&self, other: &HeapItem<'a, V>) -> Option<Ordering> {
|
||||
impl<'a> PartialOrd for HeapItem<'a> {
|
||||
fn partial_cmp(&self, other: &HeapItem<'a>) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, V> Ord for HeapItem<'a, V>
|
||||
where V: 'a + BinarySerializable + Default
|
||||
{
|
||||
fn cmp(&self, other: &HeapItem<'a, V>) -> Ordering {
|
||||
impl<'a> Ord for HeapItem<'a> {
|
||||
fn cmp(&self, other: &HeapItem<'a>) -> Ordering {
|
||||
(&other.streamer.key(), &other.segment_ord).cmp(&(&self.streamer.key(), &self.segment_ord))
|
||||
}
|
||||
}
|
||||
@@ -48,28 +36,27 @@ impl<'a, V> Ord for HeapItem<'a, V>
|
||||
/// - the term
|
||||
/// - a slice with the ordinal of the segments containing
|
||||
/// the terms.
|
||||
pub struct TermMerger<'a, V>
|
||||
where V: 'a + BinarySerializable + Default
|
||||
{
|
||||
heap: BinaryHeap<HeapItem<'a, V>>,
|
||||
current_streamers: Vec<HeapItem<'a, V>>,
|
||||
pub struct TermMerger<'a> {
|
||||
heap: BinaryHeap<HeapItem<'a>>,
|
||||
current_streamers: Vec<HeapItem<'a>>,
|
||||
}
|
||||
|
||||
impl<'a, V> TermMerger<'a, V>
|
||||
where V: 'a + BinarySerializable + Default
|
||||
{
|
||||
fn new(streams: Vec<TermStreamerImpl<'a, V>>) -> TermMerger<'a, V> {
|
||||
impl<'a> TermMerger<'a> {
|
||||
/// Stream of merged term dictionary
|
||||
///
|
||||
///
|
||||
pub fn new(streams: Vec<TermStreamerImpl<'a>>) -> TermMerger<'a> {
|
||||
TermMerger {
|
||||
heap: BinaryHeap::new(),
|
||||
current_streamers: streams
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.map(|(ord, streamer)| {
|
||||
HeapItem {
|
||||
streamer: streamer,
|
||||
segment_ord: ord,
|
||||
}
|
||||
})
|
||||
HeapItem {
|
||||
streamer: streamer,
|
||||
segment_ord: ord,
|
||||
}
|
||||
})
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
@@ -125,7 +112,7 @@ impl<'a, V> TermMerger<'a, V>
|
||||
/// This method may be called
|
||||
/// iff advance() has been called before
|
||||
/// and "true" was returned.
|
||||
pub fn current_kvs(&self) -> &[HeapItem<'a, V>] {
|
||||
pub fn current_kvs(&self) -> &[HeapItem<'a>] {
|
||||
&self.current_streamers[..]
|
||||
}
|
||||
|
||||
@@ -139,14 +126,3 @@ impl<'a, V> TermMerger<'a, V>
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
impl<'a> From<&'a [SegmentReader]> for TermMerger<'a, TermInfo> {
|
||||
fn from(segment_readers: &'a [SegmentReader]) -> TermMerger<'a, TermInfo> {
|
||||
TermMerger::new(segment_readers
|
||||
.iter()
|
||||
.map(|reader| reader.terms().stream())
|
||||
.collect())
|
||||
}
|
||||
}
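A rough usage sketch of the merged term stream, built on the `From<&[SegmentReader]>` impl above. The iteration pattern shown here is an assumption; `advance()` and `current_kvs()` are defined further down in this file, and obtaining the slice of segment readers is elided.
```
// Merge the term streams of several segments (sketch).
let mut merger = TermMerger::from(&segment_readers[..]);
while merger.advance() {
    // All streamers currently positioned on the smallest remaining term.
    for heap_item in merger.current_kvs() {
        let segment_ord = heap_item.segment_ord;
        let term_info: &TermInfo = heap_item.streamer.value();
        // ... merge the posting lists addressed by `term_info` ...
    }
}
```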
|
||||
|
||||
@@ -1,36 +1,10 @@
/*!
The term dictionary is one of the key datastructure of
tantivy. It associates sorted `terms` to their respective
posting list.
tantivy. It associates sorted `terms` to a `TermInfo` struct
that serves as an address in their respective posting list.

The term dictionary makes it possible to iterate through
the keys in a sorted manner.

# Example

```
extern crate tantivy;
use tantivy::termdict::*;
use tantivy::directory::ReadOnlySource;

# fn main() {
# run().expect("Test failed");
# }
# fn run() -> tantivy::Result<()> {
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec!())?;

// keys have to be insert in order.
term_dictionary_builder.insert("apple", &1u32)?;
term_dictionary_builder.insert("grape", &2u32)?;
term_dictionary_builder.insert("pear", &3u32)?;
let buffer: Vec<u8> = term_dictionary_builder.finish()?;

let source = ReadOnlySource::from(buffer);
let term_dictionary = TermDictionaryImpl::from_source(source)?;

assert_eq!(term_dictionary.get("grape"), Some(2u32));
# Ok(())
# }
The term dictionary API makes it possible to iterate through
a range of keys in a sorted manner.
```


@@ -74,48 +48,45 @@ followed by a streaming through at most `1024` elements in the
term `stream`.
*/
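With values now hard-wired to `TermInfo`, the flow from the removed doctest looks roughly as follows. This is a sketch mirroring `test_term_dictionary_simple` further down in this commit; it assumes the `TermInfo` fields and the `tantivy::postings` path are publicly reachable.
```
use tantivy::termdict::*;
use tantivy::directory::ReadOnlySource;
use tantivy::schema::{FieldType, TEXT};
use tantivy::postings::TermInfo;

// Helper mirroring the one in the tests below (field visibility outside
// the crate is an assumption).
fn make_term_info(val: u32) -> TermInfo {
    TermInfo {
        doc_freq: val,
        positions_offset: val * 2u32,
        postings_offset: val * 3u32,
        positions_inner_offset: 5u8,
    }
}

fn run() -> std::io::Result<()> {
    // The builder now needs the field type, so that it knows whether
    // positions are stored for this field.
    let field_type = FieldType::Str(TEXT);
    let mut builder = TermDictionaryBuilderImpl::new(vec![], field_type)?;

    // Keys still have to be inserted in sorted order; values are `TermInfo`s.
    builder.insert("apple", &make_term_info(1u32))?;
    builder.insert("grape", &make_term_info(2u32))?;
    builder.insert("pear", &make_term_info(3u32))?;
    let buffer: Vec<u8> = builder.finish()?;

    // `from_source` no longer returns a `Result`.
    let term_dictionary = TermDictionaryImpl::from_source(ReadOnlySource::from(buffer));
    assert_eq!(term_dictionary.get("grape").map(|info| info.doc_freq), Some(2u32));
    Ok(())
}
```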
|
||||
|
||||
use schema::{Field, Term};
|
||||
use common::BinarySerializable;
|
||||
use schema::{Field, Term, FieldType};
|
||||
use directory::ReadOnlySource;
|
||||
|
||||
use postings::TermInfo;
|
||||
|
||||
pub use self::merger::TermMerger;
|
||||
|
||||
|
||||
#[cfg(not(feature="streamdict"))]
|
||||
#[cfg(not(feature = "streamdict"))]
|
||||
mod fstdict;
|
||||
#[cfg(not(feature="streamdict"))]
|
||||
#[cfg(not(feature = "streamdict"))]
|
||||
pub use self::fstdict::{TermDictionaryImpl, TermDictionaryBuilderImpl, TermStreamerImpl,
|
||||
TermStreamerBuilderImpl};
|
||||
|
||||
|
||||
#[cfg(feature="streamdict")]
|
||||
#[cfg(feature = "streamdict")]
|
||||
mod streamdict;
|
||||
#[cfg(feature="streamdict")]
|
||||
#[cfg(feature = "streamdict")]
|
||||
pub use self::streamdict::{TermDictionaryImpl, TermDictionaryBuilderImpl, TermStreamerImpl,
|
||||
TermStreamerBuilderImpl};
|
||||
|
||||
|
||||
mod merger;
|
||||
use std::io;
|
||||
|
||||
|
||||
/// Dictionary associating sorted `&[u8]` to values
|
||||
pub trait TermDictionary<'a, V>
|
||||
where V: BinarySerializable + Default + 'a,
|
||||
Self: Sized
|
||||
pub trait TermDictionary<'a>
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
/// Streamer type associated to the term dictionary
|
||||
type Streamer: TermStreamer<V> + 'a;
|
||||
type Streamer: TermStreamer + 'a;
|
||||
|
||||
/// StreamerBuilder type associated to the term dictionary
|
||||
type StreamBuilder: TermStreamerBuilder<V, Streamer = Self::Streamer> + 'a;
|
||||
type StreamBuilder: TermStreamerBuilder<Streamer = Self::Streamer> + 'a;
|
||||
|
||||
/// Opens a `TermDictionary` given a data source.
|
||||
fn from_source(source: ReadOnlySource) -> io::Result<Self>;
|
||||
fn from_source(source: ReadOnlySource) -> Self;
|
||||
|
||||
/// Lookups the value corresponding to the key.
|
||||
fn get<K: AsRef<[u8]>>(&self, target_key: K) -> Option<V>;
|
||||
fn get<K: AsRef<[u8]>>(&self, target_key: K) -> Option<TermInfo>;
|
||||
|
||||
/// Returns a range builder, to stream all of the terms
|
||||
/// within an interval.
|
||||
@@ -140,17 +111,17 @@ pub trait TermDictionary<'a, V>
|
||||
/// Builder for the new term dictionary.
|
||||
///
|
||||
/// Inserting must be done in the order of the `keys`.
|
||||
pub trait TermDictionaryBuilder<W, V>: Sized
|
||||
where W: io::Write,
|
||||
V: BinarySerializable + Default
|
||||
pub trait TermDictionaryBuilder<W>: Sized
|
||||
where
|
||||
W: io::Write,
|
||||
{
|
||||
/// Creates a new `TermDictionaryBuilder`
|
||||
fn new(write: W) -> io::Result<Self>;
|
||||
fn new(write: W, field_type: FieldType) -> io::Result<Self>;
|
||||
|
||||
/// Inserts a `(key, value)` pair in the term dictionary.
|
||||
///
|
||||
/// *Keys have to be inserted in order.*
|
||||
fn insert<K: AsRef<[u8]>>(&mut self, key: K, value: &V) -> io::Result<()>;
|
||||
fn insert<K: AsRef<[u8]>>(&mut self, key: K, value: &TermInfo) -> io::Result<()>;
|
||||
|
||||
/// Finalize writing the builder, and returns the underlying
|
||||
/// `Write` object.
|
||||
@@ -160,7 +131,7 @@ pub trait TermDictionaryBuilder<W, V>: Sized
|
||||
|
||||
/// `TermStreamer` acts as a cursor over a range of terms of a segment.
|
||||
/// Terms are guaranteed to be sorted.
|
||||
pub trait TermStreamer<V>: Sized {
|
||||
pub trait TermStreamer: Sized {
|
||||
/// Advance position the stream on the next item.
|
||||
/// Before the first call to `.advance()`, the stream
|
||||
/// is an unitialized state.
|
||||
@@ -187,10 +158,10 @@ pub trait TermStreamer<V>: Sized {
|
||||
///
|
||||
/// Calling `.value()` before the first call to `.advance()` returns
|
||||
/// `V::default()`.
|
||||
fn value(&self) -> &V;
|
||||
fn value(&self) -> &TermInfo;
|
||||
|
||||
/// Return the next `(key, value)` pair.
|
||||
fn next(&mut self) -> Option<(Term<&[u8]>, &V)> {
|
||||
fn next(&mut self) -> Option<(Term<&[u8]>, &TermInfo)> {
|
||||
if self.advance() {
|
||||
Some((Term::wrap(self.key()), self.value()))
|
||||
} else {
|
||||
@@ -202,11 +173,9 @@ pub trait TermStreamer<V>: Sized {
|
||||
|
||||
/// `TermStreamerBuilder` is an helper object used to define
|
||||
/// a range of terms that should be streamed.
|
||||
pub trait TermStreamerBuilder<V>
|
||||
where V: BinarySerializable + Default
|
||||
{
|
||||
pub trait TermStreamerBuilder {
|
||||
/// Associated `TermStreamer` type that this builder is building.
|
||||
type Streamer: TermStreamer<V>;
|
||||
type Streamer: TermStreamer;
|
||||
|
||||
/// Limit the range to terms greater or equal to the bound
|
||||
fn ge<T: AsRef<[u8]>>(self, bound: T) -> Self;
|
||||
@@ -231,60 +200,70 @@ mod tests {
|
||||
use super::{TermDictionaryImpl, TermDictionaryBuilderImpl, TermStreamerImpl};
|
||||
use directory::{RAMDirectory, Directory, ReadOnlySource};
|
||||
use std::path::PathBuf;
|
||||
use schema::{Term, SchemaBuilder, Document, TEXT};
|
||||
use schema::{FieldType, Term, SchemaBuilder, Document, TEXT};
|
||||
use core::Index;
|
||||
use std::str;
|
||||
use termdict::TermStreamer;
|
||||
use termdict::TermStreamerBuilder;
|
||||
use termdict::TermDictionary;
|
||||
use termdict::TermDictionaryBuilder;
|
||||
use postings::TermInfo;
|
||||
|
||||
const BLOCK_SIZE: usize = 1_500;
|
||||
|
||||
|
||||
fn make_term_info(val: u32) -> TermInfo {
|
||||
TermInfo {
|
||||
doc_freq: val,
|
||||
positions_offset: val * 2u32,
|
||||
postings_offset: val * 3u32,
|
||||
positions_inner_offset: 5u8,
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_term_dictionary() {
|
||||
fn test_term_dictionary_simple() {
|
||||
let mut directory = RAMDirectory::create();
|
||||
let path = PathBuf::from("TermDictionary");
|
||||
{
|
||||
let write = directory.open_write(&path).unwrap();
|
||||
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(write).unwrap();
|
||||
term_dictionary_builder
|
||||
.insert("abc".as_bytes(), &34u32)
|
||||
let field_type = FieldType::Str(TEXT);
|
||||
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(write, field_type)
|
||||
.unwrap();
|
||||
term_dictionary_builder
|
||||
.insert("abcd".as_bytes(), &346u32)
|
||||
.insert("abc".as_bytes(), &make_term_info(34u32))
|
||||
.unwrap();
|
||||
term_dictionary_builder
|
||||
.insert("abcd".as_bytes(), &make_term_info(346u32))
|
||||
.unwrap();
|
||||
term_dictionary_builder.finish().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
let term_dict: TermDictionaryImpl<u32> = TermDictionaryImpl::from_source(source).unwrap();
|
||||
assert_eq!(term_dict.get("abc"), Some(34u32));
|
||||
assert_eq!(term_dict.get("abcd"), Some(346u32));
|
||||
let term_dict: TermDictionaryImpl = TermDictionaryImpl::from_source(source);
|
||||
assert_eq!(term_dict.get("abc").unwrap().doc_freq, 34u32);
|
||||
assert_eq!(term_dict.get("abcd").unwrap().doc_freq, 346u32);
|
||||
let mut stream = term_dict.stream();
|
||||
{
|
||||
{
|
||||
let (k, v) = stream.next().unwrap();
|
||||
assert_eq!(k.as_ref(), "abc".as_bytes());
|
||||
assert_eq!(v, &34u32);
|
||||
assert_eq!(v.doc_freq, 34u32);
|
||||
}
|
||||
assert_eq!(stream.key(), "abc".as_bytes());
|
||||
assert_eq!(*stream.value(), 34u32);
|
||||
assert_eq!(stream.value().doc_freq, 34u32);
|
||||
}
|
||||
{
|
||||
{
|
||||
let (k, v) = stream.next().unwrap();
|
||||
assert_eq!(k.as_slice(), "abcd".as_bytes());
|
||||
assert_eq!(v, &346u32);
|
||||
assert_eq!(v.doc_freq, 346u32);
|
||||
}
|
||||
assert_eq!(stream.key(), "abcd".as_bytes());
|
||||
assert_eq!(*stream.value(), 346u32);
|
||||
assert_eq!(stream.value().doc_freq, 346u32);
|
||||
}
|
||||
assert!(!stream.advance());
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_term_iterator() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
@@ -319,7 +298,9 @@ mod tests {
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let mut term_it = searcher.terms();
|
||||
|
||||
let field_searcher = searcher.field(text_field);
|
||||
let mut term_it = field_searcher.terms();
|
||||
let mut term_string = String::new();
|
||||
while term_it.advance() {
|
||||
let term = Term::from_bytes(term_it.key());
|
||||
@@ -334,23 +315,26 @@ mod tests {
|
||||
let ids: Vec<_> = (0u32..10_000u32)
|
||||
.map(|i| (format!("doc{:0>6}", i), i))
|
||||
.collect();
|
||||
let field_type = FieldType::Str(TEXT);
|
||||
let buffer: Vec<u8> = {
|
||||
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![]).unwrap();
|
||||
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type)
|
||||
.unwrap();
|
||||
for &(ref id, ref i) in &ids {
|
||||
term_dictionary_builder.insert(id.as_bytes(), i).unwrap();
|
||||
term_dictionary_builder
|
||||
.insert(id.as_bytes(), &make_term_info(*i))
|
||||
.unwrap();
|
||||
}
|
||||
term_dictionary_builder.finish().unwrap()
|
||||
};
|
||||
let source = ReadOnlySource::from(buffer);
|
||||
let term_dictionary: TermDictionaryImpl<u32> = TermDictionaryImpl::from_source(source)
|
||||
.unwrap();
|
||||
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source);
|
||||
{
|
||||
let mut streamer = term_dictionary.stream();
|
||||
let mut i = 0;
|
||||
while let Some((streamer_k, streamer_v)) = streamer.next() {
|
||||
let &(ref key, ref v) = &ids[i];
|
||||
assert_eq!(streamer_k.as_ref(), key.as_bytes());
|
||||
assert_eq!(streamer_v, v);
|
||||
assert_eq!(streamer_v, &make_term_info(*v));
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
@@ -359,23 +343,59 @@ mod tests {
|
||||
term_dictionary.get(key.as_bytes());
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_stream_high_range_prefix_suffix() {
|
||||
let field_type = FieldType::Str(TEXT);
|
||||
let buffer: Vec<u8> = {
|
||||
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type)
|
||||
.unwrap();
|
||||
// term requires more than 16bits
|
||||
term_dictionary_builder
|
||||
.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))
|
||||
.unwrap();
|
||||
term_dictionary_builder
|
||||
.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2))
|
||||
.unwrap();
|
||||
term_dictionary_builder
|
||||
.insert("abr", &make_term_info(2))
|
||||
.unwrap();
|
||||
term_dictionary_builder.finish().unwrap()
|
||||
};
|
||||
let source = ReadOnlySource::from(buffer);
|
||||
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source);
|
||||
let mut kv_stream = term_dictionary.stream();
|
||||
assert!(kv_stream.advance());
|
||||
assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxy".as_bytes());
|
||||
assert_eq!(kv_stream.value(), &make_term_info(1));
|
||||
assert!(kv_stream.advance());
|
||||
assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxyz".as_bytes());
|
||||
assert_eq!(kv_stream.value(), &make_term_info(2));
|
||||
assert!(kv_stream.advance());
|
||||
assert_eq!(kv_stream.key(), "abr".as_bytes());
|
||||
assert!(!kv_stream.advance());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_stream_range() {
|
||||
let ids: Vec<_> = (0u32..50_000u32)
|
||||
let ids: Vec<_> = (0u32..10_000u32)
|
||||
.map(|i| (format!("doc{:0>6}", i), i))
|
||||
.collect();
|
||||
let field_type = FieldType::Str(TEXT);
|
||||
let buffer: Vec<u8> = {
|
||||
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![]).unwrap();
|
||||
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type)
|
||||
.unwrap();
|
||||
for &(ref id, ref i) in &ids {
|
||||
term_dictionary_builder.insert(id.as_bytes(), i).unwrap();
|
||||
term_dictionary_builder
|
||||
.insert(id.as_bytes(), &make_term_info(*i))
|
||||
.unwrap();
|
||||
}
|
||||
term_dictionary_builder.finish().unwrap()
|
||||
};
|
||||
|
||||
let source = ReadOnlySource::from(buffer);
|
||||
|
||||
let term_dictionary: TermDictionaryImpl<u32> = TermDictionaryImpl::from_source(source)
|
||||
.unwrap();
|
||||
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source);
|
||||
{
|
||||
for i in (0..20).chain(6000..8_000) {
|
||||
let &(ref target_key, _) = &ids[i];
|
||||
@@ -387,7 +407,8 @@ mod tests {
|
||||
let (streamer_k, streamer_v) = streamer.next().unwrap();
|
||||
let &(ref key, ref v) = &ids[i + j];
|
||||
assert_eq!(str::from_utf8(streamer_k.as_ref()).unwrap(), key);
|
||||
assert_eq!(streamer_v, v);
|
||||
assert_eq!(streamer_v.doc_freq, *v);
|
||||
assert_eq!(streamer_v, &make_term_info(*v));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -403,7 +424,7 @@ mod tests {
|
||||
let (streamer_k, streamer_v) = streamer.next().unwrap();
|
||||
let &(ref key, ref v) = &ids[i + j + 1];
|
||||
assert_eq!(streamer_k.as_ref(), key.as_bytes());
|
||||
assert_eq!(streamer_v, v);
|
||||
assert_eq!(streamer_v.doc_freq, *v);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -430,45 +451,56 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_stream_range_boundaries() {
|
||||
let field_type = FieldType::Str(TEXT);
|
||||
let buffer: Vec<u8> = {
|
||||
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![]).unwrap();
|
||||
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type)
|
||||
.unwrap();
|
||||
for i in 0u8..10u8 {
|
||||
let number_arr = [i; 1];
|
||||
term_dictionary_builder.insert(&number_arr, &i).unwrap();
|
||||
term_dictionary_builder
|
||||
.insert(&number_arr, &make_term_info(i as u32))
|
||||
.unwrap();
|
||||
}
|
||||
term_dictionary_builder.finish().unwrap()
|
||||
};
|
||||
let source = ReadOnlySource::from(buffer);
|
||||
let term_dictionary: TermDictionaryImpl<u8> = TermDictionaryImpl::from_source(source)
|
||||
.unwrap();
|
||||
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source);
|
||||
|
||||
let value_list = |mut streamer: TermStreamerImpl<u8>| {
|
||||
let mut res: Vec<u8> = vec![];
|
||||
while let Some((_, &v)) = streamer.next() {
|
||||
res.push(v);
|
||||
let value_list = |mut streamer: TermStreamerImpl| {
|
||||
let mut res: Vec<u32> = vec![];
|
||||
while let Some((_, ref v)) = streamer.next() {
|
||||
res.push(v.doc_freq);
|
||||
}
|
||||
res
|
||||
};
|
||||
{
|
||||
let range = term_dictionary.range().ge([2u8]).into_stream();
|
||||
assert_eq!(value_list(range),
|
||||
vec![2u8, 3u8, 4u8, 5u8, 6u8, 7u8, 8u8, 9u8]);
|
||||
assert_eq!(
|
||||
value_list(range),
|
||||
vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
|
||||
);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().gt([2u8]).into_stream();
|
||||
assert_eq!(value_list(range), vec![3u8, 4u8, 5u8, 6u8, 7u8, 8u8, 9u8]);
|
||||
assert_eq!(
|
||||
value_list(range),
|
||||
vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
|
||||
);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().lt([6u8]).into_stream();
|
||||
assert_eq!(value_list(range), vec![0u8, 1u8, 2u8, 3u8, 4u8, 5u8]);
|
||||
assert_eq!(value_list(range), vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32]);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().le([6u8]).into_stream();
|
||||
assert_eq!(value_list(range), vec![0u8, 1u8, 2u8, 3u8, 4u8, 5u8, 6u8]);
|
||||
assert_eq!(
|
||||
value_list(range),
|
||||
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32]
|
||||
);
|
||||
}
|
||||
{
|
||||
let range = term_dictionary.range().ge([0u8]).lt([5u8]).into_stream();
|
||||
assert_eq!(value_list(range), vec![0u8, 1u8, 2u8, 3u8, 4u8]);
|
||||
assert_eq!(value_list(range), vec![0u32, 1u32, 2u32, 3u32, 4u32]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
175 src/termdict/streamdict/delta_encoder.rs (new file)
@@ -0,0 +1,175 @@
|
||||
use postings::TermInfo;
|
||||
use super::CheckPoint;
|
||||
use std::mem;
|
||||
use common::BinarySerializable;
|
||||
|
||||
/// Returns the len of the longest
|
||||
/// common prefix of `s1` and `s2`.
|
||||
///
|
||||
/// ie: the greatest `L` such that
|
||||
/// for all `0 <= i < L`, `s1[i] == s2[i]`
|
||||
fn common_prefix_len(s1: &[u8], s2: &[u8]) -> usize {
|
||||
s1.iter()
|
||||
.zip(s2.iter())
|
||||
.take_while(|&(a, b)| a == b)
|
||||
.count()
|
||||
}
|
||||
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct TermDeltaEncoder {
|
||||
last_term: Vec<u8>,
|
||||
prefix_len: usize,
|
||||
}
|
||||
|
||||
impl TermDeltaEncoder {
|
||||
pub fn encode<'a>(&mut self, term: &'a [u8]) {
|
||||
self.prefix_len = common_prefix_len(term, &self.last_term);
|
||||
self.last_term.truncate(self.prefix_len);
|
||||
self.last_term.extend_from_slice(&term[self.prefix_len..]);
|
||||
}
|
||||
|
||||
pub fn term(&self) -> &[u8] {
|
||||
&self.last_term[..]
|
||||
}
|
||||
|
||||
pub fn prefix_suffix(&mut self) -> (usize, &[u8]) {
|
||||
(self.prefix_len, &self.last_term[self.prefix_len..])
|
||||
}
|
||||
}
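A small worked example of the prefix/suffix delta produced by this encoder (a sketch; the assertions follow directly from `common_prefix_len` above):
```
let mut encoder = TermDeltaEncoder::default();
encoder.encode(b"apple");
encoder.encode(b"apply");
// "apple" and "apply" share the 4-byte prefix "appl",
// so only the single suffix byte "y" needs to be written to the stream.
let (prefix_len, suffix) = encoder.prefix_suffix();
assert_eq!(prefix_len, 4);
assert_eq!(suffix, &b"y"[..]);
```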
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct TermDeltaDecoder {
|
||||
term: Vec<u8>,
|
||||
}
|
||||
|
||||
impl TermDeltaDecoder {
|
||||
pub fn with_previous_term(term: Vec<u8>) -> TermDeltaDecoder {
|
||||
TermDeltaDecoder { term: Vec::from(term) }
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn decode<'a>(&mut self, code: u8, mut cursor: &'a [u8]) -> &'a [u8] {
|
||||
let (prefix_len, suffix_len): (usize, usize) = if (code & 1u8) == 1u8 {
|
||||
let b = cursor[0];
|
||||
cursor = &cursor[1..];
|
||||
let prefix_len = (b & 15u8) as usize;
|
||||
let suffix_len = (b >> 4u8) as usize;
|
||||
(prefix_len, suffix_len)
|
||||
} else {
|
||||
let prefix_len = u32::deserialize(&mut cursor).unwrap();
|
||||
let suffix_len = u32::deserialize(&mut cursor).unwrap();
|
||||
(prefix_len as usize, suffix_len as usize)
|
||||
};
|
||||
unsafe { self.term.set_len(prefix_len) };
|
||||
self.term.extend_from_slice(&(*cursor)[..suffix_len]);
|
||||
&cursor[suffix_len..]
|
||||
}
|
||||
|
||||
pub fn term(&self) -> &[u8] {
|
||||
&self.term[..]
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct DeltaTermInfo {
|
||||
pub doc_freq: u32,
|
||||
pub delta_postings_offset: u32,
|
||||
pub delta_positions_offset: u32,
|
||||
pub positions_inner_offset: u8,
|
||||
}
|
||||
|
||||
pub struct TermInfoDeltaEncoder {
|
||||
term_info: TermInfo,
|
||||
pub has_positions: bool,
|
||||
}
|
||||
|
||||
impl TermInfoDeltaEncoder {
|
||||
pub fn new(has_positions: bool) -> Self {
|
||||
TermInfoDeltaEncoder {
|
||||
term_info: TermInfo::default(),
|
||||
has_positions: has_positions,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn term_info(&self) -> &TermInfo {
|
||||
&self.term_info
|
||||
}
|
||||
|
||||
pub fn encode(&mut self, term_info: TermInfo) -> DeltaTermInfo {
|
||||
let mut delta_term_info = DeltaTermInfo {
|
||||
doc_freq: term_info.doc_freq,
|
||||
delta_postings_offset: term_info.postings_offset - self.term_info.postings_offset,
|
||||
delta_positions_offset: 0,
|
||||
positions_inner_offset: 0,
|
||||
};
|
||||
if self.has_positions {
|
||||
delta_term_info.delta_positions_offset = term_info.positions_offset -
|
||||
self.term_info.positions_offset;
|
||||
delta_term_info.positions_inner_offset = term_info.positions_inner_offset;
|
||||
}
|
||||
mem::replace(&mut self.term_info, term_info);
|
||||
delta_term_info
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub struct TermInfoDeltaDecoder {
|
||||
term_info: TermInfo,
|
||||
has_positions: bool,
|
||||
}
|
||||
|
||||
|
||||
#[inline(always)]
|
||||
pub fn make_mask(num_bytes: usize) -> u32 {
|
||||
const MASK: [u32; 4] = [0xffu32, 0xffffu32, 0xffffffu32, 0xffffffffu32];
|
||||
*unsafe { MASK.get_unchecked(num_bytes.wrapping_sub(1) as usize) }
|
||||
}
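For clarity, `make_mask(n)` keeps the `n` low-order bytes of a value, e.g.:
```
assert_eq!(make_mask(1), 0xff);
assert_eq!(make_mask(2), 0xffff);
assert_eq!(make_mask(4), 0xffffffff);
```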
|
||||
|
||||
impl TermInfoDeltaDecoder {
|
||||
pub fn from_term_info(term_info: TermInfo, has_positions: bool) -> TermInfoDeltaDecoder {
|
||||
TermInfoDeltaDecoder {
|
||||
term_info: term_info,
|
||||
has_positions: has_positions,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_checkpoint(checkpoint: &CheckPoint, has_positions: bool) -> TermInfoDeltaDecoder {
|
||||
TermInfoDeltaDecoder {
|
||||
term_info: TermInfo {
|
||||
doc_freq: 0u32,
|
||||
postings_offset: checkpoint.postings_offset,
|
||||
positions_offset: checkpoint.positions_offset,
|
||||
positions_inner_offset: 0u8,
|
||||
},
|
||||
has_positions: has_positions,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn decode<'a>(&mut self, code: u8, mut cursor: &'a [u8]) -> &'a [u8] {
|
||||
let num_bytes_docfreq: usize = ((code >> 1) & 3) as usize + 1;
|
||||
let num_bytes_postings_offset: usize = ((code >> 3) & 3) as usize + 1;
|
||||
let mut v: u64 = unsafe { *(cursor.as_ptr() as *const u64) };
|
||||
let doc_freq: u32 = (v as u32) & make_mask(num_bytes_docfreq);
|
||||
v >>= (num_bytes_docfreq as u64) * 8u64;
|
||||
let delta_postings_offset: u32 = (v as u32) & make_mask(num_bytes_postings_offset);
|
||||
cursor = &cursor[num_bytes_docfreq + num_bytes_postings_offset..];
|
||||
self.term_info.doc_freq = doc_freq;
|
||||
self.term_info.postings_offset += delta_postings_offset;
|
||||
if self.has_positions {
|
||||
let num_bytes_positions_offset = ((code >> 5) & 3) as usize + 1;
|
||||
let delta_positions_offset: u32 = unsafe { *(cursor.as_ptr() as *const u32) } &
|
||||
make_mask(num_bytes_positions_offset);
|
||||
self.term_info.positions_offset += delta_positions_offset;
|
||||
self.term_info.positions_inner_offset = cursor[num_bytes_positions_offset];
|
||||
&cursor[num_bytes_positions_offset + 1..]
|
||||
} else {
|
||||
cursor
|
||||
}
|
||||
}
|
||||
|
||||
pub fn term_info(&self) -> &TermInfo {
|
||||
&self.term_info
|
||||
}
|
||||
}
|
||||
@@ -1,8 +1,42 @@
|
||||
use std::io::{self, Write, Read};
|
||||
use common::BinarySerializable;
|
||||
|
||||
mod termdict;
|
||||
mod streamer;
|
||||
mod delta_encoder;
|
||||
|
||||
|
||||
pub use self::delta_encoder::{TermDeltaEncoder, TermDeltaDecoder};
|
||||
pub use self::delta_encoder::{TermInfoDeltaEncoder, TermInfoDeltaDecoder, DeltaTermInfo};
|
||||
|
||||
pub use self::termdict::TermDictionaryImpl;
|
||||
pub use self::termdict::TermDictionaryBuilderImpl;
|
||||
pub use self::streamer::TermStreamerImpl;
|
||||
pub use self::streamer::TermStreamerBuilderImpl;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct CheckPoint {
|
||||
pub stream_offset: u32,
|
||||
pub postings_offset: u32,
|
||||
pub positions_offset: u32,
|
||||
}
|
||||
|
||||
impl BinarySerializable for CheckPoint {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
self.stream_offset.serialize(writer)?;
|
||||
self.postings_offset.serialize(writer)?;
|
||||
self.positions_offset.serialize(writer)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let stream_offset = u32::deserialize(reader)?;
|
||||
let postings_offset = u32::deserialize(reader)?;
|
||||
let positions_offset = u32::deserialize(reader)?;
|
||||
Ok(CheckPoint {
|
||||
stream_offset: stream_offset,
|
||||
postings_offset: postings_offset,
|
||||
positions_offset: positions_offset,
|
||||
})
|
||||
}
|
||||
}
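A quick round-trip sketch of the fixed-size checkpoint record defined above (assuming `u32` serializes as four bytes, as elsewhere in `common::BinarySerializable`):
```
let checkpoint = CheckPoint {
    stream_offset: 10u32,
    postings_offset: 20u32,
    positions_offset: 30u32,
};
let mut buffer: Vec<u8> = vec![];
checkpoint.serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), 12); // three u32 fields, 4 bytes each
let decoded = CheckPoint::deserialize(&mut &buffer[..]).unwrap();
assert_eq!(decoded.postings_offset, 20u32);
```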
|
||||
|
||||
@@ -1,47 +1,54 @@
|
||||
#![allow(should_implement_trait)]
|
||||
|
||||
use std::cmp::max;
|
||||
use common::BinarySerializable;
|
||||
use super::TermDictionaryImpl;
|
||||
use termdict::{TermStreamerBuilder, TermStreamer};
|
||||
use postings::TermInfo;
|
||||
use super::delta_encoder::{TermInfoDeltaDecoder, TermDeltaDecoder};
|
||||
|
||||
pub(crate) fn stream_before<'a, V>(term_dictionary: &'a TermDictionaryImpl<V>,
|
||||
target_key: &[u8])
|
||||
-> TermStreamerImpl<'a, V>
|
||||
where V: 'a + BinarySerializable + Default
|
||||
{
|
||||
let (prev_key, offset) = term_dictionary.strictly_previous_key(target_key.as_ref());
|
||||
let offset: usize = offset as usize;
|
||||
|
||||
fn stream_before<'a>(
|
||||
term_dictionary: &'a TermDictionaryImpl,
|
||||
target_key: &[u8],
|
||||
has_positions: bool,
|
||||
) -> TermStreamerImpl<'a> {
|
||||
|
||||
let (prev_key, checkpoint) = term_dictionary.strictly_previous_key(target_key.as_ref());
|
||||
let stream_data: &'a [u8] = &term_dictionary.stream_data()[checkpoint.stream_offset as usize..];
|
||||
TermStreamerImpl {
|
||||
cursor: &term_dictionary.stream_data()[offset..],
|
||||
current_key: Vec::from(prev_key),
|
||||
current_value: V::default(),
|
||||
cursor: stream_data,
|
||||
term_delta_decoder: TermDeltaDecoder::with_previous_term(prev_key),
|
||||
term_info_decoder: TermInfoDeltaDecoder::from_checkpoint(&checkpoint, has_positions),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// See [`TermStreamerBuilder`](./trait.TermStreamerBuilder.html)
|
||||
pub struct TermStreamerBuilderImpl<'a, V>
|
||||
where V: 'a + BinarySerializable + Default
|
||||
{
|
||||
term_dictionary: &'a TermDictionaryImpl<V>,
|
||||
pub struct TermStreamerBuilderImpl<'a> {
|
||||
term_dictionary: &'a TermDictionaryImpl,
|
||||
origin: usize,
|
||||
offset_from: usize,
|
||||
offset_to: usize,
|
||||
current_key: Vec<u8>,
|
||||
term_info: TermInfo,
|
||||
has_positions: bool,
|
||||
}
|
||||
|
||||
impl<'a, V> TermStreamerBuilder<V> for TermStreamerBuilderImpl<'a, V>
|
||||
where V: 'a + BinarySerializable + Default
|
||||
{
|
||||
type Streamer = TermStreamerImpl<'a, V>;
|
||||
impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> {
|
||||
type Streamer = TermStreamerImpl<'a>;
|
||||
|
||||
/// Limit the range to terms greater or equal to the bound
|
||||
fn ge<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
|
||||
let target_key = bound.as_ref();
|
||||
let streamer = stream_before(self.term_dictionary, target_key.as_ref());
|
||||
let streamer = stream_before(
|
||||
self.term_dictionary,
|
||||
target_key.as_ref(),
|
||||
self.has_positions,
|
||||
);
|
||||
let smaller_than = |k: &[u8]| k.lt(target_key);
|
||||
let (offset_before, current_key) = get_offset(smaller_than, streamer);
|
||||
let (offset_before, current_key, term_info) = get_offset(smaller_than, streamer);
|
||||
self.current_key = current_key;
|
||||
self.term_info = term_info;
|
||||
self.offset_from = offset_before - self.origin;
|
||||
self
|
||||
}
|
||||
@@ -49,10 +56,15 @@ impl<'a, V> TermStreamerBuilder<V> for TermStreamerBuilderImpl<'a, V>
|
||||
/// Limit the range to terms strictly greater than the bound
|
||||
fn gt<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
|
||||
let target_key = bound.as_ref();
|
||||
let streamer = stream_before(self.term_dictionary, target_key.as_ref());
|
||||
let streamer = stream_before(
|
||||
self.term_dictionary,
|
||||
target_key.as_ref(),
|
||||
self.has_positions,
|
||||
);
|
||||
let smaller_than = |k: &[u8]| k.le(target_key);
|
||||
let (offset_before, current_key) = get_offset(smaller_than, streamer);
|
||||
let (offset_before, current_key, term_info) = get_offset(smaller_than, streamer);
|
||||
self.current_key = current_key;
|
||||
self.term_info = term_info;
|
||||
self.offset_from = offset_before - self.origin;
|
||||
self
|
||||
}
|
||||
@@ -60,9 +72,13 @@ impl<'a, V> TermStreamerBuilder<V> for TermStreamerBuilderImpl<'a, V>
|
||||
/// Limit the range to terms lesser or equal to the bound
|
||||
fn lt<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
|
||||
let target_key = bound.as_ref();
|
||||
let streamer = stream_before(self.term_dictionary, target_key.as_ref());
|
||||
let streamer = stream_before(
|
||||
self.term_dictionary,
|
||||
target_key.as_ref(),
|
||||
self.has_positions,
|
||||
);
|
||||
let smaller_than = |k: &[u8]| k.lt(target_key);
|
||||
let (offset_before, _) = get_offset(smaller_than, streamer);
|
||||
let (offset_before, _, _) = get_offset(smaller_than, streamer);
|
||||
self.offset_to = offset_before - self.origin;
|
||||
self
|
||||
}
|
||||
@@ -70,9 +86,13 @@ impl<'a, V> TermStreamerBuilder<V> for TermStreamerBuilderImpl<'a, V>
|
||||
/// Limit the range to terms lesser or equal to the bound
|
||||
fn le<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
|
||||
let target_key = bound.as_ref();
|
||||
let streamer = stream_before(self.term_dictionary, target_key.as_ref());
|
||||
let streamer = stream_before(
|
||||
self.term_dictionary,
|
||||
target_key.as_ref(),
|
||||
self.has_positions,
|
||||
);
|
||||
let smaller_than = |k: &[u8]| k.le(target_key);
|
||||
let (offset_before, _) = get_offset(smaller_than, streamer);
|
||||
let (offset_before, _, _) = get_offset(smaller_than, streamer);
|
||||
self.offset_to = offset_before - self.origin;
|
||||
self
|
||||
}
|
||||
@@ -82,10 +102,13 @@ impl<'a, V> TermStreamerBuilder<V> for TermStreamerBuilderImpl<'a, V>
|
||||
let data: &[u8] = self.term_dictionary.stream_data();
|
||||
let start = self.offset_from;
|
||||
let stop = max(self.offset_to, start);
|
||||
let term_delta_decoder = TermDeltaDecoder::with_previous_term(self.current_key);
|
||||
let term_info_decoder =
|
||||
TermInfoDeltaDecoder::from_term_info(self.term_info, self.has_positions);
|
||||
TermStreamerImpl {
|
||||
cursor: &data[start..stop],
|
||||
current_key: self.current_key,
|
||||
current_value: V::default(),
|
||||
term_delta_decoder: term_delta_decoder,
|
||||
term_info_decoder: term_info_decoder,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -93,100 +116,77 @@ impl<'a, V> TermStreamerBuilder<V> for TermStreamerBuilderImpl<'a, V>
|
||||
/// Returns offset information for the first
|
||||
/// key in the stream matching a given predicate.
|
||||
///
|
||||
/// returns (start offset, the data required to load the value)
|
||||
fn get_offset<'a, V, P: Fn(&[u8]) -> bool>(predicate: P,
|
||||
mut streamer: TermStreamerImpl<V>)
|
||||
-> (usize, Vec<u8>)
|
||||
where V: 'a + BinarySerializable + Default
|
||||
{
|
||||
/// returns
|
||||
/// - the block start
|
||||
/// - the index within this block
|
||||
/// - the term_buffer state to initialize the block)
|
||||
fn get_offset<'a, P: Fn(&[u8]) -> bool>(
|
||||
predicate: P,
|
||||
mut streamer: TermStreamerImpl<'a>,
|
||||
) -> (usize, Vec<u8>, TermInfo) {
|
||||
let mut prev: &[u8] = streamer.cursor;
|
||||
|
||||
let mut prev_data: Vec<u8> = streamer.current_key.clone();
|
||||
let mut term_info = streamer.value().clone();
|
||||
let mut prev_data: Vec<u8> = Vec::from(streamer.term_delta_decoder.term());
|
||||
|
||||
while let Some((iter_key, _)) = streamer.next() {
|
||||
while let Some((iter_key, iter_term_info)) = streamer.next() {
|
||||
if !predicate(iter_key.as_ref()) {
|
||||
return (prev.as_ptr() as usize, prev_data);
|
||||
return (prev.as_ptr() as usize, prev_data, term_info);
|
||||
}
|
||||
prev = streamer.cursor;
|
||||
prev_data.clear();
|
||||
prev_data.extend_from_slice(iter_key.as_ref());
|
||||
term_info = iter_term_info.clone();
|
||||
}
|
||||
(prev.as_ptr() as usize, prev_data)
|
||||
(prev.as_ptr() as usize, prev_data, term_info)
|
||||
}
|
||||
|
||||
impl<'a, V> TermStreamerBuilderImpl<'a, V>
|
||||
where V: 'a + BinarySerializable + Default
|
||||
{
|
||||
pub(crate) fn new(term_dictionary: &'a TermDictionaryImpl<V>) -> Self {
|
||||
impl<'a> TermStreamerBuilderImpl<'a> {
|
||||
pub(crate) fn new(term_dictionary: &'a TermDictionaryImpl, has_positions: bool) -> Self {
|
||||
let data = term_dictionary.stream_data();
|
||||
let origin = data.as_ptr() as usize;
|
||||
TermStreamerBuilderImpl {
|
||||
term_dictionary: term_dictionary,
|
||||
term_info: TermInfo::default(),
|
||||
origin: origin,
|
||||
offset_from: 0,
|
||||
offset_to: data.len(),
|
||||
current_key: Vec::with_capacity(300),
|
||||
has_positions: has_positions,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// See [`TermStreamer`](./trait.TermStreamer.html)
|
||||
pub struct TermStreamerImpl<'a, V>
|
||||
where V: 'a + BinarySerializable + Default
|
||||
{
|
||||
pub struct TermStreamerImpl<'a> {
|
||||
cursor: &'a [u8],
|
||||
current_key: Vec<u8>,
|
||||
current_value: V,
|
||||
term_delta_decoder: TermDeltaDecoder,
|
||||
term_info_decoder: TermInfoDeltaDecoder,
|
||||
}
|
||||
|
||||
|
||||
impl<'a, V: BinarySerializable> TermStreamerImpl<'a, V>
|
||||
where V: 'a + BinarySerializable + Default
|
||||
{
|
||||
pub(crate) fn extract_value(self) -> V {
|
||||
self.current_value
|
||||
}
|
||||
}
|
||||
|
||||
fn deserialize_vint(data: &mut &[u8]) -> u64 {
|
||||
let mut res = 0;
|
||||
let mut shift = 0;
|
||||
for i in 0.. {
|
||||
let b = data[i];
|
||||
res |= ((b % 128u8) as u64) << shift;
|
||||
if b & 128u8 != 0u8 {
|
||||
*data = &data[(i + 1)..];
|
||||
break;
|
||||
}
|
||||
shift += 7;
|
||||
}
|
||||
res
|
||||
}
|
||||
|
||||
impl<'a, V> TermStreamer<V> for TermStreamerImpl<'a, V>
|
||||
where V: BinarySerializable + Default
|
||||
{
|
||||
impl<'a> TermStreamer for TermStreamerImpl<'a> {
|
||||
fn advance(&mut self) -> bool {
|
||||
if self.cursor.is_empty() {
|
||||
return false;
|
||||
}
|
||||
let common_length: usize = deserialize_vint(&mut self.cursor) as usize;
|
||||
self.current_key.truncate(common_length);
|
||||
let added_length: usize = deserialize_vint(&mut self.cursor) as usize;
|
||||
self.current_key.extend(&self.cursor[..added_length]);
|
||||
|
||||
self.cursor = &self.cursor[added_length..];
|
||||
self.current_value =
|
||||
V::deserialize(&mut self.cursor)
|
||||
.expect("Term dictionary corrupted. Failed to deserialize a value");
|
||||
let mut cursor: &[u8] = &self.cursor;
|
||||
let code: u8 = cursor[0];
|
||||
cursor = self.term_delta_decoder.decode(code, &cursor[1..]);
|
||||
cursor = self.term_info_decoder.decode(code, cursor);
|
||||
self.cursor = cursor;
|
||||
true
|
||||
}
|
||||
|
||||
fn key(&self) -> &[u8] {
|
||||
&self.current_key
|
||||
self.term_delta_decoder.term()
|
||||
}
|
||||
|
||||
fn value(&self) -> &V {
|
||||
&self.current_value
|
||||
fn value(&self) -> &TermInfo {
|
||||
&self.term_info_decoder.term_info()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,46 +1,54 @@
|
||||
#![allow(should_implement_trait)]
|
||||
|
||||
use std::io::{self, Write};
|
||||
use super::CheckPoint;
|
||||
use fst;
|
||||
|
||||
use fst::raw::Fst;
|
||||
use common::VInt;
|
||||
use directory::ReadOnlySource;
|
||||
use common::BinarySerializable;
|
||||
use std::marker::PhantomData;
|
||||
use common::CountingWriter;
|
||||
use std::cmp::Ordering;
|
||||
use postings::TermInfo;
|
||||
use schema::FieldType;
|
||||
use super::{TermDeltaEncoder, TermInfoDeltaEncoder, DeltaTermInfo};
|
||||
use fst::raw::Node;
|
||||
use super::streamer::stream_before;
|
||||
use termdict::{TermDictionary, TermDictionaryBuilder, TermStreamer};
|
||||
use super::{TermStreamerImpl, TermStreamerBuilderImpl};
|
||||
use termdict::TermStreamerBuilder;
|
||||
use std::mem::transmute;
|
||||
|
||||
const BLOCK_SIZE: usize = 1024;
|
||||
const PADDING_SIZE: usize = 4;
|
||||
const INDEX_INTERVAL: usize = 1024;
|
||||
|
||||
fn convert_fst_error(e: fst::Error) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, e)
|
||||
}
|
||||
|
||||
/// See [`TermDictionaryBuilder`](./trait.TermDictionaryBuilder.html)
|
||||
pub struct TermDictionaryBuilderImpl<W, V = TermInfo>
|
||||
where W: Write,
|
||||
V: BinarySerializable + Default
|
||||
{
|
||||
write: CountingWriter<W>,
|
||||
block_index: fst::MapBuilder<Vec<u8>>,
|
||||
last_key: Vec<u8>,
|
||||
len: usize,
|
||||
_phantom_: PhantomData<V>,
|
||||
fn has_positions(field_type: &FieldType) -> bool {
|
||||
match *field_type {
|
||||
FieldType::Str(ref text_options) => {
|
||||
let indexing_options = text_options.get_indexing_options();
|
||||
if indexing_options.is_position_enabled() {
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn common_prefix_length(left: &[u8], right: &[u8]) -> usize {
|
||||
left.iter()
|
||||
.cloned()
|
||||
.zip(right.iter().cloned())
|
||||
.take_while(|&(b1, b2)| b1 == b2)
|
||||
.count()
|
||||
/// See [`TermDictionaryBuilder`](./trait.TermDictionaryBuilder.html)
|
||||
pub struct TermDictionaryBuilderImpl<W> {
|
||||
write: CountingWriter<W>,
|
||||
term_delta_encoder: TermDeltaEncoder,
|
||||
term_info_encoder: TermInfoDeltaEncoder,
|
||||
block_index: fst::MapBuilder<Vec<u8>>,
|
||||
checkpoints: Vec<u8>,
|
||||
len: usize,
|
||||
}
|
||||
|
||||
|
||||
fn fill_last<'a>(fst: &'a Fst, mut node: Node<'a>, buffer: &mut Vec<u8>) {
|
||||
while let Some(transition) = node.transitions().last() {
|
||||
buffer.push(transition.inp);
|
||||
@@ -48,14 +56,32 @@ fn fill_last<'a>(fst: &'a Fst, mut node: Node<'a>, buffer: &mut Vec<u8>) {
|
||||
}
|
||||
}
|
||||
|
||||
impl<W, V> TermDictionaryBuilderImpl<W, V>
|
||||
where W: Write,
|
||||
V: BinarySerializable + Default
|
||||
impl<W> TermDictionaryBuilderImpl<W>
|
||||
where
|
||||
W: Write,
|
||||
{
|
||||
fn add_index_entry(&mut self) {
|
||||
let stream_offset = self.write.written_bytes() as u32;
|
||||
let term_info = self.term_info_encoder.term_info();
|
||||
let postings_offset = term_info.postings_offset as u32;
|
||||
let positions_offset = term_info.positions_offset as u32;
|
||||
let checkpoint = CheckPoint {
|
||||
stream_offset: stream_offset,
|
||||
postings_offset: postings_offset,
|
||||
positions_offset: positions_offset,
|
||||
};
|
||||
self.block_index
|
||||
.insert(&self.last_key, self.write.written_bytes() as u64)
|
||||
.unwrap();
|
||||
.insert(
|
||||
&self.term_delta_encoder.term(),
|
||||
self.checkpoints.len() as u64,
|
||||
)
|
||||
.expect(
|
||||
"Serializing fst on a Vec<u8> should never fail. \
|
||||
Where your terms not in order maybe?",
|
||||
);
|
||||
checkpoint.serialize(&mut self.checkpoints).expect(
|
||||
"Serializing checkpoint on a Vec<u8> should never fail.",
|
||||
);
|
||||
}
|
||||
|
||||
/// # Warning
|
||||
@@ -66,59 +92,131 @@ impl<W, V> TermDictionaryBuilderImpl<W, V>
|
||||
///
|
||||
/// Prefer using `.insert(key, value)`
|
||||
pub(crate) fn insert_key(&mut self, key: &[u8]) -> io::Result<()> {
|
||||
if self.len % BLOCK_SIZE == 0 {
|
||||
if self.len % INDEX_INTERVAL == 0 {
|
||||
self.add_index_entry();
|
||||
}
|
||||
self.len += 1;
|
||||
let common_len = common_prefix_length(key, &self.last_key);
|
||||
VInt(common_len as u64).serialize(&mut self.write)?;
|
||||
self.last_key.truncate(common_len);
|
||||
self.last_key.extend_from_slice(&key[common_len..]);
|
||||
VInt((key.len() - common_len) as u64)
|
||||
.serialize(&mut self.write)?;
|
||||
self.write.write_all(&key[common_len..])?;
|
||||
self.term_delta_encoder.encode(key);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn insert_value(&mut self, value: &V) -> io::Result<()> {
|
||||
value.serialize(&mut self.write)?;
|
||||
pub(crate) fn insert_value(&mut self, term_info: &TermInfo) -> io::Result<()> {
|
||||
let delta_term_info = self.term_info_encoder.encode(term_info.clone());
|
||||
let (prefix_len, suffix) = self.term_delta_encoder.prefix_suffix();
|
||||
write_term_kv(
|
||||
prefix_len,
|
||||
suffix,
|
||||
&delta_term_info,
|
||||
self.term_info_encoder.has_positions,
|
||||
&mut self.write,
|
||||
)?;
|
||||
self.len += 1;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<W, V> TermDictionaryBuilder<W, V> for TermDictionaryBuilderImpl<W, V>
|
||||
where W: Write,
|
||||
V: BinarySerializable + Default
|
||||
fn num_bytes_required(mut n: u32) -> u8 {
|
||||
for i in 1u8..5u8 {
|
||||
if n < 256u32 {
|
||||
return i;
|
||||
} else {
|
||||
n /= 256;
|
||||
}
|
||||
}
|
||||
0u8
|
||||
}
|
||||
|
||||
fn write_term_kv<W: Write>(
|
||||
prefix_len: usize,
|
||||
suffix: &[u8],
|
||||
delta_term_info: &DeltaTermInfo,
|
||||
has_positions: bool,
|
||||
write: &mut W,
|
||||
) -> io::Result<()> {
|
||||
let suffix_len = suffix.len();
|
||||
let mut code = 0u8;
|
||||
let num_bytes_docfreq = num_bytes_required(delta_term_info.doc_freq);
|
||||
let num_bytes_postings_offset = num_bytes_required(delta_term_info.delta_postings_offset);
|
||||
let num_bytes_positions_offset = num_bytes_required(delta_term_info.delta_positions_offset);
|
||||
code |= (num_bytes_docfreq - 1) << 1u8;
|
||||
code |= (num_bytes_postings_offset - 1) << 3u8;
|
||||
code |= (num_bytes_positions_offset - 1) << 5u8;
|
||||
if (prefix_len < 16) && (suffix_len < 16) {
|
||||
code |= 1u8;
|
||||
write.write_all(
|
||||
&[
|
||||
code,
|
||||
(prefix_len as u8) | ((suffix_len as u8) << 4u8),
|
||||
],
|
||||
)?;
|
||||
} else {
|
||||
write.write_all(&[code])?;
|
||||
(prefix_len as u32).serialize(write)?;
|
||||
(suffix_len as u32).serialize(write)?;
|
||||
}
|
||||
write.write_all(suffix)?;
|
||||
{
|
||||
let bytes: [u8; 4] = unsafe { transmute(delta_term_info.doc_freq) };
|
||||
write.write_all(&bytes[0..num_bytes_docfreq as usize])?;
|
||||
}
|
||||
{
|
||||
let bytes: [u8; 4] = unsafe { transmute(delta_term_info.delta_postings_offset) };
|
||||
write.write_all(
|
||||
&bytes[0..num_bytes_postings_offset as usize],
|
||||
)?;
|
||||
}
|
||||
if has_positions {
|
||||
let bytes: [u8; 4] = unsafe { transmute(delta_term_info.delta_positions_offset) };
|
||||
write.write_all(
|
||||
&bytes[0..num_bytes_positions_offset as usize],
|
||||
)?;
|
||||
write.write_all(&[delta_term_info.positions_inner_offset])?;
|
||||
}
|
||||
Ok(())
|
||||
|
||||
}
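To make the stream format easier to follow, here is the layout of the code byte written by `write_term_kv`, derived from the shifts above; the concrete numbers are only an illustration.
```
// bit 0      : 1 if prefix_len and suffix_len both fit in a nibble,
//              in which case they are packed into the next byte
// bits 1..=2 : num_bytes_required(doc_freq) - 1
// bits 3..=4 : num_bytes_required(delta_postings_offset) - 1
// bits 5..=6 : num_bytes_required(delta_positions_offset) - 1
//
// Example: doc_freq = 3 (1 byte), delta_postings_offset = 1_000 (2 bytes),
// delta_positions_offset = 70_000 (3 bytes), prefix_len = 4, suffix_len = 2:
//   code = 1 | (0 << 1) | (1 << 3) | (2 << 5) = 0b0100_1001
// followed by the packed-lengths byte 4 | (2 << 4) = 0x24, the suffix bytes,
// and the low-order bytes of each integer (the `transmute` relies on a
// little-endian layout).
```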
|
||||
|
||||
impl<W> TermDictionaryBuilder<W> for TermDictionaryBuilderImpl<W>
|
||||
where
|
||||
W: Write,
|
||||
{
|
||||
/// Creates a new `TermDictionaryBuilder`
|
||||
fn new(write: W) -> io::Result<Self> {
|
||||
let buffer: Vec<u8> = vec![];
|
||||
fn new(mut write: W, field_type: FieldType) -> io::Result<Self> {
|
||||
let has_positions = has_positions(&field_type);
|
||||
let has_positions_code = if has_positions { 255u8 } else { 0u8 };
|
||||
write.write_all(&[has_positions_code])?;
|
||||
Ok(TermDictionaryBuilderImpl {
|
||||
write: CountingWriter::wrap(write),
|
||||
block_index: fst::MapBuilder::new(buffer).expect("This cannot fail"),
|
||||
last_key: Vec::with_capacity(128),
|
||||
len: 0,
|
||||
_phantom_: PhantomData,
|
||||
})
|
||||
write: CountingWriter::wrap(write),
|
||||
term_delta_encoder: TermDeltaEncoder::default(),
|
||||
term_info_encoder: TermInfoDeltaEncoder::new(has_positions),
|
||||
block_index: fst::MapBuilder::new(vec![]).expect("This cannot fail"),
|
||||
checkpoints: vec![],
|
||||
len: 0,
|
||||
})
|
||||
}
|
||||
|
||||
/// Inserts a `(key, value)` pair in the term dictionary.
|
||||
///
|
||||
/// *Keys have to be inserted in order.*
|
||||
fn insert<K: AsRef<[u8]>>(&mut self, key_ref: K, value: &V) -> io::Result<()> {
|
||||
fn insert<K: AsRef<[u8]>>(&mut self, key_ref: K, value: &TermInfo) -> io::Result<()> {
|
||||
let key = key_ref.as_ref();
|
||||
self.insert_key(key)?;
|
||||
self.insert_value(value)
|
||||
self.insert_value(value)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Finalize writing the builder, and returns the underlying
|
||||
/// `Write` object.
|
||||
fn finish(mut self) -> io::Result<W> {
|
||||
self.add_index_entry();
|
||||
let (mut w, split_len) = self.write.finish()?;
|
||||
self.write.write_all(&[0u8; PADDING_SIZE])?;
|
||||
let fst_addr = self.write.written_bytes();
|
||||
let fst_write = self.block_index.into_inner().map_err(convert_fst_error)?;
|
||||
w.write_all(&fst_write)?;
|
||||
(split_len as u64).serialize(&mut w)?;
|
||||
self.write.write_all(&fst_write)?;
|
||||
let check_points_addr = self.write.written_bytes();
|
||||
let (mut w, _) = self.write.finish()?;
|
||||
w.write_all(&self.checkpoints)?;
|
||||
(fst_addr as u64).serialize(&mut w)?;
|
||||
(check_points_addr as u64).serialize(&mut w)?;
|
||||
w.flush()?;
|
||||
Ok(w)
|
||||
}
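Putting `new()`, `finish()` and `from_source()` together, the on-disk layout of the streamdict now looks roughly like this (a sketch; sizes other than the fixed-width fields depend on the data):
```
// [ 1 byte  ] has_positions flag (255 if positions are stored, 0 otherwise)
// [ ...     ] delta-encoded (term, TermInfo) stream
// [ 4 bytes ] zero padding (PADDING_SIZE)
// [ ...     ] fst map: indexed term -> offset of its CheckPoint
// [ ...     ] serialized CheckPoints (roughly one every INDEX_INTERVAL terms,
//             plus a final one written by finish())
// [ 8 bytes ] fst_addr          (u64, relative to the byte after the flag)
// [ 8 bytes ] check_points_addr (u64, relative to the byte after the flag)
```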
|
||||
@@ -126,34 +224,37 @@ impl<W, V> TermDictionaryBuilder<W, V> for TermDictionaryBuilderImpl<W, V>
|
||||
|
||||
|
||||
fn open_fst_index(source: ReadOnlySource) -> io::Result<fst::Map> {
|
||||
Ok(fst::Map::from(match source {
|
||||
ReadOnlySource::Anonymous(data) => {
|
||||
try!(Fst::from_shared_bytes(data.data, data.start, data.len)
|
||||
.map_err(convert_fst_error))
|
||||
}
|
||||
ReadOnlySource::Mmap(mmap_readonly) => {
|
||||
try!(Fst::from_mmap(mmap_readonly).map_err(convert_fst_error))
|
||||
}
|
||||
}))
|
||||
use self::ReadOnlySource::*;
|
||||
let fst_result = match source {
|
||||
Anonymous(data) => Fst::from_shared_bytes(data.data, data.start, data.len),
|
||||
Mmap(mmap_readonly) => Fst::from_mmap(mmap_readonly),
|
||||
};
|
||||
let fst = fst_result.map_err(convert_fst_error)?;
|
||||
Ok(fst::Map::from(fst))
|
||||
}
|
||||
|
||||
/// See [`TermDictionary`](./trait.TermDictionary.html)
|
||||
pub struct TermDictionaryImpl<V = TermInfo>
|
||||
where V: BinarySerializable + Default
|
||||
{
|
||||
pub struct TermDictionaryImpl {
|
||||
stream_data: ReadOnlySource,
|
||||
fst_index: fst::Map,
|
||||
_phantom_: PhantomData<V>,
|
||||
checkpoints_data: ReadOnlySource,
|
||||
has_positions: bool,
|
||||
}
|
||||
|
||||
impl<V> TermDictionaryImpl<V>
|
||||
where V: BinarySerializable + Default
|
||||
{
|
||||
impl TermDictionaryImpl {
|
||||
pub(crate) fn stream_data(&self) -> &[u8] {
|
||||
self.stream_data.as_slice()
|
||||
}
|
||||
|
||||
pub(crate) fn strictly_previous_key(&self, key: &[u8]) -> (Vec<u8>, u64) {
|
||||
pub(crate) fn strictly_previous_key(&self, key: &[u8]) -> (Vec<u8>, CheckPoint) {
|
||||
let (term, checkpoint_offset) = self.strictly_previous_key_checkpoint_offset(key);
|
||||
let mut checkpoint_data = &self.checkpoints_data.as_slice()[checkpoint_offset..];
|
||||
let checkpoint =
|
||||
CheckPoint::deserialize(&mut checkpoint_data).expect("Checkpoint data is corrupted");
|
||||
(term, checkpoint)
|
||||
}
|
||||
|
||||
fn strictly_previous_key_checkpoint_offset(&self, key: &[u8]) -> (Vec<u8>, usize) {
|
||||
let fst_map = &self.fst_index;
|
||||
let fst = fst_map.as_fst();
|
||||
let mut node = fst.root();
|
||||
@@ -186,12 +287,12 @@ impl<V> TermDictionaryImpl<V>
|
||||
result.push(last_transition.inp);
|
||||
let fork_node = fst.node(last_transition.addr);
|
||||
fill_last(fst, fork_node, &mut result);
|
||||
let val = fst_map.get(&result).unwrap();
|
||||
let val = fst_map.get(&result).expect("Fst data corrupted") as usize;
|
||||
return (result, val);
|
||||
} else if cur_node.is_final() {
|
||||
// the previous key is a prefix
|
||||
let result_buffer = Vec::from(&key[..i]);
|
||||
let val = fst_map.get(&result_buffer).unwrap();
|
||||
let val = fst_map.get(&result_buffer).expect("Fst data corrupted") as usize;
|
||||
return (result_buffer, val);
|
||||
}
|
||||
}
|
||||
@@ -200,51 +301,70 @@ impl<V> TermDictionaryImpl<V>
|
||||
}
|
||||
|
||||
|
||||
impl<'a, V> TermDictionary<'a, V> for TermDictionaryImpl<V>
|
||||
where V: BinarySerializable + Default + 'a
|
||||
{
|
||||
type Streamer = TermStreamerImpl<'a, V>;
|
||||
|
||||
type StreamBuilder = TermStreamerBuilderImpl<'a, V>;
|
||||
impl<'a> TermDictionary<'a> for TermDictionaryImpl {
|
||||
type Streamer = TermStreamerImpl<'a>;
|
||||
|
||||
type StreamBuilder = TermStreamerBuilderImpl<'a>;
|
||||
|
||||
/// Opens a `TermDictionary` given a data source.
|
||||
fn from_source(source: ReadOnlySource) -> io::Result<Self> {
|
||||
let total_len = source.len();
|
||||
let length_offset = total_len - 8;
|
||||
let split_len: usize = {
|
||||
let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..];
|
||||
u64::deserialize(&mut split_len_buffer)? as usize
|
||||
};
|
||||
let stream_data = source.slice(0, split_len);
|
||||
let fst_data = source.slice(split_len, length_offset);
|
||||
let fst_index = open_fst_index(fst_data)?;
|
||||
fn from_source(mut source: ReadOnlySource) -> Self {
|
||||
let has_positions = source.slice(0, 1)[0] == 255u8;
|
||||
source = source.slice_from(1);
|
||||
|
||||
Ok(TermDictionaryImpl {
|
||||
stream_data: stream_data,
|
||||
fst_index: fst_index,
|
||||
_phantom_: PhantomData,
|
||||
})
|
||||
let total_len = source.len();
|
||||
let (body, footer) = source.split(total_len - 16);
|
||||
|
||||
let mut footer_buffer: &[u8] = footer.as_slice();
|
||||
let fst_addr = u64::deserialize(&mut footer_buffer).expect(
|
||||
"deserializing 8 byte should never fail",
|
||||
) as usize;
|
||||
let checkpoints_addr = u64::deserialize(&mut footer_buffer).expect(
|
||||
"deserializing 8 byte should never fail",
|
||||
) as usize;
|
||||
|
||||
let stream_data = body.slice(0, fst_addr - PADDING_SIZE);
|
||||
let fst_data = body.slice(fst_addr, checkpoints_addr);
|
||||
let checkpoints_data = body.slice_from(checkpoints_addr);
|
||||
|
||||
let fst_index = open_fst_index(fst_data).expect("Index FST data corrupted");
|
||||
|
||||
TermDictionaryImpl {
|
||||
has_positions: has_positions,
|
||||
stream_data: stream_data,
|
||||
checkpoints_data: checkpoints_data,
|
||||
fst_index: fst_index,
|
||||
}
|
||||
}
|
||||
|
||||
/// Lookups the value corresponding to the key.
|
||||
fn get<K: AsRef<[u8]>>(&self, target_key: K) -> Option<V> {
|
||||
let mut streamer = stream_before(self, target_key.as_ref());
|
||||
while streamer.advance() {
|
||||
let position = streamer.key().cmp(target_key.as_ref());
|
||||
match position {
|
||||
Ordering::Less => {}
|
||||
Ordering::Equal => return Some(streamer.extract_value()),
|
||||
Ordering::Greater => {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
fn get<K: AsRef<[u8]>>(&self, target_key: K) -> Option<TermInfo> {
|
||||
let mut streamer = self.range().ge(&target_key).into_stream();
|
||||
if streamer.advance() && streamer.key() == target_key.as_ref() {
|
||||
Some(streamer.value().clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Returns a range builder, to stream all of the terms
|
||||
/// within an interval.
|
||||
fn range(&'a self) -> Self::StreamBuilder {
|
||||
Self::StreamBuilder::new(self)
|
||||
Self::StreamBuilder::new(self, self.has_positions)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::num_bytes_required;
|
||||
|
||||
#[test]
|
||||
fn test_num_bytes_required() {
|
||||
assert_eq!(num_bytes_required(0), 1);
|
||||
assert_eq!(num_bytes_required(1), 1);
|
||||
assert_eq!(num_bytes_required(255), 1);
|
||||
assert_eq!(num_bytes_required(256), 2);
|
||||
assert_eq!(num_bytes_required(u32::max_value()), 4);
|
||||
}
|
||||
}
|
||||