Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-07 09:32:54 +00:00)

Commit: Merge branch 'master' of github.com:tantivy-search/tantivy into issue/130
@@ -22,7 +22,9 @@ before_script:
  - |
    pip install 'travis-cargo<0.2' --user &&
    export PATH=$HOME/.local/bin:$PATH
  - (cargo install rustfmt || true)
script:
  - cargo fmt -- --write-mode=diff
  - |
    travis-cargo build &&
    travis-cargo test &&

@@ -18,11 +18,10 @@ pub trait StreamingIterator<'a, T> {

impl<'a, 'b> TokenIter<'b> {
    fn consume_token(&'a mut self) -> Option<&'a str> {
        for c in &mut self.chars {
        for c in &mut self.chars {
            if c.is_alphanumeric() {
                append_char_lowercase(c, &mut self.term_buffer);
            }
            else {
            } else {
                break;
            }
        }
@@ -32,9 +31,8 @@ impl<'a, 'b> TokenIter<'b> {


impl<'a, 'b> StreamingIterator<'a, &'a str> for TokenIter<'b> {

    #[inline]
    fn next(&'a mut self,) -> Option<&'a str> {
    fn next(&'a mut self) -> Option<&'a str> {
        self.term_buffer.clear();
        // skipping non-letter characters.
        loop {
@@ -45,24 +43,24 @@ impl<'a, 'b> StreamingIterator<'a, &'a str> for TokenIter<'b> {
                    return self.consume_token();
                }
            }
                None => { return None; }
                None => {
                    return None;
                }
            }
        }
    }

}

pub struct SimpleTokenizer;


impl SimpleTokenizer {

    pub fn tokenize<'a>(&self, text: &'a str) -> TokenIter<'a> {
        TokenIter {
            term_buffer: String::new(),
            chars: text.chars(),
            term_buffer: String::new(),
            chars: text.chars(),
        }
    }
}
}

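For illustration, the tokenizing loop reformatted above boils down to the following standalone sketch (plain std only; `simple_tokenize` is a hypothetical helper, not part of the commit): it lowercases alphanumeric runs into a buffer and emits the buffer whenever a non-alphanumeric character or the end of input is reached.

    // Standalone sketch of the SimpleTokenizer logic shown in the hunk above.
    fn simple_tokenize(text: &str) -> Vec<String> {
        let mut tokens = Vec::new();
        let mut buffer = String::new();
        for c in text.chars() {
            if c.is_alphanumeric() {
                // append_char_lowercase in the diff: push the lowercased form.
                for lower in c.to_lowercase() {
                    buffer.push(lower);
                }
            } else if !buffer.is_empty() {
                tokens.push(std::mem::take(&mut buffer));
            }
        }
        if !buffer.is_empty() {
            tokens.push(buffer);
        }
        tokens
    }

    fn main() {
        assert_eq!(simple_tokenize("Hello, World-42"),
                   vec!["hello", "world", "42"]);
    }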
@@ -7,8 +7,8 @@ use Score;


/// Collector that does nothing.
/// This is used in the chain Collector and will hopefully
/// be optimized away by the compiler.
/// This is used in the chain Collector and will hopefully
/// be optimized away by the compiler.
pub struct DoNothingCollector;
impl Collector for DoNothingCollector {
    #[inline]
@@ -24,10 +24,10 @@ impl Collector for DoNothingCollector {
/// are known at compile time.
pub struct ChainedCollector<Left: Collector, Right: Collector> {
    left: Left,
    right: Right
    right: Right,
}

impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
    /// Adds a collector
    pub fn push<C: Collector>(self, new_collector: &mut C) -> ChainedCollector<Self, &mut C> {
        ChainedCollector {
@@ -38,7 +38,10 @@ impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
}

impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Right> {
    fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()> {
    fn set_segment(&mut self,
                   segment_local_id: SegmentLocalId,
                   segment: &SegmentReader)
                   -> Result<()> {
        try!(self.left.set_segment(segment_local_id, segment));
        try!(self.right.set_segment(segment_local_id, segment));
        Ok(())
@@ -70,9 +73,7 @@ mod tests {
        let mut top_collector = TopCollector::with_limit(2);
        let mut count_collector = CountCollector::default();
        {
            let mut collectors = chain()
                .push(&mut top_collector)
                .push(&mut count_collector);
            let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
            collectors.collect(1, 0.2);
            collectors.collect(2, 0.1);
            collectors.collect(3, 0.5);
@@ -80,4 +81,4 @@ mod tests {
        assert_eq!(count_collector.count(), 3);
        assert!(top_collector.at_capacity());
    }
}
}

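The test in this hunk exercises the chained collector roughly as follows; a sketch assuming `chain()`, `TopCollector` and `CountCollector` are re-exported under `tantivy::collector`, as the `pub use` lines elsewhere in this diff suggest:

    use tantivy::collector::{chain, Collector, CountCollector, TopCollector};

    fn main() {
        let mut top_collector = TopCollector::with_limit(2);
        let mut count_collector = CountCollector::default();
        {
            // `push` consumes the chain and mutably borrows each collector,
            // so the chain is scoped to release the borrows afterwards.
            let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
            collectors.collect(1, 0.2);
            collectors.collect(2, 0.1);
            collectors.collect(3, 0.5);
        }
        assert_eq!(count_collector.count(), 3);
        assert!(top_collector.at_capacity());
    }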
@@ -6,7 +6,7 @@ use SegmentReader;
use SegmentLocalId;

/// `CountCollector` collector only counts how many
/// documents match the query.
/// documents match the query.
pub struct CountCollector {
    count: usize,
}
@@ -14,20 +14,18 @@ pub struct CountCollector {
impl CountCollector {
    /// Returns the count of documents that were
    /// collected.
    pub fn count(&self,) -> usize {
    pub fn count(&self) -> usize {
        self.count
    }
}

impl Default for CountCollector {
    fn default() -> CountCollector {
        CountCollector {count: 0,
        }
        CountCollector { count: 0 }
    }
}

impl Collector for CountCollector {

    fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
        Ok(())
    }
@@ -47,11 +45,11 @@ mod tests {
    #[bench]
    fn build_collector(b: &mut Bencher) {
        b.iter(|| {
            let mut count_collector = CountCollector::default();
            for doc in 0..1_000_000 {
                count_collector.collect(doc, 1f32);
            }
            count_collector.count()
        });
            let mut count_collector = CountCollector::default();
            for doc in 0..1_000_000 {
                count_collector.collect(doc, 1f32);
            }
            count_collector.count()
        });
    }
}

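The benchmark body above is essentially this loop (sketch; same API assumptions as the previous example):

    use tantivy::collector::{Collector, CountCollector};

    fn main() {
        let mut count_collector = CountCollector::default();
        for doc in 0u32..1_000_000u32 {
            // The score is ignored; every collected doc increments the counter.
            count_collector.collect(doc, 1f32);
        }
        assert_eq!(count_collector.count(), 1_000_000);
    }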
@@ -16,11 +16,11 @@ pub use self::top_collector::TopCollector;
|
||||
mod chained_collector;
|
||||
pub use self::chained_collector::chain;
|
||||
|
||||
/// Collectors are in charge of collecting and retaining relevant
|
||||
/// Collectors are in charge of collecting and retaining relevant
|
||||
/// information from the document found and scored by the query.
|
||||
///
|
||||
///
|
||||
/// For instance,
|
||||
/// For instance,
|
||||
///
|
||||
/// - keeping track of the top 10 best documents
|
||||
/// - computing a breakdown over a fast field
|
||||
@@ -29,7 +29,7 @@ pub use self::chained_collector::chain;
|
||||
/// Queries are in charge of pushing the `DocSet` to the collector.
|
||||
///
|
||||
/// As they work on multiple segments, they first inform
|
||||
/// the collector of a change in a segment and then
|
||||
/// the collector of a change in a segment and then
|
||||
/// call the `collect` method to push the document to the collector.
|
||||
///
|
||||
/// Temporally, our collector will receive calls
|
||||
@@ -46,16 +46,22 @@ pub use self::chained_collector::chain;
|
||||
///
|
||||
/// Segments are not guaranteed to be visited in any specific order.
|
||||
pub trait Collector {
|
||||
/// `set_segment` is called before beginning to enumerate
|
||||
/// `set_segment` is called before beginning to enumerate
|
||||
/// on this segment.
|
||||
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()>;
|
||||
fn set_segment(&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader)
|
||||
-> Result<()>;
|
||||
/// The query pushes the scored document to the collector via this method.
|
||||
fn collect(&mut self, doc: DocId, score: Score);
|
||||
}
|
||||
|
||||
|
||||
impl<'a, C: Collector> Collector for &'a mut C {
|
||||
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()> {
|
||||
fn set_segment(&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader)
|
||||
-> Result<()> {
|
||||
(*self).set_segment(segment_local_id, segment)
|
||||
}
|
||||
/// The query pushes the scored document to the collector via this method.
|
||||
@@ -77,7 +83,7 @@ pub mod tests {
|
||||
use fastfield::U64FastFieldReader;
|
||||
use fastfield::FastFieldReader;
|
||||
use schema::Field;
|
||||
|
||||
|
||||
/// Stores all of the doc ids.
|
||||
/// This collector is only used for tests.
|
||||
/// It is unusable in practise, as it does not store
|
||||
@@ -90,7 +96,7 @@ pub mod tests {
|
||||
|
||||
impl TestCollector {
|
||||
/// Return the exhalist of documents.
|
||||
pub fn docs(self,) -> Vec<DocId> {
|
||||
pub fn docs(self) -> Vec<DocId> {
|
||||
self.docs
|
||||
}
|
||||
}
|
||||
@@ -106,7 +112,6 @@ pub mod tests {
|
||||
}
|
||||
|
||||
impl Collector for TestCollector {
|
||||
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||
self.offset += self.segment_max_doc;
|
||||
self.segment_max_doc = reader.max_doc();
|
||||
@@ -117,10 +122,10 @@ pub mod tests {
|
||||
self.docs.push(doc + self.offset);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/// Collects in order all of the fast fields for all of the
|
||||
/// doc in the `DocSet`
|
||||
///
|
||||
@@ -140,11 +145,11 @@ pub mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn vals(self,) -> Vec<u64> {
|
||||
pub fn vals(self) -> Vec<u64> {
|
||||
self.vals
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl Collector for FastFieldTestCollector {
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||
self.ff_reader = Some(reader.get_fast_field_reader(self.field)?);
|
||||
@@ -161,12 +166,12 @@ pub mod tests {
|
||||
#[bench]
|
||||
fn build_collector(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
let mut count_collector = CountCollector::default();
|
||||
let docs: Vec<u32> = (0..1_000_000).collect();
|
||||
for doc in docs {
|
||||
count_collector.collect(doc, 1f32);
|
||||
}
|
||||
count_collector.count()
|
||||
});
|
||||
let mut count_collector = CountCollector::default();
|
||||
let docs: Vec<u32> = (0..1_000_000).collect();
|
||||
for doc in docs {
|
||||
count_collector.collect(doc, 1f32);
|
||||
}
|
||||
count_collector.count()
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
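The `Collector` trait reformatted above defines a two-step protocol: `set_segment` is called once per segment, then `collect` once per matching document of that segment. A minimal custom collector following the signatures in this hunk might look like the sketch below (the crate-root type paths mirror the `use Score;` style imports seen in these files and are an assumption):

    use tantivy::collector::Collector;
    use tantivy::{DocId, Result, Score, SegmentLocalId, SegmentReader};

    /// Sketch: counts documents across segments, ignoring scores.
    struct DocCounter {
        count: usize,
    }

    impl Collector for DocCounter {
        fn set_segment(&mut self,
                       _segment_local_id: SegmentLocalId,
                       _segment: &SegmentReader)
                       -> Result<()> {
            // Nothing to prepare per segment for this collector.
            Ok(())
        }

        fn collect(&mut self, _doc: DocId, _score: Score) {
            self.count += 1;
        }
    }

    fn main() {
        let mut counter = DocCounter { count: 0 };
        counter.collect(0, 1.0);
        assert_eq!(counter.count, 1);
    }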
@@ -7,7 +7,7 @@ use SegmentLocalId;


/// Multicollector makes it possible to collect on more than one collector.
/// It should only be used for use cases where the Collector types is unknown
/// It should only be used for use cases where the Collector types is unknown
/// at compile time.
/// If the type of the collectors is known, you should prefer to use `ChainedCollector`.
pub struct MultiCollector<'a> {
@@ -17,15 +17,16 @@ pub struct MultiCollector<'a> {
impl<'a> MultiCollector<'a> {
    /// Constructor
    pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector {
        MultiCollector {
            collectors: collectors,
        }
        MultiCollector { collectors: collectors }
    }
}


impl<'a> Collector for MultiCollector<'a> {
    fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()> {
    fn set_segment(&mut self,
                   segment_local_id: SegmentLocalId,
                   segment: &SegmentReader)
                   -> Result<()> {
        for collector in &mut self.collectors {
            try!(collector.set_segment(segment_local_id, segment));
        }
@@ -52,7 +53,8 @@ mod tests {
        let mut top_collector = TopCollector::with_limit(2);
        let mut count_collector = CountCollector::default();
        {
            let mut collectors = MultiCollector::from(vec!(&mut top_collector, &mut count_collector));
            let mut collectors = MultiCollector::from(vec![&mut top_collector,
                                                           &mut count_collector]);
            collectors.collect(1, 0.2);
            collectors.collect(2, 0.1);
            collectors.collect(3, 0.5);

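Mirroring the test above, `MultiCollector` covers the case the doc comment describes, where the set of collectors is only known at runtime (sketch; same path assumptions as before):

    use tantivy::collector::{Collector, CountCollector, MultiCollector, TopCollector};

    fn main() {
        let mut top_collector = TopCollector::with_limit(2);
        let mut count_collector = CountCollector::default();
        {
            // Unlike ChainedCollector, the children are type-erased trait objects,
            // so the vector can be assembled dynamically.
            let mut collectors =
                MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
            collectors.collect(1, 0.2);
            collectors.collect(2, 0.1);
            collectors.collect(3, 0.5);
        }
        assert_eq!(count_collector.count(), 3);
    }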
@@ -12,8 +12,7 @@ use Score;
|
||||
#[derive(Clone, Copy)]
|
||||
struct GlobalScoredDoc {
|
||||
score: Score,
|
||||
doc_address: DocAddress
|
||||
|
||||
doc_address: DocAddress,
|
||||
}
|
||||
|
||||
impl PartialOrd for GlobalScoredDoc {
|
||||
@@ -25,10 +24,10 @@ impl PartialOrd for GlobalScoredDoc {
|
||||
impl Ord for GlobalScoredDoc {
|
||||
#[inline]
|
||||
fn cmp(&self, other: &GlobalScoredDoc) -> Ordering {
|
||||
other.score.partial_cmp(&self.score)
|
||||
.unwrap_or(
|
||||
other.doc_address.cmp(&self.doc_address)
|
||||
)
|
||||
other
|
||||
.score
|
||||
.partial_cmp(&self.score)
|
||||
.unwrap_or(other.doc_address.cmp(&self.doc_address))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -53,7 +52,6 @@ pub struct TopCollector {
|
||||
}
|
||||
|
||||
impl TopCollector {
|
||||
|
||||
/// Creates a top collector, with a number of documents equal to "limit".
|
||||
///
|
||||
/// # Panics
|
||||
@@ -68,9 +66,9 @@ impl TopCollector {
|
||||
segment_id: 0,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Returns K best documents sorted in decreasing order.
|
||||
///
|
||||
///
|
||||
/// Calling this method triggers the sort.
|
||||
/// The result of the sort is not cached.
|
||||
pub fn docs(&self) -> Vec<DocAddress> {
|
||||
@@ -81,30 +79,27 @@ impl TopCollector {
|
||||
}
|
||||
|
||||
/// Returns K best ScoredDocument sorted in decreasing order.
|
||||
///
|
||||
///
|
||||
/// Calling this method triggers the sort.
|
||||
/// The result of the sort is not cached.
|
||||
pub fn score_docs(&self) -> Vec<(Score, DocAddress)> {
|
||||
let mut scored_docs: Vec<GlobalScoredDoc> = self.heap
|
||||
.iter()
|
||||
.cloned()
|
||||
.collect();
|
||||
let mut scored_docs: Vec<GlobalScoredDoc> = self.heap.iter().cloned().collect();
|
||||
scored_docs.sort();
|
||||
scored_docs.into_iter()
|
||||
.map(|GlobalScoredDoc {score, doc_address}| (score, doc_address))
|
||||
scored_docs
|
||||
.into_iter()
|
||||
.map(|GlobalScoredDoc { score, doc_address }| (score, doc_address))
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Return true iff at least K documents have gone through
|
||||
/// the collector.
|
||||
#[inline]
|
||||
pub fn at_capacity(&self, ) -> bool {
|
||||
pub fn at_capacity(&self) -> bool {
|
||||
self.heap.len() >= self.limit
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for TopCollector {
|
||||
|
||||
fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<()> {
|
||||
self.segment_id = segment_id;
|
||||
Ok(())
|
||||
@@ -113,17 +108,21 @@ impl Collector for TopCollector {
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
if self.at_capacity() {
|
||||
// It's ok to unwrap as long as a limit of 0 is forbidden.
|
||||
let limit_doc: GlobalScoredDoc = *self.heap.peek().expect("Top collector with size 0 is forbidden");
|
||||
let limit_doc: GlobalScoredDoc =
|
||||
*self.heap
|
||||
.peek()
|
||||
.expect("Top collector with size 0 is forbidden");
|
||||
if limit_doc.score < score {
|
||||
let mut mut_head = self.heap.peek_mut().expect("Top collector with size 0 is forbidden");
|
||||
let mut mut_head = self.heap
|
||||
.peek_mut()
|
||||
.expect("Top collector with size 0 is forbidden");
|
||||
mut_head.score = score;
|
||||
mut_head.doc_address = DocAddress(self.segment_id, doc);
|
||||
mut_head.doc_address = DocAddress(self.segment_id, doc);
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
let wrapped_doc = GlobalScoredDoc {
|
||||
score: score,
|
||||
doc_address: DocAddress(self.segment_id, doc)
|
||||
doc_address: DocAddress(self.segment_id, doc),
|
||||
};
|
||||
self.heap.push(wrapped_doc);
|
||||
}
|
||||
@@ -147,13 +146,12 @@ mod tests {
|
||||
top_collector.collect(3, 0.2);
|
||||
top_collector.collect(5, 0.3);
|
||||
assert!(!top_collector.at_capacity());
|
||||
let score_docs: Vec<(Score, DocId)> = top_collector.score_docs()
|
||||
let score_docs: Vec<(Score, DocId)> = top_collector
|
||||
.score_docs()
|
||||
.into_iter()
|
||||
.map(|(score, doc_address)| (score, doc_address.doc()))
|
||||
.collect();
|
||||
assert_eq!(score_docs, vec!(
|
||||
(0.8, 1), (0.3, 5), (0.2, 3),
|
||||
));
|
||||
assert_eq!(score_docs, vec![(0.8, 1), (0.3, 5), (0.2, 3)]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -171,9 +169,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.map(|(score, doc_address)| (score, doc_address.doc()))
|
||||
.collect();
|
||||
assert_eq!(score_docs, vec!(
|
||||
(0.9, 7), (0.8, 1), (0.3, 5), (0.2, 3)
|
||||
));
|
||||
assert_eq!(score_docs, vec![(0.9, 7), (0.8, 1), (0.3, 5), (0.2, 3)]);
|
||||
}
|
||||
{
|
||||
let docs: Vec<DocId> = top_collector
|
||||
@@ -181,7 +177,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.map(|doc_address| doc_address.doc())
|
||||
.collect();
|
||||
assert_eq!(docs, vec!(7, 1, 5, 3));
|
||||
assert_eq!(docs, vec![7, 1, 5, 3]);
|
||||
}
|
||||
|
||||
|
||||
|
||||
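Putting the pieces of the tests above together, typical `TopCollector` usage looks like this (sketch; `Score` is the crate's score type and `DocId` its document id type, as used throughout this diff):

    use tantivy::collector::{Collector, TopCollector};
    use tantivy::{DocId, Score};

    fn main() {
        let mut top_collector = TopCollector::with_limit(4);
        top_collector.collect(1, 0.8);
        top_collector.collect(3, 0.2);
        top_collector.collect(5, 0.3);
        top_collector.collect(7, 0.9);
        // score_docs() sorts on demand: best score first; the result is not cached.
        let score_docs: Vec<(Score, DocId)> = top_collector
            .score_docs()
            .into_iter()
            .map(|(score, doc_address)| (score, doc_address.doc()))
            .collect();
        assert_eq!(score_docs, vec![(0.9, 7), (0.8, 1), (0.3, 5), (0.2, 3)]);
    }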
@@ -6,19 +6,19 @@ use std::mem;
|
||||
|
||||
/// Computes the number of bits that will be used for bitpacking.
|
||||
///
|
||||
/// In general the target is the minimum number of bits
|
||||
/// In general the target is the minimum number of bits
|
||||
/// required to express the amplitude given in argument.
|
||||
///
|
||||
/// e.g. If the amplitude is 10, we can store all ints on simply 4bits.
|
||||
///
|
||||
///
|
||||
/// The logic is slightly more convoluted here as for optimization
|
||||
/// reasons, we want to ensure that a value spawns over at most 8 bytes
|
||||
/// of aligns bytes.
|
||||
///
|
||||
/// Spawning over 9 bytes is possible for instance, if we do
|
||||
///
|
||||
/// Spawning over 9 bytes is possible for instance, if we do
|
||||
/// bitpacking with an amplitude of 63 bits.
|
||||
/// In this case, the second int will start on bit
|
||||
/// 63 (which belongs to byte 7) and ends at byte 15;
|
||||
/// 63 (which belongs to byte 7) and ends at byte 15;
|
||||
/// Hence 9 bytes (from byte 7 to byte 15 included).
|
||||
///
|
||||
/// To avoid this, we force the number of bits to 64bits
|
||||
@@ -30,12 +30,7 @@ use std::mem;
|
||||
/// number of bits.
|
||||
pub fn compute_num_bits(amplitude: u64) -> u8 {
|
||||
let amplitude = (64u32 - amplitude.leading_zeros()) as u8;
|
||||
if amplitude <= 64 - 8 {
|
||||
amplitude
|
||||
}
|
||||
else {
|
||||
64
|
||||
}
|
||||
if amplitude <= 64 - 8 { amplitude } else { 64 }
|
||||
}
|
||||
|
||||
pub struct BitPacker {
|
||||
@@ -46,7 +41,6 @@ pub struct BitPacker {
|
||||
}
|
||||
|
||||
impl BitPacker {
|
||||
|
||||
pub fn new(num_bits: usize) -> BitPacker {
|
||||
BitPacker {
|
||||
mini_buffer: 0u64,
|
||||
@@ -55,7 +49,7 @@ impl BitPacker {
|
||||
written_size: 0,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub fn write<TWrite: Write>(&mut self, val: u64, output: &mut TWrite) -> io::Result<()> {
|
||||
let val_u64 = val as u64;
|
||||
if self.mini_buffer_written + self.num_bits > 64 {
|
||||
@@ -63,30 +57,29 @@ impl BitPacker {
|
||||
self.written_size += self.mini_buffer.serialize(output)?;
|
||||
self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
|
||||
self.mini_buffer_written = self.mini_buffer_written + (self.num_bits as usize) - 64;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
self.mini_buffer |= val_u64 << self.mini_buffer_written;
|
||||
self.mini_buffer_written += self.num_bits;
|
||||
if self.mini_buffer_written == 64 {
|
||||
self.written_size += self.mini_buffer.serialize(output)?;
|
||||
self.mini_buffer_written = 0;
|
||||
self.mini_buffer = 0u64;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()>{
|
||||
|
||||
fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
|
||||
if self.mini_buffer_written > 0 {
|
||||
let num_bytes = (self.mini_buffer_written + 7) / 8;
|
||||
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer) };
|
||||
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer) };
|
||||
output.write_all(&arr[..num_bytes])?;
|
||||
self.written_size += num_bytes;
|
||||
self.mini_buffer_written = 0;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
pub fn close<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<usize> {
|
||||
self.flush(output)?;
|
||||
Ok(self.written_size)
|
||||
@@ -99,26 +92,24 @@ pub struct BitUnpacker {
|
||||
num_bits: usize,
|
||||
mask: u64,
|
||||
data_ptr: *const u8,
|
||||
data_len: usize,
|
||||
data_len: usize,
|
||||
}
|
||||
|
||||
impl BitUnpacker {
|
||||
pub fn new(data: &[u8], num_bits: usize) -> BitUnpacker {
|
||||
let mask: u64 =
|
||||
if num_bits == 64 {
|
||||
!0u64
|
||||
}
|
||||
else {
|
||||
(1u64 << num_bits) - 1u64
|
||||
};
|
||||
let mask: u64 = if num_bits == 64 {
|
||||
!0u64
|
||||
} else {
|
||||
(1u64 << num_bits) - 1u64
|
||||
};
|
||||
BitUnpacker {
|
||||
num_bits: num_bits,
|
||||
mask: mask,
|
||||
data_ptr: data.as_ptr(),
|
||||
data_len: data.len()
|
||||
data_len: data.len(),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub fn get(&self, idx: usize) -> u64 {
|
||||
if self.num_bits == 0 {
|
||||
return 0;
|
||||
@@ -127,13 +118,13 @@ impl BitUnpacker {
|
||||
let bit_shift = idx * self.num_bits - addr * 8;
|
||||
let val_unshifted_unmasked: u64;
|
||||
if addr + 8 <= self.data_len {
|
||||
val_unshifted_unmasked = unsafe { * (self.data_ptr.offset(addr as isize) as *const u64) };
|
||||
}
|
||||
else {
|
||||
val_unshifted_unmasked =
|
||||
unsafe { *(self.data_ptr.offset(addr as isize) as *const u64) };
|
||||
} else {
|
||||
let mut arr = [0u8; 8];
|
||||
if addr < self.data_len {
|
||||
for i in 0..self.data_len - addr {
|
||||
arr[i] = unsafe { *self.data_ptr.offset( (addr + i) as isize) };
|
||||
arr[i] = unsafe { *self.data_ptr.offset((addr + i) as isize) };
|
||||
}
|
||||
}
|
||||
val_unshifted_unmasked = unsafe { mem::transmute::<[u8; 8], u64>(arr) };
|
||||
@@ -141,7 +132,6 @@ impl BitUnpacker {
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
(val_shifted & self.mask)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -150,7 +140,7 @@ impl BitUnpacker {
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::{BitPacker, BitUnpacker, compute_num_bits};
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_compute_num_bits() {
|
||||
assert_eq!(compute_num_bits(1), 1u8);
|
||||
@@ -162,31 +152,26 @@ mod test {
|
||||
assert_eq!(compute_num_bits(256), 9u8);
|
||||
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
|
||||
}
|
||||
|
||||
|
||||
fn test_bitpacker_util(len: usize, num_bits: usize) {
|
||||
let mut data = Vec::new();
|
||||
let mut bitpacker = BitPacker::new(num_bits);
|
||||
let max_val: u64 = (1 << num_bits) - 1;
|
||||
let vals: Vec<u64> = (0u64..len as u64).map(|i| {
|
||||
if max_val == 0 {
|
||||
0
|
||||
}
|
||||
else {
|
||||
i % max_val
|
||||
}
|
||||
}).collect();
|
||||
let vals: Vec<u64> = (0u64..len as u64)
|
||||
.map(|i| if max_val == 0 { 0 } else { i % max_val })
|
||||
.collect();
|
||||
for &val in &vals {
|
||||
bitpacker.write(val, &mut data).unwrap();
|
||||
}
|
||||
let num_bytes = bitpacker.close(&mut data).unwrap();
|
||||
assert_eq!(num_bytes, (num_bits * len + 7) / 8);
|
||||
assert_eq!(num_bytes, (num_bits * len + 7) / 8);
|
||||
assert_eq!(data.len(), num_bytes);
|
||||
let bitunpacker = BitUnpacker::new(&data, num_bits);
|
||||
for (i, val) in vals.iter().enumerate() {
|
||||
assert_eq!(bitunpacker.get(i), *val);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_bitpacker() {
|
||||
test_bitpacker_util(10, 3);
|
||||
@@ -195,4 +180,4 @@ mod test {
|
||||
test_bitpacker_util(6, 14);
|
||||
test_bitpacker_util(1000, 14);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
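The `test_bitpacker_util` helper above amounts to the following round trip; a sketch that assumes `compute_num_bits`, `BitPacker` and `BitUnpacker` are reachable from the bitpacker module touched by this hunk (the exact path is an assumption):

    use tantivy::common::bitpacker::{compute_num_bits, BitPacker, BitUnpacker};

    fn main() {
        // Values in 0..100 have amplitude 99, which fits in 7 bits.
        let vals: Vec<u64> = (0u64..100).collect();
        let num_bits = compute_num_bits(99) as usize;
        assert_eq!(num_bits, 7);

        let mut data = Vec::new();
        let mut packer = BitPacker::new(num_bits);
        for &val in &vals {
            packer.write(val, &mut data).unwrap();
        }
        // close() flushes the partially filled mini buffer and returns the
        // total number of bytes written: ceil(100 * 7 / 8) = 88.
        let num_bytes = packer.close(&mut data).unwrap();
        assert_eq!(num_bytes, (num_bits * vals.len() + 7) / 8);
        assert_eq!(data.len(), num_bytes);

        let unpacker = BitUnpacker::new(&data, num_bits);
        for (i, &val) in vals.iter().enumerate() {
            assert_eq!(unpacker.get(i), val);
        }
    }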
@@ -19,10 +19,10 @@ pub fn make_io_err(msg: String) -> io::Error {
/// Has length trait
pub trait HasLen {
    /// Return length
    fn len(&self,) -> usize;

    fn len(&self) -> usize;

    /// Returns true iff empty.
    fn is_empty(&self,) -> bool {
    fn is_empty(&self) -> bool {
        self.len() == 0
    }
}
@@ -32,13 +32,13 @@ const HIGHEST_BIT: u64 = 1 << 63;

/// Maps `i64` to `u64` so that
/// `-2^63 .. 2^63-1` is mapped
/// to
/// to
/// `0 .. 2^64`
/// in that order.
///
/// This is more suited than simply casting (`val as u64`)
/// because of bitpacking.
///
///
/// Imagine a list of `i64` ranging from -10 to 10.
/// When casting negative values, the negative values are projected
/// to values over 2^63, and all values end up requiring 64 bits.
@@ -47,7 +47,7 @@ pub fn i64_to_u64(val: i64) -> u64 {
    (val as u64) ^ HIGHEST_BIT
}

/// Reverse the mapping given by
/// Reverse the mapping given by
/// `i64_to_u64`.
#[inline(always)]
pub fn u64_to_i64(val: u64) -> i64 {
@@ -76,4 +76,4 @@ mod test {
        test_i64_converter_helper(i);
    }
}
}
}

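The mapping documented above is a single XOR with the sign bit, which is order-preserving and keeps small amplitudes small for bitpacking. A self-contained illustration (`HIGHEST_BIT` and `i64_to_u64` are copied from this hunk; the inverse is the same XOR, as `u64_to_i64` suggests):

    const HIGHEST_BIT: u64 = 1 << 63;

    fn i64_to_u64(val: i64) -> u64 {
        (val as u64) ^ HIGHEST_BIT
    }

    fn u64_to_i64(val: u64) -> i64 {
        (val ^ HIGHEST_BIT) as i64
    }

    fn main() {
        // Monotonic: -2^63 -> 0, -1 -> 2^63 - 1, 0 -> 2^63, 2^63 - 1 -> 2^64 - 1.
        assert_eq!(i64_to_u64(i64::MIN), 0);
        assert_eq!(i64_to_u64(-1), (1u64 << 63) - 1);
        assert_eq!(i64_to_u64(0), 1u64 << 63);
        assert_eq!(i64_to_u64(i64::MAX), u64::MAX);
        // The example from the doc comment: -10..10 now spans a width of 20,
        // instead of the full 64-bit range a plain `as u64` cast would give.
        assert_eq!(i64_to_u64(10) - i64_to_u64(-10), 20);
        for &v in &[-10i64, -1, 0, 1, 10] {
            assert_eq!(u64_to_i64(i64_to_u64(v)), v);
        }
    }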
@@ -6,7 +6,7 @@ use std::io::Read;
|
||||
use std::io;
|
||||
use common::VInt;
|
||||
|
||||
pub trait BinarySerializable : fmt::Debug + Sized {
|
||||
pub trait BinarySerializable: fmt::Debug + Sized {
|
||||
fn serialize(&self, writer: &mut Write) -> io::Result<usize>;
|
||||
fn deserialize(reader: &mut Read) -> io::Result<Self>;
|
||||
}
|
||||
@@ -45,14 +45,13 @@ impl<Left: BinarySerializable, Right: BinarySerializable> BinarySerializable for
|
||||
Ok(try!(self.0.serialize(write)) + try!(self.1.serialize(write)))
|
||||
}
|
||||
fn deserialize(reader: &mut Read) -> io::Result<Self> {
|
||||
Ok( (try!(Left::deserialize(reader)), try!(Right::deserialize(reader))) )
|
||||
Ok((try!(Left::deserialize(reader)), try!(Right::deserialize(reader))))
|
||||
}
|
||||
}
|
||||
|
||||
impl BinarySerializable for u32 {
|
||||
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
|
||||
writer.write_u32::<Endianness>(*self)
|
||||
.map(|_| 4)
|
||||
writer.write_u32::<Endianness>(*self).map(|_| 4)
|
||||
}
|
||||
|
||||
fn deserialize(reader: &mut Read) -> io::Result<u32> {
|
||||
@@ -63,8 +62,7 @@ impl BinarySerializable for u32 {
|
||||
|
||||
impl BinarySerializable for u64 {
|
||||
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
|
||||
writer.write_u64::<Endianness>(*self)
|
||||
.map(|_| 8)
|
||||
writer.write_u64::<Endianness>(*self).map(|_| 8)
|
||||
}
|
||||
fn deserialize(reader: &mut Read) -> io::Result<u64> {
|
||||
reader.read_u64::<Endianness>()
|
||||
@@ -73,8 +71,7 @@ impl BinarySerializable for u64 {
|
||||
|
||||
impl BinarySerializable for i64 {
|
||||
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
|
||||
writer.write_i64::<Endianness>(*self)
|
||||
.map(|_| 8)
|
||||
writer.write_i64::<Endianness>(*self).map(|_| 8)
|
||||
}
|
||||
fn deserialize(reader: &mut Read) -> io::Result<i64> {
|
||||
reader.read_i64::<Endianness>()
|
||||
@@ -104,7 +101,9 @@ impl BinarySerializable for String {
|
||||
fn deserialize(reader: &mut Read) -> io::Result<String> {
|
||||
let string_length = try!(VInt::deserialize(reader)).val() as usize;
|
||||
let mut result = String::with_capacity(string_length);
|
||||
try!(reader.take(string_length as u64).read_to_string(&mut result));
|
||||
try!(reader
|
||||
.take(string_length as u64)
|
||||
.read_to_string(&mut result));
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
@@ -122,8 +121,7 @@ mod test {
|
||||
if num_bytes != 0 {
|
||||
assert_eq!(v.serialize(&mut buffer).unwrap(), num_bytes);
|
||||
assert_eq!(buffer.len(), num_bytes);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
v.serialize(&mut buffer).unwrap();
|
||||
}
|
||||
let mut cursor = &buffer[..];
|
||||
@@ -147,15 +145,15 @@ mod test {
|
||||
#[test]
|
||||
fn test_serialize_string() {
|
||||
serialize_test(String::from(""), 1);
|
||||
serialize_test(String::from("ぽよぽよ"), 1 + 3*4);
|
||||
serialize_test(String::from("富士さん見える。"), 1 + 3*8);
|
||||
serialize_test(String::from("ぽよぽよ"), 1 + 3 * 4);
|
||||
serialize_test(String::from("富士さん見える。"), 1 + 3 * 8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_vec() {
|
||||
let v: Vec<u8> = Vec::new();
|
||||
serialize_test(v, 1);
|
||||
serialize_test(vec!(1u32, 3u32), 1 + 4*2);
|
||||
serialize_test(vec![1u32, 3u32], 1 + 4 * 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
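As a usage sketch of the `BinarySerializable` trait reformatted above (assuming the trait and `VInt` are reachable from the `common` module; the calls mirror the `serialize_test` helper): fixed-size integers write a fixed number of bytes, strings write a `VInt` length prefix followed by their UTF-8 bytes.

    use tantivy::common::{BinarySerializable, VInt};

    fn main() {
        let mut buffer: Vec<u8> = Vec::new();
        assert_eq!(7u32.serialize(&mut buffer).unwrap(), 4);       // fixed 4 bytes
        assert_eq!(VInt(300).serialize(&mut buffer).unwrap(), 2);  // two 7-bit groups
        assert_eq!(String::from("abc").serialize(&mut buffer).unwrap(), 1 + 3);

        // Deserialize back in the same order from a cursor over the buffer.
        let mut cursor = &buffer[..];
        assert_eq!(u32::deserialize(&mut cursor).unwrap(), 7u32);
        assert_eq!(VInt::deserialize(&mut cursor).unwrap(), VInt(300));
        assert_eq!(String::deserialize(&mut cursor).unwrap(), "abc");
    }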
@@ -10,7 +10,7 @@ pub struct OpenTimer<'a> {
|
||||
impl<'a> OpenTimer<'a> {
|
||||
/// Starts timing a new named subtask
|
||||
///
|
||||
/// The timer is stopped automatically
|
||||
/// The timer is stopped automatically
|
||||
/// when the `OpenTimer` is dropped.
|
||||
pub fn open(&mut self, name: &'static str) -> OpenTimer {
|
||||
OpenTimer {
|
||||
@@ -23,12 +23,17 @@ impl<'a> OpenTimer<'a> {
|
||||
}
|
||||
|
||||
impl<'a> Drop for OpenTimer<'a> {
|
||||
fn drop(&mut self,) {
|
||||
self.timer_tree.timings.push(Timing {
|
||||
name: self.name,
|
||||
duration: self.start.to(PreciseTime::now()).num_microseconds().unwrap(),
|
||||
depth: self.depth,
|
||||
});
|
||||
fn drop(&mut self) {
|
||||
self.timer_tree
|
||||
.timings
|
||||
.push(Timing {
|
||||
name: self.name,
|
||||
duration: self.start
|
||||
.to(PreciseTime::now())
|
||||
.num_microseconds()
|
||||
.unwrap(),
|
||||
depth: self.depth,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -47,12 +52,11 @@ pub struct TimerTree {
|
||||
}
|
||||
|
||||
impl TimerTree {
|
||||
|
||||
/// Returns the total time elapsed in microseconds
|
||||
pub fn total_time(&self,) -> i64 {
|
||||
/// Returns the total time elapsed in microseconds
|
||||
pub fn total_time(&self) -> i64 {
|
||||
self.timings.last().unwrap().duration
|
||||
}
|
||||
|
||||
|
||||
/// Open a new named subtask
|
||||
pub fn open(&mut self, name: &'static str) -> OpenTimer {
|
||||
OpenTimer {
|
||||
@@ -66,9 +70,7 @@ impl TimerTree {
|
||||
|
||||
impl Default for TimerTree {
|
||||
fn default() -> TimerTree {
|
||||
TimerTree {
|
||||
timings: Vec::new(),
|
||||
}
|
||||
TimerTree { timings: Vec::new() }
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
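For context, the timing tree reformatted above is used roughly like this (a sketch; the module is internal to the crate and its path is assumed, and each timing is pushed when its `OpenTimer` is dropped):

    use tantivy::common::timer::TimerTree;

    fn main() {
        let mut timer_tree = TimerTree::default();
        {
            let mut whole = timer_tree.open("whole_task");
            {
                // Nested subtask: recorded with depth + 1 when dropped.
                let _sub = whole.open("subtask");
            }
        }
        // Duration in microseconds of the outermost (last recorded) timing.
        println!("total: {} µs", timer_tree.total_time());
    }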
@@ -5,12 +5,12 @@ use std::io::Read;


/// Wrapper over a `u64` that serializes as a variable int.
/// Wrapper over a `u64` that serializes as a variable int.
#[derive(Debug, Eq, PartialEq)]
pub struct VInt(pub u64);

impl VInt {
    pub fn val(&self,) -> u64 {
    pub fn val(&self) -> u64 {
        self.0
    }
}
@@ -27,8 +27,7 @@ impl BinarySerializable for VInt {
                buffer[written] = next_byte | 128u8;
                written += 1;
                break;
            }
            else {
            } else {
                buffer[written] = next_byte;
                written += 1;
            }
@@ -50,12 +49,9 @@ impl BinarySerializable for VInt {
                    }
                    shift += 7;
                }
                _ => {
                    return Err(io::Error::new(io::ErrorKind::InvalidData, "Reach end of buffer"))
                }
                _ => return Err(io::Error::new(io::ErrorKind::InvalidData, "Reach end of buffer")),
            }
        }
        Ok(VInt(result))
    }
}

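A quick way to see the space behaviour of this encoding: each serialized byte carries 7 payload bits plus a marker bit, so the length of a value is just the number of 7-bit groups it needs. A self-contained helper illustrating that arithmetic (`vint_len` is hypothetical, not part of the commit):

    /// Bytes needed for a u64 in a 7-bits-per-byte variable-int encoding.
    fn vint_len(mut val: u64) -> usize {
        let mut len = 1;
        while val >= 128 {
            val >>= 7;
            len += 1;
        }
        len
    }

    fn main() {
        assert_eq!(vint_len(0), 1);
        assert_eq!(vint_len(127), 1);       // fits in one 7-bit group
        assert_eq!(vint_len(128), 2);       // needs a second byte
        assert_eq!(vint_len(300), 2);
        assert_eq!(vint_len(u64::MAX), 10); // ceil(64 / 7) = 10 bytes
    }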
@@ -8,7 +8,6 @@ pub struct CompositeEncoder {
|
||||
}
|
||||
|
||||
impl CompositeEncoder {
|
||||
|
||||
pub fn new() -> CompositeEncoder {
|
||||
CompositeEncoder {
|
||||
block_encoder: BlockEncoder::new(),
|
||||
@@ -21,12 +20,14 @@ impl CompositeEncoder {
|
||||
let num_blocks = vals.len() / NUM_DOCS_PER_BLOCK;
|
||||
let mut offset = 0u32;
|
||||
for i in 0..num_blocks {
|
||||
let vals_slice = &vals[i * NUM_DOCS_PER_BLOCK .. (i + 1) * NUM_DOCS_PER_BLOCK];
|
||||
let vals_slice = &vals[i * NUM_DOCS_PER_BLOCK..(i + 1) * NUM_DOCS_PER_BLOCK];
|
||||
let block_compressed = self.block_encoder.compress_block_sorted(vals_slice, offset);
|
||||
offset = vals_slice[NUM_DOCS_PER_BLOCK - 1];
|
||||
self.output.extend_from_slice(block_compressed);
|
||||
}
|
||||
let vint_compressed = self.block_encoder.compress_vint_sorted(&vals[num_blocks * NUM_DOCS_PER_BLOCK..], offset);
|
||||
let vint_compressed =
|
||||
self.block_encoder
|
||||
.compress_vint_sorted(&vals[num_blocks * NUM_DOCS_PER_BLOCK..], offset);
|
||||
self.output.extend_from_slice(vint_compressed);
|
||||
&self.output
|
||||
}
|
||||
@@ -35,11 +36,12 @@ impl CompositeEncoder {
|
||||
self.output.clear();
|
||||
let num_blocks = vals.len() / NUM_DOCS_PER_BLOCK;
|
||||
for i in 0..num_blocks {
|
||||
let vals_slice = &vals[i * NUM_DOCS_PER_BLOCK .. (i + 1) * NUM_DOCS_PER_BLOCK];
|
||||
let vals_slice = &vals[i * NUM_DOCS_PER_BLOCK..(i + 1) * NUM_DOCS_PER_BLOCK];
|
||||
let block_compressed = self.block_encoder.compress_block_unsorted(vals_slice);
|
||||
self.output.extend_from_slice(block_compressed);
|
||||
}
|
||||
let vint_compressed = self.block_encoder.compress_vint_unsorted(&vals[num_blocks * NUM_DOCS_PER_BLOCK..]);
|
||||
let vint_compressed = self.block_encoder
|
||||
.compress_vint_unsorted(&vals[num_blocks * NUM_DOCS_PER_BLOCK..]);
|
||||
self.output.extend_from_slice(vint_compressed);
|
||||
&self.output
|
||||
}
|
||||
@@ -60,7 +62,10 @@ impl CompositeDecoder {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn uncompress_sorted(&mut self, mut compressed_data: &[u8], uncompressed_len: usize) -> &[u32] {
|
||||
pub fn uncompress_sorted(&mut self,
|
||||
mut compressed_data: &[u8],
|
||||
uncompressed_len: usize)
|
||||
-> &[u32] {
|
||||
if uncompressed_len > self.vals.capacity() {
|
||||
let extra_capacity = uncompressed_len - self.vals.capacity();
|
||||
self.vals.reserve(extra_capacity);
|
||||
@@ -69,24 +74,37 @@ impl CompositeDecoder {
|
||||
self.vals.clear();
|
||||
let num_blocks = uncompressed_len / NUM_DOCS_PER_BLOCK;
|
||||
for _ in 0..num_blocks {
|
||||
compressed_data = self.block_decoder.uncompress_block_sorted(compressed_data, offset);
|
||||
compressed_data = self.block_decoder
|
||||
.uncompress_block_sorted(compressed_data, offset);
|
||||
offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1);
|
||||
self.vals.extend_from_slice(self.block_decoder.output_array());
|
||||
self.vals
|
||||
.extend_from_slice(self.block_decoder.output_array());
|
||||
}
|
||||
self.block_decoder.uncompress_vint_sorted(compressed_data, offset, uncompressed_len % NUM_DOCS_PER_BLOCK);
|
||||
self.vals.extend_from_slice(self.block_decoder.output_array());
|
||||
self.block_decoder
|
||||
.uncompress_vint_sorted(compressed_data,
|
||||
offset,
|
||||
uncompressed_len % NUM_DOCS_PER_BLOCK);
|
||||
self.vals
|
||||
.extend_from_slice(self.block_decoder.output_array());
|
||||
&self.vals
|
||||
}
|
||||
|
||||
pub fn uncompress_unsorted(&mut self, mut compressed_data: &[u8], uncompressed_len: usize) -> &[u32] {
|
||||
pub fn uncompress_unsorted(&mut self,
|
||||
mut compressed_data: &[u8],
|
||||
uncompressed_len: usize)
|
||||
-> &[u32] {
|
||||
self.vals.clear();
|
||||
let num_blocks = uncompressed_len / NUM_DOCS_PER_BLOCK;
|
||||
for _ in 0..num_blocks {
|
||||
compressed_data = self.block_decoder.uncompress_block_unsorted(compressed_data);
|
||||
self.vals.extend_from_slice(self.block_decoder.output_array());
|
||||
compressed_data = self.block_decoder
|
||||
.uncompress_block_unsorted(compressed_data);
|
||||
self.vals
|
||||
.extend_from_slice(self.block_decoder.output_array());
|
||||
}
|
||||
self.block_decoder.uncompress_vint_unsorted(compressed_data, uncompressed_len % NUM_DOCS_PER_BLOCK);
|
||||
self.vals.extend_from_slice(self.block_decoder.output_array());
|
||||
self.block_decoder
|
||||
.uncompress_vint_unsorted(compressed_data, uncompressed_len % NUM_DOCS_PER_BLOCK);
|
||||
self.vals
|
||||
.extend_from_slice(self.block_decoder.output_array());
|
||||
&self.vals
|
||||
}
|
||||
}
|
||||
@@ -138,9 +156,7 @@ pub mod tests {
|
||||
fn bench_compress(b: &mut Bencher) {
|
||||
let mut encoder = CompositeEncoder::new();
|
||||
let data = tests::generate_array(BENCH_NUM_INTS, 0.1);
|
||||
b.iter(|| {
|
||||
encoder.compress_sorted(&data);
|
||||
});
|
||||
b.iter(|| { encoder.compress_sorted(&data); });
|
||||
}
|
||||
|
||||
#[bench]
|
||||
@@ -149,11 +165,6 @@ pub mod tests {
|
||||
let data = tests::generate_array(BENCH_NUM_INTS, 0.1);
|
||||
let compressed = encoder.compress_sorted(&data);
|
||||
let mut decoder = CompositeDecoder::new();
|
||||
b.iter(|| {
|
||||
decoder.uncompress_sorted(compressed, BENCH_NUM_INTS);
|
||||
});
|
||||
b.iter(|| { decoder.uncompress_sorted(compressed, BENCH_NUM_INTS); });
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
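The composite codec above splits the input into fixed 128-value blocks handled by the block codec, with the remainder falling back to the variable-int codec. Following the calls in the benchmarks, usage looks roughly like this (a sketch only; `CompositeEncoder`/`CompositeDecoder` are internal to the crate and the module path is assumed):

    use tantivy::compression::{CompositeDecoder, CompositeEncoder};

    fn main() {
        // 300 sorted values: two full blocks of 128 get bitpacked,
        // the remaining 44 go through the vint fallback.
        let vals: Vec<u32> = (0u32..300).map(|i| i * 3).collect();

        let mut encoder = CompositeEncoder::new();
        let compressed: Vec<u8> = encoder.compress_sorted(&vals).to_vec();

        let mut decoder = CompositeDecoder::new();
        let decoded = decoder.uncompress_sorted(&compressed, vals.len());
        assert_eq!(decoded, &vals[..]);
    }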
@@ -38,12 +38,18 @@ pub trait VIntEncoder {
|
||||
}
|
||||
|
||||
pub trait VIntDecoder {
|
||||
fn uncompress_vint_sorted<'a>(&mut self, compressed_data: &'a [u8], offset: u32, num_els: usize) -> &'a [u8];
|
||||
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> &'a [u8];
|
||||
fn uncompress_vint_sorted<'a>(&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
offset: u32,
|
||||
num_els: usize)
|
||||
-> &'a [u8];
|
||||
fn uncompress_vint_unsorted<'a>(&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
num_els: usize)
|
||||
-> &'a [u8];
|
||||
}
|
||||
|
||||
impl VIntEncoder for BlockEncoder {
|
||||
|
||||
fn compress_vint_sorted(&mut self, input: &[u32], offset: u32) -> &[u8] {
|
||||
vint::compress_sorted(input, &mut self.output, offset)
|
||||
}
|
||||
@@ -54,20 +60,19 @@ impl VIntEncoder for BlockEncoder {
|
||||
}
|
||||
|
||||
impl VIntDecoder for BlockDecoder {
|
||||
|
||||
fn uncompress_vint_sorted<'a>(
|
||||
&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
offset: u32,
|
||||
num_els: usize) -> &'a [u8] {
|
||||
fn uncompress_vint_sorted<'a>(&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
offset: u32,
|
||||
num_els: usize)
|
||||
-> &'a [u8] {
|
||||
self.output_len = num_els;
|
||||
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
|
||||
}
|
||||
|
||||
fn uncompress_vint_unsorted<'a>(
|
||||
&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
num_els: usize) -> &'a [u8] {
|
||||
fn uncompress_vint_unsorted<'a>(&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
num_els: usize)
|
||||
-> &'a [u8] {
|
||||
self.output_len = num_els;
|
||||
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
|
||||
}
|
||||
@@ -85,7 +90,7 @@ pub mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_encode_sorted_block() {
|
||||
let vals: Vec<u32> = (0u32..128u32).map(|i| i*7).collect();
|
||||
let vals: Vec<u32> = (0u32..128u32).map(|i| i * 7).collect();
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let compressed_data = encoder.compress_block_sorted(&vals, 0);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
@@ -100,7 +105,7 @@ pub mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_encode_sorted_block_with_offset() {
|
||||
let vals: Vec<u32> = (0u32..128u32).map(|i| 11 + i*7).collect();
|
||||
let vals: Vec<u32> = (0u32..128u32).map(|i| 11 + i * 7).collect();
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let compressed_data = encoder.compress_block_sorted(&vals, 10);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
@@ -117,7 +122,7 @@ pub mod tests {
|
||||
fn test_encode_sorted_block_with_junk() {
|
||||
let mut compressed: Vec<u8> = Vec::new();
|
||||
let n = 128;
|
||||
let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32)*7u32).collect();
|
||||
let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32) * 7u32).collect();
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let compressed_data = encoder.compress_block_sorted(&vals, 10);
|
||||
compressed.extend_from_slice(compressed_data);
|
||||
@@ -137,7 +142,7 @@ pub mod tests {
|
||||
fn test_encode_unsorted_block_with_junk() {
|
||||
let mut compressed: Vec<u8> = Vec::new();
|
||||
let n = 128;
|
||||
let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32)*7u32 % 12).collect();
|
||||
let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32) * 7u32 % 12).collect();
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let compressed_data = encoder.compress_block_unsorted(&vals);
|
||||
compressed.extend_from_slice(compressed_data);
|
||||
@@ -159,15 +164,13 @@ pub mod tests {
|
||||
{
|
||||
let expected_length = 154;
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let input: Vec<u32> = (0u32..123u32)
|
||||
.map(|i| 4 + i * 7 / 2)
|
||||
.into_iter()
|
||||
.collect();
|
||||
let input: Vec<u32> = (0u32..123u32).map(|i| 4 + i * 7 / 2).into_iter().collect();
|
||||
for offset in &[0u32, 1u32, 2u32] {
|
||||
let encoded_data = encoder.compress_vint_sorted(&input, *offset);
|
||||
assert!(encoded_data.len() <= expected_length);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
let remaining_data = decoder.uncompress_vint_sorted(&encoded_data, *offset, input.len());
|
||||
let remaining_data =
|
||||
decoder.uncompress_vint_sorted(&encoded_data, *offset, input.len());
|
||||
assert_eq!(0, remaining_data.len());
|
||||
assert_eq!(input, decoder.output_array());
|
||||
}
|
||||
@@ -179,9 +182,7 @@ pub mod tests {
|
||||
fn bench_compress(b: &mut Bencher) {
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let data = tests::generate_array(NUM_DOCS_PER_BLOCK, 0.1);
|
||||
b.iter(|| {
|
||||
encoder.compress_block_sorted(&data, 0u32);
|
||||
});
|
||||
b.iter(|| { encoder.compress_block_sorted(&data, 0u32); });
|
||||
}
|
||||
|
||||
#[bench]
|
||||
@@ -190,9 +191,7 @@ pub mod tests {
|
||||
let data = tests::generate_array(NUM_DOCS_PER_BLOCK, 0.1);
|
||||
let compressed = encoder.compress_block_sorted(&data, 0u32);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
b.iter(|| {
|
||||
decoder.uncompress_block_sorted(compressed, 0u32);
|
||||
});
|
||||
b.iter(|| { decoder.uncompress_block_sorted(compressed, 0u32); });
|
||||
}
|
||||
|
||||
|
||||
@@ -202,9 +201,7 @@ pub mod tests {
|
||||
fn bench_compress_vint(b: &mut Bencher) {
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001);
|
||||
b.iter(|| {
|
||||
encoder.compress_vint_sorted(&data, 0u32);
|
||||
});
|
||||
b.iter(|| { encoder.compress_vint_sorted(&data, 0u32); });
|
||||
}
|
||||
|
||||
#[bench]
|
||||
@@ -213,9 +210,7 @@ pub mod tests {
|
||||
let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001);
|
||||
let compressed = encoder.compress_vint_sorted(&data, 0u32);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
b.iter(|| {
|
||||
decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT);
|
||||
});
|
||||
b.iter(|| { decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT); });
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -4,10 +4,10 @@ use std::cmp;
|
||||
use std::io::Write;
|
||||
use super::super::NUM_DOCS_PER_BLOCK;
|
||||
|
||||
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;
|
||||
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;
|
||||
|
||||
pub fn compress_sorted(vals: &mut [u32], mut output: &mut [u8], offset: u32) -> usize {
|
||||
let mut max_delta = 0;
|
||||
let mut max_delta = 0;
|
||||
{
|
||||
let mut local_offset = offset;
|
||||
for i in 0..NUM_DOCS_PER_BLOCK {
|
||||
@@ -24,7 +24,10 @@ pub fn compress_sorted(vals: &mut [u32], mut output: &mut [u8], offset: u32) ->
|
||||
for val in vals {
|
||||
bit_packer.write(*val, &mut output).unwrap();
|
||||
}
|
||||
1 + bit_packer.close(&mut output).expect("packing in memory should never fail")
|
||||
1 +
|
||||
bit_packer
|
||||
.close(&mut output)
|
||||
.expect("packing in memory should never fail")
|
||||
}
|
||||
|
||||
|
||||
@@ -36,36 +39,40 @@ pub struct BlockEncoder {
|
||||
}
|
||||
|
||||
impl BlockEncoder {
|
||||
|
||||
pub fn new() -> BlockEncoder {
|
||||
BlockEncoder {
|
||||
output: [0u8; COMPRESSED_BLOCK_MAX_SIZE],
|
||||
output_len: 0,
|
||||
input_buffer: [0u32; NUM_DOCS_PER_BLOCK],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub fn compress_block_sorted(&mut self, vals: &[u32], offset: u32) -> &[u8] {
|
||||
self.input_buffer.clone_from_slice(vals);
|
||||
let compressed_size = compress_sorted(&mut self.input_buffer, &mut self.output, offset);
|
||||
&self.output[..compressed_size]
|
||||
}
|
||||
|
||||
pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] {
|
||||
|
||||
pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] {
|
||||
let compressed_size: usize = {
|
||||
let mut output: &mut [u8] = &mut self.output;
|
||||
let max = vals.iter().cloned().max().expect("compress unsorted called with an empty array");
|
||||
let mut output: &mut [u8] = &mut self.output;
|
||||
let max = vals.iter()
|
||||
.cloned()
|
||||
.max()
|
||||
.expect("compress unsorted called with an empty array");
|
||||
let num_bits = compute_num_bits(max);
|
||||
output.write_all(&[num_bits]).unwrap();
|
||||
let mut bit_packer = BitPacker::new(num_bits as usize);
|
||||
for val in vals {
|
||||
bit_packer.write(*val, &mut output).unwrap();
|
||||
}
|
||||
1 + bit_packer.close(&mut output).expect("packing in memory should never fail")
|
||||
1 +
|
||||
bit_packer
|
||||
.close(&mut output)
|
||||
.expect("packing in memory should never fail")
|
||||
};
|
||||
&self.output[..compressed_size]
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
pub struct BlockDecoder {
|
||||
@@ -78,15 +85,18 @@ impl BlockDecoder {
|
||||
pub fn new() -> BlockDecoder {
|
||||
BlockDecoder::with_val(0u32)
|
||||
}
|
||||
|
||||
|
||||
pub fn with_val(val: u32) -> BlockDecoder {
|
||||
BlockDecoder {
|
||||
output: [val; COMPRESSED_BLOCK_MAX_SIZE],
|
||||
output_len: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn uncompress_block_sorted<'a>(&mut self, compressed_data: &'a [u8], mut offset: u32) -> &'a[u8] {
|
||||
|
||||
pub fn uncompress_block_sorted<'a>(&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
mut offset: u32)
|
||||
-> &'a [u8] {
|
||||
let consumed_size = {
|
||||
let num_bits = compressed_data[0];
|
||||
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize);
|
||||
@@ -96,13 +106,13 @@ impl BlockDecoder {
|
||||
self.output[i] = val;
|
||||
offset = val;
|
||||
}
|
||||
1 + (num_bits as usize * NUM_DOCS_PER_BLOCK + 7) / 8
|
||||
1 + (num_bits as usize * NUM_DOCS_PER_BLOCK + 7) / 8
|
||||
};
|
||||
self.output_len = NUM_DOCS_PER_BLOCK;
|
||||
&compressed_data[consumed_size..]
|
||||
}
|
||||
|
||||
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> &'a[u8] {
|
||||
|
||||
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> &'a [u8] {
|
||||
let num_bits = compressed_data[0];
|
||||
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize);
|
||||
for i in 0..NUM_DOCS_PER_BLOCK {
|
||||
@@ -112,16 +122,14 @@ impl BlockDecoder {
|
||||
self.output_len = NUM_DOCS_PER_BLOCK;
|
||||
&compressed_data[consumed_size..]
|
||||
}
|
||||
|
||||
|
||||
#[inline]
|
||||
pub fn output_array(&self,) -> &[u32] {
|
||||
pub fn output_array(&self) -> &[u32] {
|
||||
&self.output[..self.output_len]
|
||||
}
|
||||
|
||||
|
||||
#[inline]
|
||||
pub fn output(&self, idx: usize) -> u32 {
|
||||
self.output[idx]
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -1,28 +1,21 @@
|
||||
use super::super::NUM_DOCS_PER_BLOCK;
|
||||
|
||||
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;
|
||||
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;
|
||||
|
||||
mod simdcomp {
|
||||
use libc::size_t;
|
||||
|
||||
extern {
|
||||
pub fn compress_sorted(
|
||||
data: *const u32,
|
||||
output: *mut u8,
|
||||
offset: u32) -> size_t;
|
||||
extern "C" {
|
||||
pub fn compress_sorted(data: *const u32, output: *mut u8, offset: u32) -> size_t;
|
||||
|
||||
pub fn uncompress_sorted(
|
||||
compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
offset: u32) -> size_t;
|
||||
|
||||
pub fn compress_unsorted(
|
||||
data: *const u32,
|
||||
output: *mut u8) -> size_t;
|
||||
pub fn uncompress_sorted(compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
offset: u32)
|
||||
-> size_t;
|
||||
|
||||
pub fn uncompress_unsorted(
|
||||
compressed_data: *const u8,
|
||||
output: *mut u32) -> size_t;
|
||||
pub fn compress_unsorted(data: *const u32, output: *mut u8) -> size_t;
|
||||
|
||||
pub fn uncompress_unsorted(compressed_data: *const u8, output: *mut u32) -> size_t;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -49,24 +42,22 @@ pub struct BlockEncoder {
|
||||
}
|
||||
|
||||
impl BlockEncoder {
|
||||
|
||||
pub fn new() -> BlockEncoder {
|
||||
BlockEncoder {
|
||||
output: [0u8; COMPRESSED_BLOCK_MAX_SIZE],
|
||||
output_len: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub fn compress_block_sorted(&mut self, vals: &[u32], offset: u32) -> &[u8] {
|
||||
let compressed_size = compress_sorted(vals, &mut self.output, offset);
|
||||
&self.output[..compressed_size]
|
||||
}
|
||||
|
||||
|
||||
pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] {
|
||||
let compressed_size = compress_unsorted(vals, &mut self.output);
|
||||
&self.output[..compressed_size]
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
pub struct BlockDecoder {
|
||||
@@ -79,31 +70,34 @@ impl BlockDecoder {
|
||||
pub fn new() -> BlockDecoder {
|
||||
BlockDecoder::with_val(0u32)
|
||||
}
|
||||
|
||||
|
||||
pub fn with_val(val: u32) -> BlockDecoder {
|
||||
BlockDecoder {
|
||||
output: [val; COMPRESSED_BLOCK_MAX_SIZE],
|
||||
output_len: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn uncompress_block_sorted<'a>(&mut self, compressed_data: &'a [u8], offset: u32) -> &'a[u8] {
|
||||
|
||||
pub fn uncompress_block_sorted<'a>(&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
offset: u32)
|
||||
-> &'a [u8] {
|
||||
let consumed_size = uncompress_sorted(compressed_data, &mut self.output, offset);
|
||||
self.output_len = NUM_DOCS_PER_BLOCK;
|
||||
&compressed_data[consumed_size..]
|
||||
}
|
||||
|
||||
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> &'a[u8] {
|
||||
|
||||
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> &'a [u8] {
|
||||
let consumed_size = uncompress_unsorted(compressed_data, &mut self.output);
|
||||
self.output_len = NUM_DOCS_PER_BLOCK;
|
||||
&compressed_data[consumed_size..]
|
||||
}
|
||||
|
||||
|
||||
#[inline]
|
||||
pub fn output_array(&self,) -> &[u32] {
|
||||
pub fn output_array(&self) -> &[u32] {
|
||||
&self.output[..self.output_len]
|
||||
}
|
||||
|
||||
|
||||
#[inline]
|
||||
pub fn output(&self, idx: usize) -> u32 {
|
||||
self.output[idx]
|
||||
@@ -111,3 +105,16 @@ impl BlockDecoder {
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::BlockEncoder;
|
||||
|
||||
#[test]
|
||||
fn test_all_docs_compression_len() {
|
||||
let data: Vec<u32> = (0u32..128u32).collect();
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let compressed = encoder.compress_block_sorted(&data, 0u32);
|
||||
assert_eq!(compressed.len(), 17);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,8 +12,7 @@ pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32)
|
||||
output[byte_written] = next_byte | 128u8;
|
||||
byte_written += 1;
|
||||
break;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
output[byte_written] = next_byte;
|
||||
byte_written += 1;
|
||||
}
|
||||
@@ -34,8 +33,7 @@ pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
|
||||
output[byte_written] = next_byte | 128u8;
|
||||
byte_written += 1;
|
||||
break;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
output[byte_written] = next_byte;
|
||||
byte_written += 1;
|
||||
}
|
||||
@@ -45,10 +43,10 @@ pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn uncompress_sorted<'a>(
|
||||
compressed_data: &'a [u8],
|
||||
output: &mut [u32],
|
||||
offset: u32) -> &'a [u8] {
|
||||
pub fn uncompress_sorted<'a>(compressed_data: &'a [u8],
|
||||
output: &mut [u32],
|
||||
offset: u32)
|
||||
-> &'a [u8] {
|
||||
let mut read_byte = 0;
|
||||
let mut result = offset;
|
||||
let num_els = output.len();
|
||||
@@ -69,9 +67,7 @@ pub fn uncompress_sorted<'a>(
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn uncompress_unsorted<'a>(
|
||||
compressed_data: &'a [u8],
|
||||
output: &mut [u32]) -> &'a [u8] {
|
||||
pub fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> &'a [u8] {
|
||||
let mut read_byte = 0;
|
||||
let num_els = output.len();
|
||||
for i in 0..num_els {
|
||||
|
||||
@@ -3,28 +3,25 @@ mod streamvbyte {
|
||||
|
||||
use libc::size_t;
|
||||
|
||||
extern {
|
||||
pub fn streamvbyte_delta_encode(
|
||||
data: *const u32,
|
||||
num_els: u32,
|
||||
output: *mut u8,
|
||||
offset: u32) -> size_t;
|
||||
extern "C" {
|
||||
pub fn streamvbyte_delta_encode(data: *const u32,
|
||||
num_els: u32,
|
||||
output: *mut u8,
|
||||
offset: u32)
|
||||
-> size_t;
|
||||
|
||||
pub fn streamvbyte_delta_decode(
|
||||
compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
num_els: u32,
|
||||
offset: u32) -> size_t;
|
||||
|
||||
pub fn streamvbyte_encode(
|
||||
data: *const u32,
|
||||
num_els: u32,
|
||||
output: *mut u8) -> size_t;
|
||||
|
||||
pub fn streamvbyte_decode(
|
||||
compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
num_els: usize) -> size_t;
|
||||
pub fn streamvbyte_delta_decode(compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
num_els: u32,
|
||||
offset: u32)
|
||||
-> size_t;
|
||||
|
||||
pub fn streamvbyte_encode(data: *const u32, num_els: u32, output: *mut u8) -> size_t;
|
||||
|
||||
pub fn streamvbyte_decode(compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
num_els: usize)
|
||||
-> size_t;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -32,11 +29,10 @@ mod streamvbyte {
|
||||
#[inline(always)]
|
||||
pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], offset: u32) -> &'a [u8] {
|
||||
let compress_length = unsafe {
|
||||
streamvbyte::streamvbyte_delta_encode(
|
||||
input.as_ptr(),
|
||||
input.len() as u32,
|
||||
output.as_mut_ptr(),
|
||||
offset)
|
||||
streamvbyte::streamvbyte_delta_encode(input.as_ptr(),
|
||||
input.len() as u32,
|
||||
output.as_mut_ptr(),
|
||||
offset)
|
||||
};
|
||||
&output[..compress_length]
|
||||
}
|
||||
@@ -44,39 +40,29 @@ pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], offset: u32) ->
|
||||
#[inline(always)]
|
||||
pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
|
||||
let compress_length = unsafe {
|
||||
streamvbyte::streamvbyte_encode(
|
||||
input.as_ptr(),
|
||||
input.len() as u32,
|
||||
output.as_mut_ptr())
|
||||
};
|
||||
streamvbyte::streamvbyte_encode(input.as_ptr(), input.len() as u32, output.as_mut_ptr())
|
||||
};
|
||||
&output[..compress_length]
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn uncompress_sorted<'a>(
|
||||
compressed_data: &'a [u8],
|
||||
output: &mut [u32],
|
||||
offset: u32) -> &'a [u8] {
|
||||
pub fn uncompress_sorted<'a>(compressed_data: &'a [u8],
|
||||
output: &mut [u32],
|
||||
offset: u32)
|
||||
-> &'a [u8] {
|
||||
let consumed_bytes = unsafe {
|
||||
streamvbyte::streamvbyte_delta_decode(
|
||||
compressed_data.as_ptr(),
|
||||
output.as_mut_ptr(),
|
||||
output.len() as u32,
|
||||
offset)
|
||||
streamvbyte::streamvbyte_delta_decode(compressed_data.as_ptr(),
|
||||
output.as_mut_ptr(),
|
||||
output.len() as u32,
|
||||
offset)
|
||||
};
|
||||
&compressed_data[consumed_bytes..]
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn uncompress_unsorted<'a>(
|
||||
compressed_data: &'a [u8],
|
||||
output: &mut [u32]) -> &'a [u8] {
|
||||
pub fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> &'a [u8] {
|
||||
let consumed_bytes = unsafe {
|
||||
streamvbyte::streamvbyte_decode(
|
||||
compressed_data.as_ptr(),
|
||||
output.as_mut_ptr(),
|
||||
output.len())
|
||||
streamvbyte::streamvbyte_decode(compressed_data.as_ptr(), output.as_mut_ptr(), output.len())
|
||||
};
|
||||
&compressed_data[consumed_bytes..]
|
||||
}
|
||||
|
||||
|
||||
@@ -153,11 +153,10 @@ impl Index {
|
||||
|
||||
/// Returns the list of segments that are searchable
|
||||
pub fn searchable_segments(&self) -> Result<Vec<Segment>> {
|
||||
Ok(self
|
||||
.searchable_segment_metas()?
|
||||
.into_iter()
|
||||
.map(|segment_meta| self.segment(segment_meta))
|
||||
.collect())
|
||||
Ok(self.searchable_segment_metas()?
|
||||
.into_iter()
|
||||
.map(|segment_meta| self.segment(segment_meta))
|
||||
.collect())
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
@@ -186,13 +185,13 @@ impl Index {
|
||||
pub fn searchable_segment_metas(&self) -> Result<Vec<SegmentMeta>> {
|
||||
Ok(load_metas(self.directory())?.segments)
|
||||
}
|
||||
|
||||
|
||||
/// Returns the list of segment ids that are searchable.
|
||||
pub fn searchable_segment_ids(&self) -> Result<Vec<SegmentId>> {
|
||||
Ok(self.searchable_segment_metas()?
|
||||
.iter()
|
||||
.map(|segment_meta| segment_meta.id())
|
||||
.collect())
|
||||
.collect())
|
||||
}
|
||||
|
||||
/// Creates a new generation of searchers after
|
||||
@@ -203,9 +202,9 @@ impl Index {
|
||||
pub fn load_searchers(&self) -> Result<()> {
|
||||
let searchable_segments = self.searchable_segments()?;
|
||||
let segment_readers: Vec<SegmentReader> = try!(searchable_segments
|
||||
.into_iter()
|
||||
.map(SegmentReader::open)
|
||||
.collect());
|
||||
.into_iter()
|
||||
.map(SegmentReader::open)
|
||||
.collect());
|
||||
let searchers = (0..NUM_SEARCHERS)
|
||||
.map(|_| Searcher::from(segment_readers.clone()))
|
||||
.collect();
|
||||
|
||||
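For orientation, the methods touched in this hunk are typically driven like this (a hedged sketch: the `Index::create_in_ram` / `SchemaBuilder` setup is assumed from the era's public API and is not part of this commit):

    use tantivy::schema::SchemaBuilder;
    use tantivy::Index;

    fn main() {
        // Hypothetical setup: an empty in-RAM index, just to exercise the calls.
        let schema = SchemaBuilder::default().build();
        let index = Index::create_in_ram(schema);

        // Rebuild the pool of searchers from the currently searchable segments.
        index.load_searchers().unwrap();
        assert!(index.searchable_segment_ids().unwrap().is_empty());
    }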
@@ -2,9 +2,9 @@ use schema::Schema;
|
||||
use core::SegmentMeta;
|
||||
|
||||
/// Meta information about the `Index`.
///
///
/// This object is serialized on disk in the `meta.json` file.
/// It keeps information about
/// It keeps information about
/// * the searchable segments,
/// * the index docstamp
/// * the schema
|
||||
@@ -19,7 +19,7 @@ pub struct IndexMeta {
|
||||
impl IndexMeta {
|
||||
pub fn with_schema(schema: Schema) -> IndexMeta {
|
||||
IndexMeta {
|
||||
segments: vec!(),
|
||||
segments: vec![],
|
||||
schema: schema,
|
||||
opstamp: 0u64,
|
||||
}
|
||||
|
||||
@@ -40,4 +40,4 @@ lazy_static! {
|
||||
///
|
||||
/// If the process is killed and this file remains, it is safe to remove it manually.
|
||||
pub static ref LOCKFILE_FILEPATH: PathBuf = PathBuf::from(".tantivy-indexer.lock");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,20 +13,17 @@ pub struct GenerationItem<T> {
|
||||
|
||||
// See https://github.com/crossbeam-rs/crossbeam/issues/91
|
||||
struct NonLeakingMsQueue<T> {
|
||||
underlying_queue: MsQueue<T>
|
||||
underlying_queue: MsQueue<T>,
|
||||
}
|
||||
|
||||
impl<T> Default for NonLeakingMsQueue<T> {
|
||||
fn default() -> NonLeakingMsQueue<T> {
|
||||
NonLeakingMsQueue {
|
||||
underlying_queue: MsQueue::new(),
|
||||
}
|
||||
NonLeakingMsQueue { underlying_queue: MsQueue::new() }
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> NonLeakingMsQueue<T> {
|
||||
|
||||
fn pop(&self,) -> T {
|
||||
fn pop(&self) -> T {
|
||||
self.underlying_queue.pop()
|
||||
}
|
||||
|
||||
@@ -48,7 +45,6 @@ pub struct Pool<T> {
|
||||
}
|
||||
|
||||
impl<T> Pool<T> {
|
||||
|
||||
pub fn new() -> Pool<T> {
|
||||
Pool {
|
||||
queue: Arc::default(),
|
||||
@@ -68,23 +64,24 @@ impl<T> Pool<T> {
|
||||
}
|
||||
self.advertise_generation(next_generation);
|
||||
}
|
||||
|
||||
/// At the exit of this method,
|
||||
|
||||
/// At the exit of this method,
|
||||
/// - freshest_generation has a value greater than or equal to generation
|
||||
/// - freshest_generation has a value that has been advertised
|
||||
/// - freshest_generation has
|
||||
/// - freshest_generation has
|
||||
fn advertise_generation(&self, generation: usize) {
|
||||
// not optimal at all but the easiest to read proof.
|
||||
// not optimal at all but the easiest to read proof.
|
||||
loop {
|
||||
let former_generation = self.freshest_generation.load(Ordering::Acquire);
|
||||
if former_generation >= generation {
|
||||
break;
|
||||
}
|
||||
self.freshest_generation.compare_and_swap(former_generation, generation, Ordering::SeqCst);
|
||||
}
|
||||
self.freshest_generation
|
||||
.compare_and_swap(former_generation, generation, Ordering::SeqCst);
|
||||
}
|
||||
}
|
||||
|
||||
fn generation(&self,) -> usize {
|
||||
|
||||
fn generation(&self) -> usize {
|
||||
self.freshest_generation.load(Ordering::Acquire)
|
||||
}
|
||||
|
||||
@@ -94,19 +91,16 @@ impl<T> Pool<T> {
|
||||
let gen_item = self.queue.pop();
|
||||
if gen_item.generation >= generation {
|
||||
return LeasedItem {
|
||||
gen_item: Some(gen_item),
|
||||
recycle_queue: self.queue.clone(),
|
||||
}
|
||||
}
|
||||
else {
|
||||
gen_item: Some(gen_item),
|
||||
recycle_queue: self.queue.clone(),
|
||||
};
|
||||
} else {
|
||||
// this searcher is obsolete,
|
||||
// removing it from the pool.
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
pub struct LeasedItem<T> {
|
||||
@@ -115,23 +109,29 @@ pub struct LeasedItem<T> {
|
||||
}
|
||||
|
||||
impl<T> Deref for LeasedItem<T> {
|
||||
|
||||
type Target = T;
|
||||
|
||||
fn deref(&self) -> &T {
|
||||
&self.gen_item.as_ref().expect("Unwrapping a leased item should never fail").item // unwrap is safe here
|
||||
&self.gen_item
|
||||
.as_ref()
|
||||
.expect("Unwrapping a leased item should never fail")
|
||||
.item // unwrap is safe here
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> DerefMut for LeasedItem<T> {
|
||||
fn deref_mut(&mut self) -> &mut T {
|
||||
&mut self.gen_item.as_mut().expect("Unwrapping a mut leased item should never fail").item // unwrap is safe here
|
||||
&mut self.gen_item
|
||||
.as_mut()
|
||||
.expect("Unwrapping a mut leased item should never fail")
|
||||
.item // unwrap is safe here
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Drop for LeasedItem<T> {
|
||||
fn drop(&mut self) {
|
||||
let gen_item: GenerationItem<T> = mem::replace(&mut self.gen_item, None).expect("Unwrapping a leased item should never fail");
|
||||
let gen_item: GenerationItem<T> = mem::replace(&mut self.gen_item, None)
|
||||
.expect("Unwrapping a leased item should never fail");
|
||||
self.recycle_queue.push(gen_item);
|
||||
}
|
||||
}
|
||||
@@ -158,4 +158,4 @@ mod tests {
|
||||
assert_eq!(*pool.acquire(), 11);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,36 +13,35 @@ use std::fmt;
|
||||
|
||||
/// Holds a list of `SegmentReader`s ready for search.
///
/// It guarantees that the `Segment` will not be removed before
/// It guarantees that the `Segment` will not be removed before
/// the destruction of the `Searcher`.
///
///
pub struct Searcher {
segment_readers: Vec<SegmentReader>,
}
|
||||
|
||||
|
||||
impl Searcher {
|
||||
|
||||
/// Fetches a document from tantivy's store given a `DocAddress`.
|
||||
///
|
||||
/// The searcher uses the segment ordinal to route
/// the request to the right `Segment`.
/// the request to the right `Segment`.
|
||||
pub fn doc(&self, doc_address: &DocAddress) -> Result<Document> {
|
||||
let DocAddress(segment_local_id, doc_id) = *doc_address;
|
||||
let segment_reader = &self.segment_readers[segment_local_id as usize];
|
||||
segment_reader.doc(doc_id)
|
||||
}
|
||||
|
||||
|
||||
/// Returns the overall number of documents in the index.
|
||||
pub fn num_docs(&self,) -> DocId {
|
||||
pub fn num_docs(&self) -> DocId {
|
||||
self.segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.num_docs())
|
||||
.fold(0u32, |acc, val| acc + val)
|
||||
}
|
||||
|
||||
|
||||
/// Return the overall number of documents containing
|
||||
/// the given term.
|
||||
/// the given term.
|
||||
pub fn doc_freq(&self, term: &Term) -> u32 {
|
||||
self.segment_readers
|
||||
.iter()
|
||||
@@ -63,15 +62,15 @@ impl Searcher {
|
||||
}
|
||||
|
||||
/// Return the list of segment readers
|
||||
pub fn segment_readers(&self,) -> &[SegmentReader] {
|
||||
pub fn segment_readers(&self) -> &[SegmentReader] {
|
||||
&self.segment_readers
|
||||
}
|
||||
|
||||
|
||||
/// Returns the segment_reader associated with the given segment_ordinal
|
||||
pub fn segment_reader(&self, segment_ord: u32) -> &SegmentReader {
|
||||
&self.segment_readers[segment_ord as usize]
|
||||
}
|
||||
|
||||
|
||||
/// Runs a query on the segment readers wrapped by the searcher
|
||||
pub fn search<C: Collector>(&self, query: &Query, collector: &mut C) -> Result<TimerTree> {
|
||||
query.search(self, collector)
|
||||
@@ -81,9 +80,7 @@ impl Searcher {
|
||||
|
||||
impl From<Vec<SegmentReader>> for Searcher {
|
||||
fn from(segment_readers: Vec<SegmentReader>) -> Searcher {
|
||||
Searcher {
|
||||
segment_readers: segment_readers,
|
||||
}
|
||||
Searcher { segment_readers: segment_readers }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -95,4 +92,4 @@ impl fmt::Debug for Searcher {
|
||||
.collect::<Vec<_>>();
|
||||
write!(f, "Searcher({:?})", segment_ids)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -26,8 +26,8 @@ impl fmt::Debug for Segment {
|
||||
}
|
||||
|
||||
/// Creates a new segment given an `Index` and a `SegmentId`
|
||||
///
|
||||
/// The function is here to make it private outside `tantivy`.
|
||||
///
|
||||
/// The function is here to make it private outside `tantivy`.
|
||||
pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment {
|
||||
Segment {
|
||||
index: index,
|
||||
@@ -36,9 +36,8 @@ pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment {
|
||||
}
|
||||
|
||||
impl Segment {
|
||||
|
||||
/// Returns our index's schema.
|
||||
pub fn schema(&self,) -> Schema {
|
||||
pub fn schema(&self) -> Schema {
|
||||
self.index.schema()
|
||||
}
|
||||
|
||||
@@ -53,13 +52,13 @@ impl Segment {
|
||||
}
|
||||
|
||||
/// Returns the segment's id.
|
||||
pub fn id(&self,) -> SegmentId {
|
||||
pub fn id(&self) -> SegmentId {
|
||||
self.meta.id()
|
||||
}
|
||||
|
||||
/// Returns the relative path of a component of our segment.
|
||||
///
|
||||
/// It just joins the segment id with the extension
|
||||
///
|
||||
/// It just joins the segment id with the extension
|
||||
/// associated to a segment component.
|
||||
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
|
||||
self.meta.relative_path(component)
|
||||
@@ -77,14 +76,18 @@ impl Segment {
|
||||
}
|
||||
|
||||
/// Open one of the component file for a *regular* read.
|
||||
pub fn open_read(&self, component: SegmentComponent) -> result::Result<ReadOnlySource, OpenReadError> {
|
||||
pub fn open_read(&self,
|
||||
component: SegmentComponent)
|
||||
-> result::Result<ReadOnlySource, OpenReadError> {
|
||||
let path = self.relative_path(component);
|
||||
let source = try!(self.index.directory().open_read(&path));
|
||||
Ok(source)
|
||||
}
|
||||
|
||||
/// Open one of the component file for *regular* write.
|
||||
pub fn open_write(&mut self, component: SegmentComponent) -> result::Result<WritePtr, OpenWriteError> {
|
||||
pub fn open_write(&mut self,
|
||||
component: SegmentComponent)
|
||||
-> result::Result<WritePtr, OpenWriteError> {
|
||||
let path = self.relative_path(component);
|
||||
let write = try!(self.index.directory_mut().open_write(&path));
|
||||
Ok(write)
|
||||
@@ -114,10 +117,10 @@ mod tests {
|
||||
let mut index = Index::create_in_ram(SchemaBuilder::new().build());
|
||||
let segment = index.new_segment();
|
||||
let path = segment.relative_path(SegmentComponent::POSTINGS);
|
||||
|
||||
|
||||
let directory = index.directory_mut();
|
||||
directory.atomic_write(&*path, &vec!(0u8)).unwrap();
|
||||
|
||||
directory.atomic_write(&*path, &vec![0u8]).unwrap();
|
||||
|
||||
let living_files = HashSet::new();
|
||||
{
|
||||
let _file_protection = segment.protect_from_delete(SegmentComponent::POSTINGS);
|
||||
@@ -130,4 +133,4 @@ mod tests {
|
||||
assert!(!directory.exists(&*path));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,22 +6,18 @@ pub enum SegmentComponent {
|
||||
FIELDNORMS,
|
||||
TERMS,
|
||||
STORE,
|
||||
DELETE
|
||||
DELETE,
|
||||
}
|
||||
|
||||
impl SegmentComponent {
|
||||
|
||||
pub fn iterator() -> impl Iterator<Item=&'static SegmentComponent> {
|
||||
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [
|
||||
SegmentComponent::POSTINGS,
|
||||
SegmentComponent::POSITIONS,
|
||||
SegmentComponent::FASTFIELDS,
|
||||
SegmentComponent::FIELDNORMS,
|
||||
SegmentComponent::TERMS,
|
||||
SegmentComponent::STORE,
|
||||
SegmentComponent::DELETE
|
||||
];
|
||||
pub fn iterator() -> impl Iterator<Item = &'static SegmentComponent> {
|
||||
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [SegmentComponent::POSTINGS,
|
||||
SegmentComponent::POSITIONS,
|
||||
SegmentComponent::FASTFIELDS,
|
||||
SegmentComponent::FIELDNORMS,
|
||||
SegmentComponent::TERMS,
|
||||
SegmentComponent::STORE,
|
||||
SegmentComponent::DELETE];
|
||||
SEGMENT_COMPONENTS.into_iter()
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@ use std::sync::atomic;
|
||||
|
||||
/// Tantivy SegmentId.
|
||||
///
|
||||
/// Tantivy's segments are identified
/// Tantivy's segments are identified
/// by a UUID which is used to prefix the filenames
/// of all of the files associated with the segment.
|
||||
///
|
||||
@@ -52,14 +52,14 @@ impl SegmentId {
|
||||
/// We are using UUID4, so only 6 bits are fixed,
|
||||
/// and the rest is random.
|
||||
///
|
||||
/// Picking the first 8 chars is ok to identify
|
||||
/// Picking the first 8 chars is ok to identify
|
||||
/// segments in a display message.
|
||||
pub fn short_uuid_string(&self,) -> String {
|
||||
pub fn short_uuid_string(&self) -> String {
|
||||
(&self.0.simple().to_string()[..8]).to_string()
|
||||
}
|
||||
|
||||
/// Returns a segment uuid string.
|
||||
pub fn uuid_string(&self,) -> String {
|
||||
pub fn uuid_string(&self) -> String {
|
||||
self.0.simple().to_string()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,12 +17,11 @@ struct DeleteMeta {
|
||||
pub struct SegmentMeta {
|
||||
segment_id: SegmentId,
|
||||
max_doc: u32,
|
||||
deletes: Option<DeleteMeta>,
|
||||
deletes: Option<DeleteMeta>,
|
||||
}
|
||||
|
||||
impl SegmentMeta {
|
||||
|
||||
/// Creates a new segment meta for
|
||||
/// Creates a new segment meta for
|
||||
/// a segment with no deletes and no documents.
|
||||
pub fn new(segment_id: SegmentId) -> SegmentMeta {
|
||||
SegmentMeta {
|
||||
@@ -53,28 +52,28 @@ impl SegmentMeta {
|
||||
/// and are not used by any segment anymore.
|
||||
pub fn list_files(&self) -> HashSet<PathBuf> {
|
||||
SegmentComponent::iterator()
|
||||
.map(|component| {
|
||||
self.relative_path(*component)
|
||||
})
|
||||
.map(|component| self.relative_path(*component))
|
||||
.collect::<HashSet<PathBuf>>()
|
||||
|
||||
|
||||
}
|
||||
|
||||
/// Returns the relative path of a component of our segment.
|
||||
///
|
||||
/// It just joins the segment id with the extension
|
||||
///
|
||||
/// It just joins the segment id with the extension
|
||||
/// associated to a segment component.
|
||||
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
|
||||
let mut path = self.id().uuid_string();
|
||||
path.push_str(&*match component {
|
||||
SegmentComponent::POSITIONS => ".pos".to_string(),
|
||||
SegmentComponent::POSTINGS => ".idx".to_string(),
|
||||
SegmentComponent::TERMS => ".term".to_string(),
|
||||
SegmentComponent::STORE => ".store".to_string(),
|
||||
SegmentComponent::FASTFIELDS => ".fast".to_string(),
|
||||
SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
|
||||
SegmentComponent::DELETE => {format!(".{}.del", self.delete_opstamp().unwrap_or(0))},
|
||||
});
|
||||
SegmentComponent::POSITIONS => ".pos".to_string(),
|
||||
SegmentComponent::POSTINGS => ".idx".to_string(),
|
||||
SegmentComponent::TERMS => ".term".to_string(),
|
||||
SegmentComponent::STORE => ".store".to_string(),
|
||||
SegmentComponent::FASTFIELDS => ".fast".to_string(),
|
||||
SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
|
||||
SegmentComponent::DELETE => {
|
||||
format!(".{}.del", self.delete_opstamp().unwrap_or(0))
|
||||
}
|
||||
});
|
||||
PathBuf::from(path)
|
||||
}
|
||||
|
||||
@@ -95,9 +94,7 @@ impl SegmentMeta {
|
||||
/// Returns the opstamp of the last delete operation
|
||||
/// taken in account in this segment.
|
||||
pub fn delete_opstamp(&self) -> Option<u64> {
|
||||
self.deletes
|
||||
.as_ref()
|
||||
.map(|delete_meta| delete_meta.opstamp)
|
||||
self.deletes.as_ref().map(|delete_meta| delete_meta.opstamp)
|
||||
}
|
||||
|
||||
/// Returns true iff the segment meta contains
|
||||
@@ -114,8 +111,8 @@ impl SegmentMeta {
|
||||
#[doc(hidden)]
|
||||
pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) {
|
||||
self.deletes = Some(DeleteMeta {
|
||||
num_deleted_docs: num_deleted_docs,
|
||||
opstamp: opstamp,
|
||||
});
|
||||
num_deleted_docs: num_deleted_docs,
|
||||
opstamp: opstamp,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -60,7 +60,7 @@ impl SegmentReader {
|
||||
pub fn max_doc(&self) -> DocId {
|
||||
self.segment_meta.max_doc()
|
||||
}
|
||||
|
||||
|
||||
/// Returns the number of documents.
|
||||
/// Deleted documents are not counted.
|
||||
///
|
||||
@@ -69,7 +69,7 @@ impl SegmentReader {
|
||||
pub fn num_docs(&self) -> DocId {
|
||||
self.segment_meta.num_docs()
|
||||
}
|
||||
|
||||
|
||||
/// Return the number of documents that have been
|
||||
/// deleted in the segment.
|
||||
pub fn num_deleted_docs(&self) -> DocId {
|
||||
@@ -91,39 +91,39 @@ impl SegmentReader {
|
||||
///
|
||||
/// # Panics
|
||||
/// May panic if the index is corrupted.
|
||||
pub fn get_fast_field_reader<TFastFieldReader: FastFieldReader>(&self, field: Field) -> fastfield::Result<TFastFieldReader> {
|
||||
pub fn get_fast_field_reader<TFastFieldReader: FastFieldReader>
|
||||
(&self,
|
||||
field: Field)
|
||||
-> fastfield::Result<TFastFieldReader> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if !TFastFieldReader::is_enabled(field_entry.field_type()) {
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
}
|
||||
else {
|
||||
Ok(
|
||||
self.fast_fields_reader
|
||||
.open_reader(field)
|
||||
.expect("Fast field file corrupted.")
|
||||
)
|
||||
} else {
|
||||
Ok(self.fast_fields_reader
|
||||
.open_reader(field)
|
||||
.expect("Fast field file corrupted."))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Accessor to the segment's `Field norms`'s reader.
|
||||
///
|
||||
/// Field norms are the length (in tokens) of the fields.
|
||||
/// It is used in the computation of the [TfIdf](https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html).
|
||||
///
|
||||
/// They are simply stored as a fast field, serialized in
|
||||
/// the `.fieldnorm` file of the segment.
|
||||
/// They are simply stored as a fast field, serialized in
|
||||
/// the `.fieldnorm` file of the segment.
|
||||
pub fn get_fieldnorms_reader(&self, field: Field) -> Option<U64FastFieldReader> {
|
||||
self.fieldnorms_reader.open_reader(field)
|
||||
self.fieldnorms_reader.open_reader(field)
|
||||
}
|
||||
|
||||
|
||||
/// Returns the number of documents containing the term.
|
||||
pub fn doc_freq(&self, term: &Term) -> u32 {
|
||||
match self.get_term_info(term) {
|
||||
Some(term_info) => term_info.doc_freq,
|
||||
None => 0,
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// Accessor to the segment's `StoreReader`.
|
||||
pub fn get_store_reader(&self) -> &StoreReader {
|
||||
&self.store_reader
|
||||
@@ -136,46 +136,44 @@ impl SegmentReader {
|
||||
let term_infos = try!(FstMap::from_source(source));
|
||||
let store_reader = StoreReader::from(try!(segment.open_read(SegmentComponent::STORE)));
|
||||
let postings_shared_mmap = try!(segment.open_read(SegmentComponent::POSTINGS));
|
||||
|
||||
|
||||
let fast_field_data = try!(segment.open_read(SegmentComponent::FASTFIELDS));
|
||||
let fast_fields_reader = try!(FastFieldsReader::open(fast_field_data));
|
||||
|
||||
|
||||
let fieldnorms_data = try!(segment.open_read(SegmentComponent::FIELDNORMS));
|
||||
let fieldnorms_reader = try!(FastFieldsReader::open(fieldnorms_data));
|
||||
|
||||
|
||||
let positions_data = segment
|
||||
.open_read(SegmentComponent::POSITIONS)
|
||||
.unwrap_or_else(|_| ReadOnlySource::empty());
|
||||
|
||||
let delete_bitset =
|
||||
if segment.meta().has_deletes() {
|
||||
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
|
||||
DeleteBitSet::open(delete_data)
|
||||
}
|
||||
else {
|
||||
DeleteBitSet::empty()
|
||||
};
|
||||
|
||||
|
||||
let delete_bitset = if segment.meta().has_deletes() {
|
||||
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
|
||||
DeleteBitSet::open(delete_data)
|
||||
} else {
|
||||
DeleteBitSet::empty()
|
||||
};
|
||||
|
||||
let schema = segment.schema();
|
||||
Ok(SegmentReader {
|
||||
segment_meta: segment.meta().clone(),
|
||||
postings_data: postings_shared_mmap,
|
||||
term_infos: Arc::new(term_infos),
|
||||
segment_id: segment.id(),
|
||||
store_reader: store_reader,
|
||||
fast_fields_reader: Arc::new(fast_fields_reader),
|
||||
fieldnorms_reader: Arc::new(fieldnorms_reader),
|
||||
delete_bitset: delete_bitset,
|
||||
positions_data: positions_data,
|
||||
schema: schema,
|
||||
})
|
||||
segment_meta: segment.meta().clone(),
|
||||
postings_data: postings_shared_mmap,
|
||||
term_infos: Arc::new(term_infos),
|
||||
segment_id: segment.id(),
|
||||
store_reader: store_reader,
|
||||
fast_fields_reader: Arc::new(fast_fields_reader),
|
||||
fieldnorms_reader: Arc::new(fieldnorms_reader),
|
||||
delete_bitset: delete_bitset,
|
||||
positions_data: positions_data,
|
||||
schema: schema,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
/// Return the term dictionary datastructure.
|
||||
pub fn term_infos(&self) -> &FstMap<TermInfo> {
|
||||
&self.term_infos
|
||||
}
|
||||
|
||||
|
||||
/// Returns the document (or to be accurate, its stored field)
|
||||
/// bearing the given doc id.
|
||||
/// This method is slow and should seldom be called from
|
||||
@@ -186,15 +184,18 @@ impl SegmentReader {
|
||||
|
||||
|
||||
/// Returns the segment postings associated with the term, and with the given option,
|
||||
/// or `None` if the term has never been encountered and indexed.
///
/// If the field was not indexed with the indexing options that cover
/// or `None` if the term has never been encountered and indexed.
///
/// If the field was not indexed with the indexing options that cover
/// the requested options, the method does not fail
|
||||
/// and returns a `SegmentPostings` with as much information as possible.
|
||||
///
|
||||
/// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a `TextIndexingOptions`
|
||||
/// that does not index position will return a `SegmentPostings` with `DocId`s and frequencies.
|
||||
pub fn read_postings(&self, term: &Term, option: SegmentPostingsOption) -> Option<SegmentPostings> {
|
||||
pub fn read_postings(&self,
|
||||
term: &Term,
|
||||
option: SegmentPostingsOption)
|
||||
-> Option<SegmentPostings> {
|
||||
let field = term.field();
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let term_info = get!(self.get_term_info(&term));
|
||||
@@ -204,44 +205,40 @@ impl SegmentReader {
|
||||
FieldType::Str(ref options) => {
|
||||
let indexing_options = options.get_indexing_options();
|
||||
match option {
|
||||
SegmentPostingsOption::NoFreq => {
|
||||
FreqHandler::new_without_freq()
|
||||
}
|
||||
SegmentPostingsOption::NoFreq => FreqHandler::new_without_freq(),
|
||||
SegmentPostingsOption::Freq => {
|
||||
if indexing_options.is_termfreq_enabled() {
|
||||
FreqHandler::new_with_freq()
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
FreqHandler::new_without_freq()
|
||||
}
|
||||
}
|
||||
SegmentPostingsOption::FreqAndPositions => {
|
||||
if indexing_options == TextIndexingOptions::TokenizedWithFreqAndPosition {
|
||||
let offseted_position_data = &self.positions_data[term_info.positions_offset as usize ..];
|
||||
let offseted_position_data = &self.positions_data[term_info.positions_offset as
|
||||
usize..];
|
||||
FreqHandler::new_with_freq_and_position(offseted_position_data)
|
||||
}
|
||||
else if indexing_options.is_termfreq_enabled()
|
||||
{
|
||||
} else if indexing_options.is_termfreq_enabled() {
|
||||
FreqHandler::new_with_freq()
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
FreqHandler::new_without_freq()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
FreqHandler::new_without_freq()
|
||||
}
|
||||
_ => FreqHandler::new_without_freq(),
|
||||
};
|
||||
Some(SegmentPostings::from_data(term_info.doc_freq, postings_data, &self.delete_bitset, freq_handler))
|
||||
Some(SegmentPostings::from_data(term_info.doc_freq,
|
||||
postings_data,
|
||||
&self.delete_bitset,
|
||||
freq_handler))
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Returns the posting list associated with a term.
|
||||
///
|
||||
/// If the term is not found, return None.
|
||||
/// Even when non-null, because of deletes, the posting object
|
||||
/// Even when non-null, because of deletes, the posting object
|
||||
/// returned by this method may contain no documents.
|
||||
pub fn read_postings_all_info(&self, term: &Term) -> Option<SegmentPostings> {
|
||||
let field_entry = self.schema.get_field_entry(term.field());
|
||||
@@ -249,15 +246,18 @@ impl SegmentReader {
|
||||
FieldType::Str(ref text_options) => {
|
||||
match text_options.get_indexing_options() {
|
||||
TextIndexingOptions::TokenizedWithFreq => SegmentPostingsOption::Freq,
|
||||
TextIndexingOptions::TokenizedWithFreqAndPosition => SegmentPostingsOption::FreqAndPositions,
|
||||
TextIndexingOptions::TokenizedWithFreqAndPosition => {
|
||||
SegmentPostingsOption::FreqAndPositions
|
||||
}
|
||||
_ => SegmentPostingsOption::NoFreq,
|
||||
}
|
||||
}
|
||||
FieldType::U64(_) | FieldType::I64(_) => SegmentPostingsOption::NoFreq
|
||||
FieldType::U64(_) |
|
||||
FieldType::I64(_) => SegmentPostingsOption::NoFreq,
|
||||
};
|
||||
self.read_postings(term, segment_posting_option)
|
||||
}
|
||||
|
||||
|
||||
/// Returns the term info associated with the term.
|
||||
pub fn get_term_info(&self, term: &Term) -> Option<TermInfo> {
|
||||
self.term_infos.get(term.as_slice())
|
||||
|
||||
@@ -64,14 +64,15 @@ impl<'a> TermIterator<'a> {
|
||||
loop {
|
||||
match self.heap.peek() {
|
||||
Some(&ref next_heap_it) if next_heap_it.term == self.current_term => {}
|
||||
_ => { break; }
|
||||
_ => {
|
||||
break;
|
||||
}
|
||||
}
|
||||
let next_heap_it = self.heap.pop().unwrap(); // safe : we peeked beforehand
|
||||
self.current_segment_ords.push(next_heap_it.segment_ord);
|
||||
}
|
||||
true
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
@@ -92,17 +93,18 @@ impl<'a> TermIterator<'a> {
|
||||
/// This method may be called
|
||||
/// iff advance() has been called before
|
||||
/// and "true" was returned.
|
||||
pub fn segment_ords(&self) -> &[usize]{
|
||||
pub fn segment_ords(&self) -> &[usize] {
|
||||
&self.current_segment_ords[..]
|
||||
}
|
||||
|
||||
fn advance_segments(&mut self) {
|
||||
for segment_ord in self.current_segment_ords.drain(..) {
|
||||
if let Some(term) = self.key_streams[segment_ord].next() {
|
||||
self.heap.push(HeapItem {
|
||||
term: Term::from_bytes(term),
|
||||
segment_ord: segment_ord,
|
||||
});
|
||||
self.heap
|
||||
.push(HeapItem {
|
||||
term: Term::from_bytes(term),
|
||||
segment_ord: segment_ord,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -114,8 +116,7 @@ impl<'a, 'f> Streamer<'a> for TermIterator<'f> {
|
||||
fn next(&'a mut self) -> Option<Self::Item> {
|
||||
if self.advance() {
|
||||
Some(&self.current_term)
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
@@ -123,12 +124,10 @@ impl<'a, 'f> Streamer<'a> for TermIterator<'f> {
|
||||
|
||||
impl<'a> From<&'a [SegmentReader]> for TermIterator<'a> {
|
||||
fn from(segment_readers: &'a [SegmentReader]) -> TermIterator<'a> {
|
||||
TermIterator::new(
|
||||
segment_readers
|
||||
.iter()
|
||||
.map(|reader| reader.term_infos().keys())
|
||||
.collect()
|
||||
)
|
||||
TermIterator::new(segment_readers
|
||||
.iter()
|
||||
.map(|reader| reader.term_infos().keys())
|
||||
.collect())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -180,4 +179,4 @@ mod tests {
|
||||
assert_eq!(terms, "abcdef");
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,18 +20,17 @@ pub struct FstMapBuilder<W: Write, V: BinarySerializable> {
|
||||
}
|
||||
|
||||
impl<W: Write, V: BinarySerializable> FstMapBuilder<W, V> {
|
||||
|
||||
pub fn new(w: W) -> io::Result<FstMapBuilder<W, V>> {
|
||||
let fst_builder = try!(fst::MapBuilder::new(w).map_err(convert_fst_error));
|
||||
Ok(FstMapBuilder {
|
||||
fst_builder: fst_builder,
|
||||
data: Vec::new(),
|
||||
_phantom_: PhantomData,
|
||||
})
|
||||
fst_builder: fst_builder,
|
||||
data: Vec::new(),
|
||||
_phantom_: PhantomData,
|
||||
})
|
||||
}
|
||||
|
||||
/// Horribly unsafe, nobody should ever do that... except me :)
|
||||
///
|
||||
///
|
||||
/// If used, it must be used by systematically alternating calls
|
||||
/// to insert_key and insert_value.
|
||||
///
|
||||
@@ -39,8 +38,8 @@ impl<W: Write, V: BinarySerializable> FstMapBuilder<W, V> {
|
||||
/// in a nice way.
|
||||
pub fn insert_key(&mut self, key: &[u8]) -> io::Result<()> {
|
||||
try!(self.fst_builder
|
||||
.insert(key, self.data.len() as u64)
|
||||
.map_err(convert_fst_error));
|
||||
.insert(key, self.data.len() as u64)
|
||||
.map_err(convert_fst_error));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -53,17 +52,14 @@ impl<W: Write, V: BinarySerializable> FstMapBuilder<W, V> {
|
||||
#[cfg(test)]
|
||||
pub fn insert(&mut self, key: &[u8], value: &V) -> io::Result<()> {
|
||||
try!(self.fst_builder
|
||||
.insert(key, self.data.len() as u64)
|
||||
.map_err(convert_fst_error));
|
||||
.insert(key, self.data.len() as u64)
|
||||
.map_err(convert_fst_error));
|
||||
try!(value.serialize(&mut self.data));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn finish(self,) -> io::Result<W> {
|
||||
let mut file = try!(
|
||||
self.fst_builder
|
||||
.into_inner()
|
||||
.map_err(convert_fst_error));
|
||||
pub fn finish(self) -> io::Result<W> {
|
||||
let mut file = try!(self.fst_builder.into_inner().map_err(convert_fst_error));
|
||||
let footer_size = self.data.len() as u32;
|
||||
try!(file.write_all(&self.data));
|
||||
try!((footer_size as u32).serialize(&mut file));
|
||||
@@ -81,31 +77,35 @@ pub struct FstMap<V: BinarySerializable> {
|
||||
|
||||
fn open_fst_index(source: ReadOnlySource) -> io::Result<fst::Map> {
|
||||
Ok(fst::Map::from(match source {
|
||||
ReadOnlySource::Anonymous(data) => try!(Fst::from_shared_bytes(data.data, data.start, data.len).map_err(convert_fst_error)),
|
||||
ReadOnlySource::Mmap(mmap_readonly) => try!(Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)),
|
||||
}))
|
||||
ReadOnlySource::Anonymous(data) => {
|
||||
try!(Fst::from_shared_bytes(data.data, data.start, data.len)
|
||||
.map_err(convert_fst_error))
|
||||
}
|
||||
ReadOnlySource::Mmap(mmap_readonly) => {
|
||||
try!(Fst::from_mmap(mmap_readonly).map_err(convert_fst_error))
|
||||
}
|
||||
}))
|
||||
}
|
||||
|
||||
impl<V: BinarySerializable> FstMap<V> {
|
||||
|
||||
pub fn keys(&self,) -> fst::map::Keys {
|
||||
pub fn keys(&self) -> fst::map::Keys {
|
||||
self.fst_index.keys()
|
||||
}
|
||||
|
||||
pub fn from_source(source: ReadOnlySource) -> io::Result<FstMap<V>> {
|
||||
pub fn from_source(source: ReadOnlySource) -> io::Result<FstMap<V>> {
|
||||
let total_len = source.len();
|
||||
let length_offset = total_len - 4;
|
||||
let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..];
|
||||
let footer_size = try!(u32::deserialize(&mut split_len_buffer)) as usize;
|
||||
let footer_size = try!(u32::deserialize(&mut split_len_buffer)) as usize;
|
||||
let split_len = length_offset - footer_size;
|
||||
let fst_source = source.slice(0, split_len);
|
||||
let values_source = source.slice(split_len, length_offset);
|
||||
let fst_index = try!(open_fst_index(fst_source));
|
||||
Ok(FstMap {
|
||||
fst_index: fst_index,
|
||||
values_mmap: values_source,
|
||||
_phantom_: PhantomData,
|
||||
})
|
||||
fst_index: fst_index,
|
||||
values_mmap: values_source,
|
||||
_phantom_: PhantomData,
|
||||
})
|
||||
}
|
||||
|
||||
fn read_value(&self, offset: u64) -> V {
|
||||
|
||||
@@ -114,9 +114,9 @@ mod tests {
|
||||
let mut skip_list: SkipList<()> = SkipList::from(output.as_slice());
|
||||
assert_eq!(skip_list.next().unwrap(), (0, ()));
|
||||
skip_list.seek(431);
|
||||
assert_eq!(skip_list.next().unwrap(), (431,()) );
|
||||
assert_eq!(skip_list.next().unwrap(), (431, ()));
|
||||
skip_list.seek(1003);
|
||||
assert_eq!(skip_list.next().unwrap(), (1004,()) );
|
||||
assert_eq!(skip_list.next().unwrap(), (1004, ()));
|
||||
assert_eq!(skip_list.next(), None);
|
||||
}
|
||||
|
||||
|
||||
@@ -13,14 +13,12 @@ struct Layer<'a, T> {
|
||||
}
|
||||
|
||||
impl<'a, T: BinarySerializable> Iterator for Layer<'a, T> {
|
||||
|
||||
type Item = (DocId, T);
|
||||
|
||||
fn next(&mut self,)-> Option<(DocId, T)> {
|
||||
fn next(&mut self) -> Option<(DocId, T)> {
|
||||
if self.next_id == u32::max_value() {
|
||||
None
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
let cur_val = T::deserialize(&mut self.cursor).unwrap();
|
||||
let cur_id = self.next_id;
|
||||
self.next_id = u32::deserialize(&mut self.cursor).unwrap_or(u32::max_value());
|
||||
@@ -31,7 +29,7 @@ impl<'a, T: BinarySerializable> Iterator for Layer<'a, T> {
|
||||
|
||||
impl<'a, T: BinarySerializable> From<&'a [u8]> for Layer<'a, T> {
|
||||
fn from(data: &'a [u8]) -> Layer<'a, T> {
|
||||
let mut cursor = data;
|
||||
let mut cursor = data;
|
||||
let next_id = u32::deserialize(&mut cursor).unwrap_or(u32::max_value());
|
||||
Layer {
|
||||
data: data,
|
||||
@@ -43,7 +41,6 @@ impl<'a, T: BinarySerializable> From<&'a [u8]> for Layer<'a, T> {
|
||||
}
|
||||
|
||||
impl<'a, T: BinarySerializable> Layer<'a, T> {
|
||||
|
||||
fn empty() -> Layer<'a, T> {
|
||||
Layer {
|
||||
data: &EMPTY,
|
||||
@@ -53,11 +50,11 @@ impl<'a, T: BinarySerializable> Layer<'a, T> {
|
||||
}
|
||||
}
|
||||
|
||||
fn seek_offset(&mut self, offset: usize) {
|
||||
fn seek_offset(&mut self, offset: usize) {
|
||||
self.cursor = &self.data[offset..];
|
||||
self.next_id = u32::deserialize(&mut self.cursor).unwrap_or(u32::max_value());
|
||||
}
|
||||
|
||||
|
||||
// Returns the last element (key, val)
|
||||
// such that (key < doc_id)
|
||||
//
|
||||
@@ -67,8 +64,12 @@ impl<'a, T: BinarySerializable> Layer<'a, T> {
|
||||
let mut val = None;
|
||||
while self.next_id < doc_id {
|
||||
match self.next() {
|
||||
None => { break; },
|
||||
v => { val = v; }
|
||||
None => {
|
||||
break;
|
||||
}
|
||||
v => {
|
||||
val = v;
|
||||
}
|
||||
}
|
||||
}
|
||||
val
|
||||
@@ -82,16 +83,14 @@ pub struct SkipList<'a, T: BinarySerializable> {
|
||||
}
|
||||
|
||||
impl<'a, T: BinarySerializable> Iterator for SkipList<'a, T> {
|
||||
|
||||
type Item = (DocId, T);
|
||||
|
||||
fn next(&mut self,)-> Option<(DocId, T)> {
|
||||
fn next(&mut self) -> Option<(DocId, T)> {
|
||||
self.data_layer.next()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: BinarySerializable> SkipList<'a, T> {
|
||||
|
||||
pub fn seek(&mut self, doc_id: DocId) -> Option<(DocId, T)> {
|
||||
let mut next_layer_skip: Option<(DocId, u32)> = None;
|
||||
for skip_layer in &mut self.skip_layers {
|
||||
@@ -99,39 +98,33 @@ impl<'a, T: BinarySerializable> SkipList<'a, T> {
|
||||
skip_layer.seek_offset(offset as usize);
|
||||
}
|
||||
next_layer_skip = skip_layer.seek(doc_id);
|
||||
}
|
||||
if let Some((_, offset)) = next_layer_skip {
|
||||
self.data_layer.seek_offset(offset as usize);
|
||||
}
|
||||
self.data_layer.seek(doc_id)
|
||||
}
|
||||
if let Some((_, offset)) = next_layer_skip {
|
||||
self.data_layer.seek_offset(offset as usize);
|
||||
}
|
||||
self.data_layer.seek(doc_id)
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
impl<'a, T: BinarySerializable> From<&'a [u8]> for SkipList<'a, T> {
|
||||
|
||||
fn from(mut data: &'a [u8]) -> SkipList<'a, T> {
|
||||
let offsets: Vec<u32> = Vec::deserialize(&mut data).unwrap();
|
||||
let num_layers = offsets.len();
|
||||
let layers_data: &[u8] = data;
|
||||
let data_layer: Layer<'a, T> =
|
||||
if num_layers == 0 { Layer::empty() }
|
||||
else {
|
||||
let first_layer_data: &[u8] = &layers_data[..offsets[0] as usize];
|
||||
Layer::from(first_layer_data)
|
||||
};
|
||||
let data_layer: Layer<'a, T> = if num_layers == 0 {
|
||||
Layer::empty()
|
||||
} else {
|
||||
let first_layer_data: &[u8] = &layers_data[..offsets[0] as usize];
|
||||
Layer::from(first_layer_data)
|
||||
};
|
||||
let skip_layers = (0..max(1, num_layers) - 1)
|
||||
.map(|i| (offsets[i] as usize, offsets[i + 1] as usize))
|
||||
.map(|(start, stop)| {
|
||||
Layer::from(&layers_data[start..stop])
|
||||
})
|
||||
.map(|(start, stop)| Layer::from(&layers_data[start..stop]))
|
||||
.collect();
|
||||
SkipList {
|
||||
skip_layers: skip_layers,
|
||||
data_layer: data_layer,
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -13,8 +13,7 @@ struct LayerBuilder<T: BinarySerializable> {
|
||||
}
|
||||
|
||||
impl<T: BinarySerializable> LayerBuilder<T> {
|
||||
|
||||
fn written_size(&self,) -> usize {
|
||||
fn written_size(&self) -> usize {
|
||||
self.buffer.len()
|
||||
}
|
||||
|
||||
@@ -42,8 +41,9 @@ impl<T: BinarySerializable> LayerBuilder<T> {
|
||||
Ok(if self.remaining == 0 {
|
||||
self.remaining = self.period;
|
||||
Some((doc_id, offset))
|
||||
}
|
||||
else { None })
|
||||
} else {
|
||||
None
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -56,7 +56,6 @@ pub struct SkipListBuilder<T: BinarySerializable> {
|
||||
|
||||
|
||||
impl<T: BinarySerializable> SkipListBuilder<T> {
|
||||
|
||||
pub fn new(period: usize) -> SkipListBuilder<T> {
|
||||
SkipListBuilder {
|
||||
period: period,
|
||||
@@ -78,11 +77,13 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
|
||||
let mut skip_pointer = try!(self.data_layer.insert(doc_id, dest));
|
||||
loop {
|
||||
skip_pointer = match skip_pointer {
|
||||
Some((skip_doc_id, skip_offset)) =>
|
||||
try!(self
|
||||
.get_skip_layer(layer_id)
|
||||
.insert(skip_doc_id, &skip_offset)),
|
||||
None => { return Ok(()); }
|
||||
Some((skip_doc_id, skip_offset)) => {
|
||||
try!(self.get_skip_layer(layer_id)
|
||||
.insert(skip_doc_id, &skip_offset))
|
||||
}
|
||||
None => {
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
layer_id += 1;
|
||||
}
|
||||
|
||||
@@ -9,7 +9,7 @@ pub fn is_power_of_2(val: u32) -> bool {
|
||||
|
||||
#[inline]
|
||||
pub fn jump_needed(val: u32) -> bool {
|
||||
val > 3 && is_power_of_2(val)
|
||||
val > 3 && is_power_of_2(val)
|
||||
}
|
||||
|
||||
|
||||
@@ -24,7 +24,6 @@ pub struct ExpUnrolledLinkedList {
|
||||
}
|
||||
|
||||
impl ExpUnrolledLinkedList {
|
||||
|
||||
pub fn iter<'a>(&self, addr: u32, heap: &'a Heap) -> ExpUnrolledLinkedListIterator<'a> {
|
||||
ExpUnrolledLinkedListIterator {
|
||||
heap: heap,
|
||||
@@ -42,10 +41,10 @@ impl ExpUnrolledLinkedList {
|
||||
// the next block as a size of (length so far),
|
||||
// and we need to add 1u32 to store the pointer
|
||||
// to the next element.
|
||||
let new_block_size: usize = (self.len as usize + 1) * mem::size_of::<u32>();
|
||||
let new_block_size: usize = (self.len as usize + 1) * mem::size_of::<u32>();
|
||||
let new_block_addr: u32 = heap.allocate_space(new_block_size);
|
||||
heap.set(self.end, &new_block_addr);
|
||||
self.end = new_block_addr;
|
||||
self.end = new_block_addr;
|
||||
}
|
||||
heap.set(self.end, &val);
|
||||
self.end += mem::size_of::<u32>() as u32;
|
||||
@@ -77,23 +76,21 @@ pub struct ExpUnrolledLinkedListIterator<'a> {
|
||||
impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
|
||||
type Item = u32;
|
||||
|
||||
fn next(&mut self,) -> Option<u32> {
|
||||
fn next(&mut self) -> Option<u32> {
|
||||
if self.consumed == self.len {
|
||||
None
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
let addr: u32;
|
||||
self.consumed += 1;
|
||||
if jump_needed(self.consumed) {
|
||||
addr = *self.heap.get_mut_ref(self.addr);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
addr = self.addr;
|
||||
}
|
||||
self.addr = addr + mem::size_of::<u32>() as u32;
|
||||
Some(*self.heap.get_mut_ref(addr))
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -103,7 +100,7 @@ impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
|
||||
use super::*;
|
||||
use super::super::heap::Heap;
|
||||
use test::Bencher;
|
||||
@@ -147,7 +144,7 @@ mod tests {
|
||||
|
||||
#[bench]
|
||||
fn bench_push_stack(bench: &mut Bencher) {
|
||||
let heap = Heap::with_capacity(64_000_000);
|
||||
let heap = Heap::with_capacity(64_000_000);
|
||||
bench.iter(|| {
|
||||
let mut stacks = Vec::with_capacity(100);
|
||||
for _ in 0..NUM_STACK {
|
||||
@@ -163,4 +160,4 @@ mod tests {
|
||||
heap.clear();
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@ use super::heap::{Heap, HeapAllocable, BytesRef};
|
||||
|
||||
/// djb2 hash function
|
||||
fn djb2(key: &[u8]) -> u64 {
|
||||
let mut state: u64 = 5381;
|
||||
let mut state: u64 = 5381;
|
||||
for &b in key {
|
||||
state = (state << 5).wrapping_add(state).wrapping_add(b as u64);
|
||||
}
|
||||
@@ -29,7 +29,7 @@ impl Default for BytesRef {
|
||||
///
|
||||
/// The key and the value are actually stored contiguously.
|
||||
/// For this reason, the (start, stop) information is actually redundant
|
||||
/// and can be simplified in the future
|
||||
/// and can be simplified in the future
|
||||
#[derive(Copy, Clone, Default)]
|
||||
struct KeyValue {
|
||||
key: BytesRef,
|
||||
@@ -37,7 +37,7 @@ struct KeyValue {
|
||||
}
|
||||
|
||||
impl KeyValue {
|
||||
fn is_empty(&self,) -> bool {
|
||||
fn is_empty(&self) -> bool {
|
||||
self.key.stop == 0u32
|
||||
}
|
||||
}
|
||||
@@ -49,7 +49,7 @@ pub enum Entry {
|
||||
|
||||
|
||||
/// Customized `HashMap` with string keys
|
||||
///
|
||||
///
|
||||
/// This `HashMap` takes String as keys. Keys are
|
||||
/// stored in a user defined heap.
|
||||
///
|
||||
@@ -57,7 +57,9 @@ pub enum Entry {
|
||||
/// the computation of the hash of the key twice,
|
||||
/// or copying the key as long as there is no insert.
|
||||
///
|
||||
pub struct HashMap<'a, V> where V: HeapAllocable {
|
||||
pub struct HashMap<'a, V>
|
||||
where V: HeapAllocable
|
||||
{
|
||||
table: Box<[KeyValue]>,
|
||||
heap: &'a Heap,
|
||||
_phantom: PhantomData<V>,
|
||||
@@ -65,13 +67,12 @@ pub struct HashMap<'a, V> where V: HeapAllocable {
|
||||
occupied: Vec<usize>,
|
||||
}
|
||||
|
||||
impl<'a, V> HashMap<'a, V> where V: HeapAllocable {
|
||||
|
||||
impl<'a, V> HashMap<'a, V>
|
||||
where V: HeapAllocable
|
||||
{
|
||||
pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> HashMap<'a, V> {
|
||||
let table_size = 1 << num_bucket_power_of_2;
|
||||
let table: Vec<KeyValue> = iter::repeat(KeyValue::default())
|
||||
.take(table_size)
|
||||
.collect();
|
||||
let table: Vec<KeyValue> = iter::repeat(KeyValue::default()).take(table_size).collect();
|
||||
HashMap {
|
||||
table: table.into_boxed_slice(),
|
||||
heap: heap,
|
||||
@@ -99,23 +100,23 @@ impl<'a, V> HashMap<'a, V> where V: HeapAllocable {
|
||||
};
|
||||
addr
|
||||
}
|
||||
|
||||
pub fn iter<'b: 'a>(&'b self,) -> impl Iterator<Item=(&'a [u8], (u32, &'a V))> + 'b {
|
||||
|
||||
pub fn iter<'b: 'a>(&'b self) -> impl Iterator<Item = (&'a [u8], (u32, &'a V))> + 'b {
|
||||
let heap: &'a Heap = self.heap;
|
||||
let table: &'b [KeyValue] = &self.table;
|
||||
self.occupied
|
||||
.iter()
|
||||
.cloned()
|
||||
.map(move |bucket: usize| {
|
||||
let kv = table[bucket];
|
||||
let addr = kv.value_addr;
|
||||
let v: &V = heap.get_mut_ref::<V>(addr);
|
||||
(heap.get_slice(kv.key), (addr, v))
|
||||
})
|
||||
// .map(move |addr: u32| (heap.get_mut_ref::<V>(addr)) )
|
||||
let kv = table[bucket];
|
||||
let addr = kv.value_addr;
|
||||
let v: &V = heap.get_mut_ref::<V>(addr);
|
||||
(heap.get_slice(kv.key), (addr, v))
|
||||
})
|
||||
// .map(move |addr: u32| (heap.get_mut_ref::<V>(addr)) )
|
||||
}
|
||||
|
||||
pub fn values_mut<'b: 'a>(&'b self,) -> impl Iterator<Item=&'a mut V> + 'b {
|
||||
pub fn values_mut<'b: 'a>(&'b self) -> impl Iterator<Item = &'a mut V> + 'b {
|
||||
let heap: &'a Heap = self.heap;
|
||||
let table: &'b [KeyValue] = &self.table;
|
||||
self.occupied
|
||||
@@ -128,9 +129,7 @@ impl<'a, V> HashMap<'a, V> where V: HeapAllocable {
|
||||
pub fn get_or_create<S: AsRef<[u8]>>(&mut self, key: S) -> &mut V {
|
||||
let entry = self.lookup(key.as_ref());
|
||||
match entry {
|
||||
Entry::Occupied(addr) => {
|
||||
self.heap.get_mut_ref(addr)
|
||||
}
|
||||
Entry::Occupied(addr) => self.heap.get_mut_ref(addr),
|
||||
Entry::Vacant(bucket) => {
|
||||
let (addr, val): (u32, &mut V) = self.heap.allocate_object();
|
||||
self.set_bucket(key.as_ref(), bucket, addr);
|
||||
@@ -138,7 +137,7 @@ impl<'a, V> HashMap<'a, V> where V: HeapAllocable {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub fn lookup<S: AsRef<[u8]>>(&self, key: S) -> Entry {
|
||||
let key_bytes: &[u8] = key.as_ref();
|
||||
let mut bucket = self.bucket(key_bytes);
|
||||
@@ -150,7 +149,7 @@ impl<'a, V> HashMap<'a, V> where V: HeapAllocable {
|
||||
if self.get_key(kv.key) == key_bytes {
|
||||
return Entry::Occupied(kv.value_addr);
|
||||
}
|
||||
bucket = (bucket + 1) & self.mask;
|
||||
bucket = (bucket + 1) & self.mask;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -158,7 +157,7 @@ impl<'a, V> HashMap<'a, V> where V: HeapAllocable {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
|
||||
use super::*;
|
||||
use super::super::heap::{Heap, HeapAllocable};
|
||||
use super::djb2;
|
||||
@@ -186,10 +185,10 @@ mod tests {
|
||||
let mut hash_map: HashMap<TestValue> = HashMap::new(18, &heap);
|
||||
{
|
||||
{
|
||||
let v: &mut TestValue = hash_map.get_or_create("abc");
|
||||
assert_eq!(v.val, 0u32);
|
||||
v.val = 3u32;
|
||||
|
||||
let v: &mut TestValue = hash_map.get_or_create("abc");
|
||||
assert_eq!(v.val, 0u32);
|
||||
v.val = 3u32;
|
||||
|
||||
}
|
||||
}
|
||||
{
|
||||
@@ -214,20 +213,17 @@ mod tests {
|
||||
#[bench]
|
||||
fn bench_djb2(bench: &mut Bencher) {
|
||||
let v = String::from("abwer");
|
||||
bench.iter(|| {
|
||||
djb2(v.as_bytes())
|
||||
});
|
||||
bench.iter(|| djb2(v.as_bytes()));
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_siphasher(bench: &mut Bencher) {
|
||||
let v = String::from("abwer");
|
||||
bench.iter(|| {
|
||||
let mut h = DefaultHasher::new();
|
||||
h.write(v.as_bytes());
|
||||
h.finish()
|
||||
});
|
||||
let mut h = DefaultHasher::new();
|
||||
h.write(v.as_bytes());
|
||||
h.finish()
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -22,17 +22,13 @@ pub struct Heap {
|
||||
impl Heap {
|
||||
/// Creates a new heap with a given capacity
|
||||
pub fn with_capacity(num_bytes: usize) -> Heap {
|
||||
Heap {
|
||||
inner: UnsafeCell::new(
|
||||
InnerHeap::with_capacity(num_bytes)
|
||||
),
|
||||
}
|
||||
Heap { inner: UnsafeCell::new(InnerHeap::with_capacity(num_bytes)) }
|
||||
}
|
||||
|
||||
fn inner(&self,) -> &mut InnerHeap {
|
||||
unsafe { &mut *self.inner.get() }
|
||||
fn inner(&self) -> &mut InnerHeap {
|
||||
unsafe { &mut *self.inner.get() }
|
||||
}
|
||||
|
||||
|
||||
/// Clears the heap. All the underlying data is lost.
|
||||
///
|
||||
/// This heap does not support deallocation.
|
||||
@@ -40,19 +36,19 @@ impl Heap {
|
||||
pub fn clear(&self) {
|
||||
self.inner().clear();
|
||||
}
|
||||
|
||||
|
||||
/// Return the heap capacity.
|
||||
pub fn capacity(&self,) -> u32 {
|
||||
pub fn capacity(&self) -> u32 {
|
||||
self.inner().capacity()
|
||||
}
|
||||
|
||||
/// Return the amount of memory that has been allocated so far.
|
||||
pub fn len(&self,) -> u32 {
|
||||
|
||||
/// Return the amount of memory that has been allocated so far.
|
||||
pub fn len(&self) -> u32 {
|
||||
self.inner().len()
|
||||
}
|
||||
|
||||
|
||||
/// Return amount of free space, in bytes.
|
||||
pub fn num_free_bytes(&self,) -> u32 {
|
||||
pub fn num_free_bytes(&self) -> u32 {
|
||||
self.inner().num_free_bytes()
|
||||
}
|
||||
|
||||
@@ -61,31 +57,31 @@ impl Heap {
|
||||
pub fn allocate_space(&self, num_bytes: usize) -> u32 {
|
||||
self.inner().allocate_space(num_bytes)
|
||||
}
|
||||
|
||||
|
||||
/// Allocate an object in the heap
|
||||
pub fn allocate_object<V: HeapAllocable>(&self,) -> (u32, &mut V) {
|
||||
pub fn allocate_object<V: HeapAllocable>(&self) -> (u32, &mut V) {
|
||||
let addr = self.inner().allocate_space(mem::size_of::<V>());
|
||||
let v: V = V::with_addr(addr);
|
||||
self.inner().set(addr, &v);
|
||||
(addr, self.inner().get_mut_ref(addr))
|
||||
}
|
||||
|
||||
|
||||
/// Stores a `&[u8]` in the heap and returns the destination BytesRef.
|
||||
pub fn allocate_and_set(&self, data: &[u8]) -> BytesRef {
|
||||
self.inner().allocate_and_set(data)
|
||||
}
|
||||
|
||||
|
||||
/// Fetches the `&[u8]` stored on the slice defined by the `BytesRef`
|
||||
/// given as argument
|
||||
pub fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
|
||||
self.inner().get_slice(bytes_ref.start, bytes_ref.stop)
|
||||
}
|
||||
|
||||
|
||||
/// Stores an item's data in the heap, at the given `address`.
|
||||
pub fn set<Item>(&self, addr: u32, val: &Item) {
|
||||
self.inner().set(addr, val);
|
||||
}
|
||||
|
||||
|
||||
/// Returns a mutable reference for an object at a given Item.
|
||||
pub fn get_mut_ref<Item>(&self, addr: u32) -> &mut Item {
|
||||
self.inner().get_mut_ref(addr)
|
||||
@@ -106,7 +102,6 @@ struct InnerHeap {
|
||||
|
||||
|
||||
impl InnerHeap {
|
||||
|
||||
pub fn with_capacity(num_bytes: usize) -> InnerHeap {
|
||||
let buffer: Vec<u8> = vec![0u8; num_bytes];
|
||||
InnerHeap {
|
||||
@@ -122,23 +117,22 @@ impl InnerHeap {
|
||||
self.next_heap = None;
|
||||
}
|
||||
|
||||
pub fn capacity(&self,) -> u32 {
|
||||
pub fn capacity(&self) -> u32 {
|
||||
self.buffer.len() as u32
|
||||
}
|
||||
|
||||
pub fn len(&self,) -> u32 {
|
||||
pub fn len(&self) -> u32 {
|
||||
self.used
|
||||
}
|
||||
|
||||
|
||||
// Returns the number of free bytes. If the buffer
|
||||
// has reached its capacity and overflowed to another buffer, return 0.
|
||||
pub fn num_free_bytes(&self,) -> u32 {
|
||||
pub fn num_free_bytes(&self) -> u32 {
|
||||
if self.next_heap.is_some() {
|
||||
0u32
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
self.buffer_len - self.used
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn allocate_space(&mut self, num_bytes: usize) -> u32 {
|
||||
@@ -146,32 +140,35 @@ impl InnerHeap {
|
||||
self.used += num_bytes as u32;
|
||||
if self.used <= self.buffer_len {
|
||||
addr
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
if self.next_heap.is_none() {
|
||||
warn!("Exceeded heap size. The margin was apparently insufficient. The segment will be committed right after indexing this very last document.");
warn!("Exceeded heap size. The margin was apparently insufficient. The segment will be committed right after indexing this very last document.",);
|
||||
self.next_heap = Some(Box::new(InnerHeap::with_capacity(self.buffer_len as usize)));
|
||||
}
|
||||
self.next_heap.as_mut().unwrap().allocate_space(num_bytes) + self.buffer_len
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
fn get_slice(&self, start: u32, stop: u32) -> &[u8] {
|
||||
if start >= self.buffer_len {
|
||||
self.next_heap.as_ref().unwrap().get_slice(start - self.buffer_len, stop - self.buffer_len)
|
||||
}
|
||||
else {
|
||||
self.next_heap
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.get_slice(start - self.buffer_len, stop - self.buffer_len)
|
||||
} else {
|
||||
&self.buffer[start as usize..stop as usize]
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn get_mut_slice(&mut self, start: u32, stop: u32) -> &mut [u8] {
|
||||
if start >= self.buffer_len {
|
||||
self.next_heap.as_mut().unwrap().get_mut_slice(start - self.buffer_len, stop - self.buffer_len)
|
||||
}
|
||||
else {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.get_mut_slice(start - self.buffer_len, stop - self.buffer_len)
|
||||
} else {
|
||||
&mut self.buffer[start as usize..stop as usize]
|
||||
}
|
||||
}
|
||||
@@ -188,9 +185,11 @@ impl InnerHeap {
|
||||
|
||||
fn get_mut(&mut self, addr: u32) -> *mut u8 {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap.as_mut().unwrap().get_mut(addr - self.buffer_len)
|
||||
}
|
||||
else {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.get_mut(addr - self.buffer_len)
|
||||
} else {
|
||||
let addr_isize = addr as isize;
|
||||
unsafe { self.buffer.as_mut_ptr().offset(addr_isize) }
|
||||
}
|
||||
@@ -200,9 +199,11 @@ impl InnerHeap {
|
||||
|
||||
fn get_mut_ref<Item>(&mut self, addr: u32) -> &mut Item {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap.as_mut().unwrap().get_mut_ref(addr - self.buffer_len)
|
||||
}
|
||||
else {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.get_mut_ref(addr - self.buffer_len)
|
||||
} else {
|
||||
let v_ptr_u8 = self.get_mut(addr) as *mut u8;
|
||||
let v_ptr = v_ptr_u8 as *mut Item;
|
||||
unsafe { &mut *v_ptr }
|
||||
@@ -211,9 +212,11 @@ impl InnerHeap {
|
||||
|
||||
fn set<Item>(&mut self, addr: u32, val: &Item) {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap.as_mut().unwrap().set(addr - self.buffer_len, val);
|
||||
}
|
||||
else {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.set(addr - self.buffer_len, val);
|
||||
} else {
|
||||
let v_ptr: *const Item = val as *const Item;
|
||||
let v_ptr_u8: *const u8 = v_ptr as *const u8;
|
||||
debug_assert!(addr + mem::size_of::<Item>() as u32 <= self.used);
|
||||
@@ -223,4 +226,4 @@ impl InnerHeap {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,12 +17,12 @@ fn test_unrolled_linked_list() {
|
||||
let mut ks: Vec<usize> = (1..5).map(|k| k * 100).collect();
|
||||
ks.push(2);
|
||||
ks.push(3);
|
||||
for k in (1..5).map(|k| k * 100) {
|
||||
for k in (1..5).map(|k| k * 100) {
|
||||
let mut hashmap: HashMap<ExpUnrolledLinkedList> = HashMap::new(10, &heap);
|
||||
for j in 0..k {
|
||||
for i in 0..500 {
|
||||
let mut list = hashmap.get_or_create(i.to_string());
|
||||
list.push(i*j, &heap);
|
||||
list.push(i * j, &heap);
|
||||
}
|
||||
}
|
||||
for i in 0..500 {
|
||||
@@ -31,7 +31,7 @@ fn test_unrolled_linked_list() {
|
||||
let v: &mut ExpUnrolledLinkedList = heap.get_mut_ref(addr);
|
||||
let mut it = v.iter(addr, &heap);
|
||||
for j in 0..k {
|
||||
assert_eq!(it.next().unwrap(), i*j);
|
||||
assert_eq!(it.next().unwrap(), i * j);
|
||||
}
|
||||
assert!(!it.next().is_some());
|
||||
}
|
||||
@@ -41,6 +41,6 @@ fn test_unrolled_linked_list() {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,31 +8,30 @@ use std::io;
|
||||
use std::marker::Sync;
|
||||
|
||||
/// Write-once read many (WORM) abstraction for where
|
||||
/// tantivy's data should be stored.
|
||||
/// tantivy's data should be stored.
|
||||
///
|
||||
/// There are currently two implementations of `Directory`
|
||||
///
|
||||
///
|
||||
/// - The [`MMapDirectory`](struct.MmapDirectory.html), this
|
||||
/// should be your default choice.
|
||||
/// - The [`RAMDirectory`](struct.RAMDirectory.html), which
|
||||
/// should be your default choice.
|
||||
/// - The [`RAMDirectory`](struct.RAMDirectory.html), which
|
||||
/// should be used mostly for tests.
|
||||
///
|
||||
///
|
||||
pub trait Directory: fmt::Debug + Send + Sync + 'static {
|
||||
|
||||
/// Opens a virtual file for read.
|
||||
///
|
||||
///
|
||||
/// Once a virtual file is open, its data may not
|
||||
/// change.
|
||||
///
|
||||
/// Specifically, subsequent writes or flushes should
|
||||
/// have no effect on the returned `ReadOnlySource` object.
|
||||
/// have no effect on the returned `ReadOnlySource` object.
|
||||
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError>;
|
||||
|
||||
/// Removes a file
|
||||
///
|
||||
/// Removing a file will not affect an eventual
|
||||
/// existing ReadOnlySource pointing to it.
|
||||
///
|
||||
///
|
||||
/// Removing a nonexistent file yields a
|
||||
/// `DeleteError::DoesNotExist`.
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError>;
|
||||
@@ -40,18 +39,18 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static {
|
||||
/// Returns true iff the file exists
|
||||
fn exists(&self, path: &Path) -> bool;
|
||||
|
||||
/// Opens a writer for the *virtual file* associated with
|
||||
/// Opens a writer for the *virtual file* associated with
|
||||
/// a Path.
|
||||
///
|
||||
/// Right after this call, the file should be created
|
||||
/// and any subsequent call to `open_read` for the
|
||||
/// and any subsequent call to `open_read` for the
|
||||
/// same path should return a `ReadOnlySource`.
|
||||
///
|
||||
///
|
||||
/// Write operations may be aggressively buffered.
|
||||
/// The client of this trait is responsible for calling flush
|
||||
/// to ensure that subsequent `read` operations
|
||||
/// to ensure that subsequent `read` operations
|
||||
/// will take into account preceding `write` operations.
|
||||
///
|
||||
///
|
||||
/// Flush operation should also be persistent.
|
||||
///
|
||||
/// The user shall not rely on `Drop` triggering `flush`.
|
||||
@@ -60,7 +59,7 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static {
|
||||
///
|
||||
/// The file may not previously exist.
|
||||
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError>;
|
||||
|
||||
|
||||
/// Reads the full content file that has been written using
|
||||
/// atomic_write.
|
||||
///
|
||||
@@ -68,17 +67,13 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static {
|
||||
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError>;
|
||||
|
||||
/// Atomically replace the content of a file with data.
|
||||
///
|
||||
///
|
||||
/// This call ensures that reads can never *observe*
|
||||
/// a partially written file.
|
||||
///
|
||||
///
|
||||
/// The file may or may not previously exist.
|
||||
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()>;
|
||||
|
||||
/// Clones the directory and boxes the clone
|
||||
|
||||
/// Clones the directory and boxes the clone
|
||||
fn box_clone(&self) -> Box<Directory>;
|
||||
|
||||
}
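// A minimal usage sketch (not part of this diff) of the `Directory` contract
// documented above. It reuses the `RAMDirectory` shown later in this changeset;
// the "meta.json" path is a placeholder chosen for illustration only.
fn directory_usage_sketch() -> std::io::Result<()> {
    use std::path::Path;
    let mut directory = RAMDirectory::create();
    // `atomic_write` replaces the whole file content in one step.
    directory.atomic_write(Path::new("meta.json"), b"{}")?;
    // Readers never observe a partially written file.
    let data = directory
        .atomic_read(Path::new("meta.json"))
        .expect("file was just written");
    assert_eq!(&data[..], b"{}");
    // A `ReadOnlySource` stays valid even if the file is later deleted.
    let source = directory.open_read(Path::new("meta.json")).expect("open_read");
    assert_eq!(source.as_slice(), b"{}");
    Ok(())
}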
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ use std::io;
|
||||
/// Error that may occur when opening a directory
|
||||
#[derive(Debug)]
|
||||
pub enum OpenDirectoryError {
|
||||
/// The underlying directory does not exist.
|
||||
/// The underlying directory does not exist.
|
||||
DoesNotExist(PathBuf),
|
||||
/// The path exists but is not a directory.
|
||||
NotADirectory(PathBuf),
|
||||
@@ -14,9 +14,9 @@ pub enum OpenDirectoryError {
|
||||
#[derive(Debug)]
|
||||
pub enum OpenWriteError {
|
||||
/// Our directory is WORM; writing an existing file is forbidden.
|
||||
/// Check out the `Directory` documentation.
|
||||
/// Check out the `Directory` documentation.
|
||||
FileAlreadyExists(PathBuf),
|
||||
/// Any kind of IO error that happens when
|
||||
/// Any kind of IO error that happens when
|
||||
/// writing in the underlying IO device.
|
||||
IOError(io::Error),
|
||||
}
|
||||
@@ -32,7 +32,7 @@ impl From<io::Error> for OpenWriteError {
|
||||
pub enum OpenReadError {
|
||||
/// The file does not exist.
|
||||
FileDoesNotExist(PathBuf),
|
||||
/// Any kind of IO error that happens when
|
||||
/// Any kind of IO error that happens when
|
||||
/// interacting with the underlying IO device.
|
||||
IOError(io::Error),
|
||||
}
|
||||
@@ -43,10 +43,10 @@ pub enum OpenReadError {
|
||||
pub enum DeleteError {
|
||||
/// The file does not exist.
|
||||
FileDoesNotExist(PathBuf),
|
||||
/// Any kind of IO error that happens when
|
||||
/// Any kind of IO error that happens when
|
||||
/// interacting with the underlying IO device.
|
||||
IOError(io::Error),
|
||||
/// The file may not be deleted because it is
|
||||
/// The file may not be deleted because it is
|
||||
/// protected.
|
||||
FileProtected(PathBuf),
|
||||
}
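// Hedged sketch (not from this changeset) of how a caller might branch on the
// three `DeleteError` variants declared above; `directory` can be any
// `Directory` implementation and the path is a placeholder.
fn delete_ignoring_missing(directory: &Directory, path: &Path) -> bool {
    match directory.delete(path) {
        Ok(()) => true,
        // A missing file is fine: there is nothing left to delete.
        Err(DeleteError::FileDoesNotExist(_)) => true,
        // A protected file must be retried once its `FileProtection` is dropped.
        Err(DeleteError::FileProtected(_)) => false,
        Err(DeleteError::IOError(_)) => false,
    }
}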
|
||||
|
||||
@@ -18,7 +18,7 @@ use Error;
|
||||
/// Wrapper of directories that keeps track of files created by Tantivy.
|
||||
///
|
||||
/// A managed directory is just a wrapper of a directory
|
||||
/// that keeps a (persisted) list of the files that
|
||||
/// that keeps a (persisted) list of the files that
|
||||
/// have been created (and not deleted) by tantivy so far.
|
||||
///
|
||||
/// Thanks to this list, it implements a `garbage_collect` method
|
||||
@@ -46,19 +46,18 @@ pub struct FileProtection {
|
||||
}
|
||||
|
||||
fn unprotect_file_from_delete(directory: &ManagedDirectory, path: &Path) {
|
||||
let mut meta_informations_wlock = directory.meta_informations
|
||||
let mut meta_informations_wlock = directory
|
||||
.meta_informations
|
||||
.write()
|
||||
.expect("Managed file lock poisoned");
|
||||
if let Some(counter_ref_mut) = meta_informations_wlock
|
||||
.protected_files
|
||||
.get_mut(path) {
|
||||
if let Some(counter_ref_mut) = meta_informations_wlock.protected_files.get_mut(path) {
|
||||
(*counter_ref_mut) -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for FileProtection {
|
||||
fn fmt(&self, formatter: &mut fmt::Formatter) -> result::Result<(), fmt::Error> {
|
||||
write!(formatter, "FileProtectionFor({:?})", self.path)
|
||||
write!(formatter, "FileProtectionFor({:?})", self.path)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -70,7 +69,9 @@ impl Drop for FileProtection {
|
||||
|
||||
/// Saves the file containing the list of existing files
|
||||
/// that were created by tantivy.
|
||||
fn save_managed_paths(directory: &mut Directory, wlock: &RwLockWriteGuard<MetaInformation>) -> io::Result<()> {
|
||||
fn save_managed_paths(directory: &mut Directory,
|
||||
wlock: &RwLockWriteGuard<MetaInformation>)
|
||||
-> io::Result<()> {
|
||||
let mut w = serde_json::to_vec(&wlock.managed_paths)?;
|
||||
write!(&mut w, "\n")?;
|
||||
directory.atomic_write(&MANAGED_FILEPATH, &w[..])?;
|
||||
@@ -78,32 +79,30 @@ fn save_managed_paths(directory: &mut Directory, wlock: &RwLockWriteGuard<MetaIn
|
||||
}
|
||||
|
||||
impl ManagedDirectory {
|
||||
|
||||
/// Wraps a directory as managed directory.
|
||||
pub fn new<Dir: Directory>(directory: Dir) -> Result<ManagedDirectory> {
|
||||
match directory.atomic_read(&MANAGED_FILEPATH) {
|
||||
Ok(data) => {
|
||||
let managed_files_json = String::from_utf8_lossy(&data);
|
||||
let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
|
||||
.map_err(|e| Error::CorruptedFile(MANAGED_FILEPATH.clone(), Box::new(e)))?;
|
||||
let managed_files: HashSet<PathBuf> =
|
||||
serde_json::from_str(&managed_files_json)
|
||||
.map_err(|e| Error::CorruptedFile(MANAGED_FILEPATH.clone(), Box::new(e)))?;
|
||||
Ok(ManagedDirectory {
|
||||
directory: box directory,
|
||||
meta_informations: Arc::new(RwLock::new(
|
||||
MetaInformation {
|
||||
managed_paths: managed_files,
|
||||
protected_files: HashMap::default()
|
||||
})),
|
||||
})
|
||||
directory: box directory,
|
||||
meta_informations: Arc::new(RwLock::new(MetaInformation {
|
||||
managed_paths: managed_files,
|
||||
protected_files:
|
||||
HashMap::default(),
|
||||
})),
|
||||
})
|
||||
}
|
||||
Err(OpenReadError::FileDoesNotExist(_)) => {
|
||||
Ok(ManagedDirectory {
|
||||
directory: box directory,
|
||||
meta_informations: Arc::default(),
|
||||
})
|
||||
}
|
||||
Err(OpenReadError::IOError(e)) => {
|
||||
Err(From::from(e))
|
||||
directory: box directory,
|
||||
meta_informations: Arc::default(),
|
||||
})
|
||||
}
|
||||
Err(OpenReadError::IOError(e)) => Err(From::from(e)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -111,7 +110,7 @@ impl ManagedDirectory {
|
||||
///
|
||||
/// Removes the files that were created by `tantivy` and are not
|
||||
/// used by any segment anymore.
|
||||
///
|
||||
///
|
||||
/// * `living_files` - List of files that are still used by the index.
|
||||
///
|
||||
/// This method neither panics nor returns errors.
|
||||
@@ -119,19 +118,21 @@ impl ManagedDirectory {
|
||||
/// an error is simply logged, and the file remains in the list of managed
|
||||
/// files.
|
||||
pub fn garbage_collect(&mut self, living_files: HashSet<PathBuf>) {
|
||||
let mut files_to_delete = vec!();
|
||||
{ // releasing the lock as .delete() will use it too.
|
||||
let meta_informations_rlock = self.meta_informations
|
||||
.read()
|
||||
.expect("Managed directory rlock poisoned in garbage collect.");
|
||||
let mut files_to_delete = vec![];
|
||||
{
|
||||
// releasing the lock as .delete() will use it too.
|
||||
let meta_informations_rlock =
|
||||
self.meta_informations
|
||||
.read()
|
||||
.expect("Managed directory rlock poisoned in garbage collect.");
|
||||
for managed_path in &meta_informations_rlock.managed_paths {
|
||||
if !living_files.contains(managed_path) {
|
||||
files_to_delete.push(managed_path.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut deleted_files = vec!();
|
||||
|
||||
let mut deleted_files = vec![];
|
||||
{
|
||||
for file_to_delete in files_to_delete {
|
||||
match self.delete(&file_to_delete) {
|
||||
@@ -155,7 +156,7 @@ impl ManagedDirectory {
|
||||
// this is expected.
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -163,7 +164,7 @@ impl ManagedDirectory {
|
||||
|
||||
|
||||
if !deleted_files.is_empty() {
|
||||
// update the list of managed files by removing
|
||||
// update the list of managed files by removing
|
||||
// the files that were removed.
|
||||
let mut meta_informations_wlock = self.meta_informations
|
||||
.write()
|
||||
@@ -186,7 +187,7 @@ impl ManagedDirectory {
|
||||
///
|
||||
/// The method returns a `FileProtection` object.
|
||||
/// The file will not be garbage collected as long as the
|
||||
/// `FileProtection` object is kept alive.
|
||||
/// `FileProtection` object is kept alive.
|
||||
pub fn protect_file_from_delete(&self, path: &Path) -> FileProtection {
|
||||
let pathbuf = path.to_owned();
|
||||
{
|
||||
@@ -194,9 +195,9 @@ impl ManagedDirectory {
|
||||
.write()
|
||||
.expect("Managed file lock poisoned on protect");
|
||||
*meta_informations_wlock
|
||||
.protected_files
|
||||
.entry(pathbuf.clone())
|
||||
.or_insert(0) += 1;
|
||||
.protected_files
|
||||
.entry(pathbuf.clone())
|
||||
.or_insert(0) += 1;
|
||||
}
|
||||
FileProtection {
|
||||
directory: self.clone(),
|
||||
@@ -205,16 +206,16 @@ impl ManagedDirectory {
|
||||
}
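// Illustrative sketch (not in the diff) of the protection lifecycle described
// above: as long as the returned `FileProtection` guard is alive,
// `garbage_collect` refuses to delete the file, and the guard releases the
// protection when it is dropped. The "segment.idx" path is hypothetical and is
// assumed to have been created through this managed directory beforehand.
fn protection_sketch(managed_directory: &mut ManagedDirectory) {
    use std::collections::HashSet;
    use std::path::{Path, PathBuf};
    let path = Path::new("segment.idx");
    let living_files: HashSet<PathBuf> = HashSet::new();
    {
        let _protection = managed_directory.protect_file_from_delete(path);
        // The file is not in `living_files`, but the protection keeps it around.
        managed_directory.garbage_collect(living_files.clone());
        assert!(managed_directory.exists(path));
    } // `_protection` dropped here: the protection counter is decremented.
    managed_directory.garbage_collect(living_files);
    assert!(!managed_directory.exists(path));
}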
|
||||
|
||||
/// Registers a file as managed
|
||||
///
|
||||
/// This method must be called before the file is
|
||||
///
|
||||
/// This method must be called before the file is
|
||||
/// actually created to ensure that a failure between
|
||||
/// registering the filepath and creating the file
|
||||
/// will not lead to garbage files that will
|
||||
/// will not lead to garbage files that will
|
||||
/// never get removed.
|
||||
fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> {
|
||||
let mut meta_wlock = self.meta_informations
|
||||
.write()
|
||||
.expect("Managed file lock poisoned");
|
||||
.write()
|
||||
.expect("Managed file lock poisoned");
|
||||
let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned());
|
||||
if has_changed {
|
||||
save_managed_paths(self.directory.as_mut(), &meta_wlock)?;
|
||||
@@ -224,7 +225,6 @@ impl ManagedDirectory {
|
||||
}
|
||||
|
||||
impl Directory for ManagedDirectory {
|
||||
|
||||
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
|
||||
self.directory.open_read(path)
|
||||
}
|
||||
@@ -250,7 +250,7 @@ impl Directory for ManagedDirectory {
|
||||
.expect("poisoned lock in managed directory meta");
|
||||
if let Some(counter) = metas_rlock.protected_files.get(path) {
|
||||
if *counter > 0 {
|
||||
return Err(DeleteError::FileProtected(path.to_owned()))
|
||||
return Err(DeleteError::FileProtected(path.to_owned()));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -260,11 +260,10 @@ impl Directory for ManagedDirectory {
|
||||
fn exists(&self, path: &Path) -> bool {
|
||||
self.directory.exists(path)
|
||||
}
|
||||
|
||||
|
||||
fn box_clone(&self) -> Box<Directory> {
|
||||
box self.clone()
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
impl Clone for ManagedDirectory {
|
||||
@@ -284,10 +283,10 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
use directory::MmapDirectory;
|
||||
use std::path::Path;
|
||||
use std::path::Path;
|
||||
use std::io::Write;
|
||||
use tempdir::TempDir;
|
||||
|
||||
|
||||
lazy_static! {
|
||||
static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
|
||||
static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
|
||||
@@ -305,17 +304,17 @@ mod tests {
|
||||
write_file.flush().unwrap();
|
||||
}
|
||||
{
|
||||
managed_directory.atomic_write(*TEST_PATH2, &vec!(0u8,1u8)).unwrap();
|
||||
managed_directory
|
||||
.atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
|
||||
.unwrap();
|
||||
}
|
||||
{
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
assert!(managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
{
|
||||
let living_files: HashSet<PathBuf> = [TEST_PATH1.to_owned()]
|
||||
.into_iter()
|
||||
.cloned()
|
||||
.collect();
|
||||
let living_files: HashSet<PathBuf> =
|
||||
[TEST_PATH1.to_owned()].into_iter().cloned().collect();
|
||||
managed_directory.garbage_collect(living_files);
|
||||
}
|
||||
{
|
||||
@@ -338,7 +337,7 @@ mod tests {
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
assert!(!managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -349,10 +348,12 @@ mod tests {
|
||||
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
|
||||
managed_directory.atomic_write(*TEST_PATH1, &vec!(0u8,1u8)).unwrap();
|
||||
managed_directory
|
||||
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
|
||||
.unwrap();
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
|
||||
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
|
||||
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
|
||||
managed_directory.garbage_collect(living_files.clone());
|
||||
if cfg!(target_os = "windows") {
|
||||
// On Windows, gc should try and fail to delete the file as it is mmapped.
|
||||
@@ -363,8 +364,7 @@ mod tests {
|
||||
// eventually be deleted once mmap is released.
|
||||
managed_directory.garbage_collect(living_files);
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
}
|
||||
|
||||
@@ -379,7 +379,9 @@ mod tests {
|
||||
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
|
||||
managed_directory.atomic_write(*TEST_PATH1, &vec!(0u8,1u8)).unwrap();
|
||||
managed_directory
|
||||
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
|
||||
.unwrap();
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
|
||||
{
|
||||
@@ -390,7 +392,7 @@ mod tests {
|
||||
|
||||
managed_directory.garbage_collect(living_files.clone());
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -24,33 +24,24 @@ use std::sync::Weak;
|
||||
use tempdir::TempDir;
|
||||
|
||||
fn open_mmap(full_path: &PathBuf) -> result::Result<Option<Arc<Mmap>>, OpenReadError> {
|
||||
let convert_file_error = |err: io::Error| {
|
||||
if err.kind() == io::ErrorKind::NotFound {
|
||||
OpenReadError::FileDoesNotExist(full_path.clone())
|
||||
}
|
||||
else {
|
||||
OpenReadError::IOError(err)
|
||||
}
|
||||
let convert_file_error = |err: io::Error| if err.kind() == io::ErrorKind::NotFound {
|
||||
OpenReadError::FileDoesNotExist(full_path.clone())
|
||||
} else {
|
||||
OpenReadError::IOError(err)
|
||||
};
|
||||
let file = File::open(&full_path).map_err(convert_file_error)?;
|
||||
let meta_data = file
|
||||
.metadata()
|
||||
.map_err(|e| OpenReadError::IOError(e))?;
|
||||
let meta_data = file.metadata().map_err(|e| OpenReadError::IOError(e))?;
|
||||
if meta_data.len() == 0 {
|
||||
// if the file size is 0, it will not be possible
|
||||
// if the file size is 0, it will not be possible
|
||||
// to mmap the file, so we return an anonymous mmap_cache
|
||||
// instead.
|
||||
return Ok(None)
|
||||
return Ok(None);
|
||||
}
|
||||
match Mmap::open(&file, Protection::Read) {
|
||||
Ok(mmap) => {
|
||||
Ok(Some(Arc::new(mmap)))
|
||||
}
|
||||
Err(e) => {
|
||||
Err(OpenReadError::IOError(e))
|
||||
}
|
||||
Ok(mmap) => Ok(Some(Arc::new(mmap))),
|
||||
Err(e) => Err(OpenReadError::IOError(e)),
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
#[derive(Default,Clone,Debug,Serialize,Deserialize)]
|
||||
@@ -91,8 +82,7 @@ impl Default for MmapCache {
|
||||
|
||||
|
||||
impl MmapCache {
|
||||
|
||||
fn cleanup(&mut self) {
|
||||
fn cleanup(&mut self) {
|
||||
let previous_cache_size = self.cache.len();
|
||||
let mut new_cache = HashMap::new();
|
||||
mem::swap(&mut new_cache, &mut self.cache);
|
||||
@@ -107,9 +97,7 @@ impl MmapCache {
|
||||
|
||||
fn get_info(&mut self) -> CacheInfo {
|
||||
self.cleanup();
|
||||
let paths: Vec<PathBuf> = self.cache.keys()
|
||||
.cloned()
|
||||
.collect();
|
||||
let paths: Vec<PathBuf> = self.cache.keys().cloned().collect();
|
||||
CacheInfo {
|
||||
counters: self.counters.clone(),
|
||||
mmapped: paths,
|
||||
@@ -123,68 +111,63 @@ impl MmapCache {
|
||||
self.cleanup();
|
||||
}
|
||||
Ok(match self.cache.entry(full_path.clone()) {
|
||||
HashMapEntry::Occupied(mut occupied_entry) => {
|
||||
if let Some(mmap_arc) = occupied_entry.get().upgrade() {
|
||||
self.counters.hit += 1;
|
||||
Some(mmap_arc.clone())
|
||||
}
|
||||
else {
|
||||
// The entry exists but the weak ref has been destroyed.
|
||||
self.counters.miss_weak += 1;
|
||||
if let Some(mmap_arc) = open_mmap(&full_path)? {
|
||||
occupied_entry.insert(Arc::downgrade(&mmap_arc));
|
||||
Some(mmap_arc)
|
||||
}
|
||||
else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
HashMapEntry::Vacant(vacant_entry) => {
|
||||
self.counters.miss_empty += 1;
|
||||
if let Some(mmap_arc) = open_mmap(&full_path)? {
|
||||
vacant_entry.insert(Arc::downgrade(&mmap_arc));
|
||||
Some(mmap_arc)
|
||||
}
|
||||
else {
|
||||
None
|
||||
}
|
||||
}
|
||||
})
|
||||
HashMapEntry::Occupied(mut occupied_entry) => {
|
||||
if let Some(mmap_arc) = occupied_entry.get().upgrade() {
|
||||
self.counters.hit += 1;
|
||||
Some(mmap_arc.clone())
|
||||
} else {
|
||||
// The entry exists but the weak ref has been destroyed.
|
||||
self.counters.miss_weak += 1;
|
||||
if let Some(mmap_arc) = open_mmap(&full_path)? {
|
||||
occupied_entry.insert(Arc::downgrade(&mmap_arc));
|
||||
Some(mmap_arc)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
HashMapEntry::Vacant(vacant_entry) => {
|
||||
self.counters.miss_empty += 1;
|
||||
if let Some(mmap_arc) = open_mmap(&full_path)? {
|
||||
vacant_entry.insert(Arc::downgrade(&mmap_arc));
|
||||
Some(mmap_arc)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Directory storing data in files, read via mmap.
|
||||
///
|
||||
/// The Mmap objects are cached to limit the number of
|
||||
/// system calls.
|
||||
/// The Mmap objects are cached to limit the number of
|
||||
/// system calls.
|
||||
#[derive(Clone)]
|
||||
pub struct MmapDirectory {
|
||||
root_path: PathBuf,
|
||||
mmap_cache: Arc<RwLock<MmapCache>>,
|
||||
_temp_directory: Arc<Option<TempDir>>,
|
||||
|
||||
}
|
||||
|
||||
impl fmt::Debug for MmapDirectory {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "MmapDirectory({:?})", self.root_path)
|
||||
}
|
||||
write!(f, "MmapDirectory({:?})", self.root_path)
|
||||
}
|
||||
}
|
||||
|
||||
impl MmapDirectory {
|
||||
|
||||
/// Creates a new MmapDirectory in a temporary directory.
|
||||
///
|
||||
/// This is mostly useful to test the MmapDirectory itself.
|
||||
/// For your unit tests, prefer the RAMDirectory.
|
||||
/// For your unit tests, prefer the RAMDirectory.
|
||||
pub fn create_from_tempdir() -> io::Result<MmapDirectory> {
|
||||
let tempdir = try!(TempDir::new("index"));
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
let directory = MmapDirectory {
|
||||
root_path: PathBuf::from(tempdir_path),
|
||||
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
|
||||
_temp_directory: Arc::new(Some(tempdir))
|
||||
_temp_directory: Arc::new(Some(tempdir)),
|
||||
};
|
||||
Ok(directory)
|
||||
}
|
||||
@@ -196,16 +179,14 @@ impl MmapDirectory {
|
||||
pub fn open(directory_path: &Path) -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
if !directory_path.exists() {
|
||||
Err(OpenDirectoryError::DoesNotExist(PathBuf::from(directory_path)))
|
||||
}
|
||||
else if !directory_path.is_dir() {
|
||||
} else if !directory_path.is_dir() {
|
||||
Err(OpenDirectoryError::NotADirectory(PathBuf::from(directory_path)))
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
Ok(MmapDirectory {
|
||||
root_path: PathBuf::from(directory_path),
|
||||
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
|
||||
_temp_directory: Arc::new(None)
|
||||
})
|
||||
root_path: PathBuf::from(directory_path),
|
||||
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
|
||||
_temp_directory: Arc::new(None),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -232,7 +213,8 @@ impl MmapDirectory {
|
||||
use std::os::windows::fs::OpenOptionsExt;
|
||||
use winapi::winbase;
|
||||
|
||||
open_opts.write(true)
|
||||
open_opts
|
||||
.write(true)
|
||||
.custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
|
||||
}
|
||||
|
||||
@@ -242,8 +224,8 @@ impl MmapDirectory {
|
||||
}
|
||||
/// Returns some statistical information
|
||||
/// about the Mmap cache.
|
||||
///
|
||||
/// The `MmapDirectory` embeds a `MmapCache`
|
||||
///
|
||||
/// The `MmapDirectory` embeds a `MmapCache`
|
||||
/// to avoid multiplying the `mmap` system calls.
|
||||
pub fn get_cache_info(&mut self) -> CacheInfo {
|
||||
self.mmap_cache
|
||||
@@ -251,12 +233,10 @@ impl MmapDirectory {
|
||||
.expect("Mmap cache lock is poisoned.")
|
||||
.get_info()
|
||||
}
|
||||
|
||||
|
||||
}
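// Hedged sketch of the cache statistics discussed above, reusing the API shown
// in this diff (`create_from_tempdir`, `open_write`, `open_read`,
// `get_cache_info`); the "some_file" path is a placeholder.
fn cache_info_sketch() -> std::io::Result<()> {
    use std::io::Write;
    use std::path::Path;
    let mut mmap_directory = MmapDirectory::create_from_tempdir()?;
    let path = Path::new("some_file");
    {
        let mut write = mmap_directory.open_write(path).expect("open_write");
        write.write_all(b"hello")?;
        write.flush()?;
    }
    // The first read has to mmap the file: one cache miss on an empty entry.
    let _source = mmap_directory.open_read(path).expect("open_read");
    let info = mmap_directory.get_cache_info();
    assert_eq!(info.counters.miss_empty, 1);
    // The mmap stays in the cache while `_source` keeps it alive.
    assert_eq!(info.mmapped.len(), 1);
    Ok(())
}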
|
||||
|
||||
/// This Write wraps a File, but has the specificity of
|
||||
/// calling `sync_all` on flush.
|
||||
/// This Write wraps a File, but has the specificity of
|
||||
/// calling `sync_all` on flush.
|
||||
struct SafeFileWriter(File);
|
||||
|
||||
impl SafeFileWriter {
|
||||
@@ -266,7 +246,6 @@ impl SafeFileWriter {
|
||||
}
|
||||
|
||||
impl Write for SafeFileWriter {
|
||||
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
self.0.write(buf)
|
||||
}
|
||||
@@ -285,51 +264,46 @@ impl Seek for SafeFileWriter {
|
||||
|
||||
|
||||
impl Directory for MmapDirectory {
|
||||
|
||||
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
|
||||
debug!("Open Read {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
|
||||
|
||||
let mut mmap_cache = self.mmap_cache
|
||||
.write()
|
||||
.map_err(|_| OpenReadError::IOError(
|
||||
make_io_err(format!("Failed to acquired write lock on mmap cache while reading {:?}", path))
|
||||
))?;
|
||||
|
||||
Ok(mmap_cache.get_mmap(full_path)?
|
||||
.map(MmapReadOnly::from)
|
||||
.map(ReadOnlySource::Mmap)
|
||||
.unwrap_or(ReadOnlySource::Anonymous(SharedVecSlice::empty()))
|
||||
)
|
||||
|
||||
Ok(mmap_cache
|
||||
.get_mmap(full_path)?
|
||||
.map(MmapReadOnly::from)
|
||||
.map(ReadOnlySource::Mmap)
|
||||
.unwrap_or(ReadOnlySource::Anonymous(SharedVecSlice::empty())))
|
||||
}
|
||||
|
||||
|
||||
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
|
||||
debug!("Open Write {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
|
||||
|
||||
let open_res = OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true)
|
||||
.open(full_path);
|
||||
|
||||
let mut file = try!(
|
||||
open_res.map_err(|err| {
|
||||
if err.kind() == io::ErrorKind::AlreadyExists {
|
||||
OpenWriteError::FileAlreadyExists(PathBuf::from(path))
|
||||
}
|
||||
else {
|
||||
OpenWriteError::IOError(err)
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
|
||||
let mut file = try!(open_res.map_err(|err| if err.kind() ==
|
||||
io::ErrorKind::AlreadyExists {
|
||||
OpenWriteError::FileAlreadyExists(PathBuf::from(path))
|
||||
} else {
|
||||
OpenWriteError::IOError(err)
|
||||
}));
|
||||
|
||||
// making sure the file is created.
|
||||
try!(file.flush());
|
||||
|
||||
|
||||
// Apparently, on some filesystems syncing the parent
|
||||
// directory is required.
|
||||
try!(self.sync_directory());
|
||||
|
||||
|
||||
let writer = SafeFileWriter::new(file);
|
||||
Ok(BufWriter::new(Box::new(writer)))
|
||||
}
|
||||
@@ -347,15 +321,11 @@ impl Directory for MmapDirectory {
|
||||
// when the last reference is gone.
|
||||
mmap_cache.cache.remove(&full_path);
|
||||
match fs::remove_file(&full_path) {
|
||||
Ok(_) => {
|
||||
self.sync_directory()
|
||||
.map_err(|e| DeleteError::IOError(e))
|
||||
}
|
||||
Ok(_) => self.sync_directory().map_err(|e| DeleteError::IOError(e)),
|
||||
Err(e) => {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
Err(DeleteError::FileDoesNotExist(path.to_owned()))
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
Err(DeleteError::IOError(e))
|
||||
}
|
||||
}
|
||||
@@ -379,26 +349,23 @@ impl Directory for MmapDirectory {
|
||||
Err(e) => {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
Err(OpenReadError::FileDoesNotExist(path.to_owned()))
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
Err(OpenReadError::IOError(e))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
|
||||
debug!("Atomic Write {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
let meta_file = atomicwrites::AtomicFile::new(full_path, atomicwrites::AllowOverwrite);
|
||||
try!(meta_file.write(|f| {
|
||||
f.write_all(data)
|
||||
}));
|
||||
try!(meta_file.write(|f| f.write_all(data)));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn box_clone(&self,) -> Box<Directory> {
|
||||
fn box_clone(&self) -> Box<Directory> {
|
||||
Box::new(self.clone())
|
||||
}
|
||||
}
|
||||
@@ -457,9 +424,9 @@ mod tests {
|
||||
}
|
||||
}
|
||||
assert_eq!(mmap_directory.get_cache_info().counters.miss_empty, 10);
|
||||
|
||||
|
||||
{
|
||||
|
||||
|
||||
{
|
||||
// test weak miss
|
||||
// the first pass creates the weak refs.
|
||||
for path in &paths {
|
||||
@@ -475,7 +442,7 @@ mod tests {
|
||||
}
|
||||
|
||||
{
|
||||
let mut saved_readmmaps = vec!();
|
||||
let mut saved_readmmaps = vec![];
|
||||
// Keeps reference alive
|
||||
for (i, path) in paths.iter().enumerate() {
|
||||
let r = mmap_directory.open_read(path).unwrap();
|
||||
@@ -494,7 +461,7 @@ mod tests {
|
||||
}
|
||||
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -31,7 +31,7 @@ pub type WritePtr = BufWriter<Box<SeekableWrite>>;
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use std::path::Path;
|
||||
use std::path::Path;
|
||||
use std::io::{Write, Seek, SeekFrom};
|
||||
|
||||
lazy_static! {
|
||||
@@ -65,7 +65,7 @@ mod tests {
|
||||
assert!(directory.exists(*TEST_PATH));
|
||||
write_file.write_all(&[4]).unwrap();
|
||||
write_file.write_all(&[3]).unwrap();
|
||||
write_file.write_all(&[7,3,5]).unwrap();
|
||||
write_file.write_all(&[7, 3, 5]).unwrap();
|
||||
write_file.flush().unwrap();
|
||||
}
|
||||
let read_file = directory.open_read(*TEST_PATH).unwrap();
|
||||
@@ -81,9 +81,9 @@ mod tests {
|
||||
{
|
||||
{
|
||||
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
|
||||
write_file.write_all(&[4, 3, 7,3,5]).unwrap();
|
||||
write_file.write_all(&[4, 3, 7, 3, 5]).unwrap();
|
||||
write_file.seek(SeekFrom::Start(0)).unwrap();
|
||||
write_file.write_all(&[3,1]).unwrap();
|
||||
write_file.write_all(&[3, 1]).unwrap();
|
||||
write_file.flush().unwrap();
|
||||
}
|
||||
let read_file = directory.open_read(*TEST_PATH).unwrap();
|
||||
@@ -98,7 +98,7 @@ mod tests {
|
||||
{
|
||||
directory.open_write(*TEST_PATH).unwrap();
|
||||
assert!(directory.exists(*TEST_PATH));
|
||||
|
||||
|
||||
}
|
||||
{
|
||||
assert!(directory.open_write(*TEST_PATH).is_err());
|
||||
|
||||
@@ -11,14 +11,14 @@ use directory::WritePtr;
|
||||
use super::shared_vec_slice::SharedVecSlice;
|
||||
|
||||
/// Writer associated with the `RAMDirectory`
|
||||
///
|
||||
///
|
||||
/// The Writer just writes a buffer.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// On drop, if the writer was left in a *dirty* state.
|
||||
/// That is, if flush was not called after the last call
|
||||
/// to write.
|
||||
/// to write.
|
||||
///
|
||||
struct VecWriter {
|
||||
path: PathBuf,
|
||||
@@ -40,8 +40,9 @@ impl VecWriter {
|
||||
|
||||
impl Drop for VecWriter {
|
||||
fn drop(&mut self) {
|
||||
if !self.is_flushed {
|
||||
panic!("You forgot to flush {:?} before its writter got Drop. Do not rely on drop.", self.path)
|
||||
if !self.is_flushed {
|
||||
panic!("You forgot to flush {:?} before its writter got Drop. Do not rely on drop.",
|
||||
self.path)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -61,7 +62,8 @@ impl Write for VecWriter {
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
self.is_flushed = true;
|
||||
try!(self.shared_directory.write(self.path.clone(), self.data.get_ref()));
|
||||
try!(self.shared_directory
|
||||
.write(self.path.clone(), self.data.get_ref()));
|
||||
Ok(())
|
||||
}
|
||||
}
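// Sketch (not part of the diff) of the contract spelled out above: a writer
// obtained from a `RAMDirectory` must be flushed before it is dropped,
// otherwise the backing `VecWriter` panics and the data never reaches the
// directory. The "some_file" path is a placeholder.
fn flush_before_drop_sketch() -> std::io::Result<()> {
    use std::io::Write;
    use std::path::Path;
    let mut directory = RAMDirectory::create();
    let mut writer = directory.open_write(Path::new("some_file")).expect("open_write");
    writer.write_all(&[1u8, 2u8, 3u8])?;
    // Flushing hands the buffered bytes over to the directory; dropping a
    // dirty writer would panic instead.
    writer.flush()?;
    Ok(())
}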
|
||||
@@ -72,22 +74,22 @@ struct InnerDirectory(Arc<RwLock<HashMap<PathBuf, Arc<Vec<u8>>>>>);
|
||||
|
||||
|
||||
impl InnerDirectory {
|
||||
|
||||
fn new() -> InnerDirectory {
|
||||
InnerDirectory(Arc::new(RwLock::new(HashMap::new())))
|
||||
}
|
||||
|
||||
fn write(&self, path: PathBuf, data: &[u8]) -> io::Result<bool> {
|
||||
let mut map = try!(
|
||||
self.0
|
||||
.write()
|
||||
.map_err(|_| make_io_err(format!("Failed to lock the directory, when trying to write {:?}", path)))
|
||||
);
|
||||
let mut map = try!(self.0
|
||||
.write()
|
||||
.map_err(|_| {
|
||||
make_io_err(format!("Failed to lock the directory, when trying to write {:?}",
|
||||
path))
|
||||
}));
|
||||
let prev_value = map.insert(path, Arc::new(Vec::from(data)));
|
||||
Ok(prev_value.is_some())
|
||||
}
|
||||
|
||||
fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
|
||||
fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
|
||||
self.0
|
||||
.read()
|
||||
.map_err(|_| {
|
||||
@@ -129,13 +131,12 @@ impl InnerDirectory {
|
||||
.expect("Failed to get read lock directory.")
|
||||
.contains_key(path)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
impl fmt::Debug for RAMDirectory {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "RAMDirectory")
|
||||
}
|
||||
write!(f, "RAMDirectory")
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -150,12 +151,9 @@ pub struct RAMDirectory {
|
||||
}
|
||||
|
||||
impl RAMDirectory {
|
||||
|
||||
/// Constructor
|
||||
pub fn create() -> RAMDirectory {
|
||||
RAMDirectory {
|
||||
fs: InnerDirectory::new()
|
||||
}
|
||||
RAMDirectory { fs: InnerDirectory::new() }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -163,15 +161,14 @@ impl Directory for RAMDirectory {
|
||||
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
|
||||
self.fs.open_read(path)
|
||||
}
|
||||
|
||||
|
||||
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
|
||||
let path_buf = PathBuf::from(path);
|
||||
let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
|
||||
// force the creation of the file to mimic the MMap directory.
|
||||
if try!(self.fs.write(path_buf.clone(), &Vec::new())) {
|
||||
Err(OpenWriteError::FileAlreadyExists(path_buf))
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
Ok(BufWriter::new(Box::new(vec_writer)))
|
||||
}
|
||||
}
|
||||
@@ -180,15 +177,14 @@ impl Directory for RAMDirectory {
|
||||
self.fs.delete(path)
|
||||
}
|
||||
|
||||
|
||||
|
||||
fn exists(&self, path: &Path) -> bool {
|
||||
self.fs.exists(path)
|
||||
}
|
||||
|
||||
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
|
||||
let read = self.open_read(path)?;
|
||||
Ok(read.as_slice()
|
||||
.to_owned())
|
||||
Ok(read.as_slice().to_owned())
|
||||
}
|
||||
|
||||
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
|
||||
@@ -200,8 +196,7 @@ impl Directory for RAMDirectory {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn box_clone(&self,) -> Box<Directory> {
|
||||
fn box_clone(&self) -> Box<Directory> {
|
||||
Box::new(self.clone())
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ use common::HasLen;
|
||||
|
||||
|
||||
/// Read object that represents files in tantivy.
|
||||
///
|
||||
///
|
||||
/// These read objects are only in charge of delivering
|
||||
/// the data in the form of a constant read-only `&[u8]`.
|
||||
/// Whatever happens to the directory file, the data
|
||||
@@ -13,12 +13,11 @@ use common::HasLen;
|
||||
pub enum ReadOnlySource {
|
||||
/// Mmap source of data
|
||||
Mmap(MmapReadOnly),
|
||||
/// Wrapping a `Vec<u8>`
|
||||
/// Wrapping a `Vec<u8>`
|
||||
Anonymous(SharedVecSlice),
|
||||
}
|
||||
|
||||
impl Deref for ReadOnlySource {
|
||||
|
||||
type Target = [u8];
|
||||
|
||||
fn deref(&self) -> &[u8] {
|
||||
@@ -27,35 +26,30 @@ impl Deref for ReadOnlySource {
|
||||
}
|
||||
|
||||
impl ReadOnlySource {
|
||||
|
||||
/// Creates an empty ReadOnlySource
|
||||
pub fn empty() -> ReadOnlySource {
|
||||
ReadOnlySource::Anonymous(SharedVecSlice::empty())
|
||||
}
|
||||
|
||||
/// Returns the data underlying the ReadOnlySource object.
|
||||
pub fn as_slice(&self,) -> &[u8] {
|
||||
pub fn as_slice(&self) -> &[u8] {
|
||||
match *self {
|
||||
ReadOnlySource::Mmap(ref mmap_read_only) => unsafe {
|
||||
mmap_read_only.as_slice()
|
||||
},
|
||||
ReadOnlySource::Anonymous(ref shared_vec) => {
|
||||
shared_vec.as_slice()
|
||||
},
|
||||
ReadOnlySource::Mmap(ref mmap_read_only) => unsafe { mmap_read_only.as_slice() },
|
||||
ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a ReadOnlySource that is just a
|
||||
/// Creates a ReadOnlySource that is just a
|
||||
/// view over a slice of the data.
|
||||
///
|
||||
///
|
||||
/// Keep in mind that any living slice extends
|
||||
/// the lifetime of the original ReadOnlySource.
|
||||
///
|
||||
///
|
||||
/// For instance, if `ReadOnlySource` wraps 500MB
|
||||
/// worth of data in anonymous memory, and only a
|
||||
/// 1KB slice is remaining, the whole `500MBs`
|
||||
/// 1KB slice is remaining, the whole `500MBs`
|
||||
/// are retained in memory.
|
||||
pub fn slice(&self, from_offset:usize, to_offset:usize) -> ReadOnlySource {
|
||||
pub fn slice(&self, from_offset: usize, to_offset: usize) -> ReadOnlySource {
|
||||
match *self {
|
||||
ReadOnlySource::Mmap(ref mmap_read_only) => {
|
||||
let sliced_mmap = mmap_read_only.range(from_offset, to_offset - from_offset);
|
||||
@@ -63,13 +57,13 @@ impl ReadOnlySource {
|
||||
}
|
||||
ReadOnlySource::Anonymous(ref shared_vec) => {
|
||||
ReadOnlySource::Anonymous(shared_vec.slice(from_offset, to_offset))
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
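// Small sketch of the slicing behaviour described above: the slice only narrows
// the visible window, the whole underlying buffer stays alive. Uses the
// `RAMDirectory` from this changeset; the "data" path is a placeholder.
fn slice_sketch() {
    use std::path::Path;
    let mut directory = RAMDirectory::create();
    directory.atomic_write(Path::new("data"), &[0u8, 1, 2, 3, 4]).unwrap();
    let source = directory.open_read(Path::new("data")).unwrap();
    // Keep only bytes [2, 5): the original 5-byte buffer remains in memory.
    let tail: ReadOnlySource = source.slice(2, 5);
    assert_eq!(tail.as_slice(), &[2u8, 3, 4]);
    assert_eq!(tail.as_slice().len(), 3);
}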
|
||||
|
||||
impl HasLen for ReadOnlySource {
|
||||
fn len(&self,) -> usize {
|
||||
fn len(&self) -> usize {
|
||||
self.as_slice().len()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,12 +4,11 @@ use std::sync::Arc;
|
||||
#[derive(Clone)]
|
||||
pub struct SharedVecSlice {
|
||||
pub data: Arc<Vec<u8>>,
|
||||
pub start: usize,
|
||||
pub len: usize
|
||||
pub start: usize,
|
||||
pub len: usize,
|
||||
}
|
||||
|
||||
impl SharedVecSlice {
|
||||
|
||||
pub fn empty() -> SharedVecSlice {
|
||||
SharedVecSlice::new(Arc::new(Vec::new()))
|
||||
}
|
||||
@@ -23,11 +22,11 @@ impl SharedVecSlice {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn as_slice(&self,) -> &[u8] {
|
||||
pub fn as_slice(&self) -> &[u8] {
|
||||
&self.data[self.start..self.start + self.len]
|
||||
}
|
||||
|
||||
pub fn slice(&self, from_offset: usize, to_offset:usize) -> SharedVecSlice {
|
||||
pub fn slice(&self, from_offset: usize, to_offset: usize) -> SharedVecSlice {
|
||||
SharedVecSlice {
|
||||
data: self.data.clone(),
|
||||
start: self.start + from_offset,
|
||||
|
||||
20
src/error.rs
@@ -38,7 +38,7 @@ pub enum Error {
|
||||
/// An error related to a missing field.
|
||||
SchemaError(String),
|
||||
/// Tried to access a fastfield reader for a field not configured accordingly.
|
||||
FastFieldError(FastFieldNotAvailableError)
|
||||
FastFieldError(FastFieldNotAvailableError),
|
||||
}
|
||||
|
||||
impl From<FastFieldNotAvailableError> for Error {
|
||||
@@ -83,10 +83,8 @@ impl From<schema::DocParsingError> for Error {
|
||||
impl From<OpenWriteError> for Error {
|
||||
fn from(error: OpenWriteError) -> Error {
|
||||
match error {
|
||||
OpenWriteError::FileAlreadyExists(filepath) =>
|
||||
Error::FileAlreadyExists(filepath),
|
||||
OpenWriteError::IOError(io_error) =>
|
||||
Error::IOError(io_error),
|
||||
OpenWriteError::FileAlreadyExists(filepath) => Error::FileAlreadyExists(filepath),
|
||||
OpenWriteError::IOError(io_error) => Error::IOError(io_error),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -94,10 +92,12 @@ impl From<OpenWriteError> for Error {
|
||||
impl From<OpenDirectoryError> for Error {
|
||||
fn from(error: OpenDirectoryError) -> Error {
|
||||
match error {
|
||||
OpenDirectoryError::DoesNotExist(directory_path) =>
|
||||
Error::PathDoesNotExist(directory_path),
|
||||
OpenDirectoryError::NotADirectory(directory_path) =>
|
||||
Error::InvalidArgument(format!("{:?} is not a directory", directory_path)),
|
||||
OpenDirectoryError::DoesNotExist(directory_path) => {
|
||||
Error::PathDoesNotExist(directory_path)
|
||||
}
|
||||
OpenDirectoryError::NotADirectory(directory_path) => {
|
||||
Error::InvalidArgument(format!("{:?} is not a directory", directory_path))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -106,4 +106,4 @@ impl From<serde_json::Error> for Error {
|
||||
fn from(error: serde_json::Error) -> Error {
|
||||
Error::IOError(error.into())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,8 +21,7 @@ pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io:
|
||||
writer.write_all(&[byte])?;
|
||||
shift = 0;
|
||||
byte = 0;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
shift += 1;
|
||||
}
|
||||
}
|
||||
@@ -36,15 +35,14 @@ pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io:
|
||||
#[derive(Clone)]
|
||||
pub struct DeleteBitSet {
|
||||
data: ReadOnlySource,
|
||||
len: usize,
|
||||
len: usize,
|
||||
}
|
||||
|
||||
|
||||
impl DeleteBitSet {
|
||||
/// Opens a delete bitset given its data source.
|
||||
pub fn open(data: ReadOnlySource) -> DeleteBitSet {
|
||||
let num_deleted: usize = data
|
||||
.as_slice()
|
||||
let num_deleted: usize = data.as_slice()
|
||||
.iter()
|
||||
.map(|b| b.count_ones() as usize)
|
||||
.sum();
|
||||
@@ -71,15 +69,13 @@ impl DeleteBitSet {
|
||||
pub fn is_deleted(&self, doc: DocId) -> bool {
|
||||
if self.len == 0 {
|
||||
false
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
let byte_offset = doc / 8u32;
|
||||
let b: u8 = (*self.data)[byte_offset as usize];
|
||||
let shift = (doc & 7u32) as u8;
|
||||
b & (1u8 << shift) != 0
|
||||
b & (1u8 << shift) != 0
|
||||
}
|
||||
}
|
||||
|
||||
}
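// Worked example of the bit layout used by `is_deleted` above: documents are
// packed eight per byte, least-significant bit first. `is_deleted_in` is a
// stand-alone helper written for illustration, not part of the diff.
fn is_deleted_in(bitset_bytes: &[u8], doc: u32) -> bool {
    let byte_offset = (doc / 8) as usize;
    let shift = (doc & 7) as u8;
    bitset_bytes[byte_offset] & (1u8 << shift) != 0
}
// With docs 1 and 9 deleted, the serialized bitset is [0b0000_0010, 0b0000_0010]:
// is_deleted_in(&[0b10, 0b10], 1) == true
// is_deleted_in(&[0b10, 0b10], 9) == true
// is_deleted_in(&[0b10, 0b10], 0) == false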
|
||||
|
||||
|
||||
@@ -132,4 +128,4 @@ mod tests {
|
||||
test_delete_bitset_helper(&bitset);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::result;
|
||||
use schema::FieldEntry;
|
||||
|
||||
/// FastFieldNotAvailableError is returned when the
|
||||
/// FastFieldNotAvailableError is returned when the
|
||||
/// user requested a fast field reader, and the field was not
|
||||
/// defined in the schema as a fast field.
|
||||
#[derive(Debug)]
|
||||
@@ -10,17 +10,14 @@ pub struct FastFieldNotAvailableError {
|
||||
}
|
||||
|
||||
impl FastFieldNotAvailableError {
|
||||
|
||||
/// Creates a `FastFieldNotAvailable` error.
|
||||
/// `field_entry` is the configuration of the field
|
||||
/// `field_entry` is the configuration of the field
|
||||
/// for which fast fields are not available.
|
||||
pub fn new(field_entry: &FieldEntry) -> FastFieldNotAvailableError {
|
||||
FastFieldNotAvailableError {
|
||||
field_name: field_entry.name().to_string(),
|
||||
}
|
||||
FastFieldNotAvailableError { field_name: field_entry.name().to_string() }
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Result when trying to access a fast field reader.
|
||||
pub type Result<R> = result::Result<R, FastFieldNotAvailableError>;
|
||||
pub type Result<R> = result::Result<R, FastFieldNotAvailableError>;
|
||||
|
||||
@@ -3,20 +3,20 @@
|
||||
//! Fast fields are the equivalent of `DocValues` in `Lucene`.
|
||||
//! Fast fields are the non-compressed, column-oriented storage
|
||||
//! of `tantivy`.
|
||||
//!
|
||||
//!
|
||||
//! It is designed for the fast random access of some document
|
||||
//! fields given a document id.
|
||||
//!
|
||||
//! `FastField`s are useful when a field is required for all or most of
|
||||
//! the `DocSet`: for instance for scoring, grouping, filtering, or faceting.
|
||||
//!
|
||||
//!
|
||||
//!
|
||||
//!
|
||||
//! Fields have to be declared as `FAST` in the schema.
|
||||
//! Currently only 64-bits integers (signed or unsigned) are
|
||||
//! supported.
|
||||
//!
|
||||
//! They are stored in a bitpacked fashion so that their
|
||||
//! memory usage is directly linear with the amplitude of the
|
||||
//! memory usage is directly linear with the amplitude of the
|
||||
//! values stored.
|
||||
//!
|
||||
//! Read access performance is comparable to that of an array lookup.
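// Back-of-the-envelope illustration of the bit-packing described above, with
// made-up numbers: for the values [80_000, 80_002, 80_007] the minimum is
// 80_000 and the amplitude is 7, so each value can be stored as a 3-bit delta
// from the minimum instead of a full 64-bit integer (ignoring the small
// header). `bits_per_value` is a helper written only for this example.
fn bits_per_value(values: &[u64]) -> u32 {
    let min = values.iter().cloned().min().unwrap_or(0);
    let max = values.iter().cloned().max().unwrap_or(0);
    let amplitude = max - min;
    // number of bits needed to represent the amplitude
    64 - amplitude.leading_zeros()
}
// bits_per_value(&[80_000, 80_002, 80_007]) == 3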
|
||||
@@ -67,13 +67,13 @@ mod tests {
|
||||
doc.add_u64(field, value);
|
||||
fast_field_writers.add_document(&doc);
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
pub fn test_fastfield() {
|
||||
let test_fastfield = U64FastFieldReader::from(vec!(100,200,300));
|
||||
let test_fastfield = U64FastFieldReader::from(vec![100, 200, 300]);
|
||||
assert_eq!(test_fastfield.get(0), 100);
|
||||
assert_eq!(test_fastfield.get(1), 200);
|
||||
assert_eq!(test_fastfield.get(2), 300);
|
||||
assert_eq!(test_fastfield.get(2), 300);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -96,7 +96,8 @@ mod tests {
|
||||
}
|
||||
{
|
||||
let fast_field_readers = FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader =
|
||||
fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 13u64);
|
||||
assert_eq!(fast_field_reader.get(1), 14u64);
|
||||
assert_eq!(fast_field_reader.get(2), 2u64);
|
||||
@@ -129,7 +130,8 @@ mod tests {
|
||||
}
|
||||
{
|
||||
let fast_field_readers = FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader =
|
||||
fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 4u64);
|
||||
assert_eq!(fast_field_reader.get(1), 14_082_001u64);
|
||||
assert_eq!(fast_field_reader.get(2), 3_052u64);
|
||||
@@ -141,9 +143,9 @@ mod tests {
|
||||
assert_eq!(fast_field_reader.get(8), 215u64);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_intfastfield_null_amplitude() {
|
||||
|
||||
#[test]
|
||||
fn test_intfastfield_null_amplitude() {
|
||||
let path = Path::new("test");
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
|
||||
@@ -164,18 +166,19 @@ mod tests {
|
||||
}
|
||||
{
|
||||
let fast_field_readers = FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader =
|
||||
fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
for doc in 0..10_000 {
|
||||
assert_eq!(fast_field_reader.get(doc), 100_000u64);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_intfastfield_large_numbers() {
|
||||
#[test]
|
||||
fn test_intfastfield_large_numbers() {
|
||||
let path = Path::new("test");
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
|
||||
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
@@ -183,7 +186,9 @@ mod tests {
|
||||
// forcing the amplitude to be high
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 0u64);
|
||||
for i in 0u64..10_000u64 {
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 5_000_000_000_000_000_000u64 + i);
|
||||
add_single_field_doc(&mut fast_field_writers,
|
||||
*FIELD,
|
||||
5_000_000_000_000_000_000u64 + i);
|
||||
}
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
serializer.close().unwrap();
|
||||
@@ -194,10 +199,12 @@ mod tests {
|
||||
}
|
||||
{
|
||||
let fast_field_readers = FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader =
|
||||
fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 0u64);
|
||||
for doc in 1..10_001 {
|
||||
assert_eq!(fast_field_reader.get(doc), 5_000_000_000_000_000_000u64 + doc as u64 - 1u64);
|
||||
assert_eq!(fast_field_reader.get(doc),
|
||||
5_000_000_000_000_000_000u64 + doc as u64 - 1u64);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -229,7 +236,8 @@ mod tests {
|
||||
}
|
||||
{
|
||||
let fast_field_readers = FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader: I64FastFieldReader = fast_field_readers.open_reader(i64_field).unwrap();
|
||||
let fast_field_reader: I64FastFieldReader =
|
||||
fast_field_readers.open_reader(i64_field).unwrap();
|
||||
assert_eq!(fast_field_reader.min_value(), -100i64);
|
||||
assert_eq!(fast_field_reader.max_value(), 9_999i64);
|
||||
for (doc, i) in (-100i64..10_000i64).enumerate() {
|
||||
@@ -255,11 +263,12 @@ mod tests {
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
|
||||
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
let fast_field_readers = FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader: I64FastFieldReader = fast_field_readers.open_reader(i64_field).unwrap();
|
||||
let fast_field_reader: I64FastFieldReader =
|
||||
fast_field_readers.open_reader(i64_field).unwrap();
|
||||
assert_eq!(fast_field_reader.get(0u32), 0i64);
|
||||
}
|
||||
}
|
||||
@@ -291,10 +300,14 @@ mod tests {
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
let fast_field_readers = FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader =
|
||||
fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
let mut a = 0u64;
|
||||
for _ in 0..n {
|
||||
println!("i {}=> {} {}", a, fast_field_reader.get(a as u32), permutation[a as usize]);
|
||||
println!("i {}=> {} {}",
|
||||
a,
|
||||
fast_field_reader.get(a as u32),
|
||||
permutation[a as usize]);
|
||||
assert_eq!(fast_field_reader.get(a as u32), permutation[a as usize]);
|
||||
a = fast_field_reader.get(a as u32);
|
||||
}
|
||||
@@ -305,26 +318,26 @@ mod tests {
|
||||
fn bench_intfastfield_linear_veclookup(b: &mut Bencher) {
|
||||
let permutation = generate_permutation();
|
||||
b.iter(|| {
|
||||
let n = test::black_box(7000u32);
|
||||
let mut a = 0u64;
|
||||
for i in (0u32..n).step_by(7) {
|
||||
a ^= permutation[i as usize];
|
||||
}
|
||||
a
|
||||
});
|
||||
let n = test::black_box(7000u32);
|
||||
let mut a = 0u64;
|
||||
for i in (0u32..n).step_by(7) {
|
||||
a ^= permutation[i as usize];
|
||||
}
|
||||
a
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_intfastfield_veclookup(b: &mut Bencher) {
|
||||
let permutation = generate_permutation();
|
||||
b.iter(|| {
|
||||
let n = test::black_box(1000u32);
|
||||
let mut a = 0u64;
|
||||
for _ in 0u32..n {
|
||||
a = permutation[a as usize];
|
||||
}
|
||||
a
|
||||
});
|
||||
let n = test::black_box(1000u32);
|
||||
let mut a = 0u64;
|
||||
for _ in 0u32..n {
|
||||
a = permutation[a as usize];
|
||||
}
|
||||
a
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
@@ -345,15 +358,16 @@ mod tests {
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
let fast_field_readers = FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader =
|
||||
fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
b.iter(|| {
|
||||
let n = test::black_box(7000u32);
|
||||
let mut a = 0u64;
|
||||
for i in (0u32..n).step_by(7) {
|
||||
a ^= fast_field_reader.get(i);
|
||||
}
|
||||
a
|
||||
});
|
||||
let n = test::black_box(7000u32);
|
||||
let mut a = 0u64;
|
||||
for i in (0u32..n).step_by(7) {
|
||||
a ^= fast_field_reader.get(i);
|
||||
}
|
||||
a
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -375,15 +389,16 @@ mod tests {
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
let fast_field_readers = FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader =
|
||||
fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
b.iter(|| {
|
||||
let n = test::black_box(1000u32);
|
||||
let mut a = 0u32;
|
||||
for _ in 0u32..n {
|
||||
a = fast_field_reader.get(a) as u32;
|
||||
}
|
||||
a
|
||||
});
|
||||
let n = test::black_box(1000u32);
|
||||
let mut a = 0u32;
|
||||
for _ in 0u32..n {
|
||||
a = fast_field_reader.get(a) as u32;
|
||||
}
|
||||
a
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,7 +19,6 @@ use common;
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
pub trait FastFieldReader: Sized {
|
||||
|
||||
/// Type of the value stored in the fastfield.
|
||||
type ValueType;
|
||||
|
||||
@@ -33,7 +32,7 @@ pub trait FastFieldReader: Sized {
|
||||
fn open(source: ReadOnlySource) -> Self;
|
||||
|
||||
/// Returns true iff the given field_type makes
|
||||
/// it possible to access the field values via a
|
||||
/// it possible to access the field values via a
|
||||
/// fastfield.
|
||||
fn is_enabled(field_type: &FieldType) -> bool;
|
||||
}
|
||||
@@ -47,37 +46,35 @@ pub struct U64FastFieldReader {
|
||||
}
|
||||
|
||||
impl U64FastFieldReader {
|
||||
|
||||
/// Returns the minimum value for this fast field.
|
||||
///
|
||||
/// The min value does not take into account possible
|
||||
/// deleted documents, and should be considered a lower bound
|
||||
/// deleted documents, and should be considered a lower bound
|
||||
/// of the actual minimum value.
|
||||
pub fn min_value(&self,) -> u64 {
|
||||
pub fn min_value(&self) -> u64 {
|
||||
self.min_value
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the maximum value for this fast field.
|
||||
///
|
||||
/// The max value does not take into account possible
|
||||
/// deleted documents, and should be considered an upper bound
|
||||
/// deleted documents, and should be considered an upper bound
|
||||
/// of the actual maximum value.
|
||||
pub fn max_value(&self,) -> u64 {
|
||||
pub fn max_value(&self) -> u64 {
|
||||
self.max_value
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldReader for U64FastFieldReader {
|
||||
type ValueType = u64;
|
||||
|
||||
|
||||
fn get(&self, doc: DocId) -> u64 {
|
||||
self.min_value + self.bit_unpacker.get(doc as usize)
|
||||
}
|
||||
|
||||
fn is_enabled(field_type: &FieldType) -> bool {
|
||||
match field_type {
|
||||
&FieldType::U64(ref integer_options) =>
|
||||
integer_options.is_fast(),
|
||||
&FieldType::U64(ref integer_options) => integer_options.is_fast(),
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
@@ -90,11 +87,13 @@ impl FastFieldReader for U64FastFieldReader {
|
||||
let min_value: u64;
|
||||
let max_value: u64;
|
||||
let bit_unpacker: BitUnpacker;
|
||||
|
||||
|
||||
{
|
||||
let mut cursor: &[u8] = data.as_slice();
|
||||
min_value = u64::deserialize(&mut cursor).expect("Failed to read the min_value of fast field.");
|
||||
let amplitude = u64::deserialize(&mut cursor).expect("Failed to read the amplitude of fast field.");
|
||||
min_value = u64::deserialize(&mut cursor)
|
||||
.expect("Failed to read the min_value of fast field.");
|
||||
let amplitude = u64::deserialize(&mut cursor)
|
||||
.expect("Failed to read the amplitude of fast field.");
|
||||
max_value = min_value + amplitude;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
bit_unpacker = BitUnpacker::new(cursor, num_bits as usize)
|
||||
@@ -107,7 +106,6 @@ impl FastFieldReader for U64FastFieldReader {
|
||||
max_value: max_value,
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -132,7 +130,7 @@ impl From<Vec<u64>> for U64FastFieldReader {
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
let fast_field_readers = FastFieldsReader::open(source).unwrap();
|
||||
fast_field_readers.open_reader(field).unwrap()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// FastFieldReader for signed 64-bits integers.
|
||||
@@ -144,37 +142,35 @@ impl I64FastFieldReader {
|
||||
/// Returns the minimum value for this fast field.
|
||||
///
|
||||
/// The min value does not take into account possible
|
||||
/// deleted documents, and should be considered a lower bound
|
||||
/// deleted documents, and should be considered a lower bound
|
||||
/// of the actual minimum value.
|
||||
pub fn min_value(&self,) -> i64 {
|
||||
pub fn min_value(&self) -> i64 {
|
||||
common::u64_to_i64(self.underlying.min_value())
|
||||
}
|
||||
|
||||
/// Returns the maximum value for this fast field.
|
||||
///
|
||||
/// The max value does not take into account possible
|
||||
/// deleted documents, and should be considered an upper bound
|
||||
/// deleted documents, and should be considered an upper bound
|
||||
/// of the actual maximum value.
|
||||
pub fn max_value(&self,) -> i64 {
|
||||
pub fn max_value(&self) -> i64 {
|
||||
common::u64_to_i64(self.underlying.max_value())
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldReader for I64FastFieldReader {
|
||||
type ValueType = i64;
|
||||
|
||||
|
||||
fn get(&self, doc: DocId) -> i64 {
|
||||
common::u64_to_i64(self.underlying.get(doc))
|
||||
}
|
||||
|
||||
|
||||
/// Opens a new fast field reader given a read only source.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if the data is corrupted.
|
||||
fn open(data: ReadOnlySource) -> I64FastFieldReader {
|
||||
I64FastFieldReader {
|
||||
underlying: U64FastFieldReader::open(data)
|
||||
}
|
||||
I64FastFieldReader { underlying: U64FastFieldReader::open(data) }
|
||||
}
|
||||
|
||||
fn is_enabled(field_type: &FieldType) -> bool {
|
||||
@@ -182,15 +178,13 @@ impl FastFieldReader for I64FastFieldReader {
|
||||
&FieldType::I64(ref integer_options) => {
|
||||
if integer_options.is_fast() {
|
||||
true
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
false
|
||||
}
|
||||
},
|
||||
}
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -198,7 +192,7 @@ impl FastFieldReader for I64FastFieldReader {
|
||||
/// The `FastFieldsReader` is the data structure containing
|
||||
/// all of the fast fields' data.
|
||||
///
|
||||
/// It contains a mapping that associates these fields to
|
||||
/// It contains a mapping that associates these fields to
|
||||
/// the proper slice in the fastfield reader file.
|
||||
pub struct FastFieldsReader {
|
||||
source: ReadOnlySource,
|
||||
@@ -206,11 +200,10 @@ pub struct FastFieldsReader {
|
||||
}
|
||||
|
||||
impl FastFieldsReader {
|
||||
|
||||
/// Opens the `FastFieldsReader` file
|
||||
///
|
||||
/// When opening the fast field reader,
|
||||
/// the list of offsets is read (as a footer of the
|
||||
/// the list of offsets is read (as a footer of the
|
||||
/// data file).
|
||||
pub fn open(source: ReadOnlySource) -> io::Result<FastFieldsReader> {
|
||||
let header_offset;
|
||||
@@ -223,23 +216,21 @@ impl FastFieldsReader {
|
||||
}
|
||||
{
|
||||
let mut cursor = &buffer[header_offset as usize..];
|
||||
field_offsets = Vec::deserialize(&mut cursor)?;
|
||||
field_offsets = Vec::deserialize(&mut cursor)?;
|
||||
}
|
||||
}
|
||||
let mut end_offsets: Vec<u32> = field_offsets
|
||||
.iter()
|
||||
.map(|&(_, offset)| offset)
|
||||
.collect();
|
||||
let mut end_offsets: Vec<u32> = field_offsets.iter().map(|&(_, offset)| offset).collect();
|
||||
end_offsets.push(header_offset);
|
||||
let mut field_offsets_map: HashMap<Field, (u32, u32)> = HashMap::new();
|
||||
for (field_start_offsets, stop_offset) in field_offsets.iter().zip(end_offsets.iter().skip(1)) {
|
||||
for (field_start_offsets, stop_offset) in
|
||||
field_offsets.iter().zip(end_offsets.iter().skip(1)) {
|
||||
let (field, start_offset) = *field_start_offsets;
|
||||
field_offsets_map.insert(field, (start_offset, *stop_offset));
|
||||
}
|
||||
Ok(FastFieldsReader {
|
||||
field_offsets: field_offsets_map,
|
||||
source: source,
|
||||
})
|
||||
field_offsets: field_offsets_map,
|
||||
source: source,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns the u64 fast value reader if the field
|
||||
@@ -254,8 +245,8 @@ impl FastFieldsReader {
|
||||
self.field_offsets
|
||||
.get(&field)
|
||||
.map(|&(start, stop)| {
|
||||
let field_source = self.source.slice(start as usize, stop as usize);
|
||||
FFReader::open(field_source)
|
||||
})
|
||||
let field_source = self.source.slice(start as usize, stop as usize);
|
||||
FFReader::open(field_source)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,9 +7,9 @@ use std::io::{self, Write, Seek, SeekFrom};
|
||||
|
||||
/// `FastFieldSerializer` is in charge of serializing
|
||||
/// fastfields on disk.
|
||||
///
|
||||
///
|
||||
/// Fast fields are encoded using bit-packing.
|
||||
///
|
||||
///
|
||||
/// `FastFieldWriter`s are in charge of pushing the data to
|
||||
/// the serializer.
|
||||
/// The serializer expects to receive the following calls.
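// Hedged sketch of the call sequence the serializer expects, using only the
// signatures visible in this diff (`new`, `new_u64_fast_field`, `add_val`,
// `close_field`, `close`); the field and the values are placeholders.
fn serialize_one_field_sketch(write: WritePtr, field: Field) -> std::io::Result<usize> {
    let mut serializer = FastFieldSerializer::new(write)?;
    // Declare the field along with its min and max values.
    serializer.new_u64_fast_field(field, 1u64, 3u64)?;
    // Push each document value, in doc id order.
    for val in &[1u64, 3u64, 2u64] {
        serializer.add_val(*val)?;
    }
    serializer.close_field()?;
    // `close` consumes the serializer and returns the number of bytes written.
    serializer.close()
}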
|
||||
@@ -41,17 +41,21 @@ impl FastFieldSerializer {
|
||||
// just making room for the pointer to header.
|
||||
let written_size: usize = try!(0u32.serialize(&mut write));
|
||||
Ok(FastFieldSerializer {
|
||||
write: write,
|
||||
written_size: written_size,
|
||||
fields: Vec::new(),
|
||||
min_value: 0,
|
||||
field_open: false,
|
||||
bit_packer: BitPacker::new(0),
|
||||
})
|
||||
write: write,
|
||||
written_size: written_size,
|
||||
fields: Vec::new(),
|
||||
min_value: 0,
|
||||
field_open: false,
|
||||
bit_packer: BitPacker::new(0),
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
/// Start serializing a new u64 fast field
|
||||
pub fn new_u64_fast_field(&mut self, field: Field, min_value: u64, max_value: u64) -> io::Result<()> {
|
||||
pub fn new_u64_fast_field(&mut self,
|
||||
field: Field,
|
||||
min_value: u64,
|
||||
max_value: u64)
|
||||
-> io::Result<()> {
|
||||
if self.field_open {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "Previous field not closed"));
|
||||
}
|
||||
@@ -68,15 +72,15 @@ impl FastFieldSerializer {
|
||||
}
|
||||
|
||||
|
||||
/// Pushes a new value to the currently open u64 fast field.
|
||||
/// Pushes a new value to the currently open u64 fast field.
|
||||
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
|
||||
let val_to_write: u64 = val - self.min_value;
|
||||
self.bit_packer.write(val_to_write, &mut self.write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Close the u64 fast field.
|
||||
pub fn close_field(&mut self,) -> io::Result<()> {
|
||||
|
||||
/// Close the u64 fast field.
|
||||
pub fn close_field(&mut self) -> io::Result<()> {
|
||||
if !self.field_open {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "Current field is already closed"));
|
||||
}
|
||||
@@ -87,12 +91,12 @@ impl FastFieldSerializer {
|
||||
self.written_size += self.bit_packer.close(&mut self.write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/// Closes the serializer
|
||||
///
|
||||
///
|
||||
/// After this call the data must be persistently save on disk.
|
||||
pub fn close(mut self,) -> io::Result<usize> {
|
||||
pub fn close(mut self) -> io::Result<usize> {
|
||||
if self.field_open {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "Last field not closed"));
|
||||
}
|
||||
|
||||
@@ -12,7 +12,6 @@ pub struct FastFieldsWriter {
|
||||
}
|
||||
|
||||
impl FastFieldsWriter {
|
||||
|
||||
/// Create all `FastFieldWriter` required by the schema.
|
||||
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
|
||||
let field_writers: Vec<IntFastFieldWriter> = schema
|
||||
@@ -27,40 +26,33 @@ impl FastFieldsWriter {
|
||||
let mut fast_field_writer = IntFastFieldWriter::new(field);
|
||||
fast_field_writer.set_val_if_missing(common::i64_to_u64(0i64));
|
||||
Some(fast_field_writer)
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
&FieldType::U64(ref int_options) => {
|
||||
if int_options.is_fast() {
|
||||
Some(IntFastFieldWriter::new(field))
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
_ => None
|
||||
_ => None,
|
||||
}
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
FastFieldsWriter {
|
||||
field_writers: field_writers,
|
||||
}
|
||||
FastFieldsWriter { field_writers: field_writers }
|
||||
}
|
||||
|
||||
/// Returns a `FastFieldsWriter`
|
||||
/// with a `IntFastFieldWriter` for each
|
||||
|
||||
/// Returns a `FastFieldsWriter`
|
||||
/// with a `IntFastFieldWriter` for each
|
||||
/// of the field given in argument.
|
||||
pub fn new(fields: Vec<Field>) -> FastFieldsWriter {
|
||||
FastFieldsWriter {
|
||||
field_writers: fields
|
||||
.into_iter()
|
||||
.map(IntFastFieldWriter::new)
|
||||
.collect(),
|
||||
field_writers: fields.into_iter().map(IntFastFieldWriter::new).collect(),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Get the `FastFieldWriter` associated to a field.
|
||||
pub fn get_field_writer(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
|
||||
// TODO optimize
|
||||
@@ -68,7 +60,7 @@ impl FastFieldsWriter {
|
||||
.iter_mut()
|
||||
.find(|field_writer| field_writer.field == field)
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Indexes all of the fastfields of a new document.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
@@ -77,7 +69,7 @@ impl FastFieldsWriter {
|
||||
}
|
||||
}
|
||||
|
||||
/// Serializes all of the `FastFieldWriter`s by pushing them in
|
||||
/// Serializes all of the `FastFieldWriter`s by pushing them in
|
||||
/// order to the fast field serializer.
|
||||
pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
|
||||
for field_writer in &self.field_writers {
|
||||
@@ -85,10 +77,10 @@ impl FastFieldsWriter {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Ensures all of the fast field writers have
|
||||
/// reached `doc`. (included)
|
||||
///
|
||||
///
|
||||
/// The missing values will be filled with 0.
|
||||
pub fn fill_val_up_to(&mut self, doc: DocId) {
|
||||
for field_writer in &mut self.field_writers {
|
||||
@@ -99,16 +91,16 @@ impl FastFieldsWriter {

/// Fast field writer for ints.
/// The fast field writer just keeps the values in memory.
///
///
/// Only when the segment writer can be closed and
/// persisted on disc, the fast field writer is
/// persisted on disc, the fast field writer is
/// sent to a `FastFieldSerializer` via the `.serialize(...)`
/// method.
///
/// We cannot serialize earlier as the values are
/// bitpacked and the number of bits required for bitpacking
/// We cannot serialize earlier as the values are
/// bitpacked and the number of bits required for bitpacking
/// can only been known once we have seen all of the values.
///
///
/// Both u64, and i64 use the same writer.
/// i64 are just remapped to the `0..2^64 - 1`
/// using `common::i64_to_u64`.
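For illustration, a sketch of one standard order-preserving i64-to-u64 mapping; the actual `common::i64_to_u64` may be implemented differently. Flipping the sign bit sends i64::MIN to 0 and i64::MAX to u64::MAX, so comparisons on the packed values still order the original signed values correctly:

// Sketch only: not necessarily the mapping tantivy uses internally.
fn i64_to_u64_sketch(val: i64) -> u64 {
    (val as u64) ^ (1u64 << 63)
}

fn u64_to_i64_sketch(val: u64) -> i64 {
    (val ^ (1u64 << 63)) as i64
}

fn main() {
    assert_eq!(i64_to_u64_sketch(i64::min_value()), 0u64);
    assert_eq!(i64_to_u64_sketch(0), 1u64 << 63);
    assert_eq!(i64_to_u64_sketch(i64::max_value()), u64::max_value());
    // order is preserved
    assert!(i64_to_u64_sketch(-1) < i64_to_u64_sketch(1));
    // and the mapping is reversible
    assert_eq!(u64_to_i64_sketch(i64_to_u64_sketch(-42)), -42);
}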
@@ -119,7 +111,6 @@ pub struct IntFastFieldWriter {
|
||||
}
|
||||
|
||||
impl IntFastFieldWriter {
|
||||
|
||||
/// Creates a new `IntFastFieldWriter`
|
||||
pub fn new(field: Field) -> IntFastFieldWriter {
|
||||
IntFastFieldWriter {
|
||||
@@ -128,10 +119,10 @@ impl IntFastFieldWriter {
|
||||
val_if_missing: 0u64,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Sets the default value.
|
||||
///
|
||||
/// This default value is recorded for documents if
|
||||
/// This default value is recorded for documents if
|
||||
/// a document does not have any value.
|
||||
fn set_val_if_missing(&mut self, val_if_missing: u64) {
|
||||
self.val_if_missing = val_if_missing;
|
||||
@@ -139,7 +130,7 @@ impl IntFastFieldWriter {
|
||||
|
||||
/// Ensures all of the fast field writer have
|
||||
/// reached `doc`. (included)
|
||||
///
|
||||
///
|
||||
/// The missing values will be filled with 0.
|
||||
fn fill_val_up_to(&mut self, doc: DocId) {
|
||||
let target = doc as usize + 1;
|
||||
@@ -158,9 +149,9 @@ impl IntFastFieldWriter {
|
||||
pub fn add_val(&mut self, val: u64) {
|
||||
self.vals.push(val);
|
||||
}
|
||||
|
||||
|
||||
/// Extract the value associated to the fast field for
|
||||
|
||||
/// Extract the value associated to the fast field for
|
||||
/// this document.
|
||||
///
|
||||
/// i64 are remapped to u64 using the logic
|
||||
@@ -174,14 +165,12 @@ impl IntFastFieldWriter {
|
||||
match doc.get_first(self.field) {
|
||||
Some(v) => {
|
||||
match *v {
|
||||
Value::U64(ref val) => { *val },
|
||||
Value::U64(ref val) => *val,
|
||||
Value::I64(ref val) => common::i64_to_u64(*val),
|
||||
_ => { panic!("Expected a u64field, got {:?} ", v) }
|
||||
_ => panic!("Expected a u64field, got {:?} ", v),
|
||||
}
|
||||
},
|
||||
None => {
|
||||
self.val_if_missing
|
||||
}
|
||||
}
|
||||
None => self.val_if_missing,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -204,8 +193,3 @@ impl IntFastFieldWriter {
|
||||
serializer.close_field()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ fn test_indexing() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
|
||||
let id_field = schema_builder.add_u64_field("id", INT_INDEXED);
|
||||
let multiples_field = schema_builder.add_u64_field("multiples", INT_INDEXED);
|
||||
let multiples_field = schema_builder.add_u64_field("multiples", INT_INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_from_tempdir(schema).unwrap();
|
||||
@@ -41,14 +41,11 @@ fn test_indexing() {
|
||||
let searcher = index.searcher();
|
||||
// check that everything is correct.
|
||||
check_index_content(&searcher, &committed_docs);
|
||||
}
|
||||
else {
|
||||
if committed_docs.remove(&random_val) ||
|
||||
uncommitted_docs.remove(&random_val) {
|
||||
} else {
|
||||
if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
|
||||
let doc_id_term = Term::from_field_u64(id_field, random_val);
|
||||
index_writer.delete_term(doc_id_term);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
uncommitted_docs.insert(random_val);
|
||||
let mut doc = Document::new();
|
||||
doc.add_u64(id_field, random_val);
|
||||
|
||||
@@ -6,17 +6,17 @@ use std::ops::DerefMut;

// The DeleteQueue is similar in conceptually to a multiple
// consumer single producer broadcast channel.
//
//
// All consumer will receive all messages.
//
//
// Consumer of the delete queue are holding a `DeleteCursor`,
// which points to a specific place of the `DeleteQueue`.
//
//
// New consumer can be created in two ways
// - calling `delete_queue.cursor()` returns a cursor, that
// - calling `delete_queue.cursor()` returns a cursor, that
// will include all future delete operation (and no past operations).
// - cloning an existing cursor returns a new cursor, that
// is at the exact same position, and can now advance independantly
// is at the exact same position, and can now advance independantly
// from the original cursor.
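A minimal standalone sketch of that broadcast idea, assuming nothing about the real `DeleteQueue`/`DeleteCursor` internals (all names here are hypothetical): an append-only log behind an Arc, plus cursors that each remember how far they have read.

use std::sync::{Arc, RwLock};

#[derive(Clone, Default)]
struct BroadcastLog<T: Clone> {
    inner: Arc<RwLock<Vec<T>>>,
}

struct Cursor<T: Clone> {
    log: BroadcastLog<T>,
    pos: usize,
}

impl<T: Clone> BroadcastLog<T> {
    fn push(&self, item: T) {
        self.inner.write().unwrap().push(item);
    }
    // A new cursor only sees operations pushed after its creation.
    fn cursor(&self) -> Cursor<T> {
        let pos = self.inner.read().unwrap().len();
        Cursor { log: self.clone(), pos: pos }
    }
}

impl<T: Clone> Cursor<T> {
    fn next(&mut self) -> Option<T> {
        let item = self.log.inner.read().unwrap().get(self.pos).cloned();
        if item.is_some() {
            self.pos += 1;
        }
        item
    }
}

fn main() {
    let queue: BroadcastLog<u64> = BroadcastLog::default();
    queue.push(1);
    let mut cursor = queue.cursor(); // does not see operation 1
    queue.push(2);
    queue.push(3);
    assert_eq!(cursor.next(), Some(2));
    assert_eq!(cursor.next(), Some(3));
    assert_eq!(cursor.next(), None);
}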
#[derive(Default)]
|
||||
struct InnerDeleteQueue {
|
||||
@@ -31,32 +31,27 @@ pub struct DeleteQueue {
|
||||
|
||||
|
||||
impl DeleteQueue {
|
||||
|
||||
// Creates a new delete queue.
|
||||
pub fn new() -> DeleteQueue {
|
||||
|
||||
let delete_queue = DeleteQueue {
|
||||
inner: Arc::default(),
|
||||
};
|
||||
|
||||
|
||||
let delete_queue = DeleteQueue { inner: Arc::default() };
|
||||
|
||||
let next_block = NextBlock::from(delete_queue.clone());
|
||||
{
|
||||
let mut delete_queue_wlock = delete_queue.inner.write().unwrap();
|
||||
delete_queue_wlock.last_block = Some(
|
||||
Arc::new(Block {
|
||||
operations: Arc::default(),
|
||||
next: next_block,
|
||||
})
|
||||
);
|
||||
delete_queue_wlock.last_block = Some(Arc::new(Block {
|
||||
operations: Arc::default(),
|
||||
next: next_block,
|
||||
}));
|
||||
}
|
||||
|
||||
delete_queue
|
||||
}
|
||||
|
||||
|
||||
// Creates a new cursor that makes it possible to
|
||||
|
||||
// Creates a new cursor that makes it possible to
|
||||
// consume future delete operations.
|
||||
//
|
||||
//
|
||||
// Past delete operations are not accessible.
|
||||
pub fn cursor(&self) -> DeleteCursor {
|
||||
let last_block = self.inner
|
||||
@@ -85,40 +80,37 @@ impl DeleteQueue {
|
||||
|
||||
// DeleteQueue is a linked list of blocks of
|
||||
// delete operations.
|
||||
//
|
||||
//
|
||||
// Writing happens by simply appending to a vec.
|
||||
// `.flush()` takes this pending delete operations vec
|
||||
// creates a new read-only block from it,
|
||||
// creates a new read-only block from it,
|
||||
// and appends it to the linked list.
|
||||
//
|
||||
// `.flush()` happens when, for instance,
|
||||
//
|
||||
// `.flush()` happens when, for instance,
|
||||
// a consumer reaches the last read-only operations.
|
||||
// It then ask the delete queue if there happen to
|
||||
// It then ask the delete queue if there happen to
|
||||
// be some unflushed operations.
|
||||
//
|
||||
fn flush(&self) -> Option<Arc<Block>> {
|
||||
let mut self_wlock = self
|
||||
.inner
|
||||
let mut self_wlock = self.inner
|
||||
.write()
|
||||
.expect("Failed to acquire write lock on delete queue writer");
|
||||
|
||||
|
||||
let delete_operations;
|
||||
{
|
||||
let writer: &mut Vec<DeleteOperation> = &mut self_wlock.writer;
|
||||
if writer.is_empty() {
|
||||
return None;
|
||||
}
|
||||
delete_operations = mem::replace(writer, vec!());
|
||||
delete_operations = mem::replace(writer, vec![]);
|
||||
}
|
||||
|
||||
let next_block = NextBlock::from(self.clone());
|
||||
{
|
||||
self_wlock.last_block = Some(
|
||||
Arc::new(Block {
|
||||
operations: Arc::new(delete_operations),
|
||||
next: next_block,
|
||||
})
|
||||
);
|
||||
self_wlock.last_block = Some(Arc::new(Block {
|
||||
operations: Arc::new(delete_operations),
|
||||
next: next_block,
|
||||
}));
|
||||
}
|
||||
self_wlock.last_block.clone()
|
||||
}
|
||||
@@ -137,7 +129,7 @@ impl From<DeleteQueue> for NextBlock {
|
||||
}
|
||||
}
|
||||
|
||||
impl NextBlock {
|
||||
impl NextBlock {
|
||||
fn next_block(&self) -> Option<Arc<Block>> {
|
||||
{
|
||||
let next_read_lock = self.0
|
||||
@@ -171,7 +163,7 @@ impl NextBlock {
|
||||
}
|
||||
}
|
||||
*next_write_lock.deref_mut() = InnerNextBlock::Closed(next_block.clone());
|
||||
return Some(next_block)
|
||||
return Some(next_block);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -189,10 +181,9 @@ pub struct DeleteCursor {
|
||||
}
|
||||
|
||||
|
||||
impl DeleteCursor {
|
||||
|
||||
impl DeleteCursor {
|
||||
/// Skips operations and position it so that
|
||||
/// - either all of the delete operation currently in the
|
||||
/// - either all of the delete operation currently in the
|
||||
/// queue are consume and the next get will return None.
|
||||
/// - the next get will return the first operation with an
|
||||
/// `opstamp >= target_opstamp`.
|
||||
@@ -203,18 +194,17 @@ impl DeleteCursor {
|
||||
if operation.opstamp >= target_opstamp {
|
||||
break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
self.advance();
|
||||
}
|
||||
}
|
||||
|
||||
/// If the current block has been entirely
|
||||
/// If the current block has been entirely
|
||||
/// consumed, try to load the next one.
|
||||
///
|
||||
/// Return `true`, if after this attempt,
|
||||
///
|
||||
/// Return `true`, if after this attempt,
|
||||
/// the cursor is on a block that has not
|
||||
/// been entirely consumed.
|
||||
/// Return `false`, if we have reached the end of the queue.
|
||||
@@ -229,24 +219,20 @@ impl DeleteCursor {
|
||||
self.pos = 0;
|
||||
true
|
||||
}
|
||||
None => {
|
||||
false
|
||||
}
|
||||
None => false,
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Advance to the next delete operation.
|
||||
/// Returns true iff there is such an operation.
|
||||
pub fn advance(&mut self) -> bool {
|
||||
if self.load_block_if_required() {
|
||||
self.pos += 1;
|
||||
true
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
@@ -256,12 +242,10 @@ impl DeleteCursor {
|
||||
pub fn get(&mut self) -> Option<&DeleteOperation> {
|
||||
if self.load_block_if_required() {
|
||||
Some(&self.block.operations[self.pos])
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -278,12 +262,12 @@ mod tests {
|
||||
#[test]
|
||||
fn test_deletequeue() {
|
||||
let delete_queue = DeleteQueue::new();
|
||||
|
||||
|
||||
let make_op = |i: usize| {
|
||||
let field = Field(1u32);
|
||||
DeleteOperation {
|
||||
opstamp: i as u64,
|
||||
term: Term::from_field_u64(field, i as u64)
|
||||
term: Term::from_field_u64(field, i as u64),
|
||||
}
|
||||
};
|
||||
|
||||
@@ -299,7 +283,7 @@ mod tests {
|
||||
operations_it.advance();
|
||||
assert!(operations_it.get().is_none());
|
||||
operations_it.advance();
|
||||
|
||||
|
||||
let mut snapshot2 = delete_queue.cursor();
|
||||
assert!(snapshot2.get().is_none());
|
||||
delete_queue.push(make_op(3));
|
||||
@@ -310,7 +294,7 @@ mod tests {
|
||||
assert!(operations_it.get().is_none());
|
||||
operations_it.advance();
|
||||
}
|
||||
{
|
||||
{
|
||||
let mut operations_it = snapshot.clone();
|
||||
assert_eq!(operations_it.get().unwrap().opstamp, 1);
|
||||
operations_it.advance();
|
||||
@@ -320,6 +304,6 @@ mod tests {
|
||||
operations_it.advance();
|
||||
assert!(operations_it.get().is_none());
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -26,4 +26,4 @@ impl Drop for DirectoryLock {
|
||||
error!("Failed to remove the lock file. {:?}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ use DocId;
|
||||
|
||||
// Doc to opstamp is used to identify which
|
||||
// document should be deleted.
|
||||
//
|
||||
//
|
||||
// Since the docset matching the query of a delete operation
|
||||
// is not computed right when the delete operation is received,
|
||||
// we need to find a way to evaluate, for each document,
|
||||
@@ -14,13 +14,13 @@ use DocId;
|
||||
//
|
||||
// The doc to opstamp mapping stores precisely an array
|
||||
// indexed by doc id and storing the opstamp of the document.
|
||||
//
|
||||
//
|
||||
// This mapping is (for the moment) stricly increasing
|
||||
// because of the way document id are allocated.
|
||||
#[derive(Clone)]
|
||||
pub enum DocToOpstampMapping {
|
||||
WithMap(Arc<Vec<u64>>),
|
||||
None
|
||||
None,
|
||||
}
|
||||
|
||||
impl From<Vec<u64>> for DocToOpstampMapping {
|
||||
@@ -30,9 +30,8 @@ impl From<Vec<u64>> for DocToOpstampMapping {
}

impl DocToOpstampMapping {

/// Given an opstamp return the limit doc id L
/// such that all doc id D such that
/// such that all doc id D such that
// D >= L iff opstamp(D) >= than `target_opstamp`.
//
// The edge case opstamp = some doc opstamp is in practise
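A standalone sketch of that limit computation, assuming the mapping is a strictly increasing `Vec<u64>` of opstamps indexed by doc id, as described above (the real `compute_doc_limit` may be organized differently). The result is consistent with the unit tests that follow in this diff:

// Sketch: number of documents whose opstamp is strictly smaller than the target;
// every doc id at or beyond this limit has opstamp >= target_opstamp.
fn compute_doc_limit_sketch(doc_opstamps: &[u64], target_opstamp: u64) -> u32 {
    doc_opstamps
        .iter()
        .take_while(|&&opstamp| opstamp < target_opstamp)
        .count() as u32
}

fn main() {
    let opstamps = vec![1u64, 12u64, 17u64, 23u64];
    assert_eq!(compute_doc_limit_sketch(&opstamps, 0), 0);
    assert_eq!(compute_doc_limit_sketch(&opstamps, 2), 1);
    assert_eq!(compute_doc_limit_sketch(&opstamps, 12), 1);
    assert_eq!(compute_doc_limit_sketch(&opstamps, 24), 4);
    assert_eq!(compute_doc_limit_sketch(&[], 2), 0);
}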
@@ -58,23 +57,24 @@ mod tests {
|
||||
#[test]
|
||||
fn test_doc_to_opstamp_mapping_none() {
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::None;
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(1), u32::max_value());
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(1),
|
||||
u32::max_value());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_doc_to_opstamp_mapping_complex() {
|
||||
{
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec!());
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec![]);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(2u64), 0);
|
||||
}
|
||||
{
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec!(1u64));
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec![1u64]);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(2u64), 1);
|
||||
}
|
||||
{
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec!(1u64, 12u64, 17u64, 23u64));
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec![1u64, 12u64, 17u64, 23u64]);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
|
||||
for i in 2u64..13u64 {
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 1);
|
||||
@@ -90,4 +90,4 @@ mod tests {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,7 +27,7 @@ use schema::Document;
|
||||
use schema::Schema;
|
||||
use schema::Term;
|
||||
use std::mem;
|
||||
use std::mem::swap;
|
||||
use std::mem::swap;
|
||||
use std::thread::JoinHandle;
|
||||
use super::directory_lock::DirectoryLock;
|
||||
use super::operation::AddOperation;
|
||||
@@ -54,11 +54,10 @@ type DocumentReceiver = chan::Receiver<AddOperation>;
|
||||
/// Each indexing thread builds its own independant `Segment`, via
|
||||
/// a `SegmentWriter` object.
|
||||
pub struct IndexWriter {
|
||||
|
||||
// the lock is just used to bind the
|
||||
// the lock is just used to bind the
|
||||
// lifetime of the lock with that of the IndexWriter.
|
||||
_directory_lock: DirectoryLock,
|
||||
|
||||
_directory_lock: DirectoryLock,
|
||||
|
||||
index: Index,
|
||||
|
||||
heap_size_in_bytes_per_thread: usize,
|
||||
@@ -102,36 +101,34 @@ impl !Sync for IndexWriter {}
|
||||
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
|
||||
/// # Panics
|
||||
/// If the heap size per thread is too small, panics.
|
||||
pub fn open_index_writer(
|
||||
index: &Index,
|
||||
num_threads: usize,
|
||||
heap_size_in_bytes_per_thread: usize) -> Result<IndexWriter> {
|
||||
pub fn open_index_writer(index: &Index,
|
||||
num_threads: usize,
|
||||
heap_size_in_bytes_per_thread: usize)
|
||||
-> Result<IndexWriter> {
|
||||
|
||||
if heap_size_in_bytes_per_thread <= HEAP_SIZE_LIMIT as usize {
|
||||
panic!(format!("The heap size per thread needs to be at least {}.",
|
||||
HEAP_SIZE_LIMIT));
|
||||
HEAP_SIZE_LIMIT));
|
||||
}
|
||||
|
||||
|
||||
let directory_lock = try!(DirectoryLock::lock(index.directory().box_clone()));
|
||||
|
||||
|
||||
let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) =
|
||||
chan::sync(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
|
||||
|
||||
let delete_queue = DeleteQueue::new();
|
||||
|
||||
|
||||
let current_opstamp = index.opstamp();
|
||||
|
||||
let stamper = Stamper::new(current_opstamp);
|
||||
|
||||
let segment_updater = SegmentUpdater::new(index.clone(),
|
||||
stamper.clone(),
|
||||
delete_queue.cursor())?;
|
||||
|
||||
let segment_updater =
|
||||
SegmentUpdater::new(index.clone(), stamper.clone(), delete_queue.cursor())?;
|
||||
|
||||
let mut index_writer = IndexWriter {
|
||||
|
||||
_directory_lock: directory_lock,
|
||||
|
||||
|
||||
heap_size_in_bytes_per_thread: heap_size_in_bytes_per_thread,
|
||||
index: index.clone(),
|
||||
|
||||
@@ -140,7 +137,7 @@ pub fn open_index_writer(
|
||||
|
||||
segment_updater: segment_updater,
|
||||
|
||||
workers_join_handle: vec!(),
|
||||
workers_join_handle: vec![],
|
||||
num_threads: num_threads,
|
||||
|
||||
delete_queue: delete_queue,
|
||||
@@ -158,28 +155,28 @@ pub fn open_index_writer(
|
||||
|
||||
|
||||
|
||||
pub fn compute_deleted_bitset(
|
||||
delete_bitset: &mut BitSet,
|
||||
segment_reader: &SegmentReader,
|
||||
delete_cursor: &mut DeleteCursor,
|
||||
doc_opstamps: DocToOpstampMapping,
|
||||
target_opstamp: u64) -> Result<bool> {
|
||||
|
||||
pub fn compute_deleted_bitset(delete_bitset: &mut BitSet,
|
||||
segment_reader: &SegmentReader,
|
||||
delete_cursor: &mut DeleteCursor,
|
||||
doc_opstamps: DocToOpstampMapping,
|
||||
target_opstamp: u64)
|
||||
-> Result<bool> {
|
||||
|
||||
let mut might_have_changed = false;
|
||||
|
||||
|
||||
loop {
|
||||
if let Some(delete_op) = delete_cursor.get() {
|
||||
if delete_op.opstamp > target_opstamp {
|
||||
break;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
// A delete operation should only affect
// document that were inserted after it.
//
//
// Limit doc helps identify the first document
// that may be affected by the delete operation.
let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp);
if let Some(mut docset) = segment_reader.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) {
if let Some(mut docset) =
segment_reader.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) {
while docset.advance() {
let deleted_doc = docset.doc();
if deleted_doc < limit_doc {
@@ -189,8 +186,7 @@ pub fn compute_deleted_bitset(
}
}
}
}
else {
} else {
break;
}
delete_cursor.advance();
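A tiny standalone sketch of that rule, using a plain `Vec<bool>` in place of the `BitSet` (hypothetical helper, for illustration only): only documents added before the delete operation, i.e. with doc id below the limit, may be marked as deleted.

fn apply_delete_sketch(deleted: &mut Vec<bool>, matching_docs: &[u32], limit_doc: u32) -> bool {
    let mut might_have_changed = false;
    for &doc in matching_docs {
        if doc < limit_doc {
            deleted[doc as usize] = true;
            might_have_changed = true;
        }
    }
    might_have_changed
}

fn main() {
    // docs 0..5, and the delete operation was stamped between doc 3 and doc 4
    let mut deleted = vec![false; 5];
    let changed = apply_delete_sketch(&mut deleted, &[1, 3, 4], 4);
    assert!(changed);
    assert_eq!(deleted, vec![false, true, false, true, false]);
}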
@@ -200,10 +196,10 @@ pub fn compute_deleted_bitset(
|
||||
|
||||
/// Advance delete for the given segment up
|
||||
/// to the target opstamp.
|
||||
pub fn advance_deletes(
|
||||
mut segment: Segment,
|
||||
segment_entry: &mut SegmentEntry,
|
||||
target_opstamp: u64) -> Result<Option<FileProtection>> {
|
||||
pub fn advance_deletes(mut segment: Segment,
|
||||
segment_entry: &mut SegmentEntry,
|
||||
target_opstamp: u64)
|
||||
-> Result<Option<FileProtection>> {
|
||||
|
||||
let mut file_protect: Option<FileProtection> = None;
|
||||
|
||||
@@ -216,24 +212,20 @@ pub fn advance_deletes(
|
||||
}
|
||||
let segment_reader = SegmentReader::open(segment.clone())?;
|
||||
let max_doc = segment_reader.max_doc();
|
||||
|
||||
let mut delete_bitset: BitSet =
|
||||
match segment_entry.delete_bitset() {
|
||||
Some(ref previous_delete_bitset) =>
|
||||
(*previous_delete_bitset).clone(),
|
||||
None =>
|
||||
BitSet::with_capacity(max_doc as usize)
|
||||
};
|
||||
|
||||
|
||||
let mut delete_bitset: BitSet = match segment_entry.delete_bitset() {
|
||||
Some(ref previous_delete_bitset) => (*previous_delete_bitset).clone(),
|
||||
None => BitSet::with_capacity(max_doc as usize),
|
||||
};
|
||||
|
||||
let delete_cursor = segment_entry.delete_cursor();
|
||||
|
||||
compute_deleted_bitset(
|
||||
&mut delete_bitset,
|
||||
&segment_reader,
|
||||
delete_cursor,
|
||||
DocToOpstampMapping::None,
|
||||
target_opstamp)?;
|
||||
|
||||
compute_deleted_bitset(&mut delete_bitset,
|
||||
&segment_reader,
|
||||
delete_cursor,
|
||||
DocToOpstampMapping::None,
|
||||
target_opstamp)?;
|
||||
|
||||
for doc in 0u32..max_doc {
|
||||
if segment_reader.is_deleted(doc) {
|
||||
delete_bitset.insert(doc as usize);
|
||||
@@ -257,7 +249,7 @@ fn index_documents(heap: &mut Heap,
|
||||
segment: Segment,
|
||||
schema: &Schema,
|
||||
generation: usize,
|
||||
document_iterator: &mut Iterator<Item=AddOperation>,
|
||||
document_iterator: &mut Iterator<Item = AddOperation>,
|
||||
segment_updater: &mut SegmentUpdater,
|
||||
mut delete_cursor: DeleteCursor)
|
||||
-> Result<bool> {
|
||||
@@ -273,10 +265,10 @@ fn index_documents(heap: &mut Heap,
|
||||
}
|
||||
}
|
||||
let num_docs = segment_writer.max_doc();
|
||||
|
||||
|
||||
// this is ensured by the call to peek before starting
|
||||
// the worker thread.
|
||||
assert!(num_docs > 0);
|
||||
assert!(num_docs > 0);
|
||||
|
||||
let doc_opstamps: Vec<u64> = segment_writer.finalize()?;
|
||||
|
||||
@@ -284,58 +276,54 @@ fn index_documents(heap: &mut Heap,
|
||||
segment_meta.set_max_doc(num_docs);
|
||||
|
||||
let last_docstamp: u64 = *(doc_opstamps.last().unwrap());
|
||||
|
||||
|
||||
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
|
||||
let segment_reader = SegmentReader::open(segment)?;
|
||||
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
|
||||
let may_have_deletes = compute_deleted_bitset(
|
||||
&mut deleted_bitset,
|
||||
&segment_reader,
|
||||
&mut delete_cursor,
|
||||
doc_to_opstamps,
|
||||
last_docstamp,
|
||||
)?;
|
||||
let may_have_deletes = compute_deleted_bitset(&mut deleted_bitset,
|
||||
&segment_reader,
|
||||
&mut delete_cursor,
|
||||
doc_to_opstamps,
|
||||
last_docstamp)?;
|
||||
|
||||
let segment_entry = SegmentEntry::new(
|
||||
segment_meta,
|
||||
delete_cursor,
|
||||
{ if may_have_deletes { Some(deleted_bitset) }
|
||||
else { None } }
|
||||
);
|
||||
|
||||
Ok(
|
||||
segment_updater
|
||||
.add_segment(generation, segment_entry)
|
||||
)
|
||||
|
||||
}
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, {
|
||||
if may_have_deletes {
|
||||
Some(deleted_bitset)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
});
|
||||
|
||||
Ok(segment_updater.add_segment(generation, segment_entry))
|
||||
|
||||
}
|
||||
|
||||
|
||||
impl IndexWriter {
|
||||
/// The index writer
|
||||
pub fn wait_merging_threads(mut self) -> Result<()> {
|
||||
|
||||
|
||||
// this will stop the indexing thread,
|
||||
// dropping the last reference to the segment_updater.
|
||||
drop(self.document_sender);
|
||||
|
||||
|
||||
let former_workers_handles = mem::replace(&mut self.workers_join_handle, vec!());
|
||||
|
||||
|
||||
let former_workers_handles = mem::replace(&mut self.workers_join_handle, vec![]);
|
||||
for join_handle in former_workers_handles {
|
||||
try!(join_handle.join()
|
||||
.expect("Indexing Worker thread panicked")
|
||||
.map_err(|e| {
|
||||
Error::ErrorInThread(format!("Error in indexing worker thread. {:?}", e))
|
||||
}));
|
||||
try!(join_handle
|
||||
.join()
|
||||
.expect("Indexing Worker thread panicked")
|
||||
.map_err(|e| {
|
||||
Error::ErrorInThread(format!("Error in indexing worker thread. {:?}", e))
|
||||
}));
|
||||
}
|
||||
drop(self.workers_join_handle);
|
||||
|
||||
let result = self.segment_updater
|
||||
.wait_merging_thread()
|
||||
.map_err(|_|
|
||||
Error::ErrorInThread("Failed to join merging thread.".to_string())
|
||||
);
|
||||
|
||||
let result =
|
||||
self.segment_updater
|
||||
.wait_merging_thread()
|
||||
.map_err(|_| Error::ErrorInThread("Failed to join merging thread.".to_string()));
|
||||
|
||||
if let &Err(ref e) = &result {
|
||||
error!("Some merging thread failed {:?}", e);
|
||||
}
|
||||
@@ -351,37 +339,34 @@ impl IndexWriter {
|
||||
let document_receiver_clone = self.document_receiver.clone();
|
||||
let mut segment_updater = self.segment_updater.clone();
|
||||
let mut heap = Heap::with_capacity(self.heap_size_in_bytes_per_thread);
|
||||
|
||||
|
||||
let generation = self.generation;
|
||||
|
||||
|
||||
let mut delete_cursor = self.delete_queue.cursor();
|
||||
|
||||
let join_handle: JoinHandle<Result<()>> =
|
||||
thread::Builder::new()
|
||||
let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
|
||||
.name(format!("indexing thread {} for gen {}", self.worker_id, generation))
|
||||
.spawn(move || {
|
||||
|
||||
|
||||
loop {
|
||||
|
||||
let mut document_iterator = document_receiver_clone.clone()
|
||||
.into_iter()
|
||||
.peekable();
|
||||
|
||||
let mut document_iterator =
|
||||
document_receiver_clone.clone().into_iter().peekable();
|
||||
|
||||
// the peeking here is to avoid
|
||||
// creating a new segment's files
|
||||
// if no document are available.
|
||||
//
|
||||
// this is a valid guarantee as the
|
||||
// this is a valid guarantee as the
|
||||
// peeked document now belongs to
|
||||
// our local iterator.
|
||||
if let Some(operation) = document_iterator.peek() {
|
||||
delete_cursor.skip_to(operation.opstamp);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
// No more documents.
|
||||
// Happens when there is a commit, or if the `IndexWriter`
|
||||
// was dropped.
|
||||
return Ok(())
|
||||
return Ok(());
|
||||
}
|
||||
let segment = segment_updater.new_segment();
|
||||
index_documents(&mut heap,
|
||||
@@ -391,7 +376,7 @@ impl IndexWriter {
|
||||
&mut document_iterator,
|
||||
&mut segment_updater,
|
||||
delete_cursor.clone())?;
|
||||
|
||||
|
||||
}
|
||||
})?;
|
||||
self.worker_id += 1;
|
||||
@@ -408,7 +393,7 @@ impl IndexWriter {
|
||||
pub fn set_merge_policy(&self, merge_policy: Box<MergePolicy>) {
|
||||
self.segment_updater.set_merge_policy(merge_policy);
|
||||
}
|
||||
|
||||
|
||||
fn start_workers(&mut self) -> Result<()> {
|
||||
for _ in 0..self.num_threads {
|
||||
try!(self.add_indexing_worker());
|
||||
@@ -423,7 +408,9 @@ impl IndexWriter {
|
||||
}
|
||||
|
||||
/// Merges a given list of segments
|
||||
pub fn merge(&mut self, segment_ids: &[SegmentId]) -> impl Future<Item=SegmentMeta, Error=Canceled> {
|
||||
pub fn merge(&mut self,
|
||||
segment_ids: &[SegmentId])
|
||||
-> impl Future<Item = SegmentMeta, Error = Canceled> {
|
||||
self.segment_updater.start_merge(segment_ids)
|
||||
}
|
||||
|
||||
@@ -436,7 +423,8 @@ impl IndexWriter {
|
||||
///
|
||||
/// Returns the former segment_ready channel.
|
||||
fn recreate_document_channel(&mut self) -> DocumentReceiver {
|
||||
let (mut document_sender, mut document_receiver): (DocumentSender, DocumentReceiver) =
|
||||
let (mut document_sender, mut document_receiver): (DocumentSender,
|
||||
DocumentReceiver) =
|
||||
chan::sync(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
swap(&mut self.document_sender, &mut document_sender);
|
||||
swap(&mut self.document_receiver, &mut document_receiver);
|
||||
@@ -464,12 +452,9 @@ impl IndexWriter {
|
||||
let heap_size_in_bytes_per_thread = self.heap_size_in_bytes_per_thread;
|
||||
drop(self);
|
||||
for _ in receiver_clone {}
|
||||
|
||||
let index_writer = open_index_writer(
|
||||
&index,
|
||||
num_threads,
|
||||
heap_size_in_bytes_per_thread)?;
|
||||
|
||||
|
||||
let index_writer = open_index_writer(&index, num_threads, heap_size_in_bytes_per_thread)?;
|
||||
|
||||
Ok(index_writer)
|
||||
|
||||
}
|
||||
@@ -511,23 +496,24 @@ impl IndexWriter {
|
||||
let mut former_workers_join_handle = Vec::new();
|
||||
swap(&mut former_workers_join_handle,
|
||||
&mut self.workers_join_handle);
|
||||
|
||||
|
||||
for worker_handle in former_workers_join_handle {
|
||||
let indexing_worker_result = try!(worker_handle.join()
|
||||
.map_err(|e| Error::ErrorInThread(format!("{:?}", e))));
|
||||
let indexing_worker_result =
|
||||
try!(worker_handle
|
||||
.join()
|
||||
.map_err(|e| Error::ErrorInThread(format!("{:?}", e))));
|
||||
try!(indexing_worker_result);
|
||||
// add a new worker for the next generation.
|
||||
try!(self.add_indexing_worker());
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// wait for the segment update thread to have processed the info
|
||||
self.segment_updater
|
||||
.commit(self.committed_opstamp)?;
|
||||
|
||||
self.segment_updater.commit(self.committed_opstamp)?;
|
||||
|
||||
Ok(self.committed_opstamp)
|
||||
}
|
||||
}
|
||||
|
||||
/// Delete all documents containing a given term.
|
||||
///
|
||||
@@ -535,7 +521,7 @@ impl IndexWriter {
|
||||
/// were added in previous commits, and documents
|
||||
/// that were added previously in the same commit.
|
||||
///
|
||||
/// Like adds, the deletion itself will be visible
|
||||
/// Like adds, the deletion itself will be visible
|
||||
/// only after calling `commit()`.
|
||||
pub fn delete_term(&mut self, term: Term) -> u64 {
|
||||
let opstamp = self.stamper.stamp();
|
||||
@@ -548,7 +534,7 @@ impl IndexWriter {
|
||||
}
|
||||
|
||||
/// Returns the opstamp of the last successful commit.
|
||||
///
|
||||
///
|
||||
/// This is, for instance, the opstamp the index will
|
||||
/// rollback to if there is a failure like a power surge.
|
||||
///
|
||||
@@ -602,16 +588,18 @@ mod tests {
|
||||
_ => panic!("Expected FileAlreadyExists error"),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_set_merge_policy() {
|
||||
let schema_builder = schema::SchemaBuilder::default();
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let index_writer = index.writer(40_000_000).unwrap();
|
||||
assert_eq!(format!("{:?}", index_writer.get_merge_policy()), "LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, level_log_size: 0.75 }");
|
||||
assert_eq!(format!("{:?}", index_writer.get_merge_policy()),
|
||||
"LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, level_log_size: 0.75 }");
|
||||
let merge_policy = box NoMergePolicy::default();
|
||||
index_writer.set_merge_policy(merge_policy);
|
||||
assert_eq!(format!("{:?}", index_writer.get_merge_policy()), "NoMergePolicy");
|
||||
assert_eq!(format!("{:?}", index_writer.get_merge_policy()),
|
||||
"NoMergePolicy");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -620,12 +608,12 @@ mod tests {
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
{
|
||||
let _index_writer = index.writer(40_000_000).unwrap();
|
||||
// the lock should be released when the
|
||||
// the lock should be released when the
|
||||
// index_writer leaves the scope.
|
||||
}
|
||||
let _index_writer_two = index.writer(40_000_000).unwrap();
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_commit_and_rollback() {
|
||||
let mut schema_builder = schema::SchemaBuilder::default();
|
||||
@@ -648,7 +636,7 @@ mod tests {
|
||||
}
|
||||
|
||||
index_writer = index_writer.rollback().unwrap();
|
||||
|
||||
|
||||
assert_eq!(index_writer.commit_opstamp(), 0u64);
|
||||
assert_eq!(num_docs_containing("a"), 0);
|
||||
|
||||
@@ -701,12 +689,14 @@ mod tests {
|
||||
}
|
||||
// this should create 8 segments and trigger a merge.
|
||||
index_writer.commit().expect("commit failed");
|
||||
index_writer.wait_merging_threads().expect("waiting merging thread failed");
|
||||
index_writer
|
||||
.wait_merging_threads()
|
||||
.expect("waiting merging thread failed");
|
||||
index.load_searchers().unwrap();
|
||||
|
||||
|
||||
assert_eq!(num_docs_containing("a"), 200);
|
||||
assert!(index.searchable_segments().unwrap().len() < 8);
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000;
const DEFAULT_MIN_MERGE_SIZE: usize = 8;


/// LogMergePolicy tries tries to merge segments that have a similar number of
/// LogMergePolicy tries tries to merge segments that have a similar number of
/// documents.
#[derive(Debug, Clone)]
pub struct LogMergePolicy {
@@ -24,7 +24,7 @@ impl LogMergePolicy {
|
||||
}
|
||||
|
||||
/// Set the minimum number of segment that may be merge together.
|
||||
pub fn set_min_merge_size(&mut self, min_merge_size: usize) {
|
||||
pub fn set_min_merge_size(&mut self, min_merge_size: usize) {
|
||||
self.min_merge_size = min_merge_size;
|
||||
}
|
||||
|
||||
@@ -52,14 +52,16 @@ impl MergePolicy for LogMergePolicy {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
let mut size_sorted_tuples = segments.iter()
|
||||
let mut size_sorted_tuples = segments
|
||||
.iter()
|
||||
.map(|x| x.num_docs())
|
||||
.enumerate()
|
||||
.collect::<Vec<(usize, u32)>>();
|
||||
|
||||
size_sorted_tuples.sort_by(|x, y| y.cmp(x));
|
||||
|
||||
let size_sorted_log_tuples: Vec<_> = size_sorted_tuples.into_iter()
|
||||
let size_sorted_log_tuples: Vec<_> = size_sorted_tuples
|
||||
.into_iter()
|
||||
.map(|(ind, num_docs)| (ind, (self.clip_min_size(num_docs) as f64).log2()))
|
||||
.collect();
|
||||
|
||||
@@ -77,14 +79,10 @@ impl MergePolicy for LogMergePolicy {
levels
.iter()
.filter(|level| level.len() >= self.min_merge_size)
.map(|ind_vec| {
MergeCandidate(ind_vec.iter()
.map(|&ind| segments[ind].id())
.collect())
})
.map(|ind_vec| MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect()))
.collect()
}


fn box_clone(&self) -> Box<MergePolicy> {
box self.clone()
}
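To illustrate the level bucketing that feeds the filter above, here is a rough standalone sketch with assumed parameters; it is not the exact `LogMergePolicy` code (which also clips very small segments to `min_layer_size`), only the general shape: segments whose log2 size is close to the head of the current level share a level, and any level with at least `min_merge_size` segments becomes a merge candidate.

// Sketch: works directly on document counts instead of SegmentMeta.
fn merge_candidates_sketch(
    mut num_docs: Vec<u32>,
    min_merge_size: usize,
    level_log_size: f64,
) -> Vec<Vec<u32>> {
    num_docs.sort_by(|a, b| b.cmp(a)); // biggest segments first
    let mut levels: Vec<Vec<u32>> = Vec::new();
    let mut current_level_head_log = std::f64::NEG_INFINITY;
    for docs in num_docs {
        let log_size = (std::cmp::max(docs, 1) as f64).log2();
        if levels.is_empty() || current_level_head_log - log_size > level_log_size {
            // too far below the head of the current level: open a new level
            levels.push(Vec::new());
            current_level_head_log = log_size;
        }
        levels.last_mut().unwrap().push(docs);
    }
    levels
        .into_iter()
        .filter(|level| level.len() >= min_merge_size)
        .collect()
}

fn main() {
    let candidates = merge_candidates_sketch(vec![100_000, 10, 10, 10, 12], 3, 0.75);
    // the four small segments land in one level and get proposed for a merge
    assert_eq!(candidates.len(), 1);
    assert_eq!(candidates[0].len(), 4);
}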
@@ -128,9 +126,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_log_merge_policy_pair() {
|
||||
let test_input = vec![seg_meta(10),
|
||||
seg_meta(10),
|
||||
seg_meta(10)];
|
||||
let test_input = vec![seg_meta(10), seg_meta(10), seg_meta(10)];
|
||||
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
|
||||
assert_eq!(result_list.len(), 1);
|
||||
}
|
||||
|
||||
@@ -4,26 +4,26 @@ use std::marker;
|
||||
use std::fmt::Debug;
|
||||
|
||||
|
||||
/// Set of segment suggested for a merge.
|
||||
/// Set of segment suggested for a merge.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct MergeCandidate(pub Vec<SegmentId>);
|
||||
|
||||
|
||||
/// The Merge policy defines which segments should be merged.
|
||||
///
|
||||
/// The Merge policy defines which segments should be merged.
|
||||
///
|
||||
/// Every time a the list of segments changes, the segment updater
|
||||
/// asks the merge policy if some segments should be merged.
|
||||
pub trait MergePolicy: marker::Send + marker::Sync + Debug {
|
||||
/// Given the list of segment metas, returns the list of merge candidates.
|
||||
/// Given the list of segment metas, returns the list of merge candidates.
|
||||
///
|
||||
/// This call happens on the segment updater thread, and will block
|
||||
/// other segment updates, so all implementations should happen rapidly.
|
||||
/// This call happens on the segment updater thread, and will block
|
||||
/// other segment updates, so all implementations should happen rapidly.
|
||||
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate>;
|
||||
/// Returns a boxed clone of the MergePolicy.
|
||||
fn box_clone(&self) -> Box<MergePolicy>;
|
||||
}
|
||||
|
||||
/// Never merge segments.
|
||||
/// Never merge segments.
|
||||
#[derive(Debug)]
|
||||
pub struct NoMergePolicy;
|
||||
|
||||
@@ -37,7 +37,7 @@ impl MergePolicy for NoMergePolicy {
|
||||
fn compute_merge_candidates(&self, _segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
|
||||
Vec::new()
|
||||
}
|
||||
|
||||
|
||||
fn box_clone(&self) -> Box<MergePolicy> {
|
||||
box NoMergePolicy
|
||||
}
|
||||
@@ -66,15 +66,14 @@ pub mod tests {
|
||||
.map(|segment_meta| segment_meta.id())
|
||||
.collect::<Vec<SegmentId>>();
|
||||
if segment_ids.len() > 1 {
|
||||
vec!(MergeCandidate(segment_ids))
|
||||
}
|
||||
else {
|
||||
vec!()
|
||||
vec![MergeCandidate(segment_ids)]
|
||||
} else {
|
||||
vec![]
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn box_clone(&self) -> Box<MergePolicy> {
|
||||
box MergeWheneverPossible
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,9 +31,7 @@ struct DeltaPositionComputer {

impl DeltaPositionComputer {
fn new() -> DeltaPositionComputer {
DeltaPositionComputer {
buffer: vec![0u32; 512]
}
DeltaPositionComputer { buffer: vec![0u32; 512] }
}

fn compute_delta_positions(&mut self, positions: &[u32]) -> &[u32] {
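A minimal sketch of what delta position encoding computes (the real `compute_delta_positions` reuses the preallocated 512-entry buffer above; this version allocates a fresh Vec for clarity): each position is written as the gap from the previous one, which keeps the values small and bit-packable.

fn compute_delta_positions_sketch(positions: &[u32]) -> Vec<u32> {
    let mut previous = 0u32;
    positions
        .iter()
        .map(|&position| {
            let delta = position - previous; // gap from the previous position
            previous = position;
            delta
        })
        .collect()
}

fn main() {
    assert_eq!(compute_delta_positions_sketch(&[3, 7, 8, 20]),
               vec![3, 4, 1, 12]);
}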
@@ -50,16 +48,17 @@ impl DeltaPositionComputer {
|
||||
}
|
||||
|
||||
|
||||
fn compute_min_max_val(u64_reader: &U64FastFieldReader, max_doc: DocId, delete_bitset: &DeleteBitSet) -> Option<(u64, u64)> {
|
||||
fn compute_min_max_val(u64_reader: &U64FastFieldReader,
|
||||
max_doc: DocId,
|
||||
delete_bitset: &DeleteBitSet)
|
||||
-> Option<(u64, u64)> {
|
||||
if max_doc == 0 {
|
||||
None
|
||||
}
|
||||
else if !delete_bitset.has_deletes() {
|
||||
// no deleted documents,
|
||||
} else if !delete_bitset.has_deletes() {
|
||||
// no deleted documents,
|
||||
// we can use the previous min_val, max_val.
|
||||
Some((u64_reader.min_value(), u64_reader.max_value()))
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
// some deleted documents,
|
||||
// we need to recompute the max / min
|
||||
(0..max_doc)
|
||||
@@ -70,19 +69,21 @@ fn compute_min_max_val(u64_reader: &U64FastFieldReader, max_doc: DocId, delete_b
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_fieldnorm_reader(segment_reader: &SegmentReader, field: Field) -> Option<U64FastFieldReader> {
|
||||
fn extract_fieldnorm_reader(segment_reader: &SegmentReader,
|
||||
field: Field)
|
||||
-> Option<U64FastFieldReader> {
|
||||
segment_reader.get_fieldnorms_reader(field)
|
||||
}
|
||||
|
||||
fn extract_fast_field_reader(segment_reader: &SegmentReader, field: Field) -> Option<U64FastFieldReader> {
|
||||
segment_reader
|
||||
.fast_fields_reader()
|
||||
.open_reader(field)
|
||||
fn extract_fast_field_reader(segment_reader: &SegmentReader,
|
||||
field: Field)
|
||||
-> Option<U64FastFieldReader> {
|
||||
segment_reader.fast_fields_reader().open_reader(field)
|
||||
}
|
||||
|
||||
impl IndexMerger {
|
||||
pub fn open(schema: Schema, segments: &[Segment]) -> Result<IndexMerger> {
|
||||
let mut readers = vec!();
|
||||
let mut readers = vec![];
|
||||
let mut max_doc: u32 = 0u32;
|
||||
for segment in segments {
|
||||
if segment.meta().num_docs() > 0 {
|
||||
@@ -92,65 +93,75 @@ impl IndexMerger {
|
||||
}
|
||||
}
|
||||
Ok(IndexMerger {
|
||||
schema: schema,
|
||||
readers: readers,
|
||||
max_doc: max_doc,
|
||||
})
|
||||
schema: schema,
|
||||
readers: readers,
|
||||
max_doc: max_doc,
|
||||
})
|
||||
}
|
||||
|
||||
fn write_fieldnorms(&self,
|
||||
fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
|
||||
fn write_fieldnorms(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
|
||||
let fieldnorm_fastfields: Vec<Field> = self.schema
|
||||
.fields()
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|&(_, field_entry)| field_entry.is_indexed())
|
||||
.map(|(field_id, _)| Field(field_id as u32))
|
||||
.collect();
|
||||
self.generic_write_fast_field(fieldnorm_fastfields, &extract_fieldnorm_reader, fast_field_serializer)
|
||||
.fields()
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|&(_, field_entry)| field_entry.is_indexed())
|
||||
.map(|(field_id, _)| Field(field_id as u32))
|
||||
.collect();
|
||||
self.generic_write_fast_field(fieldnorm_fastfields,
|
||||
&extract_fieldnorm_reader,
|
||||
fast_field_serializer)
|
||||
}
|
||||
|
||||
fn write_fast_fields(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
|
||||
let fast_fields: Vec<Field> = self.schema
|
||||
.fields()
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|&(_, field_entry)| field_entry.is_int_fast())
|
||||
.map(|(field_id, _)| Field(field_id as u32))
|
||||
.collect();
|
||||
self.generic_write_fast_field(fast_fields, &extract_fast_field_reader, fast_field_serializer)
|
||||
.fields()
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|&(_, field_entry)| field_entry.is_int_fast())
|
||||
.map(|(field_id, _)| Field(field_id as u32))
|
||||
.collect();
|
||||
self.generic_write_fast_field(fast_fields,
|
||||
&extract_fast_field_reader,
|
||||
fast_field_serializer)
|
||||
}
|
||||
|
||||
|
||||
// used both to merge field norms and regular u64 fast fields.
|
||||
fn generic_write_fast_field(&self,
|
||||
fields: Vec<Field>,
|
||||
field_reader_extractor: &Fn(&SegmentReader, Field) -> Option<U64FastFieldReader>,
|
||||
fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
|
||||
|
||||
fields: Vec<Field>,
|
||||
field_reader_extractor: &Fn(&SegmentReader, Field)
|
||||
-> Option<U64FastFieldReader>,
|
||||
fast_field_serializer: &mut FastFieldSerializer)
|
||||
-> Result<()> {
|
||||
|
||||
for field in fields {
|
||||
|
||||
let mut u64_readers = vec!();
|
||||
|
||||
let mut u64_readers = vec![];
|
||||
let mut min_val = u64::max_value();
|
||||
let mut max_val = u64::min_value();
|
||||
|
||||
|
||||
for reader in &self.readers {
|
||||
match field_reader_extractor(reader, field) {
|
||||
Some(u64_reader) => {
|
||||
if let Some((seg_min_val, seg_max_val)) = compute_min_max_val(&u64_reader, reader.max_doc(), reader.delete_bitset()) {
|
||||
if let Some((seg_min_val, seg_max_val)) =
|
||||
compute_min_max_val(&u64_reader,
|
||||
reader.max_doc(),
|
||||
reader.delete_bitset()) {
|
||||
// the segment has some non-deleted documents
|
||||
min_val = min(min_val, seg_min_val);
|
||||
max_val = max(max_val, seg_max_val);
|
||||
u64_readers.push((reader.max_doc(), u64_reader, reader.delete_bitset()));
|
||||
}
|
||||
u64_readers
|
||||
.push((reader.max_doc(), u64_reader, reader.delete_bitset()));
|
||||
}
|
||||
}
|
||||
None => {
|
||||
let error_msg = format!("Failed to find a u64_reader for field {:?}", field);
|
||||
let error_msg = format!("Failed to find a u64_reader for field {:?}",
|
||||
field);
|
||||
error!("{}", error_msg);
|
||||
return Err(Error::SchemaError(error_msg))
|
||||
return Err(Error::SchemaError(error_msg));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
if u64_readers.is_empty() {
|
||||
@@ -160,7 +171,7 @@ impl IndexMerger {
|
||||
}
|
||||
|
||||
assert!(min_val <= max_val);
|
||||
|
||||
|
||||
try!(fast_field_serializer.new_u64_fast_field(field, min_val, max_val));
|
||||
for (max_doc, u64_reader, delete_bitset) in u64_readers {
|
||||
for doc_id in 0..max_doc {
|
||||
@@ -176,32 +187,29 @@ impl IndexMerger {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_postings(
|
||||
&self,
|
||||
postings_serializer: &mut PostingsSerializer) -> Result<()> {
|
||||
|
||||
fn write_postings(&self, postings_serializer: &mut PostingsSerializer) -> Result<()> {
|
||||
|
||||
let mut merged_terms = TermIterator::from(&self.readers[..]);
|
||||
let mut delta_position_computer = DeltaPositionComputer::new();
|
||||
|
||||
|
||||
let mut max_doc = 0;
|
||||
|
||||
// map from segment doc ids to the resulting merged segment doc id.
|
||||
let mut merged_doc_id_map: Vec<Vec<Option<DocId>>> = Vec::with_capacity(self.readers.len());
|
||||
|
||||
|
||||
for reader in &self.readers {
|
||||
let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize);
|
||||
for doc_id in 0..reader.max_doc() {
|
||||
if reader.is_deleted(doc_id) {
|
||||
segment_local_map.push(None);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
segment_local_map.push(Some(max_doc));
|
||||
max_doc += 1u32;
|
||||
}
|
||||
}
|
||||
merged_doc_id_map.push(segment_local_map);
|
||||
}
|
||||
|
||||
|
||||
while merged_terms.advance() {
|
||||
// Create the total list of doc ids
|
||||
// by stacking the doc ids from the different segment.
|
||||
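A standalone sketch of the doc id remapping built just above, assuming a plain `Vec<bool>` per segment stands in for the delete bitsets (hypothetical helper): deleted documents map to None, surviving documents receive consecutive doc ids in the merged segment.

fn build_doc_id_mapping_sketch(segment_deletes: &[Vec<bool>]) -> Vec<Vec<Option<u32>>> {
    let mut next_doc_id = 0u32;
    let mut mapping = Vec::with_capacity(segment_deletes.len());
    for deletes in segment_deletes {
        let mut segment_local_map = Vec::with_capacity(deletes.len());
        for &is_deleted in deletes {
            if is_deleted {
                segment_local_map.push(None);
            } else {
                segment_local_map.push(Some(next_doc_id));
                next_doc_id += 1;
            }
        }
        mapping.push(segment_local_map);
    }
    mapping
}

fn main() {
    // two segments: doc 1 of the first and doc 0 of the second are deleted
    let mapping = build_doc_id_mapping_sketch(&[vec![false, true, false], vec![true, false]]);
    assert_eq!(mapping[0], vec![Some(0), None, Some(1)]);
    assert_eq!(mapping[1], vec![None, Some(2)]);
}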
@@ -215,27 +223,28 @@ impl IndexMerger {
|
||||
let term = merged_terms.term();
|
||||
let mut term_written = false;
|
||||
let segment_postings = merged_terms
|
||||
.segment_ords()
|
||||
.iter()
|
||||
.cloned()
|
||||
.flat_map(|segment_ord| {
|
||||
self.readers[segment_ord]
|
||||
.read_postings_all_info(&term)
|
||||
.map(|segment_postings| (segment_ord, segment_postings))
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
.segment_ords()
|
||||
.iter()
|
||||
.cloned()
|
||||
.flat_map(|segment_ord| {
|
||||
self.readers[segment_ord]
|
||||
.read_postings_all_info(&term)
|
||||
.map(|segment_postings| (segment_ord, segment_postings))
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// We can remove the term if all documents which
|
||||
// contained it have been deleted.
|
||||
if segment_postings.len() > 0 {
|
||||
|
||||
|
||||
// We can now serialize this postings, by pushing each document to the
|
||||
// postings serializer.
|
||||
|
||||
// postings serializer.
|
||||
|
||||
for (segment_ord, mut segment_postings) in segment_postings {
|
||||
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
|
||||
while segment_postings.advance() {
|
||||
if let Some(remapped_doc_id) = old_to_new_doc_id[segment_postings.doc() as usize] {
|
||||
if let Some(remapped_doc_id) =
|
||||
old_to_new_doc_id[segment_postings.doc() as usize] {
|
||||
if !term_written {
|
||||
// we make sure to only write the term iff
|
||||
// there is at least one document.
|
||||
@@ -243,11 +252,11 @@ impl IndexMerger {
|
||||
term_written = true;
|
||||
}
|
||||
let delta_positions: &[u32] =
|
||||
delta_position_computer.compute_delta_positions(segment_postings.positions());
|
||||
try!(postings_serializer.write_doc(
|
||||
remapped_doc_id,
|
||||
segment_postings.term_freq(),
|
||||
delta_positions));
|
||||
delta_position_computer
|
||||
.compute_delta_positions(segment_postings.positions());
|
||||
try!(postings_serializer.write_doc(remapped_doc_id,
|
||||
segment_postings.term_freq(),
|
||||
delta_positions));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -256,7 +265,7 @@ impl IndexMerger {
|
||||
try!(postings_serializer.close_term());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -267,12 +276,10 @@ impl IndexMerger {
|
||||
for doc_id in 0..reader.max_doc() {
|
||||
if !reader.is_deleted(doc_id) {
|
||||
let doc = try!(store_reader.get(doc_id));
|
||||
let field_values: Vec<&FieldValue> = doc.field_values()
|
||||
.iter()
|
||||
.collect();
|
||||
let field_values: Vec<&FieldValue> = doc.field_values().iter().collect();
|
||||
try!(store_writer.store(&field_values));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -311,8 +318,8 @@ mod tests {
|
||||
fn test_index_merger_no_deletes() {
|
||||
let mut schema_builder = schema::SchemaBuilder::default();
|
||||
let text_fieldtype = schema::TextOptions::default()
|
||||
.set_indexing_options(TextIndexingOptions::TokenizedWithFreq)
|
||||
.set_stored();
|
||||
.set_indexing_options(TextIndexingOptions::TokenizedWithFreq)
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
let score_fieldtype = schema::IntOptions::default().set_fast();
|
||||
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
|
||||
@@ -361,11 +368,14 @@ mod tests {
|
||||
}
|
||||
}
|
||||
{
|
||||
let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed.");
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
index_writer.merge(&segment_ids)
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
index_writer
|
||||
.merge(&segment_ids)
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
index_writer.wait_merging_threads().unwrap();
|
||||
}
|
||||
{
|
||||
@@ -379,13 +389,13 @@ mod tests {
|
||||
};
|
||||
{
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
|
||||
vec!(1, 2, 4,));
|
||||
vec![1, 2, 4]);
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
|
||||
vec!(0, 3,));
|
||||
vec![0, 3]);
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "g")]),
|
||||
vec!(4,));
|
||||
vec![4]);
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
|
||||
vec!(0, 1, 2, 3, 4,));
|
||||
vec![0, 1, 2, 3, 4]);
|
||||
}
|
||||
{
|
||||
let doc = searcher.doc(&DocAddress(0, 0)).unwrap();
|
||||
@@ -415,12 +425,12 @@ mod tests {
|
||||
collector.vals()
|
||||
};
|
||||
assert_eq!(get_fast_vals(vec![Term::from_field_text(text_field, "a")]),
|
||||
vec!(5, 7, 13,));
|
||||
vec![5, 7, 13]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn search_term(searcher: &Searcher, term: Term) -> Vec<u64> {
|
||||
fn search_term(searcher: &Searcher, term: Term) -> Vec<u64> {
|
||||
let mut collector = FastFieldTestCollector::for_field(Field(1));
|
||||
let term_query = TermQuery::new(term, SegmentPostingsOption::NoFreq);
|
||||
searcher.search(&term_query, &mut collector).unwrap();
|
||||
@@ -430,8 +440,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_index_merger_with_deletes() {
|
||||
let mut schema_builder = schema::SchemaBuilder::default();
|
||||
let text_fieldtype = schema::TextOptions
|
||||
::default()
|
||||
let text_fieldtype = schema::TextOptions::default()
|
||||
.set_indexing_options(TextIndexingOptions::TokenizedWithFreq)
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
@@ -441,21 +450,19 @@ mod tests {
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
|
||||
let empty_vec = Vec::<u64>::new();
|
||||
|
||||
{ // a first commit
|
||||
index_writer.add_document(
|
||||
doc!(
|
||||
|
||||
{
|
||||
// a first commit
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "a b d",
|
||||
score_field => 1u64
|
||||
));
|
||||
index_writer.add_document(
|
||||
doc!(
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "b c",
|
||||
score_field => 2u64
|
||||
));
|
||||
index_writer.delete_term(Term::from_field_text(text_field, "c"));
|
||||
index_writer.add_document(
|
||||
doc!(
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "c d",
|
||||
score_field => 3u64
|
||||
));
|
||||
@@ -465,31 +472,32 @@ mod tests {
|
||||
assert_eq!(searcher.num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), vec!(1));
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), vec!(1));
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!(3));
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!(1, 3));
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
vec![1]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
vec![1]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
vec![3]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
vec![1, 3]);
|
||||
}
|
||||
{ // a second commit
|
||||
index_writer.add_document(
|
||||
doc!(
|
||||
{
|
||||
// a second commit
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "a d e",
|
||||
score_field => 4_000u64
|
||||
));
|
||||
index_writer.add_document(
|
||||
doc!(
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "e f",
|
||||
score_field => 5_000u64
|
||||
));
|
||||
index_writer.delete_term(Term::from_field_text(text_field, "a"));
|
||||
index_writer.delete_term(Term::from_field_text(text_field, "f"));
|
||||
index_writer.add_document(
|
||||
doc!(
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "f g",
|
||||
score_field => 6_000u64
|
||||
));
|
||||
index_writer.add_document(
|
||||
doc!(
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "g h",
|
||||
score_field => 7_000u64
|
||||
));
|
||||
@@ -503,71 +511,112 @@ mod tests {
|
||||
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
|
||||
assert_eq!(searcher.segment_readers()[1].num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[1].max_doc(), 4);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!(3));
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!(3));
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000));
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000));
|
||||
|
||||
let score_field_reader: U64FastFieldReader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
vec![3]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
vec![3]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")),
|
||||
vec![6_000]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")),
|
||||
vec![6_000, 7_000]);
let score_field_reader: U64FastFieldReader = searcher
|
||||
.segment_reader(0)
|
||||
.get_fast_field_reader(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 1);
|
||||
assert_eq!(score_field_reader.max_value(), 3);
|
||||
|
||||
let score_field_reader: U64FastFieldReader = searcher.segment_reader(1).get_fast_field_reader(score_field).unwrap();
|
||||
let score_field_reader: U64FastFieldReader = searcher
|
||||
.segment_reader(1)
|
||||
.get_fast_field_reader(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 4000);
|
||||
assert_eq!(score_field_reader.max_value(), 7000);
|
||||
}
|
||||
{ // merging the segments
|
||||
let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed.");
|
||||
index_writer.merge(&segment_ids)
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
{
|
||||
// merging the segments
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
index_writer
|
||||
.merge(&segment_ids)
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
index.load_searchers().unwrap();
|
||||
let ref searcher = *index.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
assert_eq!(searcher.num_docs(), 3);
|
||||
assert_eq!(searcher.segment_readers()[0].num_docs(), 3);
|
||||
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!(3));
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!(3));
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000));
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000));
|
||||
let score_field_reader: U64FastFieldReader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
vec![3]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
vec![3]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")),
|
||||
vec![6_000]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")),
|
||||
vec![6_000, 7_000]);
|
||||
let score_field_reader: U64FastFieldReader = searcher
|
||||
.segment_reader(0)
|
||||
.get_fast_field_reader(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 3);
|
||||
assert_eq!(score_field_reader.max_value(), 7000);
|
||||
}
|
||||
{
|
||||
{
|
||||
// test a commit with only deletes
|
||||
index_writer.delete_term(Term::from_field_text(text_field, "c"));
|
||||
index_writer.commit().unwrap();
index.load_searchers().unwrap();
|
||||
let ref searcher = *index.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
assert_eq!(searcher.num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000));
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000));
|
||||
let score_field_reader: U64FastFieldReader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")),
|
||||
vec![6_000]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")),
|
||||
vec![6_000, 7_000]);
|
||||
let score_field_reader: U64FastFieldReader = searcher
|
||||
.segment_reader(0)
|
||||
.get_fast_field_reader(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 3);
|
||||
assert_eq!(score_field_reader.max_value(), 7000);
|
||||
}
|
||||
{ // Test merging a single segment in order to remove deletes.
|
||||
let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed.");
|
||||
index_writer.merge(&segment_ids)
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
{
|
||||
// Test merging a single segment in order to remove deletes.
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
index_writer
|
||||
.merge(&segment_ids)
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
|
||||
@@ -575,31 +624,45 @@ mod tests {
|
||||
assert_eq!(searcher.num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].max_doc(), 2);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000));
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000));
|
||||
let score_field_reader: U64FastFieldReader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")),
|
||||
empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")),
|
||||
vec![6_000]);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")),
|
||||
vec![6_000, 7_000]);
|
||||
let score_field_reader: U64FastFieldReader = searcher
|
||||
.segment_reader(0)
|
||||
.get_fast_field_reader(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 6000);
|
||||
assert_eq!(score_field_reader.max_value(), 7000);
|
||||
}
{ // Test removing all docs
|
||||
{
|
||||
// Test removing all docs
|
||||
index_writer.delete_term(Term::from_field_text(text_field, "g"));
|
||||
let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed.");
|
||||
index_writer.merge(&segment_ids)
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
index_writer
|
||||
.merge(&segment_ids)
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
assert_eq!(searcher.num_docs(), 0);
|
||||
}
}
|
||||
}
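Read end to end, the test above boils down to a commit / delete-by-term / merge / reload cycle. A condensed sketch of that flow, using the same calls that appear in the hunks (field names and the writer buffer size are illustrative, and exact signatures may vary between tantivy versions):

// Condensed, illustrative version of the flow exercised by the test above.
// Assumes `Index`, `Field`, `Term` and the `doc!` macro are in scope.
fn delete_then_merge(index: &Index, text_field: Field) {
    let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
    index_writer.add_document(doc!(text_field => "a b d"));
    index_writer.add_document(doc!(text_field => "b c"));
    index_writer.delete_term(Term::from_field_text(text_field, "c"));
    index_writer.commit().unwrap();

    // Merge every searchable segment into a single one, then reload the
    // searchers so the merged segment (with deletes applied) becomes visible.
    let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed.");
    index_writer.merge(&segment_ids).wait().expect("Merging failed");
    index.load_searchers().unwrap();
}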
@@ -8,11 +8,11 @@ use std::fmt;
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
|
||||
pub enum SegmentState {
|
||||
Ready,
|
||||
InMerge,
|
||||
InMerge,
|
||||
}
|
||||
|
||||
impl SegmentState {
|
||||
pub fn letter_code(&self,) -> char {
|
||||
pub fn letter_code(&self) -> char {
|
||||
match *self {
|
||||
SegmentState::InMerge => 'M',
|
||||
SegmentState::Ready => 'R',
|
||||
@@ -21,12 +21,12 @@ impl SegmentState {
|
||||
}
/// A segment entry describes the state of
/// A segment entry describes the state of
/// a given segment, at a given instant.
///
/// In addition to segment meta,
/// it contains a few transient states
/// - state expresses whether the segment is already in the
/// - state expresses whether the segment is already in the
/// middle of a merge
/// - delete_bitset is a bitset describing
/// documents that were deleted during the commit
@@ -40,16 +40,14 @@ pub struct SegmentEntry {
|
||||
state: SegmentState,
|
||||
delete_bitset: Option<BitSet>,
|
||||
delete_cursor: DeleteCursor,
|
||||
|
||||
}
|
||||
|
||||
impl SegmentEntry {
/// Create a new `SegmentEntry`
|
||||
pub fn new(segment_meta: SegmentMeta,
|
||||
pub fn new(segment_meta: SegmentMeta,
|
||||
delete_cursor: DeleteCursor,
|
||||
delete_bitset: Option<BitSet>) -> SegmentEntry {
|
||||
delete_bitset: Option<BitSet>)
|
||||
-> SegmentEntry {
|
||||
SegmentEntry {
|
||||
meta: segment_meta,
|
||||
state: SegmentState::Ready,
|
||||
@@ -62,7 +60,7 @@ impl SegmentEntry {
|
||||
/// Return a reference to the segment entry deleted bitset.
|
||||
///
|
||||
/// `DocId` in this bitset are flagged as deleted.
|
||||
pub fn delete_bitset(&self,) -> Option<&BitSet> {
|
||||
pub fn delete_bitset(&self) -> Option<&BitSet> {
|
||||
self.delete_bitset.as_ref()
|
||||
}
|
||||
|
||||
@@ -77,7 +75,7 @@ impl SegmentEntry {
|
||||
&mut self.delete_cursor
|
||||
}
|
||||
|
||||
/// Return the `SegmentEntry`.
|
||||
/// Return the `SegmentEntry`.
|
||||
///
|
||||
/// The state describes whether the segment is available for
|
||||
/// a merge or not.
|
||||
@@ -89,7 +87,7 @@ impl SegmentEntry {
|
||||
pub fn segment_id(&self) -> SegmentId {
|
||||
self.meta.id()
|
||||
}
/// Accessor to the `SegmentMeta`
|
||||
pub fn meta(&self) -> &SegmentMeta {
|
||||
@@ -99,9 +97,9 @@ impl SegmentEntry {
|
||||
|
||||
/// Mark the `SegmentEntry` as in merge.
|
||||
///
|
||||
/// Only segments that are not already
|
||||
/// Only segments that are not already
|
||||
    /// in a merge are eligible for future merge.
|
||||
pub fn start_merge(&mut self,) {
|
||||
pub fn start_merge(&mut self) {
|
||||
self.state = SegmentState::InMerge;
|
||||
}
|
||||
|
||||
@@ -110,14 +108,14 @@ impl SegmentEntry {
|
||||
/// If a merge fails, it is important to switch
|
||||
    /// the segment back to an idle state, so that it
|
||||
    /// may be eligible for future merges.
|
||||
pub fn cancel_merge(&mut self,) {
|
||||
pub fn cancel_merge(&mut self) {
|
||||
self.state = SegmentState::Ready;
|
||||
}
/// Returns true iff a segment should
|
||||
/// be considered for a merge.
|
||||
pub fn is_ready(&self,) -> bool {
|
||||
pub fn is_ready(&self) -> bool {
|
||||
self.state == SegmentState::Ready
|
||||
}
|
||||
}
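Taken together, `start_merge`, `cancel_merge` and `is_ready` form a small two-state machine over `Ready` and `InMerge`. A self-contained sketch of that pattern (stand-in types, not tantivy's actual ones):

// Stand-alone illustration of the Ready/InMerge state machine described above;
// it mirrors the methods shown in the diff but is not tantivy's real type.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum SegmentState {
    Ready,
    InMerge,
}

struct SegmentEntry {
    state: SegmentState,
}

impl SegmentEntry {
    /// Only a `Ready` segment may be picked up by the merge policy.
    fn is_ready(&self) -> bool {
        self.state == SegmentState::Ready
    }

    /// Reserve the segment for a running merge.
    fn start_merge(&mut self) {
        self.state = SegmentState::InMerge;
    }

    /// Put the segment back in the pool if the merge failed.
    fn cancel_merge(&mut self) {
        self.state = SegmentState::Ready;
    }
}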
|
||||
|
||||
@@ -14,7 +14,7 @@ use indexer::delete_queue::DeleteCursor;
|
||||
struct SegmentRegisters {
|
||||
uncommitted: SegmentRegister,
|
||||
committed: SegmentRegister,
|
||||
writing: HashSet<SegmentId>,
|
||||
writing: HashSet<SegmentId>,
|
||||
}
@@ -22,7 +22,7 @@ struct SegmentRegisters {
|
||||
/// The segment manager stores the list of segments
/// as well as their state.
///
/// It guarantees the atomicity of the
/// It guarantees the atomicity of the
/// changes (merges especially)
#[derive(Default)]
|
||||
pub struct SegmentManager {
|
||||
@@ -32,43 +32,43 @@ pub struct SegmentManager {
|
||||
impl Debug for SegmentManager {
|
||||
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
|
||||
let lock = self.read();
|
||||
write!(f, "{{ uncommitted: {:?}, committed: {:?} }}", lock.uncommitted, lock.committed)
|
||||
write!(f,
|
||||
"{{ uncommitted: {:?}, committed: {:?} }}",
|
||||
lock.uncommitted,
|
||||
lock.committed)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_mergeable_segments(segment_manager: &SegmentManager,) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
|
||||
pub fn get_mergeable_segments(segment_manager: &SegmentManager)
|
||||
-> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
|
||||
let registers_lock = segment_manager.read();
|
||||
(registers_lock.committed.get_mergeable_segments(),
|
||||
registers_lock.uncommitted.get_mergeable_segments())
|
||||
}
|
||||
|
||||
impl SegmentManager {
|
||||
|
||||
pub fn from_segments(segment_metas: Vec<SegmentMeta>, delete_cursor: DeleteCursor) -> SegmentManager {
|
||||
pub fn from_segments(segment_metas: Vec<SegmentMeta>,
|
||||
delete_cursor: DeleteCursor)
|
||||
-> SegmentManager {
|
||||
SegmentManager {
|
||||
registers: RwLock::new(SegmentRegisters {
|
||||
uncommitted: SegmentRegister::default(),
|
||||
committed: SegmentRegister::new(segment_metas, delete_cursor),
|
||||
writing: HashSet::new(),
|
||||
}),
|
||||
uncommitted: SegmentRegister::default(),
|
||||
committed: SegmentRegister::new(segment_metas,
|
||||
delete_cursor),
|
||||
writing: HashSet::new(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns all of the segment entries (committed or uncommitted)
|
||||
pub fn segment_entries(&self,) -> Vec<SegmentEntry> {
|
||||
let mut segment_entries = self.read()
|
||||
.uncommitted
|
||||
.segment_entries();
|
||||
segment_entries.extend(
|
||||
self.read()
|
||||
.committed
|
||||
.segment_entries()
|
||||
);
|
||||
pub fn segment_entries(&self) -> Vec<SegmentEntry> {
|
||||
let mut segment_entries = self.read().uncommitted.segment_entries();
|
||||
segment_entries.extend(self.read().committed.segment_entries());
|
||||
segment_entries
|
||||
}
|
||||
|
||||
/// Returns the overall number of segments in the `SegmentManager`
|
||||
pub fn num_segments(&self,) -> usize {
|
||||
pub fn num_segments(&self) -> usize {
|
||||
let registers_lock = self.read();
|
||||
registers_lock.committed.len() + registers_lock.uncommitted.len()
|
||||
}
|
||||
@@ -78,19 +78,14 @@ impl SegmentManager {
|
||||
let mut files = HashSet::new();
|
||||
files.insert(META_FILEPATH.clone());
|
||||
files.insert(LOCKFILE_FILEPATH.clone());
|
||||
|
||||
let segment_metas: Vec<SegmentMeta> =
|
||||
registers_lock.committed
|
||||
.get_all_segments()
|
||||
.into_iter()
|
||||
.chain(registers_lock.uncommitted
|
||||
.get_all_segments()
|
||||
.into_iter())
|
||||
.chain(registers_lock.writing
|
||||
.iter()
|
||||
.cloned()
|
||||
.map(SegmentMeta::new))
|
||||
.collect();
|
||||
|
||||
let segment_metas: Vec<SegmentMeta> = registers_lock
|
||||
.committed
|
||||
.get_all_segments()
|
||||
.into_iter()
|
||||
.chain(registers_lock.uncommitted.get_all_segments().into_iter())
|
||||
.chain(registers_lock.writing.iter().cloned().map(SegmentMeta::new))
|
||||
.collect();
|
||||
for segment_meta in segment_metas {
|
||||
files.extend(segment_meta.list_files());
|
||||
}
|
||||
@@ -102,18 +97,22 @@ impl SegmentManager {
|
||||
registers
|
||||
.committed
|
||||
.segment_entry(segment_id)
|
||||
.or_else(|| registers.uncommitted.segment_entry(segment_id))
|
||||
.or_else(|| registers.uncommitted.segment_entry(segment_id))
|
||||
}
|
||||
|
||||
// Lock poisoning should never happen :
|
||||
// The lock is acquired and released within this class,
|
||||
// and the operations cannot panic.
|
||||
fn read(&self,) -> RwLockReadGuard<SegmentRegisters> {
|
||||
self.registers.read().expect("Failed to acquire read lock on SegmentManager.")
|
||||
// and the operations cannot panic.
|
||||
fn read(&self) -> RwLockReadGuard<SegmentRegisters> {
|
||||
self.registers
|
||||
.read()
|
||||
.expect("Failed to acquire read lock on SegmentManager.")
|
||||
}
|
||||
|
||||
fn write(&self,) -> RwLockWriteGuard<SegmentRegisters> {
|
||||
self.registers.write().expect("Failed to acquire write lock on SegmentManager.")
|
||||
fn write(&self) -> RwLockWriteGuard<SegmentRegisters> {
|
||||
self.registers
|
||||
.write()
|
||||
.expect("Failed to acquire write lock on SegmentManager.")
|
||||
}
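The `read`/`write` helpers above centralize lock-poisoning handling: the `RwLock` is only ever taken through them, so poisoning turns into a single `expect` site. A minimal stand-alone version of the same pattern, using only `std::sync::RwLock`:

use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};

// Stand-alone illustration of the accessor pattern above. The lock is only
// ever acquired through these two helpers, so poisoning is handled once.
struct Registers {
    inner: RwLock<Vec<String>>,
}

impl Registers {
    fn read(&self) -> RwLockReadGuard<Vec<String>> {
        self.inner
            .read()
            .expect("Failed to acquire read lock")
    }

    fn write(&self) -> RwLockWriteGuard<Vec<String>> {
        self.inner
            .write()
            .expect("Failed to acquire write lock")
    }
}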
|
||||
|
||||
pub fn commit(&self, segment_entries: Vec<SegmentEntry>) {
|
||||
@@ -124,42 +123,42 @@ impl SegmentManager {
|
||||
registers_lock.committed.add_segment_entry(segment_entry);
|
||||
}
|
||||
}
pub fn start_merge(&self, segment_ids: &[SegmentId]) {
|
||||
let mut registers_lock = self.write();
|
||||
if registers_lock.uncommitted.contains_all(segment_ids) {
|
||||
for segment_id in segment_ids {
|
||||
registers_lock.uncommitted.start_merge(segment_id);
|
||||
}
|
||||
}
|
||||
else if registers_lock.committed.contains_all(segment_ids) {
|
||||
} else if registers_lock.committed.contains_all(segment_ids) {
|
||||
for segment_id in segment_ids {
|
||||
registers_lock.committed.start_merge(segment_id);
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
error!("Merge operation sent for segments that are not all uncommited or commited.");
|
||||
}
|
||||
}
pub fn cancel_merge(&self,
|
||||
before_merge_segment_ids: &[SegmentId],
|
||||
after_merge_segment_id: SegmentId) {
|
||||
|
||||
before_merge_segment_ids: &[SegmentId],
|
||||
after_merge_segment_id: SegmentId) {
|
||||
|
||||
let mut registers_lock = self.write();
        // we mark all segments as ready for merge.
|
||||
{
|
||||
let target_segment_register: &mut SegmentRegister;
|
||||
target_segment_register = {
|
||||
if registers_lock.uncommitted.contains_all(&before_merge_segment_ids) {
|
||||
if registers_lock
|
||||
.uncommitted
|
||||
.contains_all(&before_merge_segment_ids) {
|
||||
&mut registers_lock.uncommitted
|
||||
}
|
||||
else if registers_lock.committed.contains_all(&before_merge_segment_ids) {
|
||||
} else if registers_lock
|
||||
.committed
|
||||
.contains_all(&before_merge_segment_ids) {
|
||||
&mut registers_lock.committed
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
warn!("couldn't find segment in SegmentManager");
|
||||
return;
|
||||
}
|
||||
@@ -185,19 +184,24 @@ impl SegmentManager {
|
||||
registers_lock.writing.remove(&segment_entry.segment_id());
|
||||
registers_lock.uncommitted.add_segment_entry(segment_entry);
|
||||
}
pub fn end_merge(&self,
|
||||
before_merge_segment_ids: &[SegmentId],
|
||||
after_merge_segment_entry: SegmentEntry) {
|
||||
|
||||
before_merge_segment_ids: &[SegmentId],
|
||||
after_merge_segment_entry: SegmentEntry) {
|
||||
|
||||
let mut registers_lock = self.write();
|
||||
registers_lock.writing.remove(&after_merge_segment_entry.segment_id());
|
||||
|
||||
registers_lock
|
||||
.writing
|
||||
.remove(&after_merge_segment_entry.segment_id());
|
||||
|
||||
let mut target_register: &mut SegmentRegister = {
|
||||
if registers_lock.uncommitted.contains_all(&before_merge_segment_ids) {
|
||||
if registers_lock
|
||||
.uncommitted
|
||||
.contains_all(&before_merge_segment_ids) {
|
||||
&mut registers_lock.uncommitted
|
||||
}
|
||||
else if registers_lock.committed.contains_all(&before_merge_segment_ids) {
|
||||
} else if registers_lock
|
||||
.committed
|
||||
.contains_all(&before_merge_segment_ids) {
|
||||
&mut registers_lock.committed
|
||||
} else {
|
||||
warn!("couldn't find segment in SegmentManager");
|
||||
@@ -209,12 +213,12 @@ impl SegmentManager {
|
||||
}
|
||||
target_register.add_segment_entry(after_merge_segment_entry);
}
|
||||
|
||||
pub fn committed_segment_metas(&self,) -> Vec<SegmentMeta> {
|
||||
pub fn committed_segment_metas(&self) -> Vec<SegmentMeta> {
|
||||
let registers_lock = self.read();
|
||||
registers_lock.committed.segment_metas()
|
||||
}
|
||||
|
||||
@@ -9,14 +9,14 @@ use indexer::delete_queue::DeleteCursor;
|
||||
/// The segment register keeps track
/// of the list of segments, their size as well
/// as the state they are in.
///
/// It is consumed by indexes to get the list of
///
/// It is consumed by indexes to get the list of
/// segments that are currently searchable,
/// and by the index merger to identify
/// and by the index merger to identify
/// merge candidates.
|
||||
#[derive(Default)]
|
||||
pub struct SegmentRegister {
|
||||
segment_states: HashMap<SegmentId, SegmentEntry>,
|
||||
segment_states: HashMap<SegmentId, SegmentEntry>,
|
||||
}
|
||||
|
||||
|
||||
@@ -33,8 +33,7 @@ impl Debug for SegmentRegister {
|
||||
|
||||
|
||||
impl SegmentRegister {
|
||||
|
||||
pub fn clear(&mut self,) {
|
||||
pub fn clear(&mut self) {
|
||||
self.segment_states.clear();
|
||||
}
|
||||
|
||||
@@ -42,29 +41,26 @@ impl SegmentRegister {
|
||||
self.segment_states.len()
|
||||
}
|
||||
|
||||
pub fn get_all_segments(&self,) -> Vec<SegmentMeta> {
|
||||
pub fn get_all_segments(&self) -> Vec<SegmentMeta> {
|
||||
self.segment_states
|
||||
.values()
|
||||
.map(|segment_entry| segment_entry.meta().clone())
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn get_mergeable_segments(&self,) -> Vec<SegmentMeta> {
|
||||
|
||||
pub fn get_mergeable_segments(&self) -> Vec<SegmentMeta> {
|
||||
self.segment_states
|
||||
.values()
|
||||
.filter(|segment_entry| segment_entry.is_ready())
|
||||
.map(|segment_entry| segment_entry.meta().clone())
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn segment_entries(&self,) -> Vec<SegmentEntry> {
|
||||
self.segment_states
|
||||
.values()
|
||||
.cloned()
|
||||
.collect()
|
||||
|
||||
pub fn segment_entries(&self) -> Vec<SegmentEntry> {
|
||||
self.segment_states.values().cloned().collect()
|
||||
}
|
||||
|
||||
pub fn segment_metas(&self,) -> Vec<SegmentMeta> {
|
||||
|
||||
pub fn segment_metas(&self) -> Vec<SegmentMeta> {
|
||||
let mut segment_ids: Vec<SegmentMeta> = self.segment_states
|
||||
.values()
|
||||
.map(|segment_entry| segment_entry.meta().clone())
|
||||
@@ -72,28 +68,28 @@ impl SegmentRegister {
|
||||
segment_ids.sort_by_key(|meta| meta.id());
|
||||
segment_ids
|
||||
}
|
||||
|
||||
|
||||
pub fn segment_entry(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
|
||||
self.segment_states
|
||||
.get(&segment_id)
|
||||
.map(|segment_entry| segment_entry.clone())
|
||||
}
|
||||
|
||||
|
||||
pub fn contains_all(&mut self, segment_ids: &[SegmentId]) -> bool {
|
||||
segment_ids
|
||||
.iter()
|
||||
.all(|segment_id| self.segment_states.contains_key(segment_id))
|
||||
}
|
||||
|
||||
|
||||
pub fn add_segment_entry(&mut self, segment_entry: SegmentEntry) {
|
||||
let segment_id = segment_entry.segment_id();
|
||||
self.segment_states.insert(segment_id, segment_entry);
|
||||
}
|
||||
|
||||
|
||||
pub fn remove_segment(&mut self, segment_id: &SegmentId) {
|
||||
self.segment_states.remove(segment_id);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
pub fn cancel_merge(&mut self, segment_id: &SegmentId) {
|
||||
self.segment_states
|
||||
.get_mut(segment_id)
|
||||
@@ -106,21 +102,16 @@ impl SegmentRegister {
|
||||
.get_mut(segment_id)
|
||||
.expect("Received a merge notification for a segment that is not registered")
|
||||
.start_merge();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
pub fn new(segment_metas: Vec<SegmentMeta>, delete_cursor: DeleteCursor) -> SegmentRegister {
|
||||
let mut segment_states = HashMap::new();
|
||||
for segment_meta in segment_metas {
|
||||
let segment_id = segment_meta.id();
|
||||
let segment_entry = SegmentEntry::new(
|
||||
segment_meta,
|
||||
delete_cursor.clone(),
|
||||
None);
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor.clone(), None);
|
||||
segment_states.insert(segment_id, segment_entry);
|
||||
}
|
||||
SegmentRegister {
|
||||
segment_states: segment_states
|
||||
}
|
||||
SegmentRegister { segment_states: segment_states }
|
||||
}
|
||||
}
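`SegmentRegister` is essentially a `HashMap` keyed by segment id plus a couple of filtered views over it. With stand-in types (not tantivy's), the constructor and the "mergeable" view shown above reduce to:

use std::collections::HashMap;

// Stand-in sketch of the register above: one entry per segment id, with the
// "mergeable" view filtering on readiness, as get_mergeable_segments does.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
struct SegmentId(u64);

#[derive(Clone, Debug)]
struct Entry {
    id: SegmentId,
    ready: bool,
}

struct Register {
    segment_states: HashMap<SegmentId, Entry>,
}

impl Register {
    fn new(entries: Vec<Entry>) -> Register {
        let mut segment_states = HashMap::new();
        for entry in entries {
            segment_states.insert(entry.id, entry);
        }
        Register { segment_states: segment_states }
    }

    fn mergeable(&self) -> Vec<Entry> {
        self.segment_states
            .values()
            .filter(|entry| entry.ready)
            .cloned()
            .collect()
    }
}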
|
||||
|
||||
@@ -140,7 +131,7 @@ mod tests {
|
||||
.map(|segment_meta| segment_meta.id())
|
||||
.collect()
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_segment_register() {
|
||||
let delete_queue = DeleteQueue::new();
|
||||
@@ -149,32 +140,48 @@ mod tests {
|
||||
let segment_id_a = SegmentId::generate_random();
|
||||
let segment_id_b = SegmentId::generate_random();
|
||||
let segment_id_merged = SegmentId::generate_random();
{
|
||||
let segment_meta = SegmentMeta::new(segment_id_a);
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
|
||||
segment_register.add_segment_entry(segment_entry);
|
||||
}
|
||||
assert_eq!(segment_register.segment_entry(&segment_id_a).unwrap().state(), SegmentState::Ready);
|
||||
assert_eq!(segment_ids(&segment_register), vec!(segment_id_a));
|
||||
assert_eq!(segment_register
|
||||
.segment_entry(&segment_id_a)
|
||||
.unwrap()
|
||||
.state(),
|
||||
SegmentState::Ready);
|
||||
assert_eq!(segment_ids(&segment_register), vec![segment_id_a]);
|
||||
{
|
||||
let segment_meta = SegmentMeta::new(segment_id_b);
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
|
||||
segment_register.add_segment_entry(segment_entry);
|
||||
}
|
||||
assert_eq!(segment_register.segment_entry(&segment_id_b).unwrap().state(), SegmentState::Ready);
|
||||
assert_eq!(segment_register
|
||||
.segment_entry(&segment_id_b)
|
||||
.unwrap()
|
||||
.state(),
|
||||
SegmentState::Ready);
|
||||
segment_register.start_merge(&segment_id_a);
|
||||
segment_register.start_merge(&segment_id_b);
|
||||
assert_eq!(segment_register.segment_entry(&segment_id_a).unwrap().state(), SegmentState::InMerge);
|
||||
assert_eq!(segment_register.segment_entry(&segment_id_b).unwrap().state(), SegmentState::InMerge);
|
||||
assert_eq!(segment_register
|
||||
.segment_entry(&segment_id_a)
|
||||
.unwrap()
|
||||
.state(),
|
||||
SegmentState::InMerge);
|
||||
assert_eq!(segment_register
|
||||
.segment_entry(&segment_id_b)
|
||||
.unwrap()
|
||||
.state(),
|
||||
SegmentState::InMerge);
|
||||
segment_register.remove_segment(&segment_id_a);
|
||||
segment_register.remove_segment(&segment_id_b);
|
||||
{
|
||||
let segment_meta_merged = SegmentMeta::new(segment_id_merged);
|
||||
let segment_entry = SegmentEntry::new(segment_meta_merged, delete_queue.cursor(), None);
|
||||
segment_register.add_segment_entry(segment_entry);
|
||||
let segment_entry = SegmentEntry::new(segment_meta_merged, delete_queue.cursor(), None);
|
||||
segment_register.add_segment_entry(segment_entry);
|
||||
}
|
||||
assert_eq!(segment_ids(&segment_register), vec!(segment_id_merged));
|
||||
assert_eq!(segment_ids(&segment_register), vec![segment_id_merged]);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -29,11 +29,11 @@ impl SegmentSerializer {
|
||||
|
||||
let postings_serializer = try!(PostingsSerializer::open(segment));
|
||||
Ok(SegmentSerializer {
|
||||
postings_serializer: postings_serializer,
|
||||
store_writer: StoreWriter::new(store_write),
|
||||
fast_field_serializer: fast_field_serializer,
|
||||
fieldnorms_serializer: fieldnorms_serializer,
|
||||
})
|
||||
postings_serializer: postings_serializer,
|
||||
store_writer: StoreWriter::new(store_write),
|
||||
fast_field_serializer: fast_field_serializer,
|
||||
fieldnorms_serializer: fieldnorms_serializer,
|
||||
})
|
||||
}
|
||||
|
||||
/// Accessor to the `PostingsSerializer`.
|
||||
|
||||
@@ -49,11 +49,8 @@ use super::segment_manager::{SegmentManager, get_mergeable_segments};
|
||||
/// and flushed.
|
||||
///
|
||||
/// This method is not part of tantivy's public API
|
||||
pub fn save_new_metas(schema: Schema,
|
||||
opstamp: u64,
|
||||
directory: &mut Directory)
|
||||
-> Result<()> {
|
||||
save_metas(vec!(), schema, opstamp, directory)
|
||||
pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) -> Result<()> {
|
||||
save_metas(vec![], schema, opstamp, directory)
|
||||
}
|
||||
|
||||
|
||||
@@ -82,7 +79,7 @@ pub fn save_metas(segment_metas: Vec<SegmentMeta>,
|
||||
let res = directory.atomic_write(&META_FILEPATH, &w[..])?;
|
||||
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
|
||||
Ok(res)
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -90,7 +87,7 @@ pub fn save_metas(segment_metas: Vec<SegmentMeta>,
|
||||
// of the `SegmentUpdate`s.
|
||||
//
|
||||
// All this processing happens on a single thread
|
||||
// consuming a common queue.
|
||||
// consuming a common queue.
|
||||
#[derive(Clone)]
|
||||
pub struct SegmentUpdater(Arc<InnerSegmentUpdater>);
|
||||
|
||||
@@ -99,56 +96,56 @@ pub struct SegmentUpdater(Arc<InnerSegmentUpdater>);
|
||||
fn perform_merge(segment_ids: &[SegmentId],
|
||||
segment_updater: &SegmentUpdater,
|
||||
mut merged_segment: Segment,
|
||||
target_opstamp: u64) -> Result<SegmentEntry> {
|
||||
target_opstamp: u64)
|
||||
-> Result<SegmentEntry> {
|
||||
// first we need to apply deletes to our segment.
|
||||
info!("Start merge: {:?}", segment_ids);
let ref index = segment_updater.0.index;
|
||||
let schema = index.schema();
|
||||
let mut segment_entries = vec!();
|
||||
let mut segment_entries = vec![];
|
||||
|
||||
let mut file_protections: Vec<FileProtection> = vec!();
|
||||
let mut file_protections: Vec<FileProtection> = vec![];
|
||||
|
||||
for segment_id in segment_ids {
|
||||
if let Some(mut segment_entry) = segment_updater.0
|
||||
.segment_manager
|
||||
.segment_entry(segment_id) {
|
||||
if let Some(mut segment_entry) =
|
||||
segment_updater.0.segment_manager.segment_entry(segment_id) {
|
||||
let segment = index.segment(segment_entry.meta().clone());
|
||||
if let Some(file_protection) = advance_deletes(segment, &mut segment_entry, target_opstamp)? {
|
||||
if let Some(file_protection) =
|
||||
advance_deletes(segment, &mut segment_entry, target_opstamp)? {
|
||||
file_protections.push(file_protection);
|
||||
}
|
||||
segment_entries.push(segment_entry);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
error!("Error, had to abort merge as some of the segment is not managed anymore.a");
|
||||
return Err(Error::InvalidArgument(format!("Segment {:?} requested for merge is not managed.", segment_id)));
|
||||
return Err(Error::InvalidArgument(format!("Segment {:?} requested for merge is not managed.",
|
||||
segment_id)));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
let delete_cursor = segment_entries[0].delete_cursor().clone();
|
||||
|
||||
let segments: Vec<Segment> = segment_entries
|
||||
.iter()
|
||||
.map(|segment_entry| {
|
||||
index.segment(segment_entry.meta().clone())
|
||||
})
|
||||
.map(|segment_entry| index.segment(segment_entry.meta().clone()))
|
||||
.collect();
// An IndexMerger is like a "view" of our merged segments.
|
||||
let merger: IndexMerger = IndexMerger::open(schema, &segments[..])?;
|
||||
|
||||
// ... we just serialize this index merger in our new segment
|
||||
// to merge the two segments.
|
||||
|
||||
let segment_serializer =
|
||||
SegmentSerializer::for_segment(&mut merged_segment)
|
||||
let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment)
|
||||
.expect("Creating index serializer failed");
|
||||
|
||||
let num_docs = merger.write(segment_serializer).expect("Serializing merged index failed");
|
||||
let num_docs = merger
|
||||
.write(segment_serializer)
|
||||
.expect("Serializing merged index failed");
|
||||
let mut segment_meta = SegmentMeta::new(merged_segment.id());
|
||||
segment_meta.set_max_doc(num_docs);
|
||||
|
||||
|
||||
let after_merge_segment_entry = SegmentEntry::new(segment_meta.clone(), delete_cursor, None);
|
||||
Ok(after_merge_segment_entry)
|
||||
}
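Stripped of bookkeeping, `perform_merge` does four things: open an `IndexMerger` over the source segments, serialize that view into the new segment, record the resulting document count in a fresh `SegmentMeta`, and hand it back. A sketch of that core, using the calls as they appear in this diff (error handling collapsed into `?`, so treat it as illustrative only):

// Illustrative core of the merge above; not the full function.
fn merge_core(schema: Schema,
              segments: &[Segment],
              mut merged_segment: Segment)
              -> Result<SegmentMeta> {
    // An IndexMerger is a read-only "view" over the segments being merged...
    let merger = IndexMerger::open(schema, segments)?;
    // ...and serializing that view into the target segment performs the merge.
    let serializer = SegmentSerializer::for_segment(&mut merged_segment)?;
    let num_docs = merger.write(serializer)?;
    let mut segment_meta = SegmentMeta::new(merged_segment.id());
    segment_meta.set_max_doc(num_docs);
    Ok(segment_meta)
}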
|
||||
@@ -162,30 +159,28 @@ struct InnerSegmentUpdater {
|
||||
merging_thread_id: AtomicUsize,
|
||||
merging_threads: RwLock<HashMap<usize, JoinHandle<Result<()>>>>,
|
||||
generation: AtomicUsize,
|
||||
killed: AtomicBool,
|
||||
killed: AtomicBool,
|
||||
stamper: Stamper,
|
||||
}
|
||||
|
||||
impl SegmentUpdater {
|
||||
|
||||
pub fn new(index: Index,
|
||||
stamper: Stamper,
|
||||
delete_cursor: DeleteCursor) -> Result<SegmentUpdater> {
|
||||
delete_cursor: DeleteCursor)
|
||||
-> Result<SegmentUpdater> {
|
||||
let segments = index.searchable_segment_metas()?;
|
||||
let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
|
||||
Ok(
|
||||
SegmentUpdater(Arc::new(InnerSegmentUpdater {
|
||||
pool: CpuPool::new(1),
|
||||
index: index,
|
||||
segment_manager: segment_manager,
|
||||
merge_policy: RwLock::new(box DefaultMergePolicy::default()),
|
||||
merging_thread_id: AtomicUsize::default(),
|
||||
merging_threads: RwLock::new(HashMap::new()),
|
||||
generation: AtomicUsize::default(),
|
||||
killed: AtomicBool::new(false),
|
||||
stamper: stamper,
|
||||
}))
|
||||
)
|
||||
Ok(SegmentUpdater(Arc::new(InnerSegmentUpdater {
|
||||
pool: CpuPool::new(1),
|
||||
index: index,
|
||||
segment_manager: segment_manager,
|
||||
merge_policy: RwLock::new(box DefaultMergePolicy::default()),
|
||||
merging_thread_id: AtomicUsize::default(),
|
||||
merging_threads: RwLock::new(HashMap::new()),
|
||||
generation: AtomicUsize::default(),
|
||||
killed: AtomicBool::new(false),
|
||||
stamper: stamper,
|
||||
})))
|
||||
}
|
||||
|
||||
pub fn new_segment(&self) -> Segment {
|
||||
@@ -200,40 +195,41 @@ impl SegmentUpdater {
|
||||
}
|
||||
|
||||
pub fn set_merge_policy(&self, merge_policy: Box<MergePolicy>) {
|
||||
*self.0.merge_policy.write().unwrap()= merge_policy;
|
||||
*self.0.merge_policy.write().unwrap() = merge_policy;
|
||||
}
|
||||
|
||||
fn get_merging_thread_id(&self) -> usize {
|
||||
self.0.merging_thread_id.fetch_add(1, Ordering::SeqCst)
|
||||
}
|
||||
|
||||
fn run_async<T: 'static + Send, F: 'static + Send + FnOnce(SegmentUpdater) -> T>(&self, f: F) -> CpuFuture<T, Error> {
|
||||
fn run_async<T: 'static + Send, F: 'static + Send + FnOnce(SegmentUpdater) -> T>
|
||||
(&self,
|
||||
f: F)
|
||||
-> CpuFuture<T, Error> {
|
||||
let me_clone = self.clone();
|
||||
self.0.pool.spawn_fn(move || {
|
||||
Ok(f(me_clone))
|
||||
})
|
||||
self.0.pool.spawn_fn(move || Ok(f(me_clone)))
|
||||
}
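`run_async` is a thin wrapper around `futures-cpupool`: every updater operation is pushed onto a single-threaded pool and then either `wait()`ed on or `forget()`-ed. A minimal stand-alone example of that idiom, assuming the 0.1-era `futures` / `futures-cpupool` crates used here:

extern crate futures;
extern crate futures_cpupool;

use futures::Future;
use futures_cpupool::CpuPool;

fn main() {
    // One worker thread: operations submitted here are serialized,
    // which is exactly what the segment updater relies on.
    let pool = CpuPool::new(1);
    let future = pool.spawn_fn(|| {
        // Any closure returning a Result (or another future) can be spawned.
        Ok::<_, ()>(6 * 7)
    });
    // `wait()` blocks the caller; `forget()` would instead detach the task.
    assert_eq!(future.wait(), Ok(42));
}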
|
||||
|
||||
|
||||
pub fn add_segment(&self, generation: usize, segment_entry: SegmentEntry) -> bool {
|
||||
if generation >= self.0.generation.load(Ordering::Acquire) {
|
||||
self.run_async(|segment_updater| {
|
||||
segment_updater.0.segment_manager.add_segment(segment_entry);
|
||||
segment_updater.consider_merge_options();
|
||||
true
|
||||
}).forget();
|
||||
segment_updater.0.segment_manager.add_segment(segment_entry);
|
||||
segment_updater.consider_merge_options();
|
||||
true
|
||||
})
|
||||
.forget();
|
||||
true
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
pub fn kill(&mut self,) {
|
||||
pub fn kill(&mut self) {
|
||||
self.0.killed.store(true, Ordering::Release);
|
||||
}
|
||||
|
||||
fn is_alive(&self,) -> bool {
|
||||
fn is_alive(&self) -> bool {
|
||||
!self.0.killed.load(Ordering::Acquire)
|
||||
}
|
||||
|
||||
@@ -243,77 +239,80 @@ impl SegmentUpdater {
|
||||
    /// The method returns copies of the segment entries,
|
||||
/// updated with the delete information.
|
||||
fn purge_deletes(&self, target_opstamp: u64) -> Result<Vec<SegmentEntry>> {
|
||||
let mut segment_entries = self.0.segment_manager.segment_entries();
|
||||
let mut segment_entries = self.0.segment_manager.segment_entries();
|
||||
for segment_entry in &mut segment_entries {
|
||||
let segment = self.0.index.segment(segment_entry.meta().clone());
|
||||
advance_deletes(segment, segment_entry, target_opstamp)?;
|
||||
}
|
||||
Ok(segment_entries)
|
||||
|
||||
|
||||
}
|
||||
|
||||
pub fn save_metas(&self, opstamp: u64) {
|
||||
if self.is_alive() {
|
||||
let index = &self.0.index;
|
||||
let directory = index.directory();
|
||||
save_metas(
|
||||
self.0.segment_manager.committed_segment_metas(),
|
||||
index.schema(),
|
||||
opstamp,
|
||||
directory.box_clone().borrow_mut()).expect("Could not save metas.");
|
||||
save_metas(self.0.segment_manager.committed_segment_metas(),
|
||||
index.schema(),
|
||||
opstamp,
|
||||
directory.box_clone().borrow_mut())
|
||||
.expect("Could not save metas.");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn garbage_collect_files(&self) -> Result<()> {
|
||||
self.run_async(move |segment_updater| {
|
||||
segment_updater.garbage_collect_files_exec();
|
||||
}).wait()
|
||||
self.run_async(move |segment_updater| { segment_updater.garbage_collect_files_exec(); })
|
||||
.wait()
|
||||
}
|
||||
|
||||
fn garbage_collect_files_exec(&self) {
|
||||
fn garbage_collect_files_exec(&self) {
|
||||
let living_files = self.0.segment_manager.list_files();
|
||||
let mut index = self.0.index.clone();
|
||||
index.directory_mut().garbage_collect(living_files);
|
||||
}
|
||||
|
||||
pub fn commit(&self, opstamp: u64) -> Result<()> {
|
||||
self.run_async(move |segment_updater| {
|
||||
if segment_updater.is_alive() {
|
||||
let segment_entries = segment_updater
|
||||
.purge_deletes(opstamp)
|
||||
.expect("Failed purge deletes");
|
||||
segment_updater.0.segment_manager.commit(segment_entries);
|
||||
segment_updater.save_metas(opstamp);
|
||||
segment_updater.garbage_collect_files_exec();
|
||||
segment_updater.consider_merge_options();
|
||||
}
|
||||
}).wait()
|
||||
self.run_async(move |segment_updater| if segment_updater.is_alive() {
|
||||
let segment_entries = segment_updater
|
||||
.purge_deletes(opstamp)
|
||||
.expect("Failed purge deletes");
|
||||
segment_updater.0.segment_manager.commit(segment_entries);
|
||||
segment_updater.save_metas(opstamp);
|
||||
segment_updater.garbage_collect_files_exec();
|
||||
segment_updater.consider_merge_options();
|
||||
})
|
||||
.wait()
|
||||
}
|
||||
|
||||
|
||||
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> impl Future<Item=SegmentMeta, Error=Canceled> {
|
||||
|
||||
pub fn start_merge(&self,
|
||||
segment_ids: &[SegmentId])
|
||||
-> impl Future<Item = SegmentMeta, Error = Canceled> {
|
||||
|
||||
self.0.segment_manager.start_merge(segment_ids);
|
||||
let segment_updater_clone = self.clone();
|
||||
|
||||
let segment_ids_vec = segment_ids.to_vec();
|
||||
|
||||
|
||||
let segment_ids_vec = segment_ids.to_vec();
|
||||
|
||||
let merging_thread_id = self.get_merging_thread_id();
|
||||
let (merging_future_send, merging_future_recv) = oneshot();
|
||||
|
||||
|
||||
if segment_ids.is_empty() {
|
||||
return merging_future_recv;
|
||||
}
let target_opstamp = self.0.stamper.stamp();
|
||||
let merging_join_handle = thread::spawn(move || {
|
||||
|
||||
|
||||
// first we need to apply deletes to our segment.
|
||||
info!("Start merge: {:?}", segment_ids_vec);
|
||||
|
||||
let merged_segment = segment_updater_clone.new_segment();
|
||||
|
||||
let merged_segment = segment_updater_clone.new_segment();
|
||||
let merged_segment_id = merged_segment.id();
|
||||
let merge_result = perform_merge(&segment_ids_vec, &segment_updater_clone, merged_segment, target_opstamp);
|
||||
let merge_result = perform_merge(&segment_ids_vec,
|
||||
&segment_updater_clone,
|
||||
merged_segment,
|
||||
target_opstamp);
|
||||
|
||||
match merge_result {
|
||||
Ok(after_merge_segment_entry) => {
|
||||
@@ -321,11 +320,11 @@ impl SegmentUpdater {
|
||||
segment_updater_clone
|
||||
.end_merge(segment_ids_vec, after_merge_segment_entry)
|
||||
.expect("Segment updater thread is corrupted.");
|
||||
|
||||
// the future may fail if the listener of the oneshot future
|
||||
|
||||
// the future may fail if the listener of the oneshot future
|
||||
// has been destroyed.
|
||||
//
|
||||
// This is not a problem here, so we just ignore any
|
||||
// This is not a problem here, so we just ignore any
|
||||
// possible error.
|
||||
let _merging_future_res = merging_future_send.send(merged_segment_meta);
|
||||
}
|
||||
@@ -339,16 +338,26 @@ impl SegmentUpdater {
|
||||
// merging_future_send will be dropped, sending an error to the future.
|
||||
}
|
||||
}
|
||||
segment_updater_clone.0.merging_threads.write().unwrap().remove(&merging_thread_id);
|
||||
segment_updater_clone
|
||||
.0
|
||||
.merging_threads
|
||||
.write()
|
||||
.unwrap()
|
||||
.remove(&merging_thread_id);
|
||||
Ok(())
|
||||
});
|
||||
self.0.merging_threads.write().unwrap().insert(merging_thread_id, merging_join_handle);
|
||||
self.0
|
||||
.merging_threads
|
||||
.write()
|
||||
.unwrap()
|
||||
.insert(merging_thread_id, merging_join_handle);
|
||||
merging_future_recv
|
||||
}
|
||||
|
||||
|
||||
fn consider_merge_options(&self) {
|
||||
let (committed_segments, uncommitted_segments) = get_mergeable_segments(&self.0.segment_manager);
|
||||
let (committed_segments, uncommitted_segments) =
|
||||
get_mergeable_segments(&self.0.segment_manager);
|
||||
// Committed segments cannot be merged with uncommitted_segments.
|
||||
// We therefore consider merges using these two sets of segments independently.
|
||||
let merge_policy = self.get_merge_policy();
|
||||
@@ -360,17 +369,20 @@ impl SegmentUpdater {
|
||||
}
|
||||
}
|
||||
|
||||
fn cancel_merge(&self,
|
||||
before_merge_segment_ids: &[SegmentId],
|
||||
after_merge_segment_entry: SegmentId) {
|
||||
self.0.segment_manager.cancel_merge(&before_merge_segment_ids, after_merge_segment_entry);
|
||||
fn cancel_merge(&self,
|
||||
before_merge_segment_ids: &[SegmentId],
|
||||
after_merge_segment_entry: SegmentId) {
|
||||
self.0
|
||||
.segment_manager
|
||||
.cancel_merge(&before_merge_segment_ids, after_merge_segment_entry);
|
||||
}
|
||||
|
||||
|
||||
fn end_merge(&self,
|
||||
before_merge_segment_ids: Vec<SegmentId>,
|
||||
mut after_merge_segment_entry: SegmentEntry) -> Result<()> {
|
||||
|
||||
|
||||
fn end_merge(&self,
|
||||
before_merge_segment_ids: Vec<SegmentId>,
|
||||
mut after_merge_segment_entry: SegmentEntry)
|
||||
-> Result<()> {
|
||||
|
||||
self.run_async(move |segment_updater| {
|
||||
debug!("End merge {:?}", after_merge_segment_entry.meta());
|
||||
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
|
||||
@@ -406,22 +418,22 @@ impl SegmentUpdater {
|
||||
    ///
    /// Upon termination of the current merging threads,
    /// merge opportunities may appear.
    //
    //
    /// We keep waiting until the merge policy judges that
    /// no opportunity is available.
    ///
    /// Note that it is not required to call this
    /// Note that it is not required to call this
    /// method in your application.
    /// Terminating your application without letting
    /// Terminating your application without letting
    /// merge terminate is perfectly safe.
    ///
    ///
    /// Obsolete files will eventually be cleaned up
    /// by the directory garbage collector.
|
||||
pub fn wait_merging_thread(&self) -> Result<()> {
|
||||
|
||||
let mut num_segments: usize;
|
||||
loop {
|
||||
|
||||
|
||||
num_segments = self.0.segment_manager.num_segments();
|
||||
|
||||
let mut new_merging_threads = HashMap::new();
|
||||
@@ -434,9 +446,7 @@ impl SegmentUpdater {
|
||||
merging_thread_handle
|
||||
.join()
|
||||
.map(|_| ())
|
||||
.map_err(|_| {
|
||||
Error::ErrorInThread("Merging thread failed.".to_string())
|
||||
})?
|
||||
.map_err(|_| Error::ErrorInThread("Merging thread failed.".to_string()))?
|
||||
}
|
||||
// Our merging thread may have queued their completed
|
||||
self.run_async(move |_| {}).wait()?;
|
||||
@@ -446,10 +456,9 @@ impl SegmentUpdater {
|
||||
if new_num_segments >= num_segments {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -469,7 +478,7 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
index_writer.set_merge_policy(box MergeWheneverPossible);
|
||||
@@ -481,7 +490,7 @@ mod tests {
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
|
||||
|
||||
{
|
||||
for _ in 0..100 {
|
||||
index_writer.add_document(doc!(text_field=>"c"));
|
||||
@@ -489,7 +498,7 @@ mod tests {
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
|
||||
|
||||
{
|
||||
index_writer.add_document(doc!(text_field=>"e"));
|
||||
index_writer.add_document(doc!(text_field=>"f"));
|
||||
@@ -506,8 +515,9 @@ mod tests {
|
||||
assert_eq!(index.searcher().num_docs(), 302);
|
||||
|
||||
{
|
||||
index_writer.wait_merging_threads()
|
||||
.expect( "waiting for merging threads");
|
||||
index_writer
|
||||
.wait_merging_threads()
|
||||
.expect("waiting for merging threads");
|
||||
}
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use Result;
|
||||
use DocId;
|
||||
use std::io;
|
||||
use schema::Schema;
|
||||
use schema::Schema;
|
||||
use schema::Term;
|
||||
use core::Segment;
|
||||
use core::SerializableSegment;
|
||||
@@ -22,220 +22,219 @@ use super::operation::AddOperation;
|
||||
|
||||
/// A `SegmentWriter` is in charge of creating a segment index from
/// documents.
///
///
/// It creates the postings lists in anonymous memory.
/// The segment is laid on disk when the segment gets `finalized`.
|
||||
pub struct SegmentWriter<'a> {
|
||||
heap: &'a Heap,
|
||||
heap: &'a Heap,
|
||||
max_doc: DocId,
|
||||
per_field_postings_writers: Vec<Box<PostingsWriter + 'a>>,
|
||||
segment_serializer: SegmentSerializer,
|
||||
fast_field_writers: FastFieldsWriter,
|
||||
fieldnorms_writer: FastFieldsWriter,
|
||||
doc_opstamps: Vec<u64>,
|
||||
per_field_postings_writers: Vec<Box<PostingsWriter + 'a>>,
|
||||
segment_serializer: SegmentSerializer,
|
||||
fast_field_writers: FastFieldsWriter,
|
||||
fieldnorms_writer: FastFieldsWriter,
|
||||
doc_opstamps: Vec<u64>,
|
||||
}
fn create_fieldnorms_writer(schema: &Schema) -> FastFieldsWriter {
|
||||
let u64_fields: Vec<Field> = schema.fields()
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|&(_, field_entry)| field_entry.is_indexed())
|
||||
.map(|(field_id, _)| Field(field_id as u32))
|
||||
.collect();
|
||||
FastFieldsWriter::new(u64_fields)
|
||||
let u64_fields: Vec<Field> = schema
|
||||
.fields()
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|&(_, field_entry)| field_entry.is_indexed())
|
||||
.map(|(field_id, _)| Field(field_id as u32))
|
||||
.collect();
|
||||
FastFieldsWriter::new(u64_fields)
|
||||
}
fn posting_from_field_entry<'a>(field_entry: &FieldEntry, heap: &'a Heap) -> Box<PostingsWriter + 'a> {
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => {
|
||||
match text_options.get_indexing_options() {
|
||||
TextIndexingOptions::TokenizedWithFreq => {
|
||||
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed(heap)
|
||||
}
|
||||
TextIndexingOptions::TokenizedWithFreqAndPosition => {
|
||||
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed(heap)
|
||||
}
|
||||
_ => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
}
|
||||
}
|
||||
}
|
||||
FieldType::U64(_) => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
}
|
||||
FieldType::I64(_) => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
}
|
||||
}
|
||||
fn posting_from_field_entry<'a>(field_entry: &FieldEntry,
|
||||
heap: &'a Heap)
|
||||
-> Box<PostingsWriter + 'a> {
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => {
|
||||
match text_options.get_indexing_options() {
|
||||
TextIndexingOptions::TokenizedWithFreq => {
|
||||
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed(heap)
|
||||
}
|
||||
TextIndexingOptions::TokenizedWithFreqAndPosition => {
|
||||
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed(heap)
|
||||
}
|
||||
_ => SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap),
|
||||
}
|
||||
}
|
||||
FieldType::U64(_) => SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap),
|
||||
FieldType::I64(_) => SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap),
|
||||
}
|
||||
}
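`posting_from_field_entry` is a dispatch table: each field type (and, for text, each indexing option) picks a concrete recorder behind a common boxed trait object. The shape of that pattern, with stand-in types:

// Stand-in illustration of the dispatch above: pick a boxed implementation
// of a common trait based on an enum describing the field.
trait Writer {
    fn name(&self) -> &'static str;
}

struct FreqRecorder;
struct NothingRecorder;

impl Writer for FreqRecorder {
    fn name(&self) -> &'static str { "freq" }
}
impl Writer for NothingRecorder {
    fn name(&self) -> &'static str { "nothing" }
}

enum FieldKind {
    TokenizedText,
    RawText,
    U64,
}

fn writer_for(kind: &FieldKind) -> Box<Writer> {
    match *kind {
        FieldKind::TokenizedText => Box::new(FreqRecorder),
        FieldKind::RawText | FieldKind::U64 => Box::new(NothingRecorder),
    }
}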
impl<'a> SegmentWriter<'a> {
|
||||
|
||||
/// Creates a new `SegmentWriter`
|
||||
///
|
||||
/// The arguments are defined as follows
|
||||
///
|
||||
/// - heap: most of the segment writer data (terms, and postings lists recorders)
|
||||
/// is stored in a user-defined heap object. This makes it possible for the user to define
|
||||
/// the flushing behavior as a buffer limit
|
||||
/// - segment: The segment being written
|
||||
/// - schema
|
||||
pub fn for_segment(heap: &'a Heap,
|
||||
mut segment: Segment,
|
||||
schema: &Schema) -> Result<SegmentWriter<'a>> {
|
||||
let segment_serializer = try!(SegmentSerializer::for_segment(&mut segment));
|
||||
let mut per_field_postings_writers: Vec<Box<PostingsWriter + 'a>> = Vec::new();
|
||||
for field_entry in schema.fields() {
|
||||
let postings_writer: Box<PostingsWriter + 'a> = posting_from_field_entry(field_entry, heap);
|
||||
per_field_postings_writers.push(postings_writer);
|
||||
}
|
||||
Ok(SegmentWriter {
|
||||
heap: heap,
|
||||
max_doc: 0,
|
||||
per_field_postings_writers: per_field_postings_writers,
|
||||
fieldnorms_writer: create_fieldnorms_writer(schema),
|
||||
segment_serializer: segment_serializer,
|
||||
fast_field_writers: FastFieldsWriter::from_schema(schema),
|
||||
doc_opstamps: Vec::with_capacity(1_000),
|
||||
})
|
||||
}
|
||||
|
||||
/// Lay on disk the current content of the `SegmentWriter`
|
||||
///
|
||||
/// Finalize consumes the `SegmentWriter`, so that it cannot
|
||||
/// be used afterwards.
|
||||
pub fn finalize(mut self) -> Result<Vec<u64>> {
|
||||
for per_field_postings_writer in &mut self.per_field_postings_writers {
|
||||
per_field_postings_writer.close(self.heap);
|
||||
}
|
||||
write(&self.per_field_postings_writers,
|
||||
&self.fast_field_writers,
|
||||
&self.fieldnorms_writer,
|
||||
self.segment_serializer,
|
||||
self.heap)?;
|
||||
Ok(self.doc_opstamps)
|
||||
}
|
||||
|
||||
/// Returns true iff the segment writer's buffer has reached capacity.
|
||||
///
|
||||
/// The limit is defined as `the user defined heap size - an arbitrary margin of 10MB`
|
||||
/// The `Segment` is `finalize`d when the buffer gets full.
|
||||
///
|
||||
    /// Because we cannot cut through a document, the margin is there to ensure that we rarely
    /// exceed the heap size.
|
||||
pub fn is_buffer_full(&self,) -> bool {
|
||||
self.heap.num_free_bytes() <= MARGIN_IN_BYTES
|
||||
}
|
||||
|
||||
/// Indexes a new document
|
||||
///
|
||||
/// As a user, you should rather use `IndexWriter`'s add_document.
|
||||
pub fn add_document(&mut self, add_operation: &AddOperation, schema: &Schema) -> io::Result<()> {
|
||||
let doc_id = self.max_doc;
|
||||
let doc = &add_operation.document;
|
||||
self.doc_opstamps.push(add_operation.opstamp);
|
||||
for (field, field_values) in doc.get_sorted_field_values() {
|
||||
let field_posting_writer: &mut Box<PostingsWriter> = &mut self.per_field_postings_writers[field.0 as usize];
|
||||
let field_options = schema.get_field_entry(field);
|
||||
match *field_options.field_type() {
|
||||
FieldType::Str(ref text_options) => {
|
||||
let num_tokens: u32 =
|
||||
if text_options.get_indexing_options().is_tokenized() {
|
||||
field_posting_writer.index_text(doc_id, field, &field_values, self.heap)
|
||||
}
|
||||
else {
|
||||
let num_field_values = field_values.len() as u32;
|
||||
for field_value in field_values {
|
||||
let term = Term::from_field_text(field, field_value.value().text());
|
||||
field_posting_writer.suscribe(doc_id, 0, &term, self.heap);
|
||||
}
|
||||
num_field_values
|
||||
};
|
||||
self.fieldnorms_writer
|
||||
.get_field_writer(field)
|
||||
.map(|field_norms_writer| {
|
||||
field_norms_writer.add_val(num_tokens as u64)
|
||||
});
|
||||
}
|
||||
FieldType::U64(ref int_option) => {
|
||||
if int_option.is_indexed() {
|
||||
for field_value in field_values {
|
||||
let term = Term::from_field_u64(field_value.field(), field_value.value().u64_value());
|
||||
field_posting_writer.suscribe(doc_id, 0, &term, self.heap);
|
||||
}
|
||||
}
|
||||
}
|
||||
FieldType::I64(ref int_option) => {
|
||||
if int_option.is_indexed() {
|
||||
for field_value in field_values {
|
||||
let term = Term::from_field_i64(field_value.field(), field_value.value().i64_value());
|
||||
field_posting_writer.suscribe(doc_id, 0, &term, self.heap);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
self.fieldnorms_writer.fill_val_up_to(doc_id);
|
||||
self.fast_field_writers.add_document(&doc);
|
||||
let stored_fieldvalues: Vec<&FieldValue> = doc
|
||||
.field_values()
|
||||
.iter()
|
||||
.filter(|field_value| schema.get_field_entry(field_value.field()).is_stored())
|
||||
.collect();
|
||||
let doc_writer = self.segment_serializer.get_store_writer();
|
||||
try!(doc_writer.store(&stored_fieldvalues));
|
||||
self.max_doc += 1;
|
||||
Ok(())
|
||||
/// Creates a new `SegmentWriter`
|
||||
///
|
||||
/// The arguments are defined as follows
|
||||
///
|
||||
/// - heap: most of the segment writer data (terms, and postings lists recorders)
|
||||
/// is stored in a user-defined heap object. This makes it possible for the user to define
|
||||
/// the flushing behavior as a buffer limit
|
||||
/// - segment: The segment being written
|
||||
/// - schema
|
||||
pub fn for_segment(heap: &'a Heap,
|
||||
mut segment: Segment,
|
||||
schema: &Schema)
|
||||
-> Result<SegmentWriter<'a>> {
|
||||
let segment_serializer = try!(SegmentSerializer::for_segment(&mut segment));
|
||||
let mut per_field_postings_writers: Vec<Box<PostingsWriter + 'a>> = Vec::new();
|
||||
for field_entry in schema.fields() {
|
||||
let postings_writer: Box<PostingsWriter + 'a> = posting_from_field_entry(field_entry,
|
||||
heap);
|
||||
per_field_postings_writers.push(postings_writer);
|
||||
}
|
||||
Ok(SegmentWriter {
|
||||
heap: heap,
|
||||
max_doc: 0,
|
||||
per_field_postings_writers: per_field_postings_writers,
|
||||
fieldnorms_writer: create_fieldnorms_writer(schema),
|
||||
segment_serializer: segment_serializer,
|
||||
fast_field_writers: FastFieldsWriter::from_schema(schema),
|
||||
doc_opstamps: Vec::with_capacity(1_000),
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
/// Max doc is
|
||||
/// - the number of documents in the segment assuming there are no deletes
|
||||
/// - the maximum document id (including deleted documents) + 1
|
||||
///
|
||||
/// Currently, **tantivy** does not handle deletes anyway,
|
||||
/// so `max_doc == num_docs`
|
||||
pub fn max_doc(&self,) -> u32 {
|
||||
self.max_doc
|
||||
}
|
||||
|
||||
/// Number of documents in the index.
|
||||
/// Deleted documents are not counted.
|
||||
///
|
||||
/// Currently, **tantivy** does not handle deletes anyway,
|
||||
/// so `max_doc == num_docs`
|
||||
#[allow(dead_code)]
|
||||
pub fn num_docs(&self,) -> u32 {
|
||||
self.max_doc
|
||||
}
|
||||
|
||||
/// Lay on disk the current content of the `SegmentWriter`
|
||||
///
|
||||
/// Finalize consumes the `SegmentWriter`, so that it cannot
|
||||
/// be used afterwards.
|
||||
pub fn finalize(mut self) -> Result<Vec<u64>> {
|
||||
for per_field_postings_writer in &mut self.per_field_postings_writers {
|
||||
per_field_postings_writer.close(self.heap);
|
||||
}
|
||||
write(&self.per_field_postings_writers,
|
||||
&self.fast_field_writers,
|
||||
&self.fieldnorms_writer,
|
||||
self.segment_serializer,
|
||||
self.heap)?;
|
||||
Ok(self.doc_opstamps)
|
||||
}
|
||||
|
||||
/// Returns true iff the segment writer's buffer has reached capacity.
|
||||
///
|
||||
/// The limit is defined as `the user defined heap size - an arbitrary margin of 10MB`
|
||||
/// The `Segment` is `finalize`d when the buffer gets full.
|
||||
///
|
||||
/// Because we cannot cut through a document, the margin is there to ensure that we rarely
/// exceed the heap size.
|
||||
pub fn is_buffer_full(&self) -> bool {
|
||||
self.heap.num_free_bytes() <= MARGIN_IN_BYTES
|
||||
}
|
||||
|
||||
/// Indexes a new document
|
||||
///
|
||||
/// As a user, you should use `IndexWriter`'s add_document instead.
|
||||
pub fn add_document(&mut self,
|
||||
add_operation: &AddOperation,
|
||||
schema: &Schema)
|
||||
-> io::Result<()> {
|
||||
let doc_id = self.max_doc;
|
||||
let doc = &add_operation.document;
|
||||
self.doc_opstamps.push(add_operation.opstamp);
|
||||
for (field, field_values) in doc.get_sorted_field_values() {
|
||||
let field_posting_writer: &mut Box<PostingsWriter> =
|
||||
&mut self.per_field_postings_writers[field.0 as usize];
|
||||
let field_options = schema.get_field_entry(field);
|
||||
match *field_options.field_type() {
|
||||
FieldType::Str(ref text_options) => {
|
||||
let num_tokens: u32 = if text_options.get_indexing_options().is_tokenized() {
|
||||
field_posting_writer.index_text(doc_id, field, &field_values, self.heap)
|
||||
} else {
|
||||
let num_field_values = field_values.len() as u32;
|
||||
for field_value in field_values {
|
||||
let term = Term::from_field_text(field, field_value.value().text());
|
||||
field_posting_writer.suscribe(doc_id, 0, &term, self.heap);
|
||||
}
|
||||
num_field_values
|
||||
};
|
||||
self.fieldnorms_writer
|
||||
.get_field_writer(field)
|
||||
.map(|field_norms_writer| field_norms_writer.add_val(num_tokens as u64));
|
||||
}
|
||||
FieldType::U64(ref int_option) => {
|
||||
if int_option.is_indexed() {
|
||||
for field_value in field_values {
|
||||
let term = Term::from_field_u64(field_value.field(),
|
||||
field_value.value().u64_value());
|
||||
field_posting_writer.suscribe(doc_id, 0, &term, self.heap);
|
||||
}
|
||||
}
|
||||
}
|
||||
FieldType::I64(ref int_option) => {
|
||||
if int_option.is_indexed() {
|
||||
for field_value in field_values {
|
||||
let term = Term::from_field_i64(field_value.field(),
|
||||
field_value.value().i64_value());
|
||||
field_posting_writer.suscribe(doc_id, 0, &term, self.heap);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
self.fieldnorms_writer.fill_val_up_to(doc_id);
|
||||
self.fast_field_writers.add_document(&doc);
|
||||
let stored_fieldvalues: Vec<&FieldValue> = doc.field_values()
|
||||
.iter()
|
||||
.filter(|field_value| schema.get_field_entry(field_value.field()).is_stored())
|
||||
.collect();
|
||||
let doc_writer = self.segment_serializer.get_store_writer();
|
||||
try!(doc_writer.store(&stored_fieldvalues));
|
||||
self.max_doc += 1;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Max doc is
|
||||
/// - the number of documents in the segment assuming there are no deletes
|
||||
/// - the maximum document id (including deleted documents) + 1
|
||||
///
|
||||
/// Currently, **tantivy** does not handle deletes anyway,
|
||||
/// so `max_doc == num_docs`
|
||||
pub fn max_doc(&self) -> u32 {
|
||||
self.max_doc
|
||||
}
|
||||
|
||||
/// Number of documents in the index.
|
||||
/// Deleted documents are not counted.
|
||||
///
|
||||
/// Currently, **tantivy** does not handle deletes anyway,
|
||||
/// so `max_doc == num_docs`
|
||||
#[allow(dead_code)]
|
||||
pub fn num_docs(&self) -> u32 {
|
||||
self.max_doc
|
||||
}
|
||||
}
|
||||
|
||||
// This method is used as a trick to workaround the borrow checker
|
||||
fn write<'a>(per_field_postings_writers: &[Box<PostingsWriter + 'a>],
|
||||
fast_field_writers: &FastFieldsWriter,
|
||||
fieldnorms_writer: &FastFieldsWriter,
|
||||
mut serializer: SegmentSerializer,
|
||||
heap: &'a Heap,) -> Result<()> {
|
||||
for per_field_postings_writer in per_field_postings_writers {
|
||||
try!(per_field_postings_writer.serialize(serializer.get_postings_serializer(), heap));
|
||||
}
|
||||
try!(fast_field_writers.serialize(serializer.get_fast_field_serializer()));
|
||||
try!(fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer()));
|
||||
try!(serializer.close());
|
||||
Ok(())
|
||||
fast_field_writers: &FastFieldsWriter,
|
||||
fieldnorms_writer: &FastFieldsWriter,
|
||||
mut serializer: SegmentSerializer,
|
||||
heap: &'a Heap)
|
||||
-> Result<()> {
|
||||
for per_field_postings_writer in per_field_postings_writers {
|
||||
try!(per_field_postings_writer.serialize(serializer.get_postings_serializer(), heap));
|
||||
}
|
||||
try!(fast_field_writers.serialize(serializer.get_fast_field_serializer()));
|
||||
try!(fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer()));
|
||||
try!(serializer.close());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl<'a> SerializableSegment for SegmentWriter<'a> {
|
||||
fn write(&self, serializer: SegmentSerializer) -> Result<u32> {
|
||||
let max_doc = self.max_doc;
|
||||
write(&self.per_field_postings_writers,
|
||||
&self.fast_field_writers,
|
||||
&self.fieldnorms_writer,
|
||||
serializer,
|
||||
self.heap)?;
|
||||
Ok(max_doc)
|
||||
}
|
||||
fn write(&self, serializer: SegmentSerializer) -> Result<u32> {
|
||||
let max_doc = self.max_doc;
|
||||
write(&self.per_field_postings_writers,
|
||||
&self.fast_field_writers,
|
||||
&self.fieldnorms_writer,
|
||||
serializer,
|
||||
self.heap)?;
|
||||
Ok(max_doc)
|
||||
}
|
||||
}
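As a rough standalone illustration of the `is_buffer_full` margin check described in the segment writer diff above (the types and numbers below are invented for the sketch; only the 10MB margin comes from the doc comment), the writer reports "full" while some free space is still left, so the document currently being indexed can be finished before the segment is finalized:

const MARGIN_IN_BYTES: usize = 10_000_000; // ~10MB margin, per the doc comment above

struct Heap {
    capacity: usize,
    used: usize,
}

impl Heap {
    fn num_free_bytes(&self) -> usize {
        self.capacity - self.used
    }
}

fn is_buffer_full(heap: &Heap) -> bool {
    // finalize early, so the document currently being indexed still fits
    heap.num_free_bytes() <= MARGIN_IN_BYTES
}

fn main() {
    let heap = Heap { capacity: 40_000_000, used: 31_000_000 };
    assert!(is_buffer_full(&heap)); // only ~9MB left: time to finalize the segment
}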
|
||||
|
||||
@@ -6,12 +6,11 @@ use std::sync::Arc;
pub struct Stamper(Arc<AtomicU64>);

impl Stamper {

pub fn new(first_opstamp: u64) -> Stamper {
Stamper(Arc::new(AtomicU64::new(first_opstamp)))
}

pub fn stamp(&self,) -> u64 {
pub fn stamp(&self) -> u64 {
self.0.fetch_add(1u64, Ordering::SeqCst)
}
}
}
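A usage sketch for the `Stamper` in the hunk above (the `Clone` derive and the threading below are illustrative assumptions, not part of the patch): because the counter lives behind an `Arc`, the stamper can be cloned into every indexing thread, and `fetch_add` hands out distinct, increasing opstamps:

use std::sync::Arc;
use std::sync::atomic::{AtomicU64, Ordering};
use std::thread;

#[derive(Clone)] // cloning only clones the Arc: all clones share the same counter
pub struct Stamper(Arc<AtomicU64>);

impl Stamper {
    pub fn new(first_opstamp: u64) -> Stamper {
        Stamper(Arc::new(AtomicU64::new(first_opstamp)))
    }
    pub fn stamp(&self) -> u64 {
        self.0.fetch_add(1u64, Ordering::SeqCst)
    }
}

fn main() {
    let stamper = Stamper::new(10);
    let handles: Vec<_> = (0..4)
        .map(|_| {
            let stamper = stamper.clone();
            thread::spawn(move || stamper.stamp())
        })
        .collect();
    let mut stamps: Vec<u64> = handles.into_iter().map(|h| h.join().unwrap()).collect();
    stamps.sort();
    assert_eq!(stamps, vec![10, 11, 12, 13]); // each caller got a distinct opstamp
}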
|
||||
|
||||
157
src/lib.rs
@@ -64,8 +64,10 @@ extern crate libc;
|
||||
#[cfg(windows)]
|
||||
extern crate winapi;
|
||||
|
||||
#[cfg(test)] extern crate test;
|
||||
#[cfg(test)] extern crate rand;
|
||||
#[cfg(test)]
|
||||
extern crate test;
|
||||
#[cfg(test)]
|
||||
extern crate rand;
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -137,10 +139,9 @@ pub use core::TermIterator;
|
||||
/// Exposes the current version of tantivy, as well
/// as whether it was compiled with the simd compression.
|
||||
pub fn version() -> &'static str {
|
||||
if cfg!(feature="simdcompression") {
|
||||
if cfg!(feature = "simdcompression") {
|
||||
concat!(version!(), "-simd")
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
concat!(version!(), "-nosimd")
|
||||
}
|
||||
}
|
||||
@@ -167,17 +168,16 @@ pub type Score = f32;
|
||||
pub type SegmentLocalId = u32;
|
||||
|
||||
impl DocAddress {
|
||||
|
||||
/// Return the segment ordinal.
|
||||
/// The segment ordinal is an id identifying the segment
|
||||
/// hosting the document. It is only meaningful in the context
|
||||
/// of a searcher.
|
||||
pub fn segment_ord(&self,) -> SegmentLocalId {
|
||||
pub fn segment_ord(&self) -> SegmentLocalId {
|
||||
self.0
|
||||
}
|
||||
|
||||
/// Return the segment local `DocId`
|
||||
pub fn doc(&self,) -> DocId {
|
||||
pub fn doc(&self) -> DocId {
|
||||
self.1
|
||||
}
|
||||
}
|
||||
@@ -214,7 +214,7 @@ mod tests {
|
||||
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
|
||||
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
|
||||
(0..u32::max_value())
|
||||
.filter(|_| rng.next_f32()< ratio)
|
||||
.filter(|_| rng.next_f32() < ratio)
|
||||
.take(n)
|
||||
.collect()
|
||||
}
|
||||
@@ -226,9 +226,7 @@ mod tests {
|
||||
fn sample_with_seed(n: u32, ratio: f32, seed_val: u32) -> Vec<u32> {
|
||||
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
|
||||
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
|
||||
(0..n)
|
||||
.filter(|_| rng.next_f32() < ratio)
|
||||
.collect()
|
||||
(0..n).filter(|_| rng.next_f32() < ratio).collect()
|
||||
}
|
||||
|
||||
pub fn sample(n: u32, ratio: f32) -> Vec<u32> {
|
||||
@@ -344,19 +342,23 @@ mod tests {
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
{ // 0
|
||||
{
|
||||
// 0
|
||||
let doc = doc!(text_field=>"a b");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{ // 1
|
||||
{
|
||||
// 1
|
||||
let doc = doc!(text_field=>" a c");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{ // 2
|
||||
{
|
||||
// 2
|
||||
let doc = doc!(text_field=>" b c");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{ // 3
|
||||
{
|
||||
// 3
|
||||
let doc = doc!(text_field=>" b d");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
@@ -366,11 +368,13 @@ mod tests {
|
||||
{
|
||||
index_writer.delete_term(Term::from_field_text(text_field, "a"));
|
||||
}
|
||||
{ // 4
|
||||
{
|
||||
// 4
|
||||
let doc = doc!(text_field=>" b c");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{ // 5
|
||||
{
|
||||
// 5
|
||||
let doc = doc!(text_field=>" a");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
@@ -380,15 +384,21 @@ mod tests {
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
assert!(reader.read_postings_all_info(&Term::from_field_text(text_field, "abcd")).is_none());
|
||||
assert!(reader
|
||||
.read_postings_all_info(&Term::from_field_text(text_field, "abcd"))
|
||||
.is_none());
|
||||
{
|
||||
let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "a")).unwrap();
|
||||
let mut postings = reader
|
||||
.read_postings_all_info(&Term::from_field_text(text_field, "a"))
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 5);
|
||||
assert!(!postings.advance());
|
||||
}
|
||||
{
|
||||
let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "b")).unwrap();
|
||||
let mut postings = reader
|
||||
.read_postings_all_info(&Term::from_field_text(text_field, "b"))
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 3);
|
||||
assert!(postings.advance());
|
||||
@@ -399,11 +409,13 @@ mod tests {
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
{ // 0
|
||||
{
|
||||
// 0
|
||||
let doc = doc!(text_field=>"a b");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{ // 1
|
||||
{
|
||||
// 1
|
||||
index_writer.delete_term(Term::from_field_text(text_field, "c"));
|
||||
}
|
||||
index_writer.rollback().unwrap();
|
||||
@@ -412,15 +424,21 @@ mod tests {
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
assert!(reader.read_postings_all_info(&Term::from_field_text(text_field, "abcd")).is_none());
|
||||
assert!(reader
|
||||
.read_postings_all_info(&Term::from_field_text(text_field, "abcd"))
|
||||
.is_none());
|
||||
{
|
||||
let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "a")).unwrap();
|
||||
let mut postings = reader
|
||||
.read_postings_all_info(&Term::from_field_text(text_field, "a"))
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 5);
|
||||
assert!(!postings.advance());
|
||||
}
|
||||
{
|
||||
let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "b")).unwrap();
|
||||
let mut postings = reader
|
||||
.read_postings_all_info(&Term::from_field_text(text_field, "b"))
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 3);
|
||||
assert!(postings.advance());
|
||||
@@ -446,13 +464,19 @@ mod tests {
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
assert!(reader.read_postings_all_info(&Term::from_field_text(text_field, "abcd")).is_none());
|
||||
assert!(reader
|
||||
.read_postings_all_info(&Term::from_field_text(text_field, "abcd"))
|
||||
.is_none());
|
||||
{
|
||||
let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "a")).unwrap();
|
||||
let mut postings = reader
|
||||
.read_postings_all_info(&Term::from_field_text(text_field, "a"))
|
||||
.unwrap();
|
||||
assert!(!postings.advance());
|
||||
}
|
||||
{
|
||||
let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "b")).unwrap();
|
||||
let mut postings = reader
|
||||
.read_postings_all_info(&Term::from_field_text(text_field, "b"))
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 3);
|
||||
assert!(postings.advance());
|
||||
@@ -460,7 +484,9 @@ mod tests {
|
||||
assert!(!postings.advance());
|
||||
}
|
||||
{
|
||||
let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "c")).unwrap();
|
||||
let mut postings = reader
|
||||
.read_postings_all_info(&Term::from_field_text(text_field, "c"))
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 4);
|
||||
assert!(!postings.advance());
|
||||
@@ -477,14 +503,15 @@ mod tests {
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
index_writer.add_document(
|
||||
doc!(field=>1u64)
|
||||
);
|
||||
index_writer.add_document(doc!(field=>1u64));
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let term = Term::from_field_u64(field, 1u64);
|
||||
let mut postings = searcher.segment_reader(0).read_postings(&term, SegmentPostingsOption::NoFreq).unwrap();
|
||||
let mut postings = searcher
|
||||
.segment_reader(0)
|
||||
.read_postings(&term, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 0);
|
||||
assert!(!postings.advance());
|
||||
@@ -499,16 +526,15 @@ mod tests {
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
let negative_val = -1i64;
|
||||
index_writer.add_document(
|
||||
doc!(value_field => negative_val)
|
||||
);
|
||||
index_writer.add_document(doc!(value_field => negative_val));
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let term = Term::from_field_i64(value_field, negative_val);
|
||||
let mut postings = searcher
|
||||
.segment_reader(0)
|
||||
.read_postings(&term, SegmentPostingsOption::NoFreq).unwrap();
|
||||
.read_postings(&term, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 0);
|
||||
assert!(!postings.advance());
|
||||
@@ -569,8 +595,12 @@ mod tests {
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
assert!(reader.read_postings_all_info(&Term::from_field_text(text_field, "abcd")).is_none());
|
||||
let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "af")).unwrap();
|
||||
assert!(reader
|
||||
.read_postings_all_info(&Term::from_field_text(text_field, "abcd"))
|
||||
.is_none());
|
||||
let mut postings = reader
|
||||
.read_postings_all_info(&Term::from_field_text(text_field, "af"))
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 0);
|
||||
assert_eq!(postings.term_freq(), 3);
|
||||
@@ -612,35 +642,29 @@ mod tests {
|
||||
collector.docs()
|
||||
};
|
||||
{
|
||||
assert_eq!(
|
||||
get_doc_ids(vec!(Term::from_field_text(text_field, "a"))),
|
||||
vec!(1, 2));
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
|
||||
vec![1, 2]);
|
||||
}
|
||||
{
|
||||
assert_eq!(
|
||||
get_doc_ids(vec!(Term::from_field_text(text_field, "af"))),
|
||||
vec!(0));
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
|
||||
vec![0]);
|
||||
}
|
||||
{
|
||||
assert_eq!(
|
||||
get_doc_ids(vec!(Term::from_field_text(text_field, "b"))),
|
||||
vec!(0, 1, 2));
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
|
||||
vec![0, 1, 2]);
|
||||
}
|
||||
{
|
||||
assert_eq!(
|
||||
get_doc_ids(vec!(Term::from_field_text(text_field, "c"))),
|
||||
vec!(1, 2));
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "c")]),
|
||||
vec![1, 2]);
|
||||
}
|
||||
{
|
||||
assert_eq!(
|
||||
get_doc_ids(vec!(Term::from_field_text(text_field, "d"))),
|
||||
vec!(2));
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "d")]),
|
||||
vec![2]);
|
||||
}
|
||||
{
|
||||
assert_eq!(
|
||||
get_doc_ids(vec!(Term::from_field_text(text_field, "b"),
|
||||
Term::from_field_text(text_field, "a"), )),
|
||||
vec!(0, 1, 2));
|
||||
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b"),
|
||||
Term::from_field_text(text_field, "a")]),
|
||||
vec![0, 1, 2]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -709,26 +733,31 @@ mod tests {
|
||||
let searcher = index.searcher();
|
||||
let segment_reader: &SegmentReader = searcher.segment_reader(0);
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.get_fast_field_reader::<U64FastFieldReader>(text_field);
|
||||
let fast_field_reader_res =
|
||||
segment_reader.get_fast_field_reader::<U64FastFieldReader>(text_field);
|
||||
assert!(fast_field_reader_res.is_err());
|
||||
}
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.get_fast_field_reader::<U64FastFieldReader>(stored_int_field);
|
||||
let fast_field_reader_res =
|
||||
segment_reader.get_fast_field_reader::<U64FastFieldReader>(stored_int_field);
|
||||
assert!(fast_field_reader_res.is_err());
|
||||
}
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.get_fast_field_reader::<U64FastFieldReader>(fast_field_signed);
|
||||
let fast_field_reader_res =
|
||||
segment_reader.get_fast_field_reader::<U64FastFieldReader>(fast_field_signed);
|
||||
assert!(fast_field_reader_res.is_err());
|
||||
}
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.get_fast_field_reader::<I64FastFieldReader>(fast_field_signed);
|
||||
let fast_field_reader_res =
|
||||
segment_reader.get_fast_field_reader::<I64FastFieldReader>(fast_field_signed);
|
||||
assert!(fast_field_reader_res.is_ok());
|
||||
let fast_field_reader = fast_field_reader_res.unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 4i64)
|
||||
}
|
||||
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.get_fast_field_reader::<I64FastFieldReader>(fast_field_signed);
|
||||
let fast_field_reader_res =
|
||||
segment_reader.get_fast_field_reader::<I64FastFieldReader>(fast_field_signed);
|
||||
assert!(fast_field_reader_res.is_ok());
|
||||
let fast_field_reader = fast_field_reader_res.unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 4i64)
|
||||
|
||||
@@ -123,4 +123,4 @@ impl FreqHandler {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -63,7 +63,7 @@ mod tests {
|
||||
let term = Term::from_field_text(text_field, "abc");
|
||||
posting_serializer.new_term(&term).unwrap();
|
||||
for doc_id in 0u32..3u32 {
|
||||
let positions = vec!(1,2,3,2);
|
||||
let positions = vec![1, 2, 3, 2];
|
||||
posting_serializer.write_doc(doc_id, 2, &positions).unwrap();
|
||||
}
|
||||
posting_serializer.close_term().unwrap();
|
||||
@@ -81,7 +81,8 @@ mod tests {
|
||||
let segment = index.new_segment();
|
||||
let heap = Heap::with_capacity(10_000_000);
|
||||
{
|
||||
let mut segment_writer = SegmentWriter::for_segment(&heap, segment.clone(), &schema).unwrap();
|
||||
let mut segment_writer = SegmentWriter::for_segment(&heap, segment.clone(), &schema)
|
||||
.unwrap();
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "a b a c a d a a.");
|
||||
@@ -120,7 +121,7 @@ mod tests {
|
||||
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field).unwrap();
|
||||
assert_eq!(fieldnorm_reader.get(0), 8 + 5);
|
||||
assert_eq!(fieldnorm_reader.get(1), 2);
|
||||
for i in 2 .. 1000 {
|
||||
for i in 2..1000 {
|
||||
assert_eq!(fieldnorm_reader.get(i), (i + 1) as u64);
|
||||
}
|
||||
}
|
||||
@@ -139,7 +140,7 @@ mod tests {
|
||||
assert!(postings_a.advance());
|
||||
assert_eq!(postings_a.doc(), 1u32);
|
||||
assert_eq!(postings_a.term_freq(), 1);
|
||||
for i in 2u32 .. 1000u32 {
|
||||
for i in 2u32..1000u32 {
|
||||
assert!(postings_a.advance());
|
||||
assert_eq!(postings_a.term_freq(), 1);
|
||||
assert_eq!(postings_a.positions(), [i]);
|
||||
@@ -151,7 +152,7 @@ mod tests {
|
||||
let term_e = Term::from_field_text(text_field, "e");
|
||||
let mut postings_e = segment_reader.read_postings_all_info(&term_e).unwrap();
|
||||
assert_eq!(postings_e.len(), 1000 - 2);
|
||||
for i in 2u32 .. 1000u32 {
|
||||
for i in 2u32..1000u32 {
|
||||
assert!(postings_e.advance());
|
||||
assert_eq!(postings_e.term_freq(), i);
|
||||
let positions = postings_e.positions();
|
||||
@@ -187,7 +188,8 @@ mod tests {
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let term_query = TermQuery::new(Term::from_field_text(text_field, "a"), SegmentPostingsOption::NoFreq);
|
||||
let term_query = TermQuery::new(Term::from_field_text(text_field, "a"),
|
||||
SegmentPostingsOption::NoFreq);
|
||||
let searcher = index.searcher();
|
||||
let mut term_weight = term_query.specialized_weight(&*searcher);
|
||||
term_weight.segment_postings_options = SegmentPostingsOption::FreqAndPositions;
|
||||
@@ -201,9 +203,9 @@ mod tests {
|
||||
#[test]
|
||||
fn test_intersection() {
|
||||
{
|
||||
let left = VecPostings::from(vec!(1, 3, 9));
|
||||
let right = VecPostings::from(vec!(3, 4, 9, 18));
|
||||
let mut intersection = IntersectionDocSet::from(vec!(left, right));
|
||||
let left = VecPostings::from(vec![1, 3, 9]);
|
||||
let right = VecPostings::from(vec![3, 4, 9, 18]);
|
||||
let mut intersection = IntersectionDocSet::from(vec![left, right]);
|
||||
assert!(intersection.advance());
|
||||
assert_eq!(intersection.doc(), 3);
|
||||
assert!(intersection.advance());
|
||||
@@ -211,10 +213,10 @@ mod tests {
|
||||
assert!(!intersection.advance());
|
||||
}
|
||||
{
|
||||
let a = VecPostings::from(vec!(1, 3, 9));
|
||||
let b = VecPostings::from(vec!(3, 4, 9, 18));
|
||||
let c = VecPostings::from(vec!(1, 5, 9, 111));
|
||||
let mut intersection = IntersectionDocSet::from(vec!(a, b, c));
|
||||
let a = VecPostings::from(vec![1, 3, 9]);
|
||||
let b = VecPostings::from(vec![3, 4, 9, 18]);
|
||||
let c = VecPostings::from(vec![1, 5, 9, 111]);
|
||||
let mut intersection = IntersectionDocSet::from(vec![a, b, c]);
|
||||
assert!(intersection.advance());
|
||||
assert_eq!(intersection.doc(), 9);
|
||||
assert!(!intersection.advance());
|
||||
@@ -274,9 +276,11 @@ mod tests {
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
|
||||
b.iter(|| {
|
||||
let mut segment_postings = segment_reader.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq).unwrap();
|
||||
while segment_postings.advance() {}
|
||||
});
|
||||
let mut segment_postings = segment_reader
|
||||
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
while segment_postings.advance() {}
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
@@ -284,9 +288,14 @@ mod tests {
|
||||
let searcher = INDEX.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
b.iter(|| {
|
||||
let segment_postings_a = segment_reader.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq).unwrap();
|
||||
let segment_postings_b = segment_reader.read_postings(&*TERM_B, SegmentPostingsOption::NoFreq).unwrap();
|
||||
let mut intersection = IntersectionDocSet::from(vec!(segment_postings_a, segment_postings_b));
|
||||
let segment_postings_a = segment_reader
|
||||
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
let segment_postings_b = segment_reader
|
||||
.read_postings(&*TERM_B, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
let mut intersection = IntersectionDocSet::from(vec![segment_postings_a,
|
||||
segment_postings_b]);
|
||||
while intersection.advance() {}
|
||||
});
|
||||
}
|
||||
@@ -296,7 +305,9 @@ mod tests {
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let docs = tests::sample(segment_reader.num_docs(), p);
|
||||
|
||||
let mut segment_postings = segment_reader.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq).unwrap();
|
||||
let mut segment_postings = segment_reader
|
||||
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
let mut existing_docs = Vec::new();
|
||||
for doc in &docs {
|
||||
if *doc >= segment_postings.doc() {
|
||||
@@ -308,7 +319,9 @@ mod tests {
|
||||
}
|
||||
|
||||
b.iter(|| {
|
||||
let mut segment_postings = segment_reader.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq).unwrap();
|
||||
let mut segment_postings = segment_reader
|
||||
.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq)
|
||||
.unwrap();
|
||||
for doc in &existing_docs {
|
||||
if segment_postings.skip_next(*doc) == SkipResult::End {
|
||||
break;
|
||||
|
||||
@@ -22,7 +22,7 @@ pub trait PostingsWriter {
|
||||
/// * heap - heap used to store the postings information as well as the terms
|
||||
/// in the hashmap.
|
||||
fn suscribe(&mut self, doc: DocId, pos: u32, term: &Term, heap: &Heap);
|
||||
|
||||
|
||||
/// Serializes the postings on disk.
|
||||
/// The actual serialization format is handled by the `PostingsSerializer`.
|
||||
fn serialize(&self, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
|
||||
@@ -115,9 +115,7 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
|
||||
}
|
||||
|
||||
fn serialize(&self, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()> {
|
||||
let mut term_offsets: Vec<(&[u8], (u32, &Rec))> = self.term_index
|
||||
.iter()
|
||||
.collect();
|
||||
let mut term_offsets: Vec<(&[u8], (u32, &Rec))> = self.term_index.iter().collect();
|
||||
term_offsets.sort_by_key(|&(k, _v)| k);
|
||||
let mut term = unsafe { Term::with_capacity(100) };
|
||||
for (term_bytes, (addr, recorder)) in term_offsets {
|
||||
@@ -138,4 +136,4 @@ fn test_hashmap_size() {
|
||||
assert_eq!(hashmap_size_in_bits(0), 10);
|
||||
assert_eq!(hashmap_size_in_bits(100_000), 11);
|
||||
assert_eq!(hashmap_size_in_bits(300_000_000), 23);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,15 +27,20 @@ impl<'a> SegmentPostings<'a> {
|
||||
fn load_next_block(&mut self) {
|
||||
let num_remaining_docs = self.len - self.cur.0;
|
||||
if num_remaining_docs >= NUM_DOCS_PER_BLOCK {
|
||||
self.remaining_data = self.block_decoder
|
||||
.uncompress_block_sorted(self.remaining_data, self.doc_offset);
|
||||
self.remaining_data =
|
||||
self.block_decoder
|
||||
.uncompress_block_sorted(self.remaining_data, self.doc_offset);
|
||||
self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data);
|
||||
self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1);
|
||||
self.block_len = NUM_DOCS_PER_BLOCK;
|
||||
} else {
|
||||
self.remaining_data = self.block_decoder
|
||||
.uncompress_vint_sorted(self.remaining_data, self.doc_offset, num_remaining_docs);
|
||||
self.freq_handler.read_freq_vint(self.remaining_data, num_remaining_docs);
|
||||
self.remaining_data =
|
||||
self.block_decoder
|
||||
.uncompress_vint_sorted(self.remaining_data,
|
||||
self.doc_offset,
|
||||
num_remaining_docs);
|
||||
self.freq_handler
|
||||
.read_freq_vint(self.remaining_data, num_remaining_docs);
|
||||
self.block_len = num_remaining_docs;
|
||||
}
|
||||
}
|
||||
@@ -49,7 +54,8 @@ impl<'a> SegmentPostings<'a> {
|
||||
pub fn from_data(len: u32,
|
||||
data: &'a [u8],
|
||||
delete_bitset: &'a DeleteBitSet,
|
||||
freq_handler: FreqHandler) -> SegmentPostings<'a> {
|
||||
freq_handler: FreqHandler)
|
||||
-> SegmentPostings<'a> {
|
||||
SegmentPostings {
|
||||
len: len as usize,
|
||||
block_len: len as usize,
|
||||
|
||||
@@ -75,22 +75,22 @@ impl PostingsSerializer {
|
||||
-> Result<PostingsSerializer> {
|
||||
let terms_fst_builder = try!(FstMapBuilder::new(terms_write));
|
||||
Ok(PostingsSerializer {
|
||||
terms_fst_builder: terms_fst_builder,
|
||||
postings_write: postings_write,
|
||||
positions_write: positions_write,
|
||||
written_bytes_postings: 0,
|
||||
written_bytes_positions: 0,
|
||||
last_doc_id_encoded: 0u32,
|
||||
positions_encoder: CompositeEncoder::new(),
|
||||
block_encoder: BlockEncoder::new(),
|
||||
doc_ids: Vec::new(),
|
||||
term_freqs: Vec::new(),
|
||||
position_deltas: Vec::new(),
|
||||
schema: schema,
|
||||
text_indexing_options: TextIndexingOptions::Unindexed,
|
||||
term_open: false,
|
||||
current_term_info: TermInfo::default(),
|
||||
})
|
||||
terms_fst_builder: terms_fst_builder,
|
||||
postings_write: postings_write,
|
||||
positions_write: positions_write,
|
||||
written_bytes_postings: 0,
|
||||
written_bytes_positions: 0,
|
||||
last_doc_id_encoded: 0u32,
|
||||
positions_encoder: CompositeEncoder::new(),
|
||||
block_encoder: BlockEncoder::new(),
|
||||
doc_ids: Vec::new(),
|
||||
term_freqs: Vec::new(),
|
||||
position_deltas: Vec::new(),
|
||||
schema: schema,
|
||||
text_indexing_options: TextIndexingOptions::Unindexed,
|
||||
term_open: false,
|
||||
current_term_info: TermInfo::default(),
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -155,7 +155,8 @@ impl PostingsSerializer {
|
||||
pub fn close_term(&mut self) -> io::Result<()> {
|
||||
if self.term_open {
|
||||
|
||||
self.terms_fst_builder.insert_value(&self.current_term_info)?;
|
||||
self.terms_fst_builder
|
||||
.insert_value(&self.current_term_info)?;
|
||||
|
||||
if !self.doc_ids.is_empty() {
|
||||
// we have doc ids waiting to be written
|
||||
@@ -165,8 +166,9 @@ impl PostingsSerializer {
|
||||
// In that case, the remaining part is encoded
|
||||
// using variable int encoding.
|
||||
{
|
||||
let block_encoded = self.block_encoder
|
||||
.compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
let block_encoded =
|
||||
self.block_encoder
|
||||
.compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
self.written_bytes_postings += block_encoded.len();
|
||||
try!(self.postings_write.write_all(block_encoded));
|
||||
self.doc_ids.clear();
|
||||
@@ -186,7 +188,7 @@ impl PostingsSerializer {
|
||||
// end of the term, at which point they are compressed and written.
|
||||
if self.text_indexing_options.is_position_enabled() {
|
||||
self.written_bytes_positions += try!(VInt(self.position_deltas.len() as u64)
|
||||
.serialize(&mut self.positions_write));
|
||||
.serialize(&mut self.positions_write));
|
||||
let positions_encoded: &[u8] = self.positions_encoder
|
||||
.compress_unsorted(&self.position_deltas[..]);
|
||||
try!(self.positions_write.write_all(positions_encoded));
|
||||
@@ -224,8 +226,9 @@ impl PostingsSerializer {
|
||||
if self.doc_ids.len() == NUM_DOCS_PER_BLOCK {
|
||||
{
|
||||
// encode the doc ids
|
||||
let block_encoded: &[u8] = self.block_encoder
|
||||
.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
let block_encoded: &[u8] =
|
||||
self.block_encoder
|
||||
.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
|
||||
self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1];
|
||||
try!(self.postings_write.write_all(block_encoded));
|
||||
self.written_bytes_postings += block_encoded.len();
|
||||
|
||||
@@ -2,21 +2,21 @@ use common::BinarySerializable;
|
||||
use std::io;
|
||||
|
||||
|
||||
/// `TermInfo` contains all of the information
|
||||
/// `TermInfo` contains all of the information
|
||||
/// associated to terms in the `.term` file.
|
||||
///
|
||||
///
|
||||
/// It consists of
|
||||
/// * `doc_freq` : the number of documents in the segment
|
||||
/// containing this term. It is also the length of the
|
||||
/// posting list associated to this term
|
||||
/// * `postings_offset` : an offset in the `.idx` file
|
||||
/// * `postings_offset` : an offset in the `.idx` file
|
||||
/// addressing the start of the posting list associated
|
||||
/// to this term.
|
||||
#[derive(Debug,Default,Ord,PartialOrd,Eq,PartialEq,Clone)]
|
||||
pub struct TermInfo {
|
||||
/// Number of documents in the segment containing the term
|
||||
pub doc_freq: u32,
|
||||
/// Offset within the postings (`.idx`) file.
|
||||
/// Offset within the postings (`.idx`) file.
|
||||
pub postings_offset: u32,
|
||||
/// Offset within the position (`.pos`) file.
|
||||
pub positions_offset: u32,
|
||||
@@ -25,20 +25,17 @@ pub struct TermInfo {
|
||||
|
||||
impl BinarySerializable for TermInfo {
|
||||
fn serialize(&self, writer: &mut io::Write) -> io::Result<usize> {
|
||||
Ok(
|
||||
try!(self.doc_freq.serialize(writer)) +
|
||||
try!(self.postings_offset.serialize(writer)) +
|
||||
try!(self.positions_offset.serialize(writer))
|
||||
)
|
||||
Ok(try!(self.doc_freq.serialize(writer)) + try!(self.postings_offset.serialize(writer)) +
|
||||
try!(self.positions_offset.serialize(writer)))
|
||||
}
|
||||
fn deserialize(reader: &mut io::Read) -> io::Result<Self> {
|
||||
let doc_freq = try!(u32::deserialize(reader));
|
||||
let postings_offset = try!(u32::deserialize(reader));
|
||||
let positions_offset = try!(u32::deserialize(reader));
|
||||
Ok(TermInfo {
|
||||
doc_freq: doc_freq,
|
||||
postings_offset: postings_offset,
|
||||
positions_offset: positions_offset,
|
||||
})
|
||||
doc_freq: doc_freq,
|
||||
postings_offset: postings_offset,
|
||||
positions_offset: positions_offset,
|
||||
})
|
||||
}
|
||||
}
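To make the `TermInfo` serialization above concrete, here is a standalone round-trip sketch; the little-endian encoding and the helper names are chosen for illustration only and are not necessarily the on-disk format produced by tantivy's `BinarySerializable`:

use std::io::{self, Read, Write};

#[derive(Debug, Default, PartialEq)]
struct TermInfo {
    doc_freq: u32,
    postings_offset: u32,
    positions_offset: u32,
}

fn read_u32(reader: &mut dyn Read) -> io::Result<u32> {
    let mut buf = [0u8; 4];
    reader.read_exact(&mut buf)?;
    Ok(u32::from_le_bytes(buf))
}

impl TermInfo {
    fn serialize(&self, writer: &mut dyn Write) -> io::Result<usize> {
        // three u32 fields written back to back, read again in the same order
        writer.write_all(&self.doc_freq.to_le_bytes())?;
        writer.write_all(&self.postings_offset.to_le_bytes())?;
        writer.write_all(&self.positions_offset.to_le_bytes())?;
        Ok(12)
    }
    fn deserialize(reader: &mut dyn Read) -> io::Result<TermInfo> {
        Ok(TermInfo {
            doc_freq: read_u32(reader)?,
            postings_offset: read_u32(reader)?,
            positions_offset: read_u32(reader)?,
        })
    }
}

fn main() -> io::Result<()> {
    let info = TermInfo { doc_freq: 7, postings_offset: 128, positions_offset: 42 };
    let mut bytes = Vec::new();
    info.serialize(&mut bytes)?;
    assert_eq!(info, TermInfo::deserialize(&mut bytes.as_slice())?);
    Ok(())
}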
|
||||
|
||||
@@ -37,10 +37,11 @@ impl Query for BooleanQuery {
|
||||
}
|
||||
|
||||
fn weight(&self, searcher: &Searcher) -> Result<Box<Weight>> {
|
||||
let sub_weights = try!(self.subqueries
|
||||
.iter()
|
||||
.map(|&(ref _occur, ref subquery)| subquery.weight(searcher))
|
||||
.collect());
|
||||
let sub_weights =
|
||||
try!(self.subqueries
|
||||
.iter()
|
||||
.map(|&(ref _occur, ref subquery)| subquery.weight(searcher))
|
||||
.collect());
|
||||
let occurs: Vec<Occur> = self.subqueries
|
||||
.iter()
|
||||
.map(|&(ref occur, ref _subquery)| *occur)
|
||||
@@ -54,12 +55,14 @@ impl BooleanQuery {
|
||||
/// Helper method to create a boolean query matching a given list of terms.
|
||||
/// The resulting query is a disjunction of the terms.
|
||||
pub fn new_multiterms_query(terms: Vec<Term>) -> BooleanQuery {
|
||||
let occur_term_queries: Vec<(Occur, Box<Query>)> = terms.into_iter()
|
||||
let occur_term_queries: Vec<(Occur, Box<Query>)> = terms
|
||||
.into_iter()
|
||||
.map(|term| {
|
||||
let term_query: Box<Query> = box TermQuery::new(term, SegmentPostingsOption::Freq);
|
||||
(Occur::Should, term_query)
|
||||
})
|
||||
let term_query: Box<Query> = box TermQuery::new(term,
|
||||
SegmentPostingsOption::Freq);
|
||||
(Occur::Should, term_query)
|
||||
})
|
||||
.collect();
|
||||
BooleanQuery::from(occur_term_queries)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ use query::boolean_query::ScoreCombiner;
|
||||
/// Each `HeapItem` represents the head of
/// one of the scorers being merged.
|
||||
///
|
||||
/// * `doc` - is the current doc id for the given segment postings
|
||||
/// * `doc` - is the current doc id for the given segment postings
|
||||
/// * `ord` - is the ordinal used to identify which segment postings
/// this heap item belongs to.
|
||||
#[derive(Eq, PartialEq)]
|
||||
@@ -27,8 +27,8 @@ impl PartialOrd for HeapItem {
|
||||
}
|
||||
|
||||
impl Ord for HeapItem {
|
||||
fn cmp(&self, other:&Self) -> Ordering {
|
||||
(other.doc).cmp(&self.doc)
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
(other.doc).cmp(&self.doc)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -41,9 +41,7 @@ pub struct BooleanScorer<TScorer: Scorer> {
|
||||
}
|
||||
|
||||
impl<TScorer: Scorer> BooleanScorer<TScorer> {
|
||||
|
||||
pub fn new(scorers: Vec<TScorer>,
|
||||
occur_filter: OccurFilter) -> BooleanScorer<TScorer> {
|
||||
pub fn new(scorers: Vec<TScorer>, occur_filter: OccurFilter) -> BooleanScorer<TScorer> {
|
||||
let score_combiner = ScoreCombiner::default_for_num_scorers(scorers.len());
|
||||
let mut non_empty_scorers: Vec<TScorer> = Vec::new();
|
||||
for mut posting in scorers {
|
||||
@@ -57,11 +55,11 @@ impl<TScorer: Scorer> BooleanScorer<TScorer> {
|
||||
.map(|posting| posting.doc())
|
||||
.enumerate()
|
||||
.map(|(ord, doc)| {
|
||||
HeapItem {
|
||||
doc: doc,
|
||||
ord: ord as u32
|
||||
}
|
||||
})
|
||||
HeapItem {
|
||||
doc: doc,
|
||||
ord: ord as u32,
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
BooleanScorer {
|
||||
scorers: non_empty_scorers,
|
||||
@@ -69,20 +67,19 @@ impl<TScorer: Scorer> BooleanScorer<TScorer> {
|
||||
doc: 0u32,
|
||||
score_combiner: score_combiner,
|
||||
occur_filter: occur_filter,
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Advances the head of our heap (the segment posting with the lowest doc)
|
||||
/// It will also update the new current `DocId` as well as the term frequency
|
||||
/// associated with the segment postings.
|
||||
///
|
||||
///
|
||||
/// After advancing the `SegmentPosting`, the posting is removed from the heap
|
||||
/// if it has been entirely consumed, or pushed back into the heap.
|
||||
///
|
||||
///
|
||||
/// # Panics
|
||||
/// This method will panic if the head `SegmentPostings` is not empty.
|
||||
fn advance_head(&mut self,) {
|
||||
fn advance_head(&mut self) {
|
||||
{
|
||||
let mut mutable_head = self.queue.peek_mut().unwrap();
|
||||
let cur_scorers = &mut self.scorers[mutable_head.ord as usize];
|
||||
@@ -96,7 +93,7 @@ impl<TScorer: Scorer> BooleanScorer<TScorer> {
|
||||
}
|
||||
|
||||
impl<TScorer: Scorer> DocSet for BooleanScorer<TScorer> {
|
||||
fn advance(&mut self,) -> bool {
|
||||
fn advance(&mut self) -> bool {
|
||||
loop {
|
||||
self.score_combiner.clear();
|
||||
let mut ord_bitset = 0u64;
|
||||
@@ -106,40 +103,37 @@ impl<TScorer: Scorer> DocSet for BooleanScorer<TScorer> {
|
||||
self.doc = heap_item.doc;
|
||||
let score = self.scorers[ord].score();
|
||||
self.score_combiner.update(score);
|
||||
ord_bitset |= 1 << ord;
|
||||
ord_bitset |= 1 << ord;
|
||||
}
|
||||
None => {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
self.advance_head();
|
||||
while let Some(&HeapItem {doc, ord}) = self.queue.peek() {
|
||||
while let Some(&HeapItem { doc, ord }) = self.queue.peek() {
|
||||
if doc == self.doc {
|
||||
let ord = ord as usize;
|
||||
let score = self.scorers[ord].score();
|
||||
self.score_combiner.update(score);
|
||||
ord_bitset |= 1 << ord;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
self.advance_head();
|
||||
}
|
||||
}
|
||||
if self.occur_filter.accept(ord_bitset) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn doc(&self,) -> DocId {
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.doc
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer: Scorer> Scorer for BooleanScorer<TScorer> {
|
||||
|
||||
fn score(&self,) -> f32 {
|
||||
fn score(&self) -> f32 {
|
||||
self.score_combiner.score()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -23,9 +23,9 @@ impl BooleanWeight {
|
||||
impl Weight for BooleanWeight {
|
||||
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
|
||||
let sub_scorers: Vec<Box<Scorer + 'a>> = try!(self.weights
|
||||
.iter()
|
||||
.map(|weight| weight.scorer(reader))
|
||||
.collect());
|
||||
.iter()
|
||||
.map(|weight| weight.scorer(reader))
|
||||
.collect());
|
||||
let boolean_scorer = BooleanScorer::new(sub_scorers, self.occur_filter);
|
||||
Ok(box boolean_scorer)
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@ pub use self::score_combiner::ScoreCombiner;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
|
||||
use super::*;
|
||||
use postings::{DocSet, VecPostings};
|
||||
use query::Scorer;
|
||||
@@ -23,12 +23,12 @@ mod tests {
|
||||
use collector::tests::TestCollector;
|
||||
use Index;
|
||||
use schema::*;
|
||||
use fastfield::{U64FastFieldReader};
|
||||
use fastfield::U64FastFieldReader;
|
||||
use postings::SegmentPostingsOption;
|
||||
|
||||
fn abs_diff(left: f32, right: f32) -> f32 {
|
||||
(right - left).abs()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
@@ -64,7 +64,8 @@ mod tests {
|
||||
}
|
||||
|
||||
let make_term_query = |text: &str| {
|
||||
let term_query = TermQuery::new(Term::from_field_text(text_field, text), SegmentPostingsOption::NoFreq);
|
||||
let term_query = TermQuery::new(Term::from_field_text(text_field, text),
|
||||
SegmentPostingsOption::NoFreq);
|
||||
let query: Box<Query> = box term_query;
|
||||
query
|
||||
};
|
||||
@@ -78,58 +79,59 @@ mod tests {
|
||||
test_collector.docs()
|
||||
};
|
||||
{
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")) ]);
|
||||
assert_eq!(matching_docs(&boolean_query), vec!(0, 1, 3));
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a"))]);
|
||||
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]);
|
||||
}
|
||||
{
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::Should, make_term_query("a")) ]);
|
||||
assert_eq!(matching_docs(&boolean_query), vec!(0, 1, 3));
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::Should, make_term_query("a"))]);
|
||||
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]);
|
||||
}
|
||||
{
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::Should, make_term_query("a")), (Occur::Should, make_term_query("b"))]);
|
||||
assert_eq!(matching_docs(&boolean_query), vec!(0, 1, 2, 3));
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::Should, make_term_query("a")),
|
||||
(Occur::Should, make_term_query("b"))]);
|
||||
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 2, 3]);
|
||||
}
|
||||
{
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")), (Occur::Should, make_term_query("b"))]);
|
||||
assert_eq!(matching_docs(&boolean_query), vec!(0, 1, 3));
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")),
|
||||
(Occur::Should, make_term_query("b"))]);
|
||||
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]);
|
||||
}
|
||||
{
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")),
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")),
|
||||
(Occur::Should, make_term_query("b")),
|
||||
(Occur::MustNot, make_term_query("d")),
|
||||
]);
|
||||
assert_eq!(matching_docs(&boolean_query), vec!(0, 1));
|
||||
(Occur::MustNot, make_term_query("d"))]);
|
||||
assert_eq!(matching_docs(&boolean_query), vec![0, 1]);
|
||||
}
|
||||
{
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::MustNot, make_term_query("d")),]);
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::MustNot, make_term_query("d"))]);
|
||||
assert_eq!(matching_docs(&boolean_query), Vec::<u32>::new());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_boolean_scorer() {
|
||||
let occurs = vec!(Occur::Should, Occur::Should);
|
||||
let occurs = vec![Occur::Should, Occur::Should];
|
||||
let occur_filter = OccurFilter::new(&occurs);
|
||||
|
||||
let left_fieldnorms = U64FastFieldReader::from(vec!(100,200,300));
|
||||
|
||||
let left = VecPostings::from(vec!(1, 2, 3));
|
||||
|
||||
let left_fieldnorms = U64FastFieldReader::from(vec![100, 200, 300]);
|
||||
|
||||
let left = VecPostings::from(vec![1, 2, 3]);
|
||||
let left_scorer = TermScorer {
|
||||
idf: 1f32,
|
||||
fieldnorm_reader_opt: Some(left_fieldnorms),
|
||||
postings: left,
|
||||
};
|
||||
|
||||
let right_fieldnorms = U64FastFieldReader::from(vec!(15,25,35));
|
||||
let right = VecPostings::from(vec!(1, 3, 8));
|
||||
|
||||
|
||||
let right_fieldnorms = U64FastFieldReader::from(vec![15, 25, 35]);
|
||||
let right = VecPostings::from(vec![1, 3, 8]);
|
||||
|
||||
let right_scorer = TermScorer {
|
||||
idf: 4f32,
|
||||
fieldnorm_reader_opt: Some(right_fieldnorms),
|
||||
postings: right,
|
||||
};
|
||||
|
||||
let mut boolean_scorer = BooleanScorer::new(vec!(left_scorer, right_scorer), occur_filter);
|
||||
let mut boolean_scorer = BooleanScorer::new(vec![left_scorer, right_scorer], occur_filter);
|
||||
assert_eq!(boolean_scorer.next(), Some(1u32));
|
||||
assert!(abs_diff(boolean_scorer.score(), 0.8707107) < 0.001);
|
||||
assert_eq!(boolean_scorer.next(), Some(2u32));
|
||||
@@ -139,7 +141,7 @@ mod tests {
|
||||
assert!(abs_diff(boolean_scorer.score(), 0.5163978) < 0.001f32);
|
||||
assert!(!boolean_scorer.advance());
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
@@ -7,26 +7,25 @@ pub struct ScoreCombiner {
|
||||
}
|
||||
|
||||
impl ScoreCombiner {
|
||||
|
||||
pub fn update(&mut self, score: Score) {
|
||||
self.score += score;
|
||||
self.num_fields += 1;
|
||||
}
|
||||
|
||||
pub fn clear(&mut self,) {
|
||||
pub fn clear(&mut self) {
|
||||
self.score = 0f32;
|
||||
self.num_fields = 0;
|
||||
}
|
||||
|
||||
|
||||
/// Compute the coord term
|
||||
fn coord(&self,) -> f32 {
|
||||
fn coord(&self) -> f32 {
|
||||
self.coords[self.num_fields]
|
||||
}
|
||||
|
||||
pub fn score(&self, ) -> Score {
|
||||
|
||||
pub fn score(&self) -> Score {
|
||||
self.score * self.coord()
|
||||
}
|
||||
|
||||
|
||||
pub fn default_for_num_scorers(num_scorers: usize) -> ScoreCombiner {
|
||||
let query_coords: Vec<Score> = (0..num_scorers + 1)
|
||||
.map(|i| (i as Score) / (num_scorers as Score))
|
||||
@@ -43,4 +42,4 @@ impl From<Vec<Score>> for ScoreCombiner {
|
||||
score: 0f32,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
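A simplified re-implementation (illustrative only) of the coord idea visible in `default_for_num_scorers` above: sub-scores are summed, then multiplied by `num_matched / num_scorers`, so documents matching more of the clauses are favored:

struct ScoreCombiner {
    coords: Vec<f32>, // coords[k] = k / num_scorers
    score: f32,
    num_matched: usize,
}

impl ScoreCombiner {
    fn for_num_scorers(num_scorers: usize) -> ScoreCombiner {
        let coords = (0..=num_scorers)
            .map(|i| i as f32 / num_scorers as f32)
            .collect();
        ScoreCombiner { coords, score: 0.0, num_matched: 0 }
    }
    fn clear(&mut self) {
        self.score = 0.0;
        self.num_matched = 0;
    }
    fn update(&mut self, score: f32) {
        self.score += score;
        self.num_matched += 1;
    }
    fn score(&self) -> f32 {
        self.score * self.coords[self.num_matched]
    }
}

fn main() {
    let mut combiner = ScoreCombiner::for_num_scorers(2);
    combiner.update(1.0); // only one of the two scorers matched this doc
    assert!((combiner.score() - 0.5).abs() < 1e-6);
    combiner.clear();
    combiner.update(1.0);
    combiner.update(1.0); // both scorers matched: full weight
    assert!((combiner.score() - 2.0).abs() < 1e-6);
}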
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/// Query module
|
||||
///
|
||||
///
|
||||
/// The query module regroups all of tantivy's query objects
|
||||
///
|
||||
|
||||
|
||||
@@ -5,21 +5,20 @@ use query::Occur;
|
||||
// at most 64 elements.
|
||||
///
|
||||
/// It wraps some simple bitmask to compute the filter
|
||||
/// rapidly.
|
||||
/// rapidly.
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct OccurFilter {
|
||||
and_mask: u64,
|
||||
result: u64,
|
||||
result: u64,
|
||||
}
|
||||
|
||||
impl OccurFilter {
|
||||
|
||||
/// Returns true if the bitset is matching the occur list.
|
||||
pub fn accept(&self, ord_set: u64) -> bool {
|
||||
(self.and_mask & ord_set) == self.result
|
||||
}
|
||||
|
||||
/// Builds an `OccurFilter` from a list of `Occur`.
|
||||
|
||||
/// Builds an `OccurFilter` from a list of `Occur`.
|
||||
pub fn new(occurs: &[Occur]) -> OccurFilter {
|
||||
let mut and_mask = 0u64;
|
||||
let mut result = 0u64;
|
||||
@@ -29,16 +28,16 @@ impl OccurFilter {
|
||||
Occur::Must => {
|
||||
and_mask |= shift;
|
||||
result |= shift;
|
||||
},
|
||||
}
|
||||
Occur::MustNot => {
|
||||
and_mask |= shift;
|
||||
},
|
||||
Occur::Should => {},
|
||||
}
|
||||
Occur::Should => {}
|
||||
}
|
||||
}
|
||||
OccurFilter {
|
||||
and_mask: and_mask,
|
||||
result: result
|
||||
result: result,
|
||||
}
|
||||
}
|
||||
}
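The bitmask trick in `OccurFilter` above can be shown with a self-contained sketch (the simplified `Occur` enum and the example masks are mine): `and_mask` selects the bits that matter, `result` is the pattern those bits must take, and a candidate ordinal set is accepted when the masked bits equal that pattern:

#[derive(Clone, Copy)]
enum Occur {
    Must,
    MustNot,
    Should,
}

struct OccurFilter {
    and_mask: u64,
    result: u64,
}

impl OccurFilter {
    fn new(occurs: &[Occur]) -> OccurFilter {
        let mut and_mask = 0u64;
        let mut result = 0u64;
        for (i, occur) in occurs.iter().enumerate() {
            let bit = 1u64 << i;
            match *occur {
                Occur::Must => {
                    and_mask |= bit; // this bit is checked ...
                    result |= bit;   // ... and must be set
                }
                Occur::MustNot => {
                    and_mask |= bit; // this bit is checked and must stay unset
                }
                Occur::Should => {} // this bit is ignored
            }
        }
        OccurFilter { and_mask, result }
    }
    fn accept(&self, ord_set: u64) -> bool {
        (self.and_mask & ord_set) == self.result
    }
}

fn main() {
    // scorer 0: Must, scorer 1: Should, scorer 2: MustNot
    let filter = OccurFilter::new(&[Occur::Must, Occur::Should, Occur::MustNot]);
    assert!(filter.accept(0b011));  // Must matched, MustNot did not: accepted
    assert!(!filter.accept(0b110)); // MustNot matched: rejected
    assert!(!filter.accept(0b010)); // Must did not match: rejected
}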
|
||||
|
||||
@@ -9,7 +9,7 @@ pub use self::phrase_scorer::PhraseScorer;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
|
||||
use super::*;
|
||||
use core::Index;
|
||||
use schema::FieldValue;
|
||||
@@ -18,30 +18,35 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_query() {
|
||||
|
||||
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
{ // 0
|
||||
{
|
||||
// 0
|
||||
let doc = doc!(text_field=>"b b b d c g c");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{ // 1
|
||||
{
|
||||
// 1
|
||||
let doc = doc!(text_field=>"a b b d c g c");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{ // 2
|
||||
{
|
||||
// 2
|
||||
let doc = doc!(text_field=>"a b a b c");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{ // 3
|
||||
{
|
||||
// 3
|
||||
let doc = doc!(text_field=>"c a b a d ga a");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{ // 4
|
||||
{
|
||||
// 4
|
||||
let doc = doc!(text_field=>"a b c");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
@@ -57,17 +62,19 @@ mod tests {
|
||||
.map(|text| Term::from_field_text(text_field, text))
|
||||
.collect();
|
||||
let phrase_query = PhraseQuery::from(terms);
|
||||
searcher.search(&phrase_query, &mut test_collector).expect("search should succeed");
|
||||
searcher
|
||||
.search(&phrase_query, &mut test_collector)
|
||||
.expect("search should succeed");
|
||||
test_collector.docs()
|
||||
};
|
||||
|
||||
let empty_vec = Vec::<u32>::new();
|
||||
|
||||
assert_eq!(test_query(vec!("a", "b", "c")), vec!(2, 4));
|
||||
assert_eq!(test_query(vec!("a", "b")), vec!(1, 2, 3, 4));
|
||||
assert_eq!(test_query(vec!("b", "b")), vec!(0, 1));
|
||||
assert_eq!(test_query(vec!("g", "ewrwer")), empty_vec);
|
||||
assert_eq!(test_query(vec!("g", "a")), empty_vec);
|
||||
assert_eq!(test_query(vec!["a", "b", "c"]), vec![2, 4]);
|
||||
assert_eq!(test_query(vec!["a", "b"]), vec![1, 2, 3, 4]);
|
||||
assert_eq!(test_query(vec!["b", "b"]), vec![0, 1]);
|
||||
assert_eq!(test_query(vec!["g", "ewrwer"]), empty_vec);
|
||||
assert_eq!(test_query(vec!["g", "a"]), empty_vec);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@@ -8,12 +8,12 @@ use Result;
|
||||
|
||||
|
||||
/// `PhraseQuery` matches a specific sequence of words.
|
||||
/// For instance the phrase query for `"part time"` will match
|
||||
/// For instance the phrase query for `"part time"` will match
|
||||
/// the sentence
|
||||
///
|
||||
///
|
||||
/// **Alan just got a part time job.**
|
||||
///
|
||||
/// On the other hand it will not match the sentence.
|
||||
/// On the other hand it will not match the sentence.
|
||||
///
|
||||
/// **This is my favorite part of the job.**
|
||||
///
|
||||
@@ -22,12 +22,10 @@ use Result;
|
||||
///
|
||||
#[derive(Debug)]
|
||||
pub struct PhraseQuery {
|
||||
phrase_terms: Vec<Term>,
|
||||
phrase_terms: Vec<Term>,
|
||||
}
|
||||
|
||||
impl Query for PhraseQuery {
|
||||
|
||||
|
||||
/// Used to make it possible to cast Box<Query>
|
||||
/// into a specific type. This is mostly useful for unit tests.
|
||||
fn as_any(&self) -> &Any {
|
||||
@@ -40,15 +38,12 @@ impl Query for PhraseQuery {
|
||||
fn weight(&self, _searcher: &Searcher) -> Result<Box<Weight>> {
|
||||
Ok(box PhraseWeight::from(self.phrase_terms.clone()))
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
impl From<Vec<Term>> for PhraseQuery {
|
||||
fn from(phrase_terms: Vec<Term>) -> PhraseQuery {
|
||||
assert!(phrase_terms.len() > 1);
|
||||
PhraseQuery {
|
||||
phrase_terms: phrase_terms,
|
||||
}
|
||||
PhraseQuery { phrase_terms: phrase_terms }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,18 +15,16 @@ impl<'a> PhraseScorer<'a> {
|
||||
let mut positions_arr: Vec<&[u32]> = self.intersection_docset
|
||||
.docsets()
|
||||
.iter()
|
||||
.map(|posting| {
|
||||
posting.positions()
|
||||
})
|
||||
.map(|posting| posting.positions())
|
||||
.collect();
|
||||
|
||||
|
||||
let num_postings = positions_arr.len() as u32;
|
||||
|
||||
|
||||
let mut ord = 1u32;
|
||||
let mut pos_candidate = positions_arr[0][0];
|
||||
let mut pos_candidate = positions_arr[0][0];
|
||||
positions_arr[0] = &(positions_arr[0])[1..];
|
||||
let mut count_matching = 1;
|
||||
|
||||
|
||||
'outer: loop {
|
||||
let target = pos_candidate + ord;
|
||||
let positions = positions_arr[ord as usize];
|
||||
@@ -40,11 +38,10 @@ impl<'a> PhraseScorer<'a> {
|
||||
if count_matching == num_postings {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
else if pos_i > target {
|
||||
} else if pos_i > target {
|
||||
count_matching = 1;
|
||||
pos_candidate = positions[i] - ord;
|
||||
positions_arr[ord as usize] = &(positions_arr[ord as usize])[(i+1)..];
|
||||
positions_arr[ord as usize] = &(positions_arr[ord as usize])[(i + 1)..];
|
||||
}
|
||||
ord += 1;
|
||||
if ord == num_postings {
|
||||
@@ -58,7 +55,7 @@ impl<'a> PhraseScorer<'a> {
|
||||
}
|
||||
|
||||
impl<'a> DocSet for PhraseScorer<'a> {
|
||||
fn advance(&mut self,) -> bool {
|
||||
fn advance(&mut self) -> bool {
|
||||
while self.intersection_docset.advance() {
|
||||
if self.phrase_match() {
|
||||
return true;
|
||||
@@ -67,15 +64,14 @@ impl<'a> DocSet for PhraseScorer<'a> {
|
||||
false
|
||||
}
|
||||
|
||||
fn doc(&self,) -> DocId {
|
||||
fn doc(&self) -> DocId {
|
||||
self.intersection_docset.doc()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl<'a> Scorer for PhraseScorer<'a> {
|
||||
fn score(&self,) -> f32 {
|
||||
fn score(&self) -> f32 {
|
||||
1f32
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -14,26 +14,22 @@ pub struct PhraseWeight {
|
||||
|
||||
impl From<Vec<Term>> for PhraseWeight {
|
||||
fn from(phrase_terms: Vec<Term>) -> PhraseWeight {
|
||||
PhraseWeight {
|
||||
phrase_terms: phrase_terms
|
||||
}
|
||||
PhraseWeight { phrase_terms: phrase_terms }
|
||||
}
|
||||
}
|
||||
|
||||
impl Weight for PhraseWeight {
|
||||
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
|
||||
let mut term_postings_list = Vec::new();
|
||||
for term in &self.phrase_terms {
|
||||
let term_postings_option = reader.read_postings(term, SegmentPostingsOption::FreqAndPositions);
|
||||
for term in &self.phrase_terms {
|
||||
let term_postings_option =
|
||||
reader.read_postings(term, SegmentPostingsOption::FreqAndPositions);
|
||||
if let Some(term_postings) = term_postings_option {
|
||||
term_postings_list.push(term_postings);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
return Ok(box EmptyScorer);
|
||||
}
|
||||
}
|
||||
Ok(box PhraseScorer {
|
||||
intersection_docset: IntersectionDocSet::from(term_postings_list),
|
||||
})
|
||||
Ok(box PhraseScorer { intersection_docset: IntersectionDocSet::from(term_postings_list) })
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,36 +9,35 @@ use std::any::Any;
|
||||
|
||||
|
||||
/// The `Query` trait is in charge of defining:
|
||||
///
|
||||
///
|
||||
/// - a set of documents
|
||||
/// - a way to score these documents
|
||||
///
|
||||
/// When performing a [search](#method.search), these documents will then
|
||||
/// be pushed to a [Collector](../collector/trait.Collector.html),
|
||||
/// which will in turn be in charge of deciding what to do with them.
|
||||
///
|
||||
/// which will in turn be in charge of deciding what to do with them.
|
||||
///
|
||||
/// Concretely, this scored docset is represented by the
|
||||
/// [`Scorer`](./trait.Scorer.html) trait.
|
||||
///
|
||||
/// Because our index is actually split into segments, the
|
||||
/// Because our index is actually split into segments, the
|
||||
/// query does not actually directly creates `DocSet` object.
|
||||
/// Instead, the query creates a [`Weight`](./trait.Weight.html)
|
||||
/// object for a given searcher.
|
||||
///
|
||||
/// The weight object, in turn, makes it possible to create
|
||||
/// object for a given searcher.
|
||||
///
|
||||
/// The weight object, in turn, makes it possible to create
|
||||
/// a scorer for a specific [`SegmentReader`](../struct.SegmentReader.html).
|
||||
///
|
||||
///
|
||||
/// So to sum it up :
|
||||
/// - a `Query` is recipe to define a set of documents as well the way to score them.
|
||||
/// - a `Weight` is this recipe tied to a specific `Searcher`. It may for instance
|
||||
/// - a `Weight` is this recipe tied to a specific `Searcher`. It may for instance
|
||||
/// hold statistics about the different term of the query. It is created by the query.
|
||||
/// - a `Scorer` is a cursor over the set of matching documents, for a specific
|
||||
/// - a `Scorer` is a cursor over the set of matching documents, for a specific
|
||||
/// [`SegmentReader`](../struct.SegmentReader.html). It is created by the [`Weight`](./trait.Weight.html).
|
||||
///
|
||||
/// When implementing a new type of `Query`, it is normal to implement a
|
||||
/// dedicated `Query`, `Weight` and `Scorer`.
|
||||
pub trait Query: fmt::Debug {
|
||||
|
||||
/// Used to make it possible to cast Box<Query>
|
||||
/// into a specific type. This is mostly useful for unit tests.
|
||||
fn as_any(&self) -> &Any;
|
||||
@@ -47,24 +46,21 @@ pub trait Query: fmt::Debug {
|
||||
///
|
||||
/// See [Weight](./trait.Weight.html).
|
||||
fn weight(&self, searcher: &Searcher) -> Result<Box<Weight>>;
|
||||
|
||||
|
||||
/// Search works as follows :
|
||||
///
|
||||
/// First the weight object associated to the query is created.
|
||||
///
|
||||
///
|
||||
/// Then, the query loops over the segments and for each segment :
|
||||
/// - setup the collector and informs it that the segment being processed has changed.
|
||||
/// - creates a `Scorer` object associated for this segment
|
||||
/// - iterate throw the matched documents and push them to the collector.
|
||||
///
|
||||
fn search(
|
||||
&self,
|
||||
searcher: &Searcher,
|
||||
collector: &mut Collector) -> Result<TimerTree> {
|
||||
|
||||
let mut timer_tree = TimerTree::default();
|
||||
fn search(&self, searcher: &Searcher, collector: &mut Collector) -> Result<TimerTree> {
|
||||
|
||||
let mut timer_tree = TimerTree::default();
|
||||
let weight = try!(self.weight(searcher));
|
||||
|
||||
|
||||
{
|
||||
let mut search_timer = timer_tree.open("search");
|
||||
for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() {
|
||||
|
||||
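To make the flow described in the doc comment above concrete, here is a minimal sketch of what `search` does, written against the trait signatures shown in this diff (`Query::weight`, `Weight::scorer`, `Collector::set_segment`, `Scorer::collect`). The timer bookkeeping is omitted, and `SegmentLocalId` is assumed to be a plain integer alias:

```rust
fn search_sketch(query: &Query,
                 searcher: &Searcher,
                 collector: &mut Collector)
                 -> Result<()> {
    // 1. Build the Weight for this query against this searcher.
    let weight = query.weight(searcher)?;
    // 2. For each segment: point the collector at it, build a Scorer,
    //    and push every matching (doc, score) pair to the collector.
    for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() {
        collector.set_segment(segment_ord as SegmentLocalId, segment_reader)?;
        let mut scorer = weight.scorer(segment_reader)?;
        scorer.collect(collector);
    }
    Ok(())
}
```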
@@ -9,9 +9,9 @@ pub enum LogicalLiteral {
}

#[derive(Clone)]
pub enum LogicalAST{
pub enum LogicalAST {
Clause(Vec<(Occur, LogicalAST)>),
Leaf(Box<LogicalLiteral>)
Leaf(Box<LogicalLiteral>),
}

fn occur_letter(occur: Occur) -> &'static str {

@@ -28,8 +28,7 @@ impl fmt::Debug for LogicalAST {
LogicalAST::Clause(ref clause) => {
if clause.is_empty() {
try!(write!(formatter, "<emptyclause>"));
}
else {
} else {
let (ref occur, ref subquery) = clause[0];
try!(write!(formatter, "({}{:?}", occur_letter(*occur), subquery));
for &(ref occur, ref subquery) in &clause[1..] {

@@ -39,9 +38,7 @@ impl fmt::Debug for LogicalAST {
}
Ok(())
}
LogicalAST::Leaf(ref literal) => {
write!(formatter, "{:?}", literal)
}
LogicalAST::Leaf(ref literal) => write!(formatter, "{:?}", literal),
}
}
}

@@ -55,12 +52,8 @@ impl From<LogicalLiteral> for LogicalAST {
impl fmt::Debug for LogicalLiteral {
fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
match *self {
LogicalLiteral::Term(ref term) => {
write!(formatter, "{:?}", term)
},
LogicalLiteral::Phrase(ref terms) => {
write!(formatter, "\"{:?}\"", terms)
}
LogicalLiteral::Term(ref term) => write!(formatter, "{:?}", term),
LogicalLiteral::Phrase(ref terms) => write!(formatter, "\"{:?}\"", terms),
}
}
}

@@ -4,4 +4,4 @@ mod user_input_ast;

pub mod logical_ast;
pub use self::query_parser::QueryParser;
pub use self::query_parser::QueryParserError;
pub use self::query_parser::QueryParserError;
@@ -11,31 +11,27 @@ fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
phrase.or(word)
};

let negative_numbers =
(char('-'), many1(satisfy(|c: char| c.is_numeric())))
let negative_numbers = (char('-'), many1(satisfy(|c: char| c.is_numeric())))
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));

let field =
(
letter(),
many(satisfy(|c: char| c.is_alphanumeric() || c == '_'))
)

let field = (letter(), many(satisfy(|c: char| c.is_alphanumeric() || c == '_')))
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));

let term_val_with_field = negative_numbers.or(term_val());

let term_query = (field, char(':'), term_val_with_field).map(|(field_name, _, phrase)| {
UserInputLiteral {
field_name: Some(field_name),
phrase: phrase,
}
});
UserInputLiteral {
field_name:
Some(field_name),
phrase: phrase,
}
});
let term_default_field = term_val().map(|phrase| {
UserInputLiteral {
field_name: None,
phrase: phrase,
}
});
UserInputLiteral {
field_name: None,
phrase: phrase,
}
});
try(term_query)
.or(term_default_field)
.map(|query_literal| UserInputAST::from(query_literal))

@@ -58,13 +54,11 @@ pub fn parse_to_ast<I>(input: I) -> ParseResult<UserInputAST, I>
where I: Stream<Item = char>
{
sep_by(parser(leaf), spaces())
.map(|subqueries: Vec<UserInputAST>| {
if subqueries.len() == 1 {
subqueries.into_iter().next().unwrap()
} else {
UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect())
}
})
.map(|subqueries: Vec<UserInputAST>| if subqueries.len() == 1 {
subqueries.into_iter().next().unwrap()
} else {
UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect())
})
.parse_stream(input)
}
@@ -92,10 +92,10 @@ impl QueryParser {
analyzer: box SimpleTokenizer,
}
}

/// Set the default way to compose queries to a conjunction.
///
/// By default a ,
/// By default a ,
pub fn set_conjunction_by_default(&mut self) {
self.conjunction_by_default = true;
}

@@ -114,11 +114,11 @@ impl QueryParser {
let logical_ast = self.parse_query_to_logical_ast(query)?;
Ok(convert_to_query(logical_ast))
}

/// Parse the user query into an AST.
fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAST, QueryParserError> {
let (user_input_ast, _remaining) =
parse_to_ast(query).map_err(|_| QueryParserError::SyntaxError)?;
let (user_input_ast, _remaining) = parse_to_ast(query)
.map_err(|_| QueryParserError::SyntaxError)?;
self.compute_logical_ast(user_input_ast)
}

@@ -127,10 +127,10 @@ impl QueryParser {
.get_field(field_name)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(String::from(field_name)))
}

fn compute_logical_ast(&self,
user_input_ast: UserInputAST)
-> Result<LogicalAST, QueryParserError> {
user_input_ast: UserInputAST)
-> Result<LogicalAST, QueryParserError> {
let (occur, ast) = self.compute_logical_ast_with_occur(user_input_ast)?;
if occur == Occur::MustNot {
return Err(QueryParserError::AllButQueryForbidden);
@@ -172,21 +172,19 @@ impl QueryParser {
break;
}
}
}
else {
} else {
terms.push(Term::from_field_text(field, phrase));
}
if terms.is_empty() {
return Ok(None);
}
else if terms.len() == 1 {
return Ok(Some(LogicalLiteral::Term(terms.into_iter().next().unwrap())))
} else if terms.len() == 1 {
return Ok(Some(LogicalLiteral::Term(terms.into_iter().next().unwrap())));
} else {
return Ok(Some(LogicalLiteral::Phrase(terms)))
return Ok(Some(LogicalLiteral::Phrase(terms)));
}
}
}

}

fn default_occur(&self) -> Occur {

@@ -198,8 +196,8 @@ impl QueryParser {
}

fn compute_logical_ast_with_occur(&self,
user_input_ast: UserInputAST)
-> Result<(Occur, LogicalAST), QueryParserError> {
user_input_ast: UserInputAST)
-> Result<(Occur, LogicalAST), QueryParserError> {
match user_input_ast {
UserInputAST::Clause(sub_queries) => {
let default_occur = self.default_occur();

@@ -244,17 +242,14 @@ impl QueryParser {
asts.push(LogicalAST::Leaf(box ast));
}
}
let result_ast =
if asts.len() == 0 {
// this should never happen
return Err(QueryParserError::SyntaxError);
} else if asts.len() == 1 {
asts[0].clone()
} else {
LogicalAST::Clause(asts.into_iter()
.map(|ast| (Occur::Should, ast))
.collect())
};
let result_ast = if asts.len() == 0 {
// this should never happen
return Err(QueryParserError::SyntaxError);
} else if asts.len() == 1 {
asts[0].clone()
} else {
LogicalAST::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect())
};
Ok((Occur::Should, result_ast))
}
}

@@ -292,7 +287,8 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<Query> {
fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {
match logical_ast {
LogicalAST::Clause(clause) => {
let occur_subqueries = clause.into_iter()
let occur_subqueries = clause
.into_iter()
.map(|(occur, subquery)| (occur, convert_to_query(subquery)))
.collect::<Vec<_>>();
box BooleanQuery::from(occur_subqueries)
@@ -311,7 +307,7 @@ mod test {
use super::QueryParser;
use super::QueryParserError;
use super::super::logical_ast::*;

fn make_query_parser() -> QueryParser {
let mut schema_builder = SchemaBuilder::default();
let title = schema_builder.add_text_field("title", TEXT);

@@ -331,7 +327,7 @@ mod test {
fn parse_query_to_logical_ast(query: &str,
default_conjunction: bool)
-> Result<LogicalAST, QueryParserError> {
let mut query_parser = make_query_parser();
let mut query_parser = make_query_parser();
if default_conjunction {
query_parser.set_conjunction_by_default();
}

@@ -345,40 +341,33 @@ mod test {
let query_str = format!("{:?}", query);
assert_eq!(query_str, expected);
}

#[test]
pub fn test_parse_query_simple() {
let query_parser = make_query_parser();
assert!(query_parser.parse_query("toto").is_ok());
assert!(query_parser.parse_query("toto").is_ok());
}

#[test]
pub fn test_parse_nonindexed_field_yields_error() {
let query_parser = make_query_parser();

let is_not_indexed_err = |query: &str| {
let result: Result<Box<Query>, QueryParserError> = query_parser.parse_query(query);
if let Err(QueryParserError::FieldNotIndexed(field_name)) = result {
Some(field_name.clone())
}
else {
} else {
None
}
};

assert_eq!(
is_not_indexed_err("notindexed_text:titi"),
Some(String::from("notindexed_text"))
);
assert_eq!(
is_not_indexed_err("notindexed_u64:23424"),
Some(String::from("notindexed_u64"))
);
assert_eq!(
is_not_indexed_err("notindexed_i64:-234324"),
Some(String::from("notindexed_i64"))
);
assert_eq!(is_not_indexed_err("notindexed_text:titi"),
Some(String::from("notindexed_text")));
assert_eq!(is_not_indexed_err("notindexed_u64:23424"),
Some(String::from("notindexed_u64")));
assert_eq!(is_not_indexed_err("notindexed_i64:-234324"),
Some(String::from("notindexed_i64")));
}
@@ -392,25 +381,32 @@ mod test {
#[test]
pub fn test_parse_query_ints() {
let query_parser = make_query_parser();
assert!(query_parser.parse_query("signed:2324").is_ok());
assert!(query_parser.parse_query("signed:\"22\"").is_ok());
assert!(query_parser.parse_query("signed:2324").is_ok());
assert!(query_parser.parse_query("signed:\"22\"").is_ok());
assert!(query_parser.parse_query("signed:\"-2234\"").is_ok());
assert!(query_parser.parse_query("signed:\"-9999999999999\"").is_ok());
assert!(query_parser
.parse_query("signed:\"-9999999999999\"")
.is_ok());
assert!(query_parser.parse_query("signed:\"a\"").is_err());
assert!(query_parser.parse_query("signed:\"2a\"").is_err());
assert!(query_parser.parse_query("signed:\"18446744073709551615\"").is_err());
assert!(query_parser
.parse_query("signed:\"18446744073709551615\"")
.is_err());
assert!(query_parser.parse_query("unsigned:\"2\"").is_ok());
assert!(query_parser.parse_query("unsigned:\"-2\"").is_err());
assert!(query_parser.parse_query("unsigned:\"18446744073709551615\"").is_ok());
assert!(query_parser
.parse_query("unsigned:\"18446744073709551615\"")
.is_ok());
test_parse_query_to_logical_ast_helper("unsigned:2324",
"Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",
false);

test_parse_query_to_logical_ast_helper("signed:-2324",
&format!("{:?}", Term::from_field_i64(Field(2u32), -2324)),
&format!("{:?}",
Term::from_field_i64(Field(2u32), -2324)),
false);
}

#[test]
pub fn test_parse_query_to_ast_disjunction() {

@@ -424,7 +420,9 @@ mod test {
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) -(Term([0, 0, 0, 0, 116, \
105, 116, 105]) Term([0, 0, 0, 1, 116, 105, 116, 105])))",
false);
assert_eq!(parse_query_to_logical_ast("-title:toto", false).err().unwrap(),
assert_eq!(parse_query_to_logical_ast("-title:toto", false)
.err()
.unwrap(),
QueryParserError::AllButQueryForbidden);
test_parse_query_to_logical_ast_helper("title:a b",
"(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) Term([0, 0, 0, 1, 98])))",

@@ -436,7 +434,9 @@ mod test {

#[test]
pub fn test_parse_query_to_ast_conjunction() {
test_parse_query_to_logical_ast_helper("title:toto", "Term([0, 0, 0, 0, 116, 111, 116, 111])", true);
test_parse_query_to_logical_ast_helper("title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
true);
test_parse_query_to_logical_ast_helper("+title:toto",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
true);

@@ -444,7 +444,9 @@ mod test {
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) -(Term([0, 0, 0, 0, 116, \
105, 116, 105]) Term([0, 0, 0, 1, 116, 105, 116, 105])))",
true);
assert_eq!(parse_query_to_logical_ast("-title:toto", true).err().unwrap(),
assert_eq!(parse_query_to_logical_ast("-title:toto", true)
.err()
.unwrap(),
QueryParserError::AllButQueryForbidden);
test_parse_query_to_logical_ast_helper("title:a b",
"(+Term([0, 0, 0, 0, 97]) +(Term([0, 0, 0, 0, 98]) Term([0, 0, 0, 1, 98])))",
@@ -2,18 +2,14 @@ use std::fmt;

pub struct UserInputLiteral {
pub field_name: Option<String>,
pub phrase: String,
pub phrase: String,
}

impl fmt::Debug for UserInputLiteral {
fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
match self.field_name {
Some(ref field_name) => {
write!(formatter, "{}:\"{}\"", field_name, self.phrase)
}
None => {
write!(formatter, "\"{}\"", self.phrase)
}
Some(ref field_name) => write!(formatter, "{}:\"{}\"", field_name, self.phrase),
None => write!(formatter, "\"{}\"", self.phrase),
}
}
}

@@ -22,41 +18,33 @@ pub enum UserInputAST {
Clause(Vec<Box<UserInputAST>>),
Not(Box<UserInputAST>),
Must(Box<UserInputAST>),
Leaf(Box<UserInputLiteral>)

Leaf(Box<UserInputLiteral>),
}

impl From<UserInputLiteral> for UserInputAST {
fn from(literal: UserInputLiteral) -> UserInputAST {
UserInputAST::Leaf(box literal)
}
}
}

impl fmt::Debug for UserInputAST {
fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
match *self {
UserInputAST::Must(ref subquery) => {
write!(formatter, "+({:?})", subquery)
},
UserInputAST::Must(ref subquery) => write!(formatter, "+({:?})", subquery),
UserInputAST::Clause(ref subqueries) => {
if subqueries.is_empty() {
try!(write!(formatter, "<emptyclause>"));
}
else {
} else {
try!(write!(formatter, "{:?}", &subqueries[0]));
for subquery in &subqueries[1..] {
try!(write!(formatter, " {:?}", subquery));
}
}
Ok(())

},
UserInputAST::Not(ref subquery) => {
write!(formatter, "-({:?})", subquery)
}
UserInputAST::Leaf(ref subquery) => {
write!(formatter, "{:?}", subquery)

}
UserInputAST::Not(ref subquery) => write!(formatter, "-({:?})", subquery),
UserInputAST::Leaf(ref subquery) => write!(formatter, "{:?}", subquery),
}
}
}
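As a quick illustration of the `Debug` implementations above, here is what a single parsed literal renders as. This is only a sketch; it plugs values into the `UserInputLiteral` and `UserInputAST` definitions shown in this diff:

```rust
// A field literal such as the one produced by parsing `title:toto`.
let literal = UserInputLiteral {
    field_name: Some(String::from("title")),
    phrase: String::from("toto"),
};
let ast = UserInputAST::from(literal);
// UserInputLiteral prints as `field:"phrase"`, and Leaf forwards to it.
assert_eq!(format!("{:?}", ast), "title:\"toto\"");
```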
@@ -5,30 +5,29 @@ use collector::Collector;
use std::ops::{Deref, DerefMut};

/// Scored set of documents matching a query within a specific segment.
///
///
/// See [Query](./trait.Query.html).
pub trait Scorer: DocSet {

/// Returns the score.
///
///
/// This method will perform a bit of computation and is not cached.
fn score(&self,) -> Score;

fn score(&self) -> Score;

/// Consumes the complete `DocSet` and
/// push the scored documents to the collector.
/// push the scored documents to the collector.
fn collect(&mut self, collector: &mut Collector) {
while self.advance() {
collector.collect(self.doc(), self.score());
}
}
}
}

impl<'a> Scorer for Box<Scorer + 'a> {
fn score(&self,) -> Score {
fn score(&self) -> Score {
self.deref().score()
}

fn collect(&mut self, collector: &mut Collector) {
let scorer = self.deref_mut();
while scorer.advance() {

@@ -38,22 +37,22 @@ impl<'a> Scorer for Box<Scorer + 'a> {
}

/// EmptyScorer is a dummy Scorer in which no document matches.
///
///
/// It is useful for tests and handling edge cases.
pub struct EmptyScorer;

impl DocSet for EmptyScorer {
fn advance(&mut self,) -> bool {
fn advance(&mut self) -> bool {
false
}

fn doc(&self,) -> DocId {
fn doc(&self) -> DocId {
DocId::max_value()
}
}

impl Scorer for EmptyScorer {
fn score(&self,) -> Score {
fn score(&self) -> Score {
0f32
}
}
@@ -9,7 +9,7 @@ pub use self::term_scorer::TermScorer;

#[cfg(test)]
mod tests {

use postings::{DocSet, VecPostings};
use query::Scorer;
use query::term_query::TermScorer;

@@ -23,7 +23,7 @@ mod tests {

fn abs_diff(left: f32, right: f32) -> f32 {
(right - left).abs()
}
}

#[test]

@@ -44,7 +44,8 @@ mod tests {

index.load_searchers().unwrap();
let searcher = index.searcher();
let term_query = TermQuery::new(Term::from_field_text(text_field, "a"), SegmentPostingsOption::NoFreq);
let term_query = TermQuery::new(Term::from_field_text(text_field, "a"),
SegmentPostingsOption::NoFreq);
let term_weight = term_query.weight(&searcher).unwrap();
let segment_reader = searcher.segment_reader(0);
let mut term_scorer = term_weight.scorer(segment_reader).unwrap();

@@ -53,13 +54,13 @@ mod tests {
assert_eq!(term_scorer.score(), 0.30685282);
}

#[test]
pub fn test_term_scorer() {
let left_fieldnorms = U64FastFieldReader::from(vec!(10, 4));
let left_fieldnorms = U64FastFieldReader::from(vec![10, 4]);
assert_eq!(left_fieldnorms.get(0), 10);
assert_eq!(left_fieldnorms.get(1), 4);
let left = VecPostings::from(vec!(1));
let left = VecPostings::from(vec![1]);
let mut left_scorer = TermScorer {
idf: 0.30685282,
fieldnorm_reader_opt: Some(left_fieldnorms),

@@ -69,4 +70,4 @@ mod tests {
assert!(abs_diff(left_scorer.score(), 0.15342641) < 0.001f32);
}

}
}
@@ -13,7 +13,7 @@ use std::any::Any;
/// The score associated is defined as
/// `idf` * sqrt(`term_freq` / `field norm`)
/// in which :
/// * idf - inverse document frequency.
/// * idf - inverse document frequency.
/// * term_freq - number of occurrences of the term in the field
/// * field norm - number of tokens in the field.
#[derive(Debug)]

@@ -31,9 +31,9 @@ impl TermQuery {
}
}

/// Returns a weight object.
///
///
/// While `.weight(...)` returns a boxed trait object,
/// this method return a specific implementation.
/// This is useful for optimization purpose.

@@ -55,5 +55,4 @@ impl Query for TermQuery {
fn weight(&self, searcher: &Searcher) -> Result<Box<Weight>> {
Ok(box self.specialized_weight(searcher))
}

}
@@ -6,41 +6,46 @@ use query::Scorer;
use postings::Postings;
use fastfield::FastFieldReader;

pub struct TermScorer<TPostings> where TPostings: Postings {
pub struct TermScorer<TPostings>
where TPostings: Postings
{
pub idf: Score,
pub fieldnorm_reader_opt: Option<U64FastFieldReader>,
pub postings: TPostings,
}

impl<TPostings> TermScorer<TPostings> where TPostings: Postings {
impl<TPostings> TermScorer<TPostings>
where TPostings: Postings
{
pub fn postings(&self) -> &TPostings {
&self.postings
}
}

impl<TPostings> DocSet for TermScorer<TPostings> where TPostings: Postings {
fn advance(&mut self,) -> bool {
impl<TPostings> DocSet for TermScorer<TPostings>
where TPostings: Postings
{
fn advance(&mut self) -> bool {
self.postings.advance()
}

fn doc(&self,) -> DocId {

fn doc(&self) -> DocId {
self.postings.doc()
}
}

impl<TPostings> Scorer for TermScorer<TPostings> where TPostings: Postings {
fn score(&self,) -> Score {
impl<TPostings> Scorer for TermScorer<TPostings>
where TPostings: Postings
{
fn score(&self) -> Score {
let doc = self.postings.doc();
let tf = match self.fieldnorm_reader_opt {
Some(ref fieldnorm_reader) => {
let field_norm = fieldnorm_reader.get(doc);
(self.postings.term_freq() as f32 / field_norm as f32)
}
None => {
self.postings.term_freq() as f32
}
None => self.postings.term_freq() as f32,
};
self.idf * tf.sqrt()
}
}
}
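The `score` implementation above is the `idf * sqrt(term_freq / field_norm)` formula from the `TermQuery` doc comment. As a numeric illustration, plugging in the same values as the `test_term_scorer` test earlier in this diff (idf 0.30685282, a single occurrence, field norm 4):

```rust
// Sketch only: reproduces the arithmetic of TermScorer::score by hand.
fn score_example() -> f32 {
    let idf = 0.30685282f32;
    let term_freq = 1f32;
    let field_norm = 4f32;
    // 0.30685282 * sqrt(1 / 4) = 0.30685282 * 0.5
    idf * (term_freq / field_norm).sqrt()
}
// score_example() returns roughly 0.15342641, the value asserted in the test.
```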
@@ -16,40 +16,35 @@ pub struct TermWeight {

impl Weight for TermWeight {

fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
let specialized_scorer = try!(self.specialized_scorer(reader));
Ok(box specialized_scorer)
}

}

impl TermWeight {

fn idf(&self) -> f32 {
1.0 + (self.num_docs as f32 / (self.doc_freq as f32 + 1.0)).ln()
}

pub fn specialized_scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<TermScorer<SegmentPostings<'a>>> {
pub fn specialized_scorer<'a>(&'a self,
reader: &'a SegmentReader)
-> Result<TermScorer<SegmentPostings<'a>>> {
let field = self.term.field();
let fieldnorm_reader_opt = reader.get_fieldnorms_reader(field);
Ok(
reader
.read_postings(&self.term, self.segment_postings_options)
.map(|segment_postings|
TermScorer {
idf: self.idf(),
fieldnorm_reader_opt: fieldnorm_reader_opt,
postings: segment_postings,
}
)
.unwrap_or(
TermScorer {
idf: 1f32,
fieldnorm_reader_opt: None,
postings: SegmentPostings::empty()
Ok(reader
.read_postings(&self.term, self.segment_postings_options)
.map(|segment_postings| {
TermScorer {
idf: self.idf(),
fieldnorm_reader_opt: fieldnorm_reader_opt,
postings: segment_postings,
}
})
)
.unwrap_or(TermScorer {
idf: 1f32,
fieldnorm_reader_opt: None,
postings: SegmentPostings::empty(),
}))
}

}
}
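The `idf` helper above computes `1 + ln(num_docs / (doc_freq + 1))`. A rough numeric check, assuming a segment holding a single document in which the term appears (num_docs = 1, doc_freq = 1, which appears to be the situation exercised by the term query test earlier in this diff):

```rust
// Sketch only: evaluates the idf formula with assumed counts.
fn idf_example() -> f32 {
    let num_docs = 1f32;
    let doc_freq = 1f32;
    // 1.0 + ln(1 / 2) is approximately 0.30685282, the idf used in the tests.
    1.0 + (num_docs / (doc_freq + 1.0)).ln()
}
```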
@@ -8,9 +8,7 @@ use core::SegmentReader;
///
/// See [Query](./trait.Query.html).
pub trait Weight {

/// Returns the scorer for the given segment.
/// See [Query](./trait.Query.html).
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>>;

}
@@ -1,13 +1,13 @@
use super::*;
use itertools::Itertools;
use itertools::Itertools;

/// Tantivy's Document is the object that can
/// be indexed and then searched for.
///
/// be indexed and then searched for.
///
/// Documents are fundamentally a collection of unordered couple `(field, value)`.
/// In this list, one field may appear more than once.
///
///
///
///

/// Documents are really just a list of couple `(field, value)`.
/// In this list, one field may appear more than once.

@@ -30,25 +30,24 @@ impl PartialEq for Document {
impl Eq for Document {}

impl Document {

/// Creates a new, empty document object
pub fn new() -> Document {
Document::default()
}

/// Returns the number of `(field, value)` pairs.
pub fn len(&self,) -> usize {
pub fn len(&self) -> usize {
self.field_values.len()
}

/// Returns true iff the document contains no fields.
pub fn is_empty(&self,) -> bool {
pub fn is_empty(&self) -> bool {
self.field_values.is_empty()
}

/// Add a text field.
pub fn add_text(&mut self, field: Field, text: &str) {
let value = Value::Str(String::from(text));
let value = Value::Str(String::from(text));
self.add(FieldValue::new(field, value));
}

@@ -66,29 +65,27 @@ impl Document {
pub fn add(&mut self, field_value: FieldValue) {
self.field_values.push(field_value);
}

/// field_values accessor
pub fn field_values(&self) -> &[FieldValue] {
&self.field_values
}

/// Sort and groups the field_values by field.
///
/// The result of this method is not cached and is
/// The result of this method is not cached and is
/// computed on the fly when this method is called.
pub fn get_sorted_field_values(&self) -> Vec<(Field, Vec<&FieldValue>)> {
let mut field_values: Vec<&FieldValue> = self.field_values().iter().collect();
field_values.sort_by_key(|field_value| field_value.field());
field_values
let mut field_values: Vec<&FieldValue> = self.field_values().iter().collect();
field_values.sort_by_key(|field_value| field_value.field());
field_values
.into_iter()
.group_by(|field_value| field_value.field())
.into_iter()
.map(|(key, group)| {
(key, group.into_iter().collect())
})
.map(|(key, group)| (key, group.into_iter().collect()))
.collect::<Vec<(Field, Vec<&FieldValue>)>>()
}

/// Returns all of the `FieldValue`s associated the given field
pub fn get_all(&self, field: Field) -> Vec<&Value> {
self.field_values

@@ -110,9 +107,7 @@ impl Document {

impl From<Vec<FieldValue>> for Document {
fn from(field_values: Vec<FieldValue>) -> Document {
Document {
field_values: field_values
}
Document { field_values: field_values }
}
}

@@ -121,7 +116,7 @@ impl From<Vec<FieldValue>> for Document {
mod tests {

use schema::*;

#[test]
fn test_doc() {
let mut schema_builder = SchemaBuilder::default();

@@ -130,5 +125,5 @@ mod tests {
doc.add_text(text_field, "My title");
assert_eq!(doc.field_values().len(), 1);
}

}

}
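To make the `Document` API shown above a bit more tangible, here is a small usage sketch; the field names are made up for the example, and it relies only on `add_text`, `len` and `get_sorted_field_values` as defined in this diff:

```rust
let mut schema_builder = SchemaBuilder::default();
let title = schema_builder.add_text_field("title", TEXT);
let body = schema_builder.add_text_field("body", TEXT);

let mut doc = Document::new();
doc.add_text(title, "My first title");
doc.add_text(title, "My second title");
doc.add_text(body, "Some body text");

// Three (field, value) pairs in total...
assert_eq!(doc.len(), 3);
// ...grouped into two entries, one per distinct field.
assert_eq!(doc.get_sorted_field_values().len(), 2);
```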
@@ -7,7 +7,7 @@ use common::BinarySerializable;
/// `Field` is actually a `u8` identifying a `Field`
/// The schema is in charge of holding mapping between field names
/// to `Field` objects.
///
///
/// Because the field id is a `u8`, tantivy can only have at most `255` fields.
/// Value 255 is reserved.
#[derive(Copy, Clone, Debug, PartialEq,PartialOrd,Eq,Ord,Hash, Serialize, Deserialize)]

@@ -22,4 +22,3 @@ impl BinarySerializable for Field {
u32::deserialize(reader).map(Field)
}
}
@@ -9,10 +9,10 @@ use schema::FieldType;

/// A `FieldEntry` represents a field and its configuration.
/// `Schema` are a collection of `FieldEntry`
///
/// It consists of
/// - a field name
/// - a field type, itself wrapping up options describing
///
/// It consists of
/// - a field name
/// - a field type, itself wrapping up options describing
/// how the field should be indexed.
#[derive(Clone, Debug)]
pub struct FieldEntry {

@@ -21,7 +21,6 @@ pub struct FieldEntry {
}

impl FieldEntry {

/// Creates a new u64 field entry in the schema, given
/// a name, and some options.
pub fn new_text(field_name: String, field_type: TextOptions) -> FieldEntry {

@@ -30,7 +29,7 @@ impl FieldEntry {
field_type: FieldType::Str(field_type),
}
}

/// Creates a new u64 field entry in the schema, given
/// a name, and some options.
pub fn new_u64(field_name: String, field_type: IntOptions) -> FieldEntry {

@@ -39,7 +38,7 @@ impl FieldEntry {
field_type: FieldType::U64(field_type),
}
}

/// Creates a new i64 field entry in the schema, given
/// a name, and some options.
pub fn new_i64(field_name: String, field_type: IntOptions) -> FieldEntry {

@@ -48,48 +47,42 @@ impl FieldEntry {
field_type: FieldType::I64(field_type),
}
}

/// Returns the name of the field
pub fn name(&self,) -> &str {
pub fn name(&self) -> &str {
&self.name
}

/// Returns the field type
pub fn field_type(&self,) -> &FieldType {
pub fn field_type(&self) -> &FieldType {
&self.field_type
}

/// Returns true iff the field is indexed
pub fn is_indexed(&self,) -> bool {
pub fn is_indexed(&self) -> bool {
match self.field_type {
FieldType::Str(ref options) => options.get_indexing_options().is_indexed(),
FieldType::U64(ref options) => options.is_indexed(),
FieldType::I64(ref options) => options.is_indexed(),
}
}

/// Returns true iff the field is a int (signed or unsigned) fast field
pub fn is_int_fast(&self,) -> bool {
pub fn is_int_fast(&self) -> bool {
match self.field_type {
FieldType::U64(ref options) => options.is_fast(),
FieldType::I64(ref options) => options.is_fast(),
_ => false,
}
}

/// Returns true iff the field is stored
pub fn is_stored(&self,) -> bool {
pub fn is_stored(&self) -> bool {
match self.field_type {
FieldType::U64(ref options) => {
options.is_stored()
}
FieldType::I64(ref options) => {
options.is_stored()
}
FieldType::Str(ref options) => {
options.is_stored()
}
FieldType::U64(ref options) => options.is_stored(),
FieldType::I64(ref options) => options.is_stored(),
FieldType::Str(ref options) => options.is_stored(),
}
}
}
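A quick sketch of the accessors above in use; it assumes, as the tests in this diff suggest, that `TEXT` denotes an indexed (and not fast) text configuration:

```rust
let entry = FieldEntry::new_text(String::from("title"), TEXT);
assert_eq!(entry.name(), "title");
// TEXT is assumed to carry indexing options, so the field reports as indexed...
assert!(entry.is_indexed());
// ...and a text field is never an int fast field.
assert!(!entry.is_int_fast());
```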
@@ -105,17 +98,17 @@ impl Serialize for FieldEntry {
FieldType::Str(ref options) => {
s.serialize_field("type", "text")?;
s.serialize_field("options", options)?;
},
}
FieldType::U64(ref options) => {
s.serialize_field("type", "u64")?;
s.serialize_field("options", options)?;
},
}
FieldType::I64(ref options) => {
s.serialize_field("type", "i64")?;
s.serialize_field("options", options)?;
}
}

s.end()
}
}

@@ -126,7 +119,11 @@ impl<'de> Deserialize<'de> for FieldEntry {
{
#[derive(Deserialize)]
#[serde(field_identifier, rename_all = "lowercase")]
enum Field { Name, Type, Options };
enum Field {
Name,
Type,
Options,
};

const FIELDS: &'static [&'static str] = &["name", "type", "options"];

@@ -161,13 +158,24 @@ impl<'de> Deserialize<'de> for FieldEntry {
}
Field::Options => {
match ty {
None => return Err(de::Error::custom("The `type` field must be specified before `options`")),
None => {
return Err(de::Error::custom("The `type` field must be specified before `options`",),)
}
Some(ty) => {
match ty {
"text" => field_type = Some(FieldType::Str(map.next_value()?)),
"u64" => field_type = Some(FieldType::U64(map.next_value()?)),
"i64" => field_type = Some(FieldType::I64(map.next_value()?)),
_ => return Err(de::Error::custom(format!("Unrecognised type {}", ty)))
"text" => {
field_type = Some(FieldType::Str(map.next_value()?))
}
"u64" => {
field_type = Some(FieldType::U64(map.next_value()?))
}
"i64" => {
field_type = Some(FieldType::I64(map.next_value()?))
}
_ => {
return Err(de::Error::custom(format!("Unrecognised type {}",
ty)))
}
}
}
}

@@ -177,12 +185,13 @@ impl<'de> Deserialize<'de> for FieldEntry {

let name = name.ok_or_else(|| de::Error::missing_field("name"))?;
ty.ok_or_else(|| de::Error::missing_field("ty"))?;
let field_type = field_type.ok_or_else(|| de::Error::missing_field("options"))?;
let field_type = field_type
.ok_or_else(|| de::Error::missing_field("options"))?;

Ok(FieldEntry {
name: name,
field_type: field_type,
})
name: name,
field_type: field_type,
})
}
}

@@ -197,7 +206,7 @@ mod tests {
use super::*;
use schema::TEXT;
use serde_json;

#[test]
fn test_json_serialization() {
let field_value = FieldEntry::new_text(String::from("title"), TEXT);

@@ -217,10 +226,10 @@ mod tests {
let field_value: FieldEntry = serde_json::from_str(expected).unwrap();

assert_eq!("title", field_value.name);

match field_value.field_type {
FieldType::Str(_) => assert!(true),
_ => panic!("expected FieldType::Str")
_ => panic!("expected FieldType::Str"),
}
}
}
Some files were not shown because too many files have changed in this diff.