Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-06 01:02:55 +00:00)
Format
@@ -20,7 +20,6 @@ impl CountCollector {
}
}

impl Collector for CountCollector {
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
Ok(())

@@ -13,7 +13,7 @@ use termdict::TermStreamerBuilder;
use std::collections::BTreeSet;
use termdict::TermMerger;
use postings::SkipResult;
use std::{u64, usize};
use std::{usize, u64};
use std::iter::Peekable;

use DocId;
@@ -48,7 +48,6 @@ impl<'a> Ord for Hit<'a> {
}
}

struct SegmentFacetCounter {
pub facet_reader: FacetReader,
pub facet_ords: Vec<u64>,
@@ -59,16 +58,10 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
if facet_bytes.is_empty() {
0
} else {
facet_bytes
.iter()
.cloned()
.filter(|b| *b == 0u8)
.count() + 1
facet_bytes.iter().cloned().filter(|b| *b == 0u8).count() + 1
}
}
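
The `facet_depth` helper reformatted above counts 0u8 separators: tantivy encodes a facet path by joining its segments with a 0 byte. A minimal standalone sketch of that rule, with the separator assumption made explicit:

    // Depth of an encoded facet, assuming segments are joined by a 0u8
    // separator (e.g. "/a/b" encodes to b"a\0b").
    fn facet_depth(facet_bytes: &[u8]) -> usize {
        if facet_bytes.is_empty() {
            0
        } else {
            // one separator byte per level below the first
            facet_bytes.iter().cloned().filter(|b| *b == 0u8).count() + 1
        }
    }

    fn main() {
        assert_eq!(facet_depth(b""), 0); // root
        assert_eq!(facet_depth(b"country"), 1); // "/country"
        assert_eq!(facet_depth(b"country\0fr"), 2); // "/country/fr"
    }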
/// Collector for faceting
///
/// The collector collects all facets. You need to configure it
@@ -215,23 +208,24 @@ pub struct FacetCollector {
// collapse facet_id -> facet_ord
current_collapse_facet_ords: Vec<u64>,

facets: BTreeSet<Facet>
facets: BTreeSet<Facet>,
}

fn skip<'a, I: Iterator<Item=&'a Facet>>(target: &[u8], collapse_it: &mut Peekable<I>) -> SkipResult {
fn skip<'a, I: Iterator<Item = &'a Facet>>(
target: &[u8],
collapse_it: &mut Peekable<I>,
) -> SkipResult {
loop {
match collapse_it.peek() {
Some(facet_bytes) => {
match facet_bytes.encoded_bytes().cmp(&target) {
Ordering::Less => {}
Ordering::Greater => {
return SkipResult::OverStep;
}
Ordering::Equal => {
return SkipResult::Reached;
}
Some(facet_bytes) => match facet_bytes.encoded_bytes().cmp(&target) {
Ordering::Less => {}
Ordering::Greater => {
return SkipResult::OverStep;
}
}
Ordering::Equal => {
return SkipResult::Reached;
}
},
None => {
return SkipResult::End;
}
@@ -241,7 +235,6 @@ fn skip<'a, I: Iterator<Item=&'a Facet>>(target: &[u8], collapse_it: &mut Peekab
}
impl FacetCollector {

/// Create a facet collector to collect the facets
/// from a specific facet `Field`.
///
@@ -261,7 +254,6 @@ impl FacetCollector {
}
}

/// Adds a facet that we want to record counts
///
/// Adding facet `Facet::from("/country")` for instance,
@@ -272,12 +264,15 @@ impl FacetCollector {
/// If you need the correct number of unique documents for two such facets,
/// just add them in separate `FacetCollector`.
pub fn add_facet<T>(&mut self, facet_from: T)
where Facet: From<T> {
where
Facet: From<T>,
{
let facet = Facet::from(facet_from);
for old_facet in &self.facets {
assert!(
!old_facet.is_prefix_of(&facet),
"Tried to add a facet which is a descendant of an already added facet.");
"Tried to add a facet which is a descendant of an already added facet."
);
assert!(
!facet.is_prefix_of(&old_facet),
"Tried to add a facet which is an ancestor of an already added facet."
@@ -292,10 +287,7 @@ impl FacetCollector {
self.current_segment_counts.clear();
let mut collapse_facet_it = self.facets.iter().peekable();
self.current_collapse_facet_ords.push(0);
let mut facet_streamer = facet_reader
.facet_dict()
.range()
.into_stream();
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
if !facet_streamer.advance() {
return;
}
@@ -315,7 +307,8 @@ impl FacetCollector {
continue 'outer;
} else if depth == collapse_depth + 1 {
collapsed_id = self.current_collapse_facet_ords.len();
self.current_collapse_facet_ords.push(facet_streamer.term_ord());
self.current_collapse_facet_ords
.push(facet_streamer.term_ord());
self.current_segment_collapse_mapping.push(collapsed_id);
} else {
self.current_segment_collapse_mapping.push(collapsed_id);
@@ -335,13 +328,11 @@ impl FacetCollector {
fn finalize_segment(&mut self) {
if self.ff_reader.is_some() {
self.segment_counters.push(
SegmentFacetCounter {
facet_reader: unsafe { self.ff_reader.take().unwrap().into_inner() },
facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()),
facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()),
}
);
self.segment_counters.push(SegmentFacetCounter {
facet_reader: unsafe { self.ff_reader.take().unwrap().into_inner() },
facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()),
facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()),
});
}
}

@@ -361,14 +352,9 @@ impl FacetCollector {
.map(|segment_counter| &segment_counter.facet_counts[..])
.collect();

let facet_streams = self.segment_counters
.iter()
.map(|seg_counts| seg_counts
.facet_reader
.facet_dict()
.range()
.into_stream())
.map(|seg_counts| seg_counts.facet_reader.facet_dict().range().into_stream())
.collect::<Vec<_>>();

let mut facet_merger = TermMerger::new(facet_streams);
@@ -398,51 +384,43 @@ impl FacetCollector {
facet_counts.insert(Facet::from_encoded(bytes), count);
}
}
FacetCounts {
facet_counts
}
FacetCounts { facet_counts }
}
}
impl Collector for FacetCollector {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.finalize_segment();
let facet_reader = reader.facet_reader(self.field)?;
self.set_collapse_mapping(&facet_reader);
self.current_segment_counts.resize(self.current_collapse_facet_ords.len(), 0);
self.current_segment_counts
.resize(self.current_collapse_facet_ords.len(), 0);
self.ff_reader = Some(UnsafeCell::new(facet_reader));
Ok(())
}

fn collect(&mut self, doc: DocId, _: Score) {
let facet_reader: &mut FacetReader =
unsafe {
&mut *self.ff_reader
.as_ref()
.expect("collect() was called before set_segment. This should never happen.")
.get()
};
let facet_reader: &mut FacetReader = unsafe {
&mut *self.ff_reader
.as_ref()
.expect("collect() was called before set_segment. This should never happen.")
.get()
};
facet_reader.facet_ords(doc, &mut self.facet_ords);
let mut previous_collapsed_ord: usize = usize::MAX;
for &facet_ord in &self.facet_ords {
let collapsed_ord = self.current_segment_collapse_mapping[facet_ord as usize];
self.current_segment_counts[collapsed_ord] +=
if collapsed_ord == previous_collapsed_ord {
0
} else {
1
};
self.current_segment_counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord
{
0
} else {
1
};
previous_collapsed_ord = collapsed_ord;
}
}
}
/// Intermediary result of the `FacetCollector` that stores
/// the facet counts for all the segments.
pub struct FacetCounts {
@@ -450,20 +428,20 @@ pub struct FacetCounts {
}

impl FacetCounts {

pub fn get<'a, T>(&'a self, facet_from: T) -> impl Iterator<Item=(&'a Facet, u64)>
where Facet: From<T> {
pub fn get<'a, T>(&'a self, facet_from: T) -> impl Iterator<Item = (&'a Facet, u64)>
where
Facet: From<T>,
{
let facet = Facet::from(facet_from);
let left_bound = Bound::Excluded(facet.clone());
let right_bound =
if facet.is_root() {
Bound::Unbounded
} else {
let mut facet_after_bytes = facet.encoded_bytes().to_owned();
facet_after_bytes.push(1u8);
let facet_after = Facet::from_encoded(facet_after_bytes);
Bound::Excluded(facet_after)
};
let right_bound = if facet.is_root() {
Bound::Unbounded
} else {
let mut facet_after_bytes = facet.encoded_bytes().to_owned();
facet_after_bytes.push(1u8);
let facet_after = Facet::from_encoded(facet_after_bytes);
Bound::Excluded(facet_after)
};

self.facet_counts
.range((left_bound, right_bound))
@@ -471,8 +449,9 @@ impl FacetCounts {
}

pub fn top_k<T>(&self, facet: T, k: usize) -> Vec<(&Facet, u64)>
where Facet: From<T> {

where
Facet: From<T>,
{
let mut heap = BinaryHeap::with_capacity(k);
let mut it = self.get(facet);

@@ -480,9 +459,7 @@ impl FacetCounts {
heap.push(Hit { count, facet });
}

let mut lowest_count: u64 = heap.peek()
.map(|hit| hit.count)
.unwrap_or(u64::MIN);
let mut lowest_count: u64 = heap.peek().map(|hit| hit.count).unwrap_or(u64::MIN);
for (facet, count) in it {
if count > lowest_count {
lowest_count = count;
@@ -496,16 +473,13 @@ impl FacetCounts {
.map(|hit| (hit.facet, hit.count))
.collect::<Vec<_>>()
}
}
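
The `get` method above enumerates descendants of a facet out of a `BTreeMap` with a range query: because the 0u8 path separator sorts below every other byte, all encoded descendants of `facet` fall strictly between `facet` and `facet` followed by a 1u8 byte. A self-contained sketch of that bound trick, with plain `Vec<u8>` keys standing in for encoded facets:

    use std::collections::BTreeMap;
    use std::ops::Bound;

    fn main() {
        let mut counts: BTreeMap<Vec<u8>, u64> = BTreeMap::new();
        counts.insert(b"country".to_vec(), 5);
        counts.insert(b"country\0fr".to_vec(), 2);
        counts.insert(b"country\0us".to_vec(), 3);
        counts.insert(b"lang".to_vec(), 4);

        // right bound: the facet's bytes followed by 1u8, which sorts just
        // above every "facet ++ 0u8 ++ ..." descendant key
        let facet = b"country".to_vec();
        let mut upper = facet.clone();
        upper.push(1u8);
        let under: Vec<_> = counts
            .range((Bound::Excluded(facet), Bound::Excluded(upper)))
            .collect();
        assert_eq!(under.len(), 2); // "/country/fr" and "/country/us" only
    }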
#[cfg(test)]
mod tests {
use test::Bencher;
use core::Index;
use schema::{SchemaBuilder, Document, Facet};
use schema::{Document, Facet, SchemaBuilder};
use query::AllQuery;
use super::{FacetCollector, FacetCounts};
use std::iter;
@@ -550,21 +524,23 @@ mod tests {
.get("/top1")
.map(|(facet, count)| (facet.to_string(), count))
.collect();
assert_eq!(facets, [
("/top1/mid0", 50),
("/top1/mid1", 50),
("/top1/mid2", 50),
("/top1/mid3", 50),
].iter()
.map(|&(facet_str, count)| {
(String::from(facet_str), count)
})
.collect::<Vec<_>>());
assert_eq!(
facets,
[
("/top1/mid0", 50),
("/top1/mid1", 50),
("/top1/mid2", 50),
("/top1/mid3", 50),
].iter()
.map(|&(facet_str, count)| (String::from(facet_str), count))
.collect::<Vec<_>>()
);
}
}

#[test]
#[should_panic(expected="Tried to add a facet which is a descendant of an already added facet.")]
#[should_panic(expected = "Tried to add a facet which is a descendant of \
an already added facet.")]
fn test_misused_facet_collector() {
let mut facet_collector = FacetCollector::for_field(Field(0));
facet_collector.add_facet(Facet::from("/country"));
@@ -585,18 +561,14 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);

let mut docs: Vec<Document> = vec![
("a", 10),
("b", 100),
("c", 7),
("d", 12),
("e", 21)
].into_iter()
.flat_map(|(c, count)| {
let facet = Facet::from(&format!("/facet_{}", c));
let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count)
}).collect();
let mut docs: Vec<Document> = vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)]
.into_iter()
.flat_map(|(c, count)| {
let facet = Facet::from(&format!("/facet_{}", c));
let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count)
})
.collect();
thread_rng().shuffle(&mut docs[..]);

let mut index_writer = index.writer(3_000_000).unwrap();
@@ -620,8 +592,9 @@ mod tests {
vec![
(&Facet::from("/facet_b"), 100),
(&Facet::from("/facet_e"), 21),
(&Facet::from("/facet_d"), 12)
]);
(&Facet::from("/facet_d"), 12),
]
);
}
}

@@ -632,12 +605,12 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);

let mut docs = vec!();
let mut docs = vec![];
for val in 0..50 {
let facet = Facet::from(&format!("/facet_{}", val));
for _ in 0..val*val {
docs.push(doc!(facet_field=>facet.clone()));
}
for _ in 0..val * val {
docs.push(doc!(facet_field=>facet.clone()));
}
}
// 40425 docs
thread_rng().shuffle(&mut docs[..]);
@@ -656,4 +629,3 @@ mod tests {
});
}
}

@@ -134,7 +134,8 @@ where
addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes."
);
let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) };
let val_unshifted_unmasked: u64 =
unsafe { *(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
(val_shifted & mask)
} else {
@@ -165,7 +166,8 @@ where
for output_val in output.iter_mut() {
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) };
let val_unshifted_unmasked: u64 =
unsafe { *(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
*output_val = val_shifted & mask;
addr_in_bits += num_bits;
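
Both hunks above decode a bit-packed fast field with an unaligned 8-byte load followed by a shift and a mask; the 7 trailing padding bytes asserted earlier are what make that load safe at the end of the data. A safe sketch of the same arithmetic, using `u64::from_le_bytes` instead of the raw pointer read (little-endian packing assumed):

    // Read the i-th `num_bits`-bit value from a densely bit-packed buffer.
    fn get_packed(data: &[u8], idx: usize, num_bits: usize) -> u64 {
        let mask = (1u64 << num_bits) - 1;
        let addr_in_bits = idx * num_bits;
        let addr = addr_in_bits >> 3; // byte holding the value's first bit
        let bit_shift = addr_in_bits & 7; // bit offset inside that byte
        let mut bytes = [0u8; 8];
        bytes.copy_from_slice(&data[addr..addr + 8]);
        let val_unshifted_unmasked = u64::from_le_bytes(bytes);
        (val_unshifted_unmasked >> bit_shift) & mask
    }

    fn main() {
        // 3-bit values [1, 5, 3] packed: 1 | 5 << 3 | 3 << 6 = 0b1110_1001
        let data = [0b1110_1001u8, 0, 0, 0, 0, 0, 0, 0, 0]; // payload + padding
        assert_eq!(get_packed(&data, 0, 3), 1);
        assert_eq!(get_packed(&data, 1, 3), 5);
        assert_eq!(get_packed(&data, 2, 3), 3);
    }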
@@ -8,7 +8,6 @@ use std::io::{self, Read};
use directory::ReadOnlySource;
use common::BinarySerializable;

#[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)]
pub struct FileAddr {
field: Field,
@@ -19,7 +18,7 @@ impl FileAddr {
fn new(field: Field, idx: usize) -> FileAddr {
FileAddr {
field: field,
idx: idx
idx: idx,
}
}
}
@@ -36,7 +35,7 @@ impl BinarySerializable for FileAddr {
let idx = VInt::deserialize(reader)?.0 as usize;
Ok(FileAddr {
field: field,
idx: idx
idx: idx,
})
}
}
@@ -59,7 +58,7 @@ impl<W: Write> CompositeWrite<W> {

/// Start writing a new field.
pub fn for_field(&mut self, field: Field) -> &mut CountingWriter<W> {
self.for_field_with_idx(field, 0)
self.for_field_with_idx(field, 0)
}

/// Start writing a new field.
@@ -71,7 +70,6 @@ impl<W: Write> CompositeWrite<W> {
&mut self.write
}

/// Close the composite file.
///
/// An index of the different field offsets
@@ -89,9 +87,7 @@ impl<W: Write> CompositeWrite<W> {

let mut prev_offset = 0;
for (offset, file_addr) in offset_fields {
VInt((offset - prev_offset) as u64).serialize(
&mut self.write,
)?;
VInt((offset - prev_offset) as u64).serialize(&mut self.write)?;
file_addr.serialize(&mut self.write)?;
prev_offset = offset;
}
@@ -103,7 +99,6 @@ impl<W: Write> CompositeWrite<W> {
}
}
/// A composite file is an abstraction to store a
/// file partitioned by field.
///
@@ -174,20 +169,20 @@ impl CompositeFile {
/// to a given `Field` and stored in a `CompositeFile`.
pub fn open_read_with_idx(&self, field: Field, idx: usize) -> Option<ReadOnlySource> {
self.offsets_index
.get(&FileAddr {field: field, idx: idx})
.map(|&(from, to)| {
self.data.slice(from, to)
.get(&FileAddr {
field: field,
idx: idx,
})
.map(|&(from, to)| self.data.slice(from, to))
}
}

#[cfg(test)]
mod test {

use std::io::Write;
use super::{CompositeWrite, CompositeFile};
use directory::{RAMDirectory, Directory};
use super::{CompositeFile, CompositeWrite};
use directory::{Directory, RAMDirectory};
use schema::Field;
use common::VInt;
use common::BinarySerializable;
@@ -231,7 +226,6 @@ mod test {
assert_eq!(payload_4, 2u64);
}
}

}
}
@@ -25,7 +25,9 @@ fn compress_sorted(vals: &[u32], output: &mut [u8], offset: u32) -> usize {
}

fn uncompress_sorted(compressed_data: &[u8], output: &mut [u32], offset: u32) -> usize {
unsafe { simdcomp::uncompress_sorted(compressed_data.as_ptr(), output.as_mut_ptr(), offset) }
unsafe {
simdcomp::uncompress_sorted(compressed_data.as_ptr(), output.as_mut_ptr(), offset)
}
}

fn compress_unsorted(vals: &[u32], output: &mut [u8]) -> usize {

@@ -108,17 +108,21 @@ impl SegmentReader {
pub fn facet_reader(&self, field: Field) -> Result<FacetReader> {
let field_entry = self.schema.get_field_entry(field);
if field_entry.field_type() != &FieldType::HierarchicalFacet {
return Err(ErrorKind::InvalidArgument(format!("The field {:?} is not a \
hierarchical facet.", field_entry)).into())
return Err(ErrorKind::InvalidArgument(format!(
"The field {:?} is not a \
hierarchical facet.",
field_entry
)).into());
}
let term_ords_reader = self.multi_value_reader(field)?;
let termdict_source = self.termdict_composite
.open_read(field)
.ok_or_else(|| {
ErrorKind::InvalidArgument(format!("The field \"{}\" is a hierarchical \
but this segment does not seem to have the field term \
dictionary.", field_entry.name()))
})?;
let termdict_source = self.termdict_composite.open_read(field).ok_or_else(|| {
ErrorKind::InvalidArgument(format!(
"The field \"{}\" is a hierarchical \
but this segment does not seem to have the field term \
dictionary.",
field_entry.name()
))
})?;
let termdict = TermDictionaryImpl::from_source(termdict_source);
let facet_reader = FacetReader::new(term_ords_reader, termdict);
Ok(facet_reader)
@@ -1,7 +1,7 @@
use std::iter;
use std::mem;
use postings::UnorderedTermId;
use super::heap::{Heap, HeapAllocable, BytesRef};
use super::heap::{BytesRef, Heap, HeapAllocable};

mod murmurhash2 {

@@ -53,9 +53,6 @@ mod murmurhash2 {
}
}

/// Split the thread memory budget into
/// - the heap size
/// - the hash table "table" itself.
@@ -63,14 +60,10 @@ mod murmurhash2 {
/// Returns (the heap size in bytes, the hash table size in number of bits)
pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
let table_size_limit: usize = per_thread_memory_budget / 3;
let compute_table_size = |num_bits: usize| {
(1 << num_bits) * mem::size_of::<KeyValue>()
};
let compute_table_size = |num_bits: usize| (1 << num_bits) * mem::size_of::<KeyValue>();
let table_num_bits: usize = (1..)
.into_iter()
.take_while(|num_bits: &usize| {
compute_table_size(*num_bits) < table_size_limit
})
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
.last()
.expect(&format!(
"Per thread memory is too small: {}",
@@ -81,7 +74,6 @@ pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
(heap_size, table_num_bits)
}
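
`split_memory` above gives the hash table at most a third of the per-thread budget, picks the largest power-of-two table that fits under that limit, and hands the remainder to the heap. A runnable sketch under the assumption of an 8-byte table entry, which reproduces the figures asserted by the test later in this diff (the real `KeyValue` layout may differ):

    use std::mem;

    // Assumed 8-byte table entry; the real KeyValue layout may differ.
    struct KeyValue {
        key_value_addr: u32,
        hash: u32,
    }

    fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
        // the table may take up to a third of the budget
        let table_size_limit: usize = per_thread_memory_budget / 3;
        let compute_table_size = |num_bits: usize| (1 << num_bits) * mem::size_of::<KeyValue>();
        // largest power-of-two table still under the limit
        let table_num_bits: usize = (1..)
            .take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
            .last()
            .expect("Per thread memory is too small");
        let heap_size = per_thread_memory_budget - compute_table_size(table_num_bits);
        (heap_size, table_num_bits)
    }

    fn main() {
        // matches the value asserted by `test_split_memory` in this diff
        assert_eq!(split_memory(10_000_000), (7_902_848, 18));
    }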
/// `KeyValue` is the item stored in the hash table.
/// The key is actually a `BytesRef` object stored in an external heap.
/// The `value_addr` also points to an address in the heap.
@@ -101,7 +93,6 @@ impl KeyValue {
}
}

/// Customized `HashMap` with string keys
///
/// This `HashMap` takes String as keys. Keys are
@@ -118,7 +109,6 @@ pub struct TermHashMap<'a> {
occupied: Vec<usize>,
}

struct QuadraticProbing {
hash: usize,
i: usize,
@@ -141,7 +131,6 @@ impl QuadraticProbing {
}
}
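
`QuadraticProbing` above drives collision resolution in `TermHashMap`. A hedged sketch of one common probe sequence over a power-of-two table, using triangular-number steps (which visit every slot when the table size is a power of two); the exact step function tantivy uses is not shown in this hunk and may differ:

    // Illustrative quadratic probing; `mask` is table_size - 1 for a
    // power-of-two table.
    struct QuadraticProbing {
        hash: usize,
        i: usize,
        mask: usize,
    }

    impl QuadraticProbing {
        fn new(hash: usize, mask: usize) -> QuadraticProbing {
            QuadraticProbing { hash, i: 0, mask }
        }
        fn next_probe(&mut self) -> usize {
            self.i += 1;
            // triangular steps: +1, +3, +6, +10, ...
            (self.hash + self.i * (self.i + 1) / 2) & self.mask
        }
    }

    fn main() {
        let mut p = QuadraticProbing::new(42, 0xff);
        let probes: Vec<usize> = (0..4).map(|_| p.next_probe()).collect();
        assert_eq!(probes, vec![43, 45, 48, 52]);
    }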
impl<'a> TermHashMap<'a> {
pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> TermHashMap<'a> {
let table_size = 1 << num_bucket_power_of_2;
@@ -178,18 +167,17 @@ impl<'a> TermHashMap<'a> {
}

pub fn iter<'b: 'a>(&'b self) -> impl Iterator<Item = (&'a [u8], u32, UnorderedTermId)> + 'b {
self.occupied
.iter()
.cloned()
.map(move |bucket: usize| {
let kv = self.table[bucket];
let (key, offset) = self.get_key_value(kv.key_value_addr);
(key, offset, bucket as UnorderedTermId)
})
self.occupied.iter().cloned().map(move |bucket: usize| {
let kv = self.table[bucket];
let (key, offset) = self.get_key_value(kv.key_value_addr);
(key, offset, bucket as UnorderedTermId)
})
}

pub fn get_or_create<S: AsRef<[u8]>, V: HeapAllocable>(&mut self, key: S) -> (UnorderedTermId, &mut V) {
pub fn get_or_create<S: AsRef<[u8]>, V: HeapAllocable>(
&mut self,
key: S,
) -> (UnorderedTermId, &mut V) {
let key_bytes: &[u8] = key.as_ref();
let hash = murmurhash2::murmurhash2(key.as_ref());
let mut probe = self.probe(hash);
@@ -212,7 +200,6 @@ impl<'a> TermHashMap<'a> {
}
}

#[cfg(test)]
mod tests {

@@ -223,7 +210,6 @@ mod tests {
use std::collections::HashSet;
use super::split_memory;

struct TestValue {
val: u32,
_addr: u32,
@@ -245,7 +231,6 @@ mod tests {
assert_eq!(split_memory(10_000_000), (7902848, 18));
}

#[test]
fn test_hash_map() {
let heap = Heap::with_capacity(2_000_000);
@@ -319,5 +304,4 @@ mod tests {
});
}
}
@@ -39,6 +39,5 @@ fn test_unrolled_linked_list() {
assert!(!it.next().is_some());
}
}

}
}

@@ -20,19 +20,17 @@ use std::sync::Arc;
use std::sync::RwLock;
use tempdir::TempDir;

/// Returns None iff the file exists, can be read, but is empty (and hence
/// cannot be mmapped).
///
fn open_mmap(full_path: &PathBuf) -> result::Result<Option<MmapReadOnly>, OpenReadError> {
let file = File::open(&full_path)
.map_err(|e| {
if e.kind() == io::ErrorKind::NotFound {
OpenReadError::FileDoesNotExist(full_path.clone())
} else {
OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e))
}
})?;
let file = File::open(&full_path).map_err(|e| {
if e.kind() == io::ErrorKind::NotFound {
OpenReadError::FileDoesNotExist(full_path.clone())
} else {
OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e))
}
})?;

let meta_data = file.metadata()
.map_err(|e| IOError::with_path(full_path.to_owned(), e))?;
@@ -44,9 +42,7 @@ fn open_mmap(full_path: &PathBuf) -> result::Result<Option<MmapReadOnly>, OpenRe
}
MmapReadOnly::open(&file)
.map(Some)
.map_err(|e| {
From::from(IOError::with_path(full_path.to_owned(), e))
})
.map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
}
#[derive(Default, Clone, Debug, Serialize, Deserialize)]
@@ -79,7 +75,6 @@ impl Default for MmapCache {
}

impl MmapCache {

/// Removes a `MmapReadOnly` entry from the mmap cache.
fn discard_from_cache(&mut self, full_path: &Path) -> bool {
self.cache.remove(full_path).is_some()
@@ -95,21 +90,21 @@ impl MmapCache {

fn get_mmap(&mut self, full_path: PathBuf) -> Result<Option<MmapReadOnly>, OpenReadError> {
Ok(match self.cache.entry(full_path.clone()) {
HashMapEntry::Occupied(occupied_entry) => {
let mmap = occupied_entry.get();
self.counters.hit += 1;
Some(mmap.clone())
}
HashMapEntry::Vacant(vacant_entry) => {
self.counters.miss += 1;
if let Some(mmap) = open_mmap(&full_path)? {
vacant_entry.insert(mmap.clone());
Some(mmap)
} else {
None
}
}
})
HashMapEntry::Occupied(occupied_entry) => {
let mmap = occupied_entry.get();
self.counters.hit += 1;
Some(mmap.clone())
}
HashMapEntry::Vacant(vacant_entry) => {
self.counters.miss += 1;
if let Some(mmap) = open_mmap(&full_path)? {
vacant_entry.insert(mmap.clone());
Some(mmap)
} else {
None
}
}
})
}
}

@@ -257,9 +252,9 @@ impl Directory for MmapDirectory {
})?;

Ok(mmap_cache
.get_mmap(full_path)?
.map(ReadOnlySource::Mmap)
.unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty())))
.get_mmap(full_path)?
.map(ReadOnlySource::Mmap)
.unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty())))
}

fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
@@ -292,20 +287,19 @@ impl Directory for MmapDirectory {
Ok(BufWriter::new(Box::new(writer)))
}

/// Any entry associated to the path in the mmap will be
/// removed before the file is deleted.
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
debug!("Deleting file {:?}", path);
let full_path = self.resolve_path(path);
let mut mmap_cache = self.mmap_cache
.write()
.map_err(|_| {
let msg = format!("Failed to acquired write lock \
on mmap cache while deleting {:?}",
path);
IOError::with_path(path.to_owned(), make_io_err(msg))
})?;
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
let msg = format!(
"Failed to acquired write lock \
on mmap cache while deleting {:?}",
path
);
IOError::with_path(path.to_owned(), make_io_err(msg))
})?;
mmap_cache.discard_from_cache(path);

// Removing the entry in the MMap cache.
@@ -415,7 +409,10 @@ mod tests {
}
for (i, path) in paths.iter().enumerate() {
mmap_directory.delete(path).unwrap();
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), num_paths - i - 1);
assert_eq!(
mmap_directory.get_cache_info().mmapped.len(),
num_paths - i - 1
);
}
}
assert_eq!(mmap_directory.get_cache_info().counters.hit, 10);
@@ -4,7 +4,7 @@ use super::shared_vec_slice::SharedVecSlice;
use common::HasLen;
use std::slice;
use std::io::{self, Read};
use stable_deref_trait::{StableDeref, CloneStableDeref};
use stable_deref_trait::{CloneStableDeref, StableDeref};

/// Read object that represents files in tantivy.
///

@@ -4,7 +4,6 @@ use termdict::TermOrdinal;
use schema::Facet;
use termdict::{TermDictionary, TermDictionaryImpl};

/// The facet reader makes it possible to access the list of
/// facets associated to a given document in a specific
/// segment.
@@ -24,7 +23,6 @@ pub struct FacetReader {
}

impl FacetReader {

/// Creates a new `FacetReader`.
///
/// A facet reader just wraps :
@@ -32,8 +30,10 @@ impl FacetReader {
/// access the list of facet ords for a given document.
/// - a `TermDictionaryImpl` that helps associating a facet to
/// an ordinal and vice versa.
pub fn new(term_ords: MultiValueIntFastFieldReader,
term_dict: TermDictionaryImpl) -> FacetReader {
pub fn new(
term_ords: MultiValueIntFastFieldReader,
term_dict: TermDictionaryImpl,
) -> FacetReader {
FacetReader {
term_ords: term_ords,
term_dict: term_dict,
@@ -56,7 +56,8 @@ impl FacetReader {

/// Given a term ordinal returns the term associated to it.
pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
let found_term = self.term_dict.ord_to_term(facet_ord as u64, output.inner_buffer_mut());
let found_term = self.term_dict
.ord_to_term(facet_ord as u64, output.inner_buffer_mut());
assert!(found_term, "Term ordinal {} no found.", facet_ord);
}

@@ -64,4 +65,4 @@ impl FacetReader {
pub fn facet_ords(&mut self, doc: DocId, output: &mut Vec<u64>) {
self.term_ords.get_vals(doc, output);
}
}
}
@@ -95,7 +95,9 @@ mod tests {
add_single_field_doc(&mut fast_field_writers, *FIELD, 13u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 14u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 2u64);
fast_field_writers.serialize(&mut serializer, HashMap::new()).unwrap();
fast_field_writers
.serialize(&mut serializer, HashMap::new())
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
@@ -129,7 +131,9 @@ mod tests {
add_single_field_doc(&mut fast_field_writers, *FIELD, 1_002u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 1_501u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 215u64);
fast_field_writers.serialize(&mut serializer, HashMap::new()).unwrap();
fast_field_writers
.serialize(&mut serializer, HashMap::new())
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
@@ -164,7 +168,9 @@ mod tests {
for _ in 0..10_000 {
add_single_field_doc(&mut fast_field_writers, *FIELD, 100_000u64);
}
fast_field_writers.serialize(&mut serializer, HashMap::new()).unwrap();
fast_field_writers
.serialize(&mut serializer, HashMap::new())
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
@@ -199,7 +205,9 @@ mod tests {
5_000_000_000_000_000_000u64 + i,
);
}
fast_field_writers.serialize(&mut serializer, HashMap::new()).unwrap();
fast_field_writers
.serialize(&mut serializer, HashMap::new())
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
@@ -238,7 +246,9 @@ mod tests {
doc.add_i64(i64_field, i);
fast_field_writers.add_document(&doc);
}
fast_field_writers.serialize(&mut serializer, HashMap::new()).unwrap();
fast_field_writers
.serialize(&mut serializer, HashMap::new())
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
@@ -277,7 +287,9 @@ mod tests {
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
let doc = Document::default();
fast_field_writers.add_document(&doc);
fast_field_writers.serialize(&mut serializer, HashMap::new()).unwrap();
fast_field_writers
.serialize(&mut serializer, HashMap::new())
.unwrap();
serializer.close().unwrap();
}

@@ -311,7 +323,9 @@ mod tests {
for x in &permutation {
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
}
fast_field_writers.serialize(&mut serializer, HashMap::new()).unwrap();
fast_field_writers
.serialize(&mut serializer, HashMap::new())
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
@@ -366,7 +380,9 @@ mod tests {
for x in &permutation {
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
}
fast_field_writers.serialize(&mut serializer, HashMap::new()).unwrap();
fast_field_writers
.serialize(&mut serializer, HashMap::new())
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
@@ -398,7 +414,9 @@ mod tests {
for x in &permutation {
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
}
fast_field_writers.serialize(&mut serializer, HashMap::new()).unwrap();
fast_field_writers
.serialize(&mut serializer, HashMap::new())
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
@@ -2,4 +2,4 @@ mod writer;
mod reader;

pub use self::writer::MultiValueIntFastFieldWriter;
pub use self::reader::MultiValueIntFastFieldReader;
pub use self::reader::MultiValueIntFastFieldReader;

@@ -18,8 +18,10 @@ pub struct MultiValueIntFastFieldReader {
}

impl MultiValueIntFastFieldReader {

pub(crate) fn open(idx_reader: U64FastFieldReader, vals_reader: U64FastFieldReader) -> MultiValueIntFastFieldReader {
pub(crate) fn open(
idx_reader: U64FastFieldReader,
vals_reader: U64FastFieldReader,
) -> MultiValueIntFastFieldReader {
MultiValueIntFastFieldReader {
idx_reader: idx_reader,
vals_reader: vals_reader,
@@ -38,12 +40,11 @@ impl MultiValueIntFastFieldReader {
}
}

#[cfg(test)]
mod tests {

use core::Index;
use schema::{Facet, Document, SchemaBuilder};
use schema::{Document, Facet, SchemaBuilder};

#[test]
fn test_multifastfield_reader() {
@@ -51,7 +52,9 @@ mod tests {
let facet_field = schema_builder.add_facet_field("facets");
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 30_000_000).expect("Failed to create index writer.");
let mut index_writer = index
.writer_with_num_threads(1, 30_000_000)
.expect("Failed to create index writer.");
{
let mut doc = Document::new();
doc.add_facet(facet_field, "/category/cat2");
@@ -72,9 +75,7 @@ mod tests {
index.load_searchers().expect("Reloading searchers");
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
let mut facet_reader = segment_reader
.facet_reader(facet_field)
.unwrap();
let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap();

let mut facet = Facet::root();
{
@@ -108,7 +109,5 @@ mod tests {
facet_reader.facet_ords(2, &mut vals);
assert_eq!(&vals[..], &[4]);
}

}
}
}
@@ -38,10 +38,15 @@ impl MultiValueIntFastFieldWriter {
}

/// Push the fast fields value to the `FastFieldWriter`.
pub fn serialize(&self, serializer: &mut FastFieldSerializer, mapping: &HashMap<UnorderedTermId, usize>) -> io::Result<()> {
pub fn serialize(
&self,
serializer: &mut FastFieldSerializer,
mapping: &HashMap<UnorderedTermId, usize>,
) -> io::Result<()> {
{
// writing the offset index
let mut doc_index_serializer = serializer.new_u64_fast_field_with_idx(self.field, 0, self.vals.len() as u64, 0)?;
let mut doc_index_serializer =
serializer.new_u64_fast_field_with_idx(self.field, 0, self.vals.len() as u64, 0)?;
for &offset in &self.doc_index {
doc_index_serializer.add_val(offset)?;
}
@@ -50,13 +55,13 @@ impl MultiValueIntFastFieldWriter {
}
{
// writing the values themselves.
let mut value_serializer = serializer.new_u64_fast_field_with_idx(self.field, 0u64, mapping.len() as u64, 1)?;
let mut value_serializer =
serializer.new_u64_fast_field_with_idx(self.field, 0u64, mapping.len() as u64, 1)?;
for val in &self.vals {
value_serializer.add_val(*mapping.get(val).expect("Missing term ordinal") as u64)?;
}
value_serializer.close_field()?;
}
Ok(())

}
}
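
The writer above lays a multivalued fast field out as two columns: column 0 stores one offset per document into column 1, which stores the (ordinal-remapped) values. A sketch of how a reader resolves one document's values under that layout (names are illustrative, not tantivy's API):

    // doc i's values live at vals[doc_index[i]..doc_index[i + 1]]
    fn get_vals(doc_index: &[u64], vals: &[u64], doc: usize, output: &mut Vec<u64>) {
        let start = doc_index[doc] as usize;
        let stop = doc_index[doc + 1] as usize;
        output.clear();
        output.extend_from_slice(&vals[start..stop]);
    }

    fn main() {
        // three docs with 2, 0 and 1 values respectively
        let doc_index = [0u64, 2, 2, 3];
        let vals = [10u64, 11, 12];
        let mut out = Vec::new();
        get_vals(&doc_index, &vals, 0, &mut out);
        assert_eq!(out, vec![10, 11]);
        get_vals(&doc_index, &vals, 2, &mut out);
        assert_eq!(out, vec![12]);
    }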
@@ -5,7 +5,7 @@ use DocId;
use schema::SchemaBuilder;
use std::path::Path;
use schema::FAST;
use directory::{WritePtr, RAMDirectory, Directory};
use directory::{Directory, RAMDirectory, WritePtr};
use fastfield::{FastFieldSerializer, FastFieldsWriter};
use schema::FieldType;
use std::mem;
@@ -88,7 +88,7 @@ impl FastFieldReader for U64FastFieldReader {
fn is_enabled(field_type: &FieldType) -> bool {
match *field_type {
FieldType::U64(ref integer_options) => integer_options.is_fast(),
FieldType::HierarchicalFacet => { true },
FieldType::HierarchicalFacet => true,
_ => false,
}
}
@@ -113,7 +113,6 @@ impl FastFieldReader for U64FastFieldReader {
u64::deserialize(&mut cursor).expect("Failed to read the min_value of fast field.");
amplitude =
u64::deserialize(&mut cursor).expect("Failed to read the amplitude of fast field.");

}
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
@@ -127,7 +126,6 @@ impl FastFieldReader for U64FastFieldReader {
}
}

impl From<Vec<u64>> for U64FastFieldReader {
fn from(vals: Vec<u64>) -> U64FastFieldReader {
let mut schema_builder = SchemaBuilder::default();
@@ -136,22 +134,23 @@ impl From<Vec<u64>> for U64FastFieldReader {
let path = Path::new("__dummy__");
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(path).expect(
"With a RAMDirectory, this should never fail.",
);
let mut serializer = FastFieldSerializer::from_write(write).expect(
"With a RAMDirectory, this should never fail.",
);
let write: WritePtr = directory
.open_write(path)
.expect("With a RAMDirectory, this should never fail.");
let mut serializer = FastFieldSerializer::from_write(write)
.expect("With a RAMDirectory, this should never fail.");
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
{
let fast_field_writer = fast_field_writers.get_field_writer(field).expect(
"With a RAMDirectory, this should never fail.",
);
let fast_field_writer = fast_field_writers
.get_field_writer(field)
.expect("With a RAMDirectory, this should never fail.");
for val in vals {
fast_field_writer.add_val(val);
}
}
fast_field_writers.serialize(&mut serializer, HashMap::new()).unwrap();
fast_field_writers
.serialize(&mut serializer, HashMap::new())
.unwrap();
serializer.close().unwrap();
}

@@ -159,9 +158,9 @@ impl From<Vec<u64>> for U64FastFieldReader {
let composite_file =
CompositeFile::open(&source).expect("Failed to read the composite file");

let field_source = composite_file.open_read(field).expect(
"File component not found",
);
let field_source = composite_file
.open_read(field)
.expect("File component not found");
U64FastFieldReader::open(field_source)
}
}
@@ -222,7 +221,9 @@ impl FastFieldReader for I64FastFieldReader {
/// # Panics
/// Panics if the data is corrupted.
fn open(data: ReadOnlySource) -> I64FastFieldReader {
I64FastFieldReader { underlying: U64FastFieldReader::open(data) }
I64FastFieldReader {
underlying: U64FastFieldReader::open(data),
}
}

fn is_enabled(field_type: &FieldType) -> bool {

@@ -35,7 +35,9 @@ impl FastFieldSerializer {
pub fn from_write(write: WritePtr) -> io::Result<FastFieldSerializer> {
// just making room for the pointer to header.
let composite_write = CompositeWrite::wrap(write);
Ok(FastFieldSerializer { composite_write: composite_write })
Ok(FastFieldSerializer {
composite_write: composite_write,
})
}

/// Start serializing a new u64 fast field
@@ -54,12 +56,12 @@ impl FastFieldSerializer {
field: Field,
min_value: u64,
max_value: u64,
idx: usize) -> io::Result<FastSingleFieldSerializer<CountingWriter<WritePtr>>> {
idx: usize,
) -> io::Result<FastSingleFieldSerializer<CountingWriter<WritePtr>>> {
let field_write = self.composite_write.for_field_with_idx(field, idx);
FastSingleFieldSerializer::open(field_write, min_value, max_value)
}

/// Closes the serializer
///
/// After this call the data must be persistently save on disk.

@@ -1,4 +1,4 @@
use schema::{Schema, Field, Document, Cardinality};
use schema::{Cardinality, Document, Field, Schema};
use fastfield::FastFieldSerializer;
use std::io;
use schema::Value;
@@ -25,12 +25,11 @@ impl FastFieldsWriter {

for (field_id, field_entry) in schema.fields().iter().enumerate() {
let field = Field(field_id as u32);
let default_value =
if let FieldType::I64(_) = *field_entry.field_type() {
common::i64_to_u64(0i64)
} else {
0u64
};
let default_value = if let FieldType::I64(_) = *field_entry.field_type() {
common::i64_to_u64(0i64)
} else {
0u64
};
match *field_entry.field_type() {
FieldType::I64(ref int_options) | FieldType::U64(ref int_options) => {
match int_options.get_fastfield_cardinality() {
@@ -50,7 +49,7 @@ impl FastFieldsWriter {
let fast_field_writer = MultiValueIntFastFieldWriter::new(field);
multi_values_writers.push(fast_field_writer);
}
_ => {},
_ => {}
}
}
FastFieldsWriter {
@@ -64,7 +63,7 @@ impl FastFieldsWriter {
pub(crate) fn new(fields: Vec<Field>) -> FastFieldsWriter {
FastFieldsWriter {
single_value_writers: fields.into_iter().map(IntFastFieldWriter::new).collect(),
multi_values_writers: vec!(),
multi_values_writers: vec![],
}
}
@@ -73,23 +72,22 @@ impl FastFieldsWriter {
// TODO optimize
self.single_value_writers
.iter_mut()
.find(|field_writer| {
field_writer.field() == field
})
.find(|field_writer| field_writer.field() == field)
}

/// Returns the fast field multi-value writer for the given field.
///
/// Returns None if the field does not exist, or is not
/// configured as a multivalued fastfield in the schema.
pub(crate) fn get_multivalue_writer(&mut self, field: Field) -> Option<&mut MultiValueIntFastFieldWriter> {
pub(crate) fn get_multivalue_writer(
&mut self,
field: Field,
) -> Option<&mut MultiValueIntFastFieldWriter> {
// TODO optimize
// TODO expose for users
self.multi_values_writers
.iter_mut()
.find(|multivalue_writer| {
multivalue_writer.field() == field
})
.find(|multivalue_writer| multivalue_writer.field() == field)
}

/// Indexes all of the fastfields of a new document.
@@ -104,9 +102,11 @@ impl FastFieldsWriter {

/// Serializes all of the `FastFieldWriter`s by pushing them in
/// order to the fast field serializer.
pub fn serialize(&self,
serializer: &mut FastFieldSerializer,
mapping: HashMap<Field, HashMap<UnorderedTermId, usize>>) -> io::Result<()> {
pub fn serialize(
&self,
serializer: &mut FastFieldSerializer,
mapping: HashMap<Field, HashMap<UnorderedTermId, usize>>,
) -> io::Result<()> {
for field_writer in &self.single_value_writers {
field_writer.serialize(serializer)?;
}
@@ -201,9 +201,9 @@ impl IntFastFieldWriter {
/// associated to the document with the `DocId` n.
/// (Well, `n-1` actually because of 0-indexing)
pub fn add_val(&mut self, val: u64) {
VInt(val).serialize(&mut self.vals).expect(
"unable to serialize VInt to Vec",
);
VInt(val)
.serialize(&mut self.vals)
.expect("unable to serialize VInt to Vec");

if val > self.val_max {
self.val_max = val;
@@ -215,7 +215,6 @@ impl IntFastFieldWriter {
self.val_count += 1;
}

/// Extract the value associated to the fast field for
/// this document.
///
@@ -228,13 +227,11 @@ impl IntFastFieldWriter {
/// only the first one is taken in account.
fn extract_val(&self, doc: &Document) -> u64 {
match doc.get_first(self.field) {
Some(v) => {
match *v {
Value::U64(ref val) => *val,
Value::I64(ref val) => common::i64_to_u64(*val),
_ => panic!("Expected a u64field, got {:?} ", v),
}
}
Some(v) => match *v {
Value::U64(ref val) => *val,
Value::I64(ref val) => common::i64_to_u64(*val),
_ => panic!("Expected a u64field, got {:?} ", v),
},
None => self.val_if_missing,
}
}
@@ -17,7 +17,7 @@ use super::operation::AddOperation;
use postings::MultiFieldPostingsWriter;
use tokenizer::BoxedTokenizer;
use tokenizer::FacetTokenizer;
use tokenizer::{Tokenizer, TokenStream};
use tokenizer::{TokenStream, Tokenizer};
use schema::Value;

/// A `SegmentWriter` is in charge of creating segment index from a
@@ -126,11 +126,7 @@ impl<'a> SegmentWriter<'a> {
/// Indexes a new document
///
/// As a user, you should rather use `IndexWriter`'s add_document.
pub fn add_document(
&mut self,
add_operation: AddOperation,
schema: &Schema,
) -> io::Result<()> {
pub fn add_document(&mut self, add_operation: AddOperation, schema: &Schema) -> io::Result<()> {
let doc_id = self.max_doc;
let mut doc = add_operation.document;
self.doc_opstamps.push(add_operation.opstamp);
@@ -144,17 +140,16 @@ impl<'a> SegmentWriter<'a> {
}
match *field_options.field_type() {
FieldType::HierarchicalFacet => {
let facets: Vec<&[u8]> = field_values.iter()
.flat_map(|field_value| {
match field_value.value() {
&Value::Facet(ref facet) => Some(facet.encoded_bytes()),
_ => {
panic!("Expected hierarchical facet");
}
let facets: Vec<&[u8]> = field_values
.iter()
.flat_map(|field_value| match field_value.value() {
&Value::Facet(ref facet) => Some(facet.encoded_bytes()),
_ => {
panic!("Expected hierarchical facet");
}
})
.collect();
let mut term = unsafe {Term::with_capacity(100)};
let mut term = unsafe { Term::with_capacity(100) };
term.set_field(field);
for facet_bytes in facets {
let mut unordered_term_id_opt = None;
@@ -163,7 +158,8 @@ impl<'a> SegmentWriter<'a> {
.token_stream(&fake_str)
.process(&mut |ref token| {
term.set_text(&token.text);
let unordered_term_id = self.multifield_postings.subscribe(doc_id, &term);
let unordered_term_id =
self.multifield_postings.subscribe(doc_id, &term);
unordered_term_id_opt = Some(unordered_term_id);
});
@@ -176,25 +172,26 @@ impl<'a> SegmentWriter<'a> {
}
}
FieldType::Str(_) => {
let num_tokens =
if let Some(ref mut tokenizer) = self.tokenizers[field.0 as usize] {
let texts: Vec<&str> = field_values
.iter()
.flat_map(|field_value| match *field_value.value() {
Value::Str(ref text) => Some(text.as_str()),
_ => None,
})
.collect();
if texts.is_empty() {
0
} else {
let mut token_stream = tokenizer.token_stream_texts(&texts[..]);
self.multifield_postings
.index_text(doc_id, field, &mut token_stream)
}
} else {
let num_tokens = if let Some(ref mut tokenizer) =
self.tokenizers[field.0 as usize]
{
let texts: Vec<&str> = field_values
.iter()
.flat_map(|field_value| match *field_value.value() {
Value::Str(ref text) => Some(text.as_str()),
_ => None,
})
.collect();
if texts.is_empty() {
0
} else {
let mut token_stream = tokenizer.token_stream_texts(&texts[..]);
self.multifield_postings
.index_text(doc_id, field, &mut token_stream)
}
} else {
0
};
self.fieldnorms_writer
.get_field_writer(field)
.map(|field_norms_writer| {
@@ -226,9 +223,7 @@ impl<'a> SegmentWriter<'a> {
}
}
self.fieldnorms_writer.fill_val_up_to(doc_id);
doc.filter_fields(|field| {
schema.get_field_entry(field).is_stored()
});
doc.filter_fields(|field| schema.get_field_entry(field).is_stored());
let doc_writer = self.segment_serializer.get_store_writer();
doc_writer.store(&doc)?;
self.max_doc += 1;

@@ -34,8 +34,6 @@ extern crate log;
#[macro_use]
extern crate error_chain;

extern crate regex;
extern crate tempfile;
extern crate atomicwrites;
extern crate bit_set;
extern crate byteorder;
@@ -49,11 +47,13 @@ extern crate itertools;
extern crate lz4;
extern crate num_cpus;
extern crate owning_ref;
extern crate regex;
extern crate rust_stemmers;
extern crate serde;
extern crate serde_json;
extern crate stable_deref_trait;
extern crate tempdir;
extern crate tempfile;
extern crate time;
extern crate uuid;

@@ -94,39 +94,52 @@ mod tests {
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let searcher = index.searcher();
let query = TermQuery::new(Term::from_field_text(title, "abc"), IndexRecordOption::WithFreqsAndPositions);
let query = TermQuery::new(
Term::from_field_text(title, "abc"),
IndexRecordOption::WithFreqsAndPositions,
);
let weight = query.specialized_weight(&*searcher);
{
let mut scorer = weight.specialized_scorer(searcher.segment_reader(0u32)).unwrap();
let mut scorer = weight
.specialized_scorer(searcher.segment_reader(0u32))
.unwrap();
scorer.advance();
assert_eq!(&[0,1,2], scorer.postings().positions());
assert_eq!(&[0, 1, 2], scorer.postings().positions());
scorer.advance();
assert_eq!(&[0,5], scorer.postings().positions());
assert_eq!(&[0, 5], scorer.postings().positions());
}
{
let mut scorer = weight.specialized_scorer(searcher.segment_reader(0u32)).unwrap();
let mut scorer = weight
.specialized_scorer(searcher.segment_reader(0u32))
.unwrap();
scorer.advance();
scorer.advance();
assert_eq!(&[0,5], scorer.postings().positions());
assert_eq!(&[0, 5], scorer.postings().positions());
}
{
let mut scorer = weight.specialized_scorer(searcher.segment_reader(0u32)).unwrap();
let mut scorer = weight
.specialized_scorer(searcher.segment_reader(0u32))
.unwrap();
assert_eq!(scorer.skip_next(1), SkipResult::Reached);
assert_eq!(scorer.doc(), 1);
assert_eq!(&[0,5], scorer.postings().positions());
assert_eq!(&[0, 5], scorer.postings().positions());
}
{
let mut scorer = weight.specialized_scorer(searcher.segment_reader(0u32)).unwrap();
let mut scorer = weight
.specialized_scorer(searcher.segment_reader(0u32))
.unwrap();
assert_eq!(scorer.skip_next(1002), SkipResult::Reached);
assert_eq!(scorer.doc(), 1002);
assert_eq!(&[0,5], scorer.postings().positions());
assert_eq!(&[0, 5], scorer.postings().positions());
}
{
let mut scorer = weight.specialized_scorer(searcher.segment_reader(0u32)).unwrap();
let mut scorer = weight
.specialized_scorer(searcher.segment_reader(0u32))
.unwrap();
assert_eq!(scorer.skip_next(100), SkipResult::Reached);
assert_eq!(scorer.skip_next(1002), SkipResult::Reached);
assert_eq!(scorer.doc(), 1002);
assert_eq!(&[0,5], scorer.postings().positions());
assert_eq!(&[0, 5], scorer.postings().positions());
}
}
@@ -1,15 +1,15 @@
|
||||
use DocId;
|
||||
use schema::Term;
|
||||
use postings::{InvertedIndexSerializer, FieldSerializer};
|
||||
use postings::{FieldSerializer, InvertedIndexSerializer};
|
||||
use std::io;
|
||||
use std::collections::HashMap;
|
||||
use postings::Recorder;
|
||||
use Result;
|
||||
use schema::{Schema, Field};
|
||||
use schema::{Field, Schema};
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::DerefMut;
|
||||
use datastruct::stacker::{TermHashMap, Heap};
|
||||
use postings::{NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
|
||||
use datastruct::stacker::{Heap, TermHashMap};
|
||||
use postings::{NothingRecorder, TFAndPositionRecorder, TermFrequencyRecorder};
|
||||
use schema::FieldEntry;
|
||||
use schema::FieldType;
|
||||
use tokenizer::Token;
|
||||
@@ -17,39 +17,31 @@ use tokenizer::TokenStream;
|
||||
use schema::IndexRecordOption;
|
||||
use postings::UnorderedTermId;
|
||||
|
||||
|
||||
fn posting_from_field_entry<'a>(
|
||||
field_entry: &FieldEntry,
|
||||
heap: &'a Heap,
|
||||
) -> Box<PostingsWriter + 'a> {
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => {
|
||||
text_options
|
||||
FieldType::Str(ref text_options) => text_options
|
||||
.get_indexing_options()
|
||||
.map(|indexing_options| {
|
||||
match indexing_options.index_option() {
|
||||
IndexRecordOption::Basic => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
}
|
||||
IndexRecordOption::WithFreqs => {
|
||||
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed(heap)
|
||||
}
|
||||
IndexRecordOption::WithFreqsAndPositions => {
|
||||
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed(heap)
|
||||
}
|
||||
.map(|indexing_options| match indexing_options.index_option() {
|
||||
IndexRecordOption::Basic => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
}
|
||||
IndexRecordOption::WithFreqs => {
|
||||
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed(heap)
|
||||
}
|
||||
IndexRecordOption::WithFreqsAndPositions => {
|
||||
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed(heap)
|
||||
}
|
||||
})
|
||||
.unwrap_or_else(|| {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
})
|
||||
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)),
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
}
|
||||
FieldType::U64(_) |
|
||||
FieldType::I64(_) |
|
||||
FieldType::HierarchicalFacet => SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub struct MultiFieldPostingsWriter<'a> {
heap: &'a Heap,
term_index: TermHashMap<'a>,
@@ -88,7 +80,10 @@ impl<'a> MultiFieldPostingsWriter<'a> {
/// It pushes all terms, one field at a time, towards the
/// postings serializer.
#[allow(needless_range_loop)]
pub fn serialize(&self, serializer: &mut InvertedIndexSerializer) -> Result<HashMap<Field, HashMap<UnorderedTermId, usize>>> {
pub fn serialize(
&self,
serializer: &mut InvertedIndexSerializer,
) -> Result<HashMap<Field, HashMap<UnorderedTermId, usize>>> {
let mut term_offsets: Vec<(&[u8], u32, UnorderedTermId)> = self.term_index.iter().collect();
term_offsets.sort_by_key(|&(k, _, _)| k);

@@ -99,8 +94,8 @@ impl<'a> MultiFieldPostingsWriter<'a> {
.map(|(key, _, _)| Term::wrap(key).field())
.enumerate();

let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, usize>> = HashMap::new();
let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, usize>> =
HashMap::new();

let mut prev_field = Field(u32::max_value());
for (offset, field) in term_offsets_it {
@@ -120,8 +115,9 @@ impl<'a> MultiFieldPostingsWriter<'a> {
let mut mapping = HashMap::new();
for (term_ord, term_unord_id) in term_offsets[start..stop]
.iter()
.map(|&(_,_,bucket)| bucket)
.enumerate() {
.map(|&(_, _, bucket)| bucket)
.enumerate()
{
mapping.insert(term_unord_id, term_ord);
}
unordered_term_mappings.insert(field, mapping);
@@ -144,7 +140,6 @@ impl<'a> MultiFieldPostingsWriter<'a> {
}
}

/// The `PostingsWriter` is in charge of receiving documents
/// and building a `Segment` in anonymous memory.
///
@@ -168,20 +163,22 @@ pub trait PostingsWriter {

/// Serializes the postings on disk.
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(&self,
term_addrs: &[(&[u8], u32, UnorderedTermId)],
serializer: &mut FieldSerializer,
heap: &Heap)
-> io::Result<()>;
fn serialize(
&self,
term_addrs: &[(&[u8], u32, UnorderedTermId)],
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()>;

/// Tokenize a text and subscribe all of its tokens.
fn index_text<'a>(&mut self,
term_index: &mut TermHashMap,
doc_id: DocId,
field: Field,
token_stream: &mut TokenStream,
heap: &Heap)
-> u32 {
fn index_text<'a>(
&mut self,
term_index: &mut TermHashMap,
doc_id: DocId,
field: Field,
token_stream: &mut TokenStream,
heap: &Heap,
) -> u32 {
let mut term = unsafe { Term::with_capacity(100) };
term.set_field(field);
let mut sink = |token: &Token| {
@@ -215,7 +212,6 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
}

impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {

fn subscribe(
&mut self,
term_index: &mut TermHashMap,
@@ -237,8 +233,6 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
term_ord
}

fn serialize(
&self,
term_addrs: &[(&[u8], u32, UnorderedTermId)],

@@ -9,7 +9,6 @@ use DocId;
use std::any::Any;
use core::Searcher;

/// Query that matches all of the documents.
///
/// All of the documents get the score 1f32.
@@ -34,12 +33,11 @@ impl Weight for AllWeight {
Ok(box AllScorer {
started: false,
doc: 0u32,
max_doc: reader.max_doc()
max_doc: reader.max_doc(),
})
}
}

/// Scorer associated to the `AllQuery` query.
pub struct AllScorer {
started: bool,
@@ -51,8 +49,7 @@ impl DocSet for AllScorer {
fn advance(&mut self) -> bool {
if self.started {
self.doc += 1u32;
}
else {
} else {
self.started = true;
}
self.doc < self.max_doc
@@ -71,4 +68,4 @@ impl Scorer for AllScorer {
fn score(&self) -> Score {
1f32
}
}
}

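Editor's sketch, not part of the commit: how the reformatted `AllScorer` behaves as a `DocSet`. The fields are private to the module in the real crate, so the literal construction below is only for illustration, and `doc()` is the accessor the tests earlier in this diff already use.

let mut scorer = AllScorer {
    started: false,
    doc: 0u32,
    max_doc: 3u32, // hypothetical segment holding 3 documents
};
// The first advance() only flips `started`, so doc 0 is not skipped.
while scorer.advance() {
    // Visits docs 0, 1 and 2 in order, each with the constant score 1f32.
    println!("doc={} score={}", scorer.doc(), scorer.score());
}
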
@@ -24,4 +24,4 @@ pub use self::scorer::EmptyScorer;
pub use self::scorer::Scorer;
pub use self::term_query::TermQuery;
pub use self::weight::Weight;
pub use self::all_query::{AllQuery, AllWeight, AllScorer};
pub use self::all_query::{AllQuery, AllScorer, AllWeight};

@@ -6,7 +6,6 @@ pub use self::phrase_query::PhraseQuery;
pub use self::phrase_weight::PhraseWeight;
pub use self::phrase_scorer::PhraseScorer;

#[cfg(test)]
mod tests {

@@ -75,8 +74,6 @@ mod tests {
assert_eq!(test_query(vec!["g", "a"]), empty_vec);
}

#[test] // motivated by #234
pub fn test_phrase_query_docfreq_order() {
let mut schema_builder = SchemaBuilder::default();
@@ -90,11 +87,13 @@ mod tests {
let doc = doc!(text_field=>"b");
index_writer.add_document(doc);
}
{ // 1
{
// 1
let doc = doc!(text_field=>"a b");
index_writer.add_document(doc);
}
{ // 2
{
// 2
let doc = doc!(text_field=>"b a");
index_writer.add_document(doc);
}

@@ -1,6 +1,6 @@
use query::Scorer;
use DocId;
use postings::{SkipResult, IntersectionDocSet, DocSet, Postings, SegmentPostings};
use postings::{DocSet, IntersectionDocSet, Postings, SegmentPostings, SkipResult};

struct PostingsWithOffset {
offset: u32,
@@ -11,7 +11,7 @@ impl PostingsWithOffset {
pub fn new(segment_postings: SegmentPostings, offset: u32) -> PostingsWithOffset {
PostingsWithOffset {
offset,
segment_postings
segment_postings,
}
}
}
@@ -49,7 +49,6 @@ pub struct PhraseScorer {
}

impl PhraseScorer {

pub fn new(term_postings: Vec<SegmentPostings>) -> PhraseScorer {
let postings_with_offsets: Vec<_> = term_postings
.into_iter()
@@ -57,12 +56,11 @@ impl PhraseScorer {
.map(|(offset, postings)| PostingsWithOffset::new(postings, offset as u32))
.collect();
PhraseScorer {
intersection_docset: IntersectionDocSet::from(postings_with_offsets)
intersection_docset: IntersectionDocSet::from(postings_with_offsets),
}
}

fn phrase_match(&self) -> bool {

// TODO maybe we could avoid decoding positions lazily for all terms
// when there are > 2 terms.
//
@@ -74,7 +72,6 @@ impl PhraseScorer {
positions_arr[docset.offset as usize] = docset.positions();
}

let num_postings = positions_arr.len() as u32;

let mut ord = 1u32;

@@ -23,7 +23,8 @@ impl Weight for PhraseWeight {
for term in &self.phrase_terms {
if let Some(postings) = reader
.inverted_index(term.field())
.read_postings(term, IndexRecordOption::WithFreqsAndPositions) {
.read_postings(term, IndexRecordOption::WithFreqsAndPositions)
{
term_postings_list.push(postings);
} else {
return Ok(box EmptyScorer);

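Editor's note on the control flow above: `PhraseWeight` bails out with an `EmptyScorer` as soon as any phrase term has no postings in the segment, since a phrase can only match documents that contain every one of its terms.
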
@@ -21,9 +21,7 @@ pub struct Document {

impl From<Vec<FieldValue>> for Document {
fn from(field_values: Vec<FieldValue>) -> Self {
Document {
field_values
}
Document { field_values }
}
}

@@ -38,7 +36,6 @@ impl PartialEq for Document {
}
}

impl Eq for Document {}

impl Document {
@@ -59,14 +56,16 @@ impl Document {

/// Retain only the fields that match the
/// predicate given as argument.
pub fn filter_fields<P: Fn(Field)->bool>(&mut self, predicate: P) {
pub fn filter_fields<P: Fn(Field) -> bool>(&mut self, predicate: P) {
self.field_values
.retain(|field_value| predicate(field_value.field()));
}

/// Adds a facet to the document.
pub fn add_facet<F>(&mut self, field: Field, path: F)
where Facet: From<F> {
where
Facet: From<F>,
{
let facet = Facet::from(path);
let value = Value::Facet(facet);
self.add(FieldValue::new(field, value));
@@ -144,9 +143,7 @@ impl BinarySerializable for Document {
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let num_field_values = VInt::deserialize(reader)?.val() as usize;
let field_values = (0..num_field_values)
.map(|_| {
FieldValue::deserialize(reader)
})
.map(|_| FieldValue::deserialize(reader))
.collect::<io::Result<Vec<FieldValue>>>()?;
Ok(Document::from(field_values))
}

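Editor's sketch of the two reformatted methods in use. The `title` and `tags` fields are hypothetical, `doc!` is the macro already used in the tests earlier in this diff, and the comments flag the remaining assumptions.

// Hypothetical fields obtained from a SchemaBuilder (not shown here).
let mut doc = doc!(title => "of mice and men");
// add_facet builds the Facet through `Facet: From<&str>`.
doc.add_facet(tags, "/lang/rust");
// Keep only the values attached to `title`
// (assumes `Field: PartialEq + Copy`, as its tuple-struct shape suggests).
doc.filter_fields(|field| field == title);
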
@@ -1,4 +1,4 @@
use std::fmt::{self, Display, Debug, Formatter};
use std::fmt::{self, Debug, Display, Formatter};
use std::str;
use std::io::{self, Read, Write};
use regex::Regex;
@@ -7,7 +7,6 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer};
use std::borrow::Cow;
use common::BinarySerializable;

const SLASH_BYTE: u8 = '/' as u8;
const ESCAPE_BYTE: u8 = '\\' as u8;

@@ -30,7 +29,6 @@ pub const FACET_SEP_BYTE: u8 = 0u8;
pub struct Facet(Vec<u8>);

impl Facet {

/// Returns a new instance of the "root facet"
/// Equivalent to `/`.
pub fn root() -> Facet {
@@ -66,7 +64,9 @@ impl Facet {
/// contains a `/` or a `\`, it should be escaped
/// using a backslash `\`.
pub fn from_text<'a, T>(path: &'a T) -> Facet
where T: ?Sized + AsRef<str> {
where
T: ?Sized + AsRef<str>,
{
From::from(path)
}

@@ -75,9 +75,10 @@ impl Facet {
///
/// The steps are expected to be unescaped.
pub fn from_path<Path>(path: Path) -> Facet
where
Path: IntoIterator,
Path::Item: ToString {
where
Path: IntoIterator,
Path::Item: ToString,
{
let mut facet_bytes: Vec<u8> = Vec::with_capacity(100);
let mut step_it = path.into_iter();
if let Some(step) = step_it.next() {
@@ -95,7 +96,6 @@ impl Facet {
&mut self.0
}

/// Returns `true` iff other is a subfacet of `self`.
pub fn is_prefix_of(&self, other: &Facet) -> bool {
let self_bytes: &[u8] = self.encoded_bytes();
@@ -116,7 +116,6 @@ impl Borrow<[u8]> for Facet {
}

impl<'a, T: ?Sized + AsRef<str>> From<&'a T> for Facet {

fn from(path_asref: &'a T) -> Facet {
#[derive(Copy, Clone)]
enum State {
@@ -129,9 +128,7 @@ impl<'a, T: ?Sized + AsRef<str>> From<&'a T> for Facet {
let path_bytes = path.as_bytes();
for &c in &path_bytes[1..] {
match (state, c) {
(State::Idle, ESCAPE_BYTE) => {
state = State::Escaped
}
(State::Idle, ESCAPE_BYTE) => state = State::Escaped,
(State::Idle, SLASH_BYTE) => {
facet_encoded.push(FACET_SEP_BYTE);
}
@@ -179,16 +176,19 @@ fn escape_slashes(s: &str) -> Cow<str> {

impl Serialize for Facet {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer {
where
S: Serializer,
{
serializer.serialize_str(&self.to_string())
}
}

impl<'de> Deserialize<'de> for Facet {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> where
D: Deserializer<'de> {
<&'de str as Deserialize<'de>>::deserialize(deserializer)
.map(Facet::from)
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
<&'de str as Deserialize<'de>>::deserialize(deserializer).map(Facet::from)
}
}

@@ -199,7 +199,6 @@ impl Debug for Facet {
}
}

#[cfg(test)]
mod tests {

@@ -226,7 +225,6 @@ mod tests {
}
}

#[test]
fn test_facet_debug() {
let v = ["first", "second", "third"];
@@ -234,4 +232,4 @@ mod tests {
assert_eq!(format!("{:?}", facet), "Facet(/first/second/third)");
}

}
}

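Editor's sketch, stitched together from the code and tests above; the `from_path` call is an assumption about how the elided test body builds its facet.

let v = ["first", "second", "third"];
let facet = Facet::from_path(v.iter()); // steps are unescaped
assert_eq!(format!("{:?}", facet), "Facet(/first/second/third)");
// `from_text` / `From<&str>` parse a '/'-separated path instead.
// Internally every separator becomes the 0u8 FACET_SEP_BYTE, so the
// subfacet check is a plain byte-prefix comparison:
assert!(Facet::from("/first/second").is_prefix_of(&facet));
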
@@ -71,7 +71,7 @@ impl FieldEntry {
match self.field_type {
FieldType::Str(ref options) => options.get_indexing_options().is_some(),
FieldType::U64(ref options) | FieldType::I64(ref options) => options.is_indexed(),
FieldType::HierarchicalFacet => true
FieldType::HierarchicalFacet => true,
}
}

@@ -39,7 +39,7 @@ impl FieldType {
FieldType::U64(ref int_options) | FieldType::I64(ref int_options) => {
int_options.is_indexed()
}
FieldType::HierarchicalFacet => true
FieldType::HierarchicalFacet => true,
}
}

@@ -59,7 +59,7 @@ impl FieldType {
None
}
}
FieldType::HierarchicalFacet => Some(IndexRecordOption::Basic)
FieldType::HierarchicalFacet => Some(IndexRecordOption::Basic),
}
}

@@ -75,9 +75,7 @@ impl FieldType {
FieldType::U64(_) | FieldType::I64(_) => Err(ValueParsingError::TypeError(
format!("Expected an integer, got {:?}", json),
)),
FieldType::HierarchicalFacet => {
Ok(Value::Facet(Facet::from(field_text)))
}
FieldType::HierarchicalFacet => Ok(Value::Facet(Facet::from(field_text))),
},
JsonValue::Number(ref field_val_num) => match *self {
FieldType::I64(_) => {

@@ -1,6 +1,5 @@
use std::ops::BitOr;

/// Express whether a field is single-value or multi-valued.
#[derive(Clone, Copy, PartialEq, Eq, Debug, Serialize, Deserialize)]
pub enum Cardinality {
@@ -10,15 +9,14 @@ pub enum Cardinality {
/// The document can have any number of values associated with it.
/// This is more memory and CPU expensive than the SingleValue solution.
#[serde(rename = "multi")]
MultiValues
MultiValues,
}

/// Define how an int field should be handled by tantivy.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct IntOptions {
indexed: bool,
#[serde(skip_serializing_if="Option::is_none")]
fast: Option<Cardinality>,
#[serde(skip_serializing_if = "Option::is_none")] fast: Option<Cardinality>,
stored: bool,
}

@@ -86,7 +84,6 @@ impl Default for IntOptions {
}
}

/// Shortcut for a u64 fast field.
///
/// Such a shortcut can be composed as follows `STORED | FAST | INT_INDEXED`
@@ -114,7 +111,6 @@ pub const INT_STORED: IntOptions = IntOptions {
fast: None,
};

impl BitOr for IntOptions {
type Output = IntOptions;

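Editor's sketch: the builder chain that the reformatted tests below rely on, next to the shortcut-constant style the doc comment above advertises.

// Builder style, exactly as in the tests that follow.
let opts = IntOptions::default()
    .set_stored()
    .set_fast(Cardinality::SingleValue);
// Shortcut style via the BitOr impl above; FAST and INT_INDEXED are the
// companion constants the doc comment mentions (assumed, not shown here):
// let opts = STORED | FAST | INT_INDEXED;
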
@@ -334,8 +334,12 @@ mod tests {
#[test]
pub fn test_schema_serialization() {
let mut schema_builder = SchemaBuilder::default();
let count_options = IntOptions::default().set_stored().set_fast(Cardinality::SingleValue);
let popularity_options = IntOptions::default().set_stored().set_fast(Cardinality::SingleValue);
let count_options = IntOptions::default()
.set_stored()
.set_fast(Cardinality::SingleValue);
let popularity_options = IntOptions::default()
.set_stored()
.set_fast(Cardinality::SingleValue);
schema_builder.add_text_field("title", TEXT);
schema_builder.add_text_field("author", STRING);
schema_builder.add_u64_field("count", count_options);
@@ -399,7 +403,9 @@ mod tests {
#[test]
pub fn test_document_to_json() {
let mut schema_builder = SchemaBuilder::default();
let count_options = IntOptions::default().set_stored().set_fast(Cardinality::SingleValue);
let count_options = IntOptions::default()
.set_stored()
.set_fast(Cardinality::SingleValue);
schema_builder.add_text_field("title", TEXT);
schema_builder.add_text_field("author", STRING);
schema_builder.add_u64_field("count", count_options);
@@ -418,8 +424,12 @@ mod tests {
#[test]
pub fn test_parse_document() {
let mut schema_builder = SchemaBuilder::default();
let count_options = IntOptions::default().set_stored().set_fast(Cardinality::SingleValue);
let popularity_options = IntOptions::default().set_stored().set_fast(Cardinality::SingleValue);
let count_options = IntOptions::default()
.set_stored()
.set_fast(Cardinality::SingleValue);
let popularity_options = IntOptions::default()
.set_stored()
.set_fast(Cardinality::SingleValue);
let title_field = schema_builder.add_text_field("title", TEXT);
let author_field = schema_builder.add_text_field("author", STRING);
let count_field = schema_builder.add_u64_field("count", count_options);

@@ -44,7 +44,7 @@ impl Default for TextOptions {

/// Configuration defining indexing for a text field.
/// It wraps:
///
///
/// * record (See [`IndexRecordOption`](./enum.IndexRecordOption.html))
/// * tokenizer
#[derive(Clone, PartialEq, Eq, Debug, Serialize, Deserialize)]

@@ -26,7 +26,7 @@ impl Serialize for Value {
Value::Str(ref v) => serializer.serialize_str(v),
Value::U64(u) => serializer.serialize_u64(u),
Value::I64(u) => serializer.serialize_i64(u),
Value::Facet(ref facet) => facet.serialize(serializer)
Value::Facet(ref facet) => facet.serialize(serializer),
}
}
}
@@ -178,9 +178,7 @@ mod binary_serialize {
let value = i64::deserialize(reader)?;
Ok(Value::I64(value))
}
HIERARCHICAL_FACET_CODE => {
Ok(Value::Facet(Facet::deserialize(reader)?))
}
HIERARCHICAL_FACET_CODE => Ok(Value::Facet(Facet::deserialize(reader)?)),
_ => Err(io::Error::new(
io::ErrorKind::InvalidData,
format!("No field type is associated with code {:?}", type_code),

@@ -39,9 +39,7 @@ impl StoreReader {
}

fn block_offset(&self, doc_id: DocId) -> (DocId, u64) {
self.block_index()
.seek(doc_id + 1)
.unwrap_or((0u32, 0u64))
self.block_index().seek(doc_id + 1).unwrap_or((0u32, 0u64))
}

pub(crate) fn block_data(&self) -> &[u8] {

@@ -1,6 +1,6 @@
use directory::WritePtr;
use DocId;
use common::{VInt, BinarySerializable};
use common::{BinarySerializable, VInt};
use std::io::{self, Write};
use super::StoreReader;
use lz4;
@@ -66,10 +66,8 @@ impl StoreWriter {
pub fn stack(&mut self, store_reader: &StoreReader) -> io::Result<()> {
if !self.current_block.is_empty() {
self.write_and_compress_block()?;
self.offset_index_writer.insert(
self.doc,
&(self.writer.written_bytes() as u64),
)?;
self.offset_index_writer
.insert(self.doc, &(self.writer.written_bytes() as u64))?;
}
let doc_offset = self.doc;
let start_offset = self.writer.written_bytes() as u64;
@@ -81,9 +79,8 @@ impl StoreWriter {
// its start doc id and its start file offset.
for (next_doc_id, block_addr) in store_reader.block_index() {
self.doc = doc_offset + next_doc_id;
self.offset_index_writer.insert(
self.doc,
&(start_offset + block_addr))?;
self.offset_index_writer
.insert(self.doc, &(start_offset + block_addr))?;
}
Ok(())
}

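Editor's worked example of the offset arithmetic in `stack` above (all numbers invented): if the writer already holds docs 0..10 and has written 1_000 bytes, then `doc_offset = 10` and `start_offset = 1_000`; a stacked reader whose block index yields `(4, 250)` and `(9, 700)` is re-registered as `(14, 1_250)` and `(19, 1_700)`. Doc ids shift by `doc_offset`, block addresses by `start_offset`.
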
@@ -1,8 +1,8 @@
use fst::{IntoStreamer, Streamer};
use fst::map::{StreamBuilder, Stream};
use fst::map::{Stream, StreamBuilder};
use postings::TermInfo;
use super::TermDictionaryImpl;
use termdict::{TermOrdinal, TermDictionary, TermStreamerBuilder, TermStreamer};
use termdict::{TermDictionary, TermOrdinal, TermStreamer, TermStreamerBuilder};

/// See [`TermStreamerBuilder`](./trait.TermStreamerBuilder.html)
pub struct TermStreamerBuilderImpl<'a> {
@@ -53,7 +53,6 @@ impl<'a> TermStreamerBuilder for TermStreamerBuilderImpl<'a> {
}
}

/// See [`TermStreamer`](./trait.TermStreamer.html)
pub struct TermStreamerImpl<'a> {
fst_map: &'a TermDictionaryImpl,
@@ -88,4 +87,3 @@ impl<'a> TermStreamer for TermStreamerImpl<'a> {
&self.current_value
}
}

@@ -6,7 +6,7 @@ use common::BinarySerializable;
use schema::FieldType;
use postings::TermInfo;
use termdict::{TermDictionary, TermDictionaryBuilder, TermOrdinal};
use super::{TermStreamerImpl, TermStreamerBuilderImpl};
use super::{TermStreamerBuilderImpl, TermStreamerImpl};

fn convert_fst_error(e: fst::Error) -> io::Error {
io::Error::new(io::ErrorKind::Other, e)
@@ -95,7 +95,6 @@ pub struct TermDictionaryImpl {
values_mmap: ReadOnlySource,
}

impl<'a> TermDictionary<'a> for TermDictionaryImpl {
type Streamer = TermStreamerImpl<'a>;

@@ -105,9 +104,8 @@ impl<'a> TermDictionary<'a> for TermDictionaryImpl {
let total_len = source.len();
let length_offset = total_len - 4;
let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..];
let footer_size = u32::deserialize(&mut split_len_buffer).expect(
"Deserializing 4 bytes should always work",
) as usize;
let footer_size = u32::deserialize(&mut split_len_buffer)
.expect("Deserializing 4 bytes should always work") as usize;
let split_len = length_offset - footer_size;
let fst_source = source.slice(0, split_len);
let values_source = source.slice(split_len, length_offset);
@@ -128,16 +126,14 @@ impl<'a> TermDictionary<'a> for TermDictionaryImpl {
let mut node = fst.root();
while ord != 0 || !node.is_final() {
if let Some(transition) = node.transitions()
.take_while(|transition| {
transition.out.value() <= ord
})
.last() {
.take_while(|transition| transition.out.value() <= ord)
.last()
{
ord -= transition.out.value();
bytes.push(transition.inp);
let new_node_addr = transition.addr;
node = fst.node(new_node_addr);
}
else {
} else {
return false;
}
}

@@ -48,34 +48,30 @@ followed by a streaming through at most `1024` elements in the
term `stream`.
*/

use schema::{Term, Field, FieldType};
use schema::{Field, FieldType, Term};
use directory::ReadOnlySource;
use postings::TermInfo;

/// Position of the term in the sorted list of terms.
pub type TermOrdinal = u64;

pub use self::merger::TermMerger;

#[cfg(not(feature = "streamdict"))]
mod fstdict;
#[cfg(not(feature = "streamdict"))]
pub use self::fstdict::{TermDictionaryImpl, TermDictionaryBuilderImpl, TermStreamerImpl,
TermStreamerBuilderImpl};

pub use self::fstdict::{TermDictionaryBuilderImpl, TermDictionaryImpl, TermStreamerBuilderImpl,
TermStreamerImpl};

#[cfg(feature = "streamdict")]
mod streamdict;
#[cfg(feature = "streamdict")]
pub use self::streamdict::{TermDictionaryImpl, TermDictionaryBuilderImpl, TermStreamerImpl,
TermStreamerBuilderImpl};
pub use self::streamdict::{TermDictionaryBuilderImpl, TermDictionaryImpl, TermStreamerBuilderImpl,
TermStreamerImpl};

mod merger;
use std::io;

/// Dictionary associating sorted `&[u8]` to values
pub trait TermDictionary<'a>
where
@@ -154,7 +150,6 @@ where
fn finish(self) -> io::Result<W>;
}

/// `TermStreamer` acts as a cursor over a range of terms of a segment.
/// Terms are guaranteed to be sorted.
pub trait TermStreamer: Sized {
@@ -202,7 +197,6 @@ pub trait TermStreamer: Sized {
}
}

/// `TermStreamerBuilder` is a helper object used to define
/// a range of terms that should be streamed.
pub trait TermStreamerBuilder {
@@ -226,13 +220,12 @@ pub trait TermStreamerBuilder {
fn into_stream(self) -> Self::Streamer;
}

#[cfg(test)]
mod tests {
use super::{TermDictionaryImpl, TermDictionaryBuilderImpl, TermStreamerImpl};
use directory::{RAMDirectory, Directory, ReadOnlySource};
use super::{TermDictionaryBuilderImpl, TermDictionaryImpl, TermStreamerImpl};
use directory::{Directory, RAMDirectory, ReadOnlySource};
use std::path::PathBuf;
use schema::{FieldType, SchemaBuilder, Document, TEXT};
use schema::{Document, FieldType, SchemaBuilder, TEXT};
use core::Index;
use std::str;
use termdict::TermStreamer;
@@ -243,17 +236,15 @@ mod tests {

const BLOCK_SIZE: usize = 1_500;

fn make_term_info(val: u64) -> TermInfo {
TermInfo {
doc_freq: val as u32,
positions_offset: val * 2u64,
positions_offset: val * 2u64,
postings_offset: val * 3u64,
positions_inner_offset: 5u8,
}
}

#[test]
fn test_term_ordinals() {
const COUNTRIES: [&'static str; 7] = [
@@ -263,15 +254,15 @@ mod tests {
"Slovenia",
"Spain",
"Sweden",
"Switzerland"
"Switzerland",
];
let mut directory = RAMDirectory::create();
let path = PathBuf::from("TermDictionary");
{
let write = directory.open_write(&path).unwrap();
let field_type = FieldType::Str(TEXT);
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(write, field_type)
.unwrap();
let mut term_dictionary_builder =
TermDictionaryBuilderImpl::new(write, field_type).unwrap();
for term in COUNTRIES.iter() {
term_dictionary_builder
.insert(term.as_bytes(), &make_term_info(0u64))
@@ -283,7 +274,7 @@ mod tests {
let term_dict: TermDictionaryImpl = TermDictionaryImpl::from_source(source);
for (term_ord, term) in COUNTRIES.iter().enumerate() {
assert_eq!(term_dict.term_ord(term).unwrap(), term_ord as u64);
let mut bytes = vec!();
let mut bytes = vec![];
assert!(term_dict.ord_to_term(term_ord as u64, &mut bytes));
assert_eq!(bytes, term.as_bytes());
}
@@ -296,8 +287,8 @@ mod tests {
{
let write = directory.open_write(&path).unwrap();
let field_type = FieldType::Str(TEXT);
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(write, field_type)
.unwrap();
let mut term_dictionary_builder =
TermDictionaryBuilderImpl::new(write, field_type).unwrap();
term_dictionary_builder
.insert("abc".as_bytes(), &make_term_info(34u64))
.unwrap();
@@ -377,7 +368,6 @@ mod tests {
assert_eq!(&*term_string, "abcdef");
}

#[test]
fn test_term_dictionary_stream() {
let ids: Vec<_> = (0u32..10_000u32)
@@ -385,8 +375,8 @@ mod tests {
.collect();
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type)
.unwrap();
let mut term_dictionary_builder =
TermDictionaryBuilderImpl::new(vec![], field_type).unwrap();
for &(ref id, ref i) in &ids {
term_dictionary_builder
.insert(id.as_bytes(), &make_term_info(*i as u64))
@@ -411,13 +401,12 @@ mod tests {
term_dictionary.get(key.as_bytes());
}

#[test]
fn test_stream_high_range_prefix_suffix() {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type)
.unwrap();
let mut term_dictionary_builder =
TermDictionaryBuilderImpl::new(vec![], field_type).unwrap();
// term requires more than 16bits
term_dictionary_builder
.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))
@@ -451,8 +440,8 @@ mod tests {
.collect();
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type)
.unwrap();
let mut term_dictionary_builder =
TermDictionaryBuilderImpl::new(vec![], field_type).unwrap();
for &(ref id, ref i) in &ids {
term_dictionary_builder
.insert(id.as_bytes(), &make_term_info(*i as u64))
@@ -520,14 +509,15 @@ mod tests {
fn test_empty_string() {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type)
let mut term_dictionary_builder =
TermDictionaryBuilderImpl::new(vec![], field_type).unwrap();
term_dictionary_builder
.insert(&[], &make_term_info(1 as u64))
.unwrap();
term_dictionary_builder
.insert(&[], &make_term_info(1 as u64)).unwrap();
term_dictionary_builder
.insert(&[1u8], &make_term_info(2 as u64)).unwrap();
term_dictionary_builder
.finish().unwrap()
.insert(&[1u8], &make_term_info(2 as u64))
.unwrap();
term_dictionary_builder.finish().unwrap()
};
let source = ReadOnlySource::from(buffer);
let term_dictionary: TermDictionaryImpl = TermDictionaryImpl::from_source(source);
@@ -543,8 +533,8 @@ mod tests {
fn test_stream_range_boundaries() {
let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilderImpl::new(vec![], field_type)
.unwrap();
let mut term_dictionary_builder =
TermDictionaryBuilderImpl::new(vec![], field_type).unwrap();
for i in 0u8..10u8 {
let number_arr = [i; 1];
term_dictionary_builder

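Editor's sketch, condensed from the tests above: building a dictionary into a `Vec<u8>` and reopening it. `make_term_info` is the test helper defined above, and keys are assumed to require sorted byte order, as an fst-backed builder does.

let field_type = FieldType::Str(TEXT);
let buffer: Vec<u8> = {
    let mut builder = TermDictionaryBuilderImpl::new(vec![], field_type).unwrap();
    builder.insert("abc".as_bytes(), &make_term_info(34u64)).unwrap();
    builder.insert("abd".as_bytes(), &make_term_info(35u64)).unwrap();
    builder.finish().unwrap()
};
let dict = TermDictionaryImpl::from_source(ReadOnlySource::from(buffer));
// Term ordinals follow the sorted order of the inserted terms.
assert_eq!(dict.term_ord("abc").unwrap(), 0u64);
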
@@ -49,7 +49,6 @@ impl TermDeltaDecoder {
}
}

// code
// first bit represents whether the prefix / suffix len can be encoded
// on the same byte. (the next one)
@@ -57,18 +56,17 @@ impl TermDeltaDecoder {

#[inline(always)]
pub fn decode<'a>(&mut self, code: u8, mut cursor: &'a [u8]) -> &'a [u8] {
let (prefix_len, suffix_len): (usize, usize) =
if (code & 1u8) == 1u8 {
let b = cursor[0];
cursor = &cursor[1..];
let prefix_len = (b & 15u8) as usize;
let suffix_len = (b >> 4u8) as usize;
(prefix_len, suffix_len)
} else {
let prefix_len = u32::deserialize(&mut cursor).unwrap();
let suffix_len = u32::deserialize(&mut cursor).unwrap();
(prefix_len as usize, suffix_len as usize)
};
let (prefix_len, suffix_len): (usize, usize) = if (code & 1u8) == 1u8 {
let b = cursor[0];
cursor = &cursor[1..];
let prefix_len = (b & 15u8) as usize;
let suffix_len = (b >> 4u8) as usize;
(prefix_len, suffix_len)
} else {
let prefix_len = u32::deserialize(&mut cursor).unwrap();
let suffix_len = u32::deserialize(&mut cursor).unwrap();
(prefix_len as usize, suffix_len as usize)
};
unsafe { self.term.set_len(prefix_len) };
self.term.extend_from_slice(&(*cursor)[..suffix_len]);
&cursor[suffix_len..]

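Editor's worked example of the compact branch above: when `code & 1 == 1`, a single byte packs both lengths, low nibble first. For `b = 0b0101_0011`, `prefix_len = b & 15 = 3` and `suffix_len = b >> 4 = 5`: the decoder keeps the first 3 bytes of the previous term and appends the next 5 bytes read from the cursor.
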
@@ -6,32 +6,28 @@ use super::{Token, TokenFilter, TokenStream};
pub struct AlphaNumOnlyFilter;

pub struct AlphaNumOnlyFilterStream<TailTokenStream>
where TailTokenStream: TokenStream
where
TailTokenStream: TokenStream,
{
tail: TailTokenStream,
}

impl<TailTokenStream> AlphaNumOnlyFilterStream<TailTokenStream>
where TailTokenStream: TokenStream
where
TailTokenStream: TokenStream,
{
fn predicate(&self, token: &Token) -> bool {
token.text.chars().all(|c| c.is_ascii_alphanumeric())
}

fn wrap(
tail: TailTokenStream,
) -> AlphaNumOnlyFilterStream<TailTokenStream> {
AlphaNumOnlyFilterStream {
tail
}
fn wrap(tail: TailTokenStream) -> AlphaNumOnlyFilterStream<TailTokenStream> {
AlphaNumOnlyFilterStream { tail }
}
}

impl<TailTokenStream> TokenFilter<TailTokenStream> for AlphaNumOnlyFilter
where
TailTokenStream: TokenStream,
where
TailTokenStream: TokenStream,
{
type ResultTokenStream = AlphaNumOnlyFilterStream<TailTokenStream>;

@@ -41,8 +37,8 @@ impl<TailTokenStream> TokenFilter<TailTokenStream> for AlphaNumOnlyFilter
}

impl<TailTokenStream> TokenStream for AlphaNumOnlyFilterStream<TailTokenStream>
where
TailTokenStream: TokenStream
where
TailTokenStream: TokenStream,
{
fn token(&self) -> &Token {
self.tail.token()

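Editor's note on the predicate above: despite the general name, it keeps only tokens made entirely of ASCII alphanumerics, so "it's" and "café" would be dropped while "abc123" passes.
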
@@ -1,8 +1,7 @@
use super::{Token, Tokenizer, TokenStream};
use super::{Token, TokenStream, Tokenizer};
use std::str;
use schema::FACET_SEP_BYTE;

/// The `FacetTokenizer` processes a `Facet` binary representation
/// and emits a token for each of its parents.
///
@@ -39,27 +38,27 @@ impl<'a> Tokenizer<'a> for FacetTokenizer {
}
}

impl<'a> TokenStream for FacetTokenStream<'a> {
fn advance(&mut self) -> bool {
match self.state {
State::RootFacetNotEmitted => {
self.state =
if self.text.is_empty() {
State::Terminated
} else {
State::UpToPosition(0)
};
self.state = if self.text.is_empty() {
State::Terminated
} else {
State::UpToPosition(0)
};
true
}
State::UpToPosition(cursor) => {
let bytes: &[u8] = self.text.as_bytes();
if let Some(next_sep_pos) = bytes[cursor+1..]
if let Some(next_sep_pos) = bytes[cursor + 1..]
.iter()
.cloned()
.position(|b| b == FACET_SEP_BYTE)
.map(|pos| cursor + 1 + pos) {
let facet_part = unsafe { str::from_utf8_unchecked(&bytes[cursor..next_sep_pos]) };
.map(|pos| cursor + 1 + pos)
{
let facet_part =
unsafe { str::from_utf8_unchecked(&bytes[cursor..next_sep_pos]) };
self.token.text.push_str(facet_part);
self.state = State::UpToPosition(next_sep_pos);
} else {
@@ -69,9 +68,7 @@ impl<'a> TokenStream for FacetTokenStream<'a> {
}
true
}
State::Terminated => {
false
}
State::Terminated => false,
}
}

@@ -87,7 +84,7 @@ impl<'a> TokenStream for FacetTokenStream<'a> {
#[cfg(test)]
mod tests {

use tokenizer::{TokenStream, Token, Tokenizer};
use tokenizer::{Token, TokenStream, Tokenizer};
use super::FacetTokenizer;
use schema::Facet;

@@ -101,7 +98,9 @@ mod tests {
tokens.push(format!("{}", facet));
};
FacetTokenizer
.token_stream(unsafe { ::std::str::from_utf8_unchecked(facet.encoded_bytes()) })
.token_stream(unsafe {
::std::str::from_utf8_unchecked(facet.encoded_bytes())
})
.process(&mut add_token);
}
assert_eq!(tokens.len(), 4);
@@ -121,10 +120,12 @@ mod tests {
tokens.push(format!("{}", facet));
};
FacetTokenizer
.token_stream(unsafe { ::std::str::from_utf8_unchecked(facet.encoded_bytes()) })
.token_stream(unsafe {
::std::str::from_utf8_unchecked(facet.encoded_bytes())
})
.process(&mut add_token);
}
assert_eq!(tokens.len(), 1);
assert_eq!(tokens[0], "/");
}
}
}

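Editor's note on the stream above: the tokenizer first emits the root, then one token per additional step of the facet, so a hypothetical three-step facet such as /a/b/c would yield the 4 tokens the first test asserts (/, /a, /a/b, /a/b/c), while the root facet alone yields just /, matching the second test.
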
@@ -253,7 +253,6 @@ where
}
}

/// Trait for the pluggable components of `Tokenizer`s.
pub trait TokenFilter<TailTokenStream: TokenStream>: Clone {
/// The resulting `TokenStream` type.