Compare commits


14 Commits
0.5.0 ... 0.5.2

Author SHA1 Message Date
Paul Masurel
d40ef06dde Edited changelog and bumped version.
This branch is to be published as a hotfix for 5.1.0.

Closes #280
Closes #274
Closes #289
2018-05-05 21:00:10 -07:00
Paul Masurel
384917c17b Added comments from code review. 2018-05-05 20:51:40 -07:00
Paul Masurel
cbca95aee3 Removed large block 2018-05-02 23:32:27 -07:00
Paul Masurel
2b8618afc2 Added unit test for multivalued u64 fastfields. u64 fastfields are not dictionary encoded. 2018-05-02 22:33:38 -07:00
Paul Masurel
967cf2cb02 AllQuery handling deletes, better tests 2018-05-02 11:42:07 -07:00
Paul Masurel
0e68c4ac34 Test passing. 2018-05-01 13:42:31 -07:00
Paul Masurel
e09192b0ab issue-274 Added unit test testing if facet handle merging 2018-05-01 10:28:34 -07:00
Paul Masurel
97b7984200 Updated CHANGELOG 2018-03-10 14:08:11 +09:00
Paul Masurel
8683718159 Version bump 2018-03-10 14:01:30 +09:00
Paul Masurel
0cf274135b Clippy 2018-03-10 13:07:18 +09:00
Paul Masurel
a3b44773bb Bugfix and rustfmt 2018-03-10 12:21:50 +09:00
Paul Masurel
ec7c582109 NOBUG no-simd compression fix 2018-03-09 14:19:58 +09:00
Ewan Higgs
ee7ab72fb1 Support trailing commas using ',+ ,' trick from Blandy 2017. (#250) 2018-02-27 10:33:39 +09:00
Dylan DPC
e82859f2e6 Update Cargo.toml (#249) 2018-02-24 09:17:33 +09:00
42 changed files with 1110 additions and 357 deletions

View File

@@ -1,3 +1,14 @@
Tantivy 0.5.2
===========================
- bugfix #274
- bugfix #280
- bugfix #289
Tantivy 0.5.1
==========================
- bugfix #254 : tantivy failed if no documents in a segment contained a specific field.
Tantivy 0.5
==========================
- Faceting

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.5.0"
version = "0.5.2"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
build = "build.rs"
license = "MIT"
@@ -32,7 +32,7 @@ itertools = "0.5.9"
lz4 = "1.20"
bit-set = "0.4.0"
time = "0.1"
uuid = { version = "0.5", features = ["v4", "serde"] }
uuid = { version = "0.6", features = ["v4", "serde"] }
chan = "0.1"
crossbeam = "0.3"
futures = "0.1"

View File

@@ -60,7 +60,7 @@ impl TopCollector {
panic!("Limit must be strictly greater than 0.");
}
TopCollector {
limit: limit,
limit,
heap: BinaryHeap::with_capacity(limit),
segment_id: 0,
}
@@ -119,7 +119,7 @@ impl Collector for TopCollector {
}
} else {
let wrapped_doc = GlobalScoredDoc {
score: score,
score,
doc_address: DocAddress(self.segment_id, doc),
};
self.heap.push(wrapped_doc);

View File

@@ -16,10 +16,7 @@ pub struct FileAddr {
impl FileAddr {
fn new(field: Field, idx: usize) -> FileAddr {
FileAddr {
field: field,
idx: idx,
}
FileAddr { field, idx }
}
}
@@ -34,8 +31,8 @@ impl BinarySerializable for FileAddr {
let field = Field::deserialize(reader)?;
let idx = VInt::deserialize(reader)?.0 as usize;
Ok(FileAddr {
field: field,
idx: idx,
field,
idx,
})
}
}
@@ -169,10 +166,7 @@ impl CompositeFile {
/// to a given `Field` and stored in a `CompositeFile`.
pub fn open_read_with_idx(&self, field: Field, idx: usize) -> Option<ReadOnlySource> {
self.offsets_index
.get(&FileAddr {
field: field,
idx: idx,
})
.get(&FileAddr { field, idx, })
.map(|&(from, to)| self.data.slice(from, to))
}
}

View File

@@ -14,7 +14,7 @@ impl<'a> OpenTimer<'a> {
/// when the `OpenTimer` is dropped.
pub fn open(&mut self, name: &'static str) -> OpenTimer {
OpenTimer {
name: name,
name,
timer_tree: self.timer_tree,
start: PreciseTime::now(),
depth: self.depth + 1,
@@ -58,7 +58,7 @@ impl TimerTree {
/// Open a new named subtask
pub fn open(&mut self, name: &'static str) -> OpenTimer {
OpenTimer {
name: name,
name,
timer_tree: self,
start: PreciseTime::now(),
depth: 0,

View File

@@ -1,4 +1,4 @@
use common::bitpacker::compute_num_bits;
use common::compute_num_bits;
use common::bitpacker::{BitPacker, BitUnpacker};
use common::CountingWriter;
use std::cmp;
@@ -30,7 +30,7 @@ pub fn compress_sorted(vals: &mut [u32], output: &mut [u8], offset: u32) -> usiz
.unwrap();
}
let compressed_size = counting_writer.written_bytes();
assert_eq!(compressed_size, compute_block_size(num_bits));
assert_eq!(compressed_size, compressed_block_size(num_bits));
compressed_size
}
@@ -112,14 +112,14 @@ impl BlockDecoder {
) -> usize {
let consumed_size = {
let num_bits = compressed_data[0];
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize);
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits);
for i in 0..COMPRESSION_BLOCK_SIZE {
let delta = bit_unpacker.get(i);
let val = offset + delta as u32;
self.output[i] = val;
offset = val;
}
compute_block_size(num_bits)
compressed_block_size(num_bits)
};
self.output_len = COMPRESSION_BLOCK_SIZE;
consumed_size
@@ -127,7 +127,7 @@ impl BlockDecoder {
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
let num_bits = compressed_data[0];
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize);
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits);
for i in 0..COMPRESSION_BLOCK_SIZE {
self.output[i] = bit_unpacker.get(i) as u32;
}
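
The decoding loops above rebuild a block of sorted values by adding each bit-unpacked delta to a running offset. A self-contained sketch of just that delta-decoding step (plain `u32` deltas stand in for tantivy's `BitUnpacker` output):

```rust
/// Delta-decodes a block of sorted values in place.
///
/// `deltas[i]` holds the difference to the previous value (and to `offset`
/// for the first element), mirroring the loop in `uncompress_block_sorted`.
fn delta_decode(deltas: &mut [u32], mut offset: u32) {
    for delta in deltas.iter_mut() {
        let val = offset + *delta;
        *delta = val;
        offset = val;
    }
}

fn main() {
    // Sorted doc ids 10, 12, 15, 21 stored as deltas relative to offset 10.
    let mut block = [0u32, 2, 3, 6];
    delta_decode(&mut block, 10);
    assert_eq!(block, [10, 12, 15, 21]);
    println!("{:?}", block);
}
```

The renamed `compressed_block_size` helper presumably accounts for the one-byte `num_bits` header visible above plus the bit-packed payload.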

View File

@@ -33,7 +33,12 @@ impl IndexMeta {
impl fmt::Debug for IndexMeta {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", serde_json::ser::to_string(self).expect("JSON serialization for IndexMeta should never fail."))
write!(
f,
"{}",
serde_json::ser::to_string(self)
.expect("JSON serialization for IndexMeta should never fail.")
)
}
}

View File

@@ -7,6 +7,7 @@ use schema::Term;
use fastfield::DeleteBitSet;
use compression::CompressedIntStream;
use postings::FreqReadingOption;
use schema::FieldType;
/// The inverted index reader is in charge of accessing
/// the inverted index associated to a specific field.
@@ -31,14 +32,14 @@ pub struct InvertedIndexReader {
impl InvertedIndexReader {
pub(crate) fn new(
termdict_source: ReadOnlySource,
termdict: TermDictionaryImpl,
postings_source: ReadOnlySource,
positions_source: ReadOnlySource,
delete_bitset: DeleteBitSet,
record_option: IndexRecordOption,
) -> InvertedIndexReader {
InvertedIndexReader {
termdict: TermDictionaryImpl::from_source(termdict_source),
termdict,
postings_source,
positions_source,
delete_bitset,
@@ -46,6 +47,21 @@ impl InvertedIndexReader {
}
}
/// Creates an empty `InvertedIndexReader` object, which
/// contains no terms at all.
pub fn empty(field_type: FieldType) -> InvertedIndexReader {
let record_option = field_type
.get_index_record_option()
.unwrap_or(IndexRecordOption::Basic);
InvertedIndexReader::new(
TermDictionaryImpl::empty(field_type),
ReadOnlySource::empty(),
ReadOnlySource::empty(),
DeleteBitSet::empty(),
record_option,
)
}
/// Returns the term info associated with the term.
pub fn get_term_info(&self, term: &Term) -> Option<TermInfo> {
self.termdict.get(term.value_bytes())

View File

@@ -8,7 +8,6 @@ use core::SegmentMeta;
use fastfield::{self, FastFieldNotAvailableError};
use fastfield::DeleteBitSet;
use store::StoreReader;
use directory::ReadOnlySource;
use schema::Document;
use DocId;
use std::sync::Arc;
@@ -97,7 +96,8 @@ impl SegmentReader {
field: Field,
) -> fastfield::Result<FastFieldReader<Item>> {
let field_entry = self.schema.get_field_entry(field);
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::SingleValue) {
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::SingleValue)
{
self.fast_fields_composite
.open_read(field)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
@@ -107,19 +107,30 @@ impl SegmentReader {
}
}
pub(crate) fn fast_field_reader_with_idx<Item: FastValue>(
&self,
field: Field,
idx: usize
) -> fastfield::Result<FastFieldReader<Item>> {
if let Some(ff_source) = self.fast_fields_composite.open_read_with_idx(field, idx) {
Ok(FastFieldReader::open(ff_source))
} else {
let field_entry = self.schema.get_field_entry(field);
Err(FastFieldNotAvailableError::new(field_entry))
}
}
/// Accessor to the `MultiValueIntFastFieldReader` associated to a given `Field`.
/// May panic if the field is not a multivalued fastfield of the type `Item`.
pub fn multi_fast_field_reader<Item: FastValue>(&self, field: Field) -> fastfield::Result<MultiValueIntFastFieldReader<Item>> {
pub fn multi_fast_field_reader<Item: FastValue>(
&self,
field: Field,
) -> fastfield::Result<MultiValueIntFastFieldReader<Item>> {
let field_entry = self.schema.get_field_entry(field);
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::MultiValues) {
let idx_reader = self.fast_fields_composite
.open_read_with_idx(field, 0)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
.map(FastFieldReader::open)?;
let vals_reader = self.fast_fields_composite
.open_read_with_idx(field, 1)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
.map(FastFieldReader::open)?;
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::MultiValues)
{
let idx_reader = self.fast_field_reader_with_idx(field, 0)?;
let vals_reader = self.fast_field_reader_with_idx(field, 1)?;
Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
} else {
Err(FastFieldNotAvailableError::new(field_entry))
@@ -218,6 +229,8 @@ impl SegmentReader {
}
/// Returns a field reader associated to the field given in argument.
/// If the field was not present in the index during indexing time,
/// the InvertedIndexReader is empty.
///
/// The field reader is in charge of iterating through the
/// term dictionary associated to a specific field,
@@ -230,27 +243,38 @@ impl SegmentReader {
{
return Arc::clone(inv_idx_reader);
}
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
let record_option_opt = field_type.get_index_record_option();
let record_option = self.schema
.get_field_entry(field)
.field_type()
.get_index_record_option()
.expect("Field does not seem indexed.");
if record_option_opt.is_none() {
panic!("Field {:?} does not seem indexed.", field_entry.name());
}
let termdict_source: ReadOnlySource = self.termdict_composite
let record_option = record_option_opt.unwrap();
let postings_source_opt = self.postings_composite.open_read(field);
if postings_source_opt.is_none() {
// no documents in the segment contained this field.
// As a result, no data is associated to the inverted index.
//
// Returns an empty inverted index.
return Arc::new(InvertedIndexReader::empty(field_type.clone()));
}
let postings_source = postings_source_opt.unwrap();
let termdict_source = self.termdict_composite
.open_read(field)
.expect("Failed to open field term dictionary in composite file. Is the field indexed");
let postings_source = self.postings_composite
.open_read(field)
.expect("Index corrupted. Failed to open field postings in composite file.");
let positions_source = self.positions_composite
.open_read(field)
.expect("Index corrupted. Failed to open field positions in composite file.");
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
termdict_source,
TermDictionaryImpl::from_source(termdict_source),
postings_source,
positions_source,
self.delete_bitset.clone(),
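
The rewritten `inverted_index` above stops panicking when a segment holds no postings for an indexed field and instead hands back `InvertedIndexReader::empty(field_type)`. A minimal, tantivy-independent sketch of that null-object idea (`TermDict`, `open_term_dict`, and `doc_freq` are illustrative names, not the real API):

```rust
use std::collections::BTreeMap;

/// A toy term dictionary: maps terms to a document frequency.
struct TermDict {
    terms: BTreeMap<String, u32>,
}

impl TermDict {
    /// An empty dictionary, used when a field has no data in this segment.
    fn empty() -> TermDict {
        TermDict { terms: BTreeMap::new() }
    }

    fn doc_freq(&self, term: &str) -> Option<u32> {
        self.terms.get(term).copied()
    }
}

/// Mimics the fixed behaviour: absent data yields an empty reader
/// rather than a panic, so callers may query any indexed field safely.
fn open_term_dict(stored: Option<BTreeMap<String, u32>>) -> TermDict {
    match stored {
        Some(terms) => TermDict { terms },
        None => TermDict::empty(),
    }
}

fn main() {
    let reader = open_term_dict(None);
    // No panic: the lookup simply finds nothing.
    assert_eq!(reader.doc_freq("absent"), None);
}
```

The `test_indexedfield_not_in_documents` test added further down exercises the real code path.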

View File

@@ -118,9 +118,9 @@ struct QuadraticProbing {
impl QuadraticProbing {
fn compute(hash: usize, mask: usize) -> QuadraticProbing {
QuadraticProbing {
hash: hash,
hash,
i: 0,
mask: mask,
mask,
}
}
@@ -137,7 +137,7 @@ impl<'a> TermHashMap<'a> {
let table: Vec<KeyValue> = iter::repeat(KeyValue::default()).take(table_size).collect();
TermHashMap {
table: table.into_boxed_slice(),
heap: heap,
heap,
mask: table_size - 1,
occupied: Vec::with_capacity(table_size / 2),
}
@@ -158,11 +158,10 @@ impl<'a> TermHashMap<'a> {
(key_bytes, expull_addr)
}
pub fn set_bucket(&mut self, hash: u32, key_bytes_ref: BytesRef, bucket: usize) {
pub fn set_bucket(&mut self, hash: u32, key_value_addr: BytesRef, bucket: usize) {
self.occupied.push(bucket);
self.table[bucket] = KeyValue {
key_value_addr: key_bytes_ref,
hash: hash,
key_value_addr, hash
};
}
@@ -193,7 +192,10 @@ impl<'a> TermHashMap<'a> {
} else if kv.hash == hash {
let (stored_key, expull_addr): (&[u8], u32) = self.get_key_value(kv.key_value_addr);
if stored_key == key_bytes {
return (bucket as UnorderedTermId, self.heap.get_mut_ref(expull_addr));
return (
bucket as UnorderedTermId,
self.heap.get_mut_ref(expull_addr),
);
}
}
}

View File

@@ -106,7 +106,6 @@ pub trait DocSet {
}
}
impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
fn advance(&mut self) -> bool {
let unboxed: &mut TDocSet = self.borrow_mut();
@@ -133,11 +132,8 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
unboxed.count()
}
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.append_to_bitset(bitset);
}
}

View File

@@ -36,7 +36,7 @@ impl FacetReader {
) -> FacetReader {
FacetReader {
term_ords,
term_dict
term_dict,
}
}

View File

@@ -67,7 +67,6 @@ pub trait FastValue: Default + Clone + Copy {
fn as_u64(&self) -> u64;
}
impl FastValue for u64 {
fn from_u64(val: u64) -> Self {
val
@@ -83,10 +82,8 @@ impl FastValue for u64 {
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
match *field_type {
FieldType::U64(ref integer_options) =>
integer_options.get_fastfield_cardinality(),
FieldType::HierarchicalFacet =>
Some(Cardinality::MultiValues),
FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(),
FieldType::HierarchicalFacet => Some(Cardinality::MultiValues),
_ => None,
}
}
@@ -101,11 +98,9 @@ impl FastValue for i64 {
common::i64_to_u64(*self)
}
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
match *field_type {
FieldType::I64(ref integer_options) =>
integer_options.get_fastfield_cardinality(),
FieldType::I64(ref integer_options) => integer_options.get_fastfield_cardinality(),
_ => None,
}
}
@@ -123,7 +118,6 @@ fn value_to_u64(value: &Value) -> u64 {
}
}
#[cfg(test)]
mod tests {

View File

@@ -17,7 +17,7 @@ mod tests {
let mut schema_builder = SchemaBuilder::default();
let field = schema_builder.add_u64_field(
"multifield",
IntOptions::default().set_fast(Cardinality::MultiValues)
IntOptions::default().set_fast(Cardinality::MultiValues),
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -47,13 +47,12 @@ mod tests {
}
}
#[test]
fn test_multivalued_i64() {
let mut schema_builder = SchemaBuilder::default();
let field = schema_builder.add_i64_field(
"multifield",
IntOptions::default().set_fast(Cardinality::MultiValues)
IntOptions::default().set_fast(Cardinality::MultiValues),
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -86,4 +85,4 @@ mod tests {
assert_eq!(&vals, &[-5i64, -20i64, 1i64]);
}
}
}
}

View File

@@ -1,7 +1,6 @@
use DocId;
use fastfield::{FastFieldReader, FastValue};
/// Reader for a multivalued `u64` fast field.
///
/// The reader is implemented as two `u64` fast fields.
@@ -13,7 +12,7 @@ use fastfield::{FastFieldReader, FastValue};
#[derive(Clone)]
pub struct MultiValueIntFastFieldReader<Item: FastValue> {
idx_reader: FastFieldReader<u64>,
vals_reader: FastFieldReader<Item>
vals_reader: FastFieldReader<Item>,
}
impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
@@ -23,17 +22,35 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
) -> MultiValueIntFastFieldReader<Item> {
MultiValueIntFastFieldReader {
idx_reader,
vals_reader
vals_reader,
}
}
/// Returns `(start, stop)`, such that the values associated
/// to the given document are `start..stop`.
fn range(&self, doc: DocId) -> (u64, u64) {
let start = self.idx_reader.get(doc);
let stop = self.idx_reader.get(doc + 1);
(start, stop)
}
/// Returns the number of values associated to a given document.
pub fn num_vals(&self, doc: DocId) -> usize {
let (start, stop) = self.range(doc);
(stop - start) as usize
}
/// Returns the overall number of values associated to documents.
pub(crate) fn total_num_vals(&self) -> u64 {
self.idx_reader.max_value()
}
/// Returns the array of values associated to the given `doc`.
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
let start = self.idx_reader.get(doc) as u32;
let stop = self.idx_reader.get(doc + 1) as u32;
let (start, stop) = self.range(doc);
let len = (stop - start) as usize;
vals.resize(len, Item::default());
self.vals_reader.get_range(start, &mut vals[..]);
self.vals_reader.get_range(start as u32, &mut vals[..]);
}
}
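
As the new doc comments describe, a multivalued fast field is really two single-value fast fields: an index array whose entries at `doc` and `doc + 1` delimit a slice of a flat values array. A sketch of that layout with plain `Vec`s (illustrative only, not tantivy's bit-packed on-disk format):

```rust
/// Toy multivalued fast field: `idx[doc]..idx[doc + 1]` delimits the values of `doc`.
struct MultiValued {
    idx: Vec<u64>,  // length = number of docs + 1, non-decreasing
    vals: Vec<u64>, // all values, stacked document after document
}

impl MultiValued {
    fn range(&self, doc: usize) -> (usize, usize) {
        (self.idx[doc] as usize, self.idx[doc + 1] as usize)
    }

    fn num_vals(&self, doc: usize) -> usize {
        let (start, stop) = self.range(doc);
        stop - start
    }

    fn get_vals(&self, doc: usize, output: &mut Vec<u64>) {
        let (start, stop) = self.range(doc);
        output.clear();
        output.extend_from_slice(&self.vals[start..stop]);
    }
}

fn main() {
    // doc 0 -> [1, 2], doc 1 -> [7], doc 2 -> [] (empty)
    let ff = MultiValued {
        idx: vec![0, 2, 3, 3],
        vals: vec![1, 2, 7],
    };
    let mut out = Vec::new();
    ff.get_vals(0, &mut out);
    assert_eq!(out, vec![1, 2]);
    assert_eq!(ff.num_vals(1), 1);
    assert_eq!(ff.num_vals(2), 0);
}
```

`total_num_vals` in the real reader is simply the last (and therefore maximum) entry of the index array, which is why it can be read off `idx_reader.max_value()`.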

View File

@@ -6,13 +6,13 @@ use postings::UnorderedTermId;
use schema::{Document, Field};
use std::io;
use itertools::Itertools;
use termdict::TermOrdinal;
pub struct MultiValueIntFastFieldWriter {
field: Field,
vals: Vec<u64>,
doc_index: Vec<u64>,
is_facet: bool
is_facet: bool,
}
impl MultiValueIntFastFieldWriter {
@@ -22,7 +22,7 @@ impl MultiValueIntFastFieldWriter {
field,
vals: Vec::new(),
doc_index: Vec::new(),
is_facet
is_facet,
}
}
@@ -51,7 +51,6 @@ impl MultiValueIntFastFieldWriter {
}
}
}
}
/// Serializes fast field values by pushing them to the `FastFieldSerializer`.
@@ -68,7 +67,7 @@ impl MultiValueIntFastFieldWriter {
pub fn serialize(
&self,
serializer: &mut FastFieldSerializer,
mapping_opt: Option<&HashMap<UnorderedTermId, usize>>,
mapping_opt: Option<&HashMap<UnorderedTermId, TermOrdinal>>,
) -> io::Result<()> {
{
// writing the offset index
@@ -85,16 +84,20 @@ impl MultiValueIntFastFieldWriter {
let mut value_serializer: FastSingleFieldSerializer<_>;
match mapping_opt {
Some(mapping) => {
value_serializer =
serializer.new_u64_fast_field_with_idx(self.field, 0u64, mapping.len() as u64, 1)?;
value_serializer = serializer.new_u64_fast_field_with_idx(
self.field,
0u64,
mapping.len() as u64,
1,
)?;
for val in &self.vals {
let remapped_val = *mapping.get(val).expect("Missing term ordinal") as u64;
let remapped_val = *mapping.get(val).expect("Missing term ordinal");
value_serializer.add_val(remapped_val)?;
}
}
None => {
let val_min_max = self.vals.iter().cloned().minmax();
let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0));
let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0u64));
value_serializer =
serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
for &val in &self.vals {
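
In the facet case handled above, the buffered values are unordered term ids, and `serialize` rewrites each one through the `UnorderedTermId -> TermOrdinal` mapping before bit-packing it. A small sketch of that remapping step, with both id types simplified to `u64` and the `remap_vals` helper invented for illustration:

```rust
use std::collections::HashMap;

type UnorderedTermId = u64;
type TermOrdinal = u64;

/// Remaps buffered unordered term ids to their final, lexicographically
/// ordered term ordinals, mirroring the `Some(mapping)` branch of `serialize`.
fn remap_vals(
    vals: &[UnorderedTermId],
    mapping: &HashMap<UnorderedTermId, TermOrdinal>,
) -> Vec<TermOrdinal> {
    vals.iter()
        .map(|val| *mapping.get(val).expect("Missing term ordinal"))
        .collect()
}

fn main() {
    // Example mapping from unordered ids to sorted term ordinals.
    let mapping: HashMap<UnorderedTermId, TermOrdinal> =
        vec![(0, 1), (1, 0), (2, 2)].into_iter().collect();
    assert_eq!(remap_vals(&[0, 1, 2, 0], &mapping), vec![1, 0, 2, 1]);
}
```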

View File

@@ -24,11 +24,10 @@ pub struct FastFieldReader<Item: FastValue> {
bit_unpacker: BitUnpacker<OwningRef<ReadOnlySource, [u8]>>,
min_value_u64: u64,
max_value_u64: u64,
_phantom: PhantomData<Item>
_phantom: PhantomData<Item>,
}
impl<Item: FastValue> FastFieldReader<Item> {
/// Opens a fast field given a source.
pub fn open(data: ReadOnlySource) -> Self {
let min_value: u64;
@@ -48,11 +47,10 @@ impl<Item: FastValue> FastFieldReader<Item> {
min_value_u64: min_value,
max_value_u64: max_value,
bit_unpacker,
_phantom: PhantomData
_phantom: PhantomData,
}
}
/// Return the value associated to the given document.
///
/// This accessor should return as fast as possible.
@@ -73,7 +71,10 @@ impl<Item: FastValue> FastFieldReader<Item> {
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
pub fn get_range(&self, start: u32, output: &mut [Item]) {
///
// TODO change start to `u64`.
// For multifastfield, start is an index in a second fastfield, not a `DocId`
pub fn get_range(&self, start: u32, output: &mut [Item]) {
let output_u64: &mut [u64] = unsafe { mem::transmute(output) };
self.bit_unpacker.get_range(start, output_u64);
for out in output_u64.iter_mut() {
@@ -137,4 +138,3 @@ impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
FastFieldReader::open(field_source)
}
}

View File

@@ -36,9 +36,7 @@ impl FastFieldSerializer {
pub fn from_write(write: WritePtr) -> io::Result<FastFieldSerializer> {
// just making room for the pointer to header.
let composite_write = CompositeWrite::wrap(write);
Ok(FastFieldSerializer {
composite_write
})
Ok(FastFieldSerializer { composite_write })
}
/// Start serializing a new u64 fast field
@@ -79,11 +77,21 @@ pub struct FastSingleFieldSerializer<'a, W: Write + 'a> {
}
impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
/// Creates a new fast field serializer.
///
/// The serializer in fact encodes the values by bitpacking
/// `(val - min_value)`.
///
/// It requires a `min_value` and a `max_value` to compute
/// the minimum number of bits required to encode values.
fn open(
write: &'a mut W,
min_value: u64,
max_value: u64,
) -> io::Result<FastSingleFieldSerializer<'a, W>> {
assert!(min_value <= max_value);
min_value.serialize(write)?;
let amplitude = max_value - min_value;
amplitude.serialize(write)?;
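
As the new doc comment explains, values are bit-packed as `val - min_value`, so the per-value width depends only on the amplitude `max_value - min_value`, which is what the added `assert!(min_value <= max_value)` protects. A hedged sketch of that width computation (the `num_bits` helper here is a stand-in and may not match tantivy's `compute_num_bits` exactly):

```rust
/// Number of bits needed to represent values in `0..=amplitude`.
fn num_bits(amplitude: u64) -> u8 {
    (64 - amplitude.leading_zeros()) as u8
}

fn main() {
    let (min_value, max_value) = (1_000u64, 1_130u64);
    assert!(min_value <= max_value);
    let amplitude = max_value - min_value; // 130
    // 130 fits in 8 bits, so each value costs 8 bits instead of the
    // 11 bits that raw values up to 1_130 would need.
    assert_eq!(num_bits(amplitude), 8);
    // A value is stored as its offset from `min_value`.
    let val = 1_050u64;
    assert_eq!(val - min_value, 50);
}
```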

View File

@@ -9,6 +9,7 @@ use std::collections::HashMap;
use postings::UnorderedTermId;
use super::multivalued::MultiValueIntFastFieldWriter;
use common::BinarySerializable;
use termdict::TermOrdinal;
/// The fastfieldswriter regroup all of the fast field writers.
pub struct FastFieldsWriter {
@@ -53,7 +54,7 @@ impl FastFieldsWriter {
}
FastFieldsWriter {
single_value_writers,
multi_values_writers
multi_values_writers,
}
}
@@ -105,7 +106,7 @@ impl FastFieldsWriter {
pub fn serialize(
&self,
serializer: &mut FastFieldSerializer,
mapping: &HashMap<Field, HashMap<UnorderedTermId, usize>>,
mapping: &HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>,
) -> io::Result<()> {
for field_writer in &self.single_value_writers {
field_writer.serialize(serializer)?;

View File

@@ -200,7 +200,6 @@ pub fn advance_deletes(
target_opstamp: u64,
) -> Result<Option<FileProtection>> {
let mut file_protect: Option<FileProtection> = None;
{
if let Some(previous_opstamp) = segment_entry.meta().delete_opstamp() {
// We are already up-to-date here.
@@ -241,7 +240,6 @@ pub fn advance_deletes(
}
}
segment_entry.set_meta(segment.meta().clone());
Ok(file_protect)
}

View File

@@ -14,9 +14,14 @@ use termdict::TermMerger;
use fastfield::FastFieldSerializer;
use fastfield::FastFieldReader;
use store::StoreWriter;
use std::cmp::{max, min};
use termdict::TermDictionary;
use termdict::TermStreamer;
use schema::FieldType;
use termdict::TermOrdinal;
use schema::Cardinality;
use std::collections::HashMap;
use fastfield::MultiValueIntFastFieldReader;
use std::cmp;
pub struct IndexMerger {
schema: Schema,
@@ -60,6 +65,43 @@ fn extract_fast_field_reader(
segment_reader.fast_field_reader(field).ok()
}
struct TermOrdinalMapping {
per_segment_new_term_ordinals: Vec<Vec<TermOrdinal>>
}
impl TermOrdinalMapping {
fn new(max_term_ords: Vec<TermOrdinal>) -> TermOrdinalMapping {
TermOrdinalMapping {
per_segment_new_term_ordinals: max_term_ords
.into_iter()
.map(|max_term_ord| vec![TermOrdinal::default(); max_term_ord as usize])
.collect()
}
}
fn register_from_to(&mut self,
segment_ord: usize,
from_ord: TermOrdinal,
to_ord: TermOrdinal) {
self.per_segment_new_term_ordinals[segment_ord][from_ord as usize] = to_ord;
}
fn get_segment(&self, segment_ord: usize) -> &[TermOrdinal] {
&(self.per_segment_new_term_ordinals[segment_ord])[..]
}
fn max_term_ord(&self) -> TermOrdinal {
self.per_segment_new_term_ordinals
.iter()
.flat_map(|term_ordinals| {
term_ordinals.iter().cloned().max()
})
.max()
.unwrap_or(TermOrdinal::default())
}
}
struct DeltaComputer {
buffer: Vec<u32>,
}
@@ -103,204 +145,384 @@ impl IndexMerger {
}
fn write_fieldnorms(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
let fieldnorm_fastfields: Vec<Field> = self.schema
.fields()
.iter()
.enumerate()
.filter(|&(_, field_entry)| field_entry.is_indexed())
.map(|(field_id, _)| Field(field_id as u32))
.collect();
self.generic_write_fast_field(
fieldnorm_fastfields,
&extract_fieldnorm_reader,
fast_field_serializer,
)
}
fn write_fast_fields(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
let fast_fields: Vec<Field> = self.schema
.fields()
.iter()
.enumerate()
.filter(|&(_, field_entry)| field_entry.is_int_fast())
.map(|(field_id, _)| Field(field_id as u32))
.collect();
self.generic_write_fast_field(
fast_fields,
&extract_fast_field_reader,
fast_field_serializer,
)
}
// used both to merge field norms and regular u64 fast fields.
fn generic_write_fast_field(
&self,
fields: Vec<Field>,
field_reader_extractor: &Fn(&SegmentReader, Field) -> Option<FastFieldReader<u64>>,
fast_field_serializer: &mut FastFieldSerializer,
) -> Result<()> {
for field in fields {
let mut u64_readers = vec![];
let mut min_val = u64::max_value();
let mut max_val = u64::min_value();
for reader in &self.readers {
match field_reader_extractor(reader, field) {
Some(u64_reader) => {
if let Some((seg_min_val, seg_max_val)) = compute_min_max_val(
&u64_reader,
reader.max_doc(),
reader.delete_bitset(),
) {
// the segment has some non-deleted documents
min_val = min(min_val, seg_min_val);
max_val = max(max_val, seg_max_val);
u64_readers.push((
reader.max_doc(),
u64_reader,
reader.delete_bitset(),
));
}
}
None => {
let error_msg =
format!("Failed to find a u64_reader for field {:?}", field);
error!("{}", error_msg);
bail!(ErrorKind::SchemaError(error_msg));
}
for (field_id, field_entry) in self.schema.fields().iter().enumerate() {
let field = Field(field_id as u32);
if let FieldType::Str(ref text_options) = *field_entry.field_type() {
if text_options.get_indexing_options().is_some() {
self.write_single_fast_field(
field,
&extract_fieldnorm_reader,
fast_field_serializer,
)?;
}
}
if u64_readers.is_empty() {
// we have actually zero documents.
min_val = 0;
max_val = 0;
}
assert!(min_val <= max_val);
let mut fast_single_field_serializer =
fast_field_serializer.new_u64_fast_field(field, min_val, max_val)?;
for (max_doc, u64_reader, delete_bitset) in u64_readers {
for doc_id in 0..max_doc {
if !delete_bitset.is_deleted(doc_id) {
let val = u64_reader.get(doc_id);
fast_single_field_serializer.add_val(val)?;
}
}
}
fast_single_field_serializer.close_field()?;
}
Ok(())
}
fn write_postings(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> {
let mut delta_computer = DeltaComputer::new();
fn write_fast_fields(&self,
fast_field_serializer: &mut FastFieldSerializer,
mut term_ord_mappings: HashMap<Field, TermOrdinalMapping>) -> Result<()> {
for (field_id, field_entry) in self.schema.fields().iter().enumerate() {
let field = Field(field_id as u32);
let field_type = field_entry.field_type();
match *field_type {
FieldType::HierarchicalFacet => {
let term_ordinal_mapping = term_ord_mappings
.remove(&field)
.expect("Logic Error in Tantivy (Please report). HierarchicalFact field should have required a\
`term_ordinal_mapping`.");
self.write_hierarchical_facet_field(
field,
term_ordinal_mapping,
fast_field_serializer)?;
}
FieldType::U64(ref options) | FieldType::I64(ref options) => {
match options.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
self.write_single_fast_field(
field,
&extract_fast_field_reader,
fast_field_serializer
)?;
}
Some(Cardinality::MultiValues) => {
self.write_multi_fast_field(field, fast_field_serializer)?;
}
None => {}
}
}
FieldType::Str(_) => {
// We don't handle str fast field for the moment
// They can be implemented using what is done
// for facets in the future.
}
}
}
Ok(())
}
let mut indexed_fields = vec![];
for (field_ord, field_entry) in self.schema.fields().iter().enumerate() {
if field_entry.is_indexed() {
indexed_fields.push(Field(field_ord as u32));
// used both to merge field norms, `u64/i64` single fast fields.
fn write_single_fast_field(
&self,
field: Field,
field_reader_extractor: &Fn(&SegmentReader, Field) -> Option<FastFieldReader<u64>>,
fast_field_serializer: &mut FastFieldSerializer,
) -> Result<()> {
let mut u64_readers = vec![];
let mut min_value = u64::max_value();
let mut max_value = u64::min_value();
for reader in &self.readers {
match field_reader_extractor(reader, field) {
Some(u64_reader) => {
if let Some((seg_min_val, seg_max_val)) = compute_min_max_val(
&u64_reader,
reader.max_doc(),
reader.delete_bitset(),
) {
// the segment has some non-deleted documents
min_value = cmp::min(min_value, seg_min_val);
max_value = cmp::max(max_value, seg_max_val);
u64_readers.push((
reader.max_doc(),
u64_reader,
reader.delete_bitset(),
));
} else {
// all documents have been deleted.
}
}
None => {
let error_msg =
format!("Failed to find a u64_reader for field {:?}", field);
bail!(ErrorKind::SchemaError(error_msg));
}
}
}
if min_value > max_value {
// There is not a single document remaining in the index.
min_value = 0;
max_value = 0;
}
let mut fast_single_field_serializer =
fast_field_serializer.new_u64_fast_field(field, min_value, max_value)?;
for (max_doc, u64_reader, delete_bitset) in u64_readers {
for doc_id in 0u32..max_doc {
if !delete_bitset.is_deleted(doc_id) {
let val = u64_reader.get(doc_id);
fast_single_field_serializer.add_val(val)?;
}
}
}
for indexed_field in indexed_fields {
let field_readers = self.readers
.iter()
.map(|reader| reader.inverted_index(indexed_field))
.collect::<Vec<_>>();
fast_single_field_serializer.close_field()?;
Ok(())
}
let field_term_streams = field_readers
.iter()
.map(|field_reader| field_reader.terms().stream())
.collect();
fn write_multi_fast_field_idx(&self,
field: Field,
fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
let mut total_num_vals = 0u64;
let mut merged_terms = TermMerger::new(field_term_streams);
let mut max_doc = 0;
// map from segment doc ids to the resulting merged segment doc id.
let mut merged_doc_id_map: Vec<Vec<Option<DocId>>> =
Vec::with_capacity(self.readers.len());
for reader in &self.readers {
let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize);
for doc_id in 0..reader.max_doc() {
if reader.is_deleted(doc_id) {
segment_local_map.push(None);
} else {
segment_local_map.push(Some(max_doc));
max_doc += 1u32;
// In the first pass, we compute the total number of vals.
//
// This is required by the bitpacker, as it needs to know
// what should be the bit length used for bitpacking.
for reader in &self.readers {
let multi_ff_reader = reader.multi_fast_field_reader::<u64>(field)?;
let delete_bitset = reader.delete_bitset();
if delete_bitset.has_deletes() {
for doc in 0u32..reader.max_doc() {
if !delete_bitset.is_deleted(doc) {
total_num_vals += multi_ff_reader.num_vals(doc) as u64;
}
}
merged_doc_id_map.push(segment_local_map);
} else {
total_num_vals += multi_ff_reader.total_num_vals();
}
}
// Create the total list of doc ids
// by stacking the doc ids from the different segment.
//
// In the new segments, the doc id from the different
// segment are stacked so that :
// - Segment 0's doc ids become doc id [0, seg.max_doc]
// - Segment 1's doc ids become [seg0.max_doc, seg0.max_doc + seg.max_doc]
// - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc,
// seg0.max_doc + seg1.max_doc + seg2.max_doc]
// ...
// We can now create our `idx` serializer, and in a second pass,
// can effectively push the different indexes.
let mut serialize_idx = fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
let mut idx = 0;
for reader in &self.readers {
let multi_ff_reader = reader.multi_fast_field_reader::<u64>(field)?;
let delete_bitset = reader.delete_bitset();
for doc in 0u32..reader.max_doc() {
if !delete_bitset.is_deleted(doc) {
serialize_idx.add_val(idx)?;
idx += multi_ff_reader.num_vals(doc) as u64;
}
}
}
serialize_idx.add_val(idx)?;
serialize_idx.close_field()?;
Ok(())
}
let mut field_serializer = serializer.new_field(indexed_field)?;
fn write_hierarchical_facet_field(&self,
field: Field,
term_ordinal_mappings: TermOrdinalMapping,
fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
let field_entry = self.schema.get_field_entry(indexed_field);
// Multifastfield consists of 2 fastfields.
// The first serves as an index into the second one and is strictly increasing.
// The second contains the actual values.
// ... set segment postings option the new field.
let segment_postings_option =
field_entry.field_type().get_index_record_option().expect(
"Encountered a field that is not supposed to be
indexed. Have you modified the schema?",
);
// First we merge the idx fast field.
self.write_multi_fast_field_idx(field, fast_field_serializer)?;
while merged_terms.advance() {
let term_bytes: &[u8] = merged_terms.key();
// Let's compute the list of non-empty posting lists
let segment_postings: Vec<_> = merged_terms
.current_kvs()
.iter()
.flat_map(|heap_item| {
let segment_ord = heap_item.segment_ord;
let term_info = heap_item.streamer.value();
let segment_reader = &self.readers[heap_item.segment_ord];
let inverted_index = segment_reader.inverted_index(indexed_field);
let mut segment_postings = inverted_index
.read_postings_from_terminfo(term_info, segment_postings_option);
if segment_postings.advance() {
Some((segment_ord, segment_postings))
} else {
None
// We can now write the actual fast field values.
// In the case of hierarchical facets, they are actually term ordinals.
let max_term_ord = term_ordinal_mappings.max_term_ord();
{
let mut serialize_vals = fast_field_serializer.new_u64_fast_field_with_idx(field, 0u64, max_term_ord, 1)?;
let mut vals = Vec::with_capacity(100);
for (segment_ord, segment_reader) in self.readers.iter().enumerate() {
let delete_bitset = segment_reader.delete_bitset();
let term_ordinal_mapping: &[TermOrdinal] = term_ordinal_mappings.get_segment(segment_ord);
let ff_reader: MultiValueIntFastFieldReader<u64> = segment_reader.multi_fast_field_reader(field)?;
// TODO optimize if no deletes
for doc in 0..segment_reader.max_doc() {
if !delete_bitset.is_deleted(doc) {
ff_reader.get_vals(doc, &mut vals);
for &prev_term_ord in &vals {
let new_term_ord = term_ordinal_mapping[prev_term_ord as usize];
serialize_vals.add_val(new_term_ord)?;
}
})
.collect();
}
}
}
serialize_vals.close_field()?;
}
Ok(())
}
// At this point, `segment_postings` contains the posting list
// of all of the segments containing the given term.
//
// These segments are non-empty and advance has already been called.
fn write_multi_fast_field(&self, field: Field, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
if !segment_postings.is_empty() {
// If not, the `term` will be entirely removed.
// Multifastfield consists of 2 fastfields.
// The first serves as an index into the second one and is strictly increasing.
// The second contains the actual values.
// We know that there is at least one document containing
// the term, so we add it.
field_serializer.new_term(term_bytes)?;
// First we merge the idx fast field.
self.write_multi_fast_field_idx(field, fast_field_serializer)?;
// We can now serialize this postings, by pushing each document to the
// postings serializer.
for (segment_ord, mut segment_postings) in segment_postings {
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
loop {
// `.advance()` has been called once before the loop.
// Hence we cannot use a `while segment_postings.advance()` loop.
if let Some(remapped_doc_id) =
old_to_new_doc_id[segment_postings.doc() as usize]
let mut min_value = u64::max_value();
let mut max_value = u64::min_value();
let mut vals = Vec::with_capacity(100);
// Our values are bitpacked and we need to know what should be
// our bitwidth and our minimum value before serializing any values.
//
// Computing those is non-trivial if some documents are deleted.
// We go through a complete first pass to compute the minimum and the
// maximum value and initialize our Serializer.
for reader in &self.readers {
let ff_reader: MultiValueIntFastFieldReader<u64> = reader.multi_fast_field_reader(field)?;
let delete_bitset = reader.delete_bitset();
for doc in 0u32..reader.max_doc() {
if !delete_bitset.is_deleted(doc) {
ff_reader.get_vals(doc, &mut vals);
for &val in &vals {
min_value = cmp::min(val, min_value);
max_value = cmp::max(val, max_value);
}
}
}
// TODO optimize when no deletes
}
if min_value > max_value {
min_value = 0;
max_value = 0;
}
// We can now initialize our serializer, and push it the different values
{
let mut serialize_vals = fast_field_serializer.new_u64_fast_field_with_idx(field, min_value, max_value, 1)?;
for reader in &self.readers {
let delete_bitset = reader.delete_bitset();
let ff_reader: MultiValueIntFastFieldReader<u64> = reader.multi_fast_field_reader(field)?;
// TODO optimize if no deletes
for doc in 0..reader.max_doc() {
if !delete_bitset.is_deleted(doc) {
ff_reader.get_vals(doc, &mut vals);
for &val in &vals {
serialize_vals.add_val(val)?;
}
}
}
}
serialize_vals.close_field()?;
}
Ok(())
}
fn write_postings_for_field(&self,
indexed_field: Field,
field_type: &FieldType,
serializer: &mut InvertedIndexSerializer) -> Result<Option<TermOrdinalMapping>> {
let mut delta_computer= DeltaComputer::new();
let field_readers = self.readers
.iter()
.map(|reader| reader.inverted_index(indexed_field))
.collect::<Vec<_>>();
let mut field_term_streams = Vec::new();
let mut max_term_ords: Vec<TermOrdinal> = Vec::new();
for field_reader in &field_readers {
let terms = field_reader.terms();
field_term_streams.push(terms.stream());
max_term_ords.push(terms.num_terms() as u64);
}
let mut term_ord_mapping_opt =
if *field_type == FieldType::HierarchicalFacet {
Some(TermOrdinalMapping::new(max_term_ords))
} else {
None
};
let mut merged_terms = TermMerger::new(field_term_streams);
let mut max_doc = 0;
// map from segment doc ids to the resulting merged segment doc id.
let mut merged_doc_id_map: Vec<Vec<Option<DocId>>> =
Vec::with_capacity(self.readers.len());
for reader in &self.readers {
let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize);
for doc_id in 0..reader.max_doc() {
if reader.is_deleted(doc_id) {
segment_local_map.push(None);
} else {
segment_local_map.push(Some(max_doc));
max_doc += 1u32;
}
}
merged_doc_id_map.push(segment_local_map);
}
// Create the total list of doc ids
// by stacking the doc ids from the different segment.
//
// In the new segments, the doc id from the different
// segment are stacked so that :
// - Segment 0's doc ids become doc id [0, seg.max_doc]
// - Segment 1's doc ids become [seg0.max_doc, seg0.max_doc + seg.max_doc]
// - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc,
// seg0.max_doc + seg1.max_doc + seg2.max_doc]
// ...
let mut field_serializer = serializer.new_field(indexed_field)?;
let field_entry = self.schema.get_field_entry(indexed_field);
// ... set segment postings option the new field.
let segment_postings_option =
field_entry.field_type().get_index_record_option().expect(
"Encountered a field that is not supposed to be
indexed. Have you modified the schema?",
);
while merged_terms.advance() {
let term_bytes: &[u8] = merged_terms.key();
// Let's compute the list of non-empty posting lists
let segment_postings: Vec<_> = merged_terms
.current_kvs()
.iter()
.flat_map(|heap_item| {
let segment_ord = heap_item.segment_ord;
let term_info = heap_item.streamer.value();
let segment_reader = &self.readers[heap_item.segment_ord];
let inverted_index = segment_reader.inverted_index(indexed_field);
let mut segment_postings = inverted_index
.read_postings_from_terminfo(term_info, segment_postings_option);
if segment_postings.advance() {
Some((segment_ord, segment_postings))
} else {
None
}
})
.collect();
// At this point, `segment_postings` contains the posting list
// of all of the segments containing the given term.
//
// These segments are non-empty and advance has already been called.
if !segment_postings.is_empty() {
// If not, the `term` will be entirely removed.
// We know that there is at least one document containing
// the term, so we add it.
let to_term_ord = field_serializer.new_term(term_bytes)?;
if let Some(ref mut term_ord_mapping) = term_ord_mapping_opt {
for (segment_ord, from_term_ord) in merged_terms.matching_segments() {
term_ord_mapping.register_from_to(segment_ord,from_term_ord, to_term_ord);
}
}
// We can now serialize this postings, by pushing each document to the
// postings serializer.
for (segment_ord, mut segment_postings) in segment_postings {
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
loop {
// `.advance()` has been called once before the loop.
// Hence we cannot use a `while segment_postings.advance()` loop.
if let Some(remapped_doc_id) =
old_to_new_doc_id[segment_postings.doc() as usize]
{
// we make sure to only write the term iff
// there is at least one document.
@@ -313,20 +535,31 @@ impl IndexMerger {
delta_positions,
)?;
}
if !segment_postings.advance() {
break;
}
if !segment_postings.advance() {
break;
}
}
}
// closing the term.
field_serializer.close_term()?;
// closing the term.
field_serializer.close_term()?;
}
}
field_serializer.close()?;
Ok(term_ord_mapping_opt)
}
fn write_postings(&self, serializer: &mut InvertedIndexSerializer) -> Result<HashMap<Field, TermOrdinalMapping>> {
let mut term_ordinal_mappings = HashMap::new();
for (field_ord, field_entry) in self.schema.fields().iter().enumerate() {
if field_entry.is_indexed() {
let indexed_field = Field(field_ord as u32);
if let Some(term_ordinal_mapping) = self.write_postings_for_field(indexed_field, field_entry.field_type(), serializer)? {
term_ordinal_mappings.insert(indexed_field, term_ordinal_mapping);
}
}
field_serializer.close()?;
}
Ok(())
Ok(term_ordinal_mappings)
}
fn write_storable_fields(&self, store_writer: &mut StoreWriter) -> Result<()> {
@@ -349,9 +582,9 @@ impl IndexMerger {
impl SerializableSegment for IndexMerger {
fn write(&self, mut serializer: SegmentSerializer) -> Result<u32> {
self.write_postings(serializer.get_postings_serializer())?;
let term_ord_mappings = self.write_postings(serializer.get_postings_serializer())?;
self.write_fieldnorms(serializer.get_fieldnorms_serializer())?;
self.write_fast_fields(serializer.get_fast_field_serializer())?;
self.write_fast_fields(serializer.get_fast_field_serializer(), term_ord_mappings)?;
self.write_storable_fields(serializer.get_store_writer())?;
serializer.close()?;
Ok(self.max_doc)
@@ -375,6 +608,10 @@ mod tests {
use schema::IndexRecordOption;
use schema::Cardinality;
use futures::Future;
use IndexWriter;
use query::AllQuery;
use collector::FacetCollector;
use schema::IntOptions;
#[test]
fn test_index_merger_no_deletes() {
@@ -805,4 +1042,297 @@ mod tests {
assert_eq!(searcher.num_docs(), 0);
}
}
#[test]
fn test_merge_facets() {
let mut schema_builder = schema::SchemaBuilder::default();
let facet_field = schema_builder.add_facet_field("facet");
let index = Index::create_in_ram(schema_builder.build());
use schema::Facet;
{
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
let mut doc = Document::default();
for facet in doc_facets {
doc.add_facet(facet_field, Facet::from(facet));
}
index_writer.add_document(doc);
};
index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b"]);
index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b", "/top/c"]);
index_doc(&mut index_writer, &["/top/a", "/top/b"]);
index_doc(&mut index_writer, &["/top/a"]);
index_doc(&mut index_writer, &["/top/b", "/top/d"]);
index_doc(&mut index_writer, &["/top/d"]);
index_doc(&mut index_writer, &["/top/e"]);
index_writer.commit().expect("committed");
index_doc(&mut index_writer, &["/top/a"]);
index_doc(&mut index_writer, &["/top/b"]);
index_doc(&mut index_writer, &["/top/c"]);
index_writer.commit().expect("committed");
index_doc(&mut index_writer, &["/top/e", "/top/f"]);
index_writer.commit().expect("committed");
}
index.load_searchers().unwrap();
let test_searcher = |expected_num_docs: usize, expected: &[(&str, u64)]| {
let searcher = index.searcher();
let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet(Facet::from("/top"));
use collector::{MultiCollector, CountCollector};
let mut count_collector = CountCollector::default();
{
let mut multi_collectors = MultiCollector::from(vec![&mut count_collector, &mut facet_collector]);
searcher.search(&AllQuery, &mut multi_collectors).unwrap();
}
assert_eq!(count_collector.count(), expected_num_docs);
let facet_counts = facet_collector.harvest();
let facets: Vec<(String, u64)> = facet_counts.get("/top")
.map(|(facet, count)| (facet.to_string(), count))
.collect();
assert_eq!(
facets,
expected
.iter()
.map(|&(facet_str, count)| (String::from(facet_str), count))
.collect::<Vec<_>>()
);
};
test_searcher(11, &[
("/top/a", 5),
("/top/b", 5),
("/top/c", 2),
("/top/d", 2),
("/top/e", 2),
("/top/f", 1)
]);
// Merging the segments
{
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer
.merge(&segment_ids)
.wait()
.expect("Merging failed");
index_writer.wait_merging_threads().unwrap();
index.load_searchers().unwrap();
test_searcher(11, &[
("/top/a", 5),
("/top/b", 5),
("/top/c", 2),
("/top/d", 2),
("/top/e", 2),
("/top/f", 1)
]);
}
// Deleting one term
{
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
let facet_term = Term::from_facet(facet_field, &facet);
index_writer.delete_term(facet_term);
index_writer.commit().unwrap();
index.load_searchers().unwrap();
test_searcher(9, &[
("/top/a", 3),
("/top/b", 3),
("/top/c", 1),
("/top/d", 2),
("/top/e", 2),
("/top/f", 1)
]);
}
}
#[test]
fn test_merge_multivalued_int_fields_all_deleted() {
let mut schema_builder = schema::SchemaBuilder::default();
let int_options = IntOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed();
let int_field = schema_builder.add_u64_field("intvals", int_options);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let mut doc = Document::default();
doc.add_u64(int_field, 1);
index_writer.add_document(doc.clone());
index_writer.commit().expect("commit failed");
index_writer.add_document(doc);
index_writer.commit().expect("commit failed");
index_writer.delete_term(Term::from_field_u64(int_field, 1));
index_writer.commit().expect("commit failed");
}
index.load_searchers().unwrap();
let searcher = index.searcher();
assert_eq!(searcher.num_docs(), 0);
// Merging the segments
{
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer
.merge(&segment_ids)
.wait()
.expect("Merging failed");
index_writer.wait_merging_threads().unwrap();
}
index.load_searchers().unwrap();
let searcher = index.searcher();
assert_eq!(searcher.num_docs(), 0);
}
#[test]
fn test_merge_multivalued_int_fields() {
let mut schema_builder = schema::SchemaBuilder::default();
let int_options = IntOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed();
let int_field = schema_builder.add_u64_field("intvals", int_options);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
let mut doc = Document::default();
for &val in int_vals {
doc.add_u64(int_field, val);
}
index_writer.add_document(doc);
};
index_doc(&mut index_writer, &[1, 2]);
index_doc(&mut index_writer, &[1, 2, 3]);
index_doc(&mut index_writer, &[4, 5]);
index_doc(&mut index_writer, &[1, 2]);
index_doc(&mut index_writer, &[1, 5]);
index_doc(&mut index_writer, &[3]);
index_doc(&mut index_writer, &[17]);
index_writer.commit().expect("committed");
index_doc(&mut index_writer, &[20]);
index_writer.commit().expect("committed");
index_doc(&mut index_writer, &[28, 27]);
index_doc(&mut index_writer, &[1_000]);
index_writer.commit().expect("committed");
}
index.load_searchers().unwrap();
let searcher = index.searcher();
let mut vals: Vec<u64> = Vec::new();
{
let segment = searcher.segment_reader(0u32);
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
ff_reader.get_vals(0, &mut vals);
assert_eq!(&vals, &[1, 2]);
ff_reader.get_vals(1, &mut vals);
assert_eq!(&vals, &[1, 2, 3]);
ff_reader.get_vals(2, &mut vals);
assert_eq!(&vals, &[4, 5]);
ff_reader.get_vals(3, &mut vals);
assert_eq!(&vals, &[1, 2]);
ff_reader.get_vals(4, &mut vals);
assert_eq!(&vals, &[1, 5]);
ff_reader.get_vals(5, &mut vals);
assert_eq!(&vals, &[3]);
ff_reader.get_vals(6, &mut vals);
assert_eq!(&vals, &[17]);
}
{
let segment = searcher.segment_reader(1u32);
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
ff_reader.get_vals(0, &mut vals);
assert_eq!(&vals, &[20]);
}
{
let segment = searcher.segment_reader(2u32);
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
ff_reader.get_vals(0, &mut vals);
assert_eq!(&vals, &[28, 27]);
ff_reader.get_vals(1, &mut vals);
assert_eq!(&vals, &[1_000]);
}
// Merging the segments
{
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer
.merge(&segment_ids)
.wait()
.expect("Merging failed");
index_writer.wait_merging_threads().unwrap();
}
index.load_searchers().unwrap();
{
let searcher = index.searcher();
let segment = searcher.segment_reader(0u32);
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
ff_reader.get_vals(0, &mut vals);
assert_eq!(&vals, &[1, 2]);
ff_reader.get_vals(1, &mut vals);
assert_eq!(&vals, &[1, 2, 3]);
ff_reader.get_vals(2, &mut vals);
assert_eq!(&vals, &[4, 5]);
ff_reader.get_vals(3, &mut vals);
assert_eq!(&vals, &[1, 2]);
ff_reader.get_vals(4, &mut vals);
assert_eq!(&vals, &[1, 5]);
ff_reader.get_vals(5, &mut vals);
assert_eq!(&vals, &[3]);
ff_reader.get_vals(6, &mut vals);
assert_eq!(&vals, &[17]);
ff_reader.get_vals(7, &mut vals);
assert_eq!(&vals, &[20]);
ff_reader.get_vals(8, &mut vals);
assert_eq!(&vals, &[28, 27]);
ff_reader.get_vals(9, &mut vals);
assert_eq!(&vals, &[1_000]);
}
}
}
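
The heart of the new facet-merging code above is `TermOrdinalMapping`: while the sorted term streams of all segments are merged, each segment's old term ordinal is recorded against the ordinal the term receives in the merged dictionary, and the facet fast field values are later rewritten through that table. A condensed, self-contained sketch of the idea (sorted `Vec<&str>` term lists stand in for the real term dictionaries and `TermMerger`):

```rust
/// For each segment, maps an old term ordinal to its ordinal in the merged dictionary.
struct TermOrdinalMapping {
    per_segment: Vec<Vec<u64>>,
}

impl TermOrdinalMapping {
    fn new(num_terms_per_segment: &[usize]) -> TermOrdinalMapping {
        TermOrdinalMapping {
            per_segment: num_terms_per_segment.iter().map(|&n| vec![0u64; n]).collect(),
        }
    }

    fn register(&mut self, segment_ord: usize, from_ord: usize, to_ord: u64) {
        self.per_segment[segment_ord][from_ord] = to_ord;
    }
}

fn main() {
    // Two segments with already-sorted term lists.
    let segments = vec![vec!["a", "c"], vec!["b", "c"]];
    let mut mapping = TermOrdinalMapping::new(&[2, 2]);

    // Merge the sorted term streams; each distinct term gets one new ordinal.
    let mut all: Vec<&str> = segments.iter().flatten().copied().collect();
    all.sort();
    all.dedup();
    for (seg_ord, terms) in segments.iter().enumerate() {
        for (from_ord, term) in terms.iter().enumerate() {
            let to_ord = all.binary_search(term).unwrap() as u64;
            mapping.register(seg_ord, from_ord, to_ord);
        }
    }

    // Segment 0: "a" -> 0, "c" -> 2. Segment 1: "b" -> 1, "c" -> 2.
    assert_eq!(mapping.per_segment[0], vec![0, 2]);
    assert_eq!(mapping.per_segment[1], vec![1, 2]);
    println!("{:?}", mapping.per_segment);
}
```

In the real merger, as the diff shows, `from_term_ord` comes from `TermMerger::matching_segments` and the new ordinal is the value returned by `FieldSerializer::new_term`.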

View File

@@ -11,9 +11,9 @@ pub struct PreparedCommit<'a> {
impl<'a> PreparedCommit<'a> {
pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: u64) -> PreparedCommit {
PreparedCommit {
index_writer: index_writer,
index_writer,
payload: None,
opstamp: opstamp,
opstamp
}
}

View File

@@ -645,6 +645,22 @@ mod tests {
assert!(!postings.advance());
}
#[test]
fn test_indexedfield_not_in_documents() {
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", TEXT);
let absent_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(2, 40_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a"));
assert!(index_writer.commit().is_ok());
assert!(index.load_searchers().is_ok());
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
segment_reader.inverted_index(absent_field); //< should not panic
}
#[test]
fn test_delete_postings2() {
let mut schema_builder = SchemaBuilder::default();
@@ -859,31 +875,26 @@ mod tests {
let searcher = index.searcher();
let segment_reader: &SegmentReader = searcher.segment_reader(0);
{
let fast_field_reader_res =
segment_reader.fast_field_reader::<u64>(text_field);
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(text_field);
assert!(fast_field_reader_res.is_err());
}
{
let fast_field_reader_res =
segment_reader.fast_field_reader::<u64>(stored_int_field);
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(stored_int_field);
assert!(fast_field_reader_res.is_err());
}
{
let fast_field_reader_res =
segment_reader.fast_field_reader::<u64>(fast_field_signed);
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(fast_field_signed);
assert!(fast_field_reader_res.is_err());
}
{
let fast_field_reader_res =
segment_reader.fast_field_reader::<i64>(fast_field_signed);
let fast_field_reader_res = segment_reader.fast_field_reader::<i64>(fast_field_signed);
assert!(fast_field_reader_res.is_ok());
let fast_field_reader = fast_field_reader_res.unwrap();
assert_eq!(fast_field_reader.get(0), 4i64)
}
{
let fast_field_reader_res =
segment_reader.fast_field_reader::<i64>(fast_field_signed);
let fast_field_reader_res = segment_reader.fast_field_reader::<i64>(fast_field_signed);
assert!(fast_field_reader_res.is_ok());
let fast_field_reader = fast_field_reader_res.unwrap();
assert_eq!(fast_field_reader.get(0), 4i64)

View File

@@ -63,4 +63,41 @@ macro_rules! doc(
document
}
};
// if there is a trailing comma retry with the trailing comma stripped.
($($field:expr => $value:expr),+ ,) => {
doc!( $( $field => $value ), *);
};
);
#[cfg(test)]
mod test {
use schema::{SchemaBuilder, FAST, TEXT};
#[test]
fn test_doc_basic() {
let mut schema_builder = SchemaBuilder::new();
let title = schema_builder.add_text_field("title", TEXT);
let author = schema_builder.add_text_field("text", TEXT);
let likes = schema_builder.add_u64_field("num_u64", FAST);
let _schema = schema_builder.build();
let _doc = doc!(
title => "Life Aquatic",
author => "Wes Anderson",
likes => 4u64
);
}
#[test]
fn test_doc_trailing_comma() {
let mut schema_builder = SchemaBuilder::new();
let title = schema_builder.add_text_field("title", TEXT);
let author = schema_builder.add_text_field("text", TEXT);
let likes = schema_builder.add_u64_field("num_u64", FAST);
let _schema = schema_builder.build();
let _doc = doc!(
title => "Life Aquatic",
author => "Wes Anderson",
likes => 4u64,
);
}
}
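
The added macro rule accepts a trailing comma by matching `),+ ,` and recursing into the comma-free rule, the trick the commit message credits to Blandy 2017. A stripped-down macro showing the same pattern with a plain `Vec` instead of `Document`:

```rust
macro_rules! pairs(
    // Base rule: one or more `key => value` pairs, no trailing comma.
    ($($key:expr => $value:expr),+) => {
        vec![$(($key, $value)),+]
    };
    // If there is a trailing comma, retry with the trailing comma stripped.
    ($($key:expr => $value:expr),+ ,) => {
        pairs!($($key => $value),+)
    };
);

fn main() {
    let a = pairs!("title" => 1, "likes" => 2);
    let b = pairs!("title" => 1, "likes" => 2,); // trailing comma now accepted
    assert_eq!(a, b);
}
```

Without the extra arm, an invocation ending in `,` fails to match because the `),+` repetition does not allow a dangling separator.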

View File

@@ -16,6 +16,7 @@ use tokenizer::Token;
use tokenizer::TokenStream;
use schema::IndexRecordOption;
use postings::UnorderedTermId;
use termdict::TermOrdinal;
fn posting_from_field_entry<'a>(
field_entry: &FieldEntry,
@@ -44,6 +45,7 @@ fn posting_from_field_entry<'a>(
pub struct MultiFieldPostingsWriter<'a> {
heap: &'a Heap,
schema: Schema,
term_index: TermHashMap<'a>,
per_field_postings_writers: Vec<Box<PostingsWriter + 'a>>,
}
@@ -58,8 +60,8 @@ impl<'a> MultiFieldPostingsWriter<'a> {
.iter()
.map(|field_entry| posting_from_field_entry(field_entry, heap))
.collect();
MultiFieldPostingsWriter {
schema: schema.clone(),
heap,
term_index,
per_field_postings_writers,
@@ -83,7 +85,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
pub fn serialize(
&self,
serializer: &mut InvertedIndexSerializer,
) -> Result<HashMap<Field, HashMap<UnorderedTermId, usize>>> {
) -> Result<HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>> {
let mut term_offsets: Vec<(&[u8], u32, UnorderedTermId)> = self.term_index.iter().collect();
term_offsets.sort_by_key(|&(k, _, _)| k);
@@ -94,7 +96,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
.map(|(key, _, _)| Term::wrap(key).field())
.enumerate();
let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, usize>> =
let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>> =
HashMap::new();
let mut prev_field = Field(u32::max_value());
@@ -110,17 +112,23 @@ impl<'a> MultiFieldPostingsWriter<'a> {
let (field, start) = offsets[i];
let (_, stop) = offsets[i + 1];
// populating the unordered term ord -> ordered term ord mapping
// for the field.
let mut mapping = HashMap::new();
for (term_ord, term_unord_id) in term_offsets[start..stop]
.iter()
.map(|&(_, _, bucket)| bucket)
.enumerate()
{
mapping.insert(term_unord_id, term_ord);
let field_entry = self.schema.get_field_entry(field);
match field_entry.field_type() {
FieldType::Str(_) | FieldType::HierarchicalFacet => {
// populating the (unordered term ord) -> (ordered term ord) mapping
// for the field.
let mut unordered_term_ids = term_offsets[start..stop]
.iter()
.map(|&(_, _, bucket)| bucket);
let mut mapping: HashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
.enumerate()
.map(|(term_ord, unord_term_id)| (unord_term_id as UnorderedTermId, term_ord as TermOrdinal))
.collect();
unordered_term_mappings.insert(field, mapping);
}
FieldType::U64(_) | FieldType::I64(_) => {}
}
unordered_term_mappings.insert(field, mapping);
let postings_writer = &self.per_field_postings_writers[field.0 as usize];
let mut field_serializer = serializer.new_field(field)?;

View File

@@ -13,7 +13,7 @@ use std::io::{self, Write};
use compression::VIntEncoder;
use common::CountingWriter;
use common::CompositeWrite;
use termdict::TermDictionaryBuilder;
use termdict::{TermOrdinal, TermDictionaryBuilder};
/// `PostingsSerializer` is in charge of serializing
/// postings on disk, in the
@@ -114,6 +114,7 @@ pub struct FieldSerializer<'a> {
positions_serializer_opt: Option<PositionSerializer<&'a mut CountingWriter<WritePtr>>>,
current_term_info: TermInfo,
term_open: bool,
num_terms: TermOrdinal,
}
impl<'a> FieldSerializer<'a> {
@@ -152,6 +153,7 @@ impl<'a> FieldSerializer<'a> {
positions_serializer_opt,
current_term_info: TermInfo::default(),
term_open: false,
num_terms: TermOrdinal::default(),
})
}
@@ -172,7 +174,7 @@ impl<'a> FieldSerializer<'a> {
/// * term - the term. It needs to come after the previous term according
/// to the lexicographical order.
/// * doc_freq - return the number of document containing the term.
pub fn new_term(&mut self, term: &[u8]) -> io::Result<()> {
pub fn new_term(&mut self, term: &[u8]) -> io::Result<TermOrdinal> {
assert!(
!self.term_open,
"Called new_term, while the previous term was not closed."
@@ -180,7 +182,10 @@ impl<'a> FieldSerializer<'a> {
self.term_open = true;
self.postings_serializer.clear();
self.current_term_info = self.current_term_info();
self.term_dictionary_builder.insert_key(term)
self.term_dictionary_builder.insert_key(term)?;
let term_ordinal = self.num_terms;
self.num_terms += 1;
Ok(term_ordinal)
}
/// Serialize the information that a document contains the current term,
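Since terms reach `new_term` in lexicographic order, the ordinal can simply be a running counter, as the change above does. Below is a minimal standalone sketch of that pattern; the struct and method names are illustrative, not the crate's API.

```rust
/// Toy serializer: each key inserted in sorted order gets the next ordinal.
struct SketchFieldSerializer {
    num_terms: u64,
    last_key: Option<Vec<u8>>,
}

impl SketchFieldSerializer {
    fn new() -> Self {
        SketchFieldSerializer { num_terms: 0, last_key: None }
    }

    /// Returns the ordinal assigned to `term`. Terms must arrive in strictly
    /// increasing lexicographic order, mirroring the assertion in the diff.
    fn new_term(&mut self, term: &[u8]) -> u64 {
        if let Some(ref last) = self.last_key {
            assert!(term > last.as_slice(), "terms must be inserted in sorted order");
        }
        self.last_key = Some(term.to_vec());
        let ordinal = self.num_terms;
        self.num_terms += 1;
        ordinal
    }
}

fn main() {
    let mut serializer = SketchFieldSerializer::new();
    assert_eq!(serializer.new_term(b"apple"), 0);
    assert_eq!(serializer.new_term(b"banana"), 1);
}
```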

View File

@@ -7,6 +7,7 @@ use Result;
use Score;
use DocId;
use core::Searcher;
use fastfield::DeleteBitSet;
/// Query that matches all of the documents.
///
@@ -26,28 +27,52 @@ pub struct AllWeight;
impl Weight for AllWeight {
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
Ok(box AllScorer {
started: false,
state: State::NotStarted,
doc: 0u32,
max_doc: reader.max_doc(),
deleted_bitset: reader.delete_bitset().clone()
})
}
}
enum State {
NotStarted,
Started,
Finished
}
/// Scorer associated with the `AllQuery` query.
pub struct AllScorer {
started: bool,
state: State,
doc: DocId,
max_doc: DocId,
deleted_bitset: DeleteBitSet
}
impl DocSet for AllScorer {
fn advance(&mut self) -> bool {
if self.started {
self.doc += 1u32;
} else {
self.started = true;
loop {
match self.state {
State::NotStarted => {
self.state = State::Started;
self.doc = 0;
}
State::Started => {
self.doc += 1u32;
}
State::Finished => {
return false;
}
}
if self.doc < self.max_doc {
if !self.deleted_bitset.is_deleted(self.doc) {
return true;
}
} else {
self.state = State::Finished;
return false;
}
}
self.doc < self.max_doc
}
fn doc(&self) -> DocId {
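The rewrite above replaces the `started` boolean with a three-state machine so that `advance()` loops past deleted documents instead of returning them. Here is a self-contained sketch of the same control flow, using a simplified stand-in for the delete bitset; the names below are illustrative, not tantivy's API.

```rust
/// Simplified stand-in for a delete bitset: a sorted list of deleted doc ids.
struct SketchDeleteBitSet {
    deleted: Vec<u32>,
}

impl SketchDeleteBitSet {
    fn is_deleted(&self, doc: u32) -> bool {
        self.deleted.binary_search(&doc).is_ok()
    }
}

enum State {
    NotStarted,
    Started,
    Finished,
}

/// Walks doc ids 0..max_doc, yielding only the ones that are still alive,
/// mirroring the state-machine `advance` in the hunk above.
struct SketchAllScorer {
    state: State,
    doc: u32,
    max_doc: u32,
    deletes: SketchDeleteBitSet,
}

impl SketchAllScorer {
    fn advance(&mut self) -> bool {
        loop {
            match self.state {
                State::NotStarted => {
                    self.state = State::Started;
                    self.doc = 0;
                }
                State::Started => self.doc += 1,
                State::Finished => return false,
            }
            if self.doc >= self.max_doc {
                self.state = State::Finished;
                return false;
            }
            if !self.deletes.is_deleted(self.doc) {
                return true;
            }
        }
    }
}

fn main() {
    let mut scorer = SketchAllScorer {
        state: State::NotStarted,
        doc: 0,
        max_doc: 4,
        deletes: SketchDeleteBitSet { deleted: vec![1, 2] },
    };
    let mut alive = Vec::new();
    while scorer.advance() {
        alive.push(scorer.doc);
    }
    assert_eq!(alive, vec![0, 3]); // docs 1 and 2 are skipped as deleted
}
```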

View File

@@ -21,8 +21,6 @@ mod tests {
use query::RequiredOptionalScorer;
use query::score_combiner::SumWithCoordsCombiner;
fn aux_test_helper() -> (Index, Field) {
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", TEXT);
@@ -104,7 +102,9 @@ mod tests {
let query = query_parser.parse_query("+a b").unwrap();
let weight = query.weight(&*searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
assert!(Downcast::<RequiredOptionalScorer<Box<Scorer>, Box<Scorer>, SumWithCoordsCombiner>>::is_type(&*scorer));
assert!(Downcast::<
RequiredOptionalScorer<Box<Scorer>, Box<Scorer>, SumWithCoordsCombiner>,
>::is_type(&*scorer));
}
{
let query = query_parser.parse_query("+a b").unwrap();
@@ -116,7 +116,6 @@ mod tests {
#[test]
pub fn test_boolean_query() {
let (index, text_field) = aux_test_helper();
let make_term_query = |text: &str| {

View File

@@ -41,7 +41,8 @@ fn leaf<I>(input: I) -> ParseResult<UserInputAST, I>
where
I: Stream<Item = char>,
{
(char('-'), parser(leaf)).map(|(_, expr)| UserInputAST::Not(box expr))
(char('-'), parser(leaf))
.map(|(_, expr)| UserInputAST::Not(box expr))
.or((char('+'), parser(leaf)).map(|(_, expr)| UserInputAST::Must(box expr)))
.or((char('('), parser(parse_to_ast), char(')')).map(|(_, expr, _)| expr))
.or(parser(literal))

View File

@@ -155,7 +155,7 @@ impl QueryParser {
fn compute_logical_ast_for_leaf(
&self,
field: Field,
phrase: &str
phrase: &str,
) -> Result<Option<LogicalLiteral>, QueryParserError> {
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
@@ -328,7 +328,7 @@ mod test {
use tokenizer::TokenizerManager;
use query::Query;
use schema::Field;
use schema::{TextOptions, TextFieldIndexing, IndexRecordOption};
use schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
use super::QueryParser;
use super::QueryParserError;
use Index;
@@ -538,7 +538,9 @@ mod test {
let title = schema_builder.add_text_field("title", text_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
index.tokenizers().register("customtokenizer", SimpleTokenizer);
index
.tokenizers()
.register("customtokenizer", SimpleTokenizer);
let query_parser = QueryParser::for_index(&index, vec![title]);
assert!(query_parser.parse_query("title:\"happy tax\"").is_ok());
}

View File

@@ -51,8 +51,10 @@ impl DocSet for EmptyScorer {
}
fn doc(&self) -> DocId {
panic!("You may not call .doc() on a scorer \
where the last call to advance() did not return true.");
panic!(
"You may not call .doc() on a scorer \
where the last call to advance() did not return true."
);
}
fn size_hint(&self) -> u32 {

View File

@@ -110,7 +110,9 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Union<TScorer, TScoreCombin
}
impl<TScorer, TScoreCombiner> DocSet for Union<TScorer, TScoreCombiner>
where TScorer: Scorer, TScoreCombiner: ScoreCombiner
where
TScorer: Scorer,
TScoreCombiner: ScoreCombiner,
{
fn advance(&mut self) -> bool {
if self.advance_buffered() {

View File

@@ -212,6 +212,14 @@ mod tests {
assert!(Facet::root().is_root());
}
#[test]
fn test_from_path() {
assert_eq!(
Facet::from_path(vec!["top", "a", "firstdoc"]),
Facet::from("/top/a/firstdoc")
);
}
#[test]
fn test_facet_display() {
{

View File

@@ -123,7 +123,6 @@ impl Default for SchemaBuilder {
}
}
struct InnerSchema {
fields: Vec<FieldEntry>,
fields_map: HashMap<String, Field>, // transient
@@ -243,7 +242,6 @@ impl Schema {
}
}
impl Serialize for Schema {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
@@ -292,7 +290,6 @@ impl<'de> Deserialize<'de> for Schema {
}
}
/// Error that may happen when deserializing
/// a document from JSON.
#[derive(Debug)]
@@ -467,7 +464,10 @@ mod tests {
"jambon": "bayonne"
}"#,
);
assert_matches!(json_err, Err(DocParsingError::ValueError(_, ValueParsingError::TypeError(_))));
assert_matches!(
json_err,
Err(DocParsingError::ValueError(_, ValueParsingError::TypeError(_)))
);
}
{
let json_err = schema.parse_document(
@@ -478,7 +478,10 @@ mod tests {
"popularity": 10
}"#,
);
assert_matches!(json_err, Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))));
assert_matches!(
json_err,
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_)))
);
}
{
let json_err = schema.parse_document(
@@ -489,7 +492,10 @@ mod tests {
"popularity": 10
}"#,
);
assert!(!matches!(json_err, Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_)))));
assert!(!matches!(
json_err,
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_)))
));
}
{
let json_err = schema.parse_document(
@@ -500,7 +506,10 @@ mod tests {
"popularity": 9223372036854775808
}"#,
);
assert_matches!(json_err, Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))));
assert_matches!(
json_err,
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_)))
);
}
{
let json_err = schema.parse_document(

View File

@@ -4,6 +4,7 @@ use common;
use byteorder::{BigEndian, ByteOrder};
use super::Field;
use std::str;
use schema::Facet;
/// Size (in bytes) of the buffer of an int field.
const INT_TERM_LEN: usize = 4 + 8;
@@ -29,6 +30,16 @@ impl Term {
Term::from_field_u64(field, val_u64)
}
/// Creates a `Term` given a facet.
pub fn from_facet(field: Field, facet: &Facet) -> Term {
let bytes = facet.encoded_bytes();
let buffer = Vec::with_capacity(4 + bytes.len());
let mut term = Term(buffer);
term.set_field(field);
term.set_bytes(bytes);
term
}
/// Builds a term given a field, and a string value
///
/// Assuming the term has a field id of 2, and a text value of "abc",
@@ -91,10 +102,14 @@ impl Term {
self.set_u64(common::i64_to_u64(val));
}
fn set_bytes(&mut self, bytes: &[u8]) {
self.0.resize(4, 0u8);
self.0.extend(bytes);
}
/// Set the texts only, keeping the field untouched.
pub fn set_text(&mut self, text: &str) {
self.0.resize(4, 0u8);
self.0.extend(text.as_bytes());
self.set_bytes(text.as_bytes());
}
}
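The new `from_facet` constructor and the shared `set_bytes` helper suggest the layout of a term buffer: a 4-byte field header followed by the value's raw bytes. Below is a minimal sketch of that layout; the big-endian header is an assumption based on the `BigEndian` import visible at the top of the diff, and the plain path string stands in for the real facet encoding, which differs.

```rust
/// Hypothetical illustration: build a term buffer as a 4-byte field id
/// followed by the payload bytes, as the `set_bytes` helper above suggests.
fn sketch_term_bytes(field_id: u32, value: &[u8]) -> Vec<u8> {
    let mut buffer = Vec::with_capacity(4 + value.len());
    buffer.extend_from_slice(&field_id.to_be_bytes()); // field header (assumed big-endian)
    buffer.extend_from_slice(value);                    // term payload
    buffer
}

fn main() {
    let term = sketch_term_bytes(2, b"/top/a");
    assert_eq!(&term[..4], &[0u8, 0, 0, 2]); // field header
    assert_eq!(&term[4..], b"/top/a");       // payload
}
```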

View File

@@ -13,8 +13,8 @@ pub struct TermStreamerBuilderImpl<'a> {
impl<'a> TermStreamerBuilderImpl<'a> {
pub(crate) fn new(fst_map: &'a TermDictionaryImpl, stream_builder: StreamBuilder<'a>) -> Self {
TermStreamerBuilderImpl {
fst_map: fst_map,
stream_builder: stream_builder,
fst_map,
stream_builder,
}
}
}
@@ -86,4 +86,8 @@ impl<'a> TermStreamer for TermStreamerImpl<'a> {
fn value(&self) -> &TermInfo {
&self.current_value
}
fn max_term_ord(&self) -> TermOrdinal {
self.fst_map.num_terms() as TermOrdinal
}
}

View File

@@ -1,4 +1,3 @@
use std::io;
use std::cmp;
use std::io::{Read, Write};

View File

@@ -120,6 +120,16 @@ impl<'a> TermDictionary<'a> for TermDictionaryImpl {
}
}
fn empty(field_type: FieldType) -> Self {
let term_dictionary_data: Vec<u8> =
TermDictionaryBuilderImpl::new(Vec::<u8>::new(), field_type)
.expect("Creating a TermDictionaryBuilder in a Vec<u8> should never fail")
.finish()
.expect("Writing in a Vec<u8> should never fail");
let source = ReadOnlySource::from(term_dictionary_data);
Self::from_source(source)
}
fn num_terms(&self) -> usize {
self.term_info_store.num_terms()
}
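`empty()` above obtains an empty dictionary by finishing a builder over an empty `Vec<u8>` and reading the result back. The same write-then-reopen pattern is shown below, reduced to a toy length-prefixed format; it is not tantivy's actual on-disk format.

```rust
use std::io::{self, Write};

/// Toy builder: writes length-prefixed keys into any `Write` target.
struct SketchDictBuilder<W: Write> {
    wrt: W,
}

impl<W: Write> SketchDictBuilder<W> {
    fn new(wrt: W) -> Self {
        SketchDictBuilder { wrt }
    }
    fn insert_key(&mut self, key: &[u8]) -> io::Result<()> {
        self.wrt.write_all(&(key.len() as u32).to_le_bytes())?;
        self.wrt.write_all(key)
    }
    fn finish(self) -> W {
        self.wrt
    }
}

/// An "empty" dictionary is just the output of a builder that was finished
/// without inserting anything, mirroring the `empty()` method above.
fn sketch_empty_dictionary() -> Vec<u8> {
    SketchDictBuilder::new(Vec::<u8>::new()).finish()
}

fn main() {
    assert!(sketch_empty_dictionary().is_empty());
    let mut builder = SketchDictBuilder::new(Vec::<u8>::new());
    builder.insert_key(b"abc").expect("writing to a Vec<u8> cannot fail");
    assert_eq!(builder.finish().len(), 4 + 3); // 4-byte length prefix + key bytes
}
```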

View File

@@ -2,6 +2,7 @@ use std::collections::BinaryHeap;
use termdict::TermStreamerImpl;
use std::cmp::Ordering;
use termdict::TermStreamer;
use termdict::TermOrdinal;
use schema::Term;
pub struct HeapItem<'a> {
@@ -29,6 +30,7 @@ impl<'a> Ord for HeapItem<'a> {
}
}
/// Given a list of sorted term streams,
/// returns an iterator over sorted unique terms.
///
@@ -43,8 +45,6 @@ pub struct TermMerger<'a> {
impl<'a> TermMerger<'a> {
/// Stream of merged term dictionary
///
///
pub fn new(streams: Vec<TermStreamerImpl<'a>>) -> TermMerger<'a> {
TermMerger {
heap: BinaryHeap::new(),
@@ -59,6 +59,14 @@ impl<'a> TermMerger<'a> {
}
}
pub(crate) fn matching_segments<'b: 'a>(&'b self) -> Box<'b + Iterator<Item=(usize, TermOrdinal)>> {
Box::new(self.current_streamers
.iter()
.map(|heap_item| {
(heap_item.segment_ord, heap_item.streamer.term_ord())
}))
}
fn advance_segments(&mut self) {
let streamers = &mut self.current_streamers;
let heap = &mut self.heap;
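`TermMerger` keeps one streamer per segment in a binary heap and, at each step, pops every streamer positioned on the smallest term; `matching_segments` then reports which (segment ordinal, term ordinal) pairs carry the current term. Below is a self-contained sketch of that k-way merge over plain sorted byte-string lists; the types and names are illustrative, not the crate's API.

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

/// K-way merge over per-segment sorted term lists: pop the smallest key,
/// record every segment currently positioned on it, then refill the heap
/// from those segments.
fn merged_terms(segments: &[Vec<Vec<u8>>]) -> Vec<(Vec<u8>, Vec<usize>)> {
    // Heap entries: Reverse((key, segment_ord, index_within_segment)) turns
    // the max-heap into a min-heap ordered by key first.
    let mut heap: BinaryHeap<Reverse<(Vec<u8>, usize, usize)>> = BinaryHeap::new();
    for (segment_ord, terms) in segments.iter().enumerate() {
        if let Some(first) = terms.first() {
            heap.push(Reverse((first.clone(), segment_ord, 0)));
        }
    }
    let mut merged = Vec::new();
    while let Some(Reverse((key, _, _))) = heap.peek().cloned() {
        let mut matching_segments = Vec::new();
        // Drain every segment sitting on `key` and advance it to its next term.
        while let Some(Reverse((k, segment_ord, idx))) = heap.peek().cloned() {
            if k != key {
                break;
            }
            heap.pop();
            matching_segments.push(segment_ord);
            if let Some(next) = segments[segment_ord].get(idx + 1) {
                heap.push(Reverse((next.clone(), segment_ord, idx + 1)));
            }
        }
        merged.push((key, matching_segments));
    }
    merged
}

fn main() {
    let seg0 = vec![b"a".to_vec(), b"c".to_vec()];
    let seg1 = vec![b"a".to_vec(), b"b".to_vec()];
    let merged = merged_terms(&[seg0, seg1]);
    assert_eq!(merged[0], (b"a".to_vec(), vec![0, 1])); // "a" appears in both segments
    assert_eq!(merged[1], (b"b".to_vec(), vec![1]));
}
```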

View File

@@ -127,6 +127,9 @@ where
.lt(stop_term.as_slice())
.into_stream()
}
/// Creates an empty term dictionary which contains no terms.
fn empty(field_type: FieldType) -> Self;
}
/// Builder for the new term dictionary.
@@ -194,6 +197,13 @@ pub trait TermStreamer: Sized {
None
}
}
/// Returns an upper bound for the term ordinals in this stream.
///
/// All term ordinals are guaranteed to be strictly smaller
/// than the result of `.max_term_ord()`.
fn max_term_ord(&self) -> TermOrdinal;
}
/// `TermStreamerBuilder` is a helper object used to define

View File

@@ -31,7 +31,7 @@ impl<'a> Tokenizer<'a> for FacetTokenizer {
fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
FacetTokenStream {
text: text,
text,
state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet.
token: Token::default(),
}

View File

@@ -6,21 +6,21 @@ use tokenizer::TokenStreamChain;
/// Token
pub struct Token {
/// Offset (byte index) of the first character of the token.
/// Offsets shall not be modified by token filters.
pub offset_from: usize,
/// Offset (byte index) of the last character of the token + 1.
/// The text that generated the token should be obtained by
/// &text[token.offset_from..token.offset_to]
pub offset_to: usize,
/// Position, expressed in number of tokens.
pub position: usize,
/// Actual text content of the token.
pub text: String,
/// Offset (byte index) of the first character of the token.
/// Offsets shall not be modified by token filters.
pub offset_from: usize,
/// Offset (byte index) of the last character of the token + 1.
/// The text that generated the token should be obtained by
/// &text[token.offset_from..token.offset_to]
pub offset_to: usize,
/// Position, expressed in number of tokens.
pub position: usize,
/// Actual text content of the token.
pub text: String,
}
impl Default for Token {
fn default() -> Token {
fn default() -> Token {
Token {
offset_from: 0,
offset_to: 0,