Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2025-12-31 22:42:55 +00:00

Compare commits: python-bin ... 0.5.2 (7 commits)

| Author | SHA1 | Date |
|---|---|---|
| | d40ef06dde | |
| | 384917c17b | |
| | cbca95aee3 | |
| | 2b8618afc2 | |
| | 967cf2cb02 | |
| | 0e68c4ac34 | |
| | e09192b0ab | |
@@ -1,3 +1,9 @@
+Tantivy 0.5.2
+===========================
+
+- bugfix #274
+- bugfix #280
+- bugfix #289
+
 Tantivy 0.5.1
 ==========================
 - bugfix #254 : tantivy failed if no documents in a segment contained a specific field.

@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.5.1"
+version = "0.5.2"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 build = "build.rs"
 license = "MIT"

@@ -107,6 +107,19 @@ impl SegmentReader {
         }
     }
 
+    pub(crate) fn fast_field_reader_with_idx<Item: FastValue>(
+        &self,
+        field: Field,
+        idx: usize
+    ) -> fastfield::Result<FastFieldReader<Item>> {
+        if let Some(ff_source) = self.fast_fields_composite.open_read_with_idx(field, idx) {
+            Ok(FastFieldReader::open(ff_source))
+        } else {
+            let field_entry = self.schema.get_field_entry(field);
+            Err(FastFieldNotAvailableError::new(field_entry))
+        }
+    }
+
     /// Accessor to the `MultiValueIntFastFieldReader` associated to a given `Field`.
     /// May panick if the field is not a multivalued fastfield of the type `Item`.
     pub fn multi_fast_field_reader<Item: FastValue>(

@@ -116,14 +129,8 @@ impl SegmentReader {
         let field_entry = self.schema.get_field_entry(field);
         if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::MultiValues)
         {
-            let idx_reader = self.fast_fields_composite
-                .open_read_with_idx(field, 0)
-                .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
-                .map(FastFieldReader::open)?;
-            let vals_reader = self.fast_fields_composite
-                .open_read_with_idx(field, 1)
-                .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
-                .map(FastFieldReader::open)?;
+            let idx_reader = self.fast_field_reader_with_idx(field, 0)?;
+            let vals_reader = self.fast_field_reader_with_idx(field, 1)?;
             Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
         } else {
             Err(FastFieldNotAvailableError::new(field_entry))

@@ -26,13 +26,31 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
         }
     }
 
+    /// Returns `(start, stop)`, such that the values associated
+    /// to the given document are `start..stop`.
+    fn range(&self, doc: DocId) -> (u64, u64) {
+        let start = self.idx_reader.get(doc);
+        let stop = self.idx_reader.get(doc + 1);
+        (start, stop)
+    }
+
+    /// Returns the number of values associated to a given document.
+    pub fn num_vals(&self, doc: DocId) -> usize {
+        let (start, stop) = self.range(doc);
+        (stop - start) as usize
+    }
+
+    /// Returns the overall number of values associated to documents.
+    pub(crate) fn total_num_vals(&self) -> u64 {
+        self.idx_reader.max_value()
+    }
+
     /// Returns the array of values associated to the given `doc`.
     pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
-        let start = self.idx_reader.get(doc) as u32;
-        let stop = self.idx_reader.get(doc + 1) as u32;
+        let (start, stop) = self.range(doc);
         let len = (stop - start) as usize;
         vals.resize(len, Item::default());
-        self.vals_reader.get_range(start, &mut vals[..]);
+        self.vals_reader.get_range(start as u32, &mut vals[..]);
     }
 }

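The `range` / `num_vals` / `get_vals` helpers added above all read the two-column layout used for multi-valued fast fields: an `idx` column of strictly increasing offsets (one per document, plus a trailing end offset) and a `vals` column holding the concatenated values of all documents. A minimal standalone sketch of that layout, with plain `Vec`s standing in for the bitpacked readers (toy names, not tantivy's API):

```rust
/// Toy model of a multi-valued fast field:
/// `idx[doc]..idx[doc + 1]` delimits the slice of `vals` that belongs to `doc`.
struct ToyMultiValued {
    idx: Vec<u64>,  // len = max_doc + 1, strictly increasing
    vals: Vec<u64>, // concatenated values of all documents
}

impl ToyMultiValued {
    fn range(&self, doc: usize) -> (u64, u64) {
        (self.idx[doc], self.idx[doc + 1])
    }

    fn num_vals(&self, doc: usize) -> usize {
        let (start, stop) = self.range(doc);
        (stop - start) as usize
    }

    fn get_vals(&self, doc: usize, out: &mut Vec<u64>) {
        let (start, stop) = self.range(doc);
        out.clear();
        out.extend_from_slice(&self.vals[start as usize..stop as usize]);
    }
}

fn main() {
    // doc 0 -> [1, 2], doc 1 -> [], doc 2 -> [7]
    let ff = ToyMultiValued { idx: vec![0, 2, 2, 3], vals: vec![1, 2, 7] };
    let mut out = Vec::new();
    ff.get_vals(0, &mut out);
    assert_eq!(out, vec![1, 2]);
    assert_eq!(ff.num_vals(1), 0);
}
```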
@@ -6,6 +6,7 @@ use postings::UnorderedTermId;
 use schema::{Document, Field};
 use std::io;
 use itertools::Itertools;
+use termdict::TermOrdinal;
 
 pub struct MultiValueIntFastFieldWriter {
     field: Field,

@@ -66,7 +67,7 @@ impl MultiValueIntFastFieldWriter {
     pub fn serialize(
         &self,
         serializer: &mut FastFieldSerializer,
-        mapping_opt: Option<&HashMap<UnorderedTermId, usize>>,
+        mapping_opt: Option<&HashMap<UnorderedTermId, TermOrdinal>>,
     ) -> io::Result<()> {
         {
             // writing the offset index

@@ -90,13 +91,13 @@ impl MultiValueIntFastFieldWriter {
                     1,
                 )?;
                 for val in &self.vals {
-                    let remapped_val = *mapping.get(val).expect("Missing term ordinal") as u64;
+                    let remapped_val = *mapping.get(val).expect("Missing term ordinal");
                     value_serializer.add_val(remapped_val)?;
                 }
             }
             None => {
                 let val_min_max = self.vals.iter().cloned().minmax();
-                let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0));
+                let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0u64));
                 value_serializer =
                     serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
                 for &val in &self.vals {

@@ -71,6 +71,9 @@ impl<Item: FastValue> FastFieldReader<Item> {
     ///
     /// May panic if `start + output.len()` is greater than
     /// the segment's `maxdoc`.
+    ///
+    // TODO change start to `u64`.
+    // For multifastfield, start is an index in a second fastfield, not a `DocId`
     pub fn get_range(&self, start: u32, output: &mut [Item]) {
         let output_u64: &mut [u64] = unsafe { mem::transmute(output) };
         self.bit_unpacker.get_range(start, output_u64);

@@ -77,11 +77,21 @@ pub struct FastSingleFieldSerializer<'a, W: Write + 'a> {
 }
 
 impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
+
+    /// Creates a new fast field serializer.
+    ///
+    /// The serializer in fact encode the values by bitpacking
+    /// `(val - min_value)`.
+    ///
+    /// It requires a `min_value` and a `max_value` to compute
+    /// compute the minimum number of bits required to encode
+    /// values.
     fn open(
         write: &'a mut W,
         min_value: u64,
         max_value: u64,
     ) -> io::Result<FastSingleFieldSerializer<'a, W>> {
+        assert!(min_value <= max_value);
         min_value.serialize(write)?;
         let amplitude = max_value - min_value;
         amplitude.serialize(write)?;

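The new `assert!(min_value <= max_value)` guards the amplitude computation: values are stored as `val - min_value`, so the per-value bit width only depends on `amplitude = max_value - min_value`. A hedged, standalone sketch of that width computation (illustrative only, not tantivy's actual bitpacker code):

```rust
/// Number of bits needed to bitpack any value in `0..=amplitude`.
fn num_bits(amplitude: u64) -> u8 {
    (64 - amplitude.leading_zeros()) as u8
}

fn main() {
    assert_eq!(num_bits(0), 0); // all values equal min_value: 0 bits per value
    assert_eq!(num_bits(1), 1);
    assert_eq!(num_bits(255), 8);
    assert_eq!(num_bits(256), 9);

    // A serializer would then write (val - min_value) on `bits` bits.
    let (min_value, max_value) = (100u64, 356u64);
    let bits = num_bits(max_value - min_value);
    assert_eq!(bits, 9);
}
```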
@@ -9,6 +9,7 @@ use std::collections::HashMap;
 use postings::UnorderedTermId;
 use super::multivalued::MultiValueIntFastFieldWriter;
 use common::BinarySerializable;
+use termdict::TermOrdinal;
 
 /// The fastfieldswriter regroup all of the fast field writers.
 pub struct FastFieldsWriter {

@@ -105,7 +106,7 @@ impl FastFieldsWriter {
     pub fn serialize(
         &self,
         serializer: &mut FastFieldSerializer,
-        mapping: &HashMap<Field, HashMap<UnorderedTermId, usize>>,
+        mapping: &HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>,
     ) -> io::Result<()> {
         for field_writer in &self.single_value_writers {
             field_writer.serialize(serializer)?;

@@ -200,7 +200,6 @@ pub fn advance_deletes(
     target_opstamp: u64,
 ) -> Result<Option<FileProtection>> {
     let mut file_protect: Option<FileProtection> = None;
-
     {
         if let Some(previous_opstamp) = segment_entry.meta().delete_opstamp() {
             // We are already up-to-date here.

@@ -241,7 +240,6 @@ pub fn advance_deletes(
         }
     }
     segment_entry.set_meta(segment.meta().clone());
-
     Ok(file_protect)
 }
 

@@ -14,9 +14,14 @@ use termdict::TermMerger;
 use fastfield::FastFieldSerializer;
 use fastfield::FastFieldReader;
 use store::StoreWriter;
-use std::cmp::{max, min};
 use termdict::TermDictionary;
 use termdict::TermStreamer;
+use schema::FieldType;
+use termdict::TermOrdinal;
+use schema::Cardinality;
+use std::collections::HashMap;
+use fastfield::MultiValueIntFastFieldReader;
+use std::cmp;
 
 pub struct IndexMerger {
     schema: Schema,

@@ -60,6 +65,43 @@ fn extract_fast_field_reader(
     segment_reader.fast_field_reader(field).ok()
 }
 
+
+struct TermOrdinalMapping {
+    per_segment_new_term_ordinals: Vec<Vec<TermOrdinal>>
+}
+
+impl TermOrdinalMapping {
+    fn new(max_term_ords: Vec<TermOrdinal>) -> TermOrdinalMapping {
+        TermOrdinalMapping {
+            per_segment_new_term_ordinals: max_term_ords
+                .into_iter()
+                .map(|max_term_ord| vec![TermOrdinal::default(); max_term_ord as usize])
+                .collect()
+        }
+    }
+
+    fn register_from_to(&mut self,
+                        segment_ord: usize,
+                        from_ord: TermOrdinal,
+                        to_ord: TermOrdinal) {
+        self.per_segment_new_term_ordinals[segment_ord][from_ord as usize] = to_ord;
+    }
+
+    fn get_segment(&self, segment_ord: usize) -> &[TermOrdinal] {
+        &(self.per_segment_new_term_ordinals[segment_ord])[..]
+    }
+
+    fn max_term_ord(&self) -> TermOrdinal {
+        self.per_segment_new_term_ordinals
+            .iter()
+            .flat_map(|term_ordinals| {
+                term_ordinals.iter().cloned().max()
+            })
+            .max()
+            .unwrap_or(TermOrdinal::default())
+    }
+}
+
 struct DeltaComputer {
     buffer: Vec<u32>,
 }
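`TermOrdinalMapping` records, for every segment being merged, where each of that segment's term ordinals lands in the merged dictionary. A self-contained sketch of the same bookkeeping with plain vectors (toy types for illustration; the real struct above is filled in by the term merger):

```rust
type TermOrdinal = u64;

struct ToyMapping {
    per_segment: Vec<Vec<TermOrdinal>>, // per_segment[seg][old_ord] = new_ord
}

impl ToyMapping {
    fn new(num_terms_per_segment: &[usize]) -> ToyMapping {
        ToyMapping {
            per_segment: num_terms_per_segment.iter().map(|&n| vec![0; n]).collect(),
        }
    }

    fn register(&mut self, seg: usize, from: TermOrdinal, to: TermOrdinal) {
        self.per_segment[seg][from as usize] = to;
    }
}

fn main() {
    // Segment 0 holds terms [a, c]; segment 1 holds terms [b, c].
    // Merged dictionary orders them: a -> 0, b -> 1, c -> 2.
    let mut mapping = ToyMapping::new(&[2, 2]);
    mapping.register(0, 0, 0); // "a"
    mapping.register(1, 0, 1); // "b"
    mapping.register(0, 1, 2); // "c" seen in segment 0
    mapping.register(1, 1, 2); // "c" seen in segment 1
    assert_eq!(mapping.per_segment[0], vec![0, 2]);
    assert_eq!(mapping.per_segment[1], vec![1, 2]);
}
```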
@@ -103,204 +145,384 @@ impl IndexMerger {
     }
 
     fn write_fieldnorms(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
-        let fieldnorm_fastfields: Vec<Field> = self.schema
-            .fields()
-            .iter()
-            .enumerate()
-            .filter(|&(_, field_entry)| field_entry.is_indexed())
-            .map(|(field_id, _)| Field(field_id as u32))
-            .collect();
-        self.generic_write_fast_field(
-            fieldnorm_fastfields,
-            &extract_fieldnorm_reader,
-            fast_field_serializer,
-        )
-    }
-
-    fn write_fast_fields(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
-        let fast_fields: Vec<Field> = self.schema
-            .fields()
-            .iter()
-            .enumerate()
-            .filter(|&(_, field_entry)| field_entry.is_int_fast())
-            .map(|(field_id, _)| Field(field_id as u32))
-            .collect();
-        self.generic_write_fast_field(
-            fast_fields,
-            &extract_fast_field_reader,
-            fast_field_serializer,
-        )
-    }
-
-    // used both to merge field norms and regular u64 fast fields.
-    fn generic_write_fast_field(
-        &self,
-        fields: Vec<Field>,
-        field_reader_extractor: &Fn(&SegmentReader, Field) -> Option<FastFieldReader<u64>>,
-        fast_field_serializer: &mut FastFieldSerializer,
-    ) -> Result<()> {
-        for field in fields {
-            let mut u64_readers = vec![];
-            let mut min_val = u64::max_value();
-            let mut max_val = u64::min_value();
-
-            for reader in &self.readers {
-                match field_reader_extractor(reader, field) {
-                    Some(u64_reader) => {
-                        if let Some((seg_min_val, seg_max_val)) = compute_min_max_val(
-                            &u64_reader,
-                            reader.max_doc(),
-                            reader.delete_bitset(),
-                        ) {
-                            // the segment has some non-deleted documents
-                            min_val = min(min_val, seg_min_val);
-                            max_val = max(max_val, seg_max_val);
-                            u64_readers.push((
-                                reader.max_doc(),
-                                u64_reader,
-                                reader.delete_bitset(),
-                            ));
-                        }
-                    }
-                    None => {
-                        let error_msg =
-                            format!("Failed to find a u64_reader for field {:?}", field);
-                        error!("{}", error_msg);
-                        bail!(ErrorKind::SchemaError(error_msg));
-                    }
-                }
-            }
-
-            if u64_readers.is_empty() {
-                // we have actually zero documents.
-                min_val = 0;
-                max_val = 0;
-            }
-
-            assert!(min_val <= max_val);
-
-            let mut fast_single_field_serializer =
-                fast_field_serializer.new_u64_fast_field(field, min_val, max_val)?;
-            for (max_doc, u64_reader, delete_bitset) in u64_readers {
-                for doc_id in 0..max_doc {
-                    if !delete_bitset.is_deleted(doc_id) {
-                        let val = u64_reader.get(doc_id);
-                        fast_single_field_serializer.add_val(val)?;
-                    }
-                }
-            }
-
-            fast_single_field_serializer.close_field()?;
-        }
-        Ok(())
-    }
-
-    fn write_postings(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> {
-        let mut delta_computer = DeltaComputer::new();
-
-        let mut indexed_fields = vec![];
-        for (field_ord, field_entry) in self.schema.fields().iter().enumerate() {
-            if field_entry.is_indexed() {
-                indexed_fields.push(Field(field_ord as u32));
-            }
-        }
-
-        for indexed_field in indexed_fields {
-            let field_readers = self.readers
-                .iter()
-                .map(|reader| reader.inverted_index(indexed_field))
-                .collect::<Vec<_>>();
-
-            let field_term_streams = field_readers
-                .iter()
-                .map(|field_reader| field_reader.terms().stream())
-                .collect();
-
-            let mut merged_terms = TermMerger::new(field_term_streams);
-            let mut max_doc = 0;
-
-            // map from segment doc ids to the resulting merged segment doc id.
-            let mut merged_doc_id_map: Vec<Vec<Option<DocId>>> =
-                Vec::with_capacity(self.readers.len());
-
-            for reader in &self.readers {
-                let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize);
-                for doc_id in 0..reader.max_doc() {
-                    if reader.is_deleted(doc_id) {
-                        segment_local_map.push(None);
-                    } else {
-                        segment_local_map.push(Some(max_doc));
-                        max_doc += 1u32;
-                    }
-                }
-                merged_doc_id_map.push(segment_local_map);
-            }
-
-            // Create the total list of doc ids
-            // by stacking the doc ids from the different segment.
-            //
-            // In the new segments, the doc id from the different
-            // segment are stacked so that :
-            // - Segment 0's doc ids become doc id [0, seg.max_doc]
-            // - Segment 1's doc ids become [seg0.max_doc, seg0.max_doc + seg.max_doc]
-            // - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc,
-            //                               seg0.max_doc + seg1.max_doc + seg2.max_doc]
-            // ...
-            let mut field_serializer = serializer.new_field(indexed_field)?;
-
-            let field_entry = self.schema.get_field_entry(indexed_field);
-
-            // ... set segment postings option the new field.
-            let segment_postings_option =
-                field_entry.field_type().get_index_record_option().expect(
-                    "Encountered a field that is not supposed to be
-                     indexed. Have you modified the schema?",
-                );
-
-            while merged_terms.advance() {
-                let term_bytes: &[u8] = merged_terms.key();
-
-                // Let's compute the list of non-empty posting lists
-                let segment_postings: Vec<_> = merged_terms
-                    .current_kvs()
-                    .iter()
-                    .flat_map(|heap_item| {
-                        let segment_ord = heap_item.segment_ord;
-                        let term_info = heap_item.streamer.value();
-                        let segment_reader = &self.readers[heap_item.segment_ord];
-                        let inverted_index = segment_reader.inverted_index(indexed_field);
-                        let mut segment_postings = inverted_index
-                            .read_postings_from_terminfo(term_info, segment_postings_option);
-                        if segment_postings.advance() {
-                            Some((segment_ord, segment_postings))
-                        } else {
-                            None
-                        }
-                    })
-                    .collect();
-
-                // At this point, `segment_postings` contains the posting list
-                // of all of the segments containing the given term.
-                //
-                // These segments are non-empty and advance has already been called.
-
-                if !segment_postings.is_empty() {
-                    // If not, the `term` will be entirely removed.
-
-                    // We know that there is at least one document containing
-                    // the term, so we add it.
-                    field_serializer.new_term(term_bytes)?;
-
-                    // We can now serialize this postings, by pushing each document to the
-                    // postings serializer.
-                    for (segment_ord, mut segment_postings) in segment_postings {
-                        let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
-                        loop {
-                            // `.advance()` has been called once before the loop.
-                            // Hence we cannot use a `while segment_postings.advance()` loop.
-                            if let Some(remapped_doc_id) =
-                                old_to_new_doc_id[segment_postings.doc() as usize]
+        for (field_id, field_entry) in self.schema.fields().iter().enumerate() {
+            let field = Field(field_id as u32);
+            if let FieldType::Str(ref text_options) = *field_entry.field_type() {
+                if text_options.get_indexing_options().is_some() {
+                    self.write_single_fast_field(
+                        field,
+                        &extract_fieldnorm_reader,
+                        fast_field_serializer,
+                    )?;
+                }
+            }
+        }
+        Ok(())
+    }
+
+    fn write_fast_fields(&self,
+                         fast_field_serializer: &mut FastFieldSerializer,
+                         mut term_ord_mappings: HashMap<Field, TermOrdinalMapping>) -> Result<()> {
+        for (field_id, field_entry) in self.schema.fields().iter().enumerate() {
+            let field = Field(field_id as u32);
+            let field_type = field_entry.field_type();
+            match *field_type {
+                FieldType::HierarchicalFacet => {
+                    let term_ordinal_mapping = term_ord_mappings
+                        .remove(&field)
+                        .expect("Logic Error in Tantivy (Please report). HierarchicalFact field should have required a\
+                                 `term_ordinal_mapping`.");
+                    self.write_hierarchical_facet_field(
+                        field,
+                        term_ordinal_mapping,
+                        fast_field_serializer)?;
+                }
+                FieldType::U64(ref options) | FieldType::I64(ref options) => {
+                    match options.get_fastfield_cardinality() {
+                        Some(Cardinality::SingleValue) => {
+                            self.write_single_fast_field(
+                                field,
+                                &extract_fast_field_reader,
+                                fast_field_serializer
+                            )?;
+                        }
+                        Some(Cardinality::MultiValues) => {
+                            self.write_multi_fast_field(field, fast_field_serializer)?;
+                        }
+                        None => {}
+                    }
+                }
+                FieldType::Str(_) => {
+                    // We don't handle str fast field for the moment
+                    // They can be implemented using what is done
+                    // for facets in the future.
+                }
+            }
+        }
+        Ok(())
+    }
+
+    // used both to merge field norms, `u64/i64` single fast fields.
+    fn write_single_fast_field(
+        &self,
+        field: Field,
+        field_reader_extractor: &Fn(&SegmentReader, Field) -> Option<FastFieldReader<u64>>,
+        fast_field_serializer: &mut FastFieldSerializer,
+    ) -> Result<()> {
+        let mut u64_readers = vec![];
+        let mut min_value = u64::max_value();
+        let mut max_value = u64::min_value();
+
+        for reader in &self.readers {
+            match field_reader_extractor(reader, field) {
+                Some(u64_reader) => {
+                    if let Some((seg_min_val, seg_max_val)) = compute_min_max_val(
+                        &u64_reader,
+                        reader.max_doc(),
+                        reader.delete_bitset(),
+                    ) {
+                        // the segment has some non-deleted documents
+                        min_value = cmp::min(min_value, seg_min_val);
+                        max_value = cmp::max(max_value, seg_max_val);
+                        u64_readers.push((
+                            reader.max_doc(),
+                            u64_reader,
+                            reader.delete_bitset(),
+                        ));
+                    } else {
+                        // all documents have been deleted.
+                    }
+                }
+                None => {
+                    let error_msg =
+                        format!("Failed to find a u64_reader for field {:?}", field);
+                    bail!(ErrorKind::SchemaError(error_msg));
+                }
+            }
+        }
+
+        if min_value > max_value {
+            // There is not a single document remaining in the index.
+            min_value = 0;
+            max_value = 0;
+        }
+
+        let mut fast_single_field_serializer =
+            fast_field_serializer.new_u64_fast_field(field, min_value, max_value)?;
+        for (max_doc, u64_reader, delete_bitset) in u64_readers {
+            for doc_id in 0u32..max_doc {
+                if !delete_bitset.is_deleted(doc_id) {
+                    let val = u64_reader.get(doc_id);
+                    fast_single_field_serializer.add_val(val)?;
+                }
+            }
+        }
+
+        fast_single_field_serializer.close_field()?;
+        Ok(())
+    }
+
+    fn write_multi_fast_field_idx(&self,
+                                  field: Field,
+                                  fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
+        let mut total_num_vals = 0u64;
+
+        // In the first pass, we compute the total number of vals.
+        //
+        // This is required by the bitpacker, as it needs to know
+        // what should be the bit length use for bitpacking.
+        for reader in &self.readers {
+            let multi_ff_reader = reader.multi_fast_field_reader::<u64>(field)?;
+            let delete_bitset = reader.delete_bitset();
+            if delete_bitset.has_deletes() {
+                for doc in 0u32..reader.max_doc() {
+                    if !delete_bitset.is_deleted(doc) {
+                        total_num_vals += multi_ff_reader.num_vals(doc) as u64;
+                    }
+                }
+            } else {
+                total_num_vals += multi_ff_reader.total_num_vals();
+            }
+        }
+
+        // We can now create our `idx` serializer, and in a second pass,
+        // can effectively push the different indexes.
+        let mut serialize_idx = fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
+        let mut idx = 0;
+        for reader in &self.readers {
+            let multi_ff_reader = reader.multi_fast_field_reader::<u64>(field)?;
+            let delete_bitset = reader.delete_bitset();
+            for doc in 0u32..reader.max_doc() {
+                if !delete_bitset.is_deleted(doc) {
+                    serialize_idx.add_val(idx)?;
+                    idx += multi_ff_reader.num_vals(doc) as u64;
+                }
+            }
+        }
+        serialize_idx.add_val(idx)?;
+        serialize_idx.close_field()?;
+        Ok(())
+    }
+
+    fn write_hierarchical_facet_field(&self,
+                                      field: Field,
+                                      term_ordinal_mappings: TermOrdinalMapping,
+                                      fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
+        // Multifastfield consists in 2 fastfields.
+        // The first serves as an index into the second one and is stricly increasing.
+        // The second contains the actual values.
+
+        // First we merge the idx fast field.
+        self.write_multi_fast_field_idx(field, fast_field_serializer)?;
+
+        // We can now write the actual fast field values.
+        // In the case of hierarchical facets, they are actually term ordinals.
+        let max_term_ord = term_ordinal_mappings.max_term_ord();
+        {
+            let mut serialize_vals = fast_field_serializer.new_u64_fast_field_with_idx(field, 0u64, max_term_ord, 1)?;
+            let mut vals = Vec::with_capacity(100);
+            for (segment_ord, segment_reader) in self.readers.iter().enumerate() {
+                let delete_bitset = segment_reader.delete_bitset();
+                let term_ordinal_mapping: &[TermOrdinal] = term_ordinal_mappings.get_segment(segment_ord);
+                let ff_reader: MultiValueIntFastFieldReader<u64> = segment_reader.multi_fast_field_reader(field)?;
+                // TODO optimize if no deletes
+                for doc in 0..segment_reader.max_doc() {
+                    if !delete_bitset.is_deleted(doc) {
+                        ff_reader.get_vals(doc, &mut vals);
+                        for &prev_term_ord in &vals {
+                            let new_term_ord = term_ordinal_mapping[prev_term_ord as usize];
+                            serialize_vals.add_val(new_term_ord)?;
+                        }
+                    }
+                }
+            }
+            serialize_vals.close_field()?;
+        }
+        Ok(())
+    }
+
+    fn write_multi_fast_field(&self, field: Field, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
+        // Multifastfield consists in 2 fastfields.
+        // The first serves as an index into the second one and is stricly increasing.
+        // The second contains the actual values.
+
+        // First we merge the idx fast field.
+        self.write_multi_fast_field_idx(field, fast_field_serializer)?;
+
+        let mut min_value = u64::max_value();
+        let mut max_value = u64::min_value();
+
+        let mut vals = Vec::with_capacity(100);
+
+        // Our values are bitpacked and we need to know what should be
+        // our bitwidth and our minimum value before serializing any values.
+        //
+        // Computing those is non-trivial if some documents are deleted.
+        // We go through a complete first pass to compute the minimum and the
+        // maximum value and initialize our Serializer.
+        for reader in &self.readers {
+            let ff_reader: MultiValueIntFastFieldReader<u64> = reader.multi_fast_field_reader(field)?;
+            let delete_bitset = reader.delete_bitset();
+            for doc in 0u32..reader.max_doc() {
+                if !delete_bitset.is_deleted(doc) {
+                    ff_reader.get_vals(doc, &mut vals);
+                    for &val in &vals {
+                        min_value = cmp::min(val, min_value);
+                        max_value = cmp::max(val, max_value);
+                    }
+                }
+            }
+            // TODO optimize when no deletes
+        }
+
+        if min_value > max_value {
+            min_value = 0;
+            max_value = 0;
+        }
+
+        // We can now initialize our serializer, and push it the different values
+        {
+            let mut serialize_vals = fast_field_serializer.new_u64_fast_field_with_idx(field, min_value, max_value, 1)?;
+            for reader in &self.readers {
+                let delete_bitset = reader.delete_bitset();
+                let ff_reader: MultiValueIntFastFieldReader<u64> = reader.multi_fast_field_reader(field)?;
+                // TODO optimize if no deletes
+                for doc in 0..reader.max_doc() {
+                    if !delete_bitset.is_deleted(doc) {
+                        ff_reader.get_vals(doc, &mut vals);
+                        for &val in &vals {
+                            serialize_vals.add_val(val)?;
+                        }
+                    }
+                }
+            }
+            serialize_vals.close_field()?;
+        }
+        Ok(())
+    }
+
+    fn write_postings_for_field(&self,
+                                indexed_field: Field,
+                                field_type: &FieldType,
+                                serializer: &mut InvertedIndexSerializer) -> Result<Option<TermOrdinalMapping>> {
+        let mut delta_computer = DeltaComputer::new();
+        let field_readers = self.readers
+            .iter()
+            .map(|reader| reader.inverted_index(indexed_field))
+            .collect::<Vec<_>>();
+
+        let mut field_term_streams = Vec::new();
+        let mut max_term_ords: Vec<TermOrdinal> = Vec::new();
+
+        for field_reader in &field_readers {
+            let terms = field_reader.terms();
+            field_term_streams.push(terms.stream());
+            max_term_ords.push(terms.num_terms() as u64);
+        }
+
+        let mut term_ord_mapping_opt =
+            if *field_type == FieldType::HierarchicalFacet {
+                Some(TermOrdinalMapping::new(max_term_ords))
+            } else {
+                None
+            };
+
+        let mut merged_terms = TermMerger::new(field_term_streams);
+        let mut max_doc = 0;
+
+        // map from segment doc ids to the resulting merged segment doc id.
+        let mut merged_doc_id_map: Vec<Vec<Option<DocId>>> =
+            Vec::with_capacity(self.readers.len());
+
+        for reader in &self.readers {
+            let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize);
+            for doc_id in 0..reader.max_doc() {
+                if reader.is_deleted(doc_id) {
+                    segment_local_map.push(None);
+                } else {
+                    segment_local_map.push(Some(max_doc));
+                    max_doc += 1u32;
+                }
+            }
+            merged_doc_id_map.push(segment_local_map);
+        }
+
+        // Create the total list of doc ids
+        // by stacking the doc ids from the different segment.
+        //
+        // In the new segments, the doc id from the different
+        // segment are stacked so that :
+        // - Segment 0's doc ids become doc id [0, seg.max_doc]
+        // - Segment 1's doc ids become [seg0.max_doc, seg0.max_doc + seg.max_doc]
+        // - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc,
+        //                               seg0.max_doc + seg1.max_doc + seg2.max_doc]
+        // ...
+        let mut field_serializer = serializer.new_field(indexed_field)?;
+
+        let field_entry = self.schema.get_field_entry(indexed_field);
+
+        // ... set segment postings option the new field.
+        let segment_postings_option =
+            field_entry.field_type().get_index_record_option().expect(
+                "Encountered a field that is not supposed to be
+                 indexed. Have you modified the schema?",
+            );
+
+        while merged_terms.advance() {
+            let term_bytes: &[u8] = merged_terms.key();
+
+            // Let's compute the list of non-empty posting lists
+            let segment_postings: Vec<_> = merged_terms
+                .current_kvs()
+                .iter()
+                .flat_map(|heap_item| {
+                    let segment_ord = heap_item.segment_ord;
+                    let term_info = heap_item.streamer.value();
+                    let segment_reader = &self.readers[heap_item.segment_ord];
+                    let inverted_index = segment_reader.inverted_index(indexed_field);
+                    let mut segment_postings = inverted_index
+                        .read_postings_from_terminfo(term_info, segment_postings_option);
+                    if segment_postings.advance() {
+                        Some((segment_ord, segment_postings))
+                    } else {
+                        None
+                    }
+                })
+                .collect();
+
+            // At this point, `segment_postings` contains the posting list
+            // of all of the segments containing the given term.
+            //
+            // These segments are non-empty and advance has already been called.
+
+            if !segment_postings.is_empty() {
+                // If not, the `term` will be entirely removed.
+
+                // We know that there is at least one document containing
+                // the term, so we add it.
+                let to_term_ord = field_serializer.new_term(term_bytes)?;
+
+                if let Some(ref mut term_ord_mapping) = term_ord_mapping_opt {
+                    for (segment_ord, from_term_ord) in merged_terms.matching_segments() {
+                        term_ord_mapping.register_from_to(segment_ord, from_term_ord, to_term_ord);
+                    }
+                }
+
+                // We can now serialize this postings, by pushing each document to the
+                // postings serializer.
+                for (segment_ord, mut segment_postings) in segment_postings {
+                    let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
+                    loop {
+                        // `.advance()` has been called once before the loop.
+                        // Hence we cannot use a `while segment_postings.advance()` loop.
+                        if let Some(remapped_doc_id) =
+                            old_to_new_doc_id[segment_postings.doc() as usize]
                         {
                             // we make sure to only write the term iff
                             // there is at least one document.
@@ -313,20 +535,31 @@ impl IndexMerger {
                                 delta_positions,
                             )?;
                         }
                         if !segment_postings.advance() {
                             break;
                         }
                     }
                 }
 
                 // closing the term.
                 field_serializer.close_term()?;
             }
-            }
-
-            field_serializer.close()?;
         }
-        Ok(())
+        field_serializer.close()?;
+        Ok(term_ord_mapping_opt)
+    }
+
+    fn write_postings(&self, serializer: &mut InvertedIndexSerializer) -> Result<HashMap<Field, TermOrdinalMapping>> {
+        let mut term_ordinal_mappings = HashMap::new();
+        for (field_ord, field_entry) in self.schema.fields().iter().enumerate() {
+            if field_entry.is_indexed() {
+                let indexed_field = Field(field_ord as u32);
+                if let Some(term_ordinal_mapping) = self.write_postings_for_field(indexed_field, field_entry.field_type(), serializer)? {
+                    term_ordinal_mappings.insert(indexed_field, term_ordinal_mapping);
+                }
+            }
+        }
+        Ok(term_ordinal_mappings)
     }
 
     fn write_storable_fields(&self, store_writer: &mut StoreWriter) -> Result<()> {
@@ -349,9 +582,9 @@ impl IndexMerger {
 
 impl SerializableSegment for IndexMerger {
     fn write(&self, mut serializer: SegmentSerializer) -> Result<u32> {
-        self.write_postings(serializer.get_postings_serializer())?;
+        let term_ord_mappings = self.write_postings(serializer.get_postings_serializer())?;
         self.write_fieldnorms(serializer.get_fieldnorms_serializer())?;
-        self.write_fast_fields(serializer.get_fast_field_serializer())?;
+        self.write_fast_fields(serializer.get_fast_field_serializer(), term_ord_mappings)?;
         self.write_storable_fields(serializer.get_store_writer())?;
         serializer.close()?;
         Ok(self.max_doc)
@@ -375,6 +608,10 @@ mod tests {
     use schema::IndexRecordOption;
     use schema::Cardinality;
     use futures::Future;
+    use IndexWriter;
+    use query::AllQuery;
+    use collector::FacetCollector;
+    use schema::IntOptions;
 
     #[test]
    fn test_index_merger_no_deletes() {
@@ -805,4 +1042,297 @@ mod tests {
             assert_eq!(searcher.num_docs(), 0);
         }
     }
+
+    #[test]
+    fn test_merge_facets() {
+        let mut schema_builder = schema::SchemaBuilder::default();
+        let facet_field = schema_builder.add_facet_field("facet");
+        let index = Index::create_in_ram(schema_builder.build());
+        use schema::Facet;
+        {
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
+            let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
+                let mut doc = Document::default();
+                for facet in doc_facets {
+                    doc.add_facet(facet_field, Facet::from(facet));
+                }
+                index_writer.add_document(doc);
+            };
+
+            index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b"]);
+            index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b", "/top/c"]);
+            index_doc(&mut index_writer, &["/top/a", "/top/b"]);
+            index_doc(&mut index_writer, &["/top/a"]);
+
+            index_doc(&mut index_writer, &["/top/b", "/top/d"]);
+            index_doc(&mut index_writer, &["/top/d"]);
+            index_doc(&mut index_writer, &["/top/e"]);
+            index_writer.commit().expect("committed");
+
+            index_doc(&mut index_writer, &["/top/a"]);
+            index_doc(&mut index_writer, &["/top/b"]);
+            index_doc(&mut index_writer, &["/top/c"]);
+            index_writer.commit().expect("committed");
+
+            index_doc(&mut index_writer, &["/top/e", "/top/f"]);
+            index_writer.commit().expect("committed");
+        }
+        index.load_searchers().unwrap();
+        let test_searcher = |expected_num_docs: usize, expected: &[(&str, u64)]| {
+            let searcher = index.searcher();
+            let mut facet_collector = FacetCollector::for_field(facet_field);
+            facet_collector.add_facet(Facet::from("/top"));
+            use collector::{MultiCollector, CountCollector};
+            let mut count_collector = CountCollector::default();
+            {
+                let mut multi_collectors = MultiCollector::from(vec![&mut count_collector, &mut facet_collector]);
+                searcher.search(&AllQuery, &mut multi_collectors).unwrap();
+            }
+            assert_eq!(count_collector.count(), expected_num_docs);
+            let facet_counts = facet_collector.harvest();
+            let facets: Vec<(String, u64)> = facet_counts.get("/top")
+                .map(|(facet, count)| (facet.to_string(), count))
+                .collect();
+            assert_eq!(
+                facets,
+                expected
+                    .iter()
+                    .map(|&(facet_str, count)| (String::from(facet_str), count))
+                    .collect::<Vec<_>>()
+            );
+        };
+        test_searcher(11, &[
+            ("/top/a", 5),
+            ("/top/b", 5),
+            ("/top/c", 2),
+            ("/top/d", 2),
+            ("/top/e", 2),
+            ("/top/f", 1)
+        ]);
+
+        // Merging the segments
+        {
+            let segment_ids = index
+                .searchable_segment_ids()
+                .expect("Searchable segments failed.");
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
+            index_writer
+                .merge(&segment_ids)
+                .wait()
+                .expect("Merging failed");
+            index_writer.wait_merging_threads().unwrap();
+
+            index.load_searchers().unwrap();
+            test_searcher(11, &[
+                ("/top/a", 5),
+                ("/top/b", 5),
+                ("/top/c", 2),
+                ("/top/d", 2),
+                ("/top/e", 2),
+                ("/top/f", 1)
+            ]);
+        }
+
+        // Deleting one term
+        {
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
+            let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
+            let facet_term = Term::from_facet(facet_field, &facet);
+            index_writer.delete_term(facet_term);
+            index_writer.commit().unwrap();
+            index.load_searchers().unwrap();
+            test_searcher(9, &[
+                ("/top/a", 3),
+                ("/top/b", 3),
+                ("/top/c", 1),
+                ("/top/d", 2),
+                ("/top/e", 2),
+                ("/top/f", 1)
+            ]);
+        }
+    }
+
+    #[test]
+    fn test_merge_multivalued_int_fields_all_deleted() {
+        let mut schema_builder = schema::SchemaBuilder::default();
+        let int_options = IntOptions::default()
+            .set_fast(Cardinality::MultiValues)
+            .set_indexed();
+        let int_field = schema_builder.add_u64_field("intvals", int_options);
+        let index = Index::create_in_ram(schema_builder.build());
+
+        {
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
+            let mut doc = Document::default();
+            doc.add_u64(int_field, 1);
+            index_writer.add_document(doc.clone());
+            index_writer.commit().expect("commit failed");
+            index_writer.add_document(doc);
+            index_writer.commit().expect("commit failed");
+            index_writer.delete_term(Term::from_field_u64(int_field, 1));
+            index_writer.commit().expect("commit failed");
+        }
+        index.load_searchers().unwrap();
+        let searcher = index.searcher();
+        assert_eq!(searcher.num_docs(), 0);
+        // Merging the segments
+        {
+            let segment_ids = index
+                .searchable_segment_ids()
+                .expect("Searchable segments failed.");
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
+            index_writer
+                .merge(&segment_ids)
+                .wait()
+                .expect("Merging failed");
+            index_writer.wait_merging_threads().unwrap();
+        }
+        index.load_searchers().unwrap();
+        let searcher = index.searcher();
+        assert_eq!(searcher.num_docs(), 0);
+    }
+
+    #[test]
+    fn test_merge_multivalued_int_fields() {
+        let mut schema_builder = schema::SchemaBuilder::default();
+        let int_options = IntOptions::default()
+            .set_fast(Cardinality::MultiValues)
+            .set_indexed();
+        let int_field = schema_builder.add_u64_field("intvals", int_options);
+        let index = Index::create_in_ram(schema_builder.build());
+
+        {
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
+            let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
+                let mut doc = Document::default();
+                for &val in int_vals {
+                    doc.add_u64(int_field, val);
+                }
+                index_writer.add_document(doc);
+            };
+
+            index_doc(&mut index_writer, &[1, 2]);
+            index_doc(&mut index_writer, &[1, 2, 3]);
+            index_doc(&mut index_writer, &[4, 5]);
+            index_doc(&mut index_writer, &[1, 2]);
+            index_doc(&mut index_writer, &[1, 5]);
+            index_doc(&mut index_writer, &[3]);
+            index_doc(&mut index_writer, &[17]);
+            index_writer.commit().expect("committed");
+
+            index_doc(&mut index_writer, &[20]);
+            index_writer.commit().expect("committed");
+
+            index_doc(&mut index_writer, &[28, 27]);
+            index_doc(&mut index_writer, &[1_000]);
+
+            index_writer.commit().expect("committed");
+        }
+        index.load_searchers().unwrap();
+
+        let searcher = index.searcher();
+
+        let mut vals: Vec<u64> = Vec::new();
+
+        {
+            let segment = searcher.segment_reader(0u32);
+            let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
+
+            ff_reader.get_vals(0, &mut vals);
+            assert_eq!(&vals, &[1, 2]);
+
+            ff_reader.get_vals(1, &mut vals);
+            assert_eq!(&vals, &[1, 2, 3]);
+
+            ff_reader.get_vals(2, &mut vals);
+            assert_eq!(&vals, &[4, 5]);
+
+            ff_reader.get_vals(3, &mut vals);
+            assert_eq!(&vals, &[1, 2]);
+
+            ff_reader.get_vals(4, &mut vals);
+            assert_eq!(&vals, &[1, 5]);
+
+            ff_reader.get_vals(5, &mut vals);
+            assert_eq!(&vals, &[3]);
+
+            ff_reader.get_vals(6, &mut vals);
+            assert_eq!(&vals, &[17]);
+        }
+
+        {
+            let segment = searcher.segment_reader(1u32);
+            let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
+            ff_reader.get_vals(0, &mut vals);
+            assert_eq!(&vals, &[20]);
+        }
+
+        {
+            let segment = searcher.segment_reader(2u32);
+            let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
+            ff_reader.get_vals(0, &mut vals);
+            assert_eq!(&vals, &[28, 27]);
+
+            ff_reader.get_vals(1, &mut vals);
+            assert_eq!(&vals, &[1_000]);
+        }
+
+        // Merging the segments
+        {
+            let segment_ids = index
+                .searchable_segment_ids()
+                .expect("Searchable segments failed.");
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
+            index_writer
+                .merge(&segment_ids)
+                .wait()
+                .expect("Merging failed");
+            index_writer.wait_merging_threads().unwrap();
+        }
+
+        index.load_searchers().unwrap();
+
+        {
+            let searcher = index.searcher();
+            let segment = searcher.segment_reader(0u32);
+            let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
+
+            ff_reader.get_vals(0, &mut vals);
+            assert_eq!(&vals, &[1, 2]);
+
+            ff_reader.get_vals(1, &mut vals);
+            assert_eq!(&vals, &[1, 2, 3]);
+
+            ff_reader.get_vals(2, &mut vals);
+            assert_eq!(&vals, &[4, 5]);
+
+            ff_reader.get_vals(3, &mut vals);
+            assert_eq!(&vals, &[1, 2]);
+
+            ff_reader.get_vals(4, &mut vals);
+            assert_eq!(&vals, &[1, 5]);
+
+            ff_reader.get_vals(5, &mut vals);
+            assert_eq!(&vals, &[3]);
+
+            ff_reader.get_vals(6, &mut vals);
+            assert_eq!(&vals, &[17]);
+
+            ff_reader.get_vals(7, &mut vals);
+            assert_eq!(&vals, &[20]);
+
+            ff_reader.get_vals(8, &mut vals);
+            assert_eq!(&vals, &[28, 27]);
+
+            ff_reader.get_vals(9, &mut vals);
+            assert_eq!(&vals, &[1_000]);
+        }
+    }
 }
 
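The merger comments above spell out how doc ids are stacked segment after segment while deleted documents are skipped. A standalone sketch of that remapping rule (hypothetical helper, same idea as the `merged_doc_id_map` built in the diff):

```rust
/// For each segment, map old doc ids to merged doc ids,
/// pushing None for deleted documents.
fn build_doc_id_map(segments: &[Vec<bool /* is_deleted */>]) -> Vec<Vec<Option<u32>>> {
    let mut next_doc = 0u32;
    let mut result = Vec::with_capacity(segments.len());
    for deleted in segments {
        let mut local = Vec::with_capacity(deleted.len());
        for &is_deleted in deleted {
            if is_deleted {
                local.push(None);
            } else {
                local.push(Some(next_doc));
                next_doc += 1;
            }
        }
        result.push(local);
    }
    result
}

fn main() {
    // Segment 0: 3 docs, doc 1 deleted. Segment 1: 2 docs, none deleted.
    let map = build_doc_id_map(&[vec![false, true, false], vec![false, false]]);
    assert_eq!(map[0], vec![Some(0), None, Some(1)]);
    assert_eq!(map[1], vec![Some(2), Some(3)]);
}
```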
@@ -16,6 +16,7 @@ use tokenizer::Token;
 use tokenizer::TokenStream;
 use schema::IndexRecordOption;
 use postings::UnorderedTermId;
+use termdict::TermOrdinal;
 
 fn posting_from_field_entry<'a>(
     field_entry: &FieldEntry,

@@ -44,6 +45,7 @@ fn posting_from_field_entry<'a>(
 
 pub struct MultiFieldPostingsWriter<'a> {
     heap: &'a Heap,
+    schema: Schema,
     term_index: TermHashMap<'a>,
     per_field_postings_writers: Vec<Box<PostingsWriter + 'a>>,
 }

@@ -58,8 +60,8 @@ impl<'a> MultiFieldPostingsWriter<'a> {
             .iter()
             .map(|field_entry| posting_from_field_entry(field_entry, heap))
             .collect();
 
         MultiFieldPostingsWriter {
+            schema: schema.clone(),
             heap,
             term_index,
             per_field_postings_writers,

@@ -83,7 +85,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
     pub fn serialize(
         &self,
         serializer: &mut InvertedIndexSerializer,
-    ) -> Result<HashMap<Field, HashMap<UnorderedTermId, usize>>> {
+    ) -> Result<HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>> {
         let mut term_offsets: Vec<(&[u8], u32, UnorderedTermId)> = self.term_index.iter().collect();
         term_offsets.sort_by_key(|&(k, _, _)| k);
 
@@ -94,7 +96,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
             .map(|(key, _, _)| Term::wrap(key).field())
             .enumerate();
 
-        let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, usize>> =
+        let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>> =
             HashMap::new();
 
         let mut prev_field = Field(u32::max_value());

@@ -110,17 +112,23 @@ impl<'a> MultiFieldPostingsWriter<'a> {
             let (field, start) = offsets[i];
             let (_, stop) = offsets[i + 1];
 
-            // populating the unordered term ord -> ordered term ord mapping
-            // for the field.
-            let mut mapping = HashMap::new();
-            for (term_ord, term_unord_id) in term_offsets[start..stop]
-                .iter()
-                .map(|&(_, _, bucket)| bucket)
-                .enumerate()
-            {
-                mapping.insert(term_unord_id, term_ord);
+            let field_entry = self.schema.get_field_entry(field);
+
+            match field_entry.field_type() {
+                FieldType::Str(_) | FieldType::HierarchicalFacet => {
+                    // populating the (unordered term ord) -> (ordered term ord) mapping
+                    // for the field.
+                    let mut unordered_term_ids = term_offsets[start..stop]
+                        .iter()
+                        .map(|&(_, _, bucket)| bucket);
+                    let mut mapping: HashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
+                        .enumerate()
+                        .map(|(term_ord, unord_term_id)| (unord_term_id as UnorderedTermId, term_ord as TermOrdinal))
+                        .collect();
+                    unordered_term_mappings.insert(field, mapping);
+                }
+                FieldType::U64(_) | FieldType::I64(_) => {}
             }
-            unordered_term_mappings.insert(field, mapping);
 
             let postings_writer = &self.per_field_postings_writers[field.0 as usize];
             let mut field_serializer = serializer.new_field(field)?;

@@ -13,7 +13,7 @@ use std::io::{self, Write};
|
|||||||
use compression::VIntEncoder;
|
use compression::VIntEncoder;
|
||||||
use common::CountingWriter;
|
use common::CountingWriter;
|
||||||
use common::CompositeWrite;
|
use common::CompositeWrite;
|
||||||
use termdict::TermDictionaryBuilder;
|
use termdict::{TermOrdinal, TermDictionaryBuilder};
|
||||||
|
|
||||||
/// `PostingsSerializer` is in charge of serializing
|
/// `PostingsSerializer` is in charge of serializing
|
||||||
/// postings on disk, in the
|
/// postings on disk, in the
|
||||||
@@ -114,6 +114,7 @@ pub struct FieldSerializer<'a> {
|
|||||||
positions_serializer_opt: Option<PositionSerializer<&'a mut CountingWriter<WritePtr>>>,
|
positions_serializer_opt: Option<PositionSerializer<&'a mut CountingWriter<WritePtr>>>,
|
||||||
current_term_info: TermInfo,
|
current_term_info: TermInfo,
|
||||||
term_open: bool,
|
term_open: bool,
|
||||||
|
num_terms: TermOrdinal,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> FieldSerializer<'a> {
|
impl<'a> FieldSerializer<'a> {
|
||||||
@@ -152,6 +153,7 @@ impl<'a> FieldSerializer<'a> {
|
|||||||
positions_serializer_opt,
|
positions_serializer_opt,
|
||||||
current_term_info: TermInfo::default(),
|
current_term_info: TermInfo::default(),
|
||||||
term_open: false,
|
term_open: false,
|
||||||
|
num_terms: TermOrdinal::default(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -172,7 +174,7 @@ impl<'a> FieldSerializer<'a> {
|
|||||||
/// * term - the term. It needs to come after the previous term according
|
/// * term - the term. It needs to come after the previous term according
|
||||||
/// to the lexicographical order.
|
/// to the lexicographical order.
|
||||||
/// * doc_freq - return the number of document containing the term.
|
/// * doc_freq - return the number of document containing the term.
|
||||||
pub fn new_term(&mut self, term: &[u8]) -> io::Result<()> {
|
pub fn new_term(&mut self, term: &[u8]) -> io::Result<TermOrdinal> {
|
||||||
assert!(
|
assert!(
|
||||||
!self.term_open,
|
!self.term_open,
|
||||||
"Called new_term, while the previous term was not closed."
|
"Called new_term, while the previous term was not closed."
|
||||||
@@ -180,7 +182,10 @@ impl<'a> FieldSerializer<'a> {
         self.term_open = true;
         self.postings_serializer.clear();
         self.current_term_info = self.current_term_info();
-        self.term_dictionary_builder.insert_key(term)
+        self.term_dictionary_builder.insert_key(term)?;
+        let term_ordinal = self.num_terms;
+        self.num_terms += 1;
+        Ok(term_ordinal)
     }

     /// Serialize the information that a document contains the current term,
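Note: with this change, each call to `new_term` hands back the ordinal of the freshly inserted term (0, 1, 2, … in insertion order). A hypothetical caller could record those ordinals as it serializes, roughly as in the sketch below; the `FieldSerializer` here is a toy stand-in, not tantivy's type:

```rust
// Toy stand-in: it only counts how many terms were opened and returns
// each term's ordinal, mirroring the num_terms counter in the hunk above.
struct FieldSerializer {
    num_terms: u64,
}

impl FieldSerializer {
    fn new_term(&mut self, _term: &[u8]) -> std::io::Result<u64> {
        let term_ordinal = self.num_terms;
        self.num_terms += 1;
        Ok(term_ordinal)
    }
}

fn main() -> std::io::Result<()> {
    let mut serializer = FieldSerializer { num_terms: 0 };
    let mut ordinals = Vec::new();
    for term in ["apple", "mango", "zebra"] {
        ordinals.push(serializer.new_term(term.as_bytes())?);
    }
    assert_eq!(ordinals, vec![0, 1, 2]);
    Ok(())
}
```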
@@ -7,6 +7,7 @@ use Result;
 use Score;
 use DocId;
 use core::Searcher;
+use fastfield::DeleteBitSet;

 /// Query that matches all of the documents.
 ///
@@ -26,28 +27,52 @@ pub struct AllWeight;
 impl Weight for AllWeight {
     fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
         Ok(box AllScorer {
-            started: false,
+            state: State::NotStarted,
             doc: 0u32,
             max_doc: reader.max_doc(),
+            deleted_bitset: reader.delete_bitset().clone()
         })
     }
 }

+enum State {
+    NotStarted,
+    Started,
+    Finished
+}
+
 /// Scorer associated to the `AllQuery` query.
 pub struct AllScorer {
-    started: bool,
+    state: State,
     doc: DocId,
     max_doc: DocId,
+    deleted_bitset: DeleteBitSet
 }

 impl DocSet for AllScorer {
     fn advance(&mut self) -> bool {
-        if self.started {
-            self.doc += 1u32;
-        } else {
-            self.started = true;
+        loop {
+            match self.state {
+                State::NotStarted => {
+                    self.state = State::Started;
+                    self.doc = 0;
+                }
+                State::Started => {
+                    self.doc += 1u32;
+                }
+                State::Finished => {
+                    return false;
+                }
+            }
+            if self.doc < self.max_doc {
+                if !self.deleted_bitset.is_deleted(self.doc) {
+                    return true;
+                }
+            } else {
+                self.state = State::Finished;
+                return false;
+            }
         }
-        self.doc < self.max_doc
     }

     fn doc(&self) -> DocId {
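Note: the rewritten `advance` turns `AllScorer` into a small state machine so that deleted documents are skipped instead of being returned. The same control flow can be sketched independently of tantivy, with a plain boolean slice standing in for `DeleteBitSet`:

```rust
enum State { NotStarted, Started, Finished }

struct AllDocs<'a> {
    state: State,
    doc: u32,
    max_doc: u32,
    deleted: &'a [bool], // stand-in for tantivy's DeleteBitSet
}

impl<'a> AllDocs<'a> {
    fn advance(&mut self) -> bool {
        loop {
            match self.state {
                State::NotStarted => {
                    self.state = State::Started;
                    self.doc = 0;
                }
                State::Started => {
                    self.doc += 1;
                }
                State::Finished => return false,
            }
            if self.doc < self.max_doc {
                if !self.deleted[self.doc as usize] {
                    return true; // live document found
                }
                // deleted: loop again and try the next doc id
            } else {
                self.state = State::Finished;
                return false;
            }
        }
    }
}

fn main() {
    let deleted = [false, true, false, true];
    let mut docs = AllDocs { state: State::NotStarted, doc: 0, max_doc: 4, deleted: &deleted[..] };
    let mut seen = Vec::new();
    while docs.advance() {
        seen.push(docs.doc);
    }
    assert_eq!(seen, vec![0, 2]); // deleted docs 1 and 3 are skipped
}
```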
@@ -212,6 +212,14 @@ mod tests {
         assert!(Facet::root().is_root());
     }

+    #[test]
+    fn test_from_path() {
+        assert_eq!(
+            Facet::from_path(vec!["top", "a", "firstdoc"]),
+            Facet::from("/top/a/firstdoc")
+        );
+    }
+
     #[test]
     fn test_facet_display() {
         {
@@ -4,6 +4,7 @@ use common;
 use byteorder::{BigEndian, ByteOrder};
 use super::Field;
 use std::str;
+use schema::Facet;

 /// Size (in bytes) of the buffer of a int field.
 const INT_TERM_LEN: usize = 4 + 8;
@@ -29,6 +30,16 @@ impl Term {
         Term::from_field_u64(field, val_u64)
     }

+    /// Creates a `Term` given a facet.
+    pub fn from_facet(field: Field, facet: &Facet) -> Term {
+        let bytes = facet.encoded_bytes();
+        let buffer = Vec::with_capacity(4 + bytes.len());
+        let mut term = Term(buffer);
+        term.set_field(field);
+        term.set_bytes(bytes);
+        term
+    }
+
     /// Builds a term given a field, and a string value
     ///
     /// Assuming the term has a field id of 2, and a text value of "abc",
@@ -91,10 +102,14 @@ impl Term {
         self.set_u64(common::i64_to_u64(val));
     }

+    fn set_bytes(&mut self, bytes: &[u8]) {
+        self.0.resize(4, 0u8);
+        self.0.extend(bytes);
+    }
+
     /// Set the texts only, keeping the field untouched.
     pub fn set_text(&mut self, text: &str) {
-        self.0.resize(4, 0u8);
-        self.0.extend(text.as_bytes());
+        self.set_bytes(text.as_bytes());
     }
 }

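Note: `from_facet` above and the new `set_bytes` helper both rely on the same buffer layout: the first 4 bytes of a `Term` hold the field id, and everything after them is the value (text bytes or a facet's encoded bytes). A minimal sketch of that layout, assuming a big-endian field header and using a bare `Vec<u8>` rather than tantivy's `Term` wrapper:

```rust
// Reserve 4 bytes for the field id, then append the raw value bytes.
fn make_term(field: u32, value: &[u8]) -> Vec<u8> {
    let mut buffer = Vec::with_capacity(4 + value.len());
    buffer.extend_from_slice(&field.to_be_bytes()); // field header (big-endian by assumption)
    buffer.extend_from_slice(value);                // payload, e.g. a facet's encoded bytes
    buffer
}

fn main() {
    let term = make_term(2, b"/top/a/firstdoc");
    assert_eq!(&term[..4], &[0, 0, 0, 2][..]);            // field id
    assert_eq!(&term[4..], &b"/top/a/firstdoc"[..]);      // value bytes
}
```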
@@ -86,4 +86,8 @@ impl<'a> TermStreamer for TermStreamerImpl<'a> {
     fn value(&self) -> &TermInfo {
         &self.current_value
     }
+
+    fn max_term_ord(&self) -> TermOrdinal {
+        self.fst_map.num_terms() as TermOrdinal
+    }
 }
@@ -2,6 +2,7 @@ use std::collections::BinaryHeap;
 use termdict::TermStreamerImpl;
 use std::cmp::Ordering;
 use termdict::TermStreamer;
+use termdict::TermOrdinal;
 use schema::Term;

 pub struct HeapItem<'a> {
@@ -29,6 +30,7 @@ impl<'a> Ord for HeapItem<'a> {
     }
 }

+
 /// Given a list of sorted term streams,
 /// returns an iterator over sorted unique terms.
 ///
@@ -43,8 +45,6 @@ pub struct TermMerger<'a> {

 impl<'a> TermMerger<'a> {
     /// Stream of merged term dictionary
-    ///
-    ///
     pub fn new(streams: Vec<TermStreamerImpl<'a>>) -> TermMerger<'a> {
         TermMerger {
             heap: BinaryHeap::new(),
@@ -59,6 +59,14 @@ impl<'a> TermMerger<'a> {
         }
     }

+    pub(crate) fn matching_segments<'b: 'a>(&'b self) -> Box<'b + Iterator<Item=(usize, TermOrdinal)>> {
+        Box::new(self.current_streamers
+            .iter()
+            .map(|heap_item| {
+                (heap_item.segment_ord, heap_item.streamer.term_ord())
+            }))
+    }
+
     fn advance_segments(&mut self) {
         let streamers = &mut self.current_streamers;
         let heap = &mut self.heap;
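Note: `matching_segments` lets a caller ask, for the term currently at the head of the merge, which segments contain it and at which term ordinal inside each segment. The bookkeeping it exposes can be sketched with plain sorted vectors instead of tantivy's term streams; the function below is hypothetical and only mirrors the (segment ordinal, term ordinal) pairing:

```rust
// Each segment contributes a sorted term list; a term's ordinal within a
// segment is simply its index in that sorted list.
fn matching_segments(segments: &[Vec<&str>], term: &str) -> Vec<(usize, u64)> {
    segments
        .iter()
        .enumerate()
        .filter_map(|(segment_ord, terms)| {
            terms
                .binary_search(&term)
                .ok()
                .map(|term_ord| (segment_ord, term_ord as u64))
        })
        .collect()
}

fn main() {
    let segments = vec![
        vec!["apple", "mango", "zebra"],
        vec!["mango", "pear"],
    ];
    // "mango" is term ordinal 1 in segment 0 and ordinal 0 in segment 1.
    assert_eq!(matching_segments(&segments, "mango"), vec![(0, 1), (1, 0)]);
    assert_eq!(matching_segments(&segments, "apple"), vec![(0, 0)]);
}
```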
@@ -197,6 +197,13 @@ pub trait TermStreamer: Sized {
             None
         }
     }
+
+
+    /// Returns an upperbound for term ordinals in this stream.
+    ///
+    /// All term ordinals are guaranteed to be stricly smaller
+    /// than the result of `.max_term_ord()`
+    fn max_term_ord(&self) -> TermOrdinal;
 }

 /// `TermStreamerBuilder` is an helper object used to define
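Note: because every ordinal produced by a stream is strictly below `max_term_ord()`, that value can be used to size per-term buffers up front. A hedged sketch of that pattern with a toy trait (not tantivy's `TermStreamer`):

```rust
trait TermStream {
    /// Upper bound: all ordinals yielded are strictly smaller than this.
    fn max_term_ord(&self) -> u64;
    fn next_ord(&mut self) -> Option<u64>;
}

fn count_terms(stream: &mut dyn TermStream) -> Vec<u32> {
    // max_term_ord() bounds the ordinal space, so a flat Vec can be indexed
    // directly by ordinal without resizing while streaming.
    let mut counts = vec![0u32; stream.max_term_ord() as usize];
    while let Some(ord) = stream.next_ord() {
        counts[ord as usize] += 1;
    }
    counts
}

struct VecStream { ords: Vec<u64>, pos: usize, max: u64 }

impl TermStream for VecStream {
    fn max_term_ord(&self) -> u64 { self.max }
    fn next_ord(&mut self) -> Option<u64> {
        let ord = self.ords.get(self.pos).copied();
        self.pos += 1;
        ord
    }
}

fn main() {
    let mut stream = VecStream { ords: vec![0, 2, 2, 1], pos: 0, max: 3 };
    assert_eq!(count_terms(&mut stream), vec![1, 1, 2]);
}
```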
@@ -6,21 +6,21 @@ use tokenizer::TokenStreamChain;

 /// Token
 pub struct Token {
     /// Offset (byte index) of the first character of the token.
     /// Offsets shall not be modified by token filters.
     pub offset_from: usize,
     /// Offset (byte index) of the last character of the token + 1.
     /// The text that generated the token should be obtained by
     /// &text[token.offset_from..token.offset_to]
     pub offset_to: usize,
     /// Position, expressed in number of tokens.
     pub position: usize,
     /// Actual text content of the token.
     pub text: String,
 }

 impl Default for Token {
     fn default() -> Token {
         Token {
             offset_from: 0,
             offset_to: 0,