Integrating #274, #280, #289 into master (#290)

* Integrating bugfixes into master

Closes #274
Closes #280
Closes #289

* Next version will be 0.6
Paul Masurel
2018-05-06 09:48:25 -07:00
committed by GitHub
parent ca74c14647
commit 99c0b84036
19 changed files with 966 additions and 291 deletions

View File

@@ -1,11 +1,18 @@
Tantivy 0.5.2
Tantivy 0.6
==========================
- Removed C code. Tantivy is now pure Rust.
- BM25 scoring
- Approximate field norms encoded over 1 byte.
- Compiles on stable Rust.
Tantivy 0.5.2
===========================
- bugfix #274
- bugfix #280
- bugfix #289
Tantivy 0.5.1
==========================
- bugfix #254: tantivy failed if no documents in a segment contained a specific field.

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.5.2"
version = "0.6.0-dev"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]

View File

@@ -119,6 +119,19 @@ impl SegmentReader {
}
}
pub(crate) fn fast_field_reader_with_idx<Item: FastValue>(
&self,
field: Field,
idx: usize
) -> fastfield::Result<FastFieldReader<Item>> {
if let Some(ff_source) = self.fast_fields_composite.open_read_with_idx(field, idx) {
Ok(FastFieldReader::open(ff_source))
} else {
let field_entry = self.schema.get_field_entry(field);
Err(FastFieldNotAvailableError::new(field_entry))
}
}
/// Accessor to the `MultiValueIntFastFieldReader` associated to a given `Field`.
/// May panic if the field is not a multivalued fastfield of type `Item`.
pub fn multi_fast_field_reader<Item: FastValue>(
@@ -128,14 +141,8 @@ impl SegmentReader {
let field_entry = self.schema.get_field_entry(field);
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::MultiValues)
{
let idx_reader = self.fast_fields_composite
.open_read_with_idx(field, 0)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
.map(FastFieldReader::open)?;
let vals_reader = self.fast_fields_composite
.open_read_with_idx(field, 1)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
.map(FastFieldReader::open)?;
let idx_reader = self.fast_field_reader_with_idx(field, 0)?;
let vals_reader = self.fast_field_reader_with_idx(field, 1)?;
Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
} else {
Err(FastFieldNotAvailableError::new(field_entry))

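The refactor above extracts the "open the composite fast field source at index `idx`, or report the field as unavailable" step into a single `fast_field_reader_with_idx` helper, so `multi_fast_field_reader` opens its offset reader (idx 0) and its value reader (idx 1) with two `?` calls. A minimal standalone sketch of the same factoring, with illustrative names rather than tantivy's real types:

use std::collections::HashMap;

// Illustrative stand-ins: tantivy's real code opens CompositeFile sources and
// wraps them in FastFieldReader values; plain Vec<u64> columns are enough to
// show the refactor.
struct FastFieldNotAvailable(&'static str);

struct SegmentReaderSketch {
    // (field name, idx within the composite fast field file) -> decoded column
    fast_fields: HashMap<(&'static str, usize), Vec<u64>>,
}

impl SegmentReaderSketch {
    fn open_read_with_idx(&self, field: &'static str, idx: usize) -> Option<&Vec<u64>> {
        self.fast_fields.get(&(field, idx))
    }

    // The extracted helper: the single place that turns `None` into an error.
    fn fast_field_reader_with_idx(
        &self,
        field: &'static str,
        idx: usize,
    ) -> Result<&Vec<u64>, FastFieldNotAvailable> {
        self.open_read_with_idx(field, idx)
            .ok_or(FastFieldNotAvailable(field))
    }

    // The multivalued accessor is now two `?` calls instead of two duplicated
    // `ok_or_else(..).map(..)` chains.
    fn multi_fast_field_reader(
        &self,
        field: &'static str,
    ) -> Result<(&Vec<u64>, &Vec<u64>), FastFieldNotAvailable> {
        let idx_reader = self.fast_field_reader_with_idx(field, 0)?;
        let vals_reader = self.fast_field_reader_with_idx(field, 1)?;
        Ok((idx_reader, vals_reader))
    }
}

fn main() {
    let mut fast_fields = HashMap::new();
    fast_fields.insert(("tags", 0), vec![0, 2, 3]); // offset index
    fast_fields.insert(("tags", 1), vec![7, 9, 11]); // values
    let reader = SegmentReaderSketch { fast_fields };
    assert!(reader.multi_fast_field_reader("tags").is_ok());
    assert!(reader.multi_fast_field_reader("missing").is_err());
}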
View File

@@ -26,13 +26,31 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
}
}
/// Returns `(start, stop)`, such that the values associated
/// to the given document are `start..stop`.
fn range(&self, doc: DocId) -> (u64, u64) {
let start = self.idx_reader.get(doc);
let stop = self.idx_reader.get(doc + 1);
(start, stop)
}
/// Returns the number of values associated to a given document.
pub fn num_vals(&self, doc: DocId) -> usize {
let (start, stop) = self.range(doc);
(stop - start) as usize
}
/// Returns the overall number of values associated to documents.
pub(crate) fn total_num_vals(&self) -> u64 {
self.idx_reader.max_value()
}
/// Returns the array of values associated to the given `doc`.
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
let start = self.idx_reader.get(doc) as u32;
let stop = self.idx_reader.get(doc + 1) as u32;
let (start, stop) = self.range(doc);
let len = (stop - start) as usize;
vals.resize(len, Item::default());
self.vals_reader.get_range(start, &mut vals[..]);
self.vals_reader.get_range(start as u32, &mut vals[..]);
}
}

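The new `range` helper makes the storage layout explicit: a multivalued fast field is two columns, an offset index in which entries `doc` and `doc + 1` bracket the positions of `doc`'s values, plus a flat values column. That is also why `total_num_vals` can simply return the offset reader's `max_value()`. A self-contained sketch of the layout over plain vectors (illustrative, not the real bitpacked readers):

// Minimal sketch of the layout: `idx[doc]..idx[doc + 1]` brackets the values
// belonging to `doc` inside the flat values column. The real readers are
// bitpacked, but the arithmetic is the same.
struct MultiValuedSketch {
    idx: Vec<u64>,  // length = num_docs + 1, starts at 0, monotonically increasing
    vals: Vec<u64>, // every document's values, concatenated
}

impl MultiValuedSketch {
    fn range(&self, doc: usize) -> (u64, u64) {
        (self.idx[doc], self.idx[doc + 1])
    }

    fn num_vals(&self, doc: usize) -> usize {
        let (start, stop) = self.range(doc);
        (stop - start) as usize
    }

    fn total_num_vals(&self) -> u64 {
        // The last offset is also the maximum one, hence `max_value()` above.
        *self.idx.last().unwrap()
    }

    fn get_vals(&self, doc: usize, vals: &mut Vec<u64>) {
        let (start, stop) = self.range(doc);
        vals.clear();
        vals.extend_from_slice(&self.vals[start as usize..stop as usize]);
    }
}

fn main() {
    // doc 0 -> [3, 5], doc 1 -> [], doc 2 -> [7]
    let ff = MultiValuedSketch { idx: vec![0, 2, 2, 3], vals: vec![3, 5, 7] };
    let mut out = Vec::new();
    ff.get_vals(0, &mut out);
    assert_eq!(out, vec![3, 5]);
    assert_eq!(ff.num_vals(1), 0);
    assert_eq!(ff.total_num_vals(), 3);
}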
View File

@@ -3,6 +3,7 @@ use fastfield::value_to_u64;
use fastfield::FastFieldSerializer;
use itertools::Itertools;
use postings::UnorderedTermId;
use termdict::TermOrdinal;
use schema::{Document, Field};
use std::collections::HashMap;
use std::io;
@@ -101,7 +102,7 @@ impl MultiValueIntFastFieldWriter {
pub fn serialize(
&self,
serializer: &mut FastFieldSerializer,
mapping_opt: Option<&HashMap<UnorderedTermId, usize>>,
mapping_opt: Option<&HashMap<UnorderedTermId, TermOrdinal>>,
) -> io::Result<()> {
{
// writing the offset index
@@ -125,13 +126,13 @@ impl MultiValueIntFastFieldWriter {
1,
)?;
for val in &self.vals {
let remapped_val = *mapping.get(val).expect("Missing term ordinal") as u64;
let remapped_val = *mapping.get(val).expect("Missing term ordinal");
value_serializer.add_val(remapped_val)?;
}
}
None => {
let val_min_max = self.vals.iter().cloned().minmax();
let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0));
let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0u64));
value_serializer =
serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
for &val in &self.vals {

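For text and facet fields, the values buffered by the writer are unordered term ids handed out during indexing, so `serialize` remaps each one through the mapping, whose values are now `TermOrdinal` (a `u64`) instead of `usize`, which removes the old `as u64` cast. For plain integer values there is no mapping; only min/max bounds are needed, with `(0, 0)` as the empty-column fallback. A std-only sketch of those two paths, with illustrative type aliases:

use std::collections::HashMap;

// Illustrative aliases mirroring the diff; the real definitions live in tantivy.
type UnorderedTermId = u64;
type TermOrdinal = u64;

// Text/facet path: remap the buffered unordered term ids to term ordinals
// before they are handed to the value serializer. Panics, like the original
// `expect`, if a value has no entry in the mapping.
fn remap(vals: &[UnorderedTermId], mapping: &HashMap<UnorderedTermId, TermOrdinal>) -> Vec<TermOrdinal> {
    vals.iter()
        .map(|val| *mapping.get(val).expect("Missing term ordinal"))
        .collect()
}

// Integer path: no mapping, the serializer only needs min/max bounds,
// with an empty column falling back to (0, 0).
fn bounds(vals: &[u64]) -> (u64, u64) {
    vals.iter()
        .copied()
        .fold(None, |acc, v| match acc {
            None => Some((v, v)),
            Some((lo, hi)) => Some((lo.min(v), hi.max(v))),
        })
        .unwrap_or((0u64, 0u64))
}

fn main() {
    let mapping = HashMap::from([(42u64, 0u64), (7, 1), (13, 2)]);
    assert_eq!(remap(&[7, 42, 13], &mapping), vec![1, 0, 2]);
    assert_eq!(bounds(&[]), (0, 0));
    assert_eq!(bounds(&[5, 3, 9]), (3, 9));
}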
View File

@@ -71,6 +71,9 @@ impl<Item: FastValue> FastFieldReader<Item> {
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
///
// TODO change start to `u64`.
// For multifastfield, start is an index in a second fastfield, not a `DocId`
pub fn get_range(&self, start: u32, output: &mut [Item]) {
let output_u64: &mut [u64] = unsafe { mem::transmute(output) };
self.bit_unpacker.get_range(start, output_u64);

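`get_range` fills the whole `output` slice with consecutive values starting at `start`, so callers size the buffer first, exactly as `get_vals` does with `resize`. The new TODO records that for multivalued fields `start` is an offset into the values column rather than a `DocId`. A tiny sketch of that calling contract (plain slices, illustrative only):

// Sketch of the `get_range` calling contract: the length of `output` decides
// how many consecutive values are read, starting at `start`. Slicing past the
// stored values panics, mirroring the "may panic" note above.
fn get_range(stored: &[u64], start: u32, output: &mut [u64]) {
    let start = start as usize;
    output.copy_from_slice(&stored[start..start + output.len()]);
}

fn main() {
    let stored = vec![10, 11, 12, 13, 14];
    // The caller sizes the buffer first, exactly like `get_vals` resizes `vals`.
    let mut buf = vec![0u64; 3];
    get_range(&stored, 1, &mut buf);
    assert_eq!(buf, vec![11, 12, 13]);
}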
View File

@@ -77,11 +77,21 @@ pub struct FastSingleFieldSerializer<'a, W: Write + 'a> {
}
impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
/// Creates a new fast field serializer.
///
/// The serializer in fact encodes the values by bitpacking
/// `(val - min_value)`.
///
/// It requires a `min_value` and a `max_value` to compute
/// the minimum number of bits required to encode the values.
fn open(
write: &'a mut W,
min_value: u64,
max_value: u64,
) -> io::Result<FastSingleFieldSerializer<'a, W>> {
assert!(min_value <= max_value);
min_value.serialize(write)?;
let amplitude = max_value - min_value;
amplitude.serialize(write)?;

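The added doc comment spells out the encoding: `min_value` and the amplitude `max_value - min_value` are written as a header, and each value is then bitpacked as `val - min_value`, which is why both bounds (and the `assert!(min_value <= max_value)` guard) are required up front. A small sketch of the bit-width computation this enables, written here for illustration:

// Width, in bits, needed to encode any delta in `0..=amplitude`, i.e. the
// number of bits the bitpacker spends per `(val - min_value)`.
fn num_bits(amplitude: u64) -> u8 {
    (64 - amplitude.leading_zeros()) as u8
}

fn main() {
    assert_eq!(num_bits(0), 0); // constant column: zero bits per value
    assert_eq!(num_bits(1), 1);
    assert_eq!(num_bits(255), 8);
    assert_eq!(num_bits(256), 9);
    // Values in 1_000..=1_100 are stored as 7-bit deltas instead of raw u64s.
    assert_eq!(num_bits(1_100 - 1_000), 7);
}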
View File

@@ -1,13 +1,11 @@
use super::multivalued::MultiValueIntFastFieldWriter;
use common;
use common::BinarySerializable;
use common::VInt;
use common::{self, VInt, BinarySerializable};
use fastfield::FastFieldSerializer;
use postings::UnorderedTermId;
use schema::FieldType;
use schema::{Cardinality, Document, Field, Schema};
use schema::{FieldType, Cardinality, Document, Field, Schema};
use std::collections::HashMap;
use std::io;
use super::multivalued::MultiValueIntFastFieldWriter;
use termdict::TermOrdinal;
/// The fastfieldswriter regroup all of the fast field writers.
pub struct FastFieldsWriter {
@@ -93,7 +91,7 @@ impl FastFieldsWriter {
pub fn serialize(
&self,
serializer: &mut FastFieldSerializer,
mapping: &HashMap<Field, HashMap<UnorderedTermId, usize>>,
mapping: &HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>,
) -> io::Result<()> {
for field_writer in &self.single_value_writers {
field_writer.serialize(serializer)?;

View File

@@ -17,7 +17,8 @@ impl FieldNormsWriter {
.fields()
.iter()
.enumerate()
.filter(|&(_, field_entry)| field_entry.is_indexed())
.filter(|&(_, field_entry)|
field_entry.is_indexed())
.map(|(field, _)| Field(field as u32))
.collect::<Vec<Field>>()
}

View File

@@ -195,7 +195,6 @@ pub fn advance_deletes(
target_opstamp: u64,
) -> Result<Option<FileProtection>> {
let mut file_protect: Option<FileProtection> = None;
{
if let Some(previous_opstamp) = segment_entry.meta().delete_opstamp() {
// We are already up-to-date here.
@@ -236,7 +235,6 @@ pub fn advance_deletes(
}
}
segment_entry.set_meta(segment.meta().clone());
Ok(file_protect)
}

File diff suppressed because it is too large

View File

@@ -1,13 +1,8 @@
use datastruct::stacker::{Heap, TermHashMap};
use postings::Recorder;
use postings::UnorderedTermId;
use postings::{FieldSerializer, InvertedIndexSerializer};
use postings::{NothingRecorder, TFAndPositionRecorder, TermFrequencyRecorder};
use schema::FieldEntry;
use schema::FieldType;
use schema::IndexRecordOption;
use schema::Term;
use schema::{Field, Schema};
use schema::{FieldEntry, FieldType, Term, Field, Schema};
use std::collections::HashMap;
use std::io;
use std::marker::PhantomData;
@@ -16,6 +11,9 @@ use tokenizer::Token;
use tokenizer::TokenStream;
use DocId;
use Result;
use schema::IndexRecordOption;
use postings::UnorderedTermId;
use termdict::TermOrdinal;
fn posting_from_field_entry<'a>(
field_entry: &FieldEntry,
@@ -44,6 +42,7 @@ fn posting_from_field_entry<'a>(
pub struct MultiFieldPostingsWriter<'a> {
heap: &'a Heap,
schema: Schema,
term_index: TermHashMap<'a>,
per_field_postings_writers: Vec<Box<PostingsWriter + 'a>>,
}
@@ -58,8 +57,8 @@ impl<'a> MultiFieldPostingsWriter<'a> {
.iter()
.map(|field_entry| posting_from_field_entry(field_entry, heap))
.collect();
MultiFieldPostingsWriter {
schema: schema.clone(),
heap,
term_index,
per_field_postings_writers,
@@ -83,7 +82,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
pub fn serialize(
&self,
serializer: &mut InvertedIndexSerializer,
) -> Result<HashMap<Field, HashMap<UnorderedTermId, usize>>> {
) -> Result<HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>> {
let mut term_offsets: Vec<(&[u8], u32, UnorderedTermId)> = self.term_index.iter().collect();
term_offsets.sort_by_key(|&(k, _, _)| k);
@@ -94,7 +93,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
.map(|(key, _, _)| Term::wrap(key).field())
.enumerate();
let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, usize>> =
let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>> =
HashMap::new();
let mut prev_field = Field(u32::max_value());
@@ -110,17 +109,23 @@ impl<'a> MultiFieldPostingsWriter<'a> {
let (field, start) = offsets[i];
let (_, stop) = offsets[i + 1];
// populating the unordered term ord -> ordered term ord mapping
// for the field.
let mut mapping = HashMap::new();
for (term_ord, term_unord_id) in term_offsets[start..stop]
.iter()
.map(|&(_, _, bucket)| bucket)
.enumerate()
{
mapping.insert(term_unord_id, term_ord);
let field_entry = self.schema.get_field_entry(field);
match field_entry.field_type() {
FieldType::Str(_) | FieldType::HierarchicalFacet => {
// populating the (unordered term ord) -> (ordered term ord) mapping
// for the field.
let mut unordered_term_ids = term_offsets[start..stop]
.iter()
.map(|&(_, _, bucket)| bucket);
let mut mapping: HashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
.enumerate()
.map(|(term_ord, unord_term_id)| (unord_term_id as UnorderedTermId, term_ord as TermOrdinal))
.collect();
unordered_term_mappings.insert(field, mapping);
}
FieldType::U64(_) | FieldType::I64(_) => {}
}
unordered_term_mappings.insert(field, mapping);
let postings_writer = &self.per_field_postings_writers[field.0 as usize];
let mut field_serializer =

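The rewritten block only builds the `(unordered term id) -> (term ordinal)` mapping for `Str` and `HierarchicalFacet` fields, since u64/i64 fields never go through the term dictionary. The ordinal itself is just the rank of a term once all of the field's terms are sorted lexicographically. A self-contained sketch of that construction (illustrative names, a plain slice instead of the term hash map):

use std::collections::HashMap;

type UnorderedTermId = u64;
type TermOrdinal = u64;

// Sort the field's terms lexicographically and record, for each unordered id
// assigned at indexing time, the rank of its term in sorted order.
fn build_mapping(terms: &[(&[u8], UnorderedTermId)]) -> HashMap<UnorderedTermId, TermOrdinal> {
    let mut sorted: Vec<_> = terms.to_vec();
    sorted.sort_by_key(|&(term_bytes, _)| term_bytes);
    sorted
        .iter()
        .enumerate()
        .map(|(term_ord, &(_, unordered_id))| (unordered_id, term_ord as TermOrdinal))
        .collect()
}

fn main() {
    // Hash-map order: "zebra" was seen first (unordered id 0), "apple" second (id 1).
    let terms: Vec<(&[u8], UnorderedTermId)> = vec![(&b"zebra"[..], 0), (&b"apple"[..], 1)];
    let mapping = build_mapping(&terms);
    assert_eq!(mapping[&1], 0); // "apple" sorts first, so its ordinal is 0
    assert_eq!(mapping[&0], 1); // "zebra" comes second
}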
View File

@@ -1,19 +1,17 @@
use Result;
use DocId;
use super::TermInfo;
use common::BinarySerializable;
use common::CompositeWrite;
use common::CountingWriter;
use compression::VIntEncoder;
use compression::{BlockEncoder, COMPRESSION_BLOCK_SIZE};
use core::Segment;
use directory::WritePtr;
use schema::Field;
use schema::FieldEntry;
use schema::FieldType;
use schema::{Field, FieldEntry, FieldType};
use schema::Schema;
use std::io::{self, Write};
use termdict::TermDictionaryBuilder;
use DocId;
use Result;
use common::{CompositeWrite, CountingWriter};
use termdict::{TermOrdinal, TermDictionaryBuilder};
/// `PostingsSerializer` is in charge of serializing
/// postings on disk, in the
@@ -119,6 +117,7 @@ pub struct FieldSerializer<'a> {
positions_serializer_opt: Option<PositionSerializer<&'a mut CountingWriter<WritePtr>>>,
current_term_info: TermInfo,
term_open: bool,
num_terms: TermOrdinal,
}
impl<'a> FieldSerializer<'a> {
@@ -157,6 +156,7 @@ impl<'a> FieldSerializer<'a> {
positions_serializer_opt,
current_term_info: TermInfo::default(),
term_open: false,
num_terms: TermOrdinal::default(),
})
}
@@ -177,7 +177,7 @@ impl<'a> FieldSerializer<'a> {
/// * term - the term. It needs to come after the previous term according
/// to the lexicographical order.
/// * doc_freq - the number of documents containing the term.
pub fn new_term(&mut self, term: &[u8]) -> io::Result<()> {
pub fn new_term(&mut self, term: &[u8]) -> io::Result<TermOrdinal> {
assert!(
!self.term_open,
"Called new_term, while the previous term was not closed."
@@ -185,7 +185,10 @@ impl<'a> FieldSerializer<'a> {
self.term_open = true;
self.postings_serializer.clear();
self.current_term_info = self.current_term_info();
self.term_dictionary_builder.insert_key(term)
self.term_dictionary_builder.insert_key(term)?;
let term_ordinal = self.num_terms;
self.num_terms += 1;
Ok(term_ordinal)
}
/// Serialize the information that a document contains the current term,

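`new_term` now returns a `TermOrdinal` for the term it just opened. Because terms are inserted in sorted order, a running counter (`num_terms`) is sufficient, and the postings writer uses the returned ordinal to build its per-field mapping during serialization. A tiny sketch of the counter pattern (illustrative, not the real serializer):

type TermOrdinal = u64;

// Sketch of the change: the serializer keeps a running term counter and
// `new_term` returns the ordinal of the term it just opened.
struct FieldSerializerSketch {
    num_terms: TermOrdinal,
    terms: Vec<Vec<u8>>, // stand-in for the term dictionary builder
}

impl FieldSerializerSketch {
    fn new_term(&mut self, term: &[u8]) -> TermOrdinal {
        self.terms.push(term.to_vec());
        let term_ordinal = self.num_terms;
        self.num_terms += 1;
        term_ordinal
    }
}

fn main() {
    let mut serializer = FieldSerializerSketch { num_terms: 0, terms: Vec::new() };
    assert_eq!(serializer.new_term(b"apple"), 0);
    assert_eq!(serializer.new_term(b"banana"), 1);
}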
View File

@@ -1,12 +1,10 @@
use core::Searcher;
use core::SegmentReader;
use docset::DocSet;
use query::Query;
use query::Scorer;
use query::Weight;
use DocId;
use Result;
use Score;
use core::Searcher;
use core::SegmentReader;
use docset::DocSet;
use query::{Query, Weight, Scorer};
/// Query that matches all of the documents.
///
@@ -26,28 +24,46 @@ pub struct AllWeight;
impl Weight for AllWeight {
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
Ok(Box::new(AllScorer {
started: false,
state: State::NotStarted,
doc: 0u32,
max_doc: reader.max_doc(),
max_doc: reader.max_doc()
}))
}
}
enum State {
NotStarted,
Started,
Finished
}
/// Scorer associated to the `AllQuery` query.
pub struct AllScorer {
started: bool,
state: State,
doc: DocId,
max_doc: DocId,
max_doc: DocId
}
impl DocSet for AllScorer {
fn advance(&mut self) -> bool {
if self.started {
self.doc += 1u32;
} else {
self.started = true;
match self.state {
State::NotStarted => {
self.state = State::Started;
self.doc = 0;
}
State::Started => {
self.doc += 1u32;
}
State::Finished => {
return false;
}
}
if self.doc < self.max_doc {
return true;
} else {
self.state = State::Finished;
return false;
}
self.doc < self.max_doc
}
fn doc(&self) -> DocId {
@@ -64,3 +80,49 @@ impl Scorer for AllScorer {
1f32
}
}
#[cfg(test)]
mod tests {
use super::AllQuery;
use Index;
use schema::{TEXT, SchemaBuilder};
use query::Query;
#[test]
fn test_all_query() {
let mut schema_builder = SchemaBuilder::default();
let field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index
.writer_with_num_threads(1, 10_000_000)
.unwrap();
index_writer.add_document(doc!(field=>"aaa"));
index_writer.add_document(doc!(field=>"bbb"));
index_writer.commit().unwrap();
index_writer.add_document(doc!(field=>"ccc"));
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let searcher = index.searcher();
let weight = AllQuery.weight(&searcher, false).unwrap();
{
let reader = searcher.segment_reader(0);
let mut scorer = weight.scorer(reader).unwrap();
assert!(scorer.advance());
assert_eq!(scorer.doc(), 0u32);
assert!(scorer.advance());
assert_eq!(scorer.doc(), 1u32);
assert!(!scorer.advance());
}
{
let reader = searcher.segment_reader(1);
let mut scorer = weight.scorer(reader).unwrap();
assert!(scorer.advance());
assert_eq!(scorer.doc(), 0u32);
assert!(!scorer.advance());
}
}
}

View File

@@ -212,6 +212,14 @@ mod tests {
assert!(Facet::root().is_root());
}
#[test]
fn test_from_path() {
assert_eq!(
Facet::from_path(vec!["top", "a", "firstdoc"]),
Facet::from("/top/a/firstdoc")
);
}
#[test]
fn test_facet_display() {
{

View File

@@ -4,6 +4,7 @@ use super::Field;
use byteorder::{BigEndian, ByteOrder};
use common;
use std::str;
use schema::Facet;
/// Size (in bytes) of the buffer of a int field.
const INT_TERM_LEN: usize = 4 + 8;
@@ -29,6 +30,16 @@ impl Term {
Term::from_field_u64(field, val_u64)
}
/// Creates a `Term` given a facet.
pub fn from_facet(field: Field, facet: &Facet) -> Term {
let bytes = facet.encoded_bytes();
let buffer = Vec::with_capacity(4 + bytes.len());
let mut term = Term(buffer);
term.set_field(field);
term.set_bytes(bytes);
term
}
/// Builds a term given a field, and a string value
///
/// Assuming the term has a field id of 2, and a text value of "abc",
@@ -91,10 +102,14 @@ impl Term {
self.set_u64(common::i64_to_u64(val));
}
fn set_bytes(&mut self, bytes: &[u8]) {
self.0.resize(4, 0u8);
self.0.extend(bytes);
}
/// Set the texts only, keeping the field untouched.
pub fn set_text(&mut self, text: &str) {
self.0.resize(4, 0u8);
self.0.extend(text.as_bytes());
self.set_bytes(text.as_bytes());
}
}

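A `Term` is a byte buffer whose first four bytes encode the field id and whose tail holds the value, so the new `set_bytes` truncates back to the header and appends, `set_text` becomes a thin wrapper over it, and `from_facet` feeds it the facet's pre-encoded bytes. A std-only sketch of that layout (illustrative; the real code writes the field id through `byteorder`, imported above):

// Sketch of the `Term` byte layout: a 4-byte field id header followed by the
// value bytes. Illustrative only; the real code reserves INT_TERM_LEN up front.
struct TermSketch(Vec<u8>);

impl TermSketch {
    fn for_field(field_id: u32) -> TermSketch {
        let mut term = TermSketch(Vec::with_capacity(4 + 8));
        term.set_field(field_id);
        term
    }

    fn set_field(&mut self, field_id: u32) {
        self.0.clear();
        self.0.extend_from_slice(&field_id.to_be_bytes());
    }

    // Keep the 4-byte field header, replace everything after it.
    fn set_bytes(&mut self, bytes: &[u8]) {
        self.0.resize(4, 0u8);
        self.0.extend_from_slice(bytes);
    }

    // `set_text` is now just `set_bytes` over the UTF-8 bytes, and
    // `from_facet` would call `set_bytes` with the facet's encoded bytes.
    fn set_text(&mut self, text: &str) {
        self.set_bytes(text.as_bytes());
    }
}

fn main() {
    let mut term = TermSketch::for_field(2);
    term.set_text("abc");
    // Field id 2, then "abc": the 7-byte layout described in the doc comment above.
    assert_eq!(term.0, vec![0, 0, 0, 2, b'a', b'b', b'c']);
}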
View File

@@ -1,7 +1,8 @@
use schema::Term;
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use termdict::TermStreamer;
use termdict::TermOrdinal;
use schema::Term;
pub struct HeapItem<'a> {
pub streamer: TermStreamer<'a>,
@@ -28,6 +29,7 @@ impl<'a> Ord for HeapItem<'a> {
}
}
/// Given a list of sorted term streams,
/// returns an iterator over sorted unique terms.
///
@@ -43,7 +45,6 @@ pub struct TermMerger<'a> {
impl<'a> TermMerger<'a> {
/// Stream of merged term dictionary
///
///
pub fn new(streams: Vec<TermStreamer<'a>>) -> TermMerger<'a> {
TermMerger {
heap: BinaryHeap::new(),
@@ -58,6 +59,14 @@ impl<'a> TermMerger<'a> {
}
}
pub(crate) fn matching_segments<'b: 'a>(&'b self) -> Box<'b + Iterator<Item=(usize, TermOrdinal)>> {
Box::new(self.current_streamers
.iter()
.map(|heap_item| {
(heap_item.segment_ord, heap_item.streamer.term_ord())
}))
}
fn advance_segments(&mut self) {
let streamers = &mut self.current_streamers;
let heap = &mut self.heap;

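The new `matching_segments` accessor reports, for the term the merger is currently positioned on, which segments contain it and at which ordinal inside each segment's own term dictionary. That is the information needed to translate per-segment ordinals into the merged order. A sketch of its shape over plain structs (illustrative names):

type TermOrdinal = u64;

// Stand-in for a per-segment term streamer positioned on the current term.
struct StreamerSketch {
    segment_ord: usize,
    term_ord: TermOrdinal,
}

// For the term the merger currently points at, list which segments contain it
// and at which ordinal inside each segment's own term dictionary.
fn matching_segments<'a>(
    current_streamers: &'a [StreamerSketch],
) -> impl Iterator<Item = (usize, TermOrdinal)> + 'a {
    current_streamers
        .iter()
        .map(|streamer| (streamer.segment_ord, streamer.term_ord))
}

fn main() {
    // The current term appears in segment 0 (as its 12th term) and in segment 2 (as its 3rd).
    let current = vec![
        StreamerSketch { segment_ord: 0, term_ord: 11 },
        StreamerSketch { segment_ord: 2, term_ord: 2 },
    ];
    let pairs: Vec<(usize, TermOrdinal)> = matching_segments(&current).collect();
    assert_eq!(pairs, vec![(0, 11), (2, 2)]);
}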
View File

@@ -59,6 +59,7 @@ pub use self::merger::TermMerger;
pub use self::streamer::{TermStreamer, TermStreamerBuilder};
pub use self::termdict::{TermDictionary, TermDictionaryBuilder};
#[cfg(test)]
mod tests {
use super::{TermDictionary, TermDictionaryBuilder, TermStreamer};

View File

@@ -6,21 +6,21 @@ use tokenizer::TokenStreamChain;
/// Token
#[derive(Debug, Clone)]
pub struct Token {
/// Offset (byte index) of the first character of the token.
/// Offsets shall not be modified by token filters.
pub offset_from: usize,
/// Offset (byte index) of the last character of the token + 1.
/// The text that generated the token should be obtained by
/// &text[token.offset_from..token.offset_to]
pub offset_to: usize,
/// Position, expressed in number of tokens.
pub position: usize,
/// Actual text content of the token.
pub text: String,
/// Offset (byte index) of the first character of the token.
/// Offsets shall not be modified by token filters.
pub offset_from: usize,
/// Offset (byte index) of the last character of the token + 1.
/// The text that generated the token should be obtained by
/// &text[token.offset_from..token.offset_to]
pub offset_to: usize,
/// Position, expressed in number of tokens.
pub position: usize,
/// Actual text content of the token.
pub text: String,
}
impl Default for Token {
fn default() -> Token {
fn default() -> Token {
Token {
offset_from: 0,
offset_to: 0,