Compare commits

...

2 Commits

Author SHA1 Message Date
Paul Masurel
063ed30f66 Added field norm readers 2020-07-20 11:59:43 +09:00
Paul Masurel
6db8bb49d6 Assert nearly equals macro (#853)
* Assert nearly equals macro

* Renamed specialized_scorer in TermScorer
2020-07-17 16:40:41 +09:00
19 changed files with 139 additions and 73 deletions

View File

@@ -8,7 +8,7 @@ use crate::directory::ReadOnlySource;
use crate::fastfield::DeleteBitSet;
use crate::fastfield::FacetReader;
use crate::fastfield::FastFieldReaders;
use crate::fieldnorm::FieldNormReader;
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
use crate::schema::Field;
use crate::schema::FieldType;
use crate::schema::Schema;
@@ -48,7 +48,7 @@ pub struct SegmentReader {
positions_composite: CompositeFile,
positions_idx_composite: CompositeFile,
fast_fields_readers: Arc<FastFieldReaders>,
fieldnorms_composite: CompositeFile,
fieldnorm_readers: FieldNormReaders,
store_source: ReadOnlySource,
delete_bitset_opt: Option<DeleteBitSet>,
@@ -126,8 +126,8 @@ impl SegmentReader {
/// They are simply stored as a fast field, serialized in
/// the `.fieldnorm` file of the segment.
pub fn get_fieldnorms_reader(&self, field: Field) -> FieldNormReader {
if let Some(fieldnorm_source) = self.fieldnorms_composite.open_read(field) {
FieldNormReader::open(fieldnorm_source)
if let Some(fieldnorm_reader) = self.fieldnorm_readers.get_field(field) {
fieldnorm_reader
} else {
let field_name = self.schema.get_field_name(field);
let err_msg = format!(
@@ -178,8 +178,8 @@ impl SegmentReader {
let fast_field_readers =
Arc::new(FastFieldReaders::load_all(&schema, &fast_fields_composite)?);
let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
let fieldnorm_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
let fieldnorm_readers = FieldNormReaders::new(fieldnorm_data)?;
let delete_bitset_opt = if segment.meta().has_deletes() {
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
@@ -195,7 +195,7 @@ impl SegmentReader {
termdict_composite,
postings_composite,
fast_fields_readers: fast_field_readers,
fieldnorms_composite,
fieldnorm_readers,
segment_id: segment.id(),
store_source,
delete_bitset_opt,
@@ -308,7 +308,7 @@ impl SegmentReader {
self.positions_composite.space_usage(),
self.positions_idx_composite.space_usage(),
self.fast_fields_readers.space_usage(),
self.fieldnorms_composite.space_usage(),
self.fieldnorm_readers.space_usage(),
self.get_store_reader().space_usage(),
self.delete_bitset_opt
.as_ref()

View File

@@ -21,7 +21,7 @@ mod reader;
mod serializer;
mod writer;
pub use self::reader::FieldNormReader;
pub use self::reader::{FieldNormReader, FieldNormReaders};
pub use self::serializer::FieldNormsSerializer;
pub use self::writer::FieldNormsWriter;

View File

@@ -1,6 +1,41 @@
use super::{fieldnorm_to_id, id_to_fieldnorm};
use crate::common::CompositeFile;
use crate::directory::ReadOnlySource;
use crate::schema::Field;
use crate::space_usage::PerFieldSpaceUsage;
use crate::DocId;
use std::sync::Arc;
/// Reader for the fieldnorm (for each document, the number of tokens indexed in the
/// field) of all indexed fields in the index.
///
/// Each fieldnorm is approximately compressed over one byte. We refer to this byte as
/// `fieldnorm_id`.
/// The mapping from `fieldnorm` to `fieldnorm_id` is given by monotonic.
#[derive(Clone)]
pub struct FieldNormReaders {
data: Arc<CompositeFile>,
}
impl FieldNormReaders {
/// Creates a field norm reader.
pub fn new(source: ReadOnlySource) -> crate::Result<FieldNormReaders> {
let data = CompositeFile::open(&source)?;
Ok(FieldNormReaders {
data: Arc::new(data),
})
}
/// Returns the FieldNormReader for a specific field.
pub fn get_field(&self, field: Field) -> Option<FieldNormReader> {
self.data.open_read(field).map(FieldNormReader::open)
}
/// Return a break down of the space usage per field.
pub fn space_usage(&self) -> PerFieldSpaceUsage {
self.data.space_usage()
}
}
/// Reads the fieldnorm associated to a document.
/// The fieldnorm represents the length associated to
@@ -19,6 +54,7 @@ use crate::DocId;
/// Apart from compression, this scale also makes it possible to
/// precompute computationally expensive functions of the fieldnorm
/// in a very short array.
#[derive(Clone)]
pub struct FieldNormReader {
data: ReadOnlySource,
}
@@ -29,6 +65,11 @@ impl FieldNormReader {
FieldNormReader { data }
}
/// Returns the number of documents in this segment.
pub fn num_docs(&self) -> u32 {
self.data.len() as u32
}
/// Returns the `fieldnorm` associated to a doc id.
/// The fieldnorm is a value approximating the number
/// of tokens in a given field of the `doc_id`.
@@ -65,10 +106,11 @@ impl FieldNormReader {
}
#[cfg(test)]
impl From<Vec<u32>> for FieldNormReader {
fn from(field_norms: Vec<u32>) -> FieldNormReader {
impl From<&[u32]> for FieldNormReader {
fn from(field_norms: &[u32]) -> FieldNormReader {
let field_norms_id = field_norms
.into_iter()
.iter()
.cloned()
.map(FieldNormReader::fieldnorm_to_id)
.collect::<Vec<u8>>();
let field_norms_data = ReadOnlySource::from(field_norms_id);

View File

@@ -78,11 +78,12 @@ impl FieldNormsWriter {
}
/// Serialize the seen fieldnorm values to the serializer for all fields.
pub fn serialize(&self, fieldnorms_serializer: &mut FieldNormsSerializer) -> io::Result<()> {
pub fn serialize(&self, mut fieldnorms_serializer: FieldNormsSerializer) -> io::Result<()> {
for &field in self.fields.iter() {
let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.field_id() as usize][..];
fieldnorms_serializer.serialize_field(field, fieldnorm_values)?;
}
fieldnorms_serializer.close()?;
Ok(())
}
}

View File

@@ -167,7 +167,7 @@ impl IndexMerger {
fn write_fieldnorms(
&self,
fieldnorms_serializer: &mut FieldNormsSerializer,
mut fieldnorms_serializer: FieldNormsSerializer,
) -> crate::Result<()> {
let fields = FieldNormsWriter::fields_with_fieldnorm(&self.schema);
let mut fieldnorms_data = Vec::with_capacity(self.max_doc as usize);
@@ -182,6 +182,7 @@ impl IndexMerger {
}
fieldnorms_serializer.serialize_field(field, &fieldnorms_data[..])?;
}
fieldnorms_serializer.close()?;
Ok(())
}
@@ -668,8 +669,10 @@ impl IndexMerger {
impl SerializableSegment for IndexMerger {
fn write(&self, mut serializer: SegmentSerializer) -> crate::Result<u32> {
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
self.write_fieldnorms(fieldnorms_serializer)?;
}
let term_ord_mappings = self.write_postings(serializer.get_postings_serializer())?;
self.write_fieldnorms(serializer.get_fieldnorms_serializer())?;
self.write_fast_fields(serializer.get_fast_field_serializer(), term_ord_mappings)?;
self.write_storable_fields(serializer.get_store_writer())?;
serializer.close()?;
@@ -1504,12 +1507,9 @@ mod tests {
for i in 0..100 {
let mut doc = Document::new();
doc.add_f64(field, 42.0);
doc.add_f64(multi_field, 0.24);
doc.add_f64(multi_field, 0.27);
writer.add_document(doc);
if i % 5 == 0 {
writer.commit()?;
}
@@ -1521,7 +1521,6 @@ mod tests {
// If a merging thread fails, we should end up with more
// than one segment here
assert_eq!(1, index.searchable_segments()?.len());
Ok(())
}
}

View File

@@ -8,15 +8,16 @@ use crate::store::StoreWriter;
/// Segment serializer is in charge of laying out on disk
/// the data accumulated and sorted by the `SegmentWriter`.
pub struct SegmentSerializer {
segment: Segment,
store_writer: StoreWriter,
fast_field_serializer: FastFieldSerializer,
fieldnorms_serializer: FieldNormsSerializer,
fieldnorms_serializer: Option<FieldNormsSerializer>,
postings_serializer: InvertedIndexSerializer,
}
impl SegmentSerializer {
/// Creates a new `SegmentSerializer`.
pub fn for_segment(segment: &mut Segment) -> crate::Result<SegmentSerializer> {
pub fn for_segment(mut segment: Segment) -> crate::Result<SegmentSerializer> {
let store_write = segment.open_write(SegmentComponent::STORE)?;
let fast_field_write = segment.open_write(SegmentComponent::FASTFIELDS)?;
@@ -25,15 +26,21 @@ impl SegmentSerializer {
let fieldnorms_write = segment.open_write(SegmentComponent::FIELDNORMS)?;
let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?;
let postings_serializer = InvertedIndexSerializer::open(segment)?;
let postings_serializer = InvertedIndexSerializer::open(&mut segment)?;
Ok(SegmentSerializer {
segment,
store_writer: StoreWriter::new(store_write),
fast_field_serializer,
fieldnorms_serializer,
fieldnorms_serializer: Some(fieldnorms_serializer),
postings_serializer,
})
}
#[allow(dead_code)]
pub fn segment(&self) -> &Segment {
&self.segment
}
/// Accessor to the `PostingsSerializer`.
pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer {
&mut self.postings_serializer
@@ -44,9 +51,11 @@ impl SegmentSerializer {
&mut self.fast_field_serializer
}
/// Accessor to the field norm serializer.
pub fn get_fieldnorms_serializer(&mut self) -> &mut FieldNormsSerializer {
&mut self.fieldnorms_serializer
/// Extract the field norm serializer.
///
/// Note the fieldnorms serializer can only be extracted once.
pub fn extract_fieldnorms_serializer(&mut self) -> Option<FieldNormsSerializer> {
self.fieldnorms_serializer.take()
}
/// Accessor to the `StoreWriter`.
@@ -55,11 +64,13 @@ impl SegmentSerializer {
}
/// Finalize the segment serialization.
pub fn close(self) -> crate::Result<()> {
pub fn close(mut self) -> crate::Result<()> {
if let Some(fieldnorms_serializer) = self.extract_fieldnorms_serializer() {
fieldnorms_serializer.close()?;
}
self.fast_field_serializer.close()?;
self.postings_serializer.close()?;
self.store_writer.close()?;
self.fieldnorms_serializer.close()?;
Ok(())
}
}

View File

@@ -112,7 +112,7 @@ fn merge(
target_opstamp: Opstamp,
) -> crate::Result<SegmentEntry> {
// first we need to apply deletes to our segment.
let mut merged_segment = index.new_segment();
let merged_segment = index.new_segment();
// First we apply all of the delet to the merged segment, up to the target opstamp.
for segment_entry in &mut segment_entries {
@@ -131,12 +131,13 @@ fn merge(
let merger: IndexMerger = IndexMerger::open(index.schema(), &segments[..])?;
// ... we just serialize this index merger in our new segment to merge the two segments.
let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment)?;
let segment_serializer = SegmentSerializer::for_segment(merged_segment.clone())?;
let num_docs = merger.write(segment_serializer)?;
let segment_meta = index.new_segment_meta(merged_segment.id(), num_docs);
let merged_segment_id = merged_segment.id();
let segment_meta = index.new_segment_meta(merged_segment_id, num_docs);
Ok(SegmentEntry::new(segment_meta, delete_cursor, None))
}

View File

@@ -62,11 +62,12 @@ impl SegmentWriter {
/// - schema
pub fn for_segment(
memory_budget: usize,
mut segment: Segment,
segment: Segment,
schema: &Schema,
) -> crate::Result<SegmentWriter> {
let tokenizer_manager = segment.index().tokenizers().clone();
let table_num_bits = initial_table_size(memory_budget)?;
let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
let segment_serializer = SegmentSerializer::for_segment(segment)?;
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits);
let tokenizers = schema
.fields()
@@ -76,7 +77,7 @@ impl SegmentWriter {
.get_indexing_options()
.and_then(|text_index_option| {
let tokenizer_name = &text_index_option.tokenizer();
segment.index().tokenizers().get(tokenizer_name)
tokenizer_manager.get(tokenizer_name)
}),
_ => None,
},
@@ -280,9 +281,11 @@ fn write(
fieldnorms_writer: &FieldNormsWriter,
mut serializer: SegmentSerializer,
) -> crate::Result<()> {
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
fieldnorms_writer.serialize(fieldnorms_serializer)?;
}
let term_ord_map = multifield_postings.serialize(serializer.get_postings_serializer())?;
fast_field_writers.serialize(serializer.get_fast_field_serializer(), &term_ord_map)?;
fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer())?;
serializer.close()?;
Ok(())
}

View File

@@ -298,17 +298,26 @@ mod tests {
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
pub fn assert_nearly_equals(expected: f32, val: f32) {
assert!(
nearly_equals(val, expected),
"Got {}, expected {}.",
val,
expected
);
}
pub fn nearly_equals(a: f32, b: f32) -> bool {
(a - b).abs() < 0.0005 * (a + b).abs()
/// Checks if left and right are close one to each other.
/// Panics if the two values are more than 0.5% apart.
#[macro_export]
macro_rules! assert_nearly_equals {
($left:expr, $right:expr) => {{
match (&$left, &$right) {
(left_val, right_val) => {
let diff = (left_val - right_val).abs();
let add = left_val.abs() + right_val.abs();
if diff > 0.0005 * add {
panic!(
r#"assertion failed: `(left ~= right)`
left: `{:?}`,
right: `{:?}`"#,
&*left_val, &*right_val
)
}
}
}
}};
}
pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> {

View File

@@ -139,10 +139,10 @@ impl BM25Weight {
mod tests {
use super::idf;
use crate::tests::assert_nearly_equals;
use crate::assert_nearly_equals;
#[test]
fn test_idf() {
assert_nearly_equals(idf(1, 2), 0.6931472);
assert_nearly_equals!(idf(1, 2), 0.6931472);
}
}

View File

@@ -7,6 +7,7 @@ pub use self::boolean_query::BooleanQuery;
mod tests {
use super::*;
use crate::assert_nearly_equals;
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
use crate::collector::TopDocs;
use crate::query::score_combiner::SumWithCoordsCombiner;
@@ -19,7 +20,6 @@ mod tests {
use crate::query::Scorer;
use crate::query::TermQuery;
use crate::schema::*;
use crate::tests::assert_nearly_equals;
use crate::Index;
use crate::{DocAddress, DocId, Score};
@@ -256,14 +256,14 @@ mod tests {
.scorer(searcher.segment_reader(0u32), 1.0f32)
.unwrap();
assert_eq!(boolean_scorer.doc(), 0u32);
assert_nearly_equals(boolean_scorer.score(), 0.84163445f32);
assert_nearly_equals!(boolean_scorer.score(), 0.84163445f32);
}
{
let mut boolean_scorer = boolean_weight
.scorer(searcher.segment_reader(0u32), 2.0f32)
.unwrap();
assert_eq!(boolean_scorer.doc(), 0u32);
assert_nearly_equals(boolean_scorer.score(), 1.6832689f32);
assert_nearly_equals!(boolean_scorer.score(), 1.6832689f32);
}
}

View File

@@ -163,10 +163,10 @@ impl Query for FuzzyTermQuery {
#[cfg(test)]
mod test {
use super::FuzzyTermQuery;
use crate::assert_nearly_equals;
use crate::collector::TopDocs;
use crate::schema::Schema;
use crate::schema::TEXT;
use crate::tests::assert_nearly_equals;
use crate::Index;
use crate::Term;
@@ -199,7 +199,7 @@ mod test {
.unwrap();
assert_eq!(top_docs.len(), 1, "Expected only 1 document");
let (score, _) = top_docs[0];
assert_nearly_equals(1f32, score);
assert_nearly_equals!(1f32, score);
}
// fails because non-prefix Levenshtein distance is more than 1 (add 'a' and 'n')
@@ -223,7 +223,7 @@ mod test {
.unwrap();
assert_eq!(top_docs.len(), 1, "Expected only 1 document");
let (score, _) = top_docs[0];
assert_nearly_equals(1f32, score);
assert_nearly_equals!(1f32, score);
}
}
}

View File

@@ -10,11 +10,11 @@ pub use self::phrase_weight::PhraseWeight;
pub mod tests {
use super::*;
use crate::assert_nearly_equals;
use crate::collector::tests::{TEST_COLLECTOR_WITHOUT_SCORE, TEST_COLLECTOR_WITH_SCORE};
use crate::core::Index;
use crate::query::Weight;
use crate::schema::{Schema, Term, TEXT};
use crate::tests::assert_nearly_equals;
use crate::DocId;
use crate::{DocAddress, TERMINATED};
@@ -175,8 +175,8 @@ pub mod tests {
.to_vec()
};
let scores = test_query(vec!["a", "b"]);
assert_nearly_equals(scores[0], 0.40618482);
assert_nearly_equals(scores[1], 0.46844664);
assert_nearly_equals!(scores[0], 0.40618482);
assert_nearly_equals!(scores[1], 0.46844664);
}
#[test] // motivated by #234

View File

@@ -89,10 +89,10 @@ impl Query for RegexQuery {
#[cfg(test)]
mod test {
use super::RegexQuery;
use crate::assert_nearly_equals;
use crate::collector::TopDocs;
use crate::schema::TEXT;
use crate::schema::{Field, Schema};
use crate::tests::assert_nearly_equals;
use crate::{Index, IndexReader};
use std::sync::Arc;
use tantivy_fst::Regex;
@@ -129,7 +129,7 @@ mod test {
.unwrap();
assert_eq!(scored_docs.len(), 1, "Expected only 1 document");
let (score, _) = scored_docs[0];
assert_nearly_equals(1f32, score);
assert_nearly_equals!(1f32, score);
}
let top_docs = searcher
.search(&query_matching_zero, &TopDocs::with_limit(2))

View File

@@ -9,12 +9,12 @@ pub use self::term_weight::TermWeight;
#[cfg(test)]
mod tests {
use crate::assert_nearly_equals;
use crate::collector::TopDocs;
use crate::docset::DocSet;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::query::{Query, QueryParser, Scorer, TermQuery};
use crate::schema::{Field, IndexRecordOption, Schema, STRING, TEXT};
use crate::tests::assert_nearly_equals;
use crate::Term;
use crate::{Index, TERMINATED};
@@ -105,7 +105,7 @@ mod tests {
.unwrap();
assert_eq!(topdocs.len(), 1);
let (score, _) = topdocs[0];
assert_nearly_equals(0.77802235, score);
assert_nearly_equals!(0.77802235, score);
}
{
let term = Term::from_field_text(left_field, "left1");
@@ -115,9 +115,9 @@ mod tests {
.unwrap();
assert_eq!(top_docs.len(), 2);
let (score1, _) = top_docs[0];
assert_nearly_equals(0.27101856, score1);
assert_nearly_equals!(0.27101856, score1);
let (score2, _) = top_docs[1];
assert_nearly_equals(0.13736556, score2);
assert_nearly_equals!(0.13736556, score2);
}
{
let query_parser = QueryParser::for_index(&index, vec![]);
@@ -125,9 +125,9 @@ mod tests {
let top_docs = searcher.search(&query, &TopDocs::with_limit(2)).unwrap();
assert_eq!(top_docs.len(), 2);
let (score1, _) = top_docs[0];
assert_nearly_equals(0.9153879, score1);
assert_nearly_equals!(0.9153879, score1);
let (score2, _) = top_docs[1];
assert_nearly_equals(0.27101856, score2);
assert_nearly_equals!(0.27101856, score2);
}
}

View File

@@ -20,12 +20,12 @@ pub struct TermWeight {
impl Weight for TermWeight {
fn scorer(&self, reader: &SegmentReader, boost: f32) -> Result<Box<dyn Scorer>> {
let term_scorer = self.scorer_specialized(reader, boost)?;
let term_scorer = self.specialized_scorer(reader, boost)?;
Ok(Box::new(term_scorer))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
let mut scorer = self.scorer_specialized(reader, 1.0f32)?;
let mut scorer = self.specialized_scorer(reader, 1.0f32)?;
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
}
@@ -52,7 +52,7 @@ impl Weight for TermWeight {
reader: &SegmentReader,
callback: &mut dyn FnMut(DocId, Score),
) -> crate::Result<()> {
let mut scorer = self.scorer_specialized(reader, 1.0f32)?;
let mut scorer = self.specialized_scorer(reader, 1.0f32)?;
for_each_scorer(&mut scorer, callback);
Ok(())
}
@@ -92,7 +92,7 @@ impl TermWeight {
}
}
fn scorer_specialized(&self, reader: &SegmentReader, boost: f32) -> Result<TermScorer> {
fn specialized_scorer(&self, reader: &SegmentReader, boost: f32) -> Result<TermScorer> {
let field = self.term.field();
let inverted_index = reader.inverted_index(field);
let fieldnorm_reader = reader.get_fieldnorms_reader(field);

View File

@@ -14,7 +14,7 @@ use std::fmt;
/// - a field name
/// - a field type, itself wrapping up options describing
/// how the field should be indexed.
#[derive(Clone, Debug, Eq, PartialEq)]
#[derive(Clone, Debug, PartialEq)]
pub struct FieldEntry {
name: String,
field_type: FieldType,

View File

@@ -48,7 +48,7 @@ pub enum Type {
/// A `FieldType` describes the type (text, u64) of a field as well as
/// how it should be handled by tantivy.
#[derive(Clone, Debug, Eq, PartialEq)]
#[derive(Clone, Debug, PartialEq)]
pub enum FieldType {
/// String field type configuration
Str(TextOptions),

View File

@@ -6,7 +6,7 @@ use std::borrow::Cow;
use std::ops::BitOr;
/// Define how a text field should be handled by tantivy.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct TextOptions {
indexing: Option<TextFieldIndexing>,
stored: bool,
@@ -51,7 +51,7 @@ impl Default for TextOptions {
/// - the amount of information that should be stored about the presence of a term in a document.
/// Essentially, should we store the term frequency and/or the positions (See [`IndexRecordOption`](./enum.IndexRecordOption.html)).
/// - the name of the `Tokenizer` that should be used to process the field.
#[derive(Clone, PartialEq, Eq, Debug, Serialize, Deserialize)]
#[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
pub struct TextFieldIndexing {
record: IndexRecordOption,
tokenizer: Cow<'static, str>,