Compare commits

...

1 Commits

Author SHA1 Message Date
Paul Masurel
7cb018c640 Add an option to opt out fieldnorms for indexed fields.
Closes #922
2020-11-03 16:15:20 +09:00
12 changed files with 210 additions and 73 deletions

View File

@@ -7,6 +7,7 @@ Tantivy 0.14.0
- Added support for Brotli compression in the DocStore. (@ppodolsky)
- Added helper for building intersections and unions in BooleanQuery (@guilload)
- Bugfix in `Query::explain`
- Making it possible to opt out of the generation of fieldnorms information for indexed fields. This change breaks compatibility as the meta.json file format is slightly changed. (#922, @pmasurel)
Tantivy 0.13.2
===================

View File

@@ -301,7 +301,7 @@ mod tests {
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
assert_eq!(
json,
r#"{"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#
r#"{"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default","fieldnorms":true},"stored":false}}],"opstamp":0}"#
);
}
}

View File

@@ -98,10 +98,9 @@ mod tests {
let field = searcher.schema().get_field("string_bytes").unwrap();
let term = Term::from_field_bytes(field, b"lucene".as_ref());
let term_query = TermQuery::new(term, IndexRecordOption::Basic);
let term_weight = term_query.specialized_weight(&searcher, false)?;
let term_scorer_err = term_weight.specialized_scorer(searcher.segment_reader(0), 1.0f32);
let term_weight_res = term_query.specialized_weight(&searcher, false);
assert!(matches!(
term_scorer_err,
term_weight_res,
Err(crate::TantivyError::SchemaError(_))
));
Ok(())

View File

@@ -49,7 +49,7 @@ impl FieldNormReaders {
///
/// This metric is important to compute the score of a
/// document: a document having a query word in one of its short fields
/// (e.g. title) is likely to be more relevant than in one of its longer field
/// (e.g. title) is likely to be more relevant than in one of its longer fields
/// (e.g. body).
///
/// tantivy encodes `fieldnorm` on one byte with some precision loss,
@@ -61,20 +61,31 @@ impl FieldNormReaders {
/// precompute computationally expensive functions of the fieldnorm
/// in a very short array.
#[derive(Clone)]
pub struct FieldNormReader {
data: OwnedBytes,
pub enum FieldNormReader {
ConstFieldNorm { fieldnorm_id: u8, num_docs: u32 },
OneByte(OwnedBytes),
}
impl FieldNormReader {
pub fn const_fieldnorm_id(fieldnorm_id: u8, num_docs: u32) -> FieldNormReader {
FieldNormReader::ConstFieldNorm {
fieldnorm_id,
num_docs,
}
}
/// Opens a field norm reader given its file.
pub fn open(fieldnorm_file: FileSlice) -> crate::Result<Self> {
let data = fieldnorm_file.read_bytes()?;
Ok(FieldNormReader { data })
Ok(FieldNormReader::OneByte(data))
}
/// Returns the number of documents in this segment.
pub fn num_docs(&self) -> u32 {
self.data.len() as u32
match self {
Self::ConstFieldNorm { num_docs, .. } => *num_docs,
FieldNormReader::OneByte(vals) => vals.len() as u32,
}
}
/// Returns the `fieldnorm` associated to a doc id.
@@ -86,6 +97,7 @@ impl FieldNormReader {
///
/// The fieldnorm is effectively decoded from the
/// `fieldnorm_id` by doing a simple table lookup.
#[inline(always)]
pub fn fieldnorm(&self, doc_id: DocId) -> u32 {
let fieldnorm_id = self.fieldnorm_id(doc_id);
id_to_fieldnorm(fieldnorm_id)
@@ -94,7 +106,11 @@ impl FieldNormReader {
/// Returns the `fieldnorm_id` associated to a document.
#[inline(always)]
pub fn fieldnorm_id(&self, doc_id: DocId) -> u8 {
self.data.as_slice()[doc_id as usize]
match self {
FieldNormReader::ConstFieldNorm { fieldnorm_id, .. } => *fieldnorm_id,
FieldNormReader::OneByte(data) => data.as_slice()[doc_id as usize],
}
}
/// Converts a `fieldnorm_id` into a fieldnorm.
@@ -118,9 +134,7 @@ impl FieldNormReader {
.map(FieldNormReader::fieldnorm_to_id)
.collect::<Vec<u8>>();
let field_norms_data = OwnedBytes::new(field_norms_id);
FieldNormReader {
data: field_norms_data,
}
FieldNormReader::OneByte(field_norms_data)
}
}

View File

@@ -4,7 +4,7 @@ use super::fieldnorm_to_id;
use super::FieldNormsSerializer;
use crate::schema::Field;
use crate::schema::Schema;
use std::{io, iter};
use std::io;
/// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte
/// of each document for each field with field norms.
@@ -13,7 +13,7 @@ use std::{io, iter};
/// byte per document per field.
pub struct FieldNormsWriter {
fields: Vec<Field>,
fieldnorms_buffer: Vec<Vec<u8>>,
fieldnorms_buffer: Vec<Option<Vec<u8>>>,
}
impl FieldNormsWriter {
@@ -23,7 +23,7 @@ impl FieldNormsWriter {
schema
.fields()
.filter_map(|(field, field_entry)| {
if field_entry.is_indexed() {
if field_entry.has_fieldnorms() {
Some(field)
} else {
None
@@ -36,17 +36,14 @@ impl FieldNormsWriter {
/// specified in the schema.
pub fn for_schema(schema: &Schema) -> FieldNormsWriter {
let fields = FieldNormsWriter::fields_with_fieldnorm(schema);
let max_field = fields
.iter()
.map(Field::field_id)
.max()
.map(|max_field_id| max_field_id as usize + 1)
.unwrap_or(0);
let num_fields = schema.num_fields();
let mut fieldnorms_buffer: Vec<Option<Vec<u8>>> = vec![None; num_fields];
for field in &fields {
fieldnorms_buffer[field.field_id() as usize] = Some(Vec::new());
}
FieldNormsWriter {
fields,
fieldnorms_buffer: iter::repeat_with(Vec::new)
.take(max_field)
.collect::<Vec<_>>(),
fieldnorms_buffer,
}
}
@@ -55,8 +52,10 @@ impl FieldNormsWriter {
///
/// Will extend with 0-bytes for documents that have not been seen.
pub fn fill_up_to_max_doc(&mut self, max_doc: DocId) {
for field in self.fields.iter() {
self.fieldnorms_buffer[field.field_id() as usize].resize(max_doc as usize, 0u8);
for buffer_opt in self.fieldnorms_buffer.iter_mut() {
if let Some(buffer) = buffer_opt {
buffer.resize(max_doc as usize, 0u8);
}
}
}
@@ -69,21 +68,22 @@ impl FieldNormsWriter {
/// * field - the field being set
/// * fieldnorm - the number of terms present in document `doc` in field `field`
pub fn record(&mut self, doc: DocId, field: Field, fieldnorm: u32) {
let fieldnorm_buffer: &mut Vec<u8> = &mut self.fieldnorms_buffer[field.field_id() as usize];
assert!(
fieldnorm_buffer.len() <= doc as usize,
"Cannot register a given fieldnorm twice"
);
// we fill intermediary `DocId` as having a fieldnorm of 0.
fieldnorm_buffer.resize(doc as usize + 1, 0u8);
fieldnorm_buffer[doc as usize] = fieldnorm_to_id(fieldnorm);
if let Some(fieldnorm_buffer) = self.fieldnorms_buffer[field.field_id() as usize].as_mut() {
assert!(
fieldnorm_buffer.len() <= doc as usize,
"Cannot register a given fieldnorm twice" // we fill intermediary `DocId` as having a fieldnorm of 0.
);
fieldnorm_buffer.resize(doc as usize + 1, 0u8);
fieldnorm_buffer[doc as usize] = fieldnorm_to_id(fieldnorm);
}
}
/// Serialize the seen fieldnorm values to the serializer for all fields.
pub fn serialize(&self, mut fieldnorms_serializer: FieldNormsSerializer) -> io::Result<()> {
for &field in self.fields.iter() {
let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.field_id() as usize][..];
fieldnorms_serializer.serialize_field(field, fieldnorm_values)?;
if let Some(buffer) = self.fieldnorms_buffer[field.field_id() as usize].as_ref() {
fieldnorms_serializer.serialize_field(field, &buffer[..])?;
}
}
fieldnorms_serializer.close()?;
Ok(())

View File

@@ -322,9 +322,8 @@ pub struct PostingsSerializer<W: Write> {
bm25_weight: Option<BM25Weight>,
num_docs: u32, // Number of docs in the segment
avg_fieldnorm: Score, // Average number of term in the field for that segment.
// this value is used to compute the block wand information.
// this value is used to compute the block wand information.
}
impl<W: Write> PostingsSerializer<W> {
@@ -334,10 +333,6 @@ impl<W: Write> PostingsSerializer<W> {
mode: IndexRecordOption,
fieldnorm_reader: Option<FieldNormReader>,
) -> PostingsSerializer<W> {
let num_docs = fieldnorm_reader
.as_ref()
.map(|fieldnorm_reader| fieldnorm_reader.num_docs())
.unwrap_or(0u32);
PostingsSerializer {
output_write: CountingWriter::wrap(write),
@@ -353,20 +348,25 @@ impl<W: Write> PostingsSerializer<W> {
fieldnorm_reader,
bm25_weight: None,
num_docs,
avg_fieldnorm,
}
}
/// Returns the number of documents in the segment currently being serialized.
/// This function may return `None` if there are no fieldnorms for that field.
fn num_docs_in_segment(&self) -> Option<u32> {
self.fieldnorm_reader
.as_ref()
.map(|reader| reader.num_docs())
}
pub fn new_term(&mut self, term_doc_freq: u32) {
if self.mode.has_freq() && self.num_docs > 0 {
let bm25_weight = BM25Weight::for_one_term(
term_doc_freq as u64,
self.num_docs as u64,
self.avg_fieldnorm,
);
self.bm25_weight = Some(bm25_weight);
if self.mode.has_freq() {
return;
}
self.bm25_weight = self.num_docs_in_segment().map(|num_docs| {
BM25Weight::for_one_term(term_doc_freq as u64, num_docs as u64, self.avg_fieldnorm)
});
}
fn write_block(&mut self) {

View File

@@ -92,6 +92,17 @@ impl TermQuery {
searcher: &Searcher,
scoring_enabled: bool,
) -> crate::Result<TermWeight> {
let field_entry = searcher
.schema()
.get_field_entry(self.term.field());
if !field_entry.is_indexed() {
let error_msg = format!("Field {:?} is not indexed.", field_entry.name());
return Err(crate::TantivyError::SchemaError(error_msg));
}
let has_fieldnorms = searcher
.schema()
.get_field_entry(self.term.field())
.has_fieldnorms();
let term = self.term.clone();
let bm25_weight = BM25Weight::for_terms(searcher, &[term])?;
let index_record_option = if scoring_enabled {
@@ -103,6 +114,7 @@ impl TermQuery {
self.term.clone(),
index_record_option,
bm25_weight,
has_fieldnorms,
))
}
}

View File

@@ -1,6 +1,7 @@
use super::term_scorer::TermScorer;
use crate::core::SegmentReader;
use crate::docset::DocSet;
use crate::fieldnorm::FieldNormReader;
use crate::postings::SegmentPostings;
use crate::query::bm25::BM25Weight;
use crate::query::explanation::does_not_match;
@@ -15,6 +16,7 @@ pub struct TermWeight {
term: Term,
index_record_option: IndexRecordOption,
similarity_weight: BM25Weight,
has_fieldnorms: bool,
}
impl Weight for TermWeight {
@@ -87,11 +89,13 @@ impl TermWeight {
term: Term,
index_record_option: IndexRecordOption,
similarity_weight: BM25Weight,
has_fieldnorms: bool,
) -> TermWeight {
TermWeight {
term,
index_record_option,
similarity_weight,
has_fieldnorms,
}
}
@@ -102,7 +106,11 @@ impl TermWeight {
) -> crate::Result<TermScorer> {
let field = self.term.field();
let inverted_index = reader.inverted_index(field)?;
let fieldnorm_reader = reader.get_fieldnorms_reader(field)?;
let fieldnorm_reader = if self.has_fieldnorms {
reader.get_fieldnorms_reader(field)?
} else {
FieldNormReader::const_fieldnorm_id(1u8, reader.num_docs())
};
let similarity_weight = self.similarity_weight.boost_by(boost);
let postings_opt: Option<SegmentPostings> =
inverted_index.read_postings(&self.term, self.index_record_option)?;

View File

@@ -112,6 +112,21 @@ impl FieldEntry {
}
}
pub fn has_fieldnorms(&self) -> bool {
match self.field_type {
FieldType::Str(ref options) => options
.get_indexing_options()
.map(|options| options.fieldnorms())
.unwrap_or(false),
FieldType::U64(ref options)
| FieldType::I64(ref options)
| FieldType::F64(ref options)
| FieldType::Date(ref options) => options.index_option().has_fieldnorms(),
FieldType::HierarchicalFacet => false,
FieldType::Bytes(ref _options) => false,
}
}
/// Returns true iff the field is a int (signed or unsigned) fast field
pub fn is_fast(&self) -> bool {
match self.field_type {
@@ -272,7 +287,8 @@ impl<'de> Deserialize<'de> for FieldEntry {
#[cfg(test)]
mod tests {
use super::*;
use crate::schema::TEXT;
use crate::schema::{Schema, STRING, TEXT};
use crate::Index;
use serde_json;
#[test]
@@ -291,7 +307,8 @@ mod tests {
"options": {
"indexing": {
"record": "position",
"tokenizer": "default"
"tokenizer": "default",
"fieldnorms": true
},
"stored": false
}
@@ -309,4 +326,19 @@ mod tests {
_ => panic!("expected FieldType::Str"),
}
}
#[test]
fn test_fieldnorms() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", STRING);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text=>"abc"));
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let err = searcher.segment_reader(0u32).get_fieldnorms_reader(text);
assert!(matches!(err, Err(crate::TantivyError::SchemaError(_))));
Ok(())
}
}

View File

@@ -14,10 +14,50 @@ pub enum Cardinality {
MultiValues,
}
#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum IntOptionIndex {
#[serde(rename = "no_index")]
NoIndex,
#[serde(rename = "index_no_fieldnorms")]
IndexNoFieldnorms,
#[serde(rename = "index_with_fieldnorms")]
IndexWithFieldnorms,
}
impl BitOr<IntOptionIndex> for IntOptionIndex {
type Output = IntOptionIndex;
fn bitor(self, other: IntOptionIndex) -> IntOptionIndex {
match (self, other) {
(_, Self::IndexWithFieldnorms) | (Self::IndexWithFieldnorms, _) => {
Self::IndexWithFieldnorms
}
(_, Self::IndexNoFieldnorms) | (Self::IndexNoFieldnorms, _) => Self::IndexNoFieldnorms,
(Self::NoIndex, Self::NoIndex) => Self::NoIndex,
}
}
}
impl IntOptionIndex {
pub fn is_indexed(&self) -> bool {
match *self {
Self::NoIndex => false,
Self::IndexNoFieldnorms | Self::IndexWithFieldnorms => true,
}
}
pub fn has_fieldnorms(&self) -> bool {
match *self {
Self::NoIndex | Self::IndexNoFieldnorms => false,
Self::IndexWithFieldnorms => true,
}
}
}
/// Define how a u64, i64, or f64 field should be handled by tantivy.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct IntOptions {
indexed: bool,
indexed: IntOptionIndex,
#[serde(skip_serializing_if = "Option::is_none")]
fast: Option<Cardinality>,
stored: bool,
@@ -31,7 +71,7 @@ impl IntOptions {
/// Returns true iff the value is indexed.
pub fn is_indexed(&self) -> bool {
self.indexed
self.indexed.is_indexed()
}
/// Returns true iff the value is a fast field.
@@ -48,12 +88,21 @@ impl IntOptions {
self
}
pub fn index_option(&self) -> &IntOptionIndex {
&self.indexed
}
pub fn set_indexed(mut self) -> IntOptions {
self.indexed = IntOptionIndex::IndexWithFieldnorms;
self
}
/// Set the field as indexed.
///
/// Setting an integer as indexed will generate
/// a posting list for each value taken by the integer.
pub fn set_indexed(mut self) -> IntOptions {
self.indexed = true;
pub fn set_index_option(mut self, int_option_index: IntOptionIndex) -> IntOptions {
self.indexed = int_option_index;
self
}
@@ -80,7 +129,7 @@ impl IntOptions {
impl Default for IntOptions {
fn default() -> IntOptions {
IntOptions {
indexed: false,
indexed: IntOptionIndex::NoIndex,
stored: false,
fast: None,
}
@@ -96,7 +145,7 @@ impl From<()> for IntOptions {
impl From<FastFlag> for IntOptions {
fn from(_: FastFlag) -> Self {
IntOptions {
indexed: false,
indexed: IntOptionIndex::NoIndex,
stored: false,
fast: Some(Cardinality::SingleValue),
}
@@ -106,7 +155,7 @@ impl From<FastFlag> for IntOptions {
impl From<StoredFlag> for IntOptions {
fn from(_: StoredFlag) -> Self {
IntOptions {
indexed: false,
indexed: IntOptionIndex::NoIndex,
stored: true,
fast: None,
}
@@ -116,7 +165,7 @@ impl From<StoredFlag> for IntOptions {
impl From<IndexedFlag> for IntOptions {
fn from(_: IndexedFlag) -> Self {
IntOptions {
indexed: true,
indexed: IntOptionIndex::IndexWithFieldnorms,
stored: false,
fast: None,
}

View File

@@ -231,6 +231,10 @@ impl Schema {
&self.0.fields[field.field_id() as usize]
}
pub fn num_fields(&self) -> usize {
self.0.fields.len()
}
/// Return the field name for a given `Field`.
pub fn get_field_name(&self, field: Field) -> &str {
self.get_field_entry(field).name()
@@ -444,7 +448,8 @@ mod tests {
"options": {
"indexing": {
"record": "position",
"tokenizer": "default"
"tokenizer": "default",
"fieldnorms": true
},
"stored": false
}
@@ -455,7 +460,8 @@ mod tests {
"options": {
"indexing": {
"record": "basic",
"tokenizer": "raw"
"tokenizer": "raw",
"fieldnorms": false
},
"stored": false
}
@@ -464,7 +470,7 @@ mod tests {
"name": "count",
"type": "u64",
"options": {
"indexed": false,
"indexed": "no_index",
"fast": "single",
"stored": true
}
@@ -473,7 +479,7 @@ mod tests {
"name": "popularity",
"type": "i64",
"options": {
"indexed": false,
"indexed": "no_index",
"fast": "single",
"stored": true
}
@@ -482,7 +488,7 @@ mod tests {
"name": "score",
"type": "f64",
"options": {
"indexed": true,
"indexed": "index_with_fieldnorms",
"fast": "single",
"stored": false
}
@@ -747,7 +753,8 @@ mod tests {
"options": {
"indexing": {
"record": "position",
"tokenizer": "default"
"tokenizer": "default",
"fieldnorms": true
},
"stored": false
}
@@ -756,7 +763,7 @@ mod tests {
"name": "popularity",
"type": "i64",
"options": {
"indexed": false,
"indexed": "no_index",
"fast": "single",
"stored": true
}
@@ -777,7 +784,8 @@ mod tests {
"options": {
"indexing": {
"record": "basic",
"tokenizer": "raw"
"tokenizer": "raw",
"fieldnorms": false
},
"stored": true
}
@@ -786,7 +794,7 @@ mod tests {
"name": "_timestamp",
"type": "date",
"options": {
"indexed": true,
"indexed": "index_with_fieldnorms",
"fast": "single",
"stored": true
}
@@ -797,7 +805,8 @@ mod tests {
"options": {
"indexing": {
"record": "position",
"tokenizer": "default"
"tokenizer": "default",
"fieldnorms": true
},
"stored": false
}
@@ -806,7 +815,7 @@ mod tests {
"name": "popularity",
"type": "i64",
"options": {
"indexed": false,
"indexed": "no_index",
"fast": "single",
"stored": true
}

View File

@@ -55,6 +55,7 @@ impl Default for TextOptions {
pub struct TextFieldIndexing {
record: IndexRecordOption,
tokenizer: Cow<'static, str>,
fieldnorms: bool,
}
impl Default for TextFieldIndexing {
@@ -62,6 +63,7 @@ impl Default for TextFieldIndexing {
TextFieldIndexing {
tokenizer: Cow::Borrowed("default"),
record: IndexRecordOption::Basic,
fieldnorms: false,
}
}
}
@@ -78,6 +80,15 @@ impl TextFieldIndexing {
&self.tokenizer
}
pub fn set_fieldnorms(mut self, fieldnorms: bool) -> TextFieldIndexing {
self.fieldnorms = fieldnorms;
self
}
pub fn fieldnorms(&self) -> bool {
self.fieldnorms
}
/// Sets which information should be indexed with the tokens.
///
/// See [IndexRecordOption](./enum.IndexRecordOption.html) for more detail.
@@ -99,6 +110,7 @@ pub const STRING: TextOptions = TextOptions {
indexing: Some(TextFieldIndexing {
tokenizer: Cow::Borrowed("raw"),
record: IndexRecordOption::Basic,
fieldnorms: false,
}),
stored: false,
};
@@ -108,6 +120,7 @@ pub const TEXT: TextOptions = TextOptions {
indexing: Some(TextFieldIndexing {
tokenizer: Cow::Borrowed("default"),
record: IndexRecordOption::WithFreqsAndPositions,
fieldnorms: true,
}),
stored: false,
};