From b105bf72e16698528bc882bf5f4bf9a079e162f5 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Mon, 14 Mar 2022 05:54:06 +0100 Subject: [PATCH] use defaults in meta.json (#1310) This change allows to have unset fields in meta.json and fall back to their defaults Currently it is required to explicitly put e.g. fieldnorms: false --- examples/aggregation.rs | 4 +- src/aggregation/mod.rs | 12 ++--- src/core/index_meta.rs | 14 ++++-- src/indexer/merger.rs | 4 +- src/query/phrase_query/mod.rs | 4 +- src/schema/field_type.rs | 2 +- src/schema/index_record_option.rs | 6 +++ src/schema/text_options.rs | 75 ++++++++++++++++++++++++++----- 8 files changed, 87 insertions(+), 34 deletions(-) diff --git a/examples/aggregation.rs b/examples/aggregation.rs index 3c6ec843f..f24ac6d46 100644 --- a/examples/aggregation.rs +++ b/examples/aggregation.rs @@ -20,9 +20,7 @@ fn main() -> tantivy::Result<()> { let mut schema_builder = Schema::builder(); let text_fieldtype = schema::TextOptions::default() .set_indexing_options( - TextFieldIndexing::default() - .set_tokenizer("default") - .set_index_option(IndexRecordOption::WithFreqs), + TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs), ) .set_stored(); let text_field = schema_builder.add_text_field("text", text_fieldtype); diff --git a/src/aggregation/mod.rs b/src/aggregation/mod.rs index d4f77fa94..adb59b656 100644 --- a/src/aggregation/mod.rs +++ b/src/aggregation/mod.rs @@ -314,9 +314,7 @@ mod tests { let mut schema_builder = Schema::builder(); let text_fieldtype = crate::schema::TextOptions::default() .set_indexing_options( - TextFieldIndexing::default() - .set_tokenizer("default") - .set_index_option(IndexRecordOption::WithFreqs), + TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs), ) .set_stored(); let text_field = schema_builder.add_text_field("text", text_fieldtype); @@ -461,9 +459,7 @@ mod tests { let mut schema_builder = Schema::builder(); let text_fieldtype = 
crate::schema::TextOptions::default() .set_indexing_options( - TextFieldIndexing::default() - .set_tokenizer("default") - .set_index_option(IndexRecordOption::WithFreqs), + TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs), ) .set_stored(); let text_field = schema_builder.add_text_field("text", text_fieldtype); @@ -957,9 +953,7 @@ mod tests { let mut schema_builder = Schema::builder(); let text_fieldtype = crate::schema::TextOptions::default() .set_indexing_options( - TextFieldIndexing::default() - .set_tokenizer("default") - .set_index_option(IndexRecordOption::WithFreqs), + TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs), ) .set_stored(); let text_field = schema_builder.add_text_field("text", text_fieldtype); diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs index d62e9902e..c58b830ee 100644 --- a/src/core/index_meta.rs +++ b/src/core/index_meta.rs @@ -239,7 +239,7 @@ impl InnerSegmentMeta { /// /// Contains settings which are applied on the whole /// index, like presort documents. -#[derive(Clone, Default, Serialize, Deserialize, Eq, PartialEq)] +#[derive(Clone, Debug, Default, Serialize, Deserialize, Eq, PartialEq)] pub struct IndexSettings { /// Sorts the documents by information /// provided in `IndexSortByField` @@ -254,7 +254,7 @@ pub struct IndexSettings { /// Presorting documents can greatly performance /// in some scenarios, by applying top n /// optimizations. 
-#[derive(Clone, Serialize, Deserialize, Eq, PartialEq)] +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] pub struct IndexSortByField { /// The field to sort the documents by pub field: String, @@ -262,7 +262,7 @@ pub struct IndexSortByField { pub order: Order, } /// The order to sort by -#[derive(Clone, Serialize, Deserialize, Eq, PartialEq)] +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] pub enum Order { /// Ascending Order Asc, @@ -298,12 +298,12 @@ pub struct IndexMeta { pub schema: Schema, /// Opstamp associated to the last `commit` operation. pub opstamp: Opstamp, - #[serde(skip_serializing_if = "Option::is_none")] /// Payload associated to the last commit. /// /// Upon commit, clients can optionally add a small `String` payload to their commit /// to help identify this commit. /// This payload is entirely unused by tantivy. + #[serde(skip_serializing_if = "Option::is_none")] pub payload: Option<String>, } @@ -374,6 +374,7 @@ impl fmt::Debug for IndexMeta { mod tests { use super::IndexMeta; + use crate::core::index_meta::UntrackedIndexMeta; use crate::schema::{Schema, TEXT}; use crate::{IndexSettings, IndexSortByField, Order}; @@ -402,5 +403,10 @@ mod tests { json, r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false}}],"opstamp":0}"# ); + + let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap(); + assert_eq!(index_metas.index_settings, deser_meta.index_settings); + assert_eq!(index_metas.schema, deser_meta.schema); + assert_eq!(index_metas.opstamp, deser_meta.opstamp); } } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 775baca74..7327a70f4 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -1157,9 +1157,7 @@ mod tests { let mut schema_builder = schema::Schema::builder(); let text_fieldtype
= schema::TextOptions::default() .set_indexing_options( - TextFieldIndexing::default() - .set_tokenizer("default") - .set_index_option(IndexRecordOption::WithFreqs), + TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs), ) .set_stored(); let text_field = schema_builder.add_text_field("text", text_fieldtype); diff --git a/src/query/phrase_query/mod.rs b/src/query/phrase_query/mod.rs index 2b3f5469e..1458ba2c2 100644 --- a/src/query/phrase_query/mod.rs +++ b/src/query/phrase_query/mod.rs @@ -126,9 +126,7 @@ pub mod tests { let mut schema_builder = Schema::builder(); use crate::schema::{IndexRecordOption, TextFieldIndexing, TextOptions}; let no_positions = TextOptions::default().set_indexing_options( - TextFieldIndexing::default() - .set_tokenizer("default") - .set_index_option(IndexRecordOption::WithFreqs), + TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs), ); let text_field = schema_builder.add_text_field("text", no_positions); diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index c52e8adfa..01be36950 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -183,7 +183,7 @@ impl FieldType { } } - /// returns true if the field is normed. + /// returns true if the field is normed (see [fieldnorms](crate::fieldnorm)). pub fn has_fieldnorms(&self) -> bool { match *self { FieldType::Str(ref text_options) => text_options diff --git a/src/schema/index_record_option.rs b/src/schema/index_record_option.rs index 395b102c2..a2f873cba 100644 --- a/src/schema/index_record_option.rs +++ b/src/schema/index_record_option.rs @@ -29,6 +29,12 @@ pub enum IndexRecordOption { WithFreqsAndPositions, } +impl Default for IndexRecordOption { + fn default() -> Self { + IndexRecordOption::Basic + } +} + impl IndexRecordOption { /// Returns true if this option includes encoding /// term frequencies. 
diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs index 63ad52b48..8b7161e03 100644 --- a/src/schema/text_options.rs +++ b/src/schema/text_options.rs @@ -9,7 +9,10 @@ use crate::schema::IndexRecordOption; /// Define how a text field should be handled by tantivy. #[derive(Clone, Debug, PartialEq, Serialize, Deserialize, Default)] pub struct TextOptions { + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] indexing: Option<TextFieldIndexing>, + #[serde(default)] stored: bool, } @@ -39,26 +42,56 @@ impl TextOptions { } } +#[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)] +struct TokenizerName(Cow<'static, str>); + +impl Default for TokenizerName { + fn default() -> Self { + TokenizerName::from_static("default") + } +} + +impl TokenizerName { + const fn from_static(name: &'static str) -> Self { + TokenizerName(Cow::Borrowed(name)) + } + fn from_name(name: &str) -> Self { + TokenizerName(Cow::Owned(name.to_string())) + } + fn name(&self) -> &str { + &self.0 + } +} + /// Configuration defining indexing for a text field. /// /// It defines -/// - the amount of information that should be stored about the presence of a term in a document. +/// - The amount of information that should be stored about the presence of a term in a document. /// Essentially, should we store the term frequency and/or the positions (See /// [`IndexRecordOption`](./enum.IndexRecordOption.html)). -/// - the name of the `Tokenizer` that should be used to process the field. +/// - The name of the `Tokenizer` that should be used to process the field. +/// - Flag indicating whether fieldnorms should be stored (See [fieldnorm](crate::fieldnorm)). Defaults +/// to `true`.
#[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)] pub struct TextFieldIndexing { + #[serde(default)] record: IndexRecordOption, + #[serde(default = "default_fieldnorms")] fieldnorms: bool, - tokenizer: Cow<'static, str>, + #[serde(default)] + tokenizer: TokenizerName, +} + +pub(crate) fn default_fieldnorms() -> bool { + true } impl Default for TextFieldIndexing { fn default() -> TextFieldIndexing { TextFieldIndexing { - tokenizer: Cow::Borrowed("default"), - record: IndexRecordOption::Basic, - fieldnorms: true, + tokenizer: TokenizerName::default(), + record: IndexRecordOption::default(), + fieldnorms: default_fieldnorms(), } } } @@ -67,13 +100,13 @@ impl TextFieldIndexing { /// Sets the tokenizer to be used for a given field. #[must_use] pub fn set_tokenizer(mut self, tokenizer_name: &str) -> TextFieldIndexing { - self.tokenizer = Cow::Owned(tokenizer_name.to_string()); + self.tokenizer = TokenizerName::from_name(tokenizer_name); self } /// Returns the tokenizer that will be used for this field. pub fn tokenizer(&self) -> &str { - &self.tokenizer + &self.tokenizer.name() } /// Sets fieldnorms @@ -83,7 +116,7 @@ impl TextFieldIndexing { self } - /// Returns true if and only if fieldnorms are stored. + /// Returns true if and only if [fieldnorms](crate::fieldnorm) are stored. pub fn fieldnorms(&self) -> bool { self.fieldnorms } @@ -108,7 +141,7 @@ impl TextFieldIndexing { /// The field will be untokenized and indexed. pub const STRING: TextOptions = TextOptions { indexing: Some(TextFieldIndexing { - tokenizer: Cow::Borrowed("raw"), + tokenizer: TokenizerName::from_static("raw"), fieldnorms: true, record: IndexRecordOption::Basic, }), @@ -118,7 +151,7 @@ pub const STRING: TextOptions = TextOptions { /// The field will be tokenized and indexed.
pub const TEXT: TextOptions = TextOptions { indexing: Some(TextFieldIndexing { - tokenizer: Cow::Borrowed("default"), + tokenizer: TokenizerName::from_static("default"), fieldnorms: true, record: IndexRecordOption::WithFreqsAndPositions, }), @@ -187,4 +220,24 @@ mod tests { assert!(IndexRecordOption::WithFreqsAndPositions > IndexRecordOption::WithFreqs); assert!(IndexRecordOption::WithFreqs > IndexRecordOption::Basic); } + + #[test] + fn serde_default_test() { + let json = r#" + { + "indexing": { + "record": "basic", + "fieldnorms": true, + "tokenizer": "default" + }, + "stored": false + } + "#; + let options: TextOptions = serde_json::from_str(json).unwrap(); + let options2: TextOptions = serde_json::from_str("{\"indexing\": {}}").unwrap(); + assert_eq!(options, options2); + assert_eq!(options.indexing.unwrap().record, IndexRecordOption::Basic); + let options3: TextOptions = serde_json::from_str("{}").unwrap(); + assert_eq!(options3.indexing, None); + } }