From b105bf72e16698528bc882bf5f4bf9a079e162f5 Mon Sep 17 00:00:00 2001
From: PSeitz <PSeitz@users.noreply.github.com>
Date: Mon, 14 Mar 2022 05:54:06 +0100
Subject: [PATCH] use defaults in meta.json (#1310)

This change allows fields in meta.json to be left unset, in which case they fall back to their defaults.
Currently, every field has to be set explicitly, e.g. `fieldnorms: false`.
---
 examples/aggregation.rs           |  4 +-
 src/aggregation/mod.rs            | 12 ++---
 src/core/index_meta.rs            | 14 ++++--
 src/indexer/merger.rs             |  4 +-
 src/query/phrase_query/mod.rs     |  4 +-
 src/schema/field_type.rs          |  2 +-
 src/schema/index_record_option.rs |  6 +++
 src/schema/text_options.rs        | 75 ++++++++++++++++++++++++++-----
 8 files changed, 87 insertions(+), 34 deletions(-)

diff --git a/examples/aggregation.rs b/examples/aggregation.rs
index 3c6ec843f..f24ac6d46 100644
--- a/examples/aggregation.rs
+++ b/examples/aggregation.rs
@@ -20,9 +20,7 @@ fn main() -> tantivy::Result<()> {
     let mut schema_builder = Schema::builder();
     let text_fieldtype = schema::TextOptions::default()
         .set_indexing_options(
-            TextFieldIndexing::default()
-                .set_tokenizer("default")
-                .set_index_option(IndexRecordOption::WithFreqs),
+            TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
         )
         .set_stored();
     let text_field = schema_builder.add_text_field("text", text_fieldtype);
diff --git a/src/aggregation/mod.rs b/src/aggregation/mod.rs
index d4f77fa94..adb59b656 100644
--- a/src/aggregation/mod.rs
+++ b/src/aggregation/mod.rs
@@ -314,9 +314,7 @@ mod tests {
         let mut schema_builder = Schema::builder();
         let text_fieldtype = crate::schema::TextOptions::default()
             .set_indexing_options(
-                TextFieldIndexing::default()
-                    .set_tokenizer("default")
-                    .set_index_option(IndexRecordOption::WithFreqs),
+                TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
             )
             .set_stored();
         let text_field = schema_builder.add_text_field("text", text_fieldtype);
@@ -461,9 +459,7 @@ mod tests {
         let mut schema_builder = Schema::builder();
         let text_fieldtype = crate::schema::TextOptions::default()
             .set_indexing_options(
-                TextFieldIndexing::default()
-                    .set_tokenizer("default")
-                    .set_index_option(IndexRecordOption::WithFreqs),
+                TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
             )
             .set_stored();
         let text_field = schema_builder.add_text_field("text", text_fieldtype);
@@ -957,9 +953,7 @@ mod tests {
             let mut schema_builder = Schema::builder();
             let text_fieldtype = crate::schema::TextOptions::default()
                 .set_indexing_options(
-                    TextFieldIndexing::default()
-                        .set_tokenizer("default")
-                        .set_index_option(IndexRecordOption::WithFreqs),
+                    TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
                 )
                 .set_stored();
             let text_field = schema_builder.add_text_field("text", text_fieldtype);
diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs
index d62e9902e..c58b830ee 100644
--- a/src/core/index_meta.rs
+++ b/src/core/index_meta.rs
@@ -239,7 +239,7 @@ impl InnerSegmentMeta {
 ///
 /// Contains settings which are applied on the whole
 /// index, like presort documents.
-#[derive(Clone, Default, Serialize, Deserialize, Eq, PartialEq)]
+#[derive(Clone, Debug, Default, Serialize, Deserialize, Eq, PartialEq)]
 pub struct IndexSettings {
     /// Sorts the documents by information
     /// provided in `IndexSortByField`
@@ -254,7 +254,7 @@ pub struct IndexSettings {
 /// Presorting documents can greatly performance
 /// in some scenarios, by applying top n
 /// optimizations.
-#[derive(Clone, Serialize, Deserialize, Eq, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
 pub struct IndexSortByField {
     /// The field to sort the documents by
     pub field: String,
@@ -262,7 +262,7 @@ pub struct IndexSortByField {
     pub order: Order,
 }
 /// The order to sort by
-#[derive(Clone, Serialize, Deserialize, Eq, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
 pub enum Order {
     /// Ascending Order
     Asc,
@@ -298,12 +298,12 @@ pub struct IndexMeta {
     pub schema: Schema,
     /// Opstamp associated to the last `commit` operation.
     pub opstamp: Opstamp,
-    #[serde(skip_serializing_if = "Option::is_none")]
     /// Payload associated to the last commit.
     ///
     /// Upon commit, clients can optionally add a small `String` payload to their commit
     /// to help identify this commit.
     /// This payload is entirely unused by tantivy.
+    #[serde(skip_serializing_if = "Option::is_none")]
     pub payload: Option<String>,
 }
 
@@ -374,6 +374,7 @@ impl fmt::Debug for IndexMeta {
 mod tests {
 
     use super::IndexMeta;
+    use crate::core::index_meta::UntrackedIndexMeta;
     use crate::schema::{Schema, TEXT};
     use crate::{IndexSettings, IndexSortByField, Order};
 
@@ -402,5 +403,10 @@ mod tests {
             json,
             r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false}}],"opstamp":0}"#
         );
+
+        let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
+        assert_eq!(index_metas.index_settings, deser_meta.index_settings);
+        assert_eq!(index_metas.schema, deser_meta.schema);
+        assert_eq!(index_metas.opstamp, deser_meta.opstamp);
     }
 }
diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
index 775baca74..7327a70f4 100644
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -1157,9 +1157,7 @@ mod tests {
         let mut schema_builder = schema::Schema::builder();
         let text_fieldtype = schema::TextOptions::default()
             .set_indexing_options(
-                TextFieldIndexing::default()
-                    .set_tokenizer("default")
-                    .set_index_option(IndexRecordOption::WithFreqs),
+                TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
             )
             .set_stored();
         let text_field = schema_builder.add_text_field("text", text_fieldtype);
diff --git a/src/query/phrase_query/mod.rs b/src/query/phrase_query/mod.rs
index 2b3f5469e..1458ba2c2 100644
--- a/src/query/phrase_query/mod.rs
+++ b/src/query/phrase_query/mod.rs
@@ -126,9 +126,7 @@ pub mod tests {
         let mut schema_builder = Schema::builder();
         use crate::schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
         let no_positions = TextOptions::default().set_indexing_options(
-            TextFieldIndexing::default()
-                .set_tokenizer("default")
-                .set_index_option(IndexRecordOption::WithFreqs),
+            TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
         );
 
         let text_field = schema_builder.add_text_field("text", no_positions);
diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs
index c52e8adfa..01be36950 100644
--- a/src/schema/field_type.rs
+++ b/src/schema/field_type.rs
@@ -183,7 +183,7 @@ impl FieldType {
         }
     }
 
-    /// returns true if the field is normed.
+    /// returns true if the field is normed (see [fieldnorms](crate::fieldnorm)).
     pub fn has_fieldnorms(&self) -> bool {
         match *self {
             FieldType::Str(ref text_options) => text_options
diff --git a/src/schema/index_record_option.rs b/src/schema/index_record_option.rs
index 395b102c2..a2f873cba 100644
--- a/src/schema/index_record_option.rs
+++ b/src/schema/index_record_option.rs
@@ -29,6 +29,12 @@ pub enum IndexRecordOption {
     WithFreqsAndPositions,
 }
 
+impl Default for IndexRecordOption {
+    fn default() -> Self {
+        IndexRecordOption::Basic
+    }
+}
+
 impl IndexRecordOption {
     /// Returns true if this option includes encoding
     /// term frequencies.
diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs
index 63ad52b48..8b7161e03 100644
--- a/src/schema/text_options.rs
+++ b/src/schema/text_options.rs
@@ -9,7 +9,10 @@ use crate::schema::IndexRecordOption;
 /// Define how a text field should be handled by tantivy.
 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize, Default)]
 pub struct TextOptions {
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
     indexing: Option<TextFieldIndexing>,
+    #[serde(default)]
     stored: bool,
 }
 
@@ -39,26 +42,56 @@ impl TextOptions {
     }
 }
 
+#[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)]
+struct TokenizerName(Cow<'static, str>);
+
+impl Default for TokenizerName {
+    fn default() -> Self {
+        TokenizerName::from_static("default")
+    }
+}
+
+impl TokenizerName {
+    const fn from_static(name: &'static str) -> Self {
+        TokenizerName(Cow::Borrowed(name))
+    }
+    fn from_name(name: &str) -> Self {
+        TokenizerName(Cow::Owned(name.to_string()))
+    }
+    fn name(&self) -> &str {
+        &self.0
+    }
+}
+
 /// Configuration defining indexing for a text field.
 ///
 /// It defines
-/// - the amount of information that should be stored about the presence of a term in a document.
+/// - The amount of information that should be stored about the presence of a term in a document.
 /// Essentially, should we store the term frequency and/or the positions (See
 /// [`IndexRecordOption`](./enum.IndexRecordOption.html)).
-/// - the name of the `Tokenizer` that should be used to process the field.
+/// - The name of the `Tokenizer` that should be used to process the field.
+/// - A flag indicating whether fieldnorms should be stored (see [fieldnorm](crate::fieldnorm)).
+///   Defaults to `true`.
 #[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)]
 pub struct TextFieldIndexing {
+    #[serde(default)]
     record: IndexRecordOption,
+    #[serde(default = "default_fieldnorms")]
     fieldnorms: bool,
-    tokenizer: Cow<'static, str>,
+    #[serde(default)]
+    tokenizer: TokenizerName,
+}
+
+pub(crate) fn default_fieldnorms() -> bool {
+    true
 }
 
 impl Default for TextFieldIndexing {
     fn default() -> TextFieldIndexing {
         TextFieldIndexing {
-            tokenizer: Cow::Borrowed("default"),
-            record: IndexRecordOption::Basic,
-            fieldnorms: true,
+            tokenizer: TokenizerName::default(),
+            record: IndexRecordOption::default(),
+            fieldnorms: default_fieldnorms(),
         }
     }
 }
@@ -67,13 +100,13 @@ impl TextFieldIndexing {
     /// Sets the tokenizer to be used for a given field.
     #[must_use]
     pub fn set_tokenizer(mut self, tokenizer_name: &str) -> TextFieldIndexing {
-        self.tokenizer = Cow::Owned(tokenizer_name.to_string());
+        self.tokenizer = TokenizerName::from_name(tokenizer_name);
         self
     }
 
     /// Returns the tokenizer that will be used for this field.
     pub fn tokenizer(&self) -> &str {
-        &self.tokenizer
+        self.tokenizer.name()
     }
 
     /// Sets fieldnorms
@@ -83,7 +116,7 @@ impl TextFieldIndexing {
         self
     }
 
-    /// Returns true if and only if fieldnorms are stored.
+    /// Returns true if and only if [fieldnorms](crate::fieldnorm) are stored.
     pub fn fieldnorms(&self) -> bool {
         self.fieldnorms
     }
@@ -108,7 +141,7 @@ impl TextFieldIndexing {
 /// The field will be untokenized and indexed.
 pub const STRING: TextOptions = TextOptions {
     indexing: Some(TextFieldIndexing {
-        tokenizer: Cow::Borrowed("raw"),
+        tokenizer: TokenizerName::from_static("raw"),
         fieldnorms: true,
         record: IndexRecordOption::Basic,
     }),
@@ -118,7 +151,7 @@ pub const STRING: TextOptions = TextOptions {
 /// The field will be tokenized and indexed.
 pub const TEXT: TextOptions = TextOptions {
     indexing: Some(TextFieldIndexing {
-        tokenizer: Cow::Borrowed("default"),
+        tokenizer: TokenizerName::from_static("default"),
         fieldnorms: true,
         record: IndexRecordOption::WithFreqsAndPositions,
     }),
@@ -187,4 +220,24 @@ mod tests {
         assert!(IndexRecordOption::WithFreqsAndPositions > IndexRecordOption::WithFreqs);
         assert!(IndexRecordOption::WithFreqs > IndexRecordOption::Basic);
     }
+
+    #[test]
+    fn serde_default_test() {
+        let json = r#"
+        {
+            "indexing": {
+                "record": "basic",
+                "fieldnorms": true,
+                "tokenizer": "default"
+            },
+            "stored": false
+        }
+        "#;
+        let options: TextOptions = serde_json::from_str(json).unwrap();
+        let options2: TextOptions = serde_json::from_str("{\"indexing\": {}}").unwrap();
+        assert_eq!(options, options2);
+        assert_eq!(options.indexing.unwrap().record, IndexRecordOption::Basic);
+        let options3: TextOptions = serde_json::from_str("{}").unwrap();
+        assert_eq!(options3.indexing, None);
+    }
 }