use defaults in meta.json (#1310)

This change allows fields in meta.json to be left unset, in which case they fall back to their defaults. Currently every field must be set explicitly, e.g. `fieldnorms: false`.
2025-12-23 02:29:57 +00:00 · 2022-03-14 05:54:06 +01:00
parent 2e255c4bef
commit b105bf72e1
8 changed files with 87 additions and 34 deletions
--- a/examples/aggregation.rs
+++ b/examples/aggregation.rs
@@ -20,9 +20,7 @@ fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let text_fieldtype = schema::TextOptions::default()
        .set_indexing_options(
-            TextFieldIndexing::default()
-                .set_tokenizer("default")
-                .set_index_option(IndexRecordOption::WithFreqs),
+            TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
        )
        .set_stored();
    let text_field = schema_builder.add_text_field("text", text_fieldtype);
--- a/src/aggregation/mod.rs
+++ b/src/aggregation/mod.rs
@@ -314,9 +314,7 @@ mod tests {
        let mut schema_builder = Schema::builder();
        let text_fieldtype = crate::schema::TextOptions::default()
            .set_indexing_options(
-                TextFieldIndexing::default()
-                    .set_tokenizer("default")
-                    .set_index_option(IndexRecordOption::WithFreqs),
+                TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
            )
            .set_stored();
        let text_field = schema_builder.add_text_field("text", text_fieldtype);
@@ -461,9 +459,7 @@ mod tests {
        let mut schema_builder = Schema::builder();
        let text_fieldtype = crate::schema::TextOptions::default()
            .set_indexing_options(
-                TextFieldIndexing::default()
-                    .set_tokenizer("default")
-                    .set_index_option(IndexRecordOption::WithFreqs),
+                TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
            )
            .set_stored();
        let text_field = schema_builder.add_text_field("text", text_fieldtype);
@@ -957,9 +953,7 @@ mod tests {
            let mut schema_builder = Schema::builder();
            let text_fieldtype = crate::schema::TextOptions::default()
                .set_indexing_options(
-                    TextFieldIndexing::default()
-                        .set_tokenizer("default")
-                        .set_index_option(IndexRecordOption::WithFreqs),
+                    TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
                )
                .set_stored();
            let text_field = schema_builder.add_text_field("text", text_fieldtype);
--- a/src/core/index_meta.rs
+++ b/src/core/index_meta.rs
@@ -239,7 +239,7 @@ impl InnerSegmentMeta {
 ///
 /// Contains settings which are applied on the whole
 /// index, like presort documents.
-#[derive(Clone, Default, Serialize, Deserialize, Eq, PartialEq)]
+#[derive(Clone, Debug, Default, Serialize, Deserialize, Eq, PartialEq)]
 pub struct IndexSettings {
    /// Sorts the documents by information
    /// provided in `IndexSortByField`
@@ -254,7 +254,7 @@ pub struct IndexSettings {
 /// Presorting documents can greatly performance
 /// in some scenarios, by applying top n
 /// optimizations.
-#[derive(Clone, Serialize, Deserialize, Eq, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
 pub struct IndexSortByField {
    /// The field to sort the documents by
    pub field: String,
@@ -262,7 +262,7 @@ pub struct IndexSortByField {
    pub order: Order,
 }
 /// The order to sort by
-#[derive(Clone, Serialize, Deserialize, Eq, PartialEq)]
+#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
 pub enum Order {
    /// Ascending Order
    Asc,
@@ -298,12 +298,12 @@ pub struct IndexMeta {
    pub schema: Schema,
    /// Opstamp associated to the last `commit` operation.
    pub opstamp: Opstamp,
-    #[serde(skip_serializing_if = "Option::is_none")]
    /// Payload associated to the last commit.
    ///
    /// Upon commit, clients can optionally add a small `String` payload to their commit
    /// to help identify this commit.
    /// This payload is entirely unused by tantivy.
+    #[serde(skip_serializing_if = "Option::is_none")]
    pub payload: Option<String>,
 }

@@ -374,6 +374,7 @@ impl fmt::Debug for IndexMeta {
 mod tests {

    use super::IndexMeta;
+    use crate::core::index_meta::UntrackedIndexMeta;
    use crate::schema::{Schema, TEXT};
    use crate::{IndexSettings, IndexSortByField, Order};

@@ -402,5 +403,10 @@ mod tests {
            json,
            r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false}}],"opstamp":0}"#
        );
+
+        let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
+        assert_eq!(index_metas.index_settings, deser_meta.index_settings);
+        assert_eq!(index_metas.schema, deser_meta.schema);
+        assert_eq!(index_metas.opstamp, deser_meta.opstamp);
    }
 }
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -1157,9 +1157,7 @@ mod tests {
        let mut schema_builder = schema::Schema::builder();
        let text_fieldtype = schema::TextOptions::default()
            .set_indexing_options(
-                TextFieldIndexing::default()
-                    .set_tokenizer("default")
-                    .set_index_option(IndexRecordOption::WithFreqs),
+                TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
            )
            .set_stored();
        let text_field = schema_builder.add_text_field("text", text_fieldtype);
--- a/src/query/phrase_query/mod.rs
+++ b/src/query/phrase_query/mod.rs
@@ -126,9 +126,7 @@ pub mod tests {
        let mut schema_builder = Schema::builder();
        use crate::schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
        let no_positions = TextOptions::default().set_indexing_options(
-            TextFieldIndexing::default()
-                .set_tokenizer("default")
-                .set_index_option(IndexRecordOption::WithFreqs),
+            TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
        );

        let text_field = schema_builder.add_text_field("text", no_positions);
--- a/src/schema/field_type.rs
+++ b/src/schema/field_type.rs
@@ -183,7 +183,7 @@ impl FieldType {
        }
    }

-    /// returns true if the field is normed.
+    /// returns true if the field is normed (see [fieldnorms](crate::fieldnorm)).
    pub fn has_fieldnorms(&self) -> bool {
        match *self {
            FieldType::Str(ref text_options) => text_options
--- a/src/schema/index_record_option.rs
+++ b/src/schema/index_record_option.rs
@@ -29,6 +29,12 @@ pub enum IndexRecordOption {
    WithFreqsAndPositions,
 }

+impl Default for IndexRecordOption {
+    fn default() -> Self {
+        IndexRecordOption::Basic
+    }
+}
+
 impl IndexRecordOption {
    /// Returns true if this option includes encoding
    /// term frequencies.
--- a/src/schema/text_options.rs
+++ b/src/schema/text_options.rs
@@ -9,7 +9,10 @@ use crate::schema::IndexRecordOption;
 /// Define how a text field should be handled by tantivy.
 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize, Default)]
 pub struct TextOptions {
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
    indexing: Option<TextFieldIndexing>,
+    #[serde(default)]
    stored: bool,
 }

@@ -39,26 +42,56 @@ impl TextOptions {
    }
 }

+#[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)]
+struct TokenizerName(Cow<'static, str>);
+
+impl Default for TokenizerName {
+    fn default() -> Self {
+        TokenizerName::from_static("default")
+    }
+}
+
+impl TokenizerName {
+    const fn from_static(name: &'static str) -> Self {
+        TokenizerName(Cow::Borrowed(name))
+    }
+    fn from_name(name: &str) -> Self {
+        TokenizerName(Cow::Owned(name.to_string()))
+    }
+    fn name(&self) -> &str {
+        &self.0
+    }
+}
+
 /// Configuration defining indexing for a text field.
 ///
 /// It defines
-/// - the amount of information that should be stored about the presence of a term in a document.
+/// - The amount of information that should be stored about the presence of a term in a document.
 /// Essentially, should we store the term frequency and/or the positions (See
 /// [`IndexRecordOption`](./enum.IndexRecordOption.html)).
-/// - the name of the `Tokenizer` that should be used to process the field.
+/// - The name of the `Tokenizer` that should be used to process the field.
+/// - Flag indicating whether fieldnorms should be stored (See [fieldnorm](crate::fieldnorm)).
+///   Defaults to `true`.
 #[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)]
 pub struct TextFieldIndexing {
+    #[serde(default)]
    record: IndexRecordOption,
+    #[serde(default = "default_fieldnorms")]
    fieldnorms: bool,
-    tokenizer: Cow<'static, str>,
+    #[serde(default)]
+    tokenizer: TokenizerName,
+}
+
+pub(crate) fn default_fieldnorms() -> bool {
+    true
 }

 impl Default for TextFieldIndexing {
    fn default() -> TextFieldIndexing {
        TextFieldIndexing {
-            tokenizer: Cow::Borrowed("default"),
-            record: IndexRecordOption::Basic,
-            fieldnorms: true,
+            tokenizer: TokenizerName::default(),
+            record: IndexRecordOption::default(),
+            fieldnorms: default_fieldnorms(),
        }
    }
 }
@@ -67,13 +100,13 @@ impl TextFieldIndexing {
    /// Sets the tokenizer to be used for a given field.
    #[must_use]
    pub fn set_tokenizer(mut self, tokenizer_name: &str) -> TextFieldIndexing {
-        self.tokenizer = Cow::Owned(tokenizer_name.to_string());
+        self.tokenizer = TokenizerName::from_name(tokenizer_name);
        self
    }

    /// Returns the tokenizer that will be used for this field.
    pub fn tokenizer(&self) -> &str {
-        &self.tokenizer
+        &self.tokenizer.name()
    }

    /// Sets fieldnorms
@@ -83,7 +116,7 @@ impl TextFieldIndexing {
        self
    }

-    /// Returns true if and only if fieldnorms are stored.
+    /// Returns true if and only if [fieldnorms](crate::fieldnorm) are stored.
    pub fn fieldnorms(&self) -> bool {
        self.fieldnorms
    }
@@ -108,7 +141,7 @@ impl TextFieldIndexing {
 /// The field will be untokenized and indexed.
 pub const STRING: TextOptions = TextOptions {
    indexing: Some(TextFieldIndexing {
-        tokenizer: Cow::Borrowed("raw"),
+        tokenizer: TokenizerName::from_static("raw"),
        fieldnorms: true,
        record: IndexRecordOption::Basic,
    }),
@@ -118,7 +151,7 @@ pub const STRING: TextOptions = TextOptions {
 /// The field will be tokenized and indexed.
 pub const TEXT: TextOptions = TextOptions {
    indexing: Some(TextFieldIndexing {
-        tokenizer: Cow::Borrowed("default"),
+        tokenizer: TokenizerName::from_static("default"),
        fieldnorms: true,
        record: IndexRecordOption::WithFreqsAndPositions,
    }),
@@ -187,4 +220,24 @@ mod tests {
        assert!(IndexRecordOption::WithFreqsAndPositions > IndexRecordOption::WithFreqs);
        assert!(IndexRecordOption::WithFreqs > IndexRecordOption::Basic);
    }
+
+    #[test]
+    fn serde_default_test() {
+        let json = r#"
+        {
+            "indexing": {
+                "record": "basic",
+                "fieldnorms": true,
+                "tokenizer": "default"
+            },
+            "stored": false
+        }
+        "#;
+        let options: TextOptions = serde_json::from_str(json).unwrap();
+        let options2: TextOptions = serde_json::from_str("{\"indexing\": {}}").unwrap();
+        assert_eq!(options, options2);
+        assert_eq!(options.indexing.unwrap().record, IndexRecordOption::Basic);
+        let options3: TextOptions = serde_json::from_str("{}").unwrap();
+        assert_eq!(options3.indexing, None);
+    }
 }