use defaults in meta.json (#1310)

This change allows fields in meta.json to be left unset, in which case they fall back to their defaults.
Currently, every field has to be set explicitly, e.g. `fieldnorms: false`.
This commit is contained in:
PSeitz
2022-03-14 05:54:06 +01:00
committed by GitHub
parent 2e255c4bef
commit b105bf72e1
8 changed files with 87 additions and 34 deletions

View File

@@ -20,9 +20,7 @@ fn main() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder();
let text_fieldtype = schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqs),
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);

View File

@@ -314,9 +314,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let text_fieldtype = crate::schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqs),
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
@@ -461,9 +459,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let text_fieldtype = crate::schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqs),
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
@@ -957,9 +953,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let text_fieldtype = crate::schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqs),
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);

View File

@@ -239,7 +239,7 @@ impl InnerSegmentMeta {
///
/// Contains settings which are applied on the whole
/// index, like presort documents.
#[derive(Clone, Default, Serialize, Deserialize, Eq, PartialEq)]
#[derive(Clone, Debug, Default, Serialize, Deserialize, Eq, PartialEq)]
pub struct IndexSettings {
/// Sorts the documents by information
/// provided in `IndexSortByField`
@@ -254,7 +254,7 @@ pub struct IndexSettings {
/// Presorting documents can greatly improve performance
/// in some scenarios, by applying top n
/// optimizations.
#[derive(Clone, Serialize, Deserialize, Eq, PartialEq)]
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct IndexSortByField {
/// The field to sort the documents by
pub field: String,
@@ -262,7 +262,7 @@ pub struct IndexSortByField {
pub order: Order,
}
/// The order to sort by
#[derive(Clone, Serialize, Deserialize, Eq, PartialEq)]
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub enum Order {
/// Ascending Order
Asc,
@@ -298,12 +298,12 @@ pub struct IndexMeta {
pub schema: Schema,
/// Opstamp associated to the last `commit` operation.
pub opstamp: Opstamp,
#[serde(skip_serializing_if = "Option::is_none")]
/// Payload associated to the last commit.
///
/// Upon commit, clients can optionally add a small `String` payload to their commit
/// to help identify this commit.
/// This payload is entirely unused by tantivy.
#[serde(skip_serializing_if = "Option::is_none")]
pub payload: Option<String>,
}
@@ -374,6 +374,7 @@ impl fmt::Debug for IndexMeta {
mod tests {
use super::IndexMeta;
use crate::core::index_meta::UntrackedIndexMeta;
use crate::schema::{Schema, TEXT};
use crate::{IndexSettings, IndexSortByField, Order};
@@ -402,5 +403,10 @@ mod tests {
json,
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false}}],"opstamp":0}"#
);
let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
assert_eq!(index_metas.index_settings, deser_meta.index_settings);
assert_eq!(index_metas.schema, deser_meta.schema);
assert_eq!(index_metas.opstamp, deser_meta.opstamp);
}
}

View File

@@ -1157,9 +1157,7 @@ mod tests {
let mut schema_builder = schema::Schema::builder();
let text_fieldtype = schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqs),
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);

View File

@@ -126,9 +126,7 @@ pub mod tests {
let mut schema_builder = Schema::builder();
use crate::schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
let no_positions = TextOptions::default().set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqs),
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
);
let text_field = schema_builder.add_text_field("text", no_positions);

View File

@@ -183,7 +183,7 @@ impl FieldType {
}
}
/// returns true if the field is normed.
/// returns true if the field is normed (see [fieldnorms](crate::fieldnorm)).
pub fn has_fieldnorms(&self) -> bool {
match *self {
FieldType::Str(ref text_options) => text_options

View File

@@ -29,6 +29,12 @@ pub enum IndexRecordOption {
WithFreqsAndPositions,
}
impl Default for IndexRecordOption {
fn default() -> Self {
IndexRecordOption::Basic
}
}
impl IndexRecordOption {
/// Returns true if this option includes encoding
/// term frequencies.

View File

@@ -9,7 +9,10 @@ use crate::schema::IndexRecordOption;
/// Define how a text field should be handled by tantivy.
///
/// Both fields may be left out of the serialized form; they then take
/// their default values on deserialization.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize, Default)]
pub struct TextOptions {
    /// Indexing configuration. Deserializes to `None` when the key is absent
    /// and is omitted from the serialized output when it is `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    indexing: Option<TextFieldIndexing>,
    /// Falls back to `false` when the key is absent from the serialized form.
    // NOTE(review): presumably flags whether the field's text is stored — confirm.
    #[serde(default)]
    stored: bool,
}
@@ -39,26 +42,56 @@ impl TextOptions {
}
}
/// Name of a tokenizer.
///
/// Wraps a `Cow<'static, str>` so that the name can be either a borrowed
/// `'static` string or an owned one.
#[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)]
struct TokenizerName(Cow<'static, str>);
impl Default for TokenizerName {
fn default() -> Self {
TokenizerName::from_static("default")
}
}
impl TokenizerName {
const fn from_static(name: &'static str) -> Self {
TokenizerName(Cow::Borrowed(name))
}
fn from_name(name: &str) -> Self {
TokenizerName(Cow::Owned(name.to_string()))
}
fn name(&self) -> &str {
&self.0
}
}
/// Configuration defining indexing for a text field.
///
/// It defines
/// - the amount of information that should be stored about the presence of a term in a document.
/// - The amount of information that should be stored about the presence of a term in a document.
/// Essentially, should we store the term frequency and/or the positions (See
/// [`IndexRecordOption`](./enum.IndexRecordOption.html)).
/// - the name of the `Tokenizer` that should be used to process the field.
/// - The name of the `Tokenizer` that should be used to process the field.
/// - Flag indicating, if fieldnorms should be stored (See [fieldnorm](crate::fieldnorm)). Defaults
/// to `true`.
#[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)]
pub struct TextFieldIndexing {
#[serde(default)]
record: IndexRecordOption,
#[serde(default = "default_fieldnorms")]
fieldnorms: bool,
tokenizer: Cow<'static, str>,
#[serde(default)]
tokenizer: TokenizerName,
}
/// Default value of the `fieldnorms` flag: `true`.
///
/// Used as the serde fallback when `fieldnorms` is missing from the
/// serialized form. Declared `const` so the same default can also be
/// evaluated in constant contexts.
pub(crate) const fn default_fieldnorms() -> bool {
    true
}
impl Default for TextFieldIndexing {
fn default() -> TextFieldIndexing {
TextFieldIndexing {
tokenizer: Cow::Borrowed("default"),
record: IndexRecordOption::Basic,
fieldnorms: true,
tokenizer: TokenizerName::default(),
record: IndexRecordOption::default(),
fieldnorms: default_fieldnorms(),
}
}
}
@@ -67,13 +100,13 @@ impl TextFieldIndexing {
/// Sets the tokenizer to be used for a given field.
#[must_use]
pub fn set_tokenizer(mut self, tokenizer_name: &str) -> TextFieldIndexing {
self.tokenizer = Cow::Owned(tokenizer_name.to_string());
self.tokenizer = TokenizerName::from_name(tokenizer_name);
self
}
/// Returns the tokenizer that will be used for this field.
pub fn tokenizer(&self) -> &str {
&self.tokenizer
&self.tokenizer.name()
}
/// Sets fieldnorms
@@ -83,7 +116,7 @@ impl TextFieldIndexing {
self
}
/// Returns true if and only if fieldnorms are stored.
/// Returns true if and only if [fieldnorms](crate::fieldnorm) are stored.
pub fn fieldnorms(&self) -> bool {
self.fieldnorms
}
@@ -108,7 +141,7 @@ impl TextFieldIndexing {
/// The field will be untokenized and indexed.
pub const STRING: TextOptions = TextOptions {
indexing: Some(TextFieldIndexing {
tokenizer: Cow::Borrowed("raw"),
tokenizer: TokenizerName::from_static("raw"),
fieldnorms: true,
record: IndexRecordOption::Basic,
}),
@@ -118,7 +151,7 @@ pub const STRING: TextOptions = TextOptions {
/// The field will be tokenized and indexed.
pub const TEXT: TextOptions = TextOptions {
indexing: Some(TextFieldIndexing {
tokenizer: Cow::Borrowed("default"),
tokenizer: TokenizerName::from_static("default"),
fieldnorms: true,
record: IndexRecordOption::WithFreqsAndPositions,
}),
@@ -187,4 +220,24 @@ mod tests {
assert!(IndexRecordOption::WithFreqsAndPositions > IndexRecordOption::WithFreqs);
assert!(IndexRecordOption::WithFreqs > IndexRecordOption::Basic);
}
#[test]
fn serde_default_test() {
    // Every value spelled out explicitly; each one equals its default.
    let explicit: TextOptions = serde_json::from_str(
        r#"
{
"indexing": {
"record": "basic",
"fieldnorms": true,
"tokenizer": "default"
},
"stored": false
}
"#,
    )
    .unwrap();

    // `indexing` is present but empty, so every nested value must be defaulted.
    let defaulted: TextOptions = serde_json::from_str(r#"{"indexing": {}}"#).unwrap();
    assert_eq!(explicit, defaulted);

    let indexing = explicit.indexing.unwrap();
    assert_eq!(indexing.record, IndexRecordOption::Basic);

    // With no keys at all, `indexing` stays unset.
    let empty: TextOptions = serde_json::from_str("{}").unwrap();
    assert_eq!(empty.indexing, None);
}
}