// tantivy/src/schema/schema.rs

use std::collections::HashMap;

use rustc_serialize::Decodable;
use rustc_serialize::Encodable;
use rustc_serialize::Decoder;
use rustc_serialize::Encoder;
use rustc_serialize::json;
use rustc_serialize::json::Json;
use std::collections::BTreeMap;
use schema::field_type::ValueParsingError;
use std::sync::Arc;
use super::*;
use std::fmt;

/// Incrementally assembles the fields that make up a `Schema`.
///
/// Tantivy has a very strict schema.
/// You need to specify in advance whether a field is indexed or not,
/// stored or not, and RAM-based or not.
///
/// This is done by creating a `SchemaBuilder`, registering the fields
/// one by one, and finally calling `build()` to obtain the `Schema`.
/// It is for the moment impossible to remove fields.
///
/// # Examples
///
/// ```
/// use tantivy::schema::*;
///
/// let mut schema_builder = SchemaBuilder::default();
/// let id_field = schema_builder.add_text_field("id", STRING);
/// let title_field = schema_builder.add_text_field("title", TEXT);
/// let body_field = schema_builder.add_text_field("body", TEXT);
/// let schema = schema_builder.build();
///
/// ```
pub struct SchemaBuilder {
    // Field entries in insertion order; the handle returned for a field
    // wraps the position of its entry in this vector.
    fields: Vec<FieldEntry>,
    // Field name -> handle. Inserting a second field under an existing name
    // overwrites the previous mapping.
    fields_map: HashMap<String, Field>,
}


impl SchemaBuilder {


    /// Create a new `SchemaBuilder`
    pub fn new() -> SchemaBuilder {
        SchemaBuilder::default()
    }

    /// Adds a new u64 field.
    /// Returns the associated field handle
    ///
    /// # Caution
    ///
    /// Appending two fields with the same name
    /// will result in the shadowing of the first
    /// by the second one.
    /// The first field will get a field id
    /// but only the second one will be indexed
    pub fn add_u64_field(
            &mut self,
            field_name_str: &str,
            field_options: IntOptions) -> Field {
        let field_name = String::from(field_name_str);
        let field_entry = FieldEntry::new_u64(field_name, field_options);
        self.add_field(field_entry)
    }

    /// Adds a new text field.
    /// Returns the associated field handle
    ///
    /// # Caution
    ///
    /// Appending two fields with the same name
    /// will result in the shadowing of the first
    /// by the second one.
    /// The first field will get a field id
    /// but only the second one will be indexed
    pub fn add_text_field(
            &mut self,
            field_name_str: &str,
            field_options: TextOptions) -> Field {
        let field_name = String::from(field_name_str);
        let field_entry = FieldEntry::new_text(field_name, field_options);
        self.add_field(field_entry)
    }


    /// Adds a field entry to the schema in build.
    fn add_field(&mut self, field_entry: FieldEntry) -> Field {
        let field = Field(self.fields.len() as u32);
        let field_name = field_entry.name().clone();
        self.fields.push(field_entry);
        self.fields_map.insert(field_name, field);
        field
    }


    /// Finalize the creation of a `Schema`
    /// This will consume your `SchemaBuilder`
    pub fn build(self,) -> Schema {
        Schema(Arc::new(InnerSchema {
            fields: self.fields,
            fields_map: self.fields_map,
        }))
    }
}


impl Default for SchemaBuilder {
    /// Returns a builder in which no field has been registered yet.
    fn default() -> Self {
        let no_fields = Vec::new();
        let no_names = HashMap::new();
        SchemaBuilder {
            fields: no_fields,
            fields_map: no_names,
        }
    }
}

/// Schema data shared between all the clones of a `Schema`
/// (a `Schema` only holds an `Arc` to this struct).
#[derive(Debug)]
struct InnerSchema {
    // Field entries; a `Field` handle is used as an index into this vector.
    fields: Vec<FieldEntry>,
    // Field name -> handle lookup table. It is not written out when the schema
    // is encoded: decoding rebuilds it by re-adding the decoded field entries.
    fields_map: HashMap<String, Field>,  // transient
}


/// Immutable description of the fields of an index.
///
/// Tantivy has a very strict schema.
/// You need to specify in advance, whether a field is indexed or not,
/// stored or not, and RAM-based or not.
///
/// This is done by creating a `SchemaBuilder`, setting up the fields
/// one by one, and calling `build()` to get the `Schema` object.
/// It is for the moment impossible to remove fields.
///
/// A `Schema` is a thin wrapper around an `Arc`: cloning it
/// only bumps a reference count and does not copy the field list.
///
/// # Examples
///
/// ```
/// use tantivy::schema::*;
///
/// let mut schema_builder = SchemaBuilder::default();
/// let id_field = schema_builder.add_text_field("id", STRING);
/// let title_field = schema_builder.add_text_field("title", TEXT);
/// let body_field = schema_builder.add_text_field("body", TEXT);
/// let schema = schema_builder.build();
///
/// ```
#[derive(Clone)]
pub struct Schema(Arc<InnerSchema>);

impl Schema {

    /// Return the `FieldEntry` associated to a `Field`.
    pub fn get_field_entry(&self, field: Field) -> &FieldEntry {
        &self.0.fields[field.0 as usize]
    }

    /// Return the field name for a given `Field`.
    pub fn get_field_name(&self, field: Field) -> &String {
        self.get_field_entry(field).name()
    }

    /// Return the list of all the `Field`s.
    pub fn fields(&self,) -> &[FieldEntry] {
        &self.0.fields
    }

    /// Returns the field options associated with a given name.
    ///
    /// # Panics
    /// Panics if the field name does not exist.
    /// It is meant as an helper for user who created
    /// and control the content of their schema.
    ///
    /// If panicking is not an option for you,
    /// you may use `get(&self, field_name: &str)`.
    pub fn get_field(&self, field_name: &str) -> Option<Field> {
        self.0.fields_map.get(field_name).cloned()
    }

    /// Create a named document off the doc.
    pub fn to_named_doc(&self, doc: &Document) -> NamedFieldDocument {
        let mut field_map = BTreeMap::new();
        for (field, field_values) in doc.get_sorted_field_values() {
            let field_name = self.get_field_name(field);
            let values: Vec<Value> = field_values
                .into_iter()
                .map(|field_val| field_val.value() )
                .cloned()
                .collect();
            field_map.insert(field_name.clone(), values);
        }
        NamedFieldDocument(field_map)
    }


    /// Encode the schema in JSON.
    ///
    /// Encoding a document cannot fail.
    pub fn to_json(&self, doc: &Document) -> String {
        json::encode(&self.to_named_doc(doc)).unwrap()
    }

    /// Build a document object from a json-object.
    pub fn parse_document(&self, doc_json: &str) -> Result<Document, DocParsingError> {
        let json_node = try!(Json::from_str(doc_json));
        let some_json_obj = json_node.as_object();
        if !some_json_obj.is_some() {
            let doc_json_sample: String =
                if doc_json.len() < 20 {
                    String::from(doc_json)
                }
                else {
                    format!("{:?}...", &doc_json[0..20])
                };
            return Err(DocParsingError::NotJSONObject(doc_json_sample))
        }
        let json_obj = some_json_obj.unwrap();
        let mut doc = Document::default();
        for (field_name, json_value) in json_obj.iter() {
            match self.get_field(field_name) {
                Some(field) => {
                    let field_entry = self.get_field_entry(field);
                    let field_type = field_entry.field_type();
                    match *json_value {
                        Json::Array(ref json_items) => {
                            for json_item in json_items {
                                let value = try!(
                                    field_type
                                     .value_from_json(&json_item)
                                     .map_err(|e| DocParsingError::ValueError(field_name.clone(), e))
                                );
                                doc.add(FieldValue::new(field, value));
                            }
                        }
                        _ => {
                            let value = try!(
                                field_type
                                 .value_from_json(&json_value)
                                 .map_err(|e| DocParsingError::ValueError(field_name.clone(), e))
                            );
                            doc.add(FieldValue::new(field, value));
                        }

                    }
                }
                None => {
                    return Err(DocParsingError::NoSuchFieldInSchema(field_name.clone()))
                }
            }
        }
        Ok(doc)
    }
}

impl fmt::Debug for Schema {
    /// Formats the schema by delegating to the `Debug` output
    /// of the shared inner schema.
    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        fmt::Debug::fmt(&self.0, formatter)
    }
}

impl Decodable for Schema {
    fn decode<D: Decoder>(d: &mut D) -> Result  <Self, D::Error> {
        let mut schema_builder = SchemaBuilder::default();
        try!(d.read_seq(|d, num_fields| {
            for _ in 0..num_fields {
                let field_entry = try!(FieldEntry::decode(d));
                schema_builder.add_field(field_entry);
            }
            Ok(())
        }));
        Ok(schema_builder.build())
    }
}

impl Encodable for Schema {
    /// Encodes the schema as the sequence of its field entries, in field id order.
    ///
    /// The name lookup table is not written out.
    fn encode<S: Encoder>(&self, s: &mut S) -> Result<(), S::Error> {
        let field_entries = &self.0.fields;
        s.emit_seq(field_entries.len(), |seq_encoder| {
            for (ord, field_entry) in field_entries.iter().enumerate() {
                try!(seq_encoder.emit_seq_elt(ord, |elt_encoder| field_entry.encode(elt_encoder)));
            }
            Ok(())
        })
    }
}


impl From<SchemaBuilder> for Schema {
    fn from(schema_builder: SchemaBuilder) -> Schema {
        schema_builder.build()
    }
}


/// Error that may happen when deserializing
/// a document from JSON.
#[derive(Debug)]
pub enum DocParsingError {
    /// The payload given is not valid JSON.
    /// Wraps the error reported by the JSON parser.
    NotJSON(json::ParserError),
    /// The payload given is not a JSON Object (`{...}`).
    /// Holds a short sample of the offending payload
    /// (long payloads are truncated).
    NotJSONObject(String),
    /// One of the value node could not be parsed.
    /// Holds the name of the field and the underlying parsing error.
    ValueError(String, ValueParsingError),
    /// The json-document contains a field that is not declared in the schema.
    /// Holds the name of the unknown field.
    NoSuchFieldInSchema(String),
}

impl From<json::ParserError> for DocParsingError {
    /// Wraps a JSON parser error into `DocParsingError::NotJSON`,
    /// which lets `try!` convert it automatically.
    fn from(parser_error: json::ParserError) -> Self {
        DocParsingError::NotJSON(parser_error)
    }
}


#[cfg(test)]
mod tests {

    use schema::*;
    use rustc_serialize::json;
    use schema::field_type::ValueParsingError;
    use schema::schema::DocParsingError::NotJSON;

    /// The schema is serialized as a JSON array holding one object per field,
    /// in field id order.
    #[test]
    pub fn test_schema_serialization() {
        let mut schema_builder = SchemaBuilder::default();
        let count_options = IntOptions::default().set_stored().set_fast();
        schema_builder.add_text_field("title", TEXT);
        schema_builder.add_text_field("author", STRING);
        schema_builder.add_u64_field("count", count_options);
        let schema = schema_builder.build();
        let schema_json: String = json::as_pretty_json(&schema).to_string();
        let expected = r#"[
  {
    "name": "title",
    "type": "text",
    "options": {
      "indexing": "position",
      "stored": false
    }
  },
  {
    "name": "author",
    "type": "text",
    "options": {
      "indexing": "untokenized",
      "stored": false
    }
  },
  {
    "name": "count",
    "type": "u64",
    "options": {
      "indexed": false,
      "fast": true,
      "stored": true
    }
  }
]"#;
        assert_eq!(schema_json, expected);

    }


    /// A document survives a `to_json` / `parse_document` round trip.
    #[test]
    pub fn test_document_to_json() {
        let mut schema_builder = SchemaBuilder::default();
        let count_options = IntOptions::default().set_stored().set_fast();
        schema_builder.add_text_field("title", TEXT);
        schema_builder.add_text_field("author", STRING);
        schema_builder.add_u64_field("count", count_options);
        let schema = schema_builder.build();
        let doc_json = r#"{
                "title": "my title",
                "author": "fulmicoton",
                "count": 4
        }"#;
        let doc = schema.parse_document(doc_json).unwrap();
        let doc_serdeser = schema.parse_document(&schema.to_json(&doc)).unwrap();
        assert_eq!(doc, doc_serdeser);
    }

    /// Exercises `parse_document` on valid payloads and on each error case.
    #[test]
    pub fn test_parse_document() {
        let mut schema_builder = SchemaBuilder::default();
        let count_options = IntOptions::default().set_stored().set_fast();
        let title_field = schema_builder.add_text_field("title", TEXT);
        let author_field = schema_builder.add_text_field("author", STRING);
        let count_field = schema_builder.add_u64_field("count", count_options);
        let schema = schema_builder.build();
        {
            // An empty object yields an empty document.
            let doc = schema.parse_document("{}").unwrap();
            assert!(doc.field_values().is_empty());
        }
        {
            let doc = schema.parse_document(r#"{
                "title": "my title",
                "author": "fulmicoton",
                "count": 4
            }"#).unwrap();
            assert_eq!(doc.get_first(title_field).unwrap().text(), "my title");
            assert_eq!(doc.get_first(author_field).unwrap().text(), "fulmicoton");
            assert_eq!(doc.get_first(count_field).unwrap().u64_value(), 4);
        }
        {
            // The comma after the "author" entry is missing on purpose:
            // the payload is not valid JSON.
            let json_err = schema.parse_document(r#"{
                "title": "my title",
                "author": "fulmicoton"
                "count": 4
            }"#);
            match json_err {
                Err(DocParsingError::NotJSON(_)) => {}
                other => panic!("expected a NotJSON error, got {:?}", other),
            }
        }
        {
            // "jambon" is not a field of the schema.
            let json_err = schema.parse_document(r#"{
                "title": "my title",
                "author": "fulmicoton",
                "count": 4,
                "jambon": "bayonne"
            }"#);
            match json_err {
                Err(DocParsingError::NoSuchFieldInSchema(field_name)) => {
                    assert_eq!(field_name, "jambon");
                }
                other => panic!("expected a NoSuchFieldInSchema error, got {:?}", other),
            }
        }
        {
            // A string is not an acceptable value for a u64 field.
            // NOTE(review): the type error on "count" is expected to be reported
            // before the unknown "jambon" field is reached; this presumably relies
            // on the keys being visited in sorted order.
            let json_err = schema.parse_document(r#"{
                "title": "my title",
                "author": "fulmicoton",
                "count": "5",
                "jambon": "bayonne"
            }"#);
            match json_err {
                Err(DocParsingError::ValueError(_, ValueParsingError::TypeError(_))) => {}
                other => panic!("expected a ValueError(TypeError), got {:?}", other),
            }
        }
        {
            // A negative number is not an acceptable value for a u64 field.
            let json_err = schema.parse_document(r#"{
                "title": "my title",
                "author": "fulmicoton",
                "count": -5
            }"#);
            match json_err {
                Err(DocParsingError::ValueError(_, ValueParsingError::TypeError(_))) => {}
                other => panic!("expected a ValueError(TypeError), got {:?}", other),
            }
        }
        {
            // 5000000000 does not fit in 32 bits: it must not be reported as an overflow.
            let json_err = schema.parse_document(r#"{
                "title": "my title",
                "author": "fulmicoton",
                "count": 5000000000
            }"#);
            match json_err {
                Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
                    panic!("5000000000 was unexpectedly rejected with an OverflowError");
                }
                _ => {}
            }
        }
        {
            // This number is expected to be rejected while parsing the JSON itself.
            let json_err = schema.parse_document(r#"{
                "title": "my title",
                "author": "fulmicoton",
                "count": 50000000000000000000
            }"#);
            match json_err {
                Err(NotJSON(_)) => {}
                other => panic!("expected a NotJSON error, got {:?}", other),
            }
        }
    }
}