Files
tantivy/src/schema/schema.rs
2017-04-21 09:13:26 +09:00

504 lines
15 KiB
Rust

use std::collections::HashMap;
use rustc_serialize::Decodable;
use rustc_serialize::Encodable;
use rustc_serialize::Decoder;
use rustc_serialize::Encoder;
use rustc_serialize::json;
use rustc_serialize::json::Json;
use std::collections::BTreeMap;
use schema::field_type::ValueParsingError;
use std::sync::Arc;
use super::*;
use std::fmt;
/// Builder used to declare the fields of a `Schema`.
///
/// Tantivy has a very strict schema.
/// You need to specify in advance whether a field is indexed or not,
/// stored or not, and RAM-based or not.
///
/// This is done by creating a `SchemaBuilder`, setting up the
/// fields one by one, and finally calling `build()` to get the `Schema`.
/// It is for the moment impossible to remove fields.
///
/// # Examples
///
/// ```
/// use tantivy::schema::*;
///
/// let mut schema_builder = SchemaBuilder::default();
/// let id_field = schema_builder.add_text_field("id", STRING);
/// let title_field = schema_builder.add_text_field("title", TEXT);
/// let body_field = schema_builder.add_text_field("body", TEXT);
/// let schema = schema_builder.build();
///
/// ```
pub struct SchemaBuilder {
/// Field entries, in the order in which they were added.
/// The handle of a field is its position in this vector.
fields: Vec<FieldEntry>,
/// Lookup from field name to field handle.
/// Adding a second field with an existing name replaces the
/// handle stored here; the earlier entry stays in `fields`.
fields_map: HashMap<String, Field>,
}
impl SchemaBuilder {
/// Create a new `SchemaBuilder`
pub fn new() -> SchemaBuilder {
SchemaBuilder::default()
}
/// Adds a new u64 field.
/// Returns the associated field handle
///
/// # Caution
///
/// Appending two fields with the same name
/// will result in the shadowing of the first
/// by the second one.
/// The first field will get a field id
/// but only the second one will be indexed
pub fn add_u64_field(
&mut self,
field_name_str: &str,
field_options: IntOptions) -> Field {
let field_name = String::from(field_name_str);
let field_entry = FieldEntry::new_u64(field_name, field_options);
self.add_field(field_entry)
}
/// Adds a new text field.
/// Returns the associated field handle
///
/// # Caution
///
/// Appending two fields with the same name
/// will result in the shadowing of the first
/// by the second one.
/// The first field will get a field id
/// but only the second one will be indexed
pub fn add_text_field(
&mut self,
field_name_str: &str,
field_options: TextOptions) -> Field {
let field_name = String::from(field_name_str);
let field_entry = FieldEntry::new_text(field_name, field_options);
self.add_field(field_entry)
}
/// Adds a field entry to the schema in build.
fn add_field(&mut self, field_entry: FieldEntry) -> Field {
let field = Field(self.fields.len() as u32);
let field_name = field_entry.name().clone();
self.fields.push(field_entry);
self.fields_map.insert(field_name, field);
field
}
/// Finalize the creation of a `Schema`
/// This will consume your `SchemaBuilder`
pub fn build(self,) -> Schema {
Schema(Arc::new(InnerSchema {
fields: self.fields,
fields_map: self.fields_map,
}))
}
}
impl Default for SchemaBuilder {
fn default() -> SchemaBuilder {
SchemaBuilder {
fields: Vec::new(),
fields_map: HashMap::new(),
}
}
}
/// Content of a `Schema`, which `Schema` holds behind an `Arc`.
#[derive(Debug)]
struct InnerSchema {
/// Field entries. A `Field` handle is used as an index into this vector.
fields: Vec<FieldEntry>,
/// Lookup from field name to field handle.
/// It is transient: the `Encodable` impl of `Schema` only writes `fields`,
/// and the `Decodable` impl rebuilds this map while re-adding the entries.
fields_map: HashMap<String, Field>, // transient
}
/// The set of fields declared through a `SchemaBuilder`.
///
/// Tantivy has a very strict schema.
/// You need to specify in advance whether a field is indexed or not,
/// stored or not, and RAM-based or not.
///
/// This is done by declaring the fields one by one on a
/// `SchemaBuilder`, and then calling `build()` on it.
/// It is for the moment impossible to remove fields.
///
/// The content of a `Schema` is held behind an `Arc`: cloning a
/// `Schema` shares the same field list instead of copying it.
///
/// # Examples
///
/// ```
/// use tantivy::schema::*;
///
/// let mut schema_builder = SchemaBuilder::default();
/// let id_field = schema_builder.add_text_field("id", STRING);
/// let title_field = schema_builder.add_text_field("title", TEXT);
/// let body_field = schema_builder.add_text_field("body", TEXT);
/// let schema = schema_builder.build();
///
/// ```
#[derive(Clone)]
pub struct Schema(Arc<InnerSchema>);
impl Schema {
/// Return the `FieldEntry` associated to a `Field`.
pub fn get_field_entry(&self, field: Field) -> &FieldEntry {
&self.0.fields[field.0 as usize]
}
/// Return the field name for a given `Field`.
pub fn get_field_name(&self, field: Field) -> &String {
self.get_field_entry(field).name()
}
/// Return the list of all the `Field`s.
pub fn fields(&self,) -> &[FieldEntry] {
&self.0.fields
}
/// Returns the field options associated with a given name.
///
/// # Panics
/// Panics if the field name does not exist.
/// It is meant as an helper for user who created
/// and control the content of their schema.
///
/// If panicking is not an option for you,
/// you may use `get(&self, field_name: &str)`.
pub fn get_field(&self, field_name: &str) -> Option<Field> {
self.0.fields_map.get(field_name).cloned()
}
/// Create a named document off the doc.
pub fn to_named_doc(&self, doc: &Document) -> NamedFieldDocument {
let mut field_map = BTreeMap::new();
for (field, field_values) in doc.get_sorted_field_values() {
let field_name = self.get_field_name(field);
let values: Vec<Value> = field_values
.into_iter()
.map(|field_val| field_val.value() )
.cloned()
.collect();
field_map.insert(field_name.clone(), values);
}
NamedFieldDocument(field_map)
}
/// Encode the schema in JSON.
///
/// Encoding a document cannot fail.
pub fn to_json(&self, doc: &Document) -> String {
json::encode(&self.to_named_doc(doc)).unwrap()
}
/// Build a document object from a json-object.
pub fn parse_document(&self, doc_json: &str) -> Result<Document, DocParsingError> {
let json_node = try!(Json::from_str(doc_json));
let some_json_obj = json_node.as_object();
if !some_json_obj.is_some() {
let doc_json_sample: String =
if doc_json.len() < 20 {
String::from(doc_json)
}
else {
format!("{:?}...", &doc_json[0..20])
};
return Err(DocParsingError::NotJSONObject(doc_json_sample))
}
let json_obj = some_json_obj.unwrap();
let mut doc = Document::default();
for (field_name, json_value) in json_obj.iter() {
match self.get_field(field_name) {
Some(field) => {
let field_entry = self.get_field_entry(field);
let field_type = field_entry.field_type();
match *json_value {
Json::Array(ref json_items) => {
for json_item in json_items {
let value = try!(
field_type
.value_from_json(&json_item)
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))
);
doc.add(FieldValue::new(field, value));
}
}
_ => {
let value = try!(
field_type
.value_from_json(&json_value)
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))
);
doc.add(FieldValue::new(field, value));
}
}
}
None => {
return Err(DocParsingError::NoSuchFieldInSchema(field_name.clone()))
}
}
}
Ok(doc)
}
}
impl fmt::Debug for Schema {
    /// Formats the schema by delegating to the `Debug` output of its content.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        fmt::Debug::fmt(&self.0, f)
    }
}
impl Decodable for Schema {
fn decode<D: Decoder>(d: &mut D) -> Result <Self, D::Error> {
let mut schema_builder = SchemaBuilder::default();
try!(d.read_seq(|d, num_fields| {
for _ in 0..num_fields {
let field_entry = try!(FieldEntry::decode(d));
schema_builder.add_field(field_entry);
}
Ok(())
}));
Ok(schema_builder.build())
}
}
impl Encodable for Schema {
    /// Encodes the schema as the sequence of its field entries.
    ///
    /// Only the field list is written; the name lookup is not encoded.
    fn encode<S: Encoder>(&self, s: &mut S) -> Result<(), S::Error> {
        let entries = &self.0.fields;
        s.emit_seq(entries.len(), |seq| {
            for (idx, entry) in entries.iter().enumerate() {
                try!(seq.emit_seq_elt(idx, |elt| entry.encode(elt)));
            }
            Ok(())
        })
    }
}
impl From<SchemaBuilder> for Schema {
fn from(schema_builder: SchemaBuilder) -> Schema {
schema_builder.build()
}
}
/// Error that may happen when deserializing
/// a document from JSON.
#[derive(Debug)]
pub enum DocParsingError {
/// The payload given is not valid JSON.
/// Holds the error reported by the JSON parser.
NotJSON(json::ParserError),
/// The payload given is not a JSON Object (`{...}`).
/// Holds a sample of the offending payload.
NotJSONObject(String),
/// One of the value nodes could not be parsed.
/// Holds the name of the field and the underlying parsing error.
ValueError(String, ValueParsingError),
/// The json-document contains a field that is not declared in the schema.
/// Holds the name of the unknown field.
NoSuchFieldInSchema(String),
}
impl From<json::ParserError> for DocParsingError {
    /// Wraps a JSON parser error into `DocParsingError::NotJSON`,
    /// so that `try!` can be used on the result of the JSON parser.
    fn from(parser_error: json::ParserError) -> Self {
        DocParsingError::NotJSON(parser_error)
    }
}
#[cfg(test)]
mod tests {
    use schema::*;
    use rustc_serialize::json;
    use schema::field_type::ValueParsingError;
    use schema::schema::DocParsingError::NotJSON;

    /// The pretty JSON form of a schema is the list of its field entries.
    #[test]
    pub fn test_schema_serialization() {
        let mut schema_builder = SchemaBuilder::default();
        let count_options = IntOptions::default().set_stored().set_fast();
        schema_builder.add_text_field("title", TEXT);
        schema_builder.add_text_field("author", STRING);
        schema_builder.add_u64_field("count", count_options);
        let schema = schema_builder.build();
        let schema_json: String = json::as_pretty_json(&schema).to_string();
        // The pretty encoder indents nested levels, so the expected
        // literal has to carry that indentation to compare equal.
        let expected = r#"[
  {
    "name": "title",
    "type": "text",
    "options": {
      "indexing": "position",
      "stored": false
    }
  },
  {
    "name": "author",
    "type": "text",
    "options": {
      "indexing": "untokenized",
      "stored": false
    }
  },
  {
    "name": "count",
    "type": "u64",
    "options": {
      "indexed": false,
      "fast": true,
      "stored": true
    }
  }
]"#;
        assert_eq!(schema_json, expected);
    }

    /// A document survives a round trip through its JSON form.
    #[test]
    pub fn test_document_to_json() {
        let mut schema_builder = SchemaBuilder::default();
        let count_options = IntOptions::default().set_stored().set_fast();
        schema_builder.add_text_field("title", TEXT);
        schema_builder.add_text_field("author", STRING);
        schema_builder.add_u64_field("count", count_options);
        let schema = schema_builder.build();
        let doc_json = r#"{
"title": "my title",
"author": "fulmicoton",
"count": 4
}"#;
        let doc = schema.parse_document(doc_json).unwrap();
        let doc_serdeser = schema.parse_document(&schema.to_json(&doc)).unwrap();
        assert_eq!(doc, doc_serdeser);
    }

    /// Exercises the success path and every error path of `parse_document`.
    #[test]
    pub fn test_parse_document() {
        let mut schema_builder = SchemaBuilder::default();
        let count_options = IntOptions::default().set_stored().set_fast();
        let title_field = schema_builder.add_text_field("title", TEXT);
        let author_field = schema_builder.add_text_field("author", STRING);
        let count_field = schema_builder.add_u64_field("count", count_options);
        let schema = schema_builder.build();
        {
            // An empty object yields an empty document.
            let doc = schema.parse_document("{}").unwrap();
            assert!(doc.field_values().is_empty());
        }
        {
            let doc = schema.parse_document(r#"{
"title": "my title",
"author": "fulmicoton",
"count": 4
}"#).unwrap();
            assert_eq!(doc.get_first(title_field).unwrap().text(), "my title");
            assert_eq!(doc.get_first(author_field).unwrap().text(), "fulmicoton");
            assert_eq!(doc.get_first(count_field).unwrap().u64_value(), 4);
        }
        {
            // A comma is missing after the author: the payload is not JSON.
            let json_err = schema.parse_document(r#"{
"title": "my title",
"author": "fulmicoton"
"count": 4
}"#);
            match json_err {
                Err(DocParsingError::NotJSON(_)) => {}
                other => panic!("expected a NotJSON error, got {:?}", other),
            }
        }
        {
            // "jambon" is not a field of the schema.
            let json_err = schema.parse_document(r#"{
"title": "my title",
"author": "fulmicoton",
"count": 4,
"jambon": "bayonne"
}"#);
            match json_err {
                Err(DocParsingError::NoSuchFieldInSchema(field_name)) => {
                    assert_eq!(field_name, "jambon");
                }
                other => panic!("expected a NoSuchFieldInSchema error, got {:?}", other),
            }
        }
        {
            // A string is given for the u64 field.
            let json_err = schema.parse_document(r#"{
"title": "my title",
"author": "fulmicoton",
"count": "5",
"jambon": "bayonne"
}"#);
            match json_err {
                Err(DocParsingError::ValueError(_, ValueParsingError::TypeError(_))) => {}
                other => panic!("expected a TypeError value error, got {:?}", other),
            }
        }
        {
            // A negative number is given for the u64 field.
            let json_err = schema.parse_document(r#"{
"title": "my title",
"author": "fulmicoton",
"count": -5
}"#);
            match json_err {
                Err(DocParsingError::ValueError(_, ValueParsingError::TypeError(_))) => {}
                other => panic!("expected a TypeError value error, got {:?}", other),
            }
        }
        {
            // This count must not be reported as an overflow.
            let json_err = schema.parse_document(r#"{
"title": "my title",
"author": "fulmicoton",
"count": 5000000000
}"#);
            if let Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) = json_err {
                panic!("unexpected OverflowError for a u64 field: {:?}", json_err);
            }
        }
        {
            // This count is rejected by the JSON parser itself.
            let json_err = schema.parse_document(r#"{
"title": "my title",
"author": "fulmicoton",
"count": 50000000000000000000
}"#);
            match json_err {
                Err(NotJSON(_)) => {}
                other => panic!("expected a NotJSON error, got {:?}", other),
            }
        }
    }
}