diff --git a/Cargo.toml b/Cargo.toml index e749e0e65..cc0753c7f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ bincode = "0.4.0" libc = "0.2.6" argparse = "*" num_cpus = "0.2" +itertools = "0.4.16" lz4 = "1.13.131" time = "0.1.34" uuid = "0.1" diff --git a/src/core/index.rs b/src/core/index.rs index f0fd86c1b..8c7e1a81b 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -78,7 +78,7 @@ impl Index { let directory = try!(MmapDirectory::open(directory_path)); let directory_ptr = Box::new(directory); let mut index = Index::from_directory(directory_ptr, Schema::new()); - try!(index.load_metas()); //< does the directory already exists? + try!(index.load_metas()); //< TODO does the directory already exists? Ok(index) } diff --git a/src/core/segment_component.rs b/src/core/segment_component.rs new file mode 100644 index 000000000..a55ea19dc --- /dev/null +++ b/src/core/segment_component.rs @@ -0,0 +1,41 @@ +use std::vec::IntoIter; + +#[derive(Copy, Clone)] +pub enum SegmentComponent { + INFO, + POSTINGS, + POSITIONS, + FASTFIELDS, + FIELDNORMS, + TERMS, + STORE, +} + +impl SegmentComponent { + pub fn values() -> IntoIter { + vec!( + SegmentComponent::INFO, + SegmentComponent::POSTINGS, + SegmentComponent::POSITIONS, + SegmentComponent::FASTFIELDS, + SegmentComponent::FIELDNORMS, + SegmentComponent::TERMS, + SegmentComponent::STORE, + ).into_iter() + } + + pub fn path_suffix(&self)-> &'static str { + match *self { + SegmentComponent::POSITIONS => ".pos", + SegmentComponent::INFO => ".info", + SegmentComponent::POSTINGS => ".idx", + SegmentComponent::TERMS => ".term", + SegmentComponent::STORE => ".store", + SegmentComponent::FASTFIELDS => ".fast", + SegmentComponent::FIELDNORMS => ".fieldnorm", + } + } +} + + + \ No newline at end of file diff --git a/src/core/segment_id.rs b/src/core/segment_id.rs new file mode 100644 index 000000000..aa30379a0 --- /dev/null +++ b/src/core/segment_id.rs @@ -0,0 +1,42 @@ +use uuid::Uuid; +use std::fmt; +use rustc_serialize::{Encoder, Decoder, Encodable, Decodable}; +use core::SegmentComponent; +use std::path::PathBuf; + +#[derive(Clone, Copy, PartialEq, Eq, Hash)] +pub struct SegmentId(Uuid); + +impl SegmentId { + pub fn new() -> SegmentId { + SegmentId(Uuid::new_v4()) + } + + pub fn uuid_string(&self,) -> String { + self.0.to_simple_string() + } + + pub fn relative_path(&self, component: SegmentComponent) -> PathBuf { + let filename = self.uuid_string() + component.path_suffix(); + PathBuf::from(filename) + } +} + +impl Encodable for SegmentId { + fn encode(&self, s: &mut S) -> Result<(), S::Error> { + self.0.encode(s) + } +} + +impl Decodable for SegmentId { + fn decode(d: &mut D) -> Result { + Uuid::decode(d).map(SegmentId) + } +} + +impl fmt::Debug for SegmentId { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "SegmentId({:?})", self.uuid_string()) + } +} + diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index febc00bf4..52e6fa0b1 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -28,6 +28,7 @@ pub struct SegmentReader { postings_data: ReadOnlySource, store_reader: StoreReader, fast_fields_reader: U32FastFieldsReader, + fieldnorms_reader: U32FastFieldsReader, schema: Schema, } @@ -54,7 +55,10 @@ impl SegmentReader { self.fast_fields_reader.get_field(field) }, } - + } + + pub fn get_fieldnorms_reader(&self, field: Field) -> io::Result { + self.fieldnorms_reader.get_field(field) } pub fn doc_freq(&self, term: &Term) -> u32 { @@ -79,9 +83,13 @@ impl SegmentReader { let term_infos = try!(FstMap::from_source(source)); let store_reader = StoreReader::new(try!(segment.open_read(SegmentComponent::STORE))); let postings_shared_mmap = try!(segment.open_read(SegmentComponent::POSTINGS)); + let fast_field_data = try!(segment.open_read(SegmentComponent::FASTFIELDS)); let fast_fields_reader = try!(U32FastFieldsReader::open(fast_field_data)); + let fieldnorms_data = try!(segment.open_read(SegmentComponent::FIELDNORMS)); + let fieldnorms_reader = try!(U32FastFieldsReader::open(fieldnorms_data)); + let schema = segment.schema(); Ok(SegmentReader { segment_info: segment_info, @@ -90,6 +98,7 @@ impl SegmentReader { segment_id: segment.id(), store_reader: store_reader, fast_fields_reader: fast_fields_reader, + fieldnorms_reader: fieldnorms_reader, schema: schema, }) } @@ -97,7 +106,7 @@ impl SegmentReader { pub fn term_infos(&self) -> &FstMap { &self.term_infos } - + /// Returns the document (or to be accurate, its stored field) /// bearing the given doc id. /// This method is slow and should seldom be called from diff --git a/src/core/segment_serializer.rs b/src/core/segment_serializer.rs index 176f88dfb..3d81747e6 100644 --- a/src/core/segment_serializer.rs +++ b/src/core/segment_serializer.rs @@ -13,6 +13,7 @@ pub struct SegmentSerializer { segment: Segment, store_writer: StoreWriter, fast_field_serializer: FastFieldSerializer, + fieldnorms_serializer: FastFieldSerializer, postings_serializer: PostingsSerializer, } @@ -20,14 +21,20 @@ impl SegmentSerializer { pub fn for_segment(segment: &Segment) -> io::Result { let store_write = try!(segment.open_write(SegmentComponent::STORE)); + let fast_field_write = try!(segment.open_write(SegmentComponent::FASTFIELDS)); let fast_field_serializer = try!(FastFieldSerializer::new(fast_field_write)); + + let fieldnorms_write = try!(segment.open_write(SegmentComponent::FIELDNORMS)); + let fieldnorms_serializer = try!(FastFieldSerializer::new(fieldnorms_write)); + let postings_serializer = try!(PostingsSerializer::open(segment)); Ok(SegmentSerializer { segment: segment.clone(), postings_serializer: postings_serializer, store_writer: StoreWriter::new(store_write), fast_field_serializer: fast_field_serializer, + fieldnorms_serializer: fieldnorms_serializer, }) } @@ -38,6 +45,10 @@ impl SegmentSerializer { pub fn get_fast_field_serializer(&mut self,) -> &mut FastFieldSerializer { &mut self.fast_field_serializer } + + pub fn get_fieldnorms_serializer(&mut self,) -> &mut FastFieldSerializer { + &mut self.fieldnorms_serializer + } pub fn get_store_writer(&mut self,) -> &mut StoreWriter { &mut self.store_writer @@ -55,6 +66,7 @@ impl SegmentSerializer { try!(self.fast_field_serializer.close()); try!(self.postings_serializer.close()); try!(self.store_writer.close()); + try!(self.fieldnorms_serializer.close()); Ok(()) } } diff --git a/src/core/segment_writer.rs b/src/core/segment_writer.rs index 5f90bcafd..c5ecb865b 100644 --- a/src/core/segment_writer.rs +++ b/src/core/segment_writer.rs @@ -13,6 +13,7 @@ use postings::PostingsWriter; use fastfield::U32FastFieldsWriter; use std::clone::Clone; use std::io; +use schema::Field; use schema::FieldValue; pub struct SegmentWriter { @@ -21,15 +22,33 @@ pub struct SegmentWriter { postings_writer: PostingsWriter, segment_serializer: SegmentSerializer, fast_field_writers: U32FastFieldsWriter, + fieldnorms_writer: U32FastFieldsWriter, +} + +fn create_fieldnorms_writer(schema: &Schema) -> U32FastFieldsWriter { + let u32_fields: Vec = schema.fields() + .iter() + .enumerate() + .filter(|&(_, field_entry)| field_entry.is_indexed()) + .map(|(field_id, _)| Field(field_id as u8)) + .collect(); + U32FastFieldsWriter::new(u32_fields) +} + +fn compute_field_norm(num_tokens: usize) -> u32 { + ((350f32 / (1f32 + num_tokens as f32).sqrt()) as u32) } impl SegmentWriter { + + pub fn for_segment(segment: Segment, schema: &Schema) -> io::Result { let segment_serializer = try!(SegmentSerializer::for_segment(&segment)); Ok(SegmentWriter { max_doc: 0, postings_writer: PostingsWriter::new(), + fieldnorms_writer: create_fieldnorms_writer(schema), segment_serializer: segment_serializer, tokenizer: SimpleTokenizer::new(), fast_field_writers: U32FastFieldsWriter::from_schema(schema), @@ -48,41 +67,56 @@ impl SegmentWriter { self.postings_writer.close(); write(&self.postings_writer, &self.fast_field_writers, + &self.fieldnorms_writer, segment_info, self.segment_serializer) } pub fn add_document(&mut self, doc: &Document, schema: &Schema) -> io::Result<()> { let doc_id = self.max_doc; - for field_value in doc.get_fields() { - let field_options = schema.get_field_entry(field_value.field()); - match *field_options { - FieldEntry::Text(_, ref text_options) => { - if text_options.get_indexing_options().is_tokenized() { - let mut tokens = self.tokenizer.tokenize(field_value.text()); - let mut pos = 0u32; - let field = field_value.field(); - loop { - match tokens.next() { - Some(token) => { - let term = Term::from_field_text(field, token); - self.postings_writer.suscribe(doc_id, pos, term); - pos += 1; - }, - None => { break; } + for (field, field_values) in doc.get_sorted_fields() { + let mut num_tokens: usize = 0; + for field_value in field_values { + let field_options = schema.get_field_entry(field); + match *field_options { + FieldEntry::Text(_, ref text_options) => { + if text_options.get_indexing_options().is_tokenized() { + let mut tokens = self.tokenizer.tokenize(field_value.text()); + // right now num_tokens and pos are redundant, but it should + // change when we get proper analyzers + + let mut pos = 0u32; + let field = field_value.field(); + loop { + match tokens.next() { + Some(token) => { + let term = Term::from_field_text(field, token); + self.postings_writer.suscribe(doc_id, pos, term); + pos += 1; + num_tokens += 1; + }, + None => { break; } + } } } + // TODO untokenized yet indexed } - // TODO untokenized yet indexed - } - FieldEntry::U32(_, ref u32_options) => { - if u32_options.is_indexed() { - let term = Term::from_field_u32(field_value.field(), field_value.u32_value()); - self.postings_writer.suscribe(doc_id, 0.clone(), term); + FieldEntry::U32(_, ref u32_options) => { + if u32_options.is_indexed() { + let term = Term::from_field_u32(field_value.field(), field_value.u32_value()); + self.postings_writer.suscribe(doc_id, 0.clone(), term); + } } } } + let field_norm = compute_field_norm(num_tokens); + self.fieldnorms_writer + .get_field_writer(field) + .map(|field_norms_writer| { + field_norms_writer.set_val(doc_id, field_norm) + }); } + self.fast_field_writers.add_document(&doc); let stored_fieldvalues: Vec<&FieldValue> = doc .get_fields() @@ -104,10 +138,12 @@ impl SegmentWriter { fn write(postings_writer: &PostingsWriter, fast_field_writers: &U32FastFieldsWriter, + fieldnorms_writer: &U32FastFieldsWriter, segment_info: SegmentInfo, mut serializer: SegmentSerializer) -> io::Result<()> { try!(postings_writer.serialize(serializer.get_postings_serializer())); try!(fast_field_writers.serialize(serializer.get_fast_field_serializer())); + try!(fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer())); try!(serializer.write_segment_info(&segment_info)); try!(serializer.close()); Ok(()) @@ -117,6 +153,7 @@ impl SerializableSegment for SegmentWriter { fn write(&self, serializer: SegmentSerializer) -> io::Result<()> { write(&self.postings_writer, &self.fast_field_writers, + &self.fieldnorms_writer, self.segment_info(), serializer) } diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 0765e1fca..ae64a97fd 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -104,7 +104,7 @@ impl U32FastFieldsReader { U32FastFieldReader::open(field_source) } None => { - Err(io::Error::new(io::ErrorKind::InvalidInput, "Could not find field, has it been set as a fast field?")) + Err(io::Error::new(io::ErrorKind::InvalidInput, "Could not find field")) } } diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 329ea7708..5edd4ecbc 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -1,6 +1,7 @@ use schema::{Schema, FieldValue, Field, Document}; use fastfield::FastFieldSerializer; use std::io; +use DocId; pub struct U32FastFieldsWriter { field_writers: Vec, @@ -9,7 +10,6 @@ pub struct U32FastFieldsWriter { impl U32FastFieldsWriter { pub fn from_schema(schema: &Schema) -> U32FastFieldsWriter { - // TODO fix let u32_fields: Vec = schema.fields() .iter() .enumerate() @@ -27,7 +27,14 @@ impl U32FastFieldsWriter { .collect(), } } - + + pub fn get_field_writer(&mut self, field: Field) -> Option<&mut U32FastFieldWriter> { + self.field_writers + .iter_mut() + .filter(|field_writer| field_writer.field == field) + .next() + } + pub fn add_document(&mut self, doc: &Document) { for field_writer in self.field_writers.iter_mut() { field_writer.add_document(doc); @@ -45,6 +52,7 @@ impl U32FastFieldsWriter { pub struct U32FastFieldWriter { field: Field, vals: Vec, + cur_doc: DocId, } impl U32FastFieldWriter { @@ -52,14 +60,22 @@ impl U32FastFieldWriter { U32FastFieldWriter { field: field.clone(), vals: Vec::new(), + cur_doc: 0, } } - pub fn add_val(&mut self, val: u32) { - self.vals.push(val); + pub fn set_val(&mut self, doc: DocId, val: u32) { + for _ in self.cur_doc .. doc { + self.vals.push(0u32); + } + self.cur_doc = doc; + self.add_val(val); } - + pub fn add_val(&mut self, val: u32) { + self.vals.push(val); + self.cur_doc += 1; + } fn extract_val(&self, doc: &Document) -> u32 { match doc.get_first(self.field) { diff --git a/src/lib.rs b/src/lib.rs index 25d2012d9..09312f066 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -23,6 +23,7 @@ extern crate lz4; extern crate uuid; extern crate num_cpus; extern crate combine; +extern crate itertools; #[cfg(test)] extern crate test; #[cfg(test)] extern crate rand; @@ -183,6 +184,49 @@ mod tests { assert_eq!(searcher.doc_freq(&term_d), 0); } } + + + #[test] + fn test_fieldnorm() { + let mut schema = schema::Schema::new(); + let text_field = schema.add_text_field("text", schema::TEXT); + let index = Index::create_in_ram(schema); + { + let mut index_writer = index.writer_with_num_threads(1).unwrap(); + { + let mut doc = Document::new(); + doc.add_text(text_field, "a b c"); + index_writer.add_document(doc).unwrap(); + } + { + let doc = Document::new(); + index_writer.add_document(doc).unwrap(); + } + { + let mut doc = Document::new(); + doc.add_text(text_field, "a b"); + index_writer.add_document(doc).unwrap(); + } + index_writer.wait().unwrap(); + } + { + + let searcher = index.searcher().unwrap(); + let segment_reader: &SegmentReader = searcher.segments().iter().next().unwrap(); + let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field).unwrap(); + assert_eq!(fieldnorms_reader.get(0), 175); + assert_eq!(fieldnorms_reader.get(1), 0); + assert_eq!(fieldnorms_reader.get(2), 202); + // let term_a = Term::from_field_text(text_field, "a"); + // assert_eq!(searcher.doc_freq(&term_a), 3); + // let term_b = Term::from_field_text(text_field, "b"); + // assert_eq!(searcher.doc_freq(&term_b), 1); + // let term_c = Term::from_field_text(text_field, "c"); + // assert_eq!(searcher.doc_freq(&term_c), 2); + // let term_d = Term::from_field_text(text_field, "d"); + // assert_eq!(searcher.doc_freq(&term_d), 0); + } + } #[test] fn test_termfreq() { diff --git a/src/schema/document.rs b/src/schema/document.rs index dddcb39d0..92ebe6da6 100644 --- a/src/schema/document.rs +++ b/src/schema/document.rs @@ -1,5 +1,5 @@ use super::*; - +use itertools::Itertools; /// /// Document are really just a list of field values. @@ -37,6 +37,20 @@ impl Document { &self.field_values } + pub fn get_sorted_fields(&self) -> Vec<(Field, Vec<&FieldValue>)> { + let mut field_values: Vec<&FieldValue> = self.get_fields().iter().collect(); + field_values.sort_by_key(|field_value| field_value.field()); + let sorted_fields: Vec<(Field, Vec<&FieldValue>)> = field_values + .into_iter() + .group_by(|field_value| field_value.field()) + .into_iter() + .map(|(key, group)| { + (key, group.into_iter().collect()) + }) + .collect(); + sorted_fields + } + pub fn get_all<'a>(&'a self, field: Field) -> Vec<&'a FieldValue> { self.field_values .iter() diff --git a/src/schema/field.rs b/src/schema/field.rs index 0c991f187..e4f129120 100644 --- a/src/schema/field.rs +++ b/src/schema/field.rs @@ -3,7 +3,7 @@ use std::io::Write; use std::io::Read; use common::BinarySerializable; -#[derive(Copy,Clone,Debug,PartialEq,PartialOrd,Eq,Hash)] +#[derive(Copy,Clone,Debug,PartialEq,PartialOrd,Eq,Ord,Hash)] pub struct Field(pub u8); impl BinarySerializable for Field { diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index 6f1849554..5f4ba7c32 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -20,14 +20,17 @@ impl FieldEntry { } } + pub fn is_indexed(&self,) -> bool { + match self { + &FieldEntry::Text(_, ref options) => options.get_indexing_options().is_indexed(), + _ => false, + } + } + pub fn is_u32_fast(&self,) -> bool { match self { - &FieldEntry::U32(_, ref options) => { - options.is_fast() - } - _ => { - false - } + &FieldEntry::U32(_, ref options) => options.is_fast(), + _ => false, } } diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs index 1f1c9605a..67ce47d87 100644 --- a/src/schema/text_options.rs +++ b/src/schema/text_options.rs @@ -11,7 +11,7 @@ pub struct TextOptions { impl TextOptions { pub fn get_indexing_options(&self,) -> TextIndexingOptions { - self.indexing_options.clone() + self.indexing_options } pub fn is_stored(&self,) -> bool { @@ -46,7 +46,7 @@ impl TextOptions { } } -#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash, RustcDecodable, RustcEncodable)] +#[derive(Clone,Copy,Debug,PartialEq,PartialOrd,Eq,Hash, RustcDecodable, RustcEncodable)] pub enum TextIndexingOptions { Unindexed, Untokenized, @@ -71,6 +71,13 @@ impl TextIndexingOptions { TextIndexingOptions::TokenizedWithFreqAndPosition => true, _ => false, } + } + + pub fn is_indexed(&self,) -> bool { + match *self { + TextIndexingOptions::Unindexed => false, + _ => true, + } } pub fn is_position_enabled(&self,) -> bool {