From 1ba5bddd7f7b8cb66903b872890c25d43aeec157 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 16 Jan 2016 16:33:38 +0900 Subject: [PATCH] better serial. No extra level for field --- src/core/global.rs | 9 +- src/core/reader.rs | 2 +- src/core/schema.rs | 6 +- src/core/serial.rs | 24 +++-- src/core/writer.rs | 239 ++++++++++++++++++++++++--------------------- tests/core.rs | 81 ++++++--------- 6 files changed, 176 insertions(+), 185 deletions(-) diff --git a/src/core/global.rs b/src/core/global.rs index ca66384e5..cf6fb4410 100644 --- a/src/core/global.rs +++ b/src/core/global.rs @@ -2,12 +2,11 @@ use std::io::{BufWriter, Write}; use std::io; pub type DocId = usize; +pub type FieldId = u32; + +#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)] +pub struct Field(pub FieldId); // pub trait SeekableIterator: Iterator { // pub fn seek(&mut self, el: &T) -> bool; // } - - -pub trait Flushable { - fn flush(&self, writer: &mut W) -> Result; -} diff --git a/src/core/reader.rs b/src/core/reader.rs index a10ed8714..2f939f7a3 100644 --- a/src/core/reader.rs +++ b/src/core/reader.rs @@ -1,6 +1,6 @@ use core::directory::Directory; use core::global::DocId; -use core::schema::Field; +use core::schema::*; pub struct SegmentIndexReader { directory: Directory, diff --git a/src/core/schema.rs b/src/core/schema.rs index 52797bbbe..b9e02e753 100644 --- a/src/core/schema.rs +++ b/src/core/schema.rs @@ -1,6 +1,4 @@ -#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)] -pub struct Field(pub &'static str); - +use core::global::*; #[derive(Clone,Debug,PartialEq,PartialOrd,Eq)] pub struct FieldValue { @@ -11,7 +9,7 @@ pub struct FieldValue { #[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)] pub struct Term<'a> { - pub field: &'a Field, + pub field: Field, pub text: &'a str, } diff --git a/src/core/serial.rs b/src/core/serial.rs index 681d56609..370f421b1 100644 --- a/src/core/serial.rs +++ b/src/core/serial.rs @@ -1,24 +1,22 @@ -use core::global::DocId; -use core::schema::Field; +use core::global::*; +use core::schema::*; // Trait sufficient to serialize a segment. pub trait SerializableSegment<'a> { - type TFieldCur: FieldCursor<'a>; - fn field_cursor(&'a self) -> Self::TFieldCur; + type TermCur: TermCursor<'a>; // TODO rename TermCursorImpl + fn term_cursor(&'a mut self) -> Self::TermCur; } pub trait DocCursor: Iterator { fn doc(&self) -> DocId; } -pub trait TermCursor<'a>: Iterator { - type TDocCur: DocCursor; - fn get_term(&self) -> &'a String; - fn doc_cursor(&self) -> Self::TDocCur; -} -pub trait FieldCursor<'a>: Iterator { - type TTermCur: TermCursor<'a>; - fn get_field(&self) -> Option<&'a Field>; - fn term_cursor(&'a self) -> Self::TTermCur; +// TODO make iteration over Fields somehow sorted +// (Not only forms) +pub trait TermCursor<'a> { + type DocCur: DocCursor; + fn advance(&mut self,) -> bool; + fn get_term(&self) -> Term<'a>; + fn doc_cursor(&self) -> Self::DocCur; } diff --git a/src/core/writer.rs b/src/core/writer.rs index 8e5b0e536..033fd5acc 100644 --- a/src/core/writer.rs +++ b/src/core/writer.rs @@ -1,19 +1,18 @@ use std::io; -use core::schema::Document; -use core::schema::Field; +use std::slice; +use core::global::*; +use core::schema::*; use core::directory::Directory; use core::analyzer::tokenize; use std::collections::{HashMap, BTreeMap}; use std::collections::{hash_map, btree_map}; -use core::DocId; use core::postings::PostingsWriter; -use core::global::Flushable; use std::io::{BufWriter, Write}; use std::mem; use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; use std::iter::Peekable; -use core::serial::{FieldCursor, TermCursor, DocCursor, SerializableSegment}; +use core::serial::*; pub struct SimplePostingsWriter { doc_ids: Vec, @@ -29,7 +28,9 @@ impl SimplePostingsWriter { impl PostingsWriter for SimplePostingsWriter { fn suscribe(&mut self, doc_id: DocId) { - self.doc_ids.push(doc_id); + if self.doc_ids.len() == 0 || self.doc_ids[self.doc_ids.len() - 1] < doc_id { + self.doc_ids.push(doc_id); + } } } @@ -38,20 +39,6 @@ struct FieldWriter { term_index: BTreeMap, } -impl Flushable for SimplePostingsWriter { - fn flush(&self, writer: &mut W) -> Result { - let mut num_bytes_written = 0; - let num_docs = self.doc_ids.len() as u64; - writer.write_u64::(num_docs); - num_bytes_written += 8; - for &doc_id in self.doc_ids.iter() { - writer.write_u64::(doc_id as u64); - num_bytes_written += 8; - } - Ok(num_bytes_written) - } -} - impl FieldWriter { pub fn new() -> FieldWriter { FieldWriter { @@ -132,107 +119,31 @@ pub struct ClosedIndexWriter { -//----------------------------------------- -// Implementation of SerializableSegment -// - -pub struct CIWFieldCursor<'a> { - field_it: hash_map::Iter<'a, Field, FieldWriter>, - current: Option<(&'a Field, &'a FieldWriter)> -} - -impl<'a> CIWFieldCursor<'a> { - fn get_field_writer(&self) -> &'a FieldWriter { - self.current.map(|(_, second)| second).unwrap() - } -} - -impl<'a> Iterator for CIWFieldCursor<'a> { - type Item=&'a Field; - - fn next(&mut self) -> Option<&'a Field> { - self.current = self.field_it.next(); - self.get_field() - } -} - -impl<'a> FieldCursor<'a> for CIWFieldCursor<'a> { - - type TTermCur = CIWTermCursor<'a>; - - fn get_field(&self) -> Option<&'a Field> { - self.current.map(|(first, _)| first) - } - - fn term_cursor<'b>(&'b self) -> CIWTermCursor<'b> { - let field_writer = self.get_field_writer(); - CIWTermCursor { - postings: &field_writer.postings, - term_it: field_writer.term_index.iter(), - current: None - } - } -} - -// TODO use a Term type - -impl<'a> SerializableSegment<'a> for ClosedIndexWriter { - - type TFieldCur = CIWFieldCursor<'a>; - - fn field_cursor(&'a self) -> CIWFieldCursor<'a> { - let mut field_it: hash_map::Iter<'a, Field, FieldWriter> = self.index_writer.term_writers.iter(); - let current: Option<(&'a Field, &'a FieldWriter)> = None; - CIWFieldCursor { - current: current, - field_it: field_it - } - } -} ////////////////////////////////// -// CIWTermCursor +// CIWFormCursor // -pub struct CIWTermCursor<'a> { - postings: &'a Vec, - term_it: btree_map::Iter<'a, String, usize>, - current: Option<(&'a String, &'a usize)> +struct CIWFormCursor<'a> { + term_it: btree_map::Iter<'a, String, usize>, // term -> postings_idx + postings_map: &'a Vec, // postings_idx -> postings } -impl<'a> CIWTermCursor<'a> { - fn get_term_option(&self) -> Option<&'a String> { - self.current - .map(|(first, _)| first) - } +struct FormPostings<'a> { + form: &'a str, + postings: &'a SimplePostingsWriter, } -impl<'a> Iterator for CIWTermCursor<'a> { - type Item=&'a String; +impl<'a> Iterator for CIWFormCursor<'a> { + type Item = FormPostings<'a>; - fn next(&mut self) -> Option<&'a String> { - self.current = self.term_it.next(); - self.get_term_option() - } -} - -impl<'a> TermCursor<'a> for CIWTermCursor<'a> { - type TDocCur = CIWDocCursor<'a>; - - fn doc_cursor(&self) -> CIWDocCursor<'a> { - let (_, &postings_id) = self.current.unwrap(); - unsafe { - let postings_writer = self.postings.get_unchecked(postings_id); - let docs_it = postings_writer.doc_ids.iter(); - CIWDocCursor { - docs_it: Box::new(docs_it), - current: None, + fn next(&mut self,) -> Option> { + self.term_it.next() + .map(|(form, postings_idx)| { + FormPostings { + form: form, + postings: unsafe { self.postings_map.get_unchecked(*postings_idx) } } - } - } - - fn get_term(&self) -> &'a String { - self.get_term_option() - .unwrap() + }) } } @@ -240,10 +151,112 @@ impl<'a> TermCursor<'a> for CIWTermCursor<'a> { // CIWDocCursor // +pub struct CIWTermCursor<'a> { + field_it: hash_map::Iter<'a, Field, FieldWriter>, + form_it: CIWFormCursor<'a>, + current_form_postings: Option>, + field: &'a Field, +} + +impl<'a> CIWTermCursor<'a> { + + + fn next_form(&mut self,) -> bool { + match self.form_it.next() { + Some(form_postings) => { + self.current_form_postings = Some(form_postings); + return true; + }, + None => { false } + } + } + + // Advance to the next field + // sets up form_it to iterate on forms + // returns true iff there was a next field + fn next_field(&mut self,) -> bool { + match self.field_it.next() { + Some((field, field_writer)) => { + self.form_it = CIWFormCursor { + term_it: field_writer.term_index.iter(), + postings_map: &field_writer.postings, + }; + self.field = field; + true + }, + None => false, + } + } +} + +impl<'a> TermCursor<'a> for CIWTermCursor<'a> { + + type DocCur = CIWDocCursor<'a>; + + fn get_term(&self) -> Term<'a> { + Term { + field: self.field.clone(), + text: self.current_form_postings.as_ref().unwrap().form, + } + } + + fn doc_cursor(&self,) -> CIWDocCursor<'a> { + CIWDocCursor { + docs_it: self.current_form_postings + .as_ref() + .unwrap() + .postings + .doc_ids + .iter(), + current: None + } + } + + fn advance(&mut self,) -> bool { + let next_form = self.next_form(); + if next_form { + true + } + else { + if self.next_field() { + self.advance() + } + else { + false + } + } + } +} + +// +// TODO use a Term type +// + +impl<'a> SerializableSegment<'a> for ClosedIndexWriter { + + type TermCur = CIWTermCursor<'a>; + + fn term_cursor(&'a mut self) -> CIWTermCursor<'a> { + let mut field_it: hash_map::Iter<'a, Field, FieldWriter> = self.index_writer.term_writers.iter(); + let (field, field_writer) = field_it.next().unwrap(); // TODO handle no field + let mut term_cursor = CIWTermCursor { + field_it: field_it, + form_it: CIWFormCursor { + term_it: field_writer.term_index.iter(), + postings_map: &field_writer.postings, + }, + field: field, + current_form_postings: None, + }; + // TODO handle having no fields at all + term_cursor + } +} + // TODO add positions pub struct CIWDocCursor<'a> { - docs_it: Box + 'a>, + docs_it: slice::Iter<'a, DocId>, current: Option, } diff --git a/tests/core.rs b/tests/core.rs index 1ae607b73..ad21d0eb7 100644 --- a/tests/core.rs +++ b/tests/core.rs @@ -3,18 +3,17 @@ extern crate itertools; extern crate byteorder; extern crate regex; -use tantivy::core::DocId; use tantivy::core::postings::{VecPostings, intersection}; use tantivy::core::postings::Postings; use tantivy::core::analyzer::tokenize; use tantivy::core::serial::*; +use tantivy::core::schema::*; +use tantivy::core::global::*; use tantivy::core::writer::{IndexWriter, ClosedIndexWriter}; use tantivy::core::directory::{Directory, generate_segment_name, SegmentId}; -use tantivy::core::schema::{Field, Document}; use std::ops::DerefMut; use tantivy::core::writer::SimplePostingsWriter; use tantivy::core::postings::PostingsWriter; -use tantivy::core::global::Flushable; use std::io::{ BufWriter, Write}; use regex::Regex; use std::convert::From; @@ -41,27 +40,29 @@ fn test_indexing() { let mut index_writer = IndexWriter::open(&directory); { let mut doc = Document::new(); - doc.set(Field("text"), "toto titi"); + doc.set(Field(1), "a b"); index_writer.add(doc); } { let mut doc = Document::new(); - doc.set(Field("text"), "titi tata"); + doc.set(Field(1), "a b c"); index_writer.add(doc); } - let closed_index_writer: ClosedIndexWriter = index_writer.close(); - let mut field_cursor = closed_index_writer.field_cursor(); + { + let mut doc = Document::new(); + doc.set(Field(1), "a b c d"); + // TODO make iteration over Fields somehow sorted + index_writer.add(doc); + } + let mut closed_index_writer: ClosedIndexWriter = index_writer.close(); + let mut term_cursor = closed_index_writer.term_cursor(); loop { - match field_cursor.next() { - Some(field) => { - println!(" {:?}", field); - show_term_cursor(field_cursor.term_cursor()); - }, - None => { break; }, + if !term_cursor.advance() { + break; } + show_term(&term_cursor); } assert!(false); - // index_writer.sync().unwrap(); } { // TODO add index opening stuff @@ -70,45 +71,27 @@ fn test_indexing() { } -fn show_term_cursor<'a, T: TermCursor<'a>>(mut term_cursor: T) { - loop { - match term_cursor.next() { - Some(term) => { - println!(" term: {:?}", term); - show_doc_cursor(term_cursor.doc_cursor()); - }, - None => { - break; - } - } +fn show_term<'a, T: TermCursor<'a>>(term_cursor: &T) { + println!("{:?}", term_cursor.get_term()); + let doc_cursor = term_cursor.doc_cursor(); + for doc in doc_cursor { + println!("doc({})", doc); } } -fn show_doc_cursor<'a, D: DocCursor>(mut doc_cursor: D) { - loop { - match doc_cursor.next() { - Some(doc) => { - println!(" {}", doc); - }, - None => { - break; - } - } - } -} +// fn show_doc_cursor<'a, D: DocCursor>(mut doc_cursor: D) { +// loop { +// match doc_cursor.next() { +// Some(doc) => { +// println!(" {}", doc); +// }, +// None => { +// break; +// } +// } +// } +// } -#[test] -fn test_postings_writer() { - let mut postings_writer = SimplePostingsWriter::new(); - postings_writer.suscribe(1); - postings_writer.suscribe(4); - postings_writer.suscribe(5); - postings_writer.suscribe(17); - let mut buffer: Vec = Vec::new(); - assert_eq!(buffer.len(), 0); - postings_writer.flush(&mut buffer); - assert_eq!(buffer.len(), 5 * 8); -} #[test] fn test_new_segment() {