better serial. No extra level for field

This commit is contained in:
Paul Masurel
2016-01-16 16:33:38 +09:00
parent 4d12776b99
commit 1ba5bddd7f
6 changed files with 176 additions and 185 deletions

View File

@@ -2,12 +2,11 @@ use std::io::{BufWriter, Write};
use std::io; use std::io;
pub type DocId = usize; pub type DocId = usize;
pub type FieldId = u32;
/// Identifies a field: a newtype wrapping the numeric `FieldId`.
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
pub struct Field(pub FieldId);
// pub trait SeekableIterator<T>: Iterator<T> { // pub trait SeekableIterator<T>: Iterator<T> {
// pub fn seek(&mut self, el: &T) -> bool; // pub fn seek(&mut self, el: &T) -> bool;
// } // }
/// Implemented by types that can write themselves into a `Write` sink.
pub trait Flushable {
/// Writes `self` to `writer`.
///
/// On success the returned `usize` is presumably the number of bytes
/// written (that is what the `SimplePostingsWriter` implementation reports);
/// on failure the underlying `io::Error` is returned.
fn flush<W: Write>(&self, writer: &mut W) -> Result<usize, io::Error>;
}

View File

@@ -1,6 +1,6 @@
use core::directory::Directory; use core::directory::Directory;
use core::global::DocId; use core::global::DocId;
use core::schema::Field; use core::schema::*;
pub struct SegmentIndexReader { pub struct SegmentIndexReader {
directory: Directory, directory: Directory,

View File

@@ -1,6 +1,4 @@
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)] use core::global::*;
pub struct Field(pub &'static str);
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)] #[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
pub struct FieldValue { pub struct FieldValue {
@@ -11,7 +9,7 @@ pub struct FieldValue {
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)] #[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
pub struct Term<'a> { pub struct Term<'a> {
pub field: &'a Field, pub field: Field,
pub text: &'a str, pub text: &'a str,
} }

View File

@@ -1,24 +1,22 @@
use core::global::DocId; use core::global::*;
use core::schema::Field; use core::schema::*;
// Trait sufficient to serialize a segment. // Trait sufficient to serialize a segment.
pub trait SerializableSegment<'a> { pub trait SerializableSegment<'a> {
type TFieldCur: FieldCursor<'a>; type TermCur: TermCursor<'a>; // TODO rename TermCursorImpl
fn field_cursor(&'a self) -> Self::TFieldCur; fn term_cursor(&'a mut self) -> Self::TermCur;
} }
pub trait DocCursor: Iterator<Item=DocId> { pub trait DocCursor: Iterator<Item=DocId> {
fn doc(&self) -> DocId; fn doc(&self) -> DocId;
} }
pub trait TermCursor<'a>: Iterator<Item=&'a String> {
type TDocCur: DocCursor;
fn get_term(&self) -> &'a String;
fn doc_cursor(&self) -> Self::TDocCur;
}
pub trait FieldCursor<'a>: Iterator<Item=&'a Field> { // TODO make iteration over Fields somehow sorted
type TTermCur: TermCursor<'a>; // (Not only forms)
fn get_field(&self) -> Option<&'a Field>; pub trait TermCursor<'a> {
fn term_cursor(&'a self) -> Self::TTermCur; type DocCur: DocCursor;
fn advance(&mut self,) -> bool;
fn get_term(&self) -> Term<'a>;
fn doc_cursor(&self) -> Self::DocCur;
} }

View File

@@ -1,19 +1,18 @@
use std::io; use std::io;
use core::schema::Document; use std::slice;
use core::schema::Field; use core::global::*;
use core::schema::*;
use core::directory::Directory; use core::directory::Directory;
use core::analyzer::tokenize; use core::analyzer::tokenize;
use std::collections::{HashMap, BTreeMap}; use std::collections::{HashMap, BTreeMap};
use std::collections::{hash_map, btree_map}; use std::collections::{hash_map, btree_map};
use core::DocId;
use core::postings::PostingsWriter; use core::postings::PostingsWriter;
use core::global::Flushable;
use std::io::{BufWriter, Write}; use std::io::{BufWriter, Write};
use std::mem; use std::mem;
use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
use std::iter::Peekable; use std::iter::Peekable;
use core::serial::{FieldCursor, TermCursor, DocCursor, SerializableSegment}; use core::serial::*;
pub struct SimplePostingsWriter { pub struct SimplePostingsWriter {
doc_ids: Vec<DocId>, doc_ids: Vec<DocId>,
@@ -29,7 +28,9 @@ impl SimplePostingsWriter {
impl PostingsWriter for SimplePostingsWriter { impl PostingsWriter for SimplePostingsWriter {
fn suscribe(&mut self, doc_id: DocId) { fn suscribe(&mut self, doc_id: DocId) {
self.doc_ids.push(doc_id); if self.doc_ids.len() == 0 || self.doc_ids[self.doc_ids.len() - 1] < doc_id {
self.doc_ids.push(doc_id);
}
} }
} }
@@ -38,20 +39,6 @@ struct FieldWriter {
term_index: BTreeMap<String, usize>, term_index: BTreeMap<String, usize>,
} }
impl Flushable for SimplePostingsWriter {
    /// Writes the posting list to `writer` as a sequence of native-endian
    /// `u64` values: first the number of doc ids, then every doc id in the
    /// order in which they are stored.
    ///
    /// Returns the number of bytes written, i.e. `8 * (1 + number of doc ids)`.
    ///
    /// # Errors
    ///
    /// Returns the first error reported while writing. Previously the result
    /// of each write was discarded, so a failed write was still reported as
    /// `Ok(..)` with a byte count that had not actually been written.
    fn flush<W: Write>(&self, writer: &mut W) -> Result<usize, io::Error> {
        // Every value is serialized as a u64, i.e. 8 bytes.
        const BYTES_PER_VALUE: usize = 8;

        let num_docs = self.doc_ids.len() as u64;
        writer.write_u64::<NativeEndian>(num_docs)?;
        let mut num_bytes_written = BYTES_PER_VALUE;

        for &doc_id in self.doc_ids.iter() {
            writer.write_u64::<NativeEndian>(doc_id as u64)?;
            num_bytes_written += BYTES_PER_VALUE;
        }
        Ok(num_bytes_written)
    }
}
impl FieldWriter { impl FieldWriter {
pub fn new() -> FieldWriter { pub fn new() -> FieldWriter {
FieldWriter { FieldWriter {
@@ -132,107 +119,31 @@ pub struct ClosedIndexWriter {
//-----------------------------------------
// Implementation of SerializableSegment
//
/// Cursor over the `(Field, FieldWriter)` entries of a closed index writer.
///
/// It wraps a `HashMap` iterator and remembers the entry most recently
/// returned by that iterator.
pub struct CIWFieldCursor<'a> {
// Underlying iterator over the field -> field writer map.
field_it: hash_map::Iter<'a, Field, FieldWriter>,
// Entry last yielded by `field_it`; `None` before the first call to `next`
// and again once `field_it` is exhausted.
current: Option<(&'a Field, &'a FieldWriter)>
}
impl<'a> CIWFieldCursor<'a> {
    /// Returns the `FieldWriter` of the entry the cursor is currently on.
    ///
    /// # Panics
    ///
    /// Panics when the cursor has no current entry.
    fn get_field_writer(&self) -> &'a FieldWriter {
        let (_, field_writer) = self.current.unwrap();
        field_writer
    }
}
impl<'a> Iterator for CIWFieldCursor<'a> {
    type Item = &'a Field;

    /// Moves the cursor to the next entry of the underlying map iterator and
    /// yields that entry's field, or `None` once the iterator is exhausted.
    fn next(&mut self) -> Option<&'a Field> {
        let entry = self.field_it.next();
        // Remember the whole entry so the field writer stays reachable.
        self.current = entry;
        entry.map(|(field, _)| field)
    }
}
/// `FieldCursor` implementation: exposes the current field and a term cursor
/// over that field's terms.
impl<'a> FieldCursor<'a> for CIWFieldCursor<'a> {
type TTermCur = CIWTermCursor<'a>;
/// Returns the field of the current entry, or `None` when the cursor has no
/// current entry.
fn get_field(&self) -> Option<&'a Field> {
self.current.map(|(first, _)| first)
}
/// Builds a term cursor over the current field's term index and postings.
///
/// Panics (through `get_field_writer`) when the cursor has no current entry.
// NOTE(review): this signature introduces its own lifetime `'b`, whereas the
// trait declares `fn term_cursor(&'a self) -> Self::TTermCur` — confirm that
// the two are meant to agree.
fn term_cursor<'b>(&'b self) -> CIWTermCursor<'b> {
let field_writer = self.get_field_writer();
CIWTermCursor {
postings: &field_writer.postings,
term_it: field_writer.term_index.iter(),
// No term selected yet; the cursor must be advanced first.
current: None
}
}
}
// TODO use a Term type
impl<'a> SerializableSegment<'a> for ClosedIndexWriter {
type TFieldCur = CIWFieldCursor<'a>;
fn field_cursor(&'a self) -> CIWFieldCursor<'a> {
let mut field_it: hash_map::Iter<'a, Field, FieldWriter> = self.index_writer.term_writers.iter();
let current: Option<(&'a Field, &'a FieldWriter)> = None;
CIWFieldCursor {
current: current,
field_it: field_it
}
}
}
////////////////////////////////// //////////////////////////////////
// CIWTermCursor // CIWFormCursor
// //
pub struct CIWTermCursor<'a> { struct CIWFormCursor<'a> {
postings: &'a Vec<SimplePostingsWriter>, term_it: btree_map::Iter<'a, String, usize>, // term -> postings_idx
term_it: btree_map::Iter<'a, String, usize>, postings_map: &'a Vec<SimplePostingsWriter>, // postings_idx -> postings
current: Option<(&'a String, &'a usize)>
} }
impl<'a> CIWTermCursor<'a> { struct FormPostings<'a> {
fn get_term_option(&self) -> Option<&'a String> { form: &'a str,
self.current postings: &'a SimplePostingsWriter,
.map(|(first, _)| first)
}
} }
impl<'a> Iterator for CIWTermCursor<'a> { impl<'a> Iterator for CIWFormCursor<'a> {
type Item=&'a String; type Item = FormPostings<'a>;
fn next(&mut self) -> Option<&'a String> { fn next(&mut self,) -> Option<FormPostings<'a>> {
self.current = self.term_it.next(); self.term_it.next()
self.get_term_option() .map(|(form, postings_idx)| {
} FormPostings {
} form: form,
postings: unsafe { self.postings_map.get_unchecked(*postings_idx) }
impl<'a> TermCursor<'a> for CIWTermCursor<'a> {
type TDocCur = CIWDocCursor<'a>;
fn doc_cursor(&self) -> CIWDocCursor<'a> {
let (_, &postings_id) = self.current.unwrap();
unsafe {
let postings_writer = self.postings.get_unchecked(postings_id);
let docs_it = postings_writer.doc_ids.iter();
CIWDocCursor {
docs_it: Box::new(docs_it),
current: None,
} }
} })
}
fn get_term(&self) -> &'a String {
self.get_term_option()
.unwrap()
} }
} }
@@ -240,10 +151,112 @@ impl<'a> TermCursor<'a> for CIWTermCursor<'a> {
// CIWDocCursor // CIWDocCursor
// //
/// Term cursor over a closed index writer.
///
/// It walks the forms of one field at a time and moves on to the next field
/// once the forms of the current one are exhausted.
pub struct CIWTermCursor<'a> {
// Fields that have not been visited yet, each paired with its `FieldWriter`.
field_it: hash_map::Iter<'a, Field, FieldWriter>,
// Forms of the field currently being visited.
form_it: CIWFormCursor<'a>,
// Form/postings pair the cursor is currently on; `None` until a form has
// been reached.
current_form_postings: Option<FormPostings<'a>>,
// Field currently being visited.
field: &'a Field,
}
impl<'a> CIWTermCursor<'a> {
    /// Moves to the next form of the current field.
    ///
    /// Returns `true` and records the form when there is one. Returns `false`
    /// otherwise, in which case the previously recorded form is left as is.
    fn next_form(&mut self) -> bool {
        if let Some(form_postings) = self.form_it.next() {
            self.current_form_postings = Some(form_postings);
            true
        } else {
            false
        }
    }

    /// Advances to the next field and sets up `form_it` to iterate over the
    /// forms of that field.
    ///
    /// Returns `true` iff there was a next field.
    fn next_field(&mut self) -> bool {
        if let Some((next_field, next_writer)) = self.field_it.next() {
            self.form_it = CIWFormCursor {
                term_it: next_writer.term_index.iter(),
                postings_map: &next_writer.postings,
            };
            self.field = next_field;
            true
        } else {
            false
        }
    }
}
impl<'a> TermCursor<'a> for CIWTermCursor<'a> {
    type DocCur = CIWDocCursor<'a>;

    /// Returns the term (current field plus current form) the cursor is on.
    ///
    /// # Panics
    ///
    /// Panics when no form has been reached yet.
    fn get_term(&self) -> Term<'a> {
        let field = self.field.clone();
        let text = self.current_form_postings.as_ref().unwrap().form;
        Term {
            field: field,
            text: text,
        }
    }

    /// Returns a cursor over the doc ids recorded for the current form.
    ///
    /// # Panics
    ///
    /// Panics when no form has been reached yet.
    fn doc_cursor(&self) -> CIWDocCursor<'a> {
        let form_postings = self.current_form_postings.as_ref().unwrap();
        CIWDocCursor {
            docs_it: form_postings.postings.doc_ids.iter(),
            current: None,
        }
    }

    /// Moves to the next term, crossing over to the following fields when the
    /// current field has no form left.
    ///
    /// Returns `false` once every field has been exhausted.
    fn advance(&mut self) -> bool {
        loop {
            if self.next_form() {
                return true;
            }
            // The current field is exhausted: try the next one, if any.
            if !self.next_field() {
                return false;
            }
        }
    }
}
//
// TODO use a Term type
//
impl<'a> SerializableSegment<'a> for ClosedIndexWriter {
    type TermCur = CIWTermCursor<'a>;

    /// Creates a term cursor positioned on the first field, before its first
    /// form; call `advance` to reach the first term.
    ///
    /// # Panics
    ///
    /// Panics when the index writer holds no field at all.
    fn term_cursor(&'a mut self) -> CIWTermCursor<'a> {
        let mut remaining_fields = self.index_writer.term_writers.iter();
        // TODO handle having no fields at all
        let (first_field, first_writer) = remaining_fields.next().unwrap();
        CIWTermCursor {
            field_it: remaining_fields,
            form_it: CIWFormCursor {
                term_it: first_writer.term_index.iter(),
                postings_map: &first_writer.postings,
            },
            current_form_postings: None,
            field: first_field,
        }
    }
}
// TODO add positions // TODO add positions
pub struct CIWDocCursor<'a> { pub struct CIWDocCursor<'a> {
docs_it: Box<Iterator<Item=&'a DocId> + 'a>, docs_it: slice::Iter<'a, DocId>,
current: Option<DocId>, current: Option<DocId>,
} }

View File

@@ -3,18 +3,17 @@ extern crate itertools;
extern crate byteorder; extern crate byteorder;
extern crate regex; extern crate regex;
use tantivy::core::DocId;
use tantivy::core::postings::{VecPostings, intersection}; use tantivy::core::postings::{VecPostings, intersection};
use tantivy::core::postings::Postings; use tantivy::core::postings::Postings;
use tantivy::core::analyzer::tokenize; use tantivy::core::analyzer::tokenize;
use tantivy::core::serial::*; use tantivy::core::serial::*;
use tantivy::core::schema::*;
use tantivy::core::global::*;
use tantivy::core::writer::{IndexWriter, ClosedIndexWriter}; use tantivy::core::writer::{IndexWriter, ClosedIndexWriter};
use tantivy::core::directory::{Directory, generate_segment_name, SegmentId}; use tantivy::core::directory::{Directory, generate_segment_name, SegmentId};
use tantivy::core::schema::{Field, Document};
use std::ops::DerefMut; use std::ops::DerefMut;
use tantivy::core::writer::SimplePostingsWriter; use tantivy::core::writer::SimplePostingsWriter;
use tantivy::core::postings::PostingsWriter; use tantivy::core::postings::PostingsWriter;
use tantivy::core::global::Flushable;
use std::io::{ BufWriter, Write}; use std::io::{ BufWriter, Write};
use regex::Regex; use regex::Regex;
use std::convert::From; use std::convert::From;
@@ -41,27 +40,29 @@ fn test_indexing() {
let mut index_writer = IndexWriter::open(&directory); let mut index_writer = IndexWriter::open(&directory);
{ {
let mut doc = Document::new(); let mut doc = Document::new();
doc.set(Field("text"), "toto titi"); doc.set(Field(1), "a b");
index_writer.add(doc); index_writer.add(doc);
} }
{ {
let mut doc = Document::new(); let mut doc = Document::new();
doc.set(Field("text"), "titi tata"); doc.set(Field(1), "a b c");
index_writer.add(doc); index_writer.add(doc);
} }
let closed_index_writer: ClosedIndexWriter = index_writer.close(); {
let mut field_cursor = closed_index_writer.field_cursor(); let mut doc = Document::new();
doc.set(Field(1), "a b c d");
// TODO make iteration over Fields somehow sorted
index_writer.add(doc);
}
let mut closed_index_writer: ClosedIndexWriter = index_writer.close();
let mut term_cursor = closed_index_writer.term_cursor();
loop { loop {
match field_cursor.next() { if !term_cursor.advance() {
Some(field) => { break;
println!(" {:?}", field);
show_term_cursor(field_cursor.term_cursor());
},
None => { break; },
} }
show_term(&term_cursor);
} }
assert!(false); assert!(false);
// index_writer.sync().unwrap();
} }
{ {
// TODO add index opening stuff // TODO add index opening stuff
@@ -70,45 +71,27 @@ fn test_indexing() {
} }
fn show_term_cursor<'a, T: TermCursor<'a>>(mut term_cursor: T) { fn show_term<'a, T: TermCursor<'a>>(term_cursor: &T) {
loop { println!("{:?}", term_cursor.get_term());
match term_cursor.next() { let doc_cursor = term_cursor.doc_cursor();
Some(term) => { for doc in doc_cursor {
println!(" term: {:?}", term); println!("doc({})", doc);
show_doc_cursor(term_cursor.doc_cursor());
},
None => {
break;
}
}
} }
} }
fn show_doc_cursor<'a, D: DocCursor>(mut doc_cursor: D) { // fn show_doc_cursor<'a, D: DocCursor>(mut doc_cursor: D) {
loop { // loop {
match doc_cursor.next() { // match doc_cursor.next() {
Some(doc) => { // Some(doc) => {
println!(" {}", doc); // println!(" {}", doc);
}, // },
None => { // None => {
break; // break;
} // }
} // }
} // }
} // }
/// Checks that flushing a posting list of four doc ids emits five `u64`
/// values (the doc count followed by the doc ids) and reports that size.
#[test]
fn test_postings_writer() {
    let mut postings_writer = SimplePostingsWriter::new();
    postings_writer.suscribe(1);
    postings_writer.suscribe(4);
    postings_writer.suscribe(5);
    postings_writer.suscribe(17);
    let mut buffer: Vec<u8> = Vec::new();
    assert_eq!(buffer.len(), 0);
    // Fail loudly on a write error instead of silently dropping the `Result`.
    let num_bytes_written = postings_writer.flush(&mut buffer).unwrap();
    assert_eq!(num_bytes_written, 5 * 8);
    assert_eq!(buffer.len(), 5 * 8);
}
#[test] #[test]
fn test_new_segment() { fn test_new_segment() {