better serial. No extra level for field

This commit is contained in:
Paul Masurel
2016-01-16 16:33:38 +09:00
parent 4d12776b99
commit 1ba5bddd7f
6 changed files with 176 additions and 185 deletions

View File

@@ -2,12 +2,11 @@ use std::io::{BufWriter, Write};
use std::io;
pub type DocId = usize;
pub type FieldId = u32;
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
pub struct Field(pub FieldId);
// pub trait SeekableIterator<T>: Iterator<T> {
// pub fn seek(&mut self, el: &T) -> bool;
// }
/// Implemented by types that can write a serialized form of themselves to a
/// byte sink.
pub trait Flushable {
/// Writes `self` into `writer`.
///
/// Returns an `io::Error` on failure. On success the `usize` is presumably
/// the number of bytes written (that is what the `SimplePostingsWriter`
/// implementation returns) -- TODO confirm this is the contract for all
/// implementors.
fn flush<W: Write>(&self, writer: &mut W) -> Result<usize, io::Error>;
}

View File

@@ -1,6 +1,6 @@
use core::directory::Directory;
use core::global::DocId;
use core::schema::Field;
use core::schema::*;
pub struct SegmentIndexReader {
directory: Directory,

View File

@@ -1,6 +1,4 @@
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
pub struct Field(pub &'static str);
use core::global::*;
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
pub struct FieldValue {
@@ -11,7 +9,7 @@ pub struct FieldValue {
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
pub struct Term<'a> {
pub field: &'a Field,
pub field: Field,
pub text: &'a str,
}

View File

@@ -1,24 +1,22 @@
use core::global::DocId;
use core::schema::Field;
use core::global::*;
use core::schema::*;
// Trait sufficient to serialize a segment.
pub trait SerializableSegment<'a> {
type TFieldCur: FieldCursor<'a>;
fn field_cursor(&'a self) -> Self::TFieldCur;
type TermCur: TermCursor<'a>; // TODO rename TermCursorImpl
fn term_cursor(&'a mut self) -> Self::TermCur;
}
/// Cursor over document ids. Every `DocCursor` is also an `Iterator` that
/// yields `DocId`s.
pub trait DocCursor: Iterator<Item=DocId> {
/// Returns a `DocId` without advancing the cursor (it only borrows `self`).
/// Presumably this is the document the cursor is currently positioned on --
/// TODO confirm against the implementors.
fn doc(&self) -> DocId;
}
pub trait TermCursor<'a>: Iterator<Item=&'a String> {
type TDocCur: DocCursor;
fn get_term(&self) -> &'a String;
fn doc_cursor(&self) -> Self::TDocCur;
}
pub trait FieldCursor<'a>: Iterator<Item=&'a Field> {
type TTermCur: TermCursor<'a>;
fn get_field(&self) -> Option<&'a Field>;
fn term_cursor(&'a self) -> Self::TTermCur;
// TODO make iteration over Fields somehow sorted
// (Not only forms)
/// Cursor over the terms of a segment, giving access to the documents of the
/// term it is currently positioned on.
pub trait TermCursor<'a> {
/// Cursor type used to walk the documents of the current term.
type DocCur: DocCursor;
/// Moves the cursor to the next term.
/// Returns `false` when there is no further term to move to.
fn advance(&mut self,) -> bool;
/// Returns the term the cursor is currently positioned on.
/// NOTE(review): callers in this change only call this after `advance`
/// returned `true` -- confirm whether that is a hard precondition.
fn get_term(&self) -> Term<'a>;
/// Returns a cursor over the documents of the current term.
fn doc_cursor(&self) -> Self::DocCur;
}

View File

@@ -1,19 +1,18 @@
use std::io;
use core::schema::Document;
use core::schema::Field;
use std::slice;
use core::global::*;
use core::schema::*;
use core::directory::Directory;
use core::analyzer::tokenize;
use std::collections::{HashMap, BTreeMap};
use std::collections::{hash_map, btree_map};
use core::DocId;
use core::postings::PostingsWriter;
use core::global::Flushable;
use std::io::{BufWriter, Write};
use std::mem;
use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
use std::iter::Peekable;
use core::serial::{FieldCursor, TermCursor, DocCursor, SerializableSegment};
use core::serial::*;
pub struct SimplePostingsWriter {
doc_ids: Vec<DocId>,
@@ -29,7 +28,9 @@ impl SimplePostingsWriter {
impl PostingsWriter for SimplePostingsWriter {
fn suscribe(&mut self, doc_id: DocId) {
self.doc_ids.push(doc_id);
if self.doc_ids.len() == 0 || self.doc_ids[self.doc_ids.len() - 1] < doc_id {
self.doc_ids.push(doc_id);
}
}
}
@@ -38,20 +39,6 @@ struct FieldWriter {
term_index: BTreeMap<String, usize>,
}
impl Flushable for SimplePostingsWriter {
fn flush<W: Write>(&self, writer: &mut W) -> Result<usize, io::Error> {
let mut num_bytes_written = 0;
let num_docs = self.doc_ids.len() as u64;
writer.write_u64::<NativeEndian>(num_docs);
num_bytes_written += 8;
for &doc_id in self.doc_ids.iter() {
writer.write_u64::<NativeEndian>(doc_id as u64);
num_bytes_written += 8;
}
Ok(num_bytes_written)
}
}
impl FieldWriter {
pub fn new() -> FieldWriter {
FieldWriter {
@@ -132,107 +119,31 @@ pub struct ClosedIndexWriter {
//-----------------------------------------
// Implementation of SerializableSegment
//
pub struct CIWFieldCursor<'a> {
field_it: hash_map::Iter<'a, Field, FieldWriter>,
current: Option<(&'a Field, &'a FieldWriter)>
}
impl<'a> CIWFieldCursor<'a> {
fn get_field_writer(&self) -> &'a FieldWriter {
self.current.map(|(_, second)| second).unwrap()
}
}
impl<'a> Iterator for CIWFieldCursor<'a> {
type Item=&'a Field;
fn next(&mut self) -> Option<&'a Field> {
self.current = self.field_it.next();
self.get_field()
}
}
impl<'a> FieldCursor<'a> for CIWFieldCursor<'a> {
type TTermCur = CIWTermCursor<'a>;
fn get_field(&self) -> Option<&'a Field> {
self.current.map(|(first, _)| first)
}
fn term_cursor<'b>(&'b self) -> CIWTermCursor<'b> {
let field_writer = self.get_field_writer();
CIWTermCursor {
postings: &field_writer.postings,
term_it: field_writer.term_index.iter(),
current: None
}
}
}
// TODO use a Term type
impl<'a> SerializableSegment<'a> for ClosedIndexWriter {
type TFieldCur = CIWFieldCursor<'a>;
fn field_cursor(&'a self) -> CIWFieldCursor<'a> {
let mut field_it: hash_map::Iter<'a, Field, FieldWriter> = self.index_writer.term_writers.iter();
let current: Option<(&'a Field, &'a FieldWriter)> = None;
CIWFieldCursor {
current: current,
field_it: field_it
}
}
}
//////////////////////////////////
// CIWTermCursor
// CIWFormCursor
//
pub struct CIWTermCursor<'a> {
postings: &'a Vec<SimplePostingsWriter>,
term_it: btree_map::Iter<'a, String, usize>,
current: Option<(&'a String, &'a usize)>
/// Walks the forms (term texts) stored in one term index, resolving each
/// form's postings index against `postings_map`.
struct CIWFormCursor<'a> {
term_it: btree_map::Iter<'a, String, usize>, // term -> postings_idx
postings_map: &'a Vec<SimplePostingsWriter>, // postings_idx -> postings
}
impl<'a> CIWTermCursor<'a> {
fn get_term_option(&self) -> Option<&'a String> {
self.current
.map(|(first, _)| first)
}
/// A form (term text) paired with the postings writer that holds its doc ids.
struct FormPostings<'a> {
// Text of the form.
form: &'a str,
// Postings associated with this form.
postings: &'a SimplePostingsWriter,
}
impl<'a> Iterator for CIWTermCursor<'a> {
type Item=&'a String;
impl<'a> Iterator for CIWFormCursor<'a> {
type Item = FormPostings<'a>;
fn next(&mut self) -> Option<&'a String> {
self.current = self.term_it.next();
self.get_term_option()
}
}
impl<'a> TermCursor<'a> for CIWTermCursor<'a> {
type TDocCur = CIWDocCursor<'a>;
fn doc_cursor(&self) -> CIWDocCursor<'a> {
let (_, &postings_id) = self.current.unwrap();
unsafe {
let postings_writer = self.postings.get_unchecked(postings_id);
let docs_it = postings_writer.doc_ids.iter();
CIWDocCursor {
docs_it: Box::new(docs_it),
current: None,
fn next(&mut self,) -> Option<FormPostings<'a>> {
self.term_it.next()
.map(|(form, postings_idx)| {
FormPostings {
form: form,
postings: unsafe { self.postings_map.get_unchecked(*postings_idx) }
}
}
}
fn get_term(&self) -> &'a String {
self.get_term_option()
.unwrap()
})
}
}
@@ -240,10 +151,112 @@ impl<'a> TermCursor<'a> for CIWTermCursor<'a> {
// CIWDocCursor
//
/// Term cursor that walks every field and, within each field, every form of
/// that field, exposing them as one flat sequence of terms.
pub struct CIWTermCursor<'a> {
// Fields that have not been visited yet, each with its `FieldWriter`.
field_it: hash_map::Iter<'a, Field, FieldWriter>,
// Forms of the field currently being visited.
form_it: CIWFormCursor<'a>,
// Form the cursor is positioned on; `None` until a form has been reached.
current_form_postings: Option<FormPostings<'a>>,
// Field currently being visited.
field: &'a Field,
}
impl<'a> CIWTermCursor<'a> {
    /// Steps to the next form of the current field.
    ///
    /// Returns `true` and records the form in `current_form_postings` when
    /// there is one. Returns `false` otherwise, leaving
    /// `current_form_postings` untouched.
    fn next_form(&mut self) -> bool {
        if let Some(form_postings) = self.form_it.next() {
            self.current_form_postings = Some(form_postings);
            true
        } else {
            false
        }
    }

    /// Steps to the next field and points `form_it` at that field's forms.
    ///
    /// Returns `true` iff there was a next field.
    fn next_field(&mut self) -> bool {
        let (field, field_writer) = match self.field_it.next() {
            Some(entry) => entry,
            None => return false,
        };
        self.form_it = CIWFormCursor {
            term_it: field_writer.term_index.iter(),
            postings_map: &field_writer.postings,
        };
        self.field = field;
        true
    }
}
impl<'a> TermCursor<'a> for CIWTermCursor<'a> {
    type DocCur = CIWDocCursor<'a>;

    /// Builds the `Term` (field + form text) the cursor is positioned on.
    ///
    /// Panics if no form has been reached yet.
    fn get_term(&self) -> Term<'a> {
        let field = self.field.clone();
        let form_postings = self.current_form_postings.as_ref().unwrap();
        Term {
            field: field,
            text: form_postings.form,
        }
    }

    /// Returns a cursor over the doc ids recorded for the current form.
    ///
    /// Panics if no form has been reached yet.
    fn doc_cursor(&self) -> CIWDocCursor<'a> {
        let form_postings = self.current_form_postings.as_ref().unwrap();
        CIWDocCursor {
            docs_it: form_postings.postings.doc_ids.iter(),
            current: None,
        }
    }

    /// Moves to the next term, crossing over to the following field(s) when
    /// the current field has no more forms.
    ///
    /// Returns `false` once every field has been exhausted.
    fn advance(&mut self) -> bool {
        loop {
            if self.next_form() {
                return true;
            }
            if !self.next_field() {
                return false;
            }
            // A new field was selected: try again to fetch a form from it.
        }
    }
}
//
// TODO use a Term type
//
impl<'a> SerializableSegment<'a> for ClosedIndexWriter {
    type TermCur = CIWTermCursor<'a>;

    /// Creates a term cursor over the writer's fields.
    ///
    /// The cursor starts on the first field with no form selected yet.
    ///
    /// Panics when the writer holds no field at all.
    fn term_cursor(&'a mut self) -> CIWTermCursor<'a> {
        let mut remaining_fields: hash_map::Iter<'a, Field, FieldWriter> =
            self.index_writer.term_writers.iter();
        // TODO handle having no fields at all (this `unwrap` panics in that case)
        let (first_field, first_writer) = remaining_fields.next().unwrap();
        let first_forms = CIWFormCursor {
            term_it: first_writer.term_index.iter(),
            postings_map: &first_writer.postings,
        };
        CIWTermCursor {
            field_it: remaining_fields,
            form_it: first_forms,
            field: first_field,
            current_form_postings: None,
        }
    }
}
// TODO add positions
pub struct CIWDocCursor<'a> {
docs_it: Box<Iterator<Item=&'a DocId> + 'a>,
docs_it: slice::Iter<'a, DocId>,
current: Option<DocId>,
}

View File

@@ -3,18 +3,17 @@ extern crate itertools;
extern crate byteorder;
extern crate regex;
use tantivy::core::DocId;
use tantivy::core::postings::{VecPostings, intersection};
use tantivy::core::postings::Postings;
use tantivy::core::analyzer::tokenize;
use tantivy::core::serial::*;
use tantivy::core::schema::*;
use tantivy::core::global::*;
use tantivy::core::writer::{IndexWriter, ClosedIndexWriter};
use tantivy::core::directory::{Directory, generate_segment_name, SegmentId};
use tantivy::core::schema::{Field, Document};
use std::ops::DerefMut;
use tantivy::core::writer::SimplePostingsWriter;
use tantivy::core::postings::PostingsWriter;
use tantivy::core::global::Flushable;
use std::io::{ BufWriter, Write};
use regex::Regex;
use std::convert::From;
@@ -41,27 +40,29 @@ fn test_indexing() {
let mut index_writer = IndexWriter::open(&directory);
{
let mut doc = Document::new();
doc.set(Field("text"), "toto titi");
doc.set(Field(1), "a b");
index_writer.add(doc);
}
{
let mut doc = Document::new();
doc.set(Field("text"), "titi tata");
doc.set(Field(1), "a b c");
index_writer.add(doc);
}
let closed_index_writer: ClosedIndexWriter = index_writer.close();
let mut field_cursor = closed_index_writer.field_cursor();
{
let mut doc = Document::new();
doc.set(Field(1), "a b c d");
// TODO make iteration over Fields somehow sorted
index_writer.add(doc);
}
let mut closed_index_writer: ClosedIndexWriter = index_writer.close();
let mut term_cursor = closed_index_writer.term_cursor();
loop {
match field_cursor.next() {
Some(field) => {
println!(" {:?}", field);
show_term_cursor(field_cursor.term_cursor());
},
None => { break; },
if !term_cursor.advance() {
break;
}
show_term(&term_cursor);
}
assert!(false);
// index_writer.sync().unwrap();
}
{
// TODO add index opening stuff
@@ -70,45 +71,27 @@ fn test_indexing() {
}
fn show_term_cursor<'a, T: TermCursor<'a>>(mut term_cursor: T) {
loop {
match term_cursor.next() {
Some(term) => {
println!(" term: {:?}", term);
show_doc_cursor(term_cursor.doc_cursor());
},
None => {
break;
}
}
/// Prints the term the cursor is positioned on (with `{:?}`), followed by one
/// `doc(<id>)` line for each document yielded by the term's doc cursor.
fn show_term<'a, T: TermCursor<'a>>(term_cursor: &T) {
    let term = term_cursor.get_term();
    println!("{:?}", term);
    for doc in term_cursor.doc_cursor() {
        println!("doc({})", doc);
    }
}
fn show_doc_cursor<'a, D: DocCursor>(mut doc_cursor: D) {
loop {
match doc_cursor.next() {
Some(doc) => {
println!(" {}", doc);
},
None => {
break;
}
}
}
}
// fn show_doc_cursor<'a, D: DocCursor>(mut doc_cursor: D) {
// loop {
// match doc_cursor.next() {
// Some(doc) => {
// println!(" {}", doc);
// },
// None => {
// break;
// }
// }
// }
// }
/// Flushing four subscribed doc ids must produce five `u64`s (the doc count
/// followed by the ids), and `flush` must report that same byte count.
#[test]
fn test_postings_writer() {
    let mut postings_writer = SimplePostingsWriter::new();
    postings_writer.suscribe(1);
    postings_writer.suscribe(4);
    postings_writer.suscribe(5);
    postings_writer.suscribe(17);
    let mut buffer: Vec<u8> = Vec::new();
    assert_eq!(buffer.len(), 0);
    // Do not drop the `Result`: a failed flush must fail the test.
    let num_bytes_written = postings_writer.flush(&mut buffer).unwrap();
    assert_eq!(num_bytes_written, 5 * 8);
    assert_eq!(buffer.len(), 5 * 8);
}
#[test]
fn test_new_segment() {