mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
better serial. No extra level for field
This commit is contained in:
@@ -2,12 +2,11 @@ use std::io::{BufWriter, Write};
|
||||
use std::io;
|
||||
|
||||
pub type DocId = usize;
|
||||
pub type FieldId = u32;
|
||||
|
||||
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
|
||||
pub struct Field(pub FieldId);
|
||||
|
||||
// pub trait SeekableIterator<T>: Iterator<T> {
|
||||
// pub fn seek(&mut self, el: &T) -> bool;
|
||||
// }
|
||||
|
||||
|
||||
// Something that can write itself out to any `std::io::Write` sink.
pub trait Flushable {
|
||||
// Writes `self` into `writer`, returning a `usize` on success or the
// `io::Error` on failure.
// NOTE(review): the `usize` is presumably the number of bytes written
// (that is what the `SimplePostingsWriter` implementation returns) --
// confirm that every implementor follows the same convention.
fn flush<W: Write>(&self, writer: &mut W) -> Result<usize, io::Error>;
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use core::directory::Directory;
|
||||
use core::global::DocId;
|
||||
use core::schema::Field;
|
||||
use core::schema::*;
|
||||
|
||||
pub struct SegmentIndexReader {
|
||||
directory: Directory,
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
|
||||
pub struct Field(pub &'static str);
|
||||
|
||||
use core::global::*;
|
||||
|
||||
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
|
||||
pub struct FieldValue {
|
||||
@@ -11,7 +9,7 @@ pub struct FieldValue {
|
||||
|
||||
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
|
||||
pub struct Term<'a> {
|
||||
pub field: &'a Field,
|
||||
pub field: Field,
|
||||
pub text: &'a str,
|
||||
}
|
||||
|
||||
|
||||
@@ -1,24 +1,22 @@
|
||||
use core::global::DocId;
|
||||
use core::schema::Field;
|
||||
use core::global::*;
|
||||
use core::schema::*;
|
||||
|
||||
// Trait sufficient to serialize a segment.
|
||||
pub trait SerializableSegment<'a> {
|
||||
type TFieldCur: FieldCursor<'a>;
|
||||
fn field_cursor(&'a self) -> Self::TFieldCur;
|
||||
type TermCur: TermCursor<'a>; // TODO rename TermCursorImpl
|
||||
fn term_cursor(&'a mut self) -> Self::TermCur;
|
||||
}
|
||||
|
||||
// Cursor over document ids: an `Iterator` yielding `DocId` values that can
// additionally report a `DocId` through `doc()`.
pub trait DocCursor: Iterator<Item=DocId> {
|
||||
// NOTE(review): presumably the doc id the cursor is currently positioned
// on (i.e. the one last yielded by `next()`) -- confirm against the
// implementors, the contract is not spelled out here.
fn doc(&self) -> DocId;
|
||||
}
|
||||
|
||||
// Iterator-based term cursor: iterating yields the terms as `&'a String`,
// and the cursor can hand out a doc cursor for a term.
pub trait TermCursor<'a>: Iterator<Item=&'a String> {
|
||||
// Concrete cursor type returned by `doc_cursor()`.
type TDocCur: DocCursor;
|
||||
// Returns a term as `&'a String`.
// NOTE(review): presumably the term last yielded by `next()`; what happens
// before the first `next()` is implementor-defined -- confirm.
fn get_term(&self) -> &'a String;
|
||||
// Returns a cursor over documents.
// NOTE(review): presumably the documents of the current term -- confirm.
fn doc_cursor(&self) -> Self::TDocCur;
|
||||
}
|
||||
|
||||
pub trait FieldCursor<'a>: Iterator<Item=&'a Field> {
|
||||
type TTermCur: TermCursor<'a>;
|
||||
fn get_field(&self) -> Option<&'a Field>;
|
||||
fn term_cursor(&'a self) -> Self::TTermCur;
|
||||
// TODO make iteration over Fields somehow sorted
|
||||
// (Not only forms)
|
||||
// Cursor over terms, driven explicitly with `advance()` rather than through
// the `Iterator` trait.
pub trait TermCursor<'a> {
|
||||
// Concrete cursor type returned by `doc_cursor()`.
type DocCur: DocCursor;
|
||||
// Moves the cursor forward and reports the outcome as a `bool`.
// NOTE(review): presumably `true` means the cursor now sits on a term and
// `false` means the terms are exhausted -- confirm for every implementor.
fn advance(&mut self,) -> bool;
|
||||
// Returns the term the cursor is positioned on.
// NOTE(review): presumably only meaningful after `advance()` returned
// `true` -- confirm.
fn get_term(&self) -> Term<'a>;
|
||||
// Returns a cursor over documents.
// NOTE(review): presumably the documents of the current term -- confirm.
fn doc_cursor(&self) -> Self::DocCur;
|
||||
}
|
||||
|
||||
@@ -1,19 +1,18 @@
|
||||
|
||||
use std::io;
|
||||
use core::schema::Document;
|
||||
use core::schema::Field;
|
||||
use std::slice;
|
||||
use core::global::*;
|
||||
use core::schema::*;
|
||||
use core::directory::Directory;
|
||||
use core::analyzer::tokenize;
|
||||
use std::collections::{HashMap, BTreeMap};
|
||||
use std::collections::{hash_map, btree_map};
|
||||
use core::DocId;
|
||||
use core::postings::PostingsWriter;
|
||||
use core::global::Flushable;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::mem;
|
||||
use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
|
||||
use std::iter::Peekable;
|
||||
use core::serial::{FieldCursor, TermCursor, DocCursor, SerializableSegment};
|
||||
use core::serial::*;
|
||||
|
||||
pub struct SimplePostingsWriter {
|
||||
doc_ids: Vec<DocId>,
|
||||
@@ -29,7 +28,9 @@ impl SimplePostingsWriter {
|
||||
|
||||
impl PostingsWriter for SimplePostingsWriter {
|
||||
fn suscribe(&mut self, doc_id: DocId) {
|
||||
self.doc_ids.push(doc_id);
|
||||
if self.doc_ids.len() == 0 || self.doc_ids[self.doc_ids.len() - 1] < doc_id {
|
||||
self.doc_ids.push(doc_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -38,20 +39,6 @@ struct FieldWriter {
|
||||
term_index: BTreeMap<String, usize>,
|
||||
}
|
||||
|
||||
impl Flushable for SimplePostingsWriter {
|
||||
fn flush<W: Write>(&self, writer: &mut W) -> Result<usize, io::Error> {
|
||||
let mut num_bytes_written = 0;
|
||||
let num_docs = self.doc_ids.len() as u64;
|
||||
writer.write_u64::<NativeEndian>(num_docs);
|
||||
num_bytes_written += 8;
|
||||
for &doc_id in self.doc_ids.iter() {
|
||||
writer.write_u64::<NativeEndian>(doc_id as u64);
|
||||
num_bytes_written += 8;
|
||||
}
|
||||
Ok(num_bytes_written)
|
||||
}
|
||||
}
|
||||
|
||||
impl FieldWriter {
|
||||
pub fn new() -> FieldWriter {
|
||||
FieldWriter {
|
||||
@@ -132,107 +119,31 @@ pub struct ClosedIndexWriter {
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------
|
||||
// Implementation of SerializableSegment
|
||||
//
|
||||
|
||||
// Cursor over the `(Field, FieldWriter)` entries of a hash map.
pub struct CIWFieldCursor<'a> {
|
||||
// Underlying hash map iterator the cursor pulls its entries from.
field_it: hash_map::Iter<'a, Field, FieldWriter>,
|
||||
// Entry most recently pulled from `field_it`; `None` when nothing has been
// pulled yet or when `field_it` returned `None`.
current: Option<(&'a Field, &'a FieldWriter)>
|
||||
}
|
||||
|
||||
impl<'a> CIWFieldCursor<'a> {
|
||||
fn get_field_writer(&self) -> &'a FieldWriter {
|
||||
self.current.map(|(_, second)| second).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for CIWFieldCursor<'a> {
|
||||
type Item=&'a Field;
|
||||
|
||||
fn next(&mut self) -> Option<&'a Field> {
|
||||
self.current = self.field_it.next();
|
||||
self.get_field()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> FieldCursor<'a> for CIWFieldCursor<'a> {
|
||||
|
||||
type TTermCur = CIWTermCursor<'a>;
|
||||
|
||||
fn get_field(&self) -> Option<&'a Field> {
|
||||
self.current.map(|(first, _)| first)
|
||||
}
|
||||
|
||||
fn term_cursor<'b>(&'b self) -> CIWTermCursor<'b> {
|
||||
let field_writer = self.get_field_writer();
|
||||
CIWTermCursor {
|
||||
postings: &field_writer.postings,
|
||||
term_it: field_writer.term_index.iter(),
|
||||
current: None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO use a Term type
|
||||
|
||||
impl<'a> SerializableSegment<'a> for ClosedIndexWriter {
|
||||
|
||||
type TFieldCur = CIWFieldCursor<'a>;
|
||||
|
||||
fn field_cursor(&'a self) -> CIWFieldCursor<'a> {
|
||||
let mut field_it: hash_map::Iter<'a, Field, FieldWriter> = self.index_writer.term_writers.iter();
|
||||
let current: Option<(&'a Field, &'a FieldWriter)> = None;
|
||||
CIWFieldCursor {
|
||||
current: current,
|
||||
field_it: field_it
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////
|
||||
// CIWTermCursor
|
||||
// CIWFormCursor
|
||||
//
|
||||
pub struct CIWTermCursor<'a> {
|
||||
postings: &'a Vec<SimplePostingsWriter>,
|
||||
term_it: btree_map::Iter<'a, String, usize>,
|
||||
current: Option<(&'a String, &'a usize)>
|
||||
struct CIWFormCursor<'a> {
|
||||
term_it: btree_map::Iter<'a, String, usize>, // term -> postings_idx
|
||||
postings_map: &'a Vec<SimplePostingsWriter>, // postings_idx -> postings
|
||||
}
|
||||
|
||||
impl<'a> CIWTermCursor<'a> {
|
||||
fn get_term_option(&self) -> Option<&'a String> {
|
||||
self.current
|
||||
.map(|(first, _)| first)
|
||||
}
|
||||
struct FormPostings<'a> {
|
||||
form: &'a str,
|
||||
postings: &'a SimplePostingsWriter,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for CIWTermCursor<'a> {
|
||||
type Item=&'a String;
|
||||
impl<'a> Iterator for CIWFormCursor<'a> {
|
||||
type Item = FormPostings<'a>;
|
||||
|
||||
fn next(&mut self) -> Option<&'a String> {
|
||||
self.current = self.term_it.next();
|
||||
self.get_term_option()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TermCursor<'a> for CIWTermCursor<'a> {
|
||||
type TDocCur = CIWDocCursor<'a>;
|
||||
|
||||
fn doc_cursor(&self) -> CIWDocCursor<'a> {
|
||||
let (_, &postings_id) = self.current.unwrap();
|
||||
unsafe {
|
||||
let postings_writer = self.postings.get_unchecked(postings_id);
|
||||
let docs_it = postings_writer.doc_ids.iter();
|
||||
CIWDocCursor {
|
||||
docs_it: Box::new(docs_it),
|
||||
current: None,
|
||||
fn next(&mut self,) -> Option<FormPostings<'a>> {
|
||||
self.term_it.next()
|
||||
.map(|(form, postings_idx)| {
|
||||
FormPostings {
|
||||
form: form,
|
||||
postings: unsafe { self.postings_map.get_unchecked(*postings_idx) }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn get_term(&self) -> &'a String {
|
||||
self.get_term_option()
|
||||
.unwrap()
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -240,10 +151,112 @@ impl<'a> TermCursor<'a> for CIWTermCursor<'a> {
|
||||
// CIWDocCursor
|
||||
//
|
||||
|
||||
// Term cursor that walks every form of every field: it iterates over the
// forms of one field at a time and moves on to the next field when the
// current one runs out of forms.
pub struct CIWTermCursor<'a> {
|
||||
// Fields that have not been visited yet.
field_it: hash_map::Iter<'a, Field, FieldWriter>,
|
||||
// Forms of the field currently being visited.
form_it: CIWFormCursor<'a>,
|
||||
// Form (and its postings) most recently pulled from `form_it`;
// `None` until the first form has been pulled.
current_form_postings: Option<FormPostings<'a>>,
|
||||
// Field currently being visited.
field: &'a Field,
|
||||
}
|
||||
|
||||
impl<'a> CIWTermCursor<'a> {
|
||||
|
||||
|
||||
fn next_form(&mut self,) -> bool {
|
||||
match self.form_it.next() {
|
||||
Some(form_postings) => {
|
||||
self.current_form_postings = Some(form_postings);
|
||||
return true;
|
||||
},
|
||||
None => { false }
|
||||
}
|
||||
}
|
||||
|
||||
// Advance to the next field
|
||||
// sets up form_it to iterate on forms
|
||||
// returns true iff there was a next field
|
||||
fn next_field(&mut self,) -> bool {
|
||||
match self.field_it.next() {
|
||||
Some((field, field_writer)) => {
|
||||
self.form_it = CIWFormCursor {
|
||||
term_it: field_writer.term_index.iter(),
|
||||
postings_map: &field_writer.postings,
|
||||
};
|
||||
self.field = field;
|
||||
true
|
||||
},
|
||||
None => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TermCursor<'a> for CIWTermCursor<'a> {
|
||||
|
||||
type DocCur = CIWDocCursor<'a>;
|
||||
|
||||
fn get_term(&self) -> Term<'a> {
|
||||
Term {
|
||||
field: self.field.clone(),
|
||||
text: self.current_form_postings.as_ref().unwrap().form,
|
||||
}
|
||||
}
|
||||
|
||||
fn doc_cursor(&self,) -> CIWDocCursor<'a> {
|
||||
CIWDocCursor {
|
||||
docs_it: self.current_form_postings
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.postings
|
||||
.doc_ids
|
||||
.iter(),
|
||||
current: None
|
||||
}
|
||||
}
|
||||
|
||||
fn advance(&mut self,) -> bool {
|
||||
let next_form = self.next_form();
|
||||
if next_form {
|
||||
true
|
||||
}
|
||||
else {
|
||||
if self.next_field() {
|
||||
self.advance()
|
||||
}
|
||||
else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// TODO use a Term type
|
||||
//
|
||||
|
||||
impl<'a> SerializableSegment<'a> for ClosedIndexWriter {
|
||||
|
||||
type TermCur = CIWTermCursor<'a>;
|
||||
|
||||
fn term_cursor(&'a mut self) -> CIWTermCursor<'a> {
|
||||
let mut field_it: hash_map::Iter<'a, Field, FieldWriter> = self.index_writer.term_writers.iter();
|
||||
let (field, field_writer) = field_it.next().unwrap(); // TODO handle no field
|
||||
let mut term_cursor = CIWTermCursor {
|
||||
field_it: field_it,
|
||||
form_it: CIWFormCursor {
|
||||
term_it: field_writer.term_index.iter(),
|
||||
postings_map: &field_writer.postings,
|
||||
},
|
||||
field: field,
|
||||
current_form_postings: None,
|
||||
};
|
||||
// TODO handle having no fields at all
|
||||
term_cursor
|
||||
}
|
||||
}
|
||||
|
||||
// TODO add positions
|
||||
|
||||
pub struct CIWDocCursor<'a> {
|
||||
docs_it: Box<Iterator<Item=&'a DocId> + 'a>,
|
||||
docs_it: slice::Iter<'a, DocId>,
|
||||
current: Option<DocId>,
|
||||
}
|
||||
|
||||
|
||||
@@ -3,18 +3,17 @@ extern crate itertools;
|
||||
extern crate byteorder;
|
||||
extern crate regex;
|
||||
|
||||
use tantivy::core::DocId;
|
||||
use tantivy::core::postings::{VecPostings, intersection};
|
||||
use tantivy::core::postings::Postings;
|
||||
use tantivy::core::analyzer::tokenize;
|
||||
use tantivy::core::serial::*;
|
||||
use tantivy::core::schema::*;
|
||||
use tantivy::core::global::*;
|
||||
use tantivy::core::writer::{IndexWriter, ClosedIndexWriter};
|
||||
use tantivy::core::directory::{Directory, generate_segment_name, SegmentId};
|
||||
use tantivy::core::schema::{Field, Document};
|
||||
use std::ops::DerefMut;
|
||||
use tantivy::core::writer::SimplePostingsWriter;
|
||||
use tantivy::core::postings::PostingsWriter;
|
||||
use tantivy::core::global::Flushable;
|
||||
use std::io::{ BufWriter, Write};
|
||||
use regex::Regex;
|
||||
use std::convert::From;
|
||||
@@ -41,27 +40,29 @@ fn test_indexing() {
|
||||
let mut index_writer = IndexWriter::open(&directory);
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.set(Field("text"), "toto titi");
|
||||
doc.set(Field(1), "a b");
|
||||
index_writer.add(doc);
|
||||
}
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.set(Field("text"), "titi tata");
|
||||
doc.set(Field(1), "a b c");
|
||||
index_writer.add(doc);
|
||||
}
|
||||
let closed_index_writer: ClosedIndexWriter = index_writer.close();
|
||||
let mut field_cursor = closed_index_writer.field_cursor();
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.set(Field(1), "a b c d");
|
||||
// TODO make iteration over Fields somehow sorted
|
||||
index_writer.add(doc);
|
||||
}
|
||||
let mut closed_index_writer: ClosedIndexWriter = index_writer.close();
|
||||
let mut term_cursor = closed_index_writer.term_cursor();
|
||||
loop {
|
||||
match field_cursor.next() {
|
||||
Some(field) => {
|
||||
println!(" {:?}", field);
|
||||
show_term_cursor(field_cursor.term_cursor());
|
||||
},
|
||||
None => { break; },
|
||||
if !term_cursor.advance() {
|
||||
break;
|
||||
}
|
||||
show_term(&term_cursor);
|
||||
}
|
||||
assert!(false);
|
||||
// index_writer.sync().unwrap();
|
||||
}
|
||||
{
|
||||
// TODO add index opening stuff
|
||||
@@ -70,45 +71,27 @@ fn test_indexing() {
|
||||
}
|
||||
|
||||
|
||||
fn show_term_cursor<'a, T: TermCursor<'a>>(mut term_cursor: T) {
|
||||
loop {
|
||||
match term_cursor.next() {
|
||||
Some(term) => {
|
||||
println!(" term: {:?}", term);
|
||||
show_doc_cursor(term_cursor.doc_cursor());
|
||||
},
|
||||
None => {
|
||||
break;
|
||||
}
|
||||
}
|
||||
fn show_term<'a, T: TermCursor<'a>>(term_cursor: &T) {
|
||||
println!("{:?}", term_cursor.get_term());
|
||||
let doc_cursor = term_cursor.doc_cursor();
|
||||
for doc in doc_cursor {
|
||||
println!("doc({})", doc);
|
||||
}
|
||||
}
|
||||
|
||||
fn show_doc_cursor<'a, D: DocCursor>(mut doc_cursor: D) {
|
||||
loop {
|
||||
match doc_cursor.next() {
|
||||
Some(doc) => {
|
||||
println!(" {}", doc);
|
||||
},
|
||||
None => {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// fn show_doc_cursor<'a, D: DocCursor>(mut doc_cursor: D) {
|
||||
// loop {
|
||||
// match doc_cursor.next() {
|
||||
// Some(doc) => {
|
||||
// println!(" {}", doc);
|
||||
// },
|
||||
// None => {
|
||||
// break;
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
#[test]
|
||||
fn test_postings_writer() {
|
||||
let mut postings_writer = SimplePostingsWriter::new();
|
||||
postings_writer.suscribe(1);
|
||||
postings_writer.suscribe(4);
|
||||
postings_writer.suscribe(5);
|
||||
postings_writer.suscribe(17);
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
assert_eq!(buffer.len(), 0);
|
||||
postings_writer.flush(&mut buffer);
|
||||
assert_eq!(buffer.len(), 5 * 8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_new_segment() {
|
||||
|
||||
Reference in New Issue
Block a user