better serial. No extra level for field

2025-12-27 04:29:58 +00:00 · 2016-01-16 16:33:38 +09:00
parent 4d12776b99
commit 1ba5bddd7f
6 changed files with 176 additions and 185 deletions
--- a/src/core/global.rs
+++ b/src/core/global.rs
@@ -2,12 +2,11 @@ use std::io::{BufWriter, Write};
 use std::io;
 /// Identifier of a document; an alias for `usize`.
 pub type DocId = usize;
 /// Numeric identifier of a field; an alias for `u32`.
 pub type FieldId = u32;
 /// Newtype wrapping a `FieldId`; cloneable, comparable, orderable and hashable.
 #[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
 pub struct Field(pub FieldId);
 // pub trait SeekableIterator<T>: Iterator<T> {
 //     pub fn seek(&mut self, el: &T) -> bool;
 // }
 /// Implemented by types that can write themselves into any `Write` sink.
 pub trait Flushable {
    /// Writes `self` into `writer`.
    ///
    /// Returns a `usize` on success, or the `io::Error` on failure.
    // NOTE(review): the `usize` is presumably the number of bytes written — confirm against implementors.
    fn flush<W: Write>(&self, writer: &mut W) -> Result<usize, io::Error>;
 }
--- a/src/core/reader.rs
+++ b/src/core/reader.rs
@@ -1,6 +1,6 @@
 use core::directory::Directory;
 use core::global::DocId;
-use core::schema::Field;
+use core::schema::*;
 pub struct SegmentIndexReader {
    directory: Directory,
--- a/src/core/schema.rs
+++ b/src/core/schema.rs
@@ -1,6 +1,4 @@
-#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
+use core::global::*;
 pub struct Field(pub &'static str);
 #[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
 pub struct FieldValue {
@@ -11,7 +9,7 @@ pub struct FieldValue {
 #[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
 pub struct Term<'a> {
-    pub field: &'a Field,
+    pub field: Field,
 	pub text: &'a str,
 }
--- a/src/core/serial.rs
+++ b/src/core/serial.rs
@@ -1,24 +1,22 @@
-use core::global::DocId;
+use core::global::*;
-use core::schema::Field;
+use core::schema::*;
 // Trait sufficient to serialize a segment.
 pub trait SerializableSegment<'a> {
-    type TFieldCur: FieldCursor<'a>;
+    type TermCur: TermCursor<'a>; // TODO rename TermCursorImpl
-    fn field_cursor(&'a self) -> Self::TFieldCur;
+    fn term_cursor(&'a mut self) -> Self::TermCur;
 }
 /// Cursor over document ids: an `Iterator` yielding `DocId` values that can
 /// also report a `DocId` through `doc` without being advanced.
 pub trait DocCursor: Iterator<Item=DocId> {
    /// Returns a `DocId` using only shared access to the cursor.
    // NOTE(review): presumably the document the cursor is currently positioned on — confirm with implementors.
    fn doc(&self) -> DocId;
 }
 /// Cursor over terms, expressed as an `Iterator` yielding `&'a String`,
 /// that can also hand out a document cursor.
 pub trait TermCursor<'a>: Iterator<Item=&'a String> {
    /// Concrete document-cursor type produced by `doc_cursor`.
    type TDocCur: DocCursor;
    /// Returns a term as a borrowed `String`.
    // NOTE(review): presumably the term the cursor is currently positioned on — confirm with implementors.
    fn get_term(&self) -> &'a String;
    /// Creates a document cursor of type `TDocCur`.
    fn doc_cursor(&self) -> Self::TDocCur;
 }
-pub trait FieldCursor<'a>: Iterator<Item=&'a Field> {
+// TODO make iteration over Fields somehow sorted
-    type TTermCur: TermCursor<'a>;
+// (Not only forms)
-    fn get_field(&self) -> Option<&'a Field>;
+pub trait TermCursor<'a> {
-    fn term_cursor(&'a self) -> Self::TTermCur;
+    type DocCur: DocCursor;
    fn advance(&mut self,) -> bool;
    fn get_term(&self) -> Term<'a>;
    fn doc_cursor(&self) -> Self::DocCur;
 }
--- a/src/core/writer.rs
+++ b/src/core/writer.rs
@@ -1,19 +1,18 @@
 use std::io;
-use core::schema::Document;
+use std::slice;
-use core::schema::Field;
+use core::global::*;
 use core::schema::*;
 use core::directory::Directory;
 use core::analyzer::tokenize;
 use std::collections::{HashMap, BTreeMap};
 use std::collections::{hash_map, btree_map};
 use core::DocId;
 use core::postings::PostingsWriter;
 use core::global::Flushable;
 use std::io::{BufWriter, Write};
 use std::mem;
 use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
 use std::iter::Peekable;
-use core::serial::{FieldCursor, TermCursor, DocCursor, SerializableSegment};
+use core::serial::*;
 pub struct SimplePostingsWriter {
 	doc_ids: Vec<DocId>,
@@ -29,7 +28,9 @@ impl SimplePostingsWriter {
 impl PostingsWriter for SimplePostingsWriter {
 	fn suscribe(&mut self, doc_id: DocId) {
-		self.doc_ids.push(doc_id);
+		if self.doc_ids.len() == 0 || self.doc_ids[self.doc_ids.len() - 1] < doc_id {
 			self.doc_ids.push(doc_id);
 		}
 	}
 }
@@ -38,20 +39,6 @@ struct FieldWriter {
    term_index: BTreeMap<String, usize>,
 }
 impl Flushable for SimplePostingsWriter {
 	/// Serializes the posting list into `writer`.
 	///
 	/// The layout is the number of doc ids as a native-endian `u64`, followed
 	/// by each doc id as a native-endian `u64`.
 	///
 	/// Returns the number of bytes written (8 per `u64`). Any write failure
 	/// is propagated as an `io::Error` instead of being silently dropped.
 	fn flush<W: Write>(&self, writer: &mut W) -> Result<usize, io::Error> {
 		let mut num_bytes_written = 0;
 		let num_docs = self.doc_ids.len() as u64;
 		// Propagate the error: reporting `Ok` after a failed write would let
 		// callers believe a truncated posting list was stored.
 		writer.write_u64::<NativeEndian>(num_docs)?;
 		num_bytes_written += 8;
 		for &doc_id in self.doc_ids.iter() {
 			writer.write_u64::<NativeEndian>(doc_id as u64)?;
 			num_bytes_written += 8;
 		}
 		Ok(num_bytes_written)
 	}
 }
 impl FieldWriter {
    pub fn new() -> FieldWriter {
        FieldWriter {
@@ -132,107 +119,31 @@ pub struct ClosedIndexWriter {
 //-----------------------------------------
 // Implementation of SerializableSegment
 //
 /// Cursor over the `(Field, FieldWriter)` entries of a hash map.
 pub struct CIWFieldCursor<'a> {
 	// Underlying hash-map iterator producing the entries.
 	field_it: hash_map::Iter<'a, Field, FieldWriter>,
 	// Entry most recently pulled from `field_it`, if any.
 	current: Option<(&'a Field, &'a FieldWriter)>
 }
 impl<'a> CIWFieldCursor<'a> {
 	/// Returns the `FieldWriter` half of the current entry.
 	///
 	/// Panics (through `unwrap`) when `current` is `None`.
 	fn get_field_writer(&self) -> &'a FieldWriter {
 		self.current.map(|(_, second)| second).unwrap()
 	}
 }
 impl<'a> Iterator for CIWFieldCursor<'a> {
 	type Item=&'a Field;
 	/// Pulls the next entry from `field_it`, stores it in `current`, and
 	/// returns whatever `get_field` reports for the updated `current`.
 	fn next(&mut self) -> Option<&'a Field> {
 		self.current = self.field_it.next();
 		self.get_field()
 	}
 }
 impl<'a> FieldCursor<'a> for CIWFieldCursor<'a> {
 	type TTermCur = CIWTermCursor<'a>;
 	/// Returns the `Field` half of the current entry, or `None` when there
 	/// is no current entry.
 	fn get_field(&self) -> Option<&'a Field> {
 		self.current.map(|(first, _)| first)
 	}
 	/// Builds a term cursor over the current field's writer: it borrows the
 	/// writer's `postings`, iterates its `term_index`, and starts with no
 	/// current term.
 	///
 	/// Panics when there is no current entry, because `get_field_writer`
 	/// is called unconditionally.
 	fn term_cursor<'b>(&'b self) -> CIWTermCursor<'b>  {
 		let field_writer = self.get_field_writer();
 		CIWTermCursor {
 			postings: &field_writer.postings,
 			term_it: field_writer.term_index.iter(),
 			current: None
 		}
 	}
 }
 // TODO use a Term type
 impl<'a> SerializableSegment<'a> for ClosedIndexWriter {
 	type TFieldCur = CIWFieldCursor<'a>;
 	/// Creates a field cursor iterating over
 	/// `self.index_writer.term_writers`.
 	///
 	/// The cursor starts with `current` set to `None`; nothing is pulled
 	/// from the map iterator here.
 	fn field_cursor(&'a self) -> CIWFieldCursor<'a> {
 		let mut field_it: hash_map::Iter<'a, Field, FieldWriter> = self.index_writer.term_writers.iter();
 		let current: Option<(&'a Field, &'a FieldWriter)> = None;
 		CIWFieldCursor {
 				current: current,
 				field_it: field_it
 		}
 	}
 }
 //////////////////////////////////
-// CIWTermCursor
+// CIWFormCursor
 //
-pub struct CIWTermCursor<'a> {
+struct CIWFormCursor<'a> {
-	postings: &'a Vec<SimplePostingsWriter>,
+	term_it: btree_map::Iter<'a, String, usize>, // term -> postings_idx
-	term_it: btree_map::Iter<'a, String, usize>,
+	postings_map: &'a Vec<SimplePostingsWriter>, 	 // postings_idx -> postings
 	current: Option<(&'a String, &'a usize)>
 }
-impl<'a> CIWTermCursor<'a> {
+struct FormPostings<'a> {
-    fn get_term_option(&self) -> Option<&'a String> {
+	form: &'a str,
-		self.current
+	postings: &'a SimplePostingsWriter,
 			.map(|(first, _)| first)
 	}
 }
-impl<'a> Iterator for CIWTermCursor<'a> {
+impl<'a> Iterator for CIWFormCursor<'a> {
-	type Item=&'a String;
+	type Item = FormPostings<'a>;
-	fn next(&mut self) -> Option<&'a String> {
+	fn next(&mut self,) -> Option<FormPostings<'a>> {
-		self.current = self.term_it.next();
+		self.term_it.next()
-		self.get_term_option()
+			   .map(|(form, postings_idx)| {
-	}
+			FormPostings {
-}
+				form: form,
-
+				postings: unsafe { self.postings_map.get_unchecked(*postings_idx) }
 impl<'a> TermCursor<'a> for CIWTermCursor<'a> {
 	type TDocCur = CIWDocCursor<'a>;
 	fn doc_cursor(&self) -> CIWDocCursor<'a> {
 		let (_, &postings_id) = self.current.unwrap();
 		unsafe {
 			let postings_writer = self.postings.get_unchecked(postings_id);
 			let docs_it = postings_writer.doc_ids.iter();
 			CIWDocCursor {
 				docs_it: Box::new(docs_it),
 				current: None,
 			}
-		}
+		})
 	}
    fn get_term(&self) -> &'a String {
 		self.get_term_option()
 			.unwrap()
 	}
 }
@@ -240,10 +151,112 @@ impl<'a> TermCursor<'a> for CIWTermCursor<'a> {
 // CIWDocCursor
 //
 /// Cursor over terms that walks every field of a hash map and, within each
 /// field, every form produced by a `CIWFormCursor`.
 pub struct CIWTermCursor<'a> {
 	// Iterator over the remaining `(Field, FieldWriter)` entries.
 	field_it: hash_map::Iter<'a, Field, FieldWriter>,
 	// Form cursor for the field currently being visited.
 	form_it: CIWFormCursor<'a>,
 	// Form/postings pair most recently produced by `form_it`, if any.
 	current_form_postings: Option<FormPostings<'a>>,
 	// Field currently being visited.
 	field: &'a Field,
 }
 impl<'a> CIWTermCursor<'a> {
 	/// Pulls the next form from `form_it`.
 	///
 	/// On success the form is stored in `current_form_postings` and `true`
 	/// is returned. When `form_it` is exhausted, `false` is returned and
 	/// `current_form_postings` is left as it was.
 	fn next_form(&mut self,) -> bool {
 		match self.form_it.next() {
 			Some(form_postings) => {
 				self.current_form_postings = Some(form_postings);
 				return true;
 			},
 			None => { false }
 		}
 	}
 	// Advances to the next field.
 	// On success, replaces `form_it` with a cursor over the new field's
 	// `term_index` and `postings`, and records the new field in `self.field`.
 	// Returns true iff there was a next field.
 	fn next_field(&mut self,) -> bool {
 		match self.field_it.next() {
 			Some((field, field_writer)) => {
 				self.form_it = CIWFormCursor {
 					term_it: field_writer.term_index.iter(),
 					postings_map: &field_writer.postings,
 				};
 				self.field = field;
 				true
 			},
 			None => false,
 		}
 	}
 }
 impl<'a> TermCursor<'a> for CIWTermCursor<'a> {
 	type DocCur = CIWDocCursor<'a>;
 	/// Builds the current `Term` from a clone of the current field and the
 	/// form text of `current_form_postings`.
 	///
 	/// Panics (through `unwrap`) when `current_form_postings` is `None`.
 	fn get_term(&self) -> Term<'a> {
 		Term {
 			field: self.field.clone(),
 			text: self.current_form_postings.as_ref().unwrap().form,
 		}
 	}
 	/// Creates a document cursor iterating over the `doc_ids` of the current
 	/// form's postings, with no current document yet.
 	///
 	/// Panics (through `unwrap`) when `current_form_postings` is `None`.
 	fn doc_cursor(&self,) -> CIWDocCursor<'a> {
 		CIWDocCursor {
 			docs_it: self.current_form_postings
 				.as_ref()
 				.unwrap()
 				.postings
 				.doc_ids
 				.iter(),
 			current: None
 		}
 	}
 	/// Moves to the next term and returns `true`, or returns `false` when
 	/// there are no more terms.
 	///
 	/// The next form of the current field is tried first. When the field has
 	/// no further forms, the cursor moves to the next field and retries
 	/// recursively, so fields without any form are skipped.
 	fn advance(&mut self,) -> bool {
 		let next_form = self.next_form();
 		if next_form {
 			true
 		}
 		else {
 			if self.next_field() {
 				self.advance()
 			}
 			else {
 				false
 			}
 		}
 	}
 }
 //
 // TODO use a Term type
 //
 impl<'a> SerializableSegment<'a> for ClosedIndexWriter {
 	type TermCur = CIWTermCursor<'a>;
 	/// Creates a term cursor over `self.index_writer.term_writers`.
 	///
 	/// The first map entry is consumed eagerly to seed the cursor's field
 	/// and form iterator. The cursor is returned with
 	/// `current_form_postings` set to `None`.
 	///
 	/// Panics (through `unwrap`) when `term_writers` has no entry.
 	fn term_cursor(&'a mut self) -> CIWTermCursor<'a> {
 		let mut field_it: hash_map::Iter<'a, Field, FieldWriter> = self.index_writer.term_writers.iter();
 		let (field, field_writer) = field_it.next().unwrap(); // TODO handle no field
 		let mut term_cursor = CIWTermCursor {
 			field_it: field_it,
 			form_it: CIWFormCursor {
 				term_it: field_writer.term_index.iter(),
 				postings_map: &field_writer.postings,
 			},
 			field: field,
 			current_form_postings: None,
 		};
 		// TODO handle having no fields at all
 		term_cursor
 	}
 }
 // TODO add positions
 pub struct CIWDocCursor<'a> {
-	docs_it: Box<Iterator<Item=&'a DocId> + 'a>,
+	docs_it: slice::Iter<'a, DocId>,
 	current: Option<DocId>,
 }
--- a/tests/core.rs
+++ b/tests/core.rs
@@ -3,18 +3,17 @@ extern crate itertools;
 extern crate byteorder;
 extern crate regex;
 use tantivy::core::DocId;
 use tantivy::core::postings::{VecPostings, intersection};
 use tantivy::core::postings::Postings;
 use tantivy::core::analyzer::tokenize;
 use tantivy::core::serial::*;
 use tantivy::core::schema::*;
 use tantivy::core::global::*;
 use tantivy::core::writer::{IndexWriter, ClosedIndexWriter};
 use tantivy::core::directory::{Directory, generate_segment_name, SegmentId};
 use tantivy::core::schema::{Field, Document};
 use std::ops::DerefMut;
 use tantivy::core::writer::SimplePostingsWriter;
 use tantivy::core::postings::PostingsWriter;
 use tantivy::core::global::Flushable;
 use std::io::{ BufWriter, Write};
 use regex::Regex;
 use std::convert::From;
@@ -41,27 +40,29 @@ fn test_indexing() {
        let mut index_writer = IndexWriter::open(&directory);
        {
            let mut doc = Document::new();
-            doc.set(Field("text"), "toto titi");
+            doc.set(Field(1), "a b");
            index_writer.add(doc);
        }
        {
            let mut doc = Document::new();
-            doc.set(Field("text"), "titi tata");
+            doc.set(Field(1), "a b c");
            index_writer.add(doc);
        }
-        let closed_index_writer:  ClosedIndexWriter = index_writer.close();
+        {
-        let mut field_cursor = closed_index_writer.field_cursor();
+            let mut doc = Document::new();
            doc.set(Field(1), "a b c d");
            // TODO make iteration over Fields somehow sorted
            index_writer.add(doc);
        }
        let mut closed_index_writer:  ClosedIndexWriter = index_writer.close();
        let mut term_cursor = closed_index_writer.term_cursor();
        loop {
-            match field_cursor.next() {
+            if !term_cursor.advance() {
-                Some(field) => {
+                break;
                    println!("  {:?}", field);
                    show_term_cursor(field_cursor.term_cursor());
                },
                None => { break; },
            }
            show_term(&term_cursor);
        }
        assert!(false);
        // index_writer.sync().unwrap();
    }
    {
        // TODO add index opening stuff
@@ -70,45 +71,27 @@ fn test_indexing() {
 }
-fn show_term_cursor<'a, T: TermCursor<'a>>(mut term_cursor: T) {
+fn show_term<'a, T: TermCursor<'a>>(term_cursor: &T) {
-    loop {
+    println!("{:?}", term_cursor.get_term());
-        match term_cursor.next() {
+    let doc_cursor = term_cursor.doc_cursor();
-            Some(term) => {
+    for doc in doc_cursor {
-                println!("    term: {:?}", term);
+        println!("doc({})", doc);
                show_doc_cursor(term_cursor.doc_cursor());
            },
            None =>  {
                break;
            }
        }
    }
 }
-fn show_doc_cursor<'a, D: DocCursor>(mut doc_cursor: D) {
+// fn show_doc_cursor<'a, D: DocCursor>(mut doc_cursor: D) {
-    loop {
+//     loop {
-        match doc_cursor.next() {
+//         match doc_cursor.next() {
-            Some(doc) => {
+//             Some(doc) => {
-                println!("       {}", doc);
+//                 println!("       {}", doc);
-            },
+//             },
-            None =>  {
+//             None =>  {
-                break;
+//                 break;
-            }
+//             }
-        }
+//         }
-    }
+//     }
-}
+// }
 /// Checks the size of the byte stream produced by flushing a
 /// `SimplePostingsWriter` that received four doc ids.
 #[test]
 fn test_postings_writer() {
    let mut postings_writer = SimplePostingsWriter::new();
    postings_writer.suscribe(1);
    postings_writer.suscribe(4);
    postings_writer.suscribe(5);
    postings_writer.suscribe(17);
    let mut buffer: Vec<u8> = Vec::new();
    assert_eq!(buffer.len(), 0);
    // Fail loudly if flushing reports an error instead of dropping the `Result`.
    postings_writer.flush(&mut buffer).unwrap();
    // Expected size is five 8-byte values for the four doc ids subscribed above.
    assert_eq!(buffer.len(), 5 * 8);
 }
 #[test]
 fn test_new_segment() {