From d69036a35a53a0c003f0265ecc0ec88d4cbcbad9 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 20 Jan 2016 16:29:57 +0900 Subject: [PATCH] reader impl serialize. --- src/core/codec.rs | 4 +++- src/core/schema.rs | 7 ++++++- src/core/writer.rs | 5 +++-- tests/core.rs | 44 +++++++++++++++++++++++--------------------- 4 files changed, 35 insertions(+), 25 deletions(-) diff --git a/src/core/codec.rs b/src/core/codec.rs index 3da1c90e8..1784f4918 100644 --- a/src/core/codec.rs +++ b/src/core/codec.rs @@ -28,7 +28,7 @@ impl SimpleCodec { }, } for doc_id in doc_it { - println!("doc {}", doc_id); + println!(" Doc {}", doc_id); match postings.write_u32::(doc_id as u32) { Ok(_) => {}, Err(_) => { @@ -69,6 +69,7 @@ impl Codec for SimpleCodec { loop { match term_cursor.next() { Some((term, doc_it)) => { + println!("Term {}", term.text()); term.write_into(&mut term_buffer); match term_trie_builder.insert(&term_buffer, offset as u64) { Ok(_) => {} @@ -83,6 +84,7 @@ impl Codec for SimpleCodec { } } } + term_trie_builder.finish(); Ok(0) } diff --git a/src/core/schema.rs b/src/core/schema.rs index 044db69e7..356385144 100644 --- a/src/core/schema.rs +++ b/src/core/schema.rs @@ -1,6 +1,7 @@ use core::global::*; use std::fmt::Write; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use std::string::FromUtf8Error; #[derive(Clone,Debug,PartialEq,PartialOrd,Eq)] pub struct FieldValue { @@ -11,7 +12,7 @@ pub struct FieldValue { #[derive(Clone,PartialEq,PartialOrd,Eq,Hash)] pub struct Term { - pub data: Vec, // avoid copies + data: Vec, // avoid copies // pub field: Field, // pub text: &'a [u8], } @@ -20,6 +21,10 @@ impl Term { // TODO avoid all these copies. + pub fn text(&self,) -> String { + String::from_utf8_lossy(&self.data[1..]).into_owned() + } + pub fn from_field_text(field: Field, text: &str) -> Term { let mut buffer = Vec::with_capacity(1 + text.len()); let Field(field_idx) = field; diff --git a/src/core/writer.rs b/src/core/writer.rs index 4ecff9352..6de0401ba 100644 --- a/src/core/writer.rs +++ b/src/core/writer.rs @@ -17,6 +17,7 @@ use core::serial::*; use core::error::*; use std::cell::RefCell; use std::borrow::BorrowMut; +use core::directory::Segment; pub struct SimplePostingsWriter { doc_ids: Vec, @@ -104,9 +105,9 @@ impl IndexWriter { self.max_doc += 1; } - pub fn commit(self,) -> Result { + pub fn commit(self,) -> Result<(Segment, usize)> { let segment = self.directory.new_segment(); - SimpleCodec::write(&self, &segment) + SimpleCodec::write(&self, &segment).map(|sz| (segment, sz)) } } diff --git a/tests/core.rs b/tests/core.rs index 3a109e607..819adcf69 100644 --- a/tests/core.rs +++ b/tests/core.rs @@ -15,6 +15,7 @@ use tantivy::core::directory::{Directory, generate_segment_name, SegmentId}; use std::ops::DerefMut; use tantivy::core::writer::SimplePostingsWriter; use tantivy::core::postings::PostingsWriter; +use tantivy::core::reader::SegmentIndexReader; use std::io::{ BufWriter, Write}; use regex::Regex; use std::convert::From; @@ -41,7 +42,7 @@ fn test_indexing() { let mut index_writer = IndexWriter::open(&directory); { let mut doc = Document::new(); - doc.set(Field(1), "a b"); + doc.set(Field(1), "af b"); index_writer.add(doc); } { @@ -54,27 +55,28 @@ fn test_indexing() { doc.set(Field(1), "a b c d"); index_writer.add(doc); } - let commit_result = index_writer.commit(); - println!("{:?}", commit_result.err()); - //debug_assert!(commit_result.is_ok(), commit_result); - // assert!(commit_result.is_ok()); + + let (segment, num_bytes) = index_writer.commit().unwrap(); + // reading the segment + println!("------"); + { + let index_reader = SegmentIndexReader::open(segment).unwrap(); + let mut term_cursor = index_reader.term_cursor(); + loop { + match term_cursor.next() { + Some((term, mut doc_cursor)) => { + println!("Term {:?}", term.text()); + for doc in doc_cursor { + println!(" Doc {}", doc); + } + }, + None => { + break; + }, + } + } + } assert!(false); - // SimpleCodec::write(closed_index_writer, output); - // let mut term_cursor = closed_index_writer.term_cursor(); - // loop { - // match term_cursor.next() { - // Some((term, doc_it)) => { - // println!("{:?}", term); - // for doc in doc_it { - // println!(" doc {}", doc); - // } - // }, - // None => { - // break; - // } - // } - // } - // assert!(false); } { // TODO add index opening stuff