mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
blop
This commit is contained in:
8
readme.md
Normal file
8
readme.md
Normal file
@@ -0,0 +1,8 @@
|
||||
|
||||
Closing an IndexWriter produces an IndexSerializable.
|
||||
|
||||
An IndexSerializable contains all of the methods needed
|
||||
to be written on disk as a segment.
|
||||
|
||||
|
||||
IndexReader implements IndexSerializable.
|
||||
@@ -1,21 +1,73 @@
|
||||
use std::io;
|
||||
use core::serial::SerializableSegment;
|
||||
use core::serial::*;
|
||||
use std::io::Write;
|
||||
use fst::MapBuilder;
|
||||
use core::error::*;
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
|
||||
pub struct SegmentOutput<'a> {
|
||||
terms: &'a Write,
|
||||
postings: &'a Write,
|
||||
pub trait SegmentOutput<'a, W: Write> {
|
||||
fn terms(&self,) -> W;
|
||||
fn postings(&self,) -> W;
|
||||
// TODO positions, docvalues, ...
|
||||
}
|
||||
|
||||
|
||||
pub trait Codec {
|
||||
fn write<'a, 'b, I: SerializableSegment<'a>>(index: &I, output: &'b SegmentOutput) -> Result<usize, io::Error> {
|
||||
Ok(0)
|
||||
fn write<'a, I: SerializableSegment<'a>, W: Write>(index: &'a I, output: &'a SegmentOutput<'a, W>) -> Result<usize>;
|
||||
}
|
||||
|
||||
/// Codec that stores terms in an fst map and posting lists as little-endian `u32` values.
pub struct SimpleCodec;
|
||||
|
||||
impl SimpleCodec {
|
||||
fn write_postings<D: DocCursor, W: Write>(mut doc_it: D, postings: &mut W) -> Result<usize> {
|
||||
let mut written_bytes: usize = 4;
|
||||
postings.write_u32::<LittleEndian>(doc_it.len() as u32);
|
||||
// TODO handle error correctly
|
||||
for doc_id in doc_it {
|
||||
postings.write_u32::<LittleEndian>(doc_id as u32);
|
||||
written_bytes += 4;
|
||||
}
|
||||
Ok(written_bytes)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DebugCodec;
|
||||
impl Codec for SimpleCodec {
|
||||
fn write<'a, I: SerializableSegment<'a>, W: Write>(index: &'a I, output: &'a SegmentOutput<'a, W>) -> Result<usize> {
|
||||
let term_trie_builder_result = MapBuilder::new(output.terms());
|
||||
if term_trie_builder_result.is_err() {
|
||||
// TODO include cause somehow
|
||||
return Err(Error::IOError(String::from("Failed creating the term builder")));
|
||||
}
|
||||
let mut term_buffer: String = String::new();
|
||||
let mut term_trie_builder = term_trie_builder_result.unwrap();
|
||||
let mut term_cursor = index.term_cursor();
|
||||
let mut offset: usize = 0;
|
||||
let mut postings_output = output.postings();
|
||||
loop {
|
||||
match term_cursor.next() {
|
||||
Some((term, doc_it)) => {
|
||||
term.write_into(&mut term_buffer);
|
||||
match term_trie_builder.insert(&term_buffer, offset as u64) {
|
||||
Ok(_) => {}
|
||||
Err(_) => {
|
||||
return Err(Error::IOError(String::from("Failed while inserting into the fst")))
|
||||
},
|
||||
}
|
||||
offset += try!(SimpleCodec::write_postings(doc_it, &mut postings_output));
|
||||
},
|
||||
None => {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(0)
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// impl DebugCodec {
|
||||
// fn write_field(field_name) {
|
||||
|
||||
8
src/core/error.rs
Normal file
8
src/core/error.rs
Normal file
@@ -0,0 +1,8 @@
|
||||
use std::result;
|
||||
|
||||
|
||||
/// Error type returned through this module's `Result` alias.
///
/// `Debug` is derived so that a `Result<T, Error>` can be unwrapped and printed;
/// `Clone`, `PartialEq` and `Eq` let callers and tests compare errors directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Error {
    /// An I/O-related failure; the payload is a human-readable description.
    IOError(String),
}
|
||||
|
||||
/// `Result` alias whose error type is fixed to this module's `Error`.
pub type Result<T> = result::Result<T, Error>;
|
||||
@@ -8,5 +8,6 @@ pub mod analyzer;
|
||||
pub mod serial;
|
||||
pub mod reader;
|
||||
pub mod codec;
|
||||
pub mod error;
|
||||
|
||||
pub use core::global::DocId;
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use core::global::*;
|
||||
use std::fmt::Write;
|
||||
|
||||
#[derive(Clone,Debug,PartialEq,PartialOrd,Eq)]
|
||||
pub struct FieldValue {
|
||||
@@ -13,6 +14,15 @@ pub struct Term<'a> {
|
||||
pub text: &'a str,
|
||||
}
|
||||
|
||||
impl<'a> Term<'a> {
|
||||
pub fn write_into(&self, term_str: &mut String) {
|
||||
term_str.clear();
|
||||
let Field(field_idx) = self.field;
|
||||
// TODO avoid writing the field idx.
|
||||
term_str.write_fmt(format_args!("{}:{}", field_idx, self.text));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub struct Document {
|
||||
fields: Vec<FieldValue>,
|
||||
|
||||
@@ -4,15 +4,15 @@ use core::schema::*;
|
||||
// Trait sufficient to serialize a segment.
|
||||
pub trait SerializableSegment<'a> {
|
||||
type TermCur: TermCursor<'a>; // TODO rename TermCursorImpl
|
||||
fn term_cursor(&'a mut self) -> Self::TermCur;
|
||||
fn term_cursor(&'a self) -> Self::TermCur;
|
||||
}
|
||||
|
||||
/// Cursor over doc ids; iterating it yields `DocId` values.
pub trait DocCursor: Iterator<Item=DocId> {
|
||||
/// Returns the doc id the cursor is currently positioned on.
/// NOTE(review): what happens before the cursor has been advanced is up to the
/// implementor — confirm before relying on it.
fn doc(&self) -> DocId;
|
||||
/// Returns the number of documents reported by this cursor.
/// NOTE(review): presumably the total count rather than the remaining count — confirm.
fn len(&self) -> usize;
|
||||
}
|
||||
|
||||
// TODO make iteration over Fields somehow sorted
|
||||
// (Not only forms)
|
||||
|
||||
pub trait TermCursor<'a> {
|
||||
type DocCur: DocCursor;
|
||||
|
||||
@@ -183,11 +183,14 @@ impl<'a> CIWTermCursor<'a> {
|
||||
}
|
||||
|
||||
fn doc_cursor(&self,) -> CIWDocCursor<'a> {
|
||||
let postings = self.current_form_postings
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.postings;
|
||||
let num_docs = postings.doc_ids.len();
|
||||
CIWDocCursor {
|
||||
docs_it: self.current_form_postings
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.postings
|
||||
num_docs: num_docs,
|
||||
docs_it: postings
|
||||
.doc_ids
|
||||
.iter(),
|
||||
current: None
|
||||
@@ -245,7 +248,7 @@ impl<'a> SerializableSegment<'a> for ClosedIndexWriter {
|
||||
|
||||
type TermCur = CIWTermCursor<'a>;
|
||||
|
||||
fn term_cursor(&'a mut self) -> CIWTermCursor<'a> {
|
||||
fn term_cursor(&'a self) -> CIWTermCursor<'a> {
|
||||
let mut field_it: hash_map::Iter<'a, Field, FieldWriter> = self.index_writer.term_writers.iter();
|
||||
let (field, field_writer) = field_it.next().unwrap(); // TODO handle no field
|
||||
let mut term_cursor = CIWTermCursor {
|
||||
@@ -267,6 +270,7 @@ impl<'a> SerializableSegment<'a> for ClosedIndexWriter {
|
||||
/// Doc cursor backed by a slice of doc ids.
pub struct CIWDocCursor<'a> {
|
||||
// Iterator over the underlying slice of doc ids.
docs_it: slice::Iter<'a, DocId>,
|
||||
// Doc id returned by `doc()`; `None` means there is no current document.
current: Option<DocId>,
|
||||
// Value returned by `len()`.
num_docs: usize,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for CIWDocCursor<'a> {
|
||||
@@ -279,7 +283,12 @@ impl<'a> Iterator for CIWDocCursor<'a> {
|
||||
}
|
||||
|
||||
impl<'a> DocCursor for CIWDocCursor<'a> {
|
||||
|
||||
/// Returns the current doc id.
///
/// Panics if `current` is `None`.
fn doc(&self,) -> DocId {
|
||||
self.current.unwrap()
|
||||
}
|
||||
|
||||
/// Returns the stored `num_docs` count.
fn len(&self) -> usize {
|
||||
self.num_docs
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#[macro_use]
|
||||
extern crate lazy_static;
|
||||
|
||||
extern crate fst;
|
||||
extern crate byteorder;
|
||||
|
||||
extern crate rand;
|
||||
|
||||
@@ -8,6 +8,7 @@ use tantivy::core::postings::Postings;
|
||||
use tantivy::core::analyzer::tokenize;
|
||||
use tantivy::core::serial::*;
|
||||
use tantivy::core::schema::*;
|
||||
use tantivy::core::codec::SimpleCodec;
|
||||
use tantivy::core::global::*;
|
||||
use tantivy::core::writer::{IndexWriter, ClosedIndexWriter};
|
||||
use tantivy::core::directory::{Directory, generate_segment_name, SegmentId};
|
||||
@@ -54,20 +55,21 @@ fn test_indexing() {
|
||||
index_writer.add(doc);
|
||||
}
|
||||
let mut closed_index_writer: ClosedIndexWriter = index_writer.close();
|
||||
let mut term_cursor = closed_index_writer.term_cursor();
|
||||
loop {
|
||||
match term_cursor.next() {
|
||||
Some((term, doc_it)) => {
|
||||
println!("{:?}", term);
|
||||
for doc in doc_it {
|
||||
println!(" doc {}", doc);
|
||||
}
|
||||
},
|
||||
None => {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// SimpleCodec::write(closed_index_writer, output);
|
||||
// let mut term_cursor = closed_index_writer.term_cursor();
|
||||
// loop {
|
||||
// match term_cursor.next() {
|
||||
// Some((term, doc_it)) => {
|
||||
// println!("{:?}", term);
|
||||
// for doc in doc_it {
|
||||
// println!(" doc {}", doc);
|
||||
// }
|
||||
// },
|
||||
// None => {
|
||||
// break;
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
assert!(false);
|
||||
}
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user