Code cleaning

This commit is contained in:
Paul Masurel
2017-05-11 20:47:30 +09:00
parent 54ab897755
commit 6fd17e0ead
16 changed files with 91 additions and 75 deletions

View File

@@ -28,6 +28,7 @@ script:
travis-cargo test &&
travis-cargo bench &&
travis-cargo doc
- cargo run --example simple_search
after_success:
- bash ./script/build-doc.sh
- travis-cargo doc-upload

View File

@@ -1,6 +1,6 @@
Tantivy 0.4.0
==========================
- Raise the limit of number of fields (previously 256 fields)
- Removed u32 fields. They are replaced by u64 and i64 fields (#65)
- Replacing rustc_serialize by serde. Kudos to @KodrAus and @lnicola
- QueryParser:
@@ -13,7 +13,6 @@ Tantivy 0.3.1
==========================
- Expose a method to trigger files garbage collection
- Raise the limit of number of fields (previously 256 fields)
Tantivy 0.3

View File

@@ -21,4 +21,5 @@ install:
build: false
test_script:
- REM SET RUST_LOG=tantivy,test & cargo test --verbose
- REM SET RUST_LOG=tantivy,test & cargo test --verbose
- REM SET RUST_LOG=tantivy,test & cargo run --example simple_search

View File

@@ -27,7 +27,6 @@ pub trait HasLen {
}
}
const HIGHEST_BIT: u64 = 1 << 63;

View File

@@ -175,9 +175,7 @@ mod tests {
let mut term_it = searcher.terms();
let mut terms = String::new();
while let Some(term) = term_it.next() {
unsafe {
terms.push_str(term.text());
}
terms.push_str(term.text());
}
assert_eq!(terms, "abcdef");
}

View File

@@ -45,11 +45,6 @@ impl Heap {
pub fn capacity(&self,) -> u32 {
self.inner().capacity()
}
/// Return the amount of memory that has been allocated so far.
pub fn len(&self,) -> u32 {
self.inner().len()
}
/// Return amount of free space, in bytes.
pub fn num_free_bytes(&self,) -> u32 {
@@ -90,10 +85,6 @@ impl Heap {
pub fn get_mut_ref<Item>(&self, addr: u32) -> &mut Item {
self.inner().get_mut_ref(addr)
}
pub fn get_ref<Item>(&self, addr: u32) -> &Item {
self.inner().get_mut_ref(addr)
}
}
@@ -108,8 +99,9 @@ struct InnerHeap {
impl InnerHeap {
pub fn with_capacity(num_bytes: usize) -> InnerHeap {
let buffer: Vec<u8> = vec![0u8; num_bytes];
InnerHeap {
buffer: vec![0u8; num_bytes],
buffer: buffer,
buffer_len: num_bytes as u32,
next_heap: None,
used: 0u32,
@@ -124,10 +116,6 @@ impl InnerHeap {
pub fn capacity(&self,) -> u32 {
self.buffer.len() as u32
}
pub fn len(&self,) -> u32 {
self.used
}
// Returns the number of free bytes. If the buffer
// has reached its capacity and overflowed to another buffer, return 0.
@@ -195,8 +183,6 @@ impl InnerHeap {
}
}
fn get_mut_ref<Item>(&mut self, addr: u32) -> &mut Item {
if addr >= self.buffer_len {
self.next_heap.as_mut().unwrap().get_mut_ref(addr - self.buffer_len)

View File

@@ -120,7 +120,9 @@ pub fn open_index_writer(
let delete_queue = DeleteQueue::new();
let stamper = Stamper::new(index.opstamp());
let current_opstamp = index.opstamp();
let stamper = Stamper::new(current_opstamp);
let segment_updater = SegmentUpdater::new(index.clone(),
stamper.clone(),
@@ -143,7 +145,7 @@ pub fn open_index_writer(
delete_queue: delete_queue,
committed_opstamp: index.opstamp(),
committed_opstamp: current_opstamp,
stamper: stamper,
generation: 0,

View File

@@ -32,7 +32,7 @@ struct DeltaPositionComputer {
impl DeltaPositionComputer {
fn new() -> DeltaPositionComputer {
DeltaPositionComputer {
buffer: vec![0u32, 512]
buffer: vec![0u32; 512]
}
}
@@ -201,6 +201,8 @@ impl IndexMerger {
}
merged_doc_id_map.push(segment_local_map);
}
let mut field = Field(u32::max_value());
while merged_terms.advance() {
// Create the total list of doc ids
@@ -231,15 +233,19 @@ impl IndexMerger {
// We can now serialize this postings, by pushing each document to the
// postings serializer.
for (segment_ord, mut segment_postings) in segment_postings {
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
while segment_postings.advance() {
if let Some(remapped_doc_id) = old_to_new_doc_id[segment_postings.doc() as usize] {
if !term_written {
let current_field = term.field();
if current_field != field {
postings_serializer.new_field(current_field);
field = current_field;
}
// we make sure to only write the term iff
// there is at least one document.
postings_serializer.new_term(&term)?;
postings_serializer.new_term(term.as_slice())?;
term_written = true;
}
let delta_positions: &[u32] =

View File

@@ -77,7 +77,7 @@ pub fn save_metas(segment_metas: Vec<SegmentMeta>,
schema: schema,
opstamp: opstamp,
};
let mut w = try!(serde_json::to_vec(&metas));
let mut w = try!(serde_json::to_vec_pretty(&metas));
try!(write!(&mut w, "\n"));
let res = directory.atomic_write(&META_FILEPATH, &w[..])?;
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));

View File

@@ -76,9 +76,6 @@ impl<'a> SegmentWriter<'a> {
/// Finalize consumes the `SegmentWriter`, so that it cannot
/// be used afterwards.
pub fn finalize(self) -> Result<Vec<u64>> {
// for per_field_postings_writer in &mut self.per_field_postings_writers {
// per_field_postings_writer.close(self.heap);
// }
write(&self.multifield_postings,
&self.fast_field_writers,
&self.fieldnorms_writer,
@@ -149,7 +146,6 @@ impl<'a> SegmentWriter<'a> {
for field_value in field_values {
let term = Term::from_field_i64(field_value.field(), field_value.value().i64_value());
self.multifield_postings.suscribe(doc_id, &term);
// field_posting_writer.suscribe(term_index, doc_id, 0, &term, self.heap);
}
}
}

View File

@@ -126,7 +126,6 @@ pub use schema::{Term, Document};
pub use core::SegmentReader;
pub use self::common::TimerTree;
pub use postings::DocSet;
pub use postings::Postings;
pub use postings::SegmentPostingsOption;

View File

@@ -17,16 +17,15 @@ mod docset;
mod segment_postings_option;
pub use self::docset::{SkipResult, DocSet};
pub use self::recorder::{Recorder, NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
use self::recorder::{Recorder, NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
pub use self::serializer::PostingsSerializer;
pub use self::postings_writer::PostingsWriter;
pub use self::postings_writer::SpecializedPostingsWriter;
pub use self::postings_writer::MultiFieldPostingsWriter;
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
pub use self::term_info::TermInfo;
pub use self::postings::Postings;
#[cfg(test)]
pub use self::vec_postings::VecPostings;
pub use self::segment_postings::SegmentPostings;
pub use self::intersection::IntersectionDocSet;
pub use self::freq_handler::FreqHandler;
@@ -61,8 +60,8 @@ mod tests {
let index = Index::create_in_ram(schema);
let mut segment = index.new_segment();
let mut posting_serializer = PostingsSerializer::open(&mut segment).unwrap();
let term = Term::from_field_text(text_field, "abc");
posting_serializer.new_term(&term).unwrap();
posting_serializer.new_field(text_field);
posting_serializer.new_term("abc".as_bytes()).unwrap();
for doc_id in 0u32..3u32 {
let positions = vec!(1,2,3,2);
posting_serializer.write_doc(doc_id, 2, &positions).unwrap();

View File

@@ -114,6 +114,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
let (_, stop) = offsets[i+1];
let postings_writer = &self.per_field_postings_writers[field.0 as usize];
postings_writer.serialize(
field,
&term_offsets[start..stop],
serializer,
self.heap)?;
@@ -144,7 +145,7 @@ pub trait PostingsWriter {
/// Serializes the postings on disk.
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(&self, term_addrs: &[(&[u8], u32)], serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
fn serialize(&self, field: Field, term_addrs: &[(&[u8], u32)], serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
/// Tokenize a text and suscribe all of its tokens.
fn index_text<'a>(&mut self,
@@ -156,7 +157,8 @@ pub trait PostingsWriter {
-> u32 {
let mut pos = 0u32;
let mut num_tokens: u32 = 0u32;
let mut term = Term::allocate(field, 100);
let mut term = unsafe { Term::with_capacity(100) };
term.set_field(field);
for field_value in field_values {
let mut tokens = SimpleTokenizer.tokenize(field_value.value().text());
// right now num_tokens and pos are redundant, but it should
@@ -226,17 +228,19 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
}
fn serialize(&self,
field: Field,
term_addrs: &[(&[u8], u32)],
serializer: &mut PostingsSerializer,
heap: &Heap) -> io::Result<()> {
let mut term = Term::allocate(Field(0), 100);
serializer.new_field(field);
for &(term_bytes, addr) in term_addrs {
let recorder: &mut Rec = self.heap.get_mut_ref(addr);
term.set_content(term_bytes);
try!(serializer.new_term(&term));
try!(serializer.new_term(&term_bytes));
try!(recorder.serialize(addr, serializer, heap));
try!(serializer.close_term());
}
Ok(())
}
}

View File

@@ -1,7 +1,6 @@
use Result;
use datastruct::FstMapBuilder;
use super::TermInfo;
use schema::Term;
use schema::Field;
use schema::FieldEntry;
use schema::FieldType;
@@ -30,7 +29,7 @@ use common::BinarySerializable;
///
/// The serializer expects to receive the following calls
/// in this order :
///
/// * `new_field(...)`
/// * `new_term(...)`
/// * `write_doc(...)`
/// * `write_doc(...)`
@@ -41,6 +40,8 @@ use common::BinarySerializable;
/// * `write_doc(...)`
/// * ...
/// * `close_term()`
/// * `new_field(...)`
/// * ...
/// * `close()`
///
/// Terms have to be pushed in a lexicographically-sorted order.
@@ -105,7 +106,11 @@ impl PostingsSerializer {
segment.schema())
}
fn load_indexing_options(&mut self, field: Field) {
/// Must be called before starting pushing terms of
/// a given field.
///
/// Loads the indexing options for the given field.
pub fn new_field(&mut self, field: Field) {
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
self.text_indexing_options = match *field_entry.field_type() {
FieldType::Str(ref text_options) => text_options.get_indexing_options(),
@@ -130,13 +135,11 @@ impl PostingsSerializer {
/// * term - the term. It needs to come after the previous term according
/// to the lexicographical order.
/// * doc_freq - return the number of document containing the term.
pub fn new_term(&mut self, term: &Term) -> io::Result<()> {
pub fn new_term(&mut self, term: &[u8]) -> io::Result<()> {
if self.term_open {
panic!("Called new_term, while the previous term was not closed.");
}
self.term_open = true;
// TODO avoid load indexing options all the time.
self.load_indexing_options(term.field());
self.doc_ids.clear();
self.last_doc_id_encoded = 0;
self.term_freqs.clear();
@@ -146,7 +149,7 @@ impl PostingsSerializer {
postings_offset: self.written_bytes_postings as u32,
positions_offset: self.written_bytes_positions as u32,
};
self.terms_fst_builder.insert_key(term.as_slice())
self.terms_fst_builder.insert_key(term)
}
/// Finish the serialization for this term postings.

View File

@@ -109,7 +109,7 @@ mod field;
mod value;
mod named_field_document;
pub use self::term::extract_field_from_term_bytes;
pub(crate) use self::term::extract_field_from_term_bytes;
pub use self::named_field_document::NamedFieldDocument;
pub use self::schema::{Schema, SchemaBuilder};
pub use self::value::Value;

View File

@@ -1,11 +1,14 @@
use std::fmt;
use common;
use byteorder::{BigEndian, WriteBytesExt, ByteOrder};
use byteorder::{BigEndian, ByteOrder};
use super::Field;
use std::str;
/// Size (in bytes) of the buffer of an int field:
/// 4 bytes for the field id, followed by 8 bytes for the value.
const INT_TERM_LEN: usize = 4 + 8;
/// Term represents the value that the token can take.
///
/// It actually wraps a `Vec<u8>`.
@@ -14,18 +17,11 @@ pub struct Term(Vec<u8>);
/// Extract `field` from Term.
#[doc(hidden)]
pub fn extract_field_from_term_bytes(term_bytes: &[u8]) -> Field {
pub(crate) fn extract_field_from_term_bytes(term_bytes: &[u8]) -> Field {
Field(BigEndian::read_u32(&term_bytes[..4]))
}
impl Term {
/// Pre-allocate a term buffer.
pub fn allocate(field: Field, num_bytes: usize) -> Term {
let mut term = Term(Vec::with_capacity(num_bytes));
term.0.write_u32::<BigEndian>(field.0).expect("serializing u32 to Vec<u8 should never fail>");
term
}
/// Set the content of the term.
pub fn set_content(&mut self, content: &[u8]) {
@@ -39,6 +35,14 @@ impl Term {
extract_field_from_term_bytes(&self.0)
}
/// Sets the field of the term.
///
/// The field id is written as a big-endian `u32` in the first
/// four bytes of the buffer. Bytes located after this header
/// are left untouched.
pub fn set_field(&mut self, field: Field) {
// The buffer may be shorter than the 4-byte field header
// (e.g. empty): grow it with zeros so the write below cannot panic.
if self.0.len() < 4 {
self.0.resize(4, 0u8);
}
BigEndian::write_u32(&mut self.0[0..4], field.0);
}
/// Builds a term given a field, and a u64-value
///
/// Assuming the term has a field id of 1, and a u64 value of 3234,
@@ -47,13 +51,21 @@ impl Term {
/// The first four bytes are dedicated to storing the field id as a u32.
/// The 8 following bytes are encoding the u64 value.
pub fn from_field_u64(field: Field, val: u64) -> Term {
const U64_TERM_LEN: usize = 4 + 8;
let mut buffer = vec![0u8; U64_TERM_LEN];
// we want BigEndian here to have lexicographic order
// match the natural order of `(field, val)`
BigEndian::write_u32(&mut buffer[0..4], field.0);
BigEndian::write_u64(&mut buffer[4..], val);
Term(buffer)
let mut term = Term(vec![0u8; INT_TERM_LEN]);
term.set_field(field);
term.set_u64(val);
term
}
/// Sets a u64 value in the term.
///
/// U64 are serialized using (8-byte) BigEndian
/// representation.
/// The use of BigEndian has the benefit of preserving
/// the natural order of the values.
///
/// The buffer ends up being exactly `INT_TERM_LEN` bytes long:
/// the value is written right after the first four bytes.
pub fn set_u64(&mut self, val: u64) {
// Resizing keeps the leading bytes that are already present,
// zero-fills any missing ones, and drops anything beyond `INT_TERM_LEN`.
self.0.resize(INT_TERM_LEN, 0u8);
BigEndian::write_u64(&mut self.0[4..], val);
}
/// Builds a term given a field, and a u64-value
@@ -75,10 +87,21 @@ impl Term {
/// The first byte is 2, and the three following bytes are the utf-8
/// representation of "abc".
pub fn from_field_text(field: Field, text: &str) -> Term {
let mut buffer = vec![0u8; 4 + text.len()];
BigEndian::write_u32(&mut buffer[0..4], field.0);
buffer[4..].clone_from_slice(text.as_bytes());
Term(buffer)
let buffer = Vec::with_capacity(4 + text.len());
let mut term = Term(buffer);
term.set_field(field);
term.set_text(text);
term
}
/// Creates a new Term with an empty buffer,
/// but with a given capacity.
///
/// # Safety
///
/// It is declared unsafe, as the term content
/// is not initialized, and a call to `.field()`
/// would panic: the buffer is empty, while `.field()`
/// reads its first four bytes.
/// Set the field (see `set_field(...)`) before reading the term.
pub(crate) unsafe fn with_capacity(num_bytes: usize) -> Term {
Term(Vec::with_capacity(num_bytes))
}
/// Assume the term is a u64 field.
@@ -112,8 +135,8 @@ impl Term {
/// If the value is not valid utf-8. This may happen
/// if the index is corrupted or if you try to
/// call this method on a non-string type.
pub unsafe fn text(&self) -> &str {
str::from_utf8_unchecked(self.value())
pub fn text(&self) -> &str {
str::from_utf8(self.value()).expect("Term does not contain valid utf-8.")
}
/// Set the texts only, keeping the field untouched.