mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-18 01:00:40 +00:00
Code cleaning
This commit is contained in:
@@ -28,6 +28,7 @@ script:
|
||||
travis-cargo test &&
|
||||
travis-cargo bench &&
|
||||
travis-cargo doc
|
||||
- cargo run --example simple_search
|
||||
after_success:
|
||||
- bash ./script/build-doc.sh
|
||||
- travis-cargo doc-upload
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
Tantivy 0.4.0
|
||||
==========================
|
||||
|
||||
- Raise the limit of number of fields (previously 256 fields)
|
||||
- Removed u32 fields. They are replaced by u64 and i64 fields (#65)
|
||||
- Replacing rustc_serialize by serde. Kudos to @KodrAus and @lnicola
|
||||
- QueryParser:
|
||||
@@ -13,7 +13,6 @@ Tantivy 0.3.1
|
||||
==========================
|
||||
|
||||
- Expose a method to trigger files garbage collection
|
||||
- Raise the limit of number of fields (previously 256 fields)
|
||||
|
||||
|
||||
Tantivy 0.3
|
||||
|
||||
@@ -21,4 +21,5 @@ install:
|
||||
build: false
|
||||
|
||||
test_script:
|
||||
- REM SET RUST_LOG=tantivy,test & cargo test --verbose
|
||||
- REM SET RUST_LOG=tantivy,test & cargo test --verbose
|
||||
- REM SET RUST_LOG=tantivy,test & cargo run --example simple_search
|
||||
@@ -27,7 +27,6 @@ pub trait HasLen {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
const HIGHEST_BIT: u64 = 1 << 63;
|
||||
|
||||
|
||||
|
||||
@@ -175,9 +175,7 @@ mod tests {
|
||||
let mut term_it = searcher.terms();
|
||||
let mut terms = String::new();
|
||||
while let Some(term) = term_it.next() {
|
||||
unsafe {
|
||||
terms.push_str(term.text());
|
||||
}
|
||||
terms.push_str(term.text());
|
||||
}
|
||||
assert_eq!(terms, "abcdef");
|
||||
}
|
||||
|
||||
@@ -45,11 +45,6 @@ impl Heap {
|
||||
pub fn capacity(&self,) -> u32 {
|
||||
self.inner().capacity()
|
||||
}
|
||||
|
||||
/// Return the amount of memory that has been allocated so far.
|
||||
pub fn len(&self,) -> u32 {
|
||||
self.inner().len()
|
||||
}
|
||||
|
||||
/// Return amount of free space, in bytes.
|
||||
pub fn num_free_bytes(&self,) -> u32 {
|
||||
@@ -90,10 +85,6 @@ impl Heap {
|
||||
pub fn get_mut_ref<Item>(&self, addr: u32) -> &mut Item {
|
||||
self.inner().get_mut_ref(addr)
|
||||
}
|
||||
|
||||
pub fn get_ref<Item>(&self, addr: u32) -> &Item {
|
||||
self.inner().get_mut_ref(addr)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -108,8 +99,9 @@ struct InnerHeap {
|
||||
impl InnerHeap {
|
||||
|
||||
pub fn with_capacity(num_bytes: usize) -> InnerHeap {
|
||||
let buffer: Vec<u8> = vec![0u8; num_bytes];
|
||||
InnerHeap {
|
||||
buffer: vec![0u8; num_bytes],
|
||||
buffer: buffer,
|
||||
buffer_len: num_bytes as u32,
|
||||
next_heap: None,
|
||||
used: 0u32,
|
||||
@@ -124,10 +116,6 @@ impl InnerHeap {
|
||||
pub fn capacity(&self,) -> u32 {
|
||||
self.buffer.len() as u32
|
||||
}
|
||||
|
||||
pub fn len(&self,) -> u32 {
|
||||
self.used
|
||||
}
|
||||
|
||||
// Returns the number of free bytes. If the buffer
|
||||
// has reached its capacity and overflowed to another buffer, return 0.
|
||||
@@ -195,8 +183,6 @@ impl InnerHeap {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
fn get_mut_ref<Item>(&mut self, addr: u32) -> &mut Item {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap.as_mut().unwrap().get_mut_ref(addr - self.buffer_len)
|
||||
|
||||
@@ -120,7 +120,9 @@ pub fn open_index_writer(
|
||||
|
||||
let delete_queue = DeleteQueue::new();
|
||||
|
||||
let stamper = Stamper::new(index.opstamp());
|
||||
let current_opstamp = index.opstamp();
|
||||
|
||||
let stamper = Stamper::new(current_opstamp);
|
||||
|
||||
let segment_updater = SegmentUpdater::new(index.clone(),
|
||||
stamper.clone(),
|
||||
@@ -143,7 +145,7 @@ pub fn open_index_writer(
|
||||
|
||||
delete_queue: delete_queue,
|
||||
|
||||
committed_opstamp: index.opstamp(),
|
||||
committed_opstamp: current_opstamp,
|
||||
stamper: stamper,
|
||||
|
||||
generation: 0,
|
||||
|
||||
@@ -32,7 +32,7 @@ struct DeltaPositionComputer {
|
||||
impl DeltaPositionComputer {
|
||||
fn new() -> DeltaPositionComputer {
|
||||
DeltaPositionComputer {
|
||||
buffer: vec![0u32, 512]
|
||||
buffer: vec![0u32; 512]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -201,6 +201,8 @@ impl IndexMerger {
|
||||
}
|
||||
merged_doc_id_map.push(segment_local_map);
|
||||
}
|
||||
|
||||
let mut field = Field(u32::max_value());
|
||||
|
||||
while merged_terms.advance() {
|
||||
// Create the total list of doc ids
|
||||
@@ -231,15 +233,19 @@ impl IndexMerger {
|
||||
|
||||
// We can now serialize these postings by pushing each document to the
|
||||
// postings serializer.
|
||||
|
||||
for (segment_ord, mut segment_postings) in segment_postings {
|
||||
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
|
||||
while segment_postings.advance() {
|
||||
if let Some(remapped_doc_id) = old_to_new_doc_id[segment_postings.doc() as usize] {
|
||||
if !term_written {
|
||||
let current_field = term.field();
|
||||
if current_field != field {
|
||||
postings_serializer.new_field(current_field);
|
||||
field = current_field;
|
||||
}
|
||||
// we make sure to only write the term iff
|
||||
// there is at least one document.
|
||||
postings_serializer.new_term(&term)?;
|
||||
postings_serializer.new_term(term.as_slice())?;
|
||||
term_written = true;
|
||||
}
|
||||
let delta_positions: &[u32] =
|
||||
|
||||
@@ -77,7 +77,7 @@ pub fn save_metas(segment_metas: Vec<SegmentMeta>,
|
||||
schema: schema,
|
||||
opstamp: opstamp,
|
||||
};
|
||||
let mut w = try!(serde_json::to_vec(&metas));
|
||||
let mut w = try!(serde_json::to_vec_pretty(&metas));
|
||||
try!(write!(&mut w, "\n"));
|
||||
let res = directory.atomic_write(&META_FILEPATH, &w[..])?;
|
||||
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
|
||||
|
||||
@@ -76,9 +76,6 @@ impl<'a> SegmentWriter<'a> {
|
||||
/// Finalize consumes the `SegmentWriter`, so that it cannot
|
||||
/// be used afterwards.
|
||||
pub fn finalize(self) -> Result<Vec<u64>> {
|
||||
// for per_field_postings_writer in &mut self.per_field_postings_writers {
|
||||
// per_field_postings_writer.close(self.heap);
|
||||
// }
|
||||
write(&self.multifield_postings,
|
||||
&self.fast_field_writers,
|
||||
&self.fieldnorms_writer,
|
||||
@@ -149,7 +146,6 @@ impl<'a> SegmentWriter<'a> {
|
||||
for field_value in field_values {
|
||||
let term = Term::from_field_i64(field_value.field(), field_value.value().i64_value());
|
||||
self.multifield_postings.suscribe(doc_id, &term);
|
||||
// field_posting_writer.suscribe(term_index, doc_id, 0, &term, self.heap);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -126,7 +126,6 @@ pub use schema::{Term, Document};
|
||||
pub use core::SegmentReader;
|
||||
pub use self::common::TimerTree;
|
||||
|
||||
|
||||
pub use postings::DocSet;
|
||||
pub use postings::Postings;
|
||||
pub use postings::SegmentPostingsOption;
|
||||
|
||||
@@ -17,16 +17,15 @@ mod docset;
|
||||
mod segment_postings_option;
|
||||
|
||||
pub use self::docset::{SkipResult, DocSet};
|
||||
pub use self::recorder::{Recorder, NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
|
||||
use self::recorder::{Recorder, NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
|
||||
pub use self::serializer::PostingsSerializer;
|
||||
pub use self::postings_writer::PostingsWriter;
|
||||
pub use self::postings_writer::SpecializedPostingsWriter;
|
||||
pub use self::postings_writer::MultiFieldPostingsWriter;
|
||||
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
|
||||
pub use self::term_info::TermInfo;
|
||||
pub use self::postings::Postings;
|
||||
|
||||
#[cfg(test)]
|
||||
pub use self::vec_postings::VecPostings;
|
||||
|
||||
pub use self::segment_postings::SegmentPostings;
|
||||
pub use self::intersection::IntersectionDocSet;
|
||||
pub use self::freq_handler::FreqHandler;
|
||||
@@ -61,8 +60,8 @@ mod tests {
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut segment = index.new_segment();
|
||||
let mut posting_serializer = PostingsSerializer::open(&mut segment).unwrap();
|
||||
let term = Term::from_field_text(text_field, "abc");
|
||||
posting_serializer.new_term(&term).unwrap();
|
||||
posting_serializer.new_field(text_field);
|
||||
posting_serializer.new_term("abc".as_bytes()).unwrap();
|
||||
for doc_id in 0u32..3u32 {
|
||||
let positions = vec!(1,2,3,2);
|
||||
posting_serializer.write_doc(doc_id, 2, &positions).unwrap();
|
||||
|
||||
@@ -114,6 +114,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
|
||||
let (_, stop) = offsets[i+1];
|
||||
let postings_writer = &self.per_field_postings_writers[field.0 as usize];
|
||||
postings_writer.serialize(
|
||||
field,
|
||||
&term_offsets[start..stop],
|
||||
serializer,
|
||||
self.heap)?;
|
||||
@@ -144,7 +145,7 @@ pub trait PostingsWriter {
|
||||
|
||||
/// Serializes the postings on disk.
|
||||
/// The actual serialization format is handled by the `PostingsSerializer`.
|
||||
fn serialize(&self, term_addrs: &[(&[u8], u32)], serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
|
||||
fn serialize(&self, field: Field, term_addrs: &[(&[u8], u32)], serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
|
||||
|
||||
/// Tokenize a text and suscribe all of its token.
|
||||
fn index_text<'a>(&mut self,
|
||||
@@ -156,7 +157,8 @@ pub trait PostingsWriter {
|
||||
-> u32 {
|
||||
let mut pos = 0u32;
|
||||
let mut num_tokens: u32 = 0u32;
|
||||
let mut term = Term::allocate(field, 100);
|
||||
let mut term = unsafe { Term::with_capacity(100) };
|
||||
term.set_field(field);
|
||||
for field_value in field_values {
|
||||
let mut tokens = SimpleTokenizer.tokenize(field_value.value().text());
|
||||
// right now num_tokens and pos are redundant, but it should
|
||||
@@ -226,17 +228,19 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
|
||||
}
|
||||
|
||||
fn serialize(&self,
|
||||
field: Field,
|
||||
term_addrs: &[(&[u8], u32)],
|
||||
serializer: &mut PostingsSerializer,
|
||||
heap: &Heap) -> io::Result<()> {
|
||||
let mut term = Term::allocate(Field(0), 100);
|
||||
|
||||
serializer.new_field(field);
|
||||
for &(term_bytes, addr) in term_addrs {
|
||||
let recorder: &mut Rec = self.heap.get_mut_ref(addr);
|
||||
term.set_content(term_bytes);
|
||||
try!(serializer.new_term(&term));
|
||||
try!(serializer.new_term(&term_bytes));
|
||||
try!(recorder.serialize(addr, serializer, heap));
|
||||
try!(serializer.close_term());
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use Result;
|
||||
use datastruct::FstMapBuilder;
|
||||
use super::TermInfo;
|
||||
use schema::Term;
|
||||
use schema::Field;
|
||||
use schema::FieldEntry;
|
||||
use schema::FieldType;
|
||||
@@ -30,7 +29,7 @@ use common::BinarySerializable;
|
||||
///
|
||||
/// The serializer expects to receive the following calls
|
||||
/// in this order :
|
||||
///
|
||||
/// * `new_field(...)`
|
||||
/// * `new_term(...)`
|
||||
/// * `write_doc(...)`
|
||||
/// * `write_doc(...)`
|
||||
@@ -41,6 +40,8 @@ use common::BinarySerializable;
|
||||
/// * `write_doc(...)`
|
||||
/// * ...
|
||||
/// * `close_term()`
|
||||
/// * `new_field(...)`
|
||||
/// * ...
|
||||
/// * `close()`
|
||||
///
|
||||
/// Terms have to be pushed in a lexicographically-sorted order.
|
||||
@@ -105,7 +106,11 @@ impl PostingsSerializer {
|
||||
segment.schema())
|
||||
}
|
||||
|
||||
fn load_indexing_options(&mut self, field: Field) {
|
||||
/// Must be called before starting pushing terms of
|
||||
/// a given field.
|
||||
///
|
||||
/// Loads the indexing options for the given field.
|
||||
pub fn new_field(&mut self, field: Field) {
|
||||
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
|
||||
self.text_indexing_options = match *field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => text_options.get_indexing_options(),
|
||||
@@ -130,13 +135,11 @@ impl PostingsSerializer {
|
||||
/// * term - the term. It needs to come after the previous term according
|
||||
/// to the lexicographical order.
|
||||
/// * doc_freq - return the number of document containing the term.
|
||||
pub fn new_term(&mut self, term: &Term) -> io::Result<()> {
|
||||
pub fn new_term(&mut self, term: &[u8]) -> io::Result<()> {
|
||||
if self.term_open {
|
||||
panic!("Called new_term, while the previous term was not closed.");
|
||||
}
|
||||
self.term_open = true;
|
||||
// TODO avoid load indexing options all the time.
|
||||
self.load_indexing_options(term.field());
|
||||
self.doc_ids.clear();
|
||||
self.last_doc_id_encoded = 0;
|
||||
self.term_freqs.clear();
|
||||
@@ -146,7 +149,7 @@ impl PostingsSerializer {
|
||||
postings_offset: self.written_bytes_postings as u32,
|
||||
positions_offset: self.written_bytes_positions as u32,
|
||||
};
|
||||
self.terms_fst_builder.insert_key(term.as_slice())
|
||||
self.terms_fst_builder.insert_key(term)
|
||||
}
|
||||
|
||||
/// Finish the serialization for this term postings.
|
||||
|
||||
@@ -109,7 +109,7 @@ mod field;
|
||||
mod value;
|
||||
mod named_field_document;
|
||||
|
||||
pub use self::term::extract_field_from_term_bytes;
|
||||
pub(crate) use self::term::extract_field_from_term_bytes;
|
||||
pub use self::named_field_document::NamedFieldDocument;
|
||||
pub use self::schema::{Schema, SchemaBuilder};
|
||||
pub use self::value::Value;
|
||||
|
||||
@@ -1,11 +1,14 @@
|
||||
use std::fmt;
|
||||
|
||||
use common;
|
||||
use byteorder::{BigEndian, WriteBytesExt, ByteOrder};
|
||||
use byteorder::{BigEndian, ByteOrder};
|
||||
use super::Field;
|
||||
use std::str;
|
||||
|
||||
|
||||
/// Size (in bytes) of the buffer of an int field.
|
||||
const INT_TERM_LEN: usize = 4 + 8;
|
||||
|
||||
/// Term represents the value that the token can take.
|
||||
///
|
||||
/// It actually wraps a `Vec<u8>`.
|
||||
@@ -14,18 +17,11 @@ pub struct Term(Vec<u8>);
|
||||
|
||||
/// Extract `field` from Term.
|
||||
#[doc(hidden)]
|
||||
pub fn extract_field_from_term_bytes(term_bytes: &[u8]) -> Field {
|
||||
pub(crate) fn extract_field_from_term_bytes(term_bytes: &[u8]) -> Field {
|
||||
Field(BigEndian::read_u32(&term_bytes[..4]))
|
||||
}
|
||||
|
||||
impl Term {
|
||||
|
||||
/// Pre-allocate a term buffer.
|
||||
pub fn allocate(field: Field, num_bytes: usize) -> Term {
|
||||
let mut term = Term(Vec::with_capacity(num_bytes));
|
||||
term.0.write_u32::<BigEndian>(field.0).expect("serializing u32 to Vec<u8 should never fail>");
|
||||
term
|
||||
}
|
||||
|
||||
/// Set the content of the term.
|
||||
pub fn set_content(&mut self, content: &[u8]) {
|
||||
@@ -39,6 +35,14 @@ impl Term {
|
||||
extract_field_from_term_bytes(&self.0)
|
||||
}
|
||||
|
||||
/// Sets the field.
|
||||
pub fn set_field(&mut self, field: Field) {
|
||||
if self.0.len() < 4 {
|
||||
self.0.resize(4, 0u8);
|
||||
}
|
||||
BigEndian::write_u32(&mut self.0[0..4], field.0);
|
||||
}
|
||||
|
||||
/// Builds a term given a field, and a u64-value
|
||||
///
|
||||
/// Assuming the term has a field id of 1, and a u64 value of 3234,
|
||||
@@ -47,13 +51,21 @@ impl Term {
|
||||
/// The first four bytes are dedicated to storing the field id as a u32.
|
||||
/// The 8 following bytes encode the u64 value.
|
||||
pub fn from_field_u64(field: Field, val: u64) -> Term {
|
||||
const U64_TERM_LEN: usize = 4 + 8;
|
||||
let mut buffer = vec![0u8; U64_TERM_LEN];
|
||||
// we want BigEndian here to have lexicographic order
|
||||
// match the natural order of `(field, val)`
|
||||
BigEndian::write_u32(&mut buffer[0..4], field.0);
|
||||
BigEndian::write_u64(&mut buffer[4..], val);
|
||||
Term(buffer)
|
||||
let mut term = Term(vec![0u8; INT_TERM_LEN]);
|
||||
term.set_field(field);
|
||||
term.set_u64(val);
|
||||
term
|
||||
}
|
||||
|
||||
/// Sets a u64 value in the term.
|
||||
///
|
||||
/// U64 are serialized using (8-byte) BigEndian
|
||||
/// representation.
|
||||
/// The use of BigEndian has the benefit of preserving
|
||||
/// the natural order of the values.
|
||||
pub fn set_u64(&mut self, val: u64) {
|
||||
self.0.resize(INT_TERM_LEN, 0u8);
|
||||
BigEndian::write_u64(&mut self.0[4..], val);
|
||||
}
|
||||
|
||||
/// Builds a term given a field, and a u64-value
|
||||
@@ -75,10 +87,21 @@ impl Term {
|
||||
/// The first byte is 2, and the three following bytes are the utf-8
|
||||
/// representation of "abc".
|
||||
pub fn from_field_text(field: Field, text: &str) -> Term {
|
||||
let mut buffer = vec![0u8; 4 + text.len()];
|
||||
BigEndian::write_u32(&mut buffer[0..4], field.0);
|
||||
buffer[4..].clone_from_slice(text.as_bytes());
|
||||
Term(buffer)
|
||||
let buffer = Vec::with_capacity(4 + text.len());
|
||||
let mut term = Term(buffer);
|
||||
term.set_field(field);
|
||||
term.set_text(text);
|
||||
term
|
||||
}
|
||||
|
||||
/// Creates a new Term with an empty buffer,
|
||||
/// but with a given capacity.
|
||||
///
|
||||
/// It is declared unsafe, as the term content
|
||||
/// is not initialized, and a call to `.field()`
|
||||
/// would panic.
|
||||
pub(crate) unsafe fn with_capacity(num_bytes: usize) -> Term {
|
||||
Term(Vec::with_capacity(num_bytes))
|
||||
}
|
||||
|
||||
/// Assume the term is a u64 field.
|
||||
@@ -112,8 +135,8 @@ impl Term {
|
||||
/// If the value is not valid utf-8. This may happen
|
||||
/// if the index is corrupted or if you try to
|
||||
/// call this method on a non-string type.
|
||||
pub unsafe fn text(&self) -> &str {
|
||||
str::from_utf8_unchecked(self.value())
|
||||
pub fn text(&self) -> &str {
|
||||
str::from_utf8(self.value()).expect("Term does not contain valid utf-8.")
|
||||
}
|
||||
|
||||
/// Set the texts only, keeping the field untouched.
|
||||
|
||||
Reference in New Issue
Block a user