Let’s create a temporary directory for the
sake of this example
@@ -60,7 +60,7 @@ sake of this example
ifletOk(dir) = TempDir::new("tantivy_example_dir") {
run_example(dir.path()).unwrap();
dir.close().unwrap();
- }
+ }
}
@@ -78,7 +78,7 @@ sake of this example
Defining the schema
The Tantivy index requires a very strict schema.
The schema declares which fields are in the index,
-and for each field, its type and “the way it should
+and for each field, its type and “the way it should
be indexed”.
@@ -111,12 +111,12 @@ be indexed”.
We want full-text search for it, and we want to be able
to retrieve the document after the search.
TEXT | STORED is some syntactic sugar to describe
-that.
+that.
TEXT means the field should be tokenized and indexed,
along with its term frequency and term positions.
STORED means that the field will also be saved
in a compressed, row-oriented key-value store.
-This store is useful to reconstruct the
+This store is useful to reconstruct the
documents that were selected during the search phase.
@@ -139,7 +139,7 @@ to retrieve the body after the search.
schema_builder.add_text_field("body", TEXT);
-
+
let schema = schema_builder.build();
@@ -173,14 +173,12 @@ with our schema in the directory.
There must be only one writer at a time.
This single IndexWriter is already
multithreaded.
-
Here we use a buffer of 1 GB. Using a bigger
-heap for the indexer can increase its throughput.
-This buffer will be split between the indexing
-threads.
+
Here we use a buffer of 50MB per thread. Using a bigger
+heap for the indexer can increase its throughput.
@@ -213,10 +211,12 @@ one by one in a Document object.
let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap();
-
+
letmut old_man_doc = Document::default();
old_man_doc.add_text(title, "The Old Man and the Sea");
- old_man_doc.add_text(body, "He was an old man who fished alone in a skiff in the Gulf Stream and he had gone eighty-four days now without taking a fish.");
+ old_man_doc.add_text(body,
+ "He was an old man who fished alone in a skiff in the Gulf Stream and \
+ he had gone eighty-four days now without taking a fish.");
@@ -231,7 +231,7 @@ one by one in a Document object.
-
try!(index_writer.add_document(old_man_doc));
+
index_writer.add_document(old_man_doc);
@@ -248,13 +248,13 @@ a document object directly from json.
-
+
let mice_and_men_doc = try!(schema.parse_document(r#"{
"title": "Of Mice and Men",
"body": "few miles south of Soledad, the Salinas River drops in close to the hillside bank and runs deep and green. The water is warm too, for it has slipped twinkling over the yellow sands in the sunlight before reaching the narrow pool. On one side of the river the golden foothill slopes curve up to the strong and rocky Gabilan Mountains, but on the valley side the water is lined with trees—willows fresh and green with every spring, carrying in their lower leaf junctures the debris of the winter’s flooding; and sycamores with mottled, white,recumbent limbs and branches that arch over the pool"
}"#));
-
- try!(index_writer.add_document(mice_and_men_doc));
+
+ index_writer.add_document(mice_and_men_doc);
@@ -275,7 +275,7 @@ The following document has two titles.
"title": ["Frankenstein", "The Modern Promotheus"],
"body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."
}"#));
- try!(index_writer.add_document(frankenstein_doc));
+ index_writer.add_document(frankenstein_doc);
@@ -288,7 +288,7 @@ The following document has two titles.
This is an example, so we will only index 3 documents
here. You can check out tantivy’s tutorial to index
-the English wikipedia. Tantivy’s indexing is rather fast.
+the English wikipedia. Tantivy’s indexing is rather fast.
Indexing 5 million articles of the English wikipedia takes
around 4 minutes on my computer!
The query parser can interpret human queries.
-Here, if the user does not specify which
-field they want to search, tantivy will search
-in both title and body.
+
Afterwards create one (or more) searchers.
+
You should create a searcher
+every time you start a “search query”.
-
let query_parser = QueryParser::new(index.schema(), vec!(title, body));
The query parser can interpret human queries.
+Here, if the user does not specify which
+field they want to search, tantivy will search
+in both title and body.
+
+
+
+
let query_parser = QueryParser::new(index.schema(), vec![title, body]);
QueryParser may fail if the query is not in the right
format. For user facing applications, this can be a problem.
A ticket has been opened regarding this problem.
@@ -391,11 +406,11 @@ A ticket has been opened regarding this problem.
-
The actual documents still need to be
retrieved from Tantivy’s store.
Since the body field was not configured as stored,
the document returned will only contain
@@ -472,10 +486,10 @@ a title.
-
+
for doc_address in doc_addresses {
- let retrieved_doc = try!(searcher.doc(&doc_address));
- println!("{}", schema.to_json(&retrieved_doc));
+ let retrieved_doc = try!(searcher.doc(&doc_address));
+ println!("{}", schema.to_json(&retrieved_doc));
}
Ok(())
diff --git a/examples/simple_search.rs b/examples/simple_search.rs
index cff539b9d..430d7abf0 100644
--- a/examples/simple_search.rs
+++ b/examples/simple_search.rs
@@ -10,105 +10,105 @@ use tantivy::collector::TopCollector;
use tantivy::query::QueryParser;
fn main() {
- // Let's create a temporary directory for the
+ // Let's create a temporary directory for the
// sake of this example
if let Ok(dir) = TempDir::new("tantivy_example_dir") {
run_example(dir.path()).unwrap();
dir.close().unwrap();
- }
+ }
}
fn run_example(index_path: &Path) -> tantivy::Result<()> {
-
-
+
+
// # Defining the schema
//
// The Tantivy index requires a very strict schema.
// The schema declares which fields are in the index,
- // and for each field, its type and "the way it should
+ // and for each field, its type and "the way it should
// be indexed".
-
-
+
+
// first we need to define a schema ...
let mut schema_builder = SchemaBuilder::default();
-
+
// Our first field is title.
// We want full-text search for it, and we want to be able
// to retrieve the document after the search.
//
// TEXT | STORED is some syntactic sugar to describe
- // that.
- //
+ // that.
+ //
// `TEXT` means the field should be tokenized and indexed,
// along with its term frequency and term positions.
//
// `STORED` means that the field will also be saved
// in a compressed, row-oriented key-value store.
- // This store is useful to reconstruct the
+ // This store is useful to reconstruct the
// documents that were selected during the search phase.
schema_builder.add_text_field("title", TEXT | STORED);
-
+
// Our first field is body.
// We want full-text search for it, and we want to be able
// to retrieve the body after the search.
schema_builder.add_text_field("body", TEXT);
-
- let schema = schema_builder.build();
+
+ let schema = schema_builder.build();
// # Indexing documents
//
// Let's create a brand new index.
- //
+ //
// This will actually just save a meta.json
// with our schema in the directory.
let index = try!(Index::create(index_path, schema.clone()));
-
-
+
+
// To insert document we need an index writer.
// There must be only one writer at a time.
// This single `IndexWriter` is already
// multithreaded.
//
- // Here we use a buffer of 1 GB. Using a bigger
+ // Here we use a buffer of 50MB per thread. Using a bigger
// heap for the indexer can increase its throughput.
- // This buffer will be split between the indexing
- // threads.
- let mut index_writer = try!(index.writer(1_000_000_000));
+ let mut index_writer = try!(index.writer(50_000_000));
// Let's index our documents!
// We first need a handle on the title and the body field.
-
-
+
+
// ### Create a document "manually".
//
// We can create a document manually, by setting the fields
// one by one in a Document object.
let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap();
-
+
let mut old_man_doc = Document::default();
old_man_doc.add_text(title, "The Old Man and the Sea");
- old_man_doc.add_text(body, "He was an old man who fished alone in a skiff in the Gulf Stream and he had gone eighty-four days now without taking a fish.");
-
+ old_man_doc.add_text(body,
+ "He was an old man who fished alone in a skiff in the Gulf Stream and \
+ he had gone eighty-four days now without taking a fish.");
+
// ... and add it to the `IndexWriter`.
- try!(index_writer.add_document(old_man_doc));
-
+ index_writer.add_document(old_man_doc);
+
// ### Create a document directly from json.
//
// Alternatively, we can use our schema to parse
// a document object directly from json.
-
+
let mice_and_men_doc = try!(schema.parse_document(r#"{
"title": "Of Mice and Men",
"body": "few miles south of Soledad, the Salinas River drops in close to the hillside bank and runs deep and green. The water is warm too, for it has slipped twinkling over the yellow sands in the sunlight before reaching the narrow pool. On one side of the river the golden foothill slopes curve up to the strong and rocky Gabilan Mountains, but on the valley side the water is lined with trees—willows fresh and green with every spring, carrying in their lower leaf junctures the debris of the winter’s flooding; and sycamores with mottled, white,recumbent limbs and branches that arch over the pool"
}"#));
-
- try!(index_writer.add_document(mice_and_men_doc));
-
+
+ index_writer.add_document(mice_and_men_doc);
+
// Multi-valued field are allowed, they are
// expressed in JSON by an array.
// The following document has two titles.
@@ -116,20 +116,20 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
"title": ["Frankenstein", "The Modern Promotheus"],
"body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."
}"#));
- try!(index_writer.add_document(frankenstein_doc));
-
+ index_writer.add_document(frankenstein_doc);
+
// This is an example, so we will only index 3 documents
// here. You can check out tantivy's tutorial to index
- // the English wikipedia. Tantivy's indexing is rather fast.
+ // the English wikipedia. Tantivy's indexing is rather fast.
// Indexing 5 million articles of the English wikipedia takes
// around 4 minutes on my computer!
-
-
+
+
// ### Committing
- //
+ //
// At this point our documents are not searchable.
//
- //
+ //
// We need to call .commit() explicitly to force the
// index_writer to finish processing the documents in the queue,
// flush the current index to the disk, and advertise
@@ -137,22 +137,25 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
//
// This call is blocking.
try!(index_writer.commit());
-
+
// If `.commit()` returns correctly, then all of the
// documents that have been added are guaranteed to be
// persistently indexed.
- //
+ //
// In the scenario of a crash or a power failure,
// tantivy behaves as if has rolled back to its last
// commit.
-
-
+
+
// # Searching
//
- // Let's search our index. We start
- // by creating a searcher. There can be more
- // than one searcher at a time.
- //
+ // Let's search our index. Start by reloading
+ // searchers in the index. This should be done
+ // after every commit().
+ try!(index.load_searchers());
+
+ // Afterwards create one (or more) searchers.
+ //
// You should create a searcher
// every time you start a "search query".
let searcher = index.searcher();
@@ -161,46 +164,45 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
// Here, if the user does not specify which
// field they want to search, tantivy will search
// in both title and body.
- let query_parser = QueryParser::new(index.schema(), vec!(title, body));
-
+ let query_parser = QueryParser::new(index.schema(), vec![title, body]);
+
// QueryParser may fail if the query is not in the right
// format. For user facing applications, this can be a problem.
// A ticket has been opened regarding this problem.
let query = try!(query_parser.parse_query("sea whale"));
-
-
+
+
// A query defines a set of documents, as
// well as the way they should be scored.
- //
+ //
// A query created by the query parser is scored according
// to a metric called Tf-Idf, and will consider
// any document matching at least one of our terms.
-
- // ### Collectors
+
+ // ### Collectors
//
- // We are not interested in all of the documents but
+ // We are not interested in all of the documents but
// only in the top 10. Keeping track of our top 10 best documents
// is the role of the TopCollector.
-
let mut top_collector = TopCollector::with_limit(10);
-
+
// We can now perform our query.
try!(searcher.search(&*query, &mut top_collector));
- // Our top collector now contains the 10
+ // Our top collector now contains the 10
// most relevant doc ids...
let doc_addresses = top_collector.docs();
- // The actual documents still need to be
+ // The actual documents still need to be
// retrieved from Tantivy's store.
- //
+ //
// Since the body field was not configured as stored,
// the document returned will only contain
// a title.
-
+
for doc_address in doc_addresses {
- let retrieved_doc = try!(searcher.doc(&doc_address));
- println!("{}", schema.to_json(&retrieved_doc));
+ let retrieved_doc = try!(searcher.doc(&doc_address));
+ println!("{}", schema.to_json(&retrieved_doc));
}
Ok(())
diff --git a/src/collector/chained_collector.rs b/src/collector/chained_collector.rs
index 5840eb775..524ffec58 100644
--- a/src/collector/chained_collector.rs
+++ b/src/collector/chained_collector.rs
@@ -1,7 +1,7 @@
+use Result;
use collector::Collector;
use SegmentLocalId;
use SegmentReader;
-use std::io;
use DocId;
use Score;
@@ -12,7 +12,7 @@ use Score;
pub struct DoNothingCollector;
impl Collector for DoNothingCollector {
#[inline]
- fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> io::Result<()> {
+ fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
Ok(())
}
#[inline]
@@ -38,7 +38,7 @@ impl ChainedCollector {
}
impl Collector for ChainedCollector {
- fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()> {
+ fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()> {
try!(self.left.set_segment(segment_local_id, segment));
try!(self.right.set_segment(segment_local_id, segment));
Ok(())
diff --git a/src/collector/count_collector.rs b/src/collector/count_collector.rs
index 8a9014a25..ff15abd73 100644
--- a/src/collector/count_collector.rs
+++ b/src/collector/count_collector.rs
@@ -1,7 +1,7 @@
-use std::io;
use super::Collector;
use DocId;
use Score;
+use Result;
use SegmentReader;
use SegmentLocalId;
@@ -28,7 +28,7 @@ impl Default for CountCollector {
impl Collector for CountCollector {
- fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> io::Result<()> {
+ fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
Ok(())
}
diff --git a/src/collector/mod.rs b/src/collector/mod.rs
index 84bc38485..75c22aded 100644
--- a/src/collector/mod.rs
+++ b/src/collector/mod.rs
@@ -2,7 +2,7 @@ use SegmentReader;
use SegmentLocalId;
use DocId;
use Score;
-use std::io;
+use Result;
mod count_collector;
pub use self::count_collector::CountCollector;
@@ -48,14 +48,14 @@ pub use self::chained_collector::chain;
pub trait Collector {
/// `set_segment` is called before beginning to enumerate
/// on this segment.
- fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()>;
+ fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()>;
/// The query pushes the scored document to the collector via this method.
fn collect(&mut self, doc: DocId, score: Score);
}
impl<'a, C: Collector> Collector for &'a mut C {
- fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()> {
+ fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()> {
(*self).set_segment(segment_local_id, segment)
}
/// The query pushes the scored document to the collector via this method.
@@ -73,7 +73,6 @@ pub mod tests {
use DocId;
use Score;
use core::SegmentReader;
- use std::io;
use SegmentLocalId;
use fastfield::U32FastFieldReader;
use schema::Field;
@@ -107,7 +106,7 @@ pub mod tests {
impl Collector for TestCollector {
- fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> {
+ fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.offset += self.segment_max_doc;
self.segment_max_doc = reader.max_doc();
Ok(())
@@ -140,14 +139,14 @@ pub mod tests {
}
}
- pub fn vals(&self,) -> &Vec {
- &self.vals
+ pub fn vals(self,) -> Vec {
+ self.vals
}
}
impl Collector for FastFieldTestCollector {
- fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> {
- self.ff_reader = Some(try!(reader.get_fast_field_reader(self.field)));
+ fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
+ self.ff_reader = reader.get_fast_field_reader(self.field);
Ok(())
}
diff --git a/src/collector/multi_collector.rs b/src/collector/multi_collector.rs
index 6ce999e80..e5eddc7f4 100644
--- a/src/collector/multi_collector.rs
+++ b/src/collector/multi_collector.rs
@@ -1,7 +1,7 @@
-use std::io;
use super::Collector;
use DocId;
use Score;
+use Result;
use SegmentReader;
use SegmentLocalId;
@@ -25,7 +25,7 @@ impl<'a> MultiCollector<'a> {
impl<'a> Collector for MultiCollector<'a> {
- fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()> {
+ fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()> {
for collector in &mut self.collectors {
try!(collector.set_segment(segment_local_id, segment));
}
diff --git a/src/collector/top_collector.rs b/src/collector/top_collector.rs
index 21c023caf..6425eb300 100644
--- a/src/collector/top_collector.rs
+++ b/src/collector/top_collector.rs
@@ -1,8 +1,8 @@
-use std::io;
use super::Collector;
use SegmentReader;
use SegmentLocalId;
use DocAddress;
+use Result;
use std::collections::BinaryHeap;
use std::cmp::Ordering;
use DocId;
@@ -105,7 +105,7 @@ impl TopCollector {
impl Collector for TopCollector {
- fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> io::Result<()> {
+ fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<()> {
self.segment_id = segment_id;
Ok(())
}
diff --git a/src/common/mod.rs b/src/common/mod.rs
index b0835c082..4812b8700 100644
--- a/src/common/mod.rs
+++ b/src/common/mod.rs
@@ -4,6 +4,7 @@ mod vint;
pub mod bitpacker;
mod counting_writer;
+
pub use self::serialize::BinarySerializable;
pub use self::timer::Timing;
pub use self::timer::TimerTree;
@@ -13,6 +14,7 @@ pub use self::counting_writer::CountingWriter;
use std::io;
+/// Create a default io error given a string.
pub fn make_io_err(msg: String) -> io::Error {
io::Error::new(io::ErrorKind::Other, msg)
}
@@ -30,3 +32,14 @@ pub trait HasLen {
}
+/// Creates an uninitialized Vec of a given length
+///
+/// `allocate_vec` does an unsafe call to `set_len`
+/// as other solutions are extremely slow in debug mode.
+pub fn allocate_vec(capacity: usize) -> Vec {
+ let mut v = Vec::with_capacity(capacity);
+ unsafe {
+ v.set_len(capacity);
+ }
+ v
+}
diff --git a/src/common/serialize.rs b/src/common/serialize.rs
index b1ffab6cd..6bd1426fe 100644
--- a/src/common/serialize.rs
+++ b/src/common/serialize.rs
@@ -74,7 +74,6 @@ impl BinarySerializable for u64 {
impl BinarySerializable for u8 {
fn serialize(&self, writer: &mut Write) -> io::Result {
- // TODO error
try!(writer.write_u8(*self));
Ok(1)
}
diff --git a/src/compression/compression_simd.rs b/src/compression/compression_simd.rs
index 605d0ec03..308e13445 100644
--- a/src/compression/compression_simd.rs
+++ b/src/compression/compression_simd.rs
@@ -1,44 +1,45 @@
-
use super::NUM_DOCS_PER_BLOCK;
-use libc::size_t;
-
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;
-extern {
- fn compress_sorted_cpp(
- data: *const u32,
- output: *mut u8,
- offset: u32) -> size_t;
+mod simdcomp {
+ use libc::size_t;
- fn uncompress_sorted_cpp(
- compressed_data: *const u8,
- output: *mut u32,
- offset: u32) -> size_t;
-
- fn compress_unsorted_cpp(
- data: *const u32,
- output: *mut u8) -> size_t;
+ extern {
+ pub fn compress_sorted(
+ data: *const u32,
+ output: *mut u8,
+ offset: u32) -> size_t;
- fn uncompress_unsorted_cpp(
- compressed_data: *const u8,
- output: *mut u32) -> size_t;
+ pub fn uncompress_sorted(
+ compressed_data: *const u8,
+ output: *mut u32,
+ offset: u32) -> size_t;
+
+ pub fn compress_unsorted(
+ data: *const u32,
+ output: *mut u8) -> size_t;
+
+ pub fn uncompress_unsorted(
+ compressed_data: *const u8,
+ output: *mut u32) -> size_t;
+ }
}
fn compress_sorted(vals: &[u32], output: &mut [u8], offset: u32) -> usize {
- unsafe { compress_sorted_cpp(vals.as_ptr(), output.as_mut_ptr(), offset) }
+ unsafe { simdcomp::compress_sorted(vals.as_ptr(), output.as_mut_ptr(), offset) }
}
fn uncompress_sorted(compressed_data: &[u8], output: &mut [u32], offset: u32) -> usize {
- unsafe { uncompress_sorted_cpp(compressed_data.as_ptr(), output.as_mut_ptr(), offset) }
+ unsafe { simdcomp::uncompress_sorted(compressed_data.as_ptr(), output.as_mut_ptr(), offset) }
}
fn compress_unsorted(vals: &[u32], output: &mut [u8]) -> usize {
- unsafe { compress_unsorted_cpp(vals.as_ptr(), output.as_mut_ptr()) }
+ unsafe { simdcomp::compress_unsorted(vals.as_ptr(), output.as_mut_ptr()) }
}
fn uncompress_unsorted(compressed_data: &[u8], output: &mut [u32]) -> usize {
- unsafe { uncompress_unsorted_cpp(compressed_data.as_ptr(), output.as_mut_ptr()) }
+ unsafe { simdcomp::uncompress_unsorted(compressed_data.as_ptr(), output.as_mut_ptr()) }
}
diff --git a/src/core/index.rs b/src/core/index.rs
index d29cc1fc3..cb97ba569 100644
--- a/src/core/index.rs
+++ b/src/core/index.rs
@@ -2,59 +2,53 @@ use Result;
use Error;
use schema::Schema;
use std::sync::Arc;
+use std::borrow::BorrowMut;
use std::fmt;
use rustc_serialize::json;
use core::SegmentId;
use directory::{Directory, MmapDirectory, RAMDirectory};
-use indexer::IndexWriter;
+use indexer::index_writer::open_index_writer;
use core::searcher::Searcher;
use std::convert::From;
use num_cpus;
use super::segment::Segment;
use core::SegmentReader;
use super::pool::Pool;
+use core::SegmentMeta;
use super::pool::LeasedItem;
use std::path::Path;
-use indexer::SegmentManager;
use core::IndexMeta;
+use IndexWriter;
+use directory::ManagedDirectory;
use core::META_FILEPATH;
use super::segment::create_segment;
use indexer::segment_updater::save_new_metas;
const NUM_SEARCHERS: usize = 12;
-/// Accessor to the index segment manager
-///
-/// This method is not part of tantivy's public API
-pub fn get_segment_manager(index: &Index) -> Arc {
- index.segment_manager.clone()
-}
-
-
fn load_metas(directory: &Directory) -> Result {
- let meta_file = try!(directory.open_read(&META_FILEPATH));
- let meta_content = String::from_utf8_lossy(meta_file.as_slice());
- json::decode(&meta_content)
+ let meta_data = directory.atomic_read(&META_FILEPATH)?;
+ let meta_string = String::from_utf8_lossy(&meta_data);
+ json::decode(&meta_string)
.map_err(|e| Error::CorruptedFile(META_FILEPATH.clone(), Box::new(e)))
}
/// Tantivy's Search Index
pub struct Index {
- segment_manager: Arc,
-
- directory: Box,
+ directory: ManagedDirectory,
schema: Schema,
searcher_pool: Arc>,
- docstamp: u64,
}
+
impl Index {
/// Creates a new index using the `RAMDirectory`.
///
/// The index will be allocated in anonymous memory.
/// This should only be used for unit tests.
pub fn create_in_ram(schema: Schema) -> Index {
- let directory = Box::new(RAMDirectory::create());
+ let ram_directory = RAMDirectory::create();
+ let directory = ManagedDirectory::new(ram_directory).expect("Creating a managed directory from a brand new RAM directory should never fail.");
Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail") // unwrap is ok here
}
@@ -63,9 +57,9 @@ impl Index {
///
/// If a previous index was in this directory, then its meta file will be destroyed.
pub fn create(directory_path: &Path, schema: Schema) -> Result {
- let mut directory = MmapDirectory::open(directory_path)?;
- save_new_metas(schema.clone(), 0, &mut directory)?;
- Index::from_directory(box directory, schema)
+ let mmap_directory = MmapDirectory::open(directory_path)?;
+ let directory = ManagedDirectory::new(mmap_directory)?;
+ Index::from_directory(directory, schema)
}
/// Creates a new index in a temp directory.
@@ -77,49 +71,55 @@ impl Index {
/// The temp directory is only used for testing the `MmapDirectory`.
/// For other unit tests, prefer the `RAMDirectory`, see: `create_in_ram`.
pub fn create_from_tempdir(schema: Schema) -> Result {
- let directory = Box::new(try!(MmapDirectory::create_from_tempdir()));
+ let mmap_directory = MmapDirectory::create_from_tempdir()?;
+ let directory = ManagedDirectory::new(mmap_directory)?;
Index::from_directory(directory, schema)
}
/// Creates a new index given a directory and an `IndexMeta`.
- fn create_from_metas(directory: Box, metas: IndexMeta) -> Result {
+ fn create_from_metas(directory: ManagedDirectory, metas: IndexMeta) -> Result {
let schema = metas.schema.clone();
- let docstamp = metas.docstamp;
- let committed_segments = metas.committed_segments;
- // TODO log somethings is uncommitted is not empty.
let index = Index {
- segment_manager: Arc::new(SegmentManager::from_segments(committed_segments)),
directory: directory,
schema: schema,
searcher_pool: Arc::new(Pool::new()),
- docstamp: docstamp,
};
try!(index.load_searchers());
Ok(index)
}
- /// Opens a new directory from a directory.
- pub fn from_directory(directory: Box, schema: Schema) -> Result {
+ /// Create a new index from a directory.
+ pub fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result {
+ save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
Index::create_from_metas(directory, IndexMeta::with_schema(schema))
}
/// Opens a new directory from an index path.
pub fn open(directory_path: &Path) -> Result {
- let directory = try!(MmapDirectory::open(directory_path));
- let metas = try!(load_metas(&directory)); //< TODO does the directory already exists?
- Index::create_from_metas(directory.box_clone(), metas)
+ let mmap_directory = MmapDirectory::open(directory_path)?;
+ let directory = ManagedDirectory::new(mmap_directory)?;
+ let metas = try!(load_metas(&directory));
+ Index::create_from_metas(directory, metas)
}
- /// Returns the index docstamp.
+ /// Returns the index opstamp.
///
- /// The docstamp is the number of documents that have been added
+ /// The opstamp is the number of documents that have been added
/// from the beginning of time, and until the moment of the last commit.
- pub fn docstamp(&self) -> u64 {
- self.docstamp
+ pub fn opstamp(&self) -> u64 {
+ load_metas(self.directory()).unwrap().opstamp
}
- /// Creates a multithreaded writer.
- /// Each writer produces an independent segment.
+ /// Open a new index writer. Attempts to acquire a lockfile.
+ ///
+ /// The lockfile should be deleted on drop, but it is possible
+ /// that due to a panic or other error, a stale lockfile will be
+ /// left in the index directory. If you are sure that no other
+ /// `IndexWriter` on the system is accessing the index directory,
+ /// it is safe to manually delete the lockfile.
+ ///
+ /// num_threads specifies the number of indexing workers that
+ /// should work at the same time.
///
/// # Errors
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
@@ -129,12 +129,13 @@ impl Index {
num_threads: usize,
heap_size_in_bytes: usize)
-> Result {
- IndexWriter::open(self, num_threads, heap_size_in_bytes)
+ open_index_writer(self, num_threads, heap_size_in_bytes)
}
/// Creates a multithreaded writer
/// It just calls `writer_with_num_threads` with the number of cores as `num_threads`
+ ///
/// # Errors
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// # Panics
@@ -151,47 +152,47 @@ impl Index {
}
/// Returns the list of segments that are searchable
- pub fn searchable_segments(&self) -> Vec {
- self.searchable_segment_ids()
+ pub fn searchable_segments(&self) -> Result> {
+ Ok(self
+ .searchable_segment_metas()?
.into_iter()
- .map(|segment_id| self.segment(segment_id))
- .collect()
+ .map(|segment_meta| self.segment(segment_meta))
+ .collect())
}
- /// Remove all of the file associated with the segment.
- ///
- /// This method cannot fail. If a problem occurs,
- /// some files may end up never being removed.
- /// The error will only be logged.
- pub fn delete_segment(&self, segment_id: SegmentId) {
- self.segment(segment_id).delete();
- }
-
- /// Return a segment object given a `segment_id`
- ///
- /// The segment may or may not exist.
- pub fn segment(&self, segment_id: SegmentId) -> Segment {
- create_segment(self.clone(), segment_id)
- }
-
- /// Return a reference to the index directory.
- pub fn directory(&self) -> &Directory {
- &*self.directory
- }
-
- /// Return a mutable reference to the index directory.
- pub fn directory_mut(&mut self) -> &mut Directory {
- &mut *self.directory
- }
-
- /// Returns the list of segment ids that are searchable.
- fn searchable_segment_ids(&self) -> Vec {
- self.segment_manager.committed_segments()
+ #[doc(hidden)]
+ pub fn segment(&self, segment_meta: SegmentMeta) -> Segment {
+ create_segment(self.clone(), segment_meta)
}
/// Creates a new segment.
pub fn new_segment(&self) -> Segment {
- self.segment(SegmentId::generate_random())
+ let segment_meta = SegmentMeta::new(SegmentId::generate_random());
+ create_segment(self.clone(), segment_meta)
+ }
+
+ /// Return a reference to the index directory.
+ pub fn directory(&self) -> &ManagedDirectory {
+ &self.directory
+ }
+
+ /// Return a mutable reference to the index directory.
+ pub fn directory_mut(&mut self) -> &mut ManagedDirectory {
+ &mut self.directory
+ }
+
+ /// Reads the meta.json and returns the list of
+ /// `SegmentMeta` from the last commit.
+ pub fn searchable_segment_metas(&self) -> Result> {
+ Ok(load_metas(self.directory())?.segments)
+ }
+
+ /// Returns the list of segment ids that are searchable.
+ pub fn searchable_segment_ids(&self) -> Result> {
+ Ok(self.searchable_segment_metas()?
+ .iter()
+ .map(|segment_meta| segment_meta.id())
+ .collect())
}
/// Creates a new generation of searchers after
@@ -200,16 +201,14 @@ impl Index {
/// This needs to be called when a new segment has been
/// published or after a merge.
pub fn load_searchers(&self) -> Result<()> {
- let searchable_segments = self.searchable_segments();
- let mut searchers = Vec::new();
- for _ in 0..NUM_SEARCHERS {
- let searchable_segments_clone = searchable_segments.clone();
- let segment_readers: Vec = try!(searchable_segments_clone.into_iter()
+ let searchable_segments = self.searchable_segments()?;
+ let segment_readers: Vec = try!(searchable_segments
+ .into_iter()
.map(SegmentReader::open)
.collect());
- let searcher = Searcher::from(segment_readers);
- searchers.push(searcher);
- }
+ let searchers = (0..NUM_SEARCHERS)
+ .map(|_| Searcher::from(segment_readers.clone()))
+ .collect();
self.searcher_pool.publish_new_generation(searchers);
Ok(())
}
@@ -239,12 +238,9 @@ impl fmt::Debug for Index {
impl Clone for Index {
fn clone(&self) -> Index {
Index {
- segment_manager: self.segment_manager.clone(),
-
- directory: self.directory.box_clone(),
+ directory: self.directory.clone(),
schema: self.schema.clone(),
searcher_pool: self.searcher_pool.clone(),
- docstamp: self.docstamp,
}
}
}
diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs
index a2623f9d0..8a0274b4e 100644
--- a/src/core/index_meta.rs
+++ b/src/core/index_meta.rs
@@ -1,7 +1,5 @@
-
use schema::Schema;
-use core::SegmentId;
-
+use core::SegmentMeta;
/// Meta information about the `Index`.
///
@@ -13,35 +11,17 @@ use core::SegmentId;
///
#[derive(Clone,Debug,RustcDecodable,RustcEncodable)]
pub struct IndexMeta {
- pub committed_segments: Vec,
- pub uncommitted_segments: Vec,
+ pub segments: Vec,
pub schema: Schema,
- pub docstamp: u64,
+ pub opstamp: u64,
}
impl IndexMeta {
pub fn with_schema(schema: Schema) -> IndexMeta {
IndexMeta {
- committed_segments: Vec::new(),
- uncommitted_segments: Vec::new(),
+ segments: vec!(),
schema: schema,
- docstamp: 0u64,
+ opstamp: 0u64,
}
}
}
-
-#[derive(Clone, Debug, RustcDecodable,RustcEncodable)]
-pub struct SegmentMeta {
- pub segment_id: SegmentId,
- pub num_docs: u32,
-}
-
-#[cfg(test)]
-impl SegmentMeta {
- pub fn new(segment_id: SegmentId, num_docs: u32) -> SegmentMeta {
- SegmentMeta {
- segment_id: segment_id,
- num_docs: num_docs,
- }
- }
-}
\ No newline at end of file
diff --git a/src/core/mod.rs b/src/core/mod.rs
index 2dfac69d1..6f7cb9edc 100644
--- a/src/core/mod.rs
+++ b/src/core/mod.rs
@@ -1,5 +1,4 @@
pub mod searcher;
-
pub mod index;
mod segment_reader;
mod segment_id;
@@ -7,20 +6,38 @@ mod segment_component;
mod segment;
mod index_meta;
mod pool;
+mod segment_meta;
mod term_iterator;
-use std::path::PathBuf;
-
+pub use self::searcher::Searcher;
pub use self::segment_component::SegmentComponent;
pub use self::segment_id::SegmentId;
pub use self::segment_reader::SegmentReader;
pub use self::segment::Segment;
-pub use self::segment::SegmentInfo;
pub use self::segment::SerializableSegment;
pub use self::index::Index;
-pub use self::index_meta::{IndexMeta, SegmentMeta};
+pub use self::segment_meta::SegmentMeta;
+pub use self::index_meta::IndexMeta;
pub use self::term_iterator::TermIterator;
+
+use std::path::PathBuf;
+
lazy_static! {
+ /// The meta file contains all the information about the list of segments and the schema
+ /// of the index.
pub static ref META_FILEPATH: PathBuf = PathBuf::from("meta.json");
+
+ /// The managed file contains a list of files that were created by tantivy
+ /// and will therefore be garbage collected when they are deemed useless by tantivy.
+ ///
+ /// Removing this file is safe, but will prevent the garbage collection of all of the files that
+ /// are currently in the directory.
+ pub static ref MANAGED_FILEPATH: PathBuf = PathBuf::from(".managed.json");
+
+ /// Only one process should be able to write tantivy's index at a time.
+ /// This file, when present, is in charge of preventing other processes from opening an IndexWriter.
+ ///
+ /// If the process is killed and this file remains, it is safe to remove it manually.
+ pub static ref LOCKFILE_FILEPATH: PathBuf = PathBuf::from(".tantivy-indexer.lock");
}
\ No newline at end of file
diff --git a/src/core/searcher.rs b/src/core/searcher.rs
index 0d99a2897..839e00172 100644
--- a/src/core/searcher.rs
+++ b/src/core/searcher.rs
@@ -8,6 +8,7 @@ use DocId;
use DocAddress;
use schema::Term;
use core::TermIterator;
+use std::fmt;
/// Holds a list of `SegmentReader`s ready for search.
@@ -15,13 +16,13 @@ use core::TermIterator;
/// It guarantees that the `Segment` will not be removed before
/// the destruction of the `Searcher`.
///
-#[derive(Debug)]
pub struct Searcher {
segment_readers: Vec,
}
+
impl Searcher {
-
+
/// Fetches a document from tantivy's store given a `DocAddress`.
///
/// The searcher uses the segment ordinal to route the
@@ -83,4 +84,14 @@ impl From> for Searcher {
segment_readers: segment_readers,
}
}
+}
+
+impl fmt::Debug for Searcher {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let segment_ids = self.segment_readers
+ .iter()
+ .map(|segment_reader| segment_reader.segment_id())
+ .collect::>();
+ write!(f, "Searcher({:?})", segment_ids)
+ }
}
\ No newline at end of file
diff --git a/src/core/segment.rs b/src/core/segment.rs
index 3e8bc9a42..c99d36e85 100644
--- a/src/core/segment.rs
+++ b/src/core/segment.rs
@@ -1,97 +1,89 @@
use Result;
use std::path::PathBuf;
use schema::Schema;
-use DocId;
use std::fmt;
use core::SegmentId;
-use directory::{ReadOnlySource, WritePtr};
+use directory::{ReadOnlySource, WritePtr, FileProtection};
use indexer::segment_serializer::SegmentSerializer;
use super::SegmentComponent;
use core::Index;
use std::result;
-use directory::error::{FileError, OpenWriteError};
-
-
+use directory::Directory;
+use core::SegmentMeta;
+use directory::error::{OpenReadError, OpenWriteError};
/// A segment is a piece of the index.
#[derive(Clone)]
pub struct Segment {
index: Index,
- segment_id: SegmentId,
+ meta: SegmentMeta,
}
impl fmt::Debug for Segment {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- write!(f, "Segment({:?})", self.segment_id.uuid_string())
+ write!(f, "Segment({:?})", self.id().uuid_string())
}
}
-
/// Creates a new segment given an `Index` and a `SegmentId`
///
/// The function is here to make it private outside `tantivy`.
-pub fn create_segment(index: Index, segment_id: SegmentId) -> Segment {
+pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment {
Segment {
index: index,
- segment_id: segment_id,
+ meta: meta,
}
}
impl Segment {
-
-
+
/// Returns our index's schema.
pub fn schema(&self,) -> Schema {
self.index.schema()
}
+ /// Returns the segment meta-information
+ pub fn meta(&self) -> &SegmentMeta {
+ &self.meta
+ }
+
+ #[doc(hidden)]
+ pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) {
+ self.meta.set_delete_meta(num_deleted_docs, opstamp);
+ }
+
/// Returns the segment's id.
pub fn id(&self,) -> SegmentId {
- self.segment_id
+ self.meta.id()
}
-
/// Returns the relative path of a component of our segment.
///
/// It just joins the segment id with the extension
/// associated to a segment component.
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
- self.segment_id.relative_path(component)
+ self.meta.relative_path(component)
}
- /// Deletes all of the document of the segment.
- /// This is called when there is a merge or a rollback.
+
+ /// Protects a specific component file from being deleted.
///
- /// # Disclaimer
- /// If deletion of a file fails (e.g. a file
- /// was read-only.), the method does not
- /// fail and just logs an error
- pub fn delete(&self,) {
- for component in SegmentComponent::values() {
- let rel_path = self.relative_path(component);
- if let Err(err) = self.index.directory().delete(&rel_path) {
- match err {
- FileError::FileDoesNotExist(_) => {
- // this is normal behavior.
- // the position file for instance may not exists.
- }
- FileError::IOError(err) => {
- error!("Failed to remove {:?} : {:?}", self.segment_id, err);
- }
- }
- }
- }
+ /// Returns a FileProtection object. The file is guaranteed
+ /// to not be garbage collected as long as this `FileProtection` object
+ /// lives.
+ pub fn protect_from_delete(&self, component: SegmentComponent) -> FileProtection {
+ let path = self.relative_path(component);
+ self.index.directory().protect_file_from_delete(&path)
}
-
- /// Open one of the component file for read.
- pub fn open_read(&self, component: SegmentComponent) -> result::Result {
+ /// Open one of the component files for a *regular* read.
+ pub fn open_read(&self, component: SegmentComponent) -> result::Result {
let path = self.relative_path(component);
let source = try!(self.index.directory().open_read(&path));
Ok(source)
}
- /// Open one of the component file for write.
+ /// Open one of the component files for a *regular* write.
pub fn open_write(&mut self, component: SegmentComponent) -> result::Result {
let path = self.relative_path(component);
let write = try!(self.index.directory_mut().open_write(&path));
@@ -108,7 +100,34 @@ pub trait SerializableSegment {
fn write(&self, serializer: SegmentSerializer) -> Result;
}
-#[derive(Clone,Debug,RustcDecodable,RustcEncodable)]
-pub struct SegmentInfo {
- pub max_doc: DocId,
+#[cfg(test)]
+mod tests {
+
+ use core::SegmentComponent;
+ use directory::Directory;
+ use std::collections::HashSet;
+ use schema::SchemaBuilder;
+ use Index;
+
+ #[test]
+ fn test_segment_protect_component() {
+ let mut index = Index::create_in_ram(SchemaBuilder::new().build());
+ let segment = index.new_segment();
+ let path = segment.relative_path(SegmentComponent::POSTINGS);
+
+ let directory = index.directory_mut();
+ directory.atomic_write(&*path, &vec!(0u8)).unwrap();
+
+ let living_files = HashSet::new();
+ {
+ let _file_protection = segment.protect_from_delete(SegmentComponent::POSTINGS);
+ assert!(directory.exists(&*path));
+ directory.garbage_collect(living_files.clone());
+ assert!(directory.exists(&*path));
+ }
+
+ directory.garbage_collect(living_files);
+ assert!(!directory.exists(&*path));
+ }
+
}
\ No newline at end of file
diff --git a/src/core/segment_component.rs b/src/core/segment_component.rs
index a55ea19dc..6f85c4031 100644
--- a/src/core/segment_component.rs
+++ b/src/core/segment_component.rs
@@ -1,41 +1,27 @@
-use std::vec::IntoIter;
-
#[derive(Copy, Clone)]
pub enum SegmentComponent {
- INFO,
POSTINGS,
POSITIONS,
FASTFIELDS,
FIELDNORMS,
TERMS,
STORE,
+ DELETE
}
impl SegmentComponent {
- pub fn values() -> IntoIter {
- vec!(
- SegmentComponent::INFO,
+
+ pub fn iterator() -> impl Iterator {
+ static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [
SegmentComponent::POSTINGS,
SegmentComponent::POSITIONS,
SegmentComponent::FASTFIELDS,
SegmentComponent::FIELDNORMS,
SegmentComponent::TERMS,
SegmentComponent::STORE,
- ).into_iter()
+ SegmentComponent::DELETE
+ ];
+ SEGMENT_COMPONENTS.into_iter()
}
- pub fn path_suffix(&self)-> &'static str {
- match *self {
- SegmentComponent::POSITIONS => ".pos",
- SegmentComponent::INFO => ".info",
- SegmentComponent::POSTINGS => ".idx",
- SegmentComponent::TERMS => ".term",
- SegmentComponent::STORE => ".store",
- SegmentComponent::FASTFIELDS => ".fast",
- SegmentComponent::FIELDNORMS => ".fieldnorm",
- }
- }
-}
-
-
-
\ No newline at end of file
+}
\ No newline at end of file
diff --git a/src/core/segment_id.rs b/src/core/segment_id.rs
index 3d77668b3..9e3a75d3d 100644
--- a/src/core/segment_id.rs
+++ b/src/core/segment_id.rs
@@ -1,14 +1,19 @@
use uuid::Uuid;
use std::fmt;
use rustc_serialize::{Encoder, Decoder, Encodable, Decodable};
-use core::SegmentComponent;
-use std::path::PathBuf;
use std::cmp::{Ordering, Ord};
-
#[cfg(test)]
use std::sync::atomic;
+/// Tantivy SegmentId.
+///
+/// Tantivy's segments are identified
+/// by a UUID which is used to prefix the filenames
+/// of all of the files associated with the segment.
+///
+/// In unit tests, for reproducibility, the `SegmentId`s are
+/// simply generated in an autoincrement fashion.
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
pub struct SegmentId(Uuid);
@@ -37,22 +42,27 @@ fn create_uuid() -> Uuid {
}
impl SegmentId {
+ #[doc(hidden)]
pub fn generate_random() -> SegmentId {
SegmentId(create_uuid())
}
+
+ /// Returns a shorter identifier of the segment.
+ ///
+ /// We are using UUID4, so only 6 bits are fixed,
+ /// and the rest is random.
+ ///
+ /// Picking the first 8 chars is ok to identify
+ /// segments in a display message.
pub fn short_uuid_string(&self,) -> String {
(&self.0.simple().to_string()[..8]).to_string()
}
+ /// Returns a segment uuid string.
pub fn uuid_string(&self,) -> String {
self.0.simple().to_string()
}
-
- pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
- let filename = self.uuid_string() + component.path_suffix();
- PathBuf::from(filename)
- }
}
impl Encodable for SegmentId {
@@ -69,7 +79,7 @@ impl Decodable for SegmentId {
impl fmt::Debug for SegmentId {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- write!(f, "SegmentId({:?})", self.uuid_string())
+ write!(f, "Seg({:?})", self.short_uuid_string())
}
}
diff --git a/src/core/segment_meta.rs b/src/core/segment_meta.rs
new file mode 100644
index 000000000..9716c348b
--- /dev/null
+++ b/src/core/segment_meta.rs
@@ -0,0 +1,121 @@
+use core::SegmentId;
+use super::SegmentComponent;
+use std::path::PathBuf;
+use std::collections::HashSet;
+
+#[derive(Clone, Debug, RustcDecodable,RustcEncodable)]
+struct DeleteMeta {
+ num_deleted_docs: u32,
+ opstamp: u64,
+}
+
+/// SegmentMeta contains simple meta information about a segment.
+///
+/// For instance the number of docs it contains,
+/// how many are deleted, etc.
+#[derive(Clone, Debug, RustcDecodable,RustcEncodable)]
+pub struct SegmentMeta {
+ segment_id: SegmentId,
+ max_doc: u32,
+ deletes: Option,
+}
+
+impl SegmentMeta {
+
+ /// Creates a new segment meta for
+ /// a segment with no deletes and no documents.
+ pub fn new(segment_id: SegmentId) -> SegmentMeta {
+ SegmentMeta {
+ segment_id: segment_id,
+ max_doc: 0,
+ deletes: None,
+ }
+ }
+
+ /// Returns the segment id.
+ pub fn id(&self) -> SegmentId {
+ self.segment_id
+ }
+
+ /// Returns the number of deleted documents.
+ pub fn num_deleted_docs(&self) -> u32 {
+ self.deletes
+ .as_ref()
+ .map(|delete_meta| delete_meta.num_deleted_docs)
+ .unwrap_or(0u32)
+ }
+
+ /// Returns the list of files that
+ /// are required for the segment meta.
+ ///
+ /// This is useful as the way tantivy removes files
+ /// is by removing all files that have been created by tantivy
+ /// and are not used by any segment anymore.
+ pub fn list_files(&self) -> HashSet {
+ SegmentComponent::iterator()
+ .map(|component| {
+ self.relative_path(*component)
+ })
+ .collect::>()
+
+ }
+
+ /// Returns the relative path of a component of our segment.
+ ///
+ /// It just joins the segment id with the extension
+ /// associated to a segment component.
+ pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
+ let mut path = self.id().uuid_string();
+ path.push_str(&*match component {
+ SegmentComponent::POSITIONS => ".pos".to_string(),
+ SegmentComponent::POSTINGS => ".idx".to_string(),
+ SegmentComponent::TERMS => ".term".to_string(),
+ SegmentComponent::STORE => ".store".to_string(),
+ SegmentComponent::FASTFIELDS => ".fast".to_string(),
+ SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
+ SegmentComponent::DELETE => {format!(".{}.del", self.delete_opstamp().unwrap_or(0))},
+ });
+ PathBuf::from(path)
+ }
+
+ /// Return the highest doc id + 1
+ ///
+ /// If there are no deletes, then num_docs = max_doc
+ /// and all the doc ids contained in this segment
+ /// are exactly (0..max_doc).
+ pub fn max_doc(&self) -> u32 {
+ self.max_doc
+ }
+
+ /// Return the number of documents in the segment.
+ pub fn num_docs(&self) -> u32 {
+ self.max_doc() - self.num_deleted_docs()
+ }
+
+ /// Returns the opstamp of the last delete operation
+ /// taken into account in this segment.
+ pub fn delete_opstamp(&self) -> Option {
+ self.deletes
+ .as_ref()
+ .map(|delete_meta| delete_meta.opstamp)
+ }
+
+ /// Returns true iff the segment meta contains
+ /// delete information.
+ pub fn has_deletes(&self) -> bool {
+ self.deletes.is_some()
+ }
+
+ #[doc(hidden)]
+ pub fn set_max_doc(&mut self, max_doc: u32) {
+ self.max_doc = max_doc;
+ }
+
+ #[doc(hidden)]
+ pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) {
+ self.deletes = Some(DeleteMeta {
+ num_deleted_docs: num_deleted_docs,
+ opstamp: opstamp,
+ });
+ }
+}
diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs
index f765c982e..7a3325730 100644
--- a/src/core/segment_reader.rs
+++ b/src/core/segment_reader.rs
@@ -3,17 +3,18 @@ use core::Segment;
use core::SegmentId;
use core::SegmentComponent;
use schema::Term;
+use common::HasLen;
+use core::SegmentMeta;
+use fastfield::delete::DeleteBitSet;
use store::StoreReader;
use schema::Document;
use directory::ReadOnlySource;
use DocId;
-use std::io;
use std::str;
use postings::TermInfo;
use datastruct::TermDictionary;
+use std::sync::Arc;
use std::fmt;
-use rustc_serialize::json;
-use core::SegmentInfo;
use schema::Field;
use postings::{SegmentPostings, BlockSegmentPostings, SegmentPostingsOption};
use fastfield::{U32FastFieldsReader, U32FastFieldReader};
@@ -21,8 +22,6 @@ use schema::Schema;
use schema::FieldType;
use postings::FreqHandler;
use schema::TextIndexingOptions;
-use error::Error;
-
/// Entry point to access all of the datastructures of the `Segment`
///
@@ -35,14 +34,16 @@ use error::Error;
/// The segment reader has a very low memory footprint,
/// as close to all of the memory data is mmapped.
///
+#[derive(Clone)]
pub struct SegmentReader {
- segment_info: SegmentInfo,
segment_id: SegmentId,
- term_infos: TermDictionary,
+ segment_meta: SegmentMeta,
+ term_infos: Arc>,
postings_data: ReadOnlySource,
store_reader: StoreReader,
- fast_fields_reader: U32FastFieldsReader,
- fieldnorms_reader: U32FastFieldsReader,
+ fast_fields_reader: Arc,
+ fieldnorms_reader: Arc,
+ delete_bitset: DeleteBitSet,
positions_data: ReadOnlySource,
schema: Schema,
}
@@ -53,7 +54,7 @@ impl SegmentReader {
/// Today, `tantivy` does not handle deletes, so it happens
/// to also be the number of documents in the index.
pub fn max_doc(&self) -> DocId {
- self.segment_info.max_doc
+ self.segment_meta.max_doc()
}
@@ -67,20 +68,39 @@ impl SegmentReader {
/// Today, `tantivy` does not handle deletes so max doc and
/// num_docs are the same.
pub fn num_docs(&self) -> DocId {
- self.segment_info.max_doc
+ self.segment_meta.num_docs()
}
+ /// Return the number of documents that have been
+ /// deleted in the segment.
+ pub fn num_deleted_docs(&self) -> DocId {
+ self.delete_bitset.len() as DocId
+ }
+
/// Accessor to a segment's fast field reader given a field.
- pub fn get_fast_field_reader(&self, field: Field) -> io::Result {
+ pub fn get_fast_field_reader(&self, field: Field) -> Option {
+ /// Returns the u32 fast value reader if the field
+ /// is a u32 field indexed as "fast".
+ ///
+ /// Return None if the field is not a u32 field
+ /// indexed with the fast option.
+ ///
+ /// # Panics
+ /// May panic if the index is corrupted.
let field_entry = self.schema.get_field_entry(field);
- match *field_entry.field_type() {
- FieldType::Str(_) => {
- Err(io::Error::new(io::ErrorKind::Other, "fast field are not yet supported for text fields."))
+ match field_entry.field_type() {
+ &FieldType::Str(_) => {
+ warn!("Field <{}> is not a fast field. It is a text field, and fast text fields are not supported yet.", field_entry.name());
+ None
},
- FieldType::U32(_) => {
- // TODO check that the schema allows that
- //Err(io::Error::new(io::ErrorKind::Other, "fast field are not yet supported for text fields."))
- self.fast_fields_reader.get_field(field)
+ &FieldType::U32(ref u32_options) => {
+ if u32_options.is_fast() {
+ self.fast_fields_reader.get_field(field)
+ }
+ else {
+ warn!("Field <{}> is not defined as a fast field.", field_entry.name());
+ None
+ }
},
}
}
@@ -92,7 +112,7 @@ impl SegmentReader {
///
/// They are simply stored as a fast field, serialized in
/// the `.fieldnorm` file of the segment.
- pub fn get_fieldnorms_reader(&self, field: Field) -> io::Result {
+ pub fn get_fieldnorms_reader(&self, field: Field) -> Option {
self.fieldnorms_reader.get_field(field)
}
@@ -111,21 +131,7 @@ impl SegmentReader {
/// Open a new segment for reading.
pub fn open(segment: Segment) -> Result {
- let segment_info_reader = try!(segment.open_read(SegmentComponent::INFO));
- let segment_info_data = try!(
- str::from_utf8(&*segment_info_reader)
- .map_err(|err| {
- let segment_info_filepath = segment.relative_path(SegmentComponent::INFO);
- Error::CorruptedFile(segment_info_filepath, Box::new(err))
- })
- );
- let segment_info: SegmentInfo = try!(
- json::decode(&segment_info_data)
- .map_err(|err| {
- let file_path = segment.relative_path(SegmentComponent::INFO);
- Error::CorruptedFile(file_path, Box::new(err))
- })
- );
+
let source = try!(segment.open_read(SegmentComponent::TERMS));
let term_infos = try!(TermDictionary::from_source(source));
let store_reader = StoreReader::from(try!(segment.open_read(SegmentComponent::STORE)));
@@ -141,15 +147,25 @@ impl SegmentReader {
.open_read(SegmentComponent::POSITIONS)
.unwrap_or_else(|_| ReadOnlySource::empty());
+ let delete_bitset =
+ if segment.meta().has_deletes() {
+ let delete_data = segment.open_read(SegmentComponent::DELETE)?;
+ DeleteBitSet::open(delete_data)
+ }
+ else {
+ DeleteBitSet::empty()
+ };
+
let schema = segment.schema();
Ok(SegmentReader {
- segment_info: segment_info,
+ segment_meta: segment.meta().clone(),
postings_data: postings_shared_mmap,
- term_infos: term_infos,
+ term_infos: Arc::new(term_infos),
segment_id: segment.id(),
store_reader: store_reader,
- fast_fields_reader: fast_fields_reader,
- fieldnorms_reader: fieldnorms_reader,
+ fast_fields_reader: Arc::new(fast_fields_reader),
+ fieldnorms_reader: Arc::new(fieldnorms_reader),
+ delete_bitset: delete_bitset,
positions_data: positions_data,
schema: schema,
})
@@ -237,10 +253,17 @@ impl SegmentReader {
/// that does not index position will return a `SegmentPostings` with `DocId`s and frequencies.
pub fn read_postings(&self, term: &Term, option: SegmentPostingsOption) -> Option {
self.read_block_postings(term, option)
- .map(SegmentPostings::from_block_postings)
+ .map(|block_postings| {
+ SegmentPostings::from_block_postings(block_postings, self.delete_bitset.clone())
+ })
}
-
+
+
/// Returns the posting list associated with a term.
+ ///
+ /// If the term is not found, return None.
+ /// Even when non-null, because of deletes, the posting object
+ /// returned by this method may contain no documents.
pub fn read_postings_all_info(&self, term: &Term) -> Option {
let field_entry = self.schema.get_field_entry(term.field());
let segment_posting_option = match *field_entry.field_type() {
@@ -260,6 +283,24 @@ impl SegmentReader {
pub fn get_term_info(&self, term: &Term) -> Option {
self.term_infos.get(term.as_slice())
}
+
+ /// Returns the segment id
+ pub fn segment_id(&self) -> SegmentId {
+ self.segment_id
+ }
+
+ /// Returns the bitset representing
+ /// the documents that have been deleted.
+ pub fn delete_bitset(&self) -> &DeleteBitSet {
+ &self.delete_bitset
+ }
+
+
+ /// Returns true iff the `doc` is marked
+ /// as deleted.
+ pub fn is_deleted(&self, doc: DocId) -> bool {
+ self.delete_bitset.is_deleted(doc)
+ }
}
diff --git a/src/core/term_iterator.rs b/src/core/term_iterator.rs
index dc39e5528..5ac68f929 100644
--- a/src/core/term_iterator.rs
+++ b/src/core/term_iterator.rs
@@ -101,7 +101,7 @@ impl<'a> TermIterator<'a> {
for segment_ord in self.current_segment_ords.drain(..) {
if let Some((term, val)) = self.key_streams[segment_ord].next() {
self.heap.push(HeapItem {
- term: Term::from(term),
+ term: Term::from_bytes(term),
segment_ord: segment_ord,
});
}
@@ -150,7 +150,7 @@ mod tests {
{
let mut doc = Document::default();
doc.add_text(text_field, "a b d f");
- index_writer.add_document(doc).unwrap();
+ index_writer.add_document(doc);
}
index_writer.commit().unwrap();
}
@@ -158,7 +158,7 @@ mod tests {
{
let mut doc = Document::default();
doc.add_text(text_field, "a b c d f");
- index_writer.add_document(doc).unwrap();
+ index_writer.add_document(doc);
}
index_writer.commit().unwrap();
}
@@ -166,11 +166,12 @@ mod tests {
{
let mut doc = Document::default();
doc.add_text(text_field, "e f");
- index_writer.add_document(doc).unwrap();
+ index_writer.add_document(doc);
}
index_writer.commit().unwrap();
}
}
+ index.load_searchers().unwrap();
let searcher = index.searcher();
let mut term_it = searcher.terms();
let mut terms = String::new();
diff --git a/src/datastruct/fstmap.rs b/src/datastruct/fstmap.rs
index d4e5812ff..adaed1082 100644
--- a/src/datastruct/fstmap.rs
+++ b/src/datastruct/fstmap.rs
@@ -21,7 +21,7 @@ pub struct FstMapBuilder {
}
impl FstMapBuilder {
-
+
pub fn new(w: W) -> io::Result> {
let fst_builder = try!(fst::MapBuilder::new(w).map_err(convert_fst_error));
Ok(FstMapBuilder {
@@ -31,8 +31,28 @@ impl FstMapBuilder {
})
}
+ /// Horribly unsafe, nobody should ever do that... except me :)
+ ///
+ /// If used, it must be used by systematically alternating calls
+ /// to insert_key and insert_value.
+ ///
+ /// TODO see if I can bend Rust typesystem to enforce that
+ /// in a nice way.
+ pub fn insert_key(&mut self, key: &[u8]) -> io::Result<()> {
+ try!(self.fst_builder
+ .insert(key, self.data.len() as u64)
+ .map_err(convert_fst_error));
+ Ok(())
+ }
- pub fn insert(&mut self, key: &[u8], value: &V) -> io::Result<()>{
+ /// Horribly unsafe, nobody should ever do that... except me :)
+ pub fn insert_value(&mut self, value: &V) -> io::Result<()> {
+ try!(value.serialize(&mut self.data));
+ Ok(())
+ }
+
+ #[cfg(test)]
+ pub fn insert(&mut self, key: &[u8], value: &V) -> io::Result<()> {
try!(self.fst_builder
.insert(key, self.data.len() as u64)
.map_err(convert_fst_error));
@@ -132,7 +152,6 @@ mod tests {
assert_eq!(keys.next().unwrap(), "abc".as_bytes());
assert_eq!(keys.next().unwrap(), "abcd".as_bytes());
assert_eq!(keys.next(), None);
-
}
}
diff --git a/src/datastruct/skip/skiplist_builder.rs b/src/datastruct/skip/skiplist_builder.rs
index b83406029..9806a69af 100644
--- a/src/datastruct/skip/skiplist_builder.rs
+++ b/src/datastruct/skip/skiplist_builder.rs
@@ -36,7 +36,7 @@ impl LayerBuilder {
fn insert(&mut self, doc_id: DocId, value: &T) -> io::Result