mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-09 02:22:54 +00:00)
Merge branch 'master' into bug/4
This commit is contained in:
10
README.md
@@ -1,4 +1,4 @@
 [](https://travis-ci.org/fulmicoton/tantivy)
 [](https://coveralls.io/github/fulmicoton/tantivy?branch=master)

@@ -13,11 +13,11 @@ It is strongly inspired by Lucene's design.
 # Features
 
 - configurable indexing (optional term frequency and position indexing)
-- Tf-Idf scoring
+- tf-idf scoring
 - Basic query language
 - Incremental indexing
-- Multithreaded indexing (indexing en wikipedia takes 4mn on my desktop)
-- Mmap based
+- Multithreaded indexing (indexing English Wikipedia takes 4 minutes on my desktop)
+- mmap based
 - SIMD integer compression
 - u32 fast fields (equivalent of doc values in Lucene)
 - LZ4 compressed document store

@@ -35,7 +35,7 @@ It will walk you through getting a wikipedia search engine up and running in a f
 Tantivy has a git submodule called `simdcomp`.
 After cloning the repository, you will need to initialize and update
-the submodules. The project can then be build using `cargo`.
+the submodules. The project can then be built using `cargo`.
 
     git clone git@github.com:fulmicoton/tantivy.git
     git submodule init
@@ -25,7 +25,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
 // # Defining the schema
 //
-// Tantivy index require to have a very strict schema.
+// The Tantivy index requires a very strict schema.
 // The schema declares which fields are in the index,
 // and for each field, its type and "the way it should
 // be indexed".
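As a minimal sketch of the schema-definition step this hunk documents (every call below appears elsewhere in this diff; the exact import path is an assumption of this era's API):

```rust
use tantivy::schema::{SchemaBuilder, TEXT, STORED};

// The strict schema the comments describe: `TEXT | STORED` makes a
// field tokenized, indexed, and stored in the doc store; `TEXT`
// alone makes it searchable but not retrievable.
let mut schema_builder = SchemaBuilder::default();
schema_builder.add_text_field("title", TEXT | STORED);
schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
```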
@@ -47,7 +47,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
 // `STORED` means that the field will also be saved
 // in a compressed, row-oriented key-value store.
 // This store is useful to reconstruct the
-// document that were selected during the search phase.
+// documents that were selected during the search phase.
 schema_builder.add_text_field("title", TEXT | STORED);
 
 // Our first field is body.

@@ -64,29 +64,29 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
 // Let's create a brand new index.
 //
 // This will actually just save a meta.json
-// with our schema the directory.
+// with our schema in the directory.
 let index = try!(Index::create(index_path, schema.clone()));
 
 // To insert document we need an index writer.
-// There shall be only one writer at a time.
-// Besides, this single `IndexWriter` is already
+// There must be only one writer at a time.
+// This single `IndexWriter` is already
 // multithreaded.
 //
-// Here we used a buffer of 1 GB. Using a bigger
+// Here we use a buffer of 1 GB. Using a bigger
 // heap for the indexer can increase its throughput.
 // This buffer will be split between the indexing
 // threads.
 let mut index_writer = try!(index.writer(1_000_000_000));
 
-// Let's now index our documents!
+// Let's index our documents!
 // We first need a handle on the title and the body field.
 
 // ### Create a document "manually".
 //
-// We can create a document manually, by setting adding the fields
+// We can create a document manually, by setting the fields
 // one by one in a Document object.
 let title = schema.get_field("title").unwrap();
 let body = schema.get_field("body").unwrap();
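Condensing this hunk, a sketch of the writer setup and the "manual" document construction (the `FieldValue::new(field, value.into())` form mirrors the `doc!` macro added in the lib.rs hunk further down; `add_document` is assumed from context):

```rust
// One multithreaded writer; the 1 GB heap is split between threads.
let mut index_writer = try!(index.writer(1_000_000_000));

// Handles on the fields we want to fill in.
let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap();

// Build a document "manually", setting the fields one by one.
let mut old_man_doc = Document::default();
old_man_doc.add(FieldValue::new(title, "The Old Man and the Sea".into()));
old_man_doc.add(FieldValue::new(body, "He was an old man who fished alone.".into()));
index_writer.add_document(old_man_doc);
```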
@@ -122,7 +122,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
 // This is an example, so we will only index 3 documents
 // here. You can check out tantivy's tutorial to index
 // the English wikipedia. Tantivy's indexing is rather fast.
-// Indexing 5 millions articles of the English wikipedia takes
+// Indexing 5 million articles of the English wikipedia takes
 // around 4 minutes on my computer!

@@ -131,56 +131,56 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
 // At this point our documents are not searchable.
 //
-// We need to call .commit() explicitely to force the
+// We need to call .commit() explicitly to force the
 // index_writer to finish processing the documents in the queue,
-// flush the current index on the disk, and advertise
+// flush the current index to the disk, and advertise
 // the existence of new documents.
 //
 // This call is blocking.
 try!(index_writer.commit());
 
 // If `.commit()` returns correctly, then all of the
-// documents have been added before are guaranteed to be
+// documents that have been added are guaranteed to be
 // persistently indexed.
 //
 // In the scenario of a crash or a power failure,
-// tantivy behaves as if it rollbacked to its last
+// tantivy behaves as if has rolled back to its last
 // commit.
 
 // # Searching
 //
-// Let's search our index. This starts
+// Let's search our index. We start
 // by creating a searcher. There can be more
 // than one searcher at a time.
 //
-// You are supposed to acquire a search
+// You should create a searcher
 // every time you start a "search query".
 let searcher = index.searcher();
 
 // The query parser can interpret human queries.
 // Here, if the user does not specify which
-// field he wants to search, tantivy will search
+// field they want to search, tantivy will search
 // in both title and body.
 let query_parser = QueryParser::new(index.schema(), vec!(title, body));
 
 // QueryParser may fail if the query is not in the right
 // format. For user facing applications, this can be a problem.
-// A ticket has been filled regarding this problem.
+// A ticket has been opened regarding this problem.
 let query = try!(query_parser.parse_query("sea whale"));
 
 // A query defines a set of documents, as
 // well as the way they should be scored.
 //
-// Query created by the query parser are scoring according
+// A query created by the query parser is scored according
 // to a metric called Tf-Idf, and will consider
 // any document matching at least one of our terms.
 
 // ### Collectors
 //
-// We are not interested in all of the document but
-// only in the top 10. Keep track of our top 10 best documents
+// We are not interested in all of the documents but
+// only in the top 10. Keeping track of our top 10 best documents
 // is the role of the TopCollector.
 let mut top_collector = TopCollector::with_limit(10);
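The whole commit-then-search flow above, condensed into one sketch (all calls are taken from the diff itself; `searcher.doc` is the retrieval method referenced in the schema documentation below):

```rust
// Blocking: once this returns, the added documents are persisted.
try!(index_writer.commit());

// Acquire a fresh searcher for this "search query".
let searcher = index.searcher();

// Parse a human query over both fields.
let query_parser = QueryParser::new(index.schema(), vec!(title, body));
let query = try!(query_parser.parse_query("sea whale"));

// Keep the 10 best documents, then fetch their stored fields.
let mut top_collector = TopCollector::with_limit(10);
try!(query.search(&searcher, &mut top_collector));
for doc_address in top_collector.docs() {
    let retrieved_doc = try!(searcher.doc(&doc_address));
    // Only the stored `title` field is present here.
}
```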
@@ -188,14 +188,14 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
 // We can now perform our query.
 try!(query.search(&searcher, &mut top_collector));
 
-// Our top collector now contains are 10
+// Our top collector now contains the 10
 // most relevant doc ids...
 let doc_addresses = top_collector.docs();
 
 // The actual documents still need to be
 // retrieved from Tantivy's store.
 //
-// Since body was not configured as stored,
+// Since the body field was not configured as stored,
 // the document returned will only contain
 // a title.

@@ -205,4 +205,4 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
 }
 
 Ok(())
-}
+}
@@ -5,13 +5,13 @@ use SegmentReader;
 use SegmentLocalId;
 
 /// `CountCollector` collector only counts how many
-/// document are matching the query.
+/// documents match the query.
 pub struct CountCollector {
 count: usize,
 }
 
 impl CountCollector {
-/// Returns the count of document that where
+/// Returns the count of documents that were
 /// collected.
 pub fn count(&self,) -> usize {
 self.count
@@ -20,8 +20,7 @@ impl CountCollector {
 impl Default for CountCollector {
 fn default() -> CountCollector {
-CountCollector {
-count: 0,
+CountCollector {count: 0,
 }
 }
 }
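A hypothetical usage sketch for `CountCollector`: the zeroed collector from the `Default` impl above can be driven by a query exactly like the `TopCollector` in the example file:

```rust
// Count how many documents match, without keeping any of them.
let mut count_collector = CountCollector::default();
try!(query.search(&searcher, &mut count_collector));
println!("{} matching documents", count_collector.count());
```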
@@ -20,16 +20,16 @@ pub use self::chained_collector::chain;
 ///
 /// For instance,
-/// - keeping track of the top 10 best documents
-/// - computing a break down over a fast field
-/// - computing the number of documents matching the query
-///
+/// - keeping track of the top 10 best documents
+/// - computing a breakdown over a fast field
+/// - computing the number of documents matching the query
+///
 /// Queries are in charge of pushing the `DocSet` to the collector.
 ///
-/// As they work on multiple segment, they first inform
-/// the collector of a change in segment and then
-/// call the collect method to push document to the collector.
+/// As they work on multiple segments, they first inform
+/// the collector of a change in a segment and then
+/// call the `collect` method to push the document to the collector.
 ///
 /// Temporally, our collector will receive calls
 /// - `.set_segment(0, segment_reader_0)`
@@ -45,10 +45,10 @@ pub use self::chained_collector::chain;
 ///
 /// Segments are not guaranteed to be visited in any specific order.
 pub trait Collector {
-/// `set_segment` is called before starting enumerating
+/// `set_segment` is called before beginning to enumerate
 /// on this segment.
 fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()>;
-/// The query pushes scored document to the collector via this method.
+/// The query pushes the scored document to the collector via this method.
 fn collect(&mut self, scored_doc: ScoredDoc);
 }
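To make the protocol above concrete (one `set_segment` call per segment, then repeated `collect` calls), here is a toy `Collector` sketch that records which segment each pushed document came from; the `scored_doc.doc()` accessor is an assumption based on the surrounding docs:

```rust
/// Illustrative collector: remembers (segment, doc) pairs as pushed.
struct DocRecorder {
    current_segment: SegmentLocalId,
    docs: Vec<(SegmentLocalId, DocId)>,
}

impl Collector for DocRecorder {
    fn set_segment(&mut self, segment_local_id: SegmentLocalId, _segment: &SegmentReader) -> io::Result<()> {
        // Called once before the documents of a segment are enumerated.
        self.current_segment = segment_local_id;
        Ok(())
    }
    fn collect(&mut self, scored_doc: ScoredDoc) {
        // The query pushes each matching scored document here.
        self.docs.push((self.current_segment, scored_doc.doc()));
    }
}
```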
@@ -57,7 +57,7 @@ impl<'a, C: Collector> Collector for &'a mut C {
 fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()> {
 (*self).set_segment(segment_local_id, segment)
 }
-/// The query pushes scored document to the collector via this method.
+/// The query pushes the scored document to the collector via this method.
 fn collect(&mut self, scored_doc: ScoredDoc) {
 (*self).collect(scored_doc);
 }

@@ -120,10 +120,10 @@ pub mod tests {
-/// Collects in order all of the fast field for all of the
-/// doc of the `DocSet`
+/// Collects in order all of the fast fields for all of the
+/// doc in the `DocSet`
 ///
-/// This collector is essentially useful for tests.
+/// This collector is mainly useful for tests.
 pub struct FastFieldTestCollector {
 vals: Vec<u32>,
 field: Field,
@@ -5,7 +5,7 @@ use SegmentReader;
 use SegmentLocalId;
 
-/// Multicollector makes it possible to collect on more than one collector
+/// Multicollector makes it possible to collect on more than one collector.
 /// It should only be used for use cases where the Collector types is unknown
 /// at compile time.
 /// If the type of the collectors is known, you should prefer to use `ChainedCollector`.

@@ -60,4 +60,4 @@ mod tests {
 assert_eq!(count_collector.count(), 3);
 assert!(top_collector.at_capacity());
 }
-}
+}
@@ -53,7 +53,7 @@ pub struct TopCollector {
 impl TopCollector {
 
-/// Creates a top collector, with a number of document of "limit"
+/// Creates a top collector, with a number of documents equal to "limit".
 ///
 /// # Panics
 /// The method panics if limit is 0

@@ -68,9 +68,9 @@ impl TopCollector {
 }
 }
 
-/// Returns the decreasingly sorted K-best documents.
+/// Returns K best documents sorted in decreasing order.
 ///
-/// Calling this method will triggers the sort.
+/// Calling this method triggers the sort.
 /// The result of the sort is not cached.
 pub fn docs(&self) -> Vec<DocAddress> {
 self.score_docs()

@@ -79,9 +79,9 @@ impl TopCollector {
 .collect()
 }
 
-/// Returns the decreasingly sorted K-best ScoredDocument.
+/// Returns K best ScoredDocument sorted in decreasing order.
 ///
-/// Calling this method will triggers the sort.
+/// Calling this method triggers the sort.
 /// The result of the sort is not cached.
 pub fn score_docs(&self) -> Vec<(Score, DocAddress)> {
 let mut scored_docs: Vec<GlobalScoredDoc> = self.heap
@@ -93,9 +93,9 @@ impl TopCollector {
 .map(|GlobalScoredDoc {score, doc_address}| (score, doc_address))
 .collect()
 }
 
-/// Return true iff at least K document have gone through
-/// the collector.
+/// Return true iff at least K documents have gone through
+/// the collector.
 #[inline]
 pub fn at_capacity(&self, ) -> bool {
 self.heap.len() >= self.limit
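A short usage sketch tying these methods together (the calls come from the example file and the tests below; `DocAddress` is assumed to be `Debug` for the printout):

```rust
let mut top_collector = TopCollector::with_limit(10);
try!(query.search(&searcher, &mut top_collector));

// True iff at least 10 documents went through the collector.
if top_collector.at_capacity() {
    // The sort is triggered on each call and is not cached,
    // so iterate over the result of a single call.
    for (score, doc_address) in top_collector.score_docs() {
        println!("score={} doc={:?}", score, doc_address);
    }
}
```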
@@ -183,8 +183,8 @@ mod tests {
 .collect();
 assert_eq!(docs, vec!(7, 1, 5, 3));
 }
 
 }
 
 #[test]

@@ -192,4 +192,4 @@ mod tests {
 fn test_top_0() {
 TopCollector::with_limit(0);
 }
-}
+}
@@ -1,6 +1,5 @@
 use Result;
 use Error;
 use std::path::Path;
 use schema::Schema;
 use std::sync::Arc;
 use std::fmt;

@@ -15,29 +14,27 @@ use super::segment::Segment;
 use core::SegmentReader;
 use super::pool::Pool;
 use super::pool::LeasedItem;
-use std::path::Path;
 use indexer::SegmentManager;
 use core::IndexMeta;
 use core::META_FILEPATH;
 use super::segment::create_segment;
 
-const NUM_SEARCHERS: usize = 12;
+const NUM_SEARCHERS: usize = 12;
 
 /// Accessor to the index segment manager
 ///
-/// This method is not part of tantivy's public API
+/// This method is not part of tantivy's public API
 pub fn get_segment_manager(index: &Index) -> Arc<SegmentManager> {
 index.segment_manager.clone()
 }
 
 fn load_metas(directory: &Directory) -> Result<IndexMeta> {
 let meta_file = try!(directory.open_read(&META_FILEPATH));
 let meta_content = String::from_utf8_lossy(meta_file.as_slice());
-let loaded_meta = try!(
-json::decode(&meta_content)
-.map_err(|e| Error::CorruptedFile(META_FILEPATH.clone(), Box::new(e)))
-);
-Ok(loaded_meta)
+json::decode(&meta_content)
+.map_err(|e| Error::CorruptedFile(META_FILEPATH.clone(), Box::new(e)))
 }
 
 // pub fn set_metas(index: &mut Index, docstamp: u64) {
@@ -47,24 +44,23 @@ fn load_metas(directory: &Directory) -> Result<IndexMeta> {
 /// Tantivy's Search Index
 pub struct Index {
 segment_manager: Arc<SegmentManager>,
 directory: Box<Directory>,
 schema: Schema,
 searcher_pool: Arc<Pool<Searcher>>,
 docstamp: u64,
 }
 
 impl Index {
 /// Creates a new index using the `RAMDirectory`.
 ///
 /// The index will be allocated in anonymous memory.
-/// This should only be used for unit tests.
+/// This should only be used for unit tests.
 pub fn create_in_ram(schema: Schema) -> Index {
 let directory = Box::new(RAMDirectory::create());
-Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail") // unwrap is ok here
+Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail") // unwrap is ok here
 }
 
 /// Creates a new index in a given filepath.
 ///
 /// The index will use the `MMapDirectory`.

@@ -76,7 +72,7 @@ impl Index {
 /// Creates a new index in a temp directory.
 ///
 /// The index will use the `MMapDirectory` in a newly created directory.
-/// The temp directory will be destroyed automatically when the Index object
+/// The temp directory will be destroyed automatically when the `Index` object
 /// is destroyed.
 ///
 /// The temp directory is only used for testing the `MmapDirectory`.

@@ -85,8 +81,8 @@ impl Index {
 let directory = Box::new(try!(MmapDirectory::create_from_tempdir()));
 Index::from_directory(directory, schema)
 }
 
-/// Creates a new index given a directory and an IndexMeta.
+/// Creates a new index given a directory and an `IndexMeta`.
 fn create_from_metas(directory: Box<Directory>, metas: IndexMeta) -> Result<Index> {
 let schema = metas.schema.clone();
 let docstamp = metas.docstamp;

@@ -102,13 +98,10 @@ impl Index {
 try!(index.load_searchers());
 Ok(index)
 }
 
 /// Opens a new directory from a directory.
 pub fn from_directory(directory: Box<Directory>, schema: Schema) -> Result<Index> {
-Index::create_from_metas(
-directory,
-IndexMeta::with_schema(schema)
-)
+Index::create_from_metas(directory, IndexMeta::with_schema(schema))
 }
 
 /// Opens a new directory from an index path.
@@ -117,37 +110,49 @@ impl Index {
 let metas = try!(load_metas(&directory)); //< TODO does the directory already exists?
 Index::create_from_metas(directory.box_clone(), metas)
 }
 
 /// Returns the index docstamp.
 ///
 /// The docstamp is the number of documents that have been added
 /// from the beginning of time, and until the moment of the last commit.
-pub fn docstamp(&self,) -> u64 {
+pub fn docstamp(&self) -> u64 {
 self.docstamp
 }
 
 /// Creates a multithreaded writer.
-/// Each writer produces an independant segment.
-pub fn writer_with_num_threads(&self, num_threads: usize, heap_size_in_bytes: usize) -> Result<IndexWriter> {
+/// Each writer produces an independent segment.
+///
+/// # Errors
+/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
+/// # Panics
+/// If the heap size per thread is too small, panics.
+pub fn writer_with_num_threads(&self,
+num_threads: usize,
+heap_size_in_bytes: usize)
+-> Result<IndexWriter> {
 IndexWriter::open(self, num_threads, heap_size_in_bytes)
 }
 
 /// Creates a multithreaded writer
-/// It just calls `writer_with_num_threads` with the number of core as `num_threads`
+/// It just calls `writer_with_num_threads` with the number of cores as `num_threads`
+/// # Errors
+/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
+/// # Panics
+/// If the heap size per thread is too small, panics.
 pub fn writer(&self, heap_size_in_bytes: usize) -> Result<IndexWriter> {
 self.writer_with_num_threads(num_cpus::get(), heap_size_in_bytes)
 }
 
 /// Accessor to the index schema
 ///
 /// The schema is actually cloned.
-pub fn schema(&self,) -> Schema {
+pub fn schema(&self) -> Schema {
 self.schema.clone()
 }
 
 /// Returns the list of segments that are searchable
-pub fn searchable_segments(&self,) -> Vec<Segment> {
+pub fn searchable_segments(&self) -> Vec<Segment> {
 self.searchable_segment_ids()
 .into_iter()
 .map(|segment_id| self.segment(segment_id))
@@ -155,92 +160,88 @@ impl Index {
 }
 
 /// Remove all of the file associated with the segment.
 ///
 /// This method cannot fail. If a problem occurs,
 /// some files may end up never being removed.
-/// The error will only be logged.
+/// The error will only be logged.
 pub fn delete_segment(&self, segment_id: SegmentId) {
 self.segment(segment_id).delete();
 }
 
-/// Return a segment object given a segment_id
+/// Return a segment object given a `segment_id`
 ///
 /// The segment may or may not exist.
 pub fn segment(&self, segment_id: SegmentId) -> Segment {
 create_segment(self.clone(), segment_id)
 }
 
 /// Return a reference to the index directory.
-pub fn directory(&self,) -> &Directory {
+pub fn directory(&self) -> &Directory {
 &*self.directory
 }
 
 /// Return a mutable reference to the index directory.
-pub fn directory_mut(&mut self,) -> &mut Directory {
+pub fn directory_mut(&mut self) -> &mut Directory {
 &mut *self.directory
 }
 
 /// Returns the list of segment ids that are searchable.
-fn searchable_segment_ids(&self,) -> Vec<SegmentId> {
+fn searchable_segment_ids(&self) -> Vec<SegmentId> {
 self.segment_manager.committed_segments()
 }
 
 /// Creates a new segment.
-pub fn new_segment(&self,) -> Segment {
+pub fn new_segment(&self) -> Segment {
 self.segment(SegmentId::generate_random())
 }
 
-/// Creates a new generation of searchers after
+/// Creates a new generation of searchers after
 /// a change of the set of searchable indexes.
 ///
 /// This needs to be called when a new segment has been
 /// published or after a merge.
-pub fn load_searchers(&self,) -> Result<()>{
+pub fn load_searchers(&self) -> Result<()> {
 let searchable_segments = self.searchable_segments();
 let mut searchers = Vec::new();
 for _ in 0..NUM_SEARCHERS {
 let searchable_segments_clone = searchable_segments.clone();
-let segment_readers: Vec<SegmentReader> = try!(
-searchable_segments_clone
-.into_iter()
-.map(SegmentReader::open)
-.collect()
-);
+let segment_readers: Vec<SegmentReader> = try!(searchable_segments_clone.into_iter()
+.map(SegmentReader::open)
+.collect());
 let searcher = Searcher::from(segment_readers);
 searchers.push(searcher);
 }
 self.searcher_pool.publish_new_generation(searchers);
 Ok(())
 }
 
 /// Returns a searcher
 ///
 /// This method should be called every single time a search
 /// query is performed.
-/// The searcher are taken from a pool of `NUM_SEARCHERS` searchers.
+/// The searchers are taken from a pool of `NUM_SEARCHERS` searchers.
 /// If no searcher is available
-/// it may block.
+/// this may block.
 ///
-/// The same searcher must be used for a given query, as it ensures
-/// the use of a consistent segment set.
-pub fn searcher(&self,) -> LeasedItem<Searcher> {
+/// The same searcher must be used for a given query, as it ensures
+/// the use of a consistent segment set.
+pub fn searcher(&self) -> LeasedItem<Searcher> {
 self.searcher_pool.acquire()
 }
 }
 
 impl fmt::Debug for Index {
 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-write!(f, "Index({:?})", self.directory)
-}
+write!(f, "Index({:?})", self.directory)
+}
 }
 
 impl Clone for Index {
-fn clone(&self,) -> Index {
+fn clone(&self) -> Index {
 Index {
 segment_manager: self.segment_manager.clone(),
 directory: self.directory.box_clone(),
 schema: self.schema.clone(),
 searcher_pool: self.searcher_pool.clone(),
@@ -25,7 +25,7 @@ use schema::TextIndexingOptions;
 use error::Error;
 
-/// Entrypoint to access all of the datastructures of the `Segment`
+/// Entry point to access all of the datastructures of the `Segment`
 ///
 /// - term dictionary
 /// - postings

@@ -34,8 +34,8 @@ use error::Error;
 /// - field norm reader
 ///
 /// The segment reader has a very low memory footprint,
-/// as close to all of the memory data is in Mmapped.
-///
+/// as close to all of the memory data is mmapped.
+///
 pub struct SegmentReader {
 segment_info: SegmentInfo,
 segment_id: SegmentId,

@@ -51,7 +51,7 @@ pub struct SegmentReader {
 impl SegmentReader {
 /// Returns the highest document id ever attributed in
 /// this segment + 1.
-/// Today, `tantivy` does not handle deletes so, it happens
+/// Today, `tantivy` does not handle deletes, so it happens
 /// to also be the number of documents in the index.
 pub fn max_doc(&self) -> DocId {
 self.segment_info.max_doc

@@ -233,7 +233,7 @@ impl SegmentReader {
 self.read_postings(term, segment_posting_option)
 }
 
-/// Returns the term info of associated with the term.
+/// Returns the term info associated with the term.
 pub fn get_term_info(&self, term: &Term) -> Option<TermInfo> {
 self.term_infos.get(term.as_slice())
 }
@@ -9,7 +9,7 @@ use std::marker::Sync;
 
 /// Write-once read many (WORM) abstraction for where tantivy's index should be stored.
 ///
-/// There is currently two implementations of `Directory`
+/// There are currently two implementations of `Directory`
 ///
 /// - The [`MMapDirectory`](struct.MmapDirectory.html), this
 /// should be your default choice.

@@ -20,19 +20,19 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static {
 /// Opens a virtual file for read.
 ///
-/// Once a virtualfile is open, its data may not
+/// Once a virtual file is open, its data may not
 /// change.
 ///
-/// Specifically, subsequent write or flush should
-/// have no effect the returned `ReadOnlySource` object.
+/// Specifically, subsequent writes or flushes should
+/// have no effect on the returned `ReadOnlySource` object.
 fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, FileError>;
 
 /// Removes a file
 ///
-/// Removing a file will not affect eventual
+/// Removing a file will not affect an eventual
 /// existing ReadOnlySource pointing to it.
 ///
-/// Removing a non existing files, yields a
+/// Removing a nonexistent file, yields a
 /// `FileError::DoesNotExist`.
 fn delete(&self, path: &Path) -> result::Result<(), FileError>;
@@ -47,28 +47,28 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static {
 /// same path should return a `ReadOnlySource`.
 ///
 /// Write operations may be aggressively buffered.
-/// The client of this trait is in charge to call flush
+/// The client of this trait is responsible for calling flush
 /// to ensure that subsequent `read` operations
-/// will take in account preceding `write` operations.
+/// will take into account preceding `write` operations.
 ///
 /// Flush operation should also be persistent.
 ///
-/// User shall not rely on `Drop` triggering `flush`.
+/// The user shall not rely on `Drop` triggering `flush`.
 /// Note that `RAMDirectory` will panic! if `flush`
 /// was not called.
 ///
-/// The file may not previously exists.
+/// The file may not previously exist.
 fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError>;
 
-/// Atomically replace the content of a file by data.
+/// Atomically replace the content of a file with data.
 ///
 /// This calls ensure that reads can never *observe*
 /// a partially written file.
 ///
-/// The file may or may not previously exists.
+/// The file may or may not previously exist.
 fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()>;
 
-/// Clone the directory and boxes the clone
+/// Clones the directory and boxes the clone
 fn box_clone(&self) -> Box<Directory>;
 }
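A small sketch exercising the write-once contract described above with the in-memory implementation (`RAMDirectory::create()` and `as_slice()` both appear in other hunks of this commit; the file name is made up):

```rust
let mut directory = RAMDirectory::create();
let path = Path::new("some_file");

// Atomic write: a reader can never observe a partially written file.
directory.atomic_write(path, b"hello").unwrap();

// Once open, the returned `ReadOnlySource` will not change,
// even if the file is later deleted or rewritten.
let source = directory.open_read(path).unwrap();
assert_eq!(source.as_slice(), b"hello");
```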
@@ -47,7 +47,7 @@ impl MmapDirectory {
 /// Creates a new MmapDirectory in a temporary directory.
 ///
 /// This is mostly useful to test the MmapDirectory itself.
-/// For your unit test, prefer the RAMDirectory.
+/// For your unit tests, prefer the RAMDirectory.
 pub fn create_from_tempdir() -> io::Result<MmapDirectory> {
 let tempdir = try!(TempDir::new("index"));
 let tempdir_path = PathBuf::from(tempdir.path());

@@ -81,7 +81,7 @@ impl MmapDirectory {
 }
 
 /// Joins a relative_path to the directory `root_path`
-/// to create proper complete `filepath`.
+/// to create a proper complete `filepath`.
 fn resolve_path(&self, relative_path: &Path) -> PathBuf {
 self.root_path.join(relative_path)
 }
@@ -11,7 +11,7 @@ use directory::error::{OpenWriteError, FileError};
 use directory::WritePtr;
 use super::shared_vec_slice::SharedVecSlice;
 
-/// Writer associated to the `RAMDirectory`
+/// Writer associated with the `RAMDirectory`
 ///
 /// The Writer just writes a buffer.
 ///

@@ -140,9 +140,9 @@ impl fmt::Debug for RAMDirectory {
 }
 
-/// Directory storing everything in anonymous memory.
+/// A Directory storing everything in anonymous memory.
 ///
-/// It's main purpose is unit test.
+/// It is mainly meant for unit testing.
 /// Writes are only made visible upon flushing.
 ///
 #[derive(Clone)]

@@ -168,7 +168,7 @@ impl Directory for RAMDirectory {
 fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
 let path_buf = PathBuf::from(path);
 let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
-// force the creation of the file to mimick the MMap directory.
+// force the creation of the file to mimic the MMap directory.
 if try!(self.fs.write(path_buf.clone(), &Vec::new())) {
 Err(OpenWriteError::FileAlreadyExists(path_buf))
 }
32
src/indexer/directory_lock.rs
Normal file
@@ -0,0 +1,32 @@
+use Directory;
+use std::path::Path;
+use error::Result;
+
+pub const LOCKFILE_NAME: &'static str = ".tantivy-indexer.lock";
+
+/// The directory lock is a mechanism used to
+/// prevent the creation of two [`IndexWriter`](struct.IndexWriter.html)
+///
+/// Only one lock can exist at a time for a given directory.
+/// The lock is release automatically on `Drop`.
+pub struct DirectoryLock {
+directory: Box<Directory>,
+}
+
+impl DirectoryLock {
+pub fn lock(mut directory: Box<Directory>) -> Result<DirectoryLock> {
+let lockfile_path = Path::new(LOCKFILE_NAME);
+try!(directory.open_write(lockfile_path));
+Ok(DirectoryLock { directory: directory })
+}
+}
+
+impl Drop for DirectoryLock {
+fn drop(&mut self) {
+let lockfile_path = Path::new(LOCKFILE_NAME);
+if let Err(e) = self.directory.delete(lockfile_path) {
+error!("Failed to remove the lock file. {:?}", e);
+}
+}
+}
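The intended usage is implicit in the type: take the lock before creating an `IndexWriter`, keep it alive for the writer's lifetime, and let `Drop` delete the lockfile. A hypothetical sketch:

```rust
// Fails if another writer already holds `.tantivy-indexer.lock` in
// this directory, since `open_write` errors on an existing file.
let lock = try!(DirectoryLock::lock(directory.box_clone()));

// ... create and use the IndexWriter while `lock` is alive ...

// Dropping the lock deletes the lockfile and frees the directory
// for the next writer.
drop(lock);
```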
File diff suppressed because it is too large
@@ -8,6 +8,7 @@ mod segment_register;
 mod segment_writer;
 mod segment_manager;
 mod segment_updater;
+mod directory_lock;
 
 pub use self::segment_serializer::SegmentSerializer;
 pub use self::segment_writer::SegmentWriter;
28
src/lib.rs
@@ -51,6 +51,16 @@ mod macros {
 macro_rules! get(
 ($e:expr) => (match $e { Some(e) => e, None => return None })
 );
+
+macro_rules! doc(
+($($field:ident => $value:expr),*) => {{
+let mut document = Document::default();
+$(
+document.add(FieldValue::new($field, $value.into()));
+)*
+document
+}};
+);
 }
 
 mod core;

@@ -97,7 +107,7 @@ pub use postings::SegmentPostingsOption;
 
 /// u32 identifying a document within a segment.
-/// Document gets their doc id assigned incrementally,
+/// Documents have their doc id assigned incrementally,
 /// as they are added in the segment.
 pub type DocId = u32;
@@ -400,4 +410,20 @@ mod tests {
 }
 index.searcher();
 }
+
+#[test]
+fn test_doc_macro() {
+let mut schema_builder = SchemaBuilder::default();
+let text_field = schema_builder.add_text_field("text", TEXT);
+let other_text_field = schema_builder.add_text_field("text2", TEXT);
+let document = doc!(text_field => "tantivy", text_field => "some other value", other_text_field => "short");
+assert_eq!(document.len(), 3);
+let values = document.get_all(text_field);
+assert_eq!(values.len(), 2);
+assert_eq!(values[0].text(), "tantivy");
+assert_eq!(values[1].text(), "some other value");
+let values = document.get_all(other_text_field);
+assert_eq!(values.len(), 1);
+assert_eq!(values[0].text(), "short");
+}
 }
@@ -4,7 +4,7 @@ use std::borrow::BorrowMut;
 use std::cmp::Ordering;
 
-/// Expressed the outcome of a call to `DocSet`'s `.skip_next(...)`.
+/// Expresses the outcome of a call to `DocSet`'s `.skip_next(...)`.
 #[derive(PartialEq, Eq, Debug)]
 pub enum SkipResult {
 /// target was in the docset

@@ -24,8 +24,8 @@ pub trait DocSet {
 /// element.
 fn advance(&mut self,) -> bool;
 
-/// After skipping position, the iterator in such a way `.doc()`
-/// will return a value greater or equal to target.
+/// After skipping, position the iterator in such a way that `.doc()`
+/// will return a value greater than or equal to target.
 ///
 /// SkipResult expresses whether the `target value` was reached, overstepped,
 /// or if the `DocSet` was entirely consumed without finding any value
@@ -97,4 +97,4 @@ impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet {
 }
-}
+}
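As a sketch of the cursor protocol `DocSet` defines (assuming the usual `doc()` accessor alongside `advance()`):

```rust
// Drain a DocSet into a vector of doc ids: `advance()` moves the
// cursor forward and returns false once the set is exhausted.
fn collect_doc_ids<TDocSet: DocSet>(mut docset: TDocSet) -> Vec<DocId> {
    let mut doc_ids = Vec::new();
    while docset.advance() {
        doc_ids.push(docset.doc());
    }
    doc_ids
}
```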
@@ -12,8 +12,8 @@ use common::HasLen;
 /// as well as the list of term positions.
 ///
 /// Its main implementation is `SegmentPostings`,
-/// but some other implementation mocking SegmentPostings exists,
-/// in order to help merging segment or for testing.
+/// but other implementations mocking SegmentPostings exist,
+/// in order to help when merging segments or for testing.
 pub trait Postings: DocSet {
 /// Returns the term frequency
 fn term_freq(&self,) -> u32;
@@ -29,7 +29,7 @@ pub enum ParsingError {
 
 /// Tantivy's Query parser
 ///
-/// The language covered by the current is extremely simple.
+/// The language covered by the current parser is extremely simple.
 ///
 /// * simple terms: "e.g.: `Barack Obama` are simply analyzed using
 /// tantivy's `StandardTokenizer`, hence becoming `["barack", "obama"]`.

@@ -44,7 +44,7 @@ pub enum ParsingError {
 ///
 /// This behavior is slower, but is not a bad idea if the user is sorting
 /// by relevance : The user typically just scans through the first few
-/// documents in order of decreasing relevance and will stop when the document
+/// documents in order of decreasing relevance and will stop when the documents
 /// are not relevant anymore.
 /// Making it possible to make this behavior customizable is tracked in
 /// [issue #27](https://github.com/fulmicoton/tantivy/issues/27).
@@ -135,9 +135,9 @@ impl QueryParser {
 /// Parse a query
 ///
 /// Note that `parse_query` returns an error if the input
-/// not a valid query.
+/// is not a valid query.
 ///
-/// There is currently no lenient mode for the query parse
+/// There is currently no lenient mode for the query parser
 /// which makes it a bad choice for a public/broad user search engine.
 ///
 /// Implementing a lenient mode for this query parser is tracked
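Since there is no lenient mode, a user-facing application has to handle the error branch itself; a hypothetical sketch of that handling (the malformed input is invented for illustration):

```rust
match query_parser.parse_query("title:\"unclosed phrase") {
    Ok(query) => {
        // run the search as usual
    }
    Err(parsing_error) => {
        // surface a friendly message, or fall back to a
        // hand-built query over the default fields
    }
}
```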
@@ -4,7 +4,7 @@
 # Schema definition
 
 Tantivy has a very strict schema.
-The schema defines information about the fields your index contains, that is for each field :
+The schema defines information about the fields your index contains, that is, for each field :
 
 * the field name (may only contain letters `[a-zA-Z]`, number `[0-9]`, and `_`)
 * the type of the field (currently only `text` and `u32` are supported)

@@ -37,20 +37,20 @@ let schema = schema_builder.build();
 
 We can split the problem of generating a search result page into two phases :
 
-* identifying the list of 10 or so document to be displayed (Conceptually `query -> doc_ids[]`)
+* identifying the list of 10 or so documents to be displayed (Conceptually `query -> doc_ids[]`)
 * for each of these documents, retrieving the information required to generate the serp page. (`doc_ids[] -> Document[]`)
 
-In the first phase, the hability to search for documents by the given field, is determined by the [`TextIndexingOptions`](enum.TextIndexingOptions.html) of our
+In the first phase, the ability to search for documents by the given field is determined by the [`TextIndexingOptions`](enum.TextIndexingOptions.html) of our
 [`TextOptions`](struct.TextOptions.html).
 
-The effect of each possible settings is described more in detail [`TextIndexingOptions`](enum.TextIndexingOptions.html).
+The effect of each possible setting is described more in detail [`TextIndexingOptions`](enum.TextIndexingOptions.html).
 
 On the other hand setting the field as stored or not determines whether the field should be returned when [`searcher.doc(doc_address)`](../struct.Searcher.html#method.doc)
 is called.
 
 ### Shortcuts
 
-For convenience, a few special value of `TextOptions` for your convenience.
+For convenience, a few special values of `TextOptions`.
 They can be composed using the `|` operator.
 The example can be rewritten :
@@ -82,7 +82,7 @@ Just like for Text fields (see above),
 setting the field as stored defines whether the field will be
 returned when [`searcher.doc(doc_address)`](../struct.Searcher.html#method.doc) is called,
 and setting the field as indexed means that we will be able perform queries such as `num_stars:10`.
-Note that contrary to text fields, u32 can only be indexed in one way for the moment.
+Note that unlike text fields, u32 can only be indexed in one way for the moment.
 This may change when we will start supporting range queries.
 
 The `fast` option on the other hand is specific to u32 fields, and is only relevant

@@ -15,7 +15,7 @@ use std::fmt;
 
 /// Tantivy has a very strict schema.
-/// You need to specify in advance, whether a field is indexed or not,
+/// You need to specify in advance whether a field is indexed or not,
 /// stored or not, and RAM-based or not.
 ///
 /// This is done by creating a schema object, and

@@ -483,4 +483,4 @@ mod tests {
 }
 }
 }
-}
+}
@@ -59,6 +59,11 @@ impl From<u32> for Value {
 }
 }
 
+impl<'a> From<&'a str> for Value {
+fn from(s: &'a str) -> Value {
+Value::Str(s.to_string())
+}
+}
 
 const TEXT_CODE: u8 = 0;
 const U32_CODE: u8 = 1;

@@ -95,4 +100,4 @@ impl BinarySerializable for Value {
 }
 }
 }
-}
+}