From 15567e0aa3a842ca9bc942ae7075f0ecfe55e09d Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 26 Sep 2016 18:23:19 +0900 Subject: [PATCH 01/19] NOBUG Build failing --- examples/simple_search.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/examples/simple_search.rs b/examples/simple_search.rs index e1c46f75d..851968576 100644 --- a/examples/simple_search.rs +++ b/examples/simple_search.rs @@ -20,12 +20,6 @@ fn main() { } -fn create_schema() -> Schema { - -} - - - fn run_example(index_path: &Path) -> tantivy::Result<()> { From d85cfab7a1d2208773c465267abb5a1725b17ae4 Mon Sep 17 00:00:00 2001 From: "Michael J. Curry" Date: Thu, 29 Sep 2016 16:28:10 -0400 Subject: [PATCH 02/19] minor fixes to grammar and usage in example file --- examples/simple_search.rs | 46 +++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/examples/simple_search.rs b/examples/simple_search.rs index 851968576..39efabe13 100644 --- a/examples/simple_search.rs +++ b/examples/simple_search.rs @@ -25,7 +25,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { // # Defining the schema // - // Tantivy index require to have a very strict schema. + // The Tantivy index requires a very strict schema. // The schema declares which fields are in the index, // and for each field, its type and "the way it should // be indexed". @@ -47,7 +47,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { // `STORED` means that the field will also be saved // in a compressed, row-oriented key-value store. // This store is useful to reconstruct the - // document that were selected during the search phase. + // documents that were selected during the search phase. schema_builder.add_text_field("title", TEXT | STORED); // Our first field is body. @@ -64,29 +64,29 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { // Let's create a brand new index. // // This will actually just save a meta.json - // with our schema the directory. + // with our schema in the directory. let index = try!(Index::create(index_path, schema.clone())); // To insert document we need an index writer. - // There shall be only one writer at a time. - // Besides, this single `IndexWriter` is already + // There must be only one writer at a time. + // This single `IndexWriter` is already // multithreaded. // - // Here we used a buffer of 1 GB. Using a bigger + // Here we use a buffer of 1 GB. Using a bigger // heap for the indexer can increase its throughput. // This buffer will be split between the indexing // threads. let mut index_writer = try!(index.writer(1_000_000_000)); - // Let's now index our documents! + // Let's index our documents! // We first need a handle on the title and the body field. // ### Create a document "manually". // - // We can create a document manually, by setting adding the fields + // We can create a document manually, by setting the fields // one by one in a Document object. let title = schema.get_field("title").unwrap(); let body = schema.get_field("body").unwrap(); @@ -122,7 +122,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { // This is an example, so we will only index 3 documents // here. You can check out tantivy's tutorial to index // the English wikipedia. Tantivy's indexing is rather fast. - // Indexing 5 millions articles of the English wikipedia takes + // Indexing 5 million articles of the English wikipedia takes // around 4 minutes on my computer! 
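    // ### A condensed sketch of the steps so far
    //
    // The indexing flow above compresses to a handful of calls.
    // This is an illustrative sketch, not part of the patched file:
    // it reuses the `title`, `body` and `index_writer` bindings
    // defined earlier, and the quoted strings are made up.
    let mut doc = Document::default();
    doc.add_text(title, "Frankenstein");
    doc.add_text(body, "You will rejoice to hear that no disaster ...");
    // add_document only queues the document in the indexing pipeline;
    // nothing is searchable until the .commit() in the next hunk.
    index_writer.add_document(doc).unwrap();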
@@ -131,56 +131,56 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
     // At this point our documents are not searchable.
     //
     //
-    // We need to call .commit() explicitely to force the
+    // We need to call .commit() explicitly to force the
     // index_writer to finish processing the documents in the queue,
-    // flush the current index on the disk, and advertise
+    // flush the current index to the disk, and advertise
     // the existence of new documents.
     //
     // This call is blocking.
     try!(index_writer.commit());

     // If `.commit()` returns correctly, then all of the
-    // documents have been added before are guaranteed to be
+    // documents that have been added are guaranteed to be
     // persistently indexed.
     //
     // In the scenario of a crash or a power failure,
-    // tantivy behaves as if it rollbacked to its last
+    // tantivy behaves as if it had rolled back to its last
     // commit.

     // # Searching
     //
-    // Let's search our index. This starts
+    // Let's search our index. We start
     // by creating a searcher. There can be more
     // than one searcher at a time.
     //
-    // You are supposed to acquire a search
+    // You should create a searcher
     // every time you start a "search query".
     let searcher = index.searcher();

     // The query parser can interpret human queries.
     // Here, if the user does not specify which
-    // field he wants to search, tantivy will search
+    // field they want to search, tantivy will search
     // in both title and body.
     let query_parser = QueryParser::new(index.schema(), vec!(title, body));

     // QueryParser may fail if the query is not in the right
     // format. For user facing applications, this can be a problem.
-    // A ticket has been filled regarding this problem.
+    // A ticket has been opened regarding this problem.
     let query = try!(query_parser.parse_query("sea whale"));

     // A query defines a set of documents, as
     // well as the way they should be scored.
     //
-    // Query created by the query parser are scoring according
+    // A query created by the query parser is scored according
     // to a metric called Tf-Idf, and will consider
     // any document matching at least one of our terms.

     // ### Collectors
     //
-    // We are not interested in all of the document but
-    // only in the top 10. Keep track of our top 10 best documents
+    // We are not interested in all of the documents but
+    // only in the top 10. Keeping track of our top 10 best documents
     // is the role of the TopCollector.
     let mut top_collector = TopCollector::with_limit(10);

@@ -188,14 +188,14 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
     // We can now perform our query.
     try!(query.search(&searcher, &mut top_collector));

-    // Our top collector now contains are 10
+    // Our top collector now contains the 10
     // most relevant doc ids...
     let doc_addresses = top_collector.docs();

     // The actual documents still need to be
     // retrieved from Tantivy's store.
     //
-    // Since body was not configured as stored,
+    // Since the body field was not configured as stored,
     // the document returned will only contain
     // a title.

@@ -205,4 +205,4 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
     }

     Ok(())
-}
\ No newline at end of file
+}

From 9c1d08c4891b3659adea7a76474fae17e5061d37 Mon Sep 17 00:00:00 2001
From: "Michael J.
Curry" Date: Thu, 29 Sep 2016 16:31:50 -0400 Subject: [PATCH 03/19] literally one character change to README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 395acd5a4..7a5f414b6 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ It will walk you through getting a wikipedia search engine up and running in a f Tantivy has a git submodule called `simdcomp`. After cloning the repository, you will need to initialize and update -the submodules. The project can then be build using `cargo`. +the submodules. The project can then be built using `cargo`. git clone git@github.com:fulmicoton/tantivy.git git submodule init From b9ef5909ad1a82a8b7b7d33d3fedaee0ccfd9ec9 Mon Sep 17 00:00:00 2001 From: "Michael J. Curry" Date: Thu, 29 Sep 2016 16:47:02 -0400 Subject: [PATCH 04/19] small changes to doc comments --- src/collector/count_collector.rs | 4 ++-- src/lib.rs | 2 +- src/postings/docset.rs | 6 +++--- src/postings/postings.rs | 4 ++-- src/query/query_parser.rs | 8 ++++---- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/collector/count_collector.rs b/src/collector/count_collector.rs index 9eaf74165..dc1d85580 100644 --- a/src/collector/count_collector.rs +++ b/src/collector/count_collector.rs @@ -5,13 +5,13 @@ use SegmentReader; use SegmentLocalId; /// `CountCollector` collector only counts how many -/// document are matching the query. +/// documents match the query. pub struct CountCollector { count: usize, } impl CountCollector { - /// Returns the count of document that where + /// Returns the count of documents that were /// collected. pub fn count(&self,) -> usize { self.count diff --git a/src/lib.rs b/src/lib.rs index 0f335c425..15378e025 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -96,7 +96,7 @@ pub use postings::SegmentPostingsOption; /// u32 identifying a document within a segment. -/// Document gets their doc id assigned incrementally, +/// Documents have their doc id assigned incrementally, /// as they are added in the segment. pub type DocId = u32; diff --git a/src/postings/docset.rs b/src/postings/docset.rs index e7fb59315..e84c56380 100644 --- a/src/postings/docset.rs +++ b/src/postings/docset.rs @@ -4,7 +4,7 @@ use std::borrow::BorrowMut; use std::cmp::Ordering; -/// Expressed the outcome of a call to `DocSet`'s `.skip_next(...)`. +/// Expresses the outcome of a call to `DocSet`'s `.skip_next(...)`. #[derive(PartialEq, Eq, Debug)] pub enum SkipResult { /// target was in the docset @@ -24,7 +24,7 @@ pub trait DocSet { /// element. fn advance(&mut self,) -> bool; - /// After skipping position, the iterator in such a way `.doc()` + /// After skipping, position the iterator in such a way `.doc()` /// will return a value greater or equal to target. /// /// SkipResult expresses whether the `target value` was reached, overstepped, @@ -97,4 +97,4 @@ impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet { } } - \ No newline at end of file + diff --git a/src/postings/postings.rs b/src/postings/postings.rs index e3323ae19..bffbeb876 100644 --- a/src/postings/postings.rs +++ b/src/postings/postings.rs @@ -12,8 +12,8 @@ use common::HasLen; /// as well as the list of term positions. /// /// Its main implementation is `SegmentPostings`, -/// but some other implementation mocking SegmentPostings exists, -/// in order to help merging segment or for testing. +/// but other implementations mocking SegmentPostings exists, +/// in order to help merging segments or for testing. 
pub trait Postings: DocSet { /// Returns the term frequency fn term_freq(&self,) -> u32; diff --git a/src/query/query_parser.rs b/src/query/query_parser.rs index f15196a06..89876825d 100644 --- a/src/query/query_parser.rs +++ b/src/query/query_parser.rs @@ -29,7 +29,7 @@ pub enum ParsingError { /// Tantivy's Query parser /// -/// The language covered by the current is extremely simple. +/// The language covered by the current parser is extremely simple. /// /// * simple terms: "e.g.: `Barack Obama` are simply analyzed using /// tantivy's `StandardTokenizer`, hence becoming `["barack", "obama"]`. @@ -44,7 +44,7 @@ pub enum ParsingError { /// /// This behavior is slower, but is not a bad idea if the user is sorting /// by relevance : The user typically just scans through the first few -/// documents in order of decreasing relevance and will stop when the document +/// documents in order of decreasing relevance and will stop when the documents /// are not relevant anymore. /// Making it possible to make this behavior customizable is tracked in /// [issue #27](https://github.com/fulmicoton/tantivy/issues/27). @@ -135,9 +135,9 @@ impl QueryParser { /// Parse a query /// /// Note that `parse_query` returns an error if the input - /// not a valid query. + /// is not a valid query. /// - /// There is currently no lenient mode for the query parse + /// There is currently no lenient mode for the query parser /// which makes it a bad choice for a public/broad user search engine. /// /// Implementing a lenient mode for this query parser is tracked From 54437105445e9cfe0d6021612b00a4267be0cce0 Mon Sep 17 00:00:00 2001 From: "Michael J. Curry" Date: Thu, 29 Sep 2016 21:51:12 -0400 Subject: [PATCH 05/19] more minor doc text changes --- src/collector/mod.rs | 24 ++++++++++++------------ src/collector/multi_collector.rs | 4 ++-- src/collector/top_collector.rs | 22 +++++++++++----------- src/directory/directory.rs | 26 +++++++++++++------------- src/directory/mmap_directory.rs | 4 ++-- src/directory/ram_directory.rs | 8 ++++---- src/postings/docset.rs | 4 ++-- src/postings/postings.rs | 4 ++-- src/schema/mod.rs | 12 ++++++------ src/schema/schema.rs | 4 ++-- 10 files changed, 56 insertions(+), 56 deletions(-) diff --git a/src/collector/mod.rs b/src/collector/mod.rs index de4092738..81bdc330f 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -20,16 +20,16 @@ pub use self::chained_collector::chain; /// /// /// For instance, -/// - keeping track of the top 10 best documents -/// - computing a break down over a fast field -/// - computing the number of documents matching the query /// +/// - keeping track of the top 10 best documents +/// - computing a breakdown over a fast field +/// - computing the number of documents matching the query /// /// Queries are in charge of pushing the `DocSet` to the collector. /// -/// As they work on multiple segment, they first inform -/// the collector of a change in segment and then -/// call the collect method to push document to the collector. +/// As they work on multiple segments, they first inform +/// the collector of a change in a segment and then +/// call the collect method to push the document to the collector. /// /// Temporally, our collector will receive calls /// - `.set_segment(0, segment_reader_0)` @@ -45,10 +45,10 @@ pub use self::chained_collector::chain; /// /// Segments are not guaranteed to be visited in any specific order. 
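///
/// As a sketch, the smallest collector honoring this calling sequence
/// is a plain counter, much like the `CountCollector` touched earlier
/// in this series (illustrative only, not the actual implementation):
///
/// ```ignore
/// struct Counter {
///     count: usize,
/// }
///
/// impl Collector for Counter {
///     fn set_segment(&mut self, _id: SegmentLocalId, _reader: &SegmentReader) -> io::Result<()> {
///         // nothing to prepare per segment for a plain count
///         Ok(())
///     }
///
///     fn collect(&mut self, _scored_doc: ScoredDoc) {
///         // called once per matching document
///         self.count += 1;
///     }
/// }
/// ```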
pub trait Collector { - /// `set_segment` is called before starting enumerating + /// `set_segment` is called before beginning to enumerate /// on this segment. fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()>; - /// The query pushes scored document to the collector via this method. + /// The query pushes the scored document to the collector via this method. fn collect(&mut self, scored_doc: ScoredDoc); } @@ -57,7 +57,7 @@ impl<'a, C: Collector> Collector for &'a mut C { fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()> { (*self).set_segment(segment_local_id, segment) } - /// The query pushes scored document to the collector via this method. + /// The query pushes the scored document to the collector via this method. fn collect(&mut self, scored_doc: ScoredDoc) { (*self).collect(scored_doc); } @@ -120,10 +120,10 @@ pub mod tests { - /// Collects in order all of the fast field for all of the - /// doc of the `DocSet` + /// Collects in order all of the fast fields for all of the + /// doc in the `DocSet` /// - /// This collector is essentially useful for tests. + /// This collector is mainly useful for tests. pub struct FastFieldTestCollector { vals: Vec, field: Field, diff --git a/src/collector/multi_collector.rs b/src/collector/multi_collector.rs index 092547c78..92958018d 100644 --- a/src/collector/multi_collector.rs +++ b/src/collector/multi_collector.rs @@ -5,7 +5,7 @@ use SegmentReader; use SegmentLocalId; -/// Multicollector makes it possible to collect on more than one collector +/// Multicollector makes it possible to collect on more than one collector. /// It should only be used for use cases where the Collector types is unknown /// at compile time. /// If the type of the collectors is known, you should prefer to use `ChainedCollector`. @@ -60,4 +60,4 @@ mod tests { assert_eq!(count_collector.count(), 3); assert!(top_collector.at_capacity()); } -} \ No newline at end of file +} diff --git a/src/collector/top_collector.rs b/src/collector/top_collector.rs index c04c82774..98641594d 100644 --- a/src/collector/top_collector.rs +++ b/src/collector/top_collector.rs @@ -50,7 +50,7 @@ pub struct TopCollector { impl TopCollector { - /// Creates a top collector, with a number of document of "limit" + /// Creates a top collector, with a number of documents equal to "limit". /// /// # Panics /// The method panics if limit is 0 @@ -65,9 +65,9 @@ impl TopCollector { } } - /// Returns the decreasingly sorted K-best documents. + /// Returns K best documents sorted in decreasing order. /// - /// Calling this method will triggers the sort. + /// Calling this method triggers the sort. /// The result of the sort is not cached. pub fn docs(&self) -> Vec { self.score_docs() @@ -76,9 +76,9 @@ impl TopCollector { .collect() } - /// Returns the decreasingly sorted K-best ScoredDocument. + /// Returns K best ScoredDocument sorted in decreasing order. /// - /// Calling this method will triggers the sort. + /// Calling this method triggers the sort. /// The result of the sort is not cached. pub fn score_docs(&self) -> Vec<(Score, DocAddress)> { let mut scored_docs: Vec = self.heap @@ -90,9 +90,9 @@ impl TopCollector { .map(|GlobalScoredDoc(score, doc_address)| (score, doc_address)) .collect() } - - /// Return true iff at least K document have gone through - /// the collector. + + /// Return true iff at least K documents have gone through + /// the collector. 
#[inline] pub fn at_capacity(&self, ) -> bool { self.heap.len() >= self.limit @@ -176,8 +176,8 @@ mod tests { .collect(); assert_eq!(docs, vec!(7, 1, 5, 3)); } - - + + } #[test] @@ -185,4 +185,4 @@ mod tests { fn test_top_0() { TopCollector::with_limit(0); } -} \ No newline at end of file +} diff --git a/src/directory/directory.rs b/src/directory/directory.rs index 08f602251..2a77825c2 100644 --- a/src/directory/directory.rs +++ b/src/directory/directory.rs @@ -9,7 +9,7 @@ use std::marker::Sync; /// Write-once read many (WORM) abstraction for where tantivy's index should be stored. /// -/// There is currently two implementations of `Directory` +/// There are currently two implementations of `Directory` /// /// - The [`MMapDirectory`](struct.MmapDirectory.html), this /// should be your default choice. @@ -20,19 +20,19 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static { /// Opens a virtual file for read. /// - /// Once a virtualfile is open, its data may not + /// Once a virtual file is open, its data may not /// change. /// - /// Specifically, subsequent write or flush should - /// have no effect the returned `ReadOnlySource` object. + /// Specifically, subsequent writes or flushes should + /// have no effect on the returned `ReadOnlySource` object. fn open_read(&self, path: &Path) -> result::Result; /// Removes a file /// - /// Removing a file will not affect eventual + /// Removing a file will not affect an eventual /// existing ReadOnlySource pointing to it. /// - /// Removing a non existing files, yields a + /// Removing a nonexistent file, yields a /// `FileError::DoesNotExist`. fn delete(&self, path: &Path) -> result::Result<(), FileError>; @@ -44,28 +44,28 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static { /// same path should return a `ReadOnlySource`. /// /// Write operations may be aggressively buffered. - /// The client of this trait is in charge to call flush + /// The client of this trait is responsible for calling flush /// to ensure that subsequent `read` operations - /// will take in account preceding `write` operations. + /// will take into account preceding `write` operations. /// /// Flush operation should also be persistent. /// - /// User shall not rely on `Drop` triggering `flush`. + /// The user shall not rely on `Drop` triggering `flush`. /// Note that `RAMDirectory` will panic! if `flush` /// was not called. /// - /// The file may not previously exists. + /// The file may not previously exist. fn open_write(&mut self, path: &Path) -> Result; - /// Atomically replace the content of a file by data. + /// Atomically replace the content of a file with data. /// /// This calls ensure that reads can never *observe* /// a partially written file. /// - /// The file may or may not previously exists. + /// The file may or may not previously exist. fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()>; - /// Clone the directory and boxes the clone + /// Clones the directory and boxes the clone fn box_clone(&self) -> Box; } diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs index eefa760b1..4117494c0 100644 --- a/src/directory/mmap_directory.rs +++ b/src/directory/mmap_directory.rs @@ -47,7 +47,7 @@ impl MmapDirectory { /// Creates a new MmapDirectory in a temporary directory. /// /// This is mostly useful to test the MmapDirectory itself. - /// For your unit test, prefer the RAMDirectory. + /// For your unit tests, prefer the RAMDirectory. 
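    ///
    /// A sketch of the `Directory` contract both implementations
    /// share, run against an in-memory directory (`dir` stands for a
    /// `RAMDirectory` value; its construction is outside this diff):
    ///
    /// ```ignore
    /// let path = Path::new("some_file");
    /// // atomic_write creates the file and publishes its content in
    /// // one step, so open_read can never observe a partial write.
    /// dir.atomic_write(path, b"hello").unwrap();
    /// let data = dir.open_read(path).unwrap();
    /// ```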
pub fn create_from_tempdir() -> io::Result { let tempdir = try!(TempDir::new("index")); let tempdir_path = PathBuf::from(tempdir.path()); @@ -81,7 +81,7 @@ impl MmapDirectory { } /// Joins a relative_path to the directory `root_path` - /// to create proper complete `filepath`. + /// to create a proper complete `filepath`. fn resolve_path(&self, relative_path: &Path) -> PathBuf { self.root_path.join(relative_path) } diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index 8a002a088..d3b408985 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -11,7 +11,7 @@ use directory::error::{OpenWriteError, FileError}; use directory::WritePtr; use super::shared_vec_slice::SharedVecSlice; -/// Writer associated to the `RAMDirectory` +/// Writer associated with the `RAMDirectory` /// /// The Writer just writes a buffer. /// @@ -133,9 +133,9 @@ impl fmt::Debug for RAMDirectory { } -/// Directory storing everything in anonymous memory. +/// A Directory storing everything in anonymous memory. /// -/// It's main purpose is unit test. +/// It is mainly meant for unit testing. /// Writes are only made visible upon flushing. /// #[derive(Clone)] @@ -161,7 +161,7 @@ impl Directory for RAMDirectory { fn open_write(&mut self, path: &Path) -> Result { let path_buf = PathBuf::from(path); let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone()); - // force the creation of the file to mimick the MMap directory. + // force the creation of the file to mimic the MMap directory. if try!(self.fs.write(path_buf.clone(), &Vec::new())) { Err(OpenWriteError::FileAlreadyExists(path_buf)) } diff --git a/src/postings/docset.rs b/src/postings/docset.rs index e84c56380..db40db619 100644 --- a/src/postings/docset.rs +++ b/src/postings/docset.rs @@ -24,8 +24,8 @@ pub trait DocSet { /// element. fn advance(&mut self,) -> bool; - /// After skipping, position the iterator in such a way `.doc()` - /// will return a value greater or equal to target. + /// After skipping, position the iterator in such a way that `.doc()` + /// will return a value greater than or equal to target. /// /// SkipResult expresses whether the `target value` was reached, overstepped, /// or if the `DocSet` was entirely consumed without finding any value diff --git a/src/postings/postings.rs b/src/postings/postings.rs index bffbeb876..071068c95 100644 --- a/src/postings/postings.rs +++ b/src/postings/postings.rs @@ -12,8 +12,8 @@ use common::HasLen; /// as well as the list of term positions. /// /// Its main implementation is `SegmentPostings`, -/// but other implementations mocking SegmentPostings exists, -/// in order to help merging segments or for testing. +/// but other implementations mocking SegmentPostings exist, +/// in order to help when merging segments or for testing. pub trait Postings: DocSet { /// Returns the term frequency fn term_freq(&self,) -> u32; diff --git a/src/schema/mod.rs b/src/schema/mod.rs index 2abde20ba..5be6bc512 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -4,7 +4,7 @@ # Schema definition Tantivy has a very strict schema. 
-The schema defines information about the fields your index contains, that is for each field : +The schema defines information about the fields your index contains, that is, for each field : * the field name (may only contain letters `[a-zA-Z]`, number `[0-9]`, and `_`) * the type of the field (currently only `text` and `u32` are supported) @@ -37,20 +37,20 @@ let schema = schema_builder.build(); We can split the problem of generating a search result page into two phases : -* identifying the list of 10 or so document to be displayed (Conceptually `query -> doc_ids[]`) +* identifying the list of 10 or so documents to be displayed (Conceptually `query -> doc_ids[]`) * for each of these documents, retrieving the information required to generate the serp page. (`doc_ids[] -> Document[]`) -In the first phase, the hability to search for documents by the given field, is determined by the [`TextIndexingOptions`](enum.TextIndexingOptions.html) of our +In the first phase, the ability to search for documents by the given field is determined by the [`TextIndexingOptions`](enum.TextIndexingOptions.html) of our [`TextOptions`](struct.TextOptions.html). -The effect of each possible settings is described more in detail [`TextIndexingOptions`](enum.TextIndexingOptions.html). +The effect of each possible setting is described more in detail [`TextIndexingOptions`](enum.TextIndexingOptions.html). On the other hand setting the field as stored or not determines whether the field should be returned when [`searcher.doc(doc_address)`](../struct.Searcher.html#method.doc) is called. ### Shortcuts -For convenience, a few special value of `TextOptions` for your convenience. +For convenience, a few special values of `TextOptions`. They can be composed using the `|` operator. The example can be rewritten : @@ -82,7 +82,7 @@ Just like for Text fields (see above), setting the field as stored defines whether the field will be returned when [`searcher.doc(doc_address)`](../struct.Searcher.html#method.doc) is called, and setting the field as indexed means that we will be able perform queries such as `num_stars:10`. -Note that contrary to text fields, u32 can only be indexed in one way for the moment. +Note that unlike text fields, u32 can only be indexed in one way for the moment. This may change when we will start supporting range queries. The `fast` option on the other hand is specific to u32 fields, and is only relevant diff --git a/src/schema/schema.rs b/src/schema/schema.rs index f41d5572f..8d6e3bedd 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -15,7 +15,7 @@ use std::fmt; /// Tantivy has a very strict schema. -/// You need to specify in advance, whether a field is indexed or not, +/// You need to specify in advance whether a field is indexed or not, /// stored or not, and RAM-based or not. /// /// This is done by creating a schema object, and @@ -483,4 +483,4 @@ mod tests { } } } -} \ No newline at end of file +} From 784043b8eb3cb269cacb414bb1c3c0e6d20c4c3c Mon Sep 17 00:00:00 2001 From: "Michael J. 
Curry" Date: Fri, 30 Sep 2016 11:16:56 -0400 Subject: [PATCH 06/19] more small changes --- src/collector/count_collector.rs | 3 +-- src/collector/mod.rs | 2 +- src/core/index.rs | 16 ++++++++-------- src/core/segment_reader.rs | 10 +++++----- 4 files changed, 15 insertions(+), 16 deletions(-) diff --git a/src/collector/count_collector.rs b/src/collector/count_collector.rs index dc1d85580..44d547ec3 100644 --- a/src/collector/count_collector.rs +++ b/src/collector/count_collector.rs @@ -20,8 +20,7 @@ impl CountCollector { impl Default for CountCollector { fn default() -> CountCollector { - CountCollector { - count: 0, + CountCollector {count: 0, } } } diff --git a/src/collector/mod.rs b/src/collector/mod.rs index 81bdc330f..683b7eb1c 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -29,7 +29,7 @@ pub use self::chained_collector::chain; /// /// As they work on multiple segments, they first inform /// the collector of a change in a segment and then -/// call the collect method to push the document to the collector. +/// call the `collect` method to push the document to the collector. /// /// Temporally, our collector will receive calls /// - `.set_segment(0, segment_reader_0)` diff --git a/src/core/index.rs b/src/core/index.rs index ac2b287cf..86b620319 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -90,7 +90,7 @@ impl Index { /// Creates a new index in a temp directory. /// /// The index will use the `MMapDirectory` in a newly created directory. - /// The temp directory will be destroyed automatically when the Index object + /// The temp directory will be destroyed automatically when the `Index` object /// is destroyed. /// /// The temp directory is only used for testing the `MmapDirectory`. @@ -100,7 +100,7 @@ impl Index { Index::from_directory(directory, schema) } - /// Creates a new index given a directory and an IndexMeta. + /// Creates a new index given a directory and an `IndexMeta`. fn create_from_metas(directory: Box, metas: IndexMeta) -> Result { let schema = metas.schema.clone(); let index = Index { @@ -160,7 +160,7 @@ impl Index { /// Marks the segment as published. // TODO find a rusty way to hide that, while keeping - // it visible for IndexWriters. + // it visible for `IndexWriter`s. pub fn publish_segments(&mut self, segment_ids: &[SegmentId], docstamp: u64) -> Result<()> { @@ -204,7 +204,7 @@ impl Index { } - /// Return a segment object given a segment_id + /// Return a segment object given a `segment_id` /// /// The segment may or may not exist. fn segment(&self, segment_id: SegmentId) -> Segment { @@ -246,7 +246,7 @@ impl Index { /// Either // - it fails, in which case an error is returned, /// and the `meta.json` remains untouched, - /// - it success, and `meta.json` is written + /// - it succeeds, and `meta.json` is written /// and flushed. pub fn save_metas(&mut self,) -> Result<()> { let mut w = Vec::new(); @@ -286,9 +286,9 @@ impl Index { /// /// This method should be called every single time a search /// query is performed. - /// The searcher are taken from a pool of `NUM_SEARCHERS` searchers. + /// The searchers are taken from a pool of `NUM_SEARCHERS` searchers. /// If no searcher is available - /// it may block. + /// this may block. /// /// The same searcher must be used for a given query, as it ensures /// the use of a consistent segment set. 
@@ -313,4 +313,4 @@ impl Clone for Index { searcher_pool: self.searcher_pool.clone(), } } -} \ No newline at end of file +} diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index be1e46ec2..148e34ab2 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -25,7 +25,7 @@ use schema::TextIndexingOptions; use error::Error; -/// Entrypoint to access all of the datastructures of the `Segment` +/// Entry point to access all of the datastructures of the `Segment` /// /// - term dictionary /// - postings @@ -34,8 +34,8 @@ use error::Error; /// - field norm reader /// /// The segment reader has a very low memory footprint, -/// as close to all of the memory data is in Mmapped. -/// +/// as close to all of the memory data is mmapped. +/// pub struct SegmentReader { segment_info: SegmentInfo, segment_id: SegmentId, @@ -51,7 +51,7 @@ pub struct SegmentReader { impl SegmentReader { /// Returns the highest document id ever attributed in /// this segment + 1. - /// Today, `tantivy` does not handle deletes so, it happens + /// Today, `tantivy` does not handle deletes, so it happens /// to also be the number of documents in the index. pub fn max_doc(&self) -> DocId { self.segment_info.max_doc @@ -233,7 +233,7 @@ impl SegmentReader { self.read_postings(term, segment_posting_option) } - /// Returns the term info of associated with the term. + /// Returns the term info associated with the term. pub fn get_term_info(&self, term: &Term) -> Option { self.term_infos.get(term.as_slice()) } From a2cc0ff58c6fc6054697eedbc441151c0342a66f Mon Sep 17 00:00:00 2001 From: "Michael J. Curry" Date: Fri, 30 Sep 2016 11:17:47 -0400 Subject: [PATCH 07/19] readme updates --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7a5f414b6..75b0b13a6 100644 --- a/README.md +++ b/README.md @@ -13,10 +13,10 @@ It is strongly inspired by Lucene's design. # Features - configurable indexing (optional term frequency and position indexing) -- Tf-Idf scoring +- tf-idf scoring - Basic query language - Incremental indexing -- Multithreaded indexing (indexing en wikipedia takes 4mn on my desktop) +- Multithreaded indexing (indexing english Wikipedia takes 4 minutes on my desktop) - Mmap based - SIMD integer compression - u32 fast fields (equivalent of doc values in Lucene) From ac9aa5cb5e11031ffdddbd34f0ce06583c3456c9 Mon Sep 17 00:00:00 2001 From: "Michael J. Curry" Date: Fri, 30 Sep 2016 11:19:00 -0400 Subject: [PATCH 08/19] literally one character changed --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 75b0b13a6..f62295e2b 100644 --- a/README.md +++ b/README.md @@ -16,8 +16,8 @@ It is strongly inspired by Lucene's design. - tf-idf scoring - Basic query language - Incremental indexing -- Multithreaded indexing (indexing english Wikipedia takes 4 minutes on my desktop) -- Mmap based +- Multithreaded indexing (indexing English Wikipedia takes 4 minutes on my desktop) +- mmap based - SIMD integer compression - u32 fast fields (equivalent of doc values in Lucene) - LZ4 compressed document store From 5986f729eeaf03d9098ed1021ddfa531d071f79d Mon Sep 17 00:00:00 2001 From: "Michael J. 
Curry" Date: Sun, 2 Oct 2016 21:08:47 -0400 Subject: [PATCH 09/19] acquire and create lockfile when creating IndexWriter --- src/indexer/index_writer.rs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 519d34825..98f9f3571 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -8,6 +8,7 @@ use std::thread::JoinHandle; use indexer::SegmentWriter; use std::clone::Clone; use std::io; +use std::path::Path; use std::thread; use std::collections::HashSet; use indexer::merger::IndexMerger; @@ -15,6 +16,7 @@ use core::SegmentId; use datastruct::stacker::Heap; use std::mem::swap; use chan; +use directory::WritePtr; use Result; use Error; @@ -29,6 +31,8 @@ pub const HEAP_SIZE_LIMIT: u32 = MARGIN_IN_BYTES * 3u32; // Add document will block if the number of docs waiting in the queue to be indexed reaches PIPELINE_MAX_SIZE_IN_DOCS const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000; +pub const LOCKFILE_NAME: &'static str = ".tantivy-indexer.lock"; + type DocumentSender = chan::Sender; type DocumentReceiver = chan::Receiver; @@ -51,7 +55,8 @@ pub struct IndexWriter { document_receiver: DocumentReceiver, document_sender: DocumentSender, num_threads: usize, - docstamp: u64, + docstamp: u64, + lockfile: WritePtr } @@ -121,11 +126,16 @@ impl IndexWriter { if heap_size_in_bytes_per_thread <= HEAP_SIZE_LIMIT as usize { panic!(format!("The heap size per thread needs to be at least {}.", HEAP_SIZE_LIMIT)); } + + let mut cloned_index = index.clone(); + let lockfile_path = Path::new(LOCKFILE_NAME); + + let lf = try!(cloned_index.directory_mut().open_write(lockfile_path)); let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) = chan::sync(PIPELINE_MAX_SIZE_IN_DOCS); let (segment_ready_sender, segment_ready_receiver): (NewSegmentSender, NewSegmentReceiver) = chan::async(); let mut index_writer = IndexWriter { heap_size_in_bytes_per_thread: heap_size_in_bytes_per_thread, - index: index.clone(), + index: cloned_index, segment_ready_receiver: segment_ready_receiver, segment_ready_sender: segment_ready_sender, document_receiver: document_receiver, @@ -133,6 +143,7 @@ impl IndexWriter { workers_join_handle: Vec::new(), num_threads: num_threads, docstamp: try!(index.docstamp()), + lockfile: lf }; try!(index_writer.start_workers()); Ok(index_writer) @@ -336,4 +347,4 @@ mod tests { index.searcher(); } -} \ No newline at end of file +} From f8011c73eda74743e3adec62dc640c1a32b5e7ef Mon Sep 17 00:00:00 2001 From: "Michael J. Curry" Date: Sun, 2 Oct 2016 21:15:57 -0400 Subject: [PATCH 10/19] delete lockfile on drop IndexWriter --- src/indexer/index_writer.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 98f9f3571..04221876f 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -78,7 +78,15 @@ fn index_documents(heap: &mut Heap, Ok(num_docs) } - +impl Drop for IndexWriter { + fn drop(&mut self) { + let lockfile_path = Path::new(LOCKFILE_NAME); + match self.index.directory_mut().delete(lockfile_path) { + Ok(_) => (), + Err(_) => () + } + } +} impl IndexWriter { /// Spawns a new worker thread for indexing. From d580b8a5acce2ce33efec8dbf366298dddcc2d55 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Curry" Date: Sun, 2 Oct 2016 21:19:25 -0400 Subject: [PATCH 11/19] don't actually store a pointer to the lockfile, there's no need --- src/indexer/index_writer.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 04221876f..824317ed2 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -56,7 +56,6 @@ pub struct IndexWriter { document_sender: DocumentSender, num_threads: usize, docstamp: u64, - lockfile: WritePtr } @@ -138,7 +137,7 @@ impl IndexWriter { let mut cloned_index = index.clone(); let lockfile_path = Path::new(LOCKFILE_NAME); - let lf = try!(cloned_index.directory_mut().open_write(lockfile_path)); + try!(cloned_index.directory_mut().open_write(lockfile_path)); let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) = chan::sync(PIPELINE_MAX_SIZE_IN_DOCS); let (segment_ready_sender, segment_ready_receiver): (NewSegmentSender, NewSegmentReceiver) = chan::async(); let mut index_writer = IndexWriter { @@ -151,7 +150,6 @@ impl IndexWriter { workers_join_handle: Vec::new(), num_threads: num_threads, docstamp: try!(index.docstamp()), - lockfile: lf }; try!(index_writer.start_workers()); Ok(index_writer) From bb42606a9bca230df3d92dd132634ebcb5d41e22 Mon Sep 17 00:00:00 2001 From: "Michael J. Curry" Date: Sun, 2 Oct 2016 21:32:49 -0400 Subject: [PATCH 12/19] added tests for correct handling of lockfiles --- src/indexer/index_writer.rs | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 824317ed2..3bfaede25 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -310,6 +310,28 @@ mod tests { use Index; use Term; + #[test] + #[should_panic] + fn test_lockfile_stops_duplicates() { + + let mut schema_builder = schema::SchemaBuilder::default(); + let text_field = schema_builder.add_text_field("text", schema::TEXT); + let index = Index::create_in_ram(schema_builder.build()); + let index_writer = index.writer(40_000_000).unwrap(); + let index_writer_two = index.writer(40_000_000).unwrap(); + } + + #[test] + fn test_lockfile_released_on_drop() { + let mut schema_builder = schema::SchemaBuilder::default(); + let text_field = schema_builder.add_text_field("text", schema::TEXT); + let index = Index::create_in_ram(schema_builder.build()); + { + let index_writer = index.writer(40_000_000).unwrap(); + } + + let index_writer_two = index.writer(40_000_000).unwrap(); + } #[test] fn test_commit_and_rollback() { let mut schema_builder = schema::SchemaBuilder::default(); From 1fb75a8627e18f1edb9d02d4bd4d8359ac121bbf Mon Sep 17 00:00:00 2001 From: "Michael J. 
Curry" Date: Sun, 2 Oct 2016 21:56:01 -0400 Subject: [PATCH 13/19] check for specific error rather than any panic --- src/indexer/index_writer.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 3bfaede25..631c184b3 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -309,16 +309,20 @@ mod tests { use schema::{self, Document}; use Index; use Term; + use Error; + use directory::error::OpenWriteError; #[test] - #[should_panic] fn test_lockfile_stops_duplicates() { let mut schema_builder = schema::SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", schema::TEXT); let index = Index::create_in_ram(schema_builder.build()); let index_writer = index.writer(40_000_000).unwrap(); - let index_writer_two = index.writer(40_000_000).unwrap(); + match index.writer(40_000_000) { + Err(Error::FileAlreadyExists(_)) => {}, + _ => panic!("Expected FileAlreadyExists error") + } } #[test] From 6fc412f70a17cee221ccc95fb6b36a8e811080c0 Mon Sep 17 00:00:00 2001 From: "Michael J. Curry" Date: Sun, 2 Oct 2016 22:04:04 -0400 Subject: [PATCH 14/19] formatting and removing some unused vars and imports --- src/indexer/index_writer.rs | 545 ++++++++++++++++++------------------ 1 file changed, 271 insertions(+), 274 deletions(-) diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 631c184b3..1cf4fa9c8 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -16,7 +16,6 @@ use core::SegmentId; use datastruct::stacker::Heap; use std::mem::swap; use chan; -use directory::WritePtr; use Result; use Error; @@ -47,34 +46,36 @@ type NewSegmentReceiver = chan::Receiver>; /// Each indexing thread builds its own independant `Segment`, via /// a `SegmentWriter` object. 
pub struct IndexWriter { - index: Index, - heap_size_in_bytes_per_thread: usize, - workers_join_handle: Vec>, - segment_ready_sender: NewSegmentSender, - segment_ready_receiver: NewSegmentReceiver, - document_receiver: DocumentReceiver, - document_sender: DocumentSender, - num_threads: usize, - docstamp: u64, + index: Index, + heap_size_in_bytes_per_thread: usize, + workers_join_handle: Vec>, + segment_ready_sender: NewSegmentSender, + segment_ready_receiver: NewSegmentReceiver, + document_receiver: DocumentReceiver, + document_sender: DocumentSender, + num_threads: usize, + docstamp: u64, } fn index_documents(heap: &mut Heap, - segment: Segment, - schema: &Schema, - document_iterator: &mut Iterator) -> Result { - heap.clear(); - let mut segment_writer = try!(SegmentWriter::for_segment(heap, segment, &schema)); - for doc in document_iterator { - try!(segment_writer.add_document(&doc, &schema)); - if segment_writer.is_buffer_full() { - info!("Buffer limit reached, flushing segment with maxdoc={}.", segment_writer.max_doc()); - break; - } - } - let num_docs = segment_writer.max_doc() as usize; - try!(segment_writer.finalize()); - Ok(num_docs) + segment: Segment, + schema: &Schema, + document_iterator: &mut Iterator) + -> Result { + heap.clear(); + let mut segment_writer = try!(SegmentWriter::for_segment(heap, segment, &schema)); + for doc in document_iterator { + try!(segment_writer.add_document(&doc, &schema)); + if segment_writer.is_buffer_full() { + info!("Buffer limit reached, flushing segment with maxdoc={}.", + segment_writer.max_doc()); + break; + } + } + let num_docs = segment_writer.max_doc() as usize; + try!(segment_writer.finalize()); + Ok(num_docs) } impl Drop for IndexWriter { @@ -82,223 +83,222 @@ impl Drop for IndexWriter { let lockfile_path = Path::new(LOCKFILE_NAME); match self.index.directory_mut().delete(lockfile_path) { Ok(_) => (), - Err(_) => () + Err(_) => (), } } } impl IndexWriter { + /// Spawns a new worker thread for indexing. + /// The thread consumes documents from the pipeline. + /// + fn add_indexing_worker(&mut self) -> Result<()> { + let index = self.index.clone(); + let schema = self.index.schema(); + let segment_ready_sender_clone = self.segment_ready_sender.clone(); + let document_receiver_clone = self.document_receiver.clone(); - /// Spawns a new worker thread for indexing. - /// The thread consumes documents from the pipeline. - /// - fn add_indexing_worker(&mut self,) -> Result<()> { - let index = self.index.clone(); - let schema = self.index.schema(); - let segment_ready_sender_clone = self.segment_ready_sender.clone(); - let document_receiver_clone = self.document_receiver.clone(); - - let mut heap = Heap::with_capacity(self.heap_size_in_bytes_per_thread); - let join_handle: JoinHandle<()> = thread::spawn(move || { - loop { - let segment = index.new_segment(); - let segment_id = segment.id(); - let mut document_iterator = document_receiver_clone - .clone() - .into_iter() - .peekable(); - // the peeking here is to avoid - // creating a new segment's files - // if no document are available. - if document_iterator.peek().is_some() { - let index_result = index_documents(&mut heap, segment, &schema, &mut document_iterator) - .map(|num_docs| (segment_id, num_docs)); - segment_ready_sender_clone.send(index_result); - } - else { - return; - } - } - }); - self.workers_join_handle.push(join_handle); - Ok(()) - } - - /// Open a new index writer - /// - /// num_threads tells the number of indexing worker that - /// should work at the same time. 
- pub fn open(index: &Index, - num_threads: usize, - heap_size_in_bytes_per_thread: usize) -> Result { - if heap_size_in_bytes_per_thread <= HEAP_SIZE_LIMIT as usize { - panic!(format!("The heap size per thread needs to be at least {}.", HEAP_SIZE_LIMIT)); - } + let mut heap = Heap::with_capacity(self.heap_size_in_bytes_per_thread); + let join_handle: JoinHandle<()> = thread::spawn(move || { + loop { + let segment = index.new_segment(); + let segment_id = segment.id(); + let mut document_iterator = document_receiver_clone.clone() + .into_iter() + .peekable(); + // the peeking here is to avoid + // creating a new segment's files + // if no document are available. + if document_iterator.peek().is_some() { + let index_result = + index_documents(&mut heap, segment, &schema, &mut document_iterator) + .map(|num_docs| (segment_id, num_docs)); + segment_ready_sender_clone.send(index_result); + } else { + return; + } + } + }); + self.workers_join_handle.push(join_handle); + Ok(()) + } - let mut cloned_index = index.clone(); - let lockfile_path = Path::new(LOCKFILE_NAME); + /// Open a new index writer + /// + /// num_threads tells the number of indexing worker that + /// should work at the same time. + pub fn open(index: &Index, + num_threads: usize, + heap_size_in_bytes_per_thread: usize) + -> Result { + if heap_size_in_bytes_per_thread <= HEAP_SIZE_LIMIT as usize { + panic!(format!("The heap size per thread needs to be at least {}.", + HEAP_SIZE_LIMIT)); + } - try!(cloned_index.directory_mut().open_write(lockfile_path)); - let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) = chan::sync(PIPELINE_MAX_SIZE_IN_DOCS); - let (segment_ready_sender, segment_ready_receiver): (NewSegmentSender, NewSegmentReceiver) = chan::async(); - let mut index_writer = IndexWriter { - heap_size_in_bytes_per_thread: heap_size_in_bytes_per_thread, - index: cloned_index, - segment_ready_receiver: segment_ready_receiver, - segment_ready_sender: segment_ready_sender, - document_receiver: document_receiver, - document_sender: document_sender, - workers_join_handle: Vec::new(), - num_threads: num_threads, - docstamp: try!(index.docstamp()), - }; - try!(index_writer.start_workers()); - Ok(index_writer) - } + let mut cloned_index = index.clone(); + let lockfile_path = Path::new(LOCKFILE_NAME); - fn start_workers(&mut self,) -> Result<()> { - for _ in 0 .. 
self.num_threads { - try!(self.add_indexing_worker()); - } - Ok(()) - } - - /// Merges a given list of segments - pub fn merge(&mut self, segments: &[Segment]) -> Result<()> { - let schema = self.index.schema(); - let merger = try!(IndexMerger::open(schema, segments)); - let mut merged_segment = self.index.new_segment(); - let segment_serializer = try!(SegmentSerializer::for_segment(&mut merged_segment)); - try!(merger.write(segment_serializer)); - let merged_segment_ids: HashSet = segments.iter().map(|segment| segment.id()).collect(); - try!(self.index.publish_merge_segment(merged_segment_ids, merged_segment.id())); - Ok(()) - } + try!(cloned_index.directory_mut().open_write(lockfile_path)); + let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) = + chan::sync(PIPELINE_MAX_SIZE_IN_DOCS); + let (segment_ready_sender, segment_ready_receiver): (NewSegmentSender, NewSegmentReceiver) = chan::async(); + let mut index_writer = IndexWriter { + heap_size_in_bytes_per_thread: heap_size_in_bytes_per_thread, + index: cloned_index, + segment_ready_receiver: segment_ready_receiver, + segment_ready_sender: segment_ready_sender, + document_receiver: document_receiver, + document_sender: document_sender, + workers_join_handle: Vec::new(), + num_threads: num_threads, + docstamp: try!(index.docstamp()), + }; + try!(index_writer.start_workers()); + Ok(index_writer) + } - /// Closes the current document channel send. - /// and replace all the channels by new ones. - /// - /// The current workers will keep on indexing - /// the pending document and stop - /// when no documents are remaining. - /// - /// Returns the former segment_ready channel. - fn recreate_channels(&mut self,) -> (DocumentReceiver, chan::Receiver>) { - let (mut document_sender, mut document_receiver): (DocumentSender, DocumentReceiver) = chan::sync(PIPELINE_MAX_SIZE_IN_DOCS); - let (mut segment_ready_sender, mut segment_ready_receiver): (NewSegmentSender, NewSegmentReceiver) = chan::async(); - swap(&mut self.document_sender, &mut document_sender); - swap(&mut self.document_receiver, &mut document_receiver); - swap(&mut self.segment_ready_sender, &mut segment_ready_sender); - swap(&mut self.segment_ready_receiver, &mut segment_ready_receiver); - (document_receiver, segment_ready_receiver) - } + fn start_workers(&mut self) -> Result<()> { + for _ in 0..self.num_threads { + try!(self.add_indexing_worker()); + } + Ok(()) + } + + /// Merges a given list of segments + pub fn merge(&mut self, segments: &[Segment]) -> Result<()> { + let schema = self.index.schema(); + let merger = try!(IndexMerger::open(schema, segments)); + let mut merged_segment = self.index.new_segment(); + let segment_serializer = try!(SegmentSerializer::for_segment(&mut merged_segment)); + try!(merger.write(segment_serializer)); + let merged_segment_ids: HashSet = + segments.iter().map(|segment| segment.id()).collect(); + try!(self.index.publish_merge_segment(merged_segment_ids, merged_segment.id())); + Ok(()) + } + + /// Closes the current document channel send. + /// and replace all the channels by new ones. + /// + /// The current workers will keep on indexing + /// the pending document and stop + /// when no documents are remaining. + /// + /// Returns the former segment_ready channel. 
+ fn recreate_channels(&mut self) + -> (DocumentReceiver, chan::Receiver>) { + let (mut document_sender, mut document_receiver): (DocumentSender, DocumentReceiver) = + chan::sync(PIPELINE_MAX_SIZE_IN_DOCS); + let (mut segment_ready_sender, mut segment_ready_receiver): (NewSegmentSender, + NewSegmentReceiver) = + chan::async(); + swap(&mut self.document_sender, &mut document_sender); + swap(&mut self.document_receiver, &mut document_receiver); + swap(&mut self.segment_ready_sender, &mut segment_ready_sender); + swap(&mut self.segment_ready_receiver, + &mut segment_ready_receiver); + (document_receiver, segment_ready_receiver) + } - /// Rollback to the last commit - /// - /// This cancels all of the update that - /// happened before after the last commit. - /// After calling rollback, the index is in the same - /// state as it was after the last commit. - /// - /// The docstamp at the last commit is returned. - pub fn rollback(&mut self,) -> Result { + /// Rollback to the last commit + /// + /// This cancels all of the update that + /// happened before after the last commit. + /// After calling rollback, the index is in the same + /// state as it was after the last commit. + /// + /// The docstamp at the last commit is returned. + pub fn rollback(&mut self) -> Result { - // we cannot drop segment ready receiver yet - // as it would block the workers. - let (document_receiver, mut _segment_ready_receiver) = self.recreate_channels(); + // we cannot drop segment ready receiver yet + // as it would block the workers. + let (document_receiver, mut _segment_ready_receiver) = self.recreate_channels(); - // consumes the document receiver pipeline - // worker don't need to index the pending documents. - for _ in document_receiver {}; + // consumes the document receiver pipeline + // worker don't need to index the pending documents. + for _ in document_receiver {} - let mut former_workers_join_handle = Vec::new(); - swap(&mut former_workers_join_handle, &mut self.workers_join_handle); - - // wait for all the worker to finish their work - // (it should be fast since we consumed all pending documents) - for worker_handle in former_workers_join_handle { - try!(worker_handle - .join() - .map_err(|e| Error::ErrorInThread(format!("{:?}", e))) - ); - // add a new worker for the next generation. - try!(self.add_indexing_worker()); - } + let mut former_workers_join_handle = Vec::new(); + swap(&mut former_workers_join_handle, + &mut self.workers_join_handle); - // reset the docstamp to what it was before - self.docstamp = try!(self.index.docstamp()); - Ok(self.docstamp) - } + // wait for all the worker to finish their work + // (it should be fast since we consumed all pending documents) + for worker_handle in former_workers_join_handle { + try!(worker_handle.join() + .map_err(|e| Error::ErrorInThread(format!("{:?}", e)))); + // add a new worker for the next generation. + try!(self.add_indexing_worker()); + } + + // reset the docstamp to what it was before + self.docstamp = try!(self.index.docstamp()); + Ok(self.docstamp) + } - /// Commits all of the pending changes - /// - /// A call to commit blocks. - /// After it returns, all of the document that - /// were added since the last commit are published - /// and persisted. - /// - /// In case of a crash or an hardware failure (as - /// long as the hard disk is spared), it will be possible - /// to resume indexing from this point. - /// - /// Commit returns the `docstamp` of the last document - /// that made it in the commit. 
- /// - pub fn commit(&mut self,) -> Result { - - let (document_receiver, segment_ready_receiver) = self.recreate_channels(); - drop(document_receiver); + /// Commits all of the pending changes + /// + /// A call to commit blocks. + /// After it returns, all of the document that + /// were added since the last commit are published + /// and persisted. + /// + /// In case of a crash or an hardware failure (as + /// long as the hard disk is spared), it will be possible + /// to resume indexing from this point. + /// + /// Commit returns the `docstamp` of the last document + /// that made it in the commit. + /// + pub fn commit(&mut self) -> Result { - // Docstamp of the last document in this commit. - let commit_docstamp = self.docstamp; + let (document_receiver, segment_ready_receiver) = self.recreate_channels(); + drop(document_receiver); - let mut former_workers_join_handle = Vec::new(); - swap(&mut former_workers_join_handle, &mut self.workers_join_handle); - - for worker_handle in former_workers_join_handle { - try!(worker_handle - .join() - .map_err(|e| Error::ErrorInThread(format!("{:?}", e))) - ); - // add a new worker for the next generation. - try!(self.add_indexing_worker()); - } - - let segment_ids_and_size: Vec<(SegmentId, usize)> = try!( - segment_ready_receiver - .into_iter() - .collect() - ); + // Docstamp of the last document in this commit. + let commit_docstamp = self.docstamp; - let segment_ids: Vec = segment_ids_and_size - .iter() - .map(|&(segment_id, _num_docs)| segment_id) - .collect(); - - try!(self.index.publish_segments(&segment_ids, commit_docstamp)); + let mut former_workers_join_handle = Vec::new(); + swap(&mut former_workers_join_handle, + &mut self.workers_join_handle); - Ok(commit_docstamp) - } - + for worker_handle in former_workers_join_handle { + try!(worker_handle.join() + .map_err(|e| Error::ErrorInThread(format!("{:?}", e)))); + // add a new worker for the next generation. + try!(self.add_indexing_worker()); + } - /// Adds a document. - /// - /// If the indexing pipeline is full, this call may block. - /// - /// The docstamp is an increasing `u64` that can - /// be used by the client to align commits with its own - /// document queue. - /// - /// Currently it represents the number of documents that - /// have been added since the creation of the index. - pub fn add_document(&mut self, doc: Document) -> io::Result { - self.document_sender.send(doc); - self.docstamp += 1; - Ok(self.docstamp) - } - + let segment_ids_and_size: Vec<(SegmentId, usize)> = try!(segment_ready_receiver.into_iter() + .collect()); + let segment_ids: Vec = segment_ids_and_size.iter() + .map(|&(segment_id, _num_docs)| segment_id) + .collect(); + + try!(self.index.publish_segments(&segment_ids, commit_docstamp)); + + Ok(commit_docstamp) + } + + + /// Adds a document. + /// + /// If the indexing pipeline is full, this call may block. + /// + /// The docstamp is an increasing `u64` that can + /// be used by the client to align commits with its own + /// document queue. + /// + /// Currently it represents the number of documents that + /// have been added since the creation of the index. 
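+    ///
+    /// A sketch of aligning a client-side queue with commits,
+    /// assuming an `index_writer` and a stream of `doc` values:
+    ///
+    /// ```ignore
+    /// let docstamp = index_writer.add_document(doc).unwrap();
+    /// // After a successful commit, every document whose docstamp
+    /// // is lower than or equal to the returned value is persisted.
+    /// let committed_docstamp = index_writer.commit().unwrap();
+    /// assert!(committed_docstamp >= docstamp);
+    /// ```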
+    pub fn add_document(&mut self, doc: Document) -> io::Result<u64> {
+        self.document_sender.send(doc);
+        self.docstamp += 1;
+        Ok(self.docstamp)
+    }
 }


@@ -306,77 +306,74 @@ impl IndexWriter {
 #[cfg(test)]
 mod tests {

-    use schema::{self, Document};
-    use Index;
-    use Term;
+    use schema::{self, Document};
+    use Index;
+    use Term;
     use Error;
-    use directory::error::OpenWriteError;

     #[test]
     fn test_lockfile_stops_duplicates() {
-
-        let mut schema_builder = schema::SchemaBuilder::default();
-        let text_field = schema_builder.add_text_field("text", schema::TEXT);
-        let index = Index::create_in_ram(schema_builder.build());
-        let index_writer = index.writer(40_000_000).unwrap();
+
+        let schema_builder = schema::SchemaBuilder::default();
+        let index = Index::create_in_ram(schema_builder.build());
+        let index_writer = index.writer(40_000_000).unwrap();
         match index.writer(40_000_000) {
-            Err(Error::FileAlreadyExists(_)) => {},
-            _ => panic!("Expected FileAlreadyExists error")
+            Err(Error::FileAlreadyExists(_)) => {}
+            _ => panic!("Expected FileAlreadyExists error"),
         }
     }

     #[test]
     fn test_lockfile_released_on_drop() {
-        let mut schema_builder = schema::SchemaBuilder::default();
-        let text_field = schema_builder.add_text_field("text", schema::TEXT);
-        let index = Index::create_in_ram(schema_builder.build());
-        {
+        let schema_builder = schema::SchemaBuilder::default();
+        let index = Index::create_in_ram(schema_builder.build());
+        {
             let index_writer = index.writer(40_000_000).unwrap();
         }
-
+
         let index_writer_two = index.writer(40_000_000).unwrap();
     }

-    #[test]
-    fn test_commit_and_rollback() {
-        let mut schema_builder = schema::SchemaBuilder::default();
-        let text_field = schema_builder.add_text_field("text", schema::TEXT);
-        let index = Index::create_in_ram(schema_builder.build());
+    #[test]
+    fn test_commit_and_rollback() {
+        let mut schema_builder = schema::SchemaBuilder::default();
+        let text_field = schema_builder.add_text_field("text", schema::TEXT);
+        let index = Index::create_in_ram(schema_builder.build());

-        let num_docs_containing = |s: &str| {
-            let searcher = index.searcher();
-            let term_a = Term::from_field_text(text_field, s);
-            searcher.doc_freq(&term_a)
-        };
-
-        {
-            // writing the segment
-            let mut index_writer = index.writer_with_num_threads(3, 40_000_000).unwrap();
-            {
-                let mut doc = Document::default();
-                doc.add_text(text_field, "a");
-                index_writer.add_document(doc).unwrap();
-            }
-            assert_eq!(index_writer.rollback().unwrap(), 0u64);
-            assert_eq!(num_docs_containing("a"), 0);
+        let num_docs_containing = |s: &str| {
+            let searcher = index.searcher();
+            let term_a = Term::from_field_text(text_field, s);
+            searcher.doc_freq(&term_a)
+        };

-            {
-                let mut doc = Document::default();
-                doc.add_text(text_field, "b");
-                index_writer.add_document(doc).unwrap();
-            }
-            {
-                let mut doc = Document::default();
-                doc.add_text(text_field, "c");
-                index_writer.add_document(doc).unwrap();
-            }
-            assert_eq!(index_writer.commit().unwrap(), 2u64);
-
-            assert_eq!(num_docs_containing("a"), 0);
-            assert_eq!(num_docs_containing("b"), 1);
-            assert_eq!(num_docs_containing("c"), 1);
-        }
-        index.searcher();
-    }
+        {
+            // writing the segment
+            let mut index_writer = index.writer_with_num_threads(3, 40_000_000).unwrap();
+            {
+                let mut doc = Document::default();
+                doc.add_text(text_field, "a");
+                index_writer.add_document(doc).unwrap();
+            }
+            assert_eq!(index_writer.rollback().unwrap(), 0u64);
+            assert_eq!(num_docs_containing("a"), 0);
+
+            {
+                let mut doc = Document::default();
+                doc.add_text(text_field, "b");
+                index_writer.add_document(doc).unwrap();
+            }
+            {
+                let mut doc = Document::default();
+                doc.add_text(text_field, "c");
+                index_writer.add_document(doc).unwrap();
+            }
+            assert_eq!(index_writer.commit().unwrap(), 2u64);
+
+            assert_eq!(num_docs_containing("a"), 0);
+            assert_eq!(num_docs_containing("b"), 1);
+            assert_eq!(num_docs_containing("c"), 1);
+        }
+        index.searcher();
+    }
 }

From e0240609abab9e88e5584f75d2fc79ece8770388 Mon Sep 17 00:00:00 2001
From: "Michael J. Curry"
Date: Sun, 2 Oct 2016 22:08:15 -0400
Subject: [PATCH 15/19] =?UTF-8?q?=F0=9F=93=8E=20small=20change=20for=20cli?= =?UTF-8?q?ppy?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/indexer/index_writer.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs
index 1cf4fa9c8..05b4a67dd 100644
--- a/src/indexer/index_writer.rs
+++ b/src/indexer/index_writer.rs
@@ -82,8 +82,7 @@ impl Drop for IndexWriter {
     fn drop(&mut self) {
         let lockfile_path = Path::new(LOCKFILE_NAME);
         match self.index.directory_mut().delete(lockfile_path) {
-            Ok(_) => (),
-            Err(_) => (),
+            Ok(_) | Err(_) => ()
         }
     }
 }

From 964e56a19028d32977ebfff579f9c8c214abbb37 Mon Sep 17 00:00:00 2001
From: "Michael J. Curry"
Date: Mon, 3 Oct 2016 11:30:45 -0400
Subject: [PATCH 16/19] document errors and panics when creating IndexWriter
 [skip ci]

---
 src/core/index.rs           | 13 +++++++++++--
 src/indexer/index_writer.rs |  8 ++++++--
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/src/core/index.rs b/src/core/index.rs
index 86b620319..c6e83869a 100644
--- a/src/core/index.rs
+++ b/src/core/index.rs
@@ -139,14 +139,23 @@ impl Index {
     }

     /// Creates a multithreaded writer.
-    /// Each writer produces an independant segment.
+    /// Each writer produces an independent segment.
+    ///
+    /// # Errors
+    /// If the lockfile already exists, returns `Error::FileAlreadyExists`.
+    /// # Panics
+    /// If the heap size per thread is too small, panics.
     pub fn writer_with_num_threads(&self,
                                    num_threads: usize,
                                    heap_size_in_bytes: usize)
                                    -> Result<IndexWriter> {
         IndexWriter::open(self, num_threads, heap_size_in_bytes)
     }

     /// Creates a multithreaded writer
-    /// It just calls `writer_with_num_threads` with the number of core as `num_threads`
+    /// It just calls `writer_with_num_threads` with the number of cores as `num_threads`.
+    /// # Errors
+    /// If the lockfile already exists, returns `Error::FileAlreadyExists`.
+    /// # Panics
+    /// If the heap size per thread is too small, panics.
     pub fn writer(&self, heap_size_in_bytes: usize) -> Result<IndexWriter> {
         self.writer_with_num_threads(num_cpus::get(), heap_size_in_bytes)
     }

diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs
index 05b4a67dd..1da39ac21 100644
--- a/src/indexer/index_writer.rs
+++ b/src/indexer/index_writer.rs
@@ -121,10 +121,14 @@ impl IndexWriter {
         Ok(())
     }

-    /// Open a new index writer
+    /// Open a new index writer. Attempts to acquire a lockfile.
     ///
-    /// num_threads tells the number of indexing worker that
+    /// num_threads specifies the number of indexing workers that
     /// should work at the same time.
+    /// # Errors
+    /// If the lockfile already exists, returns `Error::FileAlreadyExists`.
+    /// # Panics
+    /// If the heap size per thread is too small, panics.
     pub fn open(index: &Index,
                 num_threads: usize,
                 heap_size_in_bytes_per_thread: usize)
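
The error contract documented in this patch can be exercised directly. Below is a minimal sketch, not part of any patch, that mirrors the crate's own lockfile test; the 40 MB heap figure is an arbitrary choice:

    extern crate tantivy;

    use tantivy::{Error, Index};
    use tantivy::schema::SchemaBuilder;

    fn main() {
        let index = Index::create_in_ram(SchemaBuilder::default().build());

        // The first writer acquires the lockfile.
        let _writer = index.writer(40_000_000).unwrap();

        // A second writer on the same index must fail while the lock is held.
        match index.writer(40_000_000) {
            Err(Error::FileAlreadyExists(_)) => {
                // Another `IndexWriter` holds the lock. Wait for it to be
                // dropped, or, per the next patch, delete a stale lockfile
                // manually if no other writer can possibly be running.
            }
            Ok(_) => panic!("expected the lockfile to block a second writer"),
            Err(_) => panic!("unexpected error"),
        }
    }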
Curry" Date: Mon, 3 Oct 2016 11:40:53 -0400 Subject: [PATCH 17/19] clarify that it is safe to delete the lockfile manually if needed [skip ci] --- src/indexer/index_writer.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 1da39ac21..ba5223f1b 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -123,6 +123,12 @@ impl IndexWriter { /// Open a new index writer. Attempts to acquire a lockfile. /// + /// The lockfile should be deleted on drop, but it is possible + /// that due to a panic or other error, a stale lockfile will be + /// left in the index directory. If you are sure that no other + /// `IndexWriter` on the system is accessing the index directory, + /// it is safe to manually delete the lockfile. + /// /// num_threads specifies the number of indexing workers that /// should work at the same time. /// # Errors From 1435327ad1217b4a7c9406fb33fb7b4c81a297c3 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 6 Oct 2016 09:56:26 +0900 Subject: [PATCH 18/19] NOBUG Changed tantivy logo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f62295e2b..d9138a07b 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -![Tantivy](http://fulmicoton.com/tantivy_500.png#h) +![Tantivy](http://fulmicoton.com/tantivy-logo/tantivy-logo.png) [![Build Status](https://travis-ci.org/fulmicoton/tantivy.svg?branch=master)](https://travis-ci.org/fulmicoton/tantivy) [![Coverage Status](https://coveralls.io/repos/github/fulmicoton/tantivy/badge.svg?branch=master)](https://coveralls.io/github/fulmicoton/tantivy?branch=master) From 69d1bbb70b7db29f5d0bbaa972476da9fd000d64 Mon Sep 17 00:00:00 2001 From: White-Oak Date: Wed, 12 Oct 2016 11:44:30 +0300 Subject: [PATCH 19/19] Added a macro to define document simply Added a test to cover that. Added `impl From<&str> for Value` for dco macro usage. --- src/lib.rs | 26 ++++++++++++++++++++++++++ src/schema/value.rs | 7 ++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 15378e025..77bfa6f06 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -51,6 +51,16 @@ mod macros { macro_rules! get( ($e:expr) => (match $e { Some(e) => e, None => return None }) ); + + macro_rules! 
+        ($($field:ident => $value:expr),*) => {{
+            let mut document = Document::default();
+            $(
+                document.add(FieldValue::new($field, $value.into()));
+            )*
+            document
+        }};
+    );
 }

 mod core;
@@ -399,4 +409,20 @@ mod tests {
         }
         index.searcher();
     }
+
+    #[test]
+    fn test_doc_macro() {
+        let mut schema_builder = SchemaBuilder::default();
+        let text_field = schema_builder.add_text_field("text", TEXT);
+        let other_text_field = schema_builder.add_text_field("text2", TEXT);
+        let document = doc!(text_field => "tantivy", text_field => "some other value", other_text_field => "short");
+        assert_eq!(document.len(), 3);
+        let values = document.get_all(text_field);
+        assert_eq!(values.len(), 2);
+        assert_eq!(values[0].text(), "tantivy");
+        assert_eq!(values[1].text(), "some other value");
+        let values = document.get_all(other_text_field);
+        assert_eq!(values.len(), 1);
+        assert_eq!(values[0].text(), "short");
+    }
 }

diff --git a/src/schema/value.rs b/src/schema/value.rs
index 36f21bc86..084b8d373 100644
--- a/src/schema/value.rs
+++ b/src/schema/value.rs
@@ -59,6 +59,11 @@ impl From for Value {
     }
 }

+impl<'a> From<&'a str> for Value {
+    fn from(s: &'a str) -> Value {
+        Value::Str(s.to_string())
+    }
+}

 const TEXT_CODE: u8 = 0;
 const U32_CODE: u8 = 1;

@@ -95,4 +100,4 @@ impl BinarySerializable for Value {
             }
         }
     }
-}
\ No newline at end of file
+}
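
The `doc!` macro is easiest to understand next to the code it expands to. Below is a minimal sketch, not part of any patch; it assumes it runs inside the crate, since the macro lives in the private `macros` module and is not `#[macro_export]`ed here:

    use schema::{Document, FieldValue, SchemaBuilder, TEXT};

    fn doc_macro_sketch() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", TEXT);

        // With the macro: one expression builds the whole document.
        let doc_a = doc!(title => "The Old Man and the Sea");

        // Roughly what that invocation expands to. `.into()` turns the
        // `&str` into a `Value::Str` through the new `From<&'a str>` impl.
        let mut doc_b = Document::default();
        doc_b.add(FieldValue::new(title, "The Old Man and the Sea".into()));

        assert_eq!(doc_a.get_all(title)[0].text(),
                   doc_b.get_all(title)[0].text());
    }

The `$value.into()` call in the macro body is exactly why the `From<&'a str>` impl is added in the same patch: it lets callers pass bare string literals instead of constructing `Value::Str` by hand.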