From 9cd7458978303c63cb400c6b1dcb5c9e08a5af60 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 11 May 2017 21:12:59 +0900 Subject: [PATCH 01/51] NOBUG Hiding methods making it possible to build a incorrect Term. --- src/schema/term.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/schema/term.rs b/src/schema/term.rs index 77914d231..7abbbd9dc 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -18,7 +18,7 @@ pub struct Term(Vec); impl Term { /// Set the content of the term. - pub fn set_content(&mut self, content: &[u8]) { + pub(crate) fn set_content(&mut self, content: &[u8]) { assert!(content.len() >= 4); self.0.resize(content.len(), 0u8); (&mut self.0[..]).clone_from_slice(content); @@ -114,7 +114,7 @@ impl Term { /// /// If you want to build a field for a given `str`, /// you want to use `from_field_text`. - pub fn from_bytes(data: &[u8]) -> Term { + pub(crate) fn from_bytes(data: &[u8]) -> Term { Term(Vec::from(data)) } From 69832bfd03b91852ca6086741873b6a8c3a6c5af Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 12 May 2017 14:35:50 +0900 Subject: [PATCH 02/51] NOBUG Disabling running examples in CI as it is not working. 
--- .travis.yml | 1 - appveyor.yml | 1 - 2 files changed, 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index cbfbc222b..0c7ec3d43 100644 --- a/.travis.yml +++ b/.travis.yml @@ -28,7 +28,6 @@ script: travis-cargo test && travis-cargo bench && travis-cargo doc - - cargo run --example simple_search after_success: - bash ./script/build-doc.sh - travis-cargo doc-upload diff --git a/appveyor.yml b/appveyor.yml index 4e016911e..ac6917d6b 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -22,4 +22,3 @@ build: false test_script: - REM SET RUST_LOG=tantivy,test & cargo test --verbose - - REM SET RUST_LOG=tantivy,test & cargo run --example simple_search \ No newline at end of file From 4ff7dc7a4ff18dce402f0525872eca382fb101e2 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 13 May 2017 18:44:50 +0900 Subject: [PATCH 03/51] Closes #147 --- appveyor.yml | 1 + src/directory/managed_directory.rs | 44 ++++++++++++------------------ 2 files changed, 19 insertions(+), 26 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index ac6917d6b..ef7845696 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -22,3 +22,4 @@ build: false test_script: - REM SET RUST_LOG=tantivy,test & cargo test --verbose + - cargo run --example simple_search diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index 18631e3a7..24c1c9379 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -7,6 +7,7 @@ use std::io; use Directory; use std::sync::{Arc, RwLock}; use std::collections::HashSet; +use std::sync::RwLockWriteGuard; use std::io::Write; use core::MANAGED_FILEPATH; use std::collections::HashMap; @@ -67,6 +68,16 @@ impl Drop for FileProtection { } } +/// Saves the file containing the list of existing files +/// that were created by tantivy. 
+fn save_managed_paths(directory: &mut Directory, wlock: &RwLockWriteGuard) -> io::Result<()> { + let managed_paths = wlock.managed_paths.clone(); + let mut w = serde_json::to_vec(&managed_paths)?; + write!(&mut w, "\n")?; + directory.atomic_write(&MANAGED_FILEPATH, &w[..])?; + Ok(()) +} + impl ManagedDirectory { /// Wraps a directory as managed directory. @@ -154,16 +165,16 @@ impl ManagedDirectory { if !deleted_files.is_empty() { // update the list of managed files by removing // the file that were removed. + let mut meta_informations_wlock = self.meta_informations + .write() + .expect("Managed directory wlock poisoned (2)."); { - let mut meta_informations_wlock = self.meta_informations - .write() - .expect("Managed directory wlock poisoned (2)."); let managed_paths_write = &mut meta_informations_wlock.managed_paths; for delete_file in &deleted_files { managed_paths_write.remove(delete_file); } } - if let Err(_) = self.save_managed_paths() { + if let Err(_) = save_managed_paths(self.directory.as_mut(), &meta_informations_wlock) { error!("Failed to save the list of managed files."); } } @@ -193,23 +204,6 @@ impl ManagedDirectory { } } - - /// Saves the file containing the list of existing files - /// that were created by tantivy. - fn save_managed_paths(&mut self,) -> io::Result<()> { - let managed_paths; - { - let meta_informations_rlock = self.meta_informations - .read() - .expect("Managed file lock poisoned"); - managed_paths = meta_informations_rlock.managed_paths.clone(); - } - let mut w = try!(serde_json::to_vec(&managed_paths)); - try!(write!(&mut w, "\n")); - self.directory.atomic_write(&MANAGED_FILEPATH, &w[..])?; - Ok(()) - } - /// Registers a file as managed /// /// This method must be called before the file is @@ -218,14 +212,12 @@ impl ManagedDirectory { /// will not lead to garbage files that will /// never get removed. 
fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> { - let has_changed = { - let mut meta_wlock = self.meta_informations + let mut meta_wlock = self.meta_informations .write() .expect("Managed file lock poisoned"); - meta_wlock.managed_paths.insert(filepath.to_owned()) - }; + let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned()); if has_changed { - self.save_managed_paths()?; + save_managed_paths(self.directory.as_mut(), &meta_wlock)?; } Ok(()) } From 695c8828b8db20f707c93afcbd07c892aaea8373 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 13 May 2017 18:51:38 +0900 Subject: [PATCH 04/51] Display backtrace --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index ef7845696..2d5a4458b 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -22,4 +22,4 @@ build: false test_script: - REM SET RUST_LOG=tantivy,test & cargo test --verbose - - cargo run --example simple_search + - REM SET RUST_BACKTRACE=1 & cargo run --example simple_search From ee0873dd072554a91c23b31adf4d6ee1a9530f6f Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Sat, 13 May 2017 16:11:07 +0300 Subject: [PATCH 05/51] Avoid clone() call --- src/directory/managed_directory.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index 24c1c9379..dbdcc1039 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -71,8 +71,7 @@ impl Drop for FileProtection { /// Saves the file containing the list of existing files /// that were created by tantivy. 
fn save_managed_paths(directory: &mut Directory, wlock: &RwLockWriteGuard) -> io::Result<()> { - let managed_paths = wlock.managed_paths.clone(); - let mut w = serde_json::to_vec(&managed_paths)?; + let mut w = serde_json::to_vec(&wlock.managed_paths)?; write!(&mut w, "\n")?; directory.atomic_write(&MANAGED_FILEPATH, &w[..])?; Ok(()) From 7a5df33c85856a9727b3cd6c11aa7c13aa78f870 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 14 May 2017 19:50:40 +0900 Subject: [PATCH 06/51] issue/148 Wrapping MsQueue to drop all of its concent on Drop --- src/core/pool.rs | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/src/core/pool.rs b/src/core/pool.rs index 33d04dbeb..eca549139 100644 --- a/src/core/pool.rs +++ b/src/core/pool.rs @@ -10,8 +10,39 @@ pub struct GenerationItem { item: T, } + +// See https://github.com/crossbeam-rs/crossbeam/issues/91 +struct NonLeakingMsQueue { + underlying_queue: MsQueue +} + +impl Default for NonLeakingMsQueue { + fn default() -> NonLeakingMsQueue { + NonLeakingMsQueue { + underlying_queue: MsQueue::new(), + } + } +} + +impl NonLeakingMsQueue { + + fn pop(&self,) -> T { + self.underlying_queue.pop() + } + + fn push(&self, el: T) { + self.underlying_queue.push(el); + } +} + +impl Drop for NonLeakingMsQueue { + fn drop(&mut self) { + while let Some(_popped_item_to_be_dropped) = self.underlying_queue.try_pop() {} + } +} + pub struct Pool { - queue: Arc>>, + queue: Arc>>, freshest_generation: AtomicUsize, next_generation: AtomicUsize, } @@ -20,7 +51,7 @@ impl Pool { pub fn new() -> Pool { Pool { - queue: Arc::new(MsQueue::new()), + queue: Arc::default(), freshest_generation: AtomicUsize::default(), next_generation: AtomicUsize::default(), } @@ -57,7 +88,7 @@ impl Pool { self.freshest_generation.load(Ordering::Acquire) } - pub fn acquire(&self,) -> LeasedItem { + pub fn acquire(&self) -> LeasedItem { let generation = self.generation(); loop { let gen_item = self.queue.pop(); @@ -80,7 
+111,7 @@ impl Pool { pub struct LeasedItem { gen_item: Option>, - recycle_queue: Arc>>, + recycle_queue: Arc>>, } impl Deref for LeasedItem { From e04f2f0b08a9f010045bee338d1918d8e9653f80 Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Sun, 14 May 2017 16:24:17 +0300 Subject: [PATCH 07/51] issue/148 Wait for the index writer threads to shut down in simple_search --- examples/simple_search.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples/simple_search.rs b/examples/simple_search.rs index 821462afd..cf85b9b6a 100644 --- a/examples/simple_search.rs +++ b/examples/simple_search.rs @@ -204,5 +204,11 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { println!("{}", schema.to_json(&retrieved_doc)); } + // Wait for indexing and merging threads to shut down. + // Usually this isn't needed, but in `main` we try to + // delete the temporary directory and that fails on + // Windows if the files are still open. + index_writer.wait_merging_threads(); + Ok(()) } From ab66ffed4e6135c6e36cca0f086182b368b06874 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 13 May 2017 18:44:50 +0900 Subject: [PATCH 08/51] Closes #147 --- appveyor.yml | 1 + src/directory/managed_directory.rs | 44 ++++++++++++------------------ 2 files changed, 19 insertions(+), 26 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index ac6917d6b..ef7845696 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -22,3 +22,4 @@ build: false test_script: - REM SET RUST_LOG=tantivy,test & cargo test --verbose + - cargo run --example simple_search diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index 18631e3a7..24c1c9379 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -7,6 +7,7 @@ use std::io; use Directory; use std::sync::{Arc, RwLock}; use std::collections::HashSet; +use std::sync::RwLockWriteGuard; use std::io::Write; use core::MANAGED_FILEPATH; use std::collections::HashMap; @@ -67,6 +68,16 @@ impl 
Drop for FileProtection { } } +/// Saves the file containing the list of existing files +/// that were created by tantivy. +fn save_managed_paths(directory: &mut Directory, wlock: &RwLockWriteGuard) -> io::Result<()> { + let managed_paths = wlock.managed_paths.clone(); + let mut w = serde_json::to_vec(&managed_paths)?; + write!(&mut w, "\n")?; + directory.atomic_write(&MANAGED_FILEPATH, &w[..])?; + Ok(()) +} + impl ManagedDirectory { /// Wraps a directory as managed directory. @@ -154,16 +165,16 @@ impl ManagedDirectory { if !deleted_files.is_empty() { // update the list of managed files by removing // the file that were removed. + let mut meta_informations_wlock = self.meta_informations + .write() + .expect("Managed directory wlock poisoned (2)."); { - let mut meta_informations_wlock = self.meta_informations - .write() - .expect("Managed directory wlock poisoned (2)."); let managed_paths_write = &mut meta_informations_wlock.managed_paths; for delete_file in &deleted_files { managed_paths_write.remove(delete_file); } } - if let Err(_) = self.save_managed_paths() { + if let Err(_) = save_managed_paths(self.directory.as_mut(), &meta_informations_wlock) { error!("Failed to save the list of managed files."); } } @@ -193,23 +204,6 @@ impl ManagedDirectory { } } - - /// Saves the file containing the list of existing files - /// that were created by tantivy. - fn save_managed_paths(&mut self,) -> io::Result<()> { - let managed_paths; - { - let meta_informations_rlock = self.meta_informations - .read() - .expect("Managed file lock poisoned"); - managed_paths = meta_informations_rlock.managed_paths.clone(); - } - let mut w = try!(serde_json::to_vec(&managed_paths)); - try!(write!(&mut w, "\n")); - self.directory.atomic_write(&MANAGED_FILEPATH, &w[..])?; - Ok(()) - } - /// Registers a file as managed /// /// This method must be called before the file is @@ -218,14 +212,12 @@ impl ManagedDirectory { /// will not lead to garbage files that will /// never get removed. 
fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> { - let has_changed = { - let mut meta_wlock = self.meta_informations + let mut meta_wlock = self.meta_informations .write() .expect("Managed file lock poisoned"); - meta_wlock.managed_paths.insert(filepath.to_owned()) - }; + let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned()); if has_changed { - self.save_managed_paths()?; + save_managed_paths(self.directory.as_mut(), &meta_wlock)?; } Ok(()) } From db56167a5dd5ef8323e0f73d20770772a3a0a33d Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 13 May 2017 18:51:38 +0900 Subject: [PATCH 09/51] Display backtrace --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index ef7845696..2d5a4458b 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -22,4 +22,4 @@ build: false test_script: - REM SET RUST_LOG=tantivy,test & cargo test --verbose - - cargo run --example simple_search + - REM SET RUST_BACKTRACE=1 & cargo run --example simple_search From 8a352593005b8fdb6164d0119be22290e52b1eaa Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Sat, 13 May 2017 16:11:07 +0300 Subject: [PATCH 10/51] Avoid clone() call --- src/directory/managed_directory.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index 24c1c9379..dbdcc1039 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -71,8 +71,7 @@ impl Drop for FileProtection { /// Saves the file containing the list of existing files /// that were created by tantivy. 
fn save_managed_paths(directory: &mut Directory, wlock: &RwLockWriteGuard) -> io::Result<()> { - let managed_paths = wlock.managed_paths.clone(); - let mut w = serde_json::to_vec(&managed_paths)?; + let mut w = serde_json::to_vec(&wlock.managed_paths)?; write!(&mut w, "\n")?; directory.atomic_write(&MANAGED_FILEPATH, &w[..])?; Ok(()) From 8cd5a2d81d207ddf1c5fce2225f5232642add663 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 15 May 2017 00:25:49 +0900 Subject: [PATCH 11/51] Fixed logging deleted files twice --- src/directory/managed_directory.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index dbdcc1039..e22653fa7 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -140,13 +140,14 @@ impl ManagedDirectory { deleted_files.push(file_to_delete); } Err(file_error) => { - error!("Failed to delete {:?}", file_to_delete); match file_error { DeleteError::FileDoesNotExist(_) => { deleted_files.push(file_to_delete); } DeleteError::IOError(_) => { if !cfg!(target_os = "windows") { + // On windows, delete is expected to fail if the file + // is mmapped. error!("Failed to delete {:?}", file_to_delete); } } From b2beac1203039085ccfec043aeaa6d49a4813d8d Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Sun, 14 May 2017 20:53:05 +0300 Subject: [PATCH 12/51] Check the result of wait_merging_threads --- examples/simple_search.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/simple_search.rs b/examples/simple_search.rs index cf85b9b6a..f62cf3e47 100644 --- a/examples/simple_search.rs +++ b/examples/simple_search.rs @@ -208,7 +208,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { // Usually this isn't needed, but in `main` we try to // delete the temporary directory and that fails on // Windows if the files are still open. 
- index_writer.wait_merging_threads(); + index_writer.wait_merging_threads()?; Ok(()) } From 6f89a86b14cc84fc34d035a02ea28ed77e7cba47 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 15 May 2017 12:10:23 +0900 Subject: [PATCH 13/51] Added simple search in travis CI --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 0c7ec3d43..cbfbc222b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -28,6 +28,7 @@ script: travis-cargo test && travis-cargo bench && travis-cargo doc + - cargo run --example simple_search after_success: - bash ./script/build-doc.sh - travis-cargo doc-upload From a23b7a181541f00d0a8b437af8617951526d23a6 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 15 May 2017 19:09:52 +0900 Subject: [PATCH 14/51] Test the size of complete 0..128 block --- src/compression/pack/compression_pack_simd.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/compression/pack/compression_pack_simd.rs b/src/compression/pack/compression_pack_simd.rs index dcdcf7065..4351e7c16 100644 --- a/src/compression/pack/compression_pack_simd.rs +++ b/src/compression/pack/compression_pack_simd.rs @@ -111,3 +111,16 @@ impl BlockDecoder { } +#[cfg(test)] +mod tests { + + use super::BlockEncoder; + + #[test] + fn test_all_docs_compression_len() { + let data: Vec = (0u32..128u32).collect(); + let mut encoder = BlockEncoder::new(); + let compressed = encoder.compress_block_sorted(&data, 0u32); + assert_eq!(compressed.len(), 17); + } +} \ No newline at end of file From 4c8f9742f882dd1366f4130ac11a5ce4a786180f Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 15 May 2017 22:30:18 +0900 Subject: [PATCH 15/51] format --- src/analyzer/mod.rs | 20 +- src/collector/chained_collector.rs | 19 +- src/collector/count_collector.rs | 20 +- src/collector/mod.rs | 49 ++- src/collector/multi_collector.rs | 14 +- src/collector/top_collector.rs | 60 ++- src/common/bitpacker.rs | 81 ++-- src/common/mod.rs | 14 +- 
src/common/serialize.rs | 26 +- src/common/timer.rs | 30 +- src/common/vint.rs | 12 +- src/compression/composite.rs | 85 ++-- src/compression/mod.rs | 95 ++--- .../pack/compression_pack_nosimd.rs | 54 ++- src/compression/pack/compression_pack_simd.rs | 56 ++- .../vint/compression_vint_nosimd.rs | 18 +- src/compression/vint/compression_vint_simd.rs | 82 ++-- src/core/index.rs | 19 +- src/core/index_meta.rs | 6 +- src/core/mod.rs | 2 +- src/core/pool.rs | 56 +-- src/core/searcher.rs | 27 +- src/core/segment.rs | 29 +- src/core/segment_component.rs | 24 +- src/core/segment_id.rs | 8 +- src/core/segment_meta.rs | 43 +- src/core/segment_reader.rs | 134 +++--- src/core/term_iterator.rs | 33 +- src/datastruct/fstmap.rs | 52 +-- src/datastruct/skip/mod.rs | 4 +- src/datastruct/skip/skiplist.rs | 55 ++- src/datastruct/skip/skiplist_builder.rs | 21 +- src/datastruct/stacker/expull.rs | 25 +- src/datastruct/stacker/hashmap.rs | 70 ++- src/datastruct/stacker/heap.rs | 107 ++--- src/datastruct/stacker/mod.rs | 10 +- src/directory/directory.rs | 41 +- src/directory/error.rs | 12 +- src/directory/managed_directory.rs | 122 +++--- src/directory/mmap_directory.rs | 203 ++++----- src/directory/mod.rs | 10 +- src/directory/ram_directory.rs | 49 +-- src/directory/read_only_source.rs | 30 +- src/directory/shared_vec_slice.rs | 9 +- src/error.rs | 20 +- src/fastfield/delete.rs | 16 +- src/fastfield/error.rs | 11 +- src/fastfield/mod.rs | 123 +++--- src/fastfield/reader.rs | 79 ++-- src/fastfield/serializer.rs | 42 +- src/fastfield/writer.rs | 72 ++-- src/functional_test.rs | 11 +- src/indexer/delete_queue.rs | 108 ++--- src/indexer/directory_lock.rs | 2 +- src/indexer/doc_opstamp_mapping.rs | 20 +- src/indexer/index_writer.rs | 260 ++++++------ src/indexer/log_merge_policy.rs | 22 +- src/indexer/merge_policy.rs | 27 +- src/indexer/merger.rs | 401 ++++++++++-------- src/indexer/segment_entry.rs | 30 +- src/indexer/segment_manager.rs | 134 +++--- src/indexer/segment_register.rs | 99 +++-- 
src/indexer/segment_serializer.rs | 10 +- src/indexer/segment_updater.rs | 250 +++++------ src/indexer/segment_writer.rs | 387 +++++++++-------- src/indexer/stamper.rs | 5 +- src/lib.rs | 193 +++++---- src/postings/freq_handler.rs | 2 +- src/postings/mod.rs | 79 ++-- src/postings/postings_writer.rs | 8 +- src/postings/segment_postings.rs | 18 +- src/postings/serializer.rs | 47 +- src/postings/term_info.rs | 23 +- src/query/boolean_query/boolean_query.rs | 21 +- src/query/boolean_query/boolean_scorer.rs | 50 +-- src/query/boolean_query/boolean_weight.rs | 6 +- src/query/boolean_query/mod.rs | 60 +-- src/query/boolean_query/score_combiner.rs | 15 +- src/query/mod.rs | 2 +- src/query/occur_filter.rs | 17 +- src/query/phrase_query/mod.rs | 35 +- src/query/phrase_query/phrase_query.rs | 15 +- src/query/phrase_query/phrase_scorer.rs | 24 +- src/query/phrase_query/phrase_weight.rs | 16 +- src/query/query.rs | 36 +- src/query/query_parser/logical_ast.rs | 19 +- src/query/query_parser/mod.rs | 2 +- src/query/query_parser/query_grammar.rs | 44 +- src/query/query_parser/query_parser.rs | 120 +++--- src/query/query_parser/user_input_ast.rs | 32 +- src/query/scorer.rs | 25 +- src/query/term_query/mod.rs | 15 +- src/query/term_query/term_query.rs | 7 +- src/query/term_query/term_scorer.rs | 31 +- src/query/term_query/term_weight.rs | 39 +- src/query/weight.rs | 2 - src/schema/document.rs | 49 +-- src/schema/field.rs | 3 +- src/schema/field_entry.rs | 93 ++-- src/schema/field_type.rs | 52 ++- src/schema/field_value.rs | 11 +- src/schema/int_options.rs | 36 +- src/schema/mod.rs | 5 +- src/schema/named_field_document.rs | 4 +- src/schema/schema.rs | 167 ++++---- src/schema/term.rs | 45 +- src/schema/text_options.rs | 61 ++- src/schema/value.rs | 54 +-- src/store/mod.rs | 22 +- src/store/reader.rs | 6 +- src/store/writer.rs | 3 +- 111 files changed, 2791 insertions(+), 2888 deletions(-) diff --git a/src/analyzer/mod.rs b/src/analyzer/mod.rs index cf29d8187..cfc20218d 100644 --- 
a/src/analyzer/mod.rs +++ b/src/analyzer/mod.rs @@ -18,11 +18,10 @@ pub trait StreamingIterator<'a, T> { impl<'a, 'b> TokenIter<'b> { fn consume_token(&'a mut self) -> Option<&'a str> { - for c in &mut self.chars { + for c in &mut self.chars { if c.is_alphanumeric() { append_char_lowercase(c, &mut self.term_buffer); - } - else { + } else { break; } } @@ -32,9 +31,8 @@ impl<'a, 'b> TokenIter<'b> { impl<'a, 'b> StreamingIterator<'a, &'a str> for TokenIter<'b> { - #[inline] - fn next(&'a mut self,) -> Option<&'a str> { + fn next(&'a mut self) -> Option<&'a str> { self.term_buffer.clear(); // skipping non-letter characters. loop { @@ -45,24 +43,24 @@ impl<'a, 'b> StreamingIterator<'a, &'a str> for TokenIter<'b> { return self.consume_token(); } } - None => { return None; } + None => { + return None; + } } } } - } pub struct SimpleTokenizer; impl SimpleTokenizer { - pub fn tokenize<'a>(&self, text: &'a str) -> TokenIter<'a> { TokenIter { - term_buffer: String::new(), - chars: text.chars(), + term_buffer: String::new(), + chars: text.chars(), } - } + } } diff --git a/src/collector/chained_collector.rs b/src/collector/chained_collector.rs index 524ffec58..6cc5785b4 100644 --- a/src/collector/chained_collector.rs +++ b/src/collector/chained_collector.rs @@ -7,8 +7,8 @@ use Score; /// Collector that does nothing. -/// This is used in the chain Collector and will hopefully -/// be optimized away by the compiler. +/// This is used in the chain Collector and will hopefully +/// be optimized away by the compiler. pub struct DoNothingCollector; impl Collector for DoNothingCollector { #[inline] @@ -24,10 +24,10 @@ impl Collector for DoNothingCollector { /// are known at compile time. 
pub struct ChainedCollector { left: Left, - right: Right + right: Right, } -impl ChainedCollector { +impl ChainedCollector { /// Adds a collector pub fn push(self, new_collector: &mut C) -> ChainedCollector { ChainedCollector { @@ -38,7 +38,10 @@ impl ChainedCollector { } impl Collector for ChainedCollector { - fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()> { + fn set_segment(&mut self, + segment_local_id: SegmentLocalId, + segment: &SegmentReader) + -> Result<()> { try!(self.left.set_segment(segment_local_id, segment)); try!(self.right.set_segment(segment_local_id, segment)); Ok(()) @@ -70,9 +73,7 @@ mod tests { let mut top_collector = TopCollector::with_limit(2); let mut count_collector = CountCollector::default(); { - let mut collectors = chain() - .push(&mut top_collector) - .push(&mut count_collector); + let mut collectors = chain().push(&mut top_collector).push(&mut count_collector); collectors.collect(1, 0.2); collectors.collect(2, 0.1); collectors.collect(3, 0.5); @@ -80,4 +81,4 @@ mod tests { assert_eq!(count_collector.count(), 3); assert!(top_collector.at_capacity()); } -} \ No newline at end of file +} diff --git a/src/collector/count_collector.rs b/src/collector/count_collector.rs index ff15abd73..bfb17eb3c 100644 --- a/src/collector/count_collector.rs +++ b/src/collector/count_collector.rs @@ -6,7 +6,7 @@ use SegmentReader; use SegmentLocalId; /// `CountCollector` collector only counts how many -/// documents match the query. +/// documents match the query. pub struct CountCollector { count: usize, } @@ -14,20 +14,18 @@ pub struct CountCollector { impl CountCollector { /// Returns the count of documents that were /// collected. 
- pub fn count(&self,) -> usize { + pub fn count(&self) -> usize { self.count } } impl Default for CountCollector { fn default() -> CountCollector { - CountCollector {count: 0, - } + CountCollector { count: 0 } } } impl Collector for CountCollector { - fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> { Ok(()) } @@ -47,11 +45,11 @@ mod tests { #[bench] fn build_collector(b: &mut Bencher) { b.iter(|| { - let mut count_collector = CountCollector::default(); - for doc in 0..1_000_000 { - count_collector.collect(doc, 1f32); - } - count_collector.count() - }); + let mut count_collector = CountCollector::default(); + for doc in 0..1_000_000 { + count_collector.collect(doc, 1f32); + } + count_collector.count() + }); } } diff --git a/src/collector/mod.rs b/src/collector/mod.rs index c3964c596..72d5797ff 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -16,11 +16,11 @@ pub use self::top_collector::TopCollector; mod chained_collector; pub use self::chained_collector::chain; -/// Collectors are in charge of collecting and retaining relevant +/// Collectors are in charge of collecting and retaining relevant /// information from the document found and scored by the query. /// /// -/// For instance, +/// For instance, /// /// - keeping track of the top 10 best documents /// - computing a breakdown over a fast field @@ -29,7 +29,7 @@ pub use self::chained_collector::chain; /// Queries are in charge of pushing the `DocSet` to the collector. /// /// As they work on multiple segments, they first inform -/// the collector of a change in a segment and then +/// the collector of a change in a segment and then /// call the `collect` method to push the document to the collector. /// /// Temporally, our collector will receive calls @@ -46,16 +46,22 @@ pub use self::chained_collector::chain; /// /// Segments are not guaranteed to be visited in any specific order. 
pub trait Collector { - /// `set_segment` is called before beginning to enumerate + /// `set_segment` is called before beginning to enumerate /// on this segment. - fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()>; + fn set_segment(&mut self, + segment_local_id: SegmentLocalId, + segment: &SegmentReader) + -> Result<()>; /// The query pushes the scored document to the collector via this method. fn collect(&mut self, doc: DocId, score: Score); } impl<'a, C: Collector> Collector for &'a mut C { - fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()> { + fn set_segment(&mut self, + segment_local_id: SegmentLocalId, + segment: &SegmentReader) + -> Result<()> { (*self).set_segment(segment_local_id, segment) } /// The query pushes the scored document to the collector via this method. @@ -77,7 +83,7 @@ pub mod tests { use fastfield::U64FastFieldReader; use fastfield::FastFieldReader; use schema::Field; - + /// Stores all of the doc ids. /// This collector is only used for tests. /// It is unusable in practise, as it does not store @@ -90,7 +96,7 @@ pub mod tests { impl TestCollector { /// Return the exhalist of documents. 
- pub fn docs(self,) -> Vec { + pub fn docs(self) -> Vec { self.docs } } @@ -106,7 +112,6 @@ pub mod tests { } impl Collector for TestCollector { - fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> { self.offset += self.segment_max_doc; self.segment_max_doc = reader.max_doc(); @@ -117,10 +122,10 @@ pub mod tests { self.docs.push(doc + self.offset); } } - - - - + + + + /// Collects in order all of the fast fields for all of the /// doc in the `DocSet` /// @@ -140,11 +145,11 @@ pub mod tests { } } - pub fn vals(self,) -> Vec { + pub fn vals(self) -> Vec { self.vals } } - + impl Collector for FastFieldTestCollector { fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> { self.ff_reader = Some(reader.get_fast_field_reader(self.field)?); @@ -161,12 +166,12 @@ pub mod tests { #[bench] fn build_collector(b: &mut Bencher) { b.iter(|| { - let mut count_collector = CountCollector::default(); - let docs: Vec = (0..1_000_000).collect(); - for doc in docs { - count_collector.collect(doc, 1f32); - } - count_collector.count() - }); + let mut count_collector = CountCollector::default(); + let docs: Vec = (0..1_000_000).collect(); + for doc in docs { + count_collector.collect(doc, 1f32); + } + count_collector.count() + }); } } diff --git a/src/collector/multi_collector.rs b/src/collector/multi_collector.rs index e5eddc7f4..c2515782d 100644 --- a/src/collector/multi_collector.rs +++ b/src/collector/multi_collector.rs @@ -7,7 +7,7 @@ use SegmentLocalId; /// Multicollector makes it possible to collect on more than one collector. -/// It should only be used for use cases where the Collector types is unknown +/// It should only be used for use cases where the Collector types is unknown /// at compile time. /// If the type of the collectors is known, you should prefer to use `ChainedCollector`. 
pub struct MultiCollector<'a> { @@ -17,15 +17,16 @@ pub struct MultiCollector<'a> { impl<'a> MultiCollector<'a> { /// Constructor pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector { - MultiCollector { - collectors: collectors, - } + MultiCollector { collectors: collectors } } } impl<'a> Collector for MultiCollector<'a> { - fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()> { + fn set_segment(&mut self, + segment_local_id: SegmentLocalId, + segment: &SegmentReader) + -> Result<()> { for collector in &mut self.collectors { try!(collector.set_segment(segment_local_id, segment)); } @@ -52,7 +53,8 @@ mod tests { let mut top_collector = TopCollector::with_limit(2); let mut count_collector = CountCollector::default(); { - let mut collectors = MultiCollector::from(vec!(&mut top_collector, &mut count_collector)); + let mut collectors = MultiCollector::from(vec![&mut top_collector, + &mut count_collector]); collectors.collect(1, 0.2); collectors.collect(2, 0.1); collectors.collect(3, 0.5); diff --git a/src/collector/top_collector.rs b/src/collector/top_collector.rs index 6425eb300..a02141fca 100644 --- a/src/collector/top_collector.rs +++ b/src/collector/top_collector.rs @@ -12,8 +12,7 @@ use Score; #[derive(Clone, Copy)] struct GlobalScoredDoc { score: Score, - doc_address: DocAddress - + doc_address: DocAddress, } impl PartialOrd for GlobalScoredDoc { @@ -25,10 +24,10 @@ impl PartialOrd for GlobalScoredDoc { impl Ord for GlobalScoredDoc { #[inline] fn cmp(&self, other: &GlobalScoredDoc) -> Ordering { - other.score.partial_cmp(&self.score) - .unwrap_or( - other.doc_address.cmp(&self.doc_address) - ) + other + .score + .partial_cmp(&self.score) + .unwrap_or(other.doc_address.cmp(&self.doc_address)) } } @@ -53,7 +52,6 @@ pub struct TopCollector { } impl TopCollector { - /// Creates a top collector, with a number of documents equal to "limit". 
/// /// # Panics @@ -68,9 +66,9 @@ impl TopCollector { segment_id: 0, } } - + /// Returns K best documents sorted in decreasing order. - /// + /// /// Calling this method triggers the sort. /// The result of the sort is not cached. pub fn docs(&self) -> Vec { @@ -81,30 +79,27 @@ impl TopCollector { } /// Returns K best ScoredDocument sorted in decreasing order. - /// + /// /// Calling this method triggers the sort. /// The result of the sort is not cached. pub fn score_docs(&self) -> Vec<(Score, DocAddress)> { - let mut scored_docs: Vec = self.heap - .iter() - .cloned() - .collect(); + let mut scored_docs: Vec = self.heap.iter().cloned().collect(); scored_docs.sort(); - scored_docs.into_iter() - .map(|GlobalScoredDoc {score, doc_address}| (score, doc_address)) + scored_docs + .into_iter() + .map(|GlobalScoredDoc { score, doc_address }| (score, doc_address)) .collect() } /// Return true iff at least K documents have gone through /// the collector. #[inline] - pub fn at_capacity(&self, ) -> bool { + pub fn at_capacity(&self) -> bool { self.heap.len() >= self.limit } } impl Collector for TopCollector { - fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<()> { self.segment_id = segment_id; Ok(()) @@ -113,17 +108,21 @@ impl Collector for TopCollector { fn collect(&mut self, doc: DocId, score: Score) { if self.at_capacity() { // It's ok to unwrap as long as a limit of 0 is forbidden. 
- let limit_doc: GlobalScoredDoc = *self.heap.peek().expect("Top collector with size 0 is forbidden"); + let limit_doc: GlobalScoredDoc = + *self.heap + .peek() + .expect("Top collector with size 0 is forbidden"); if limit_doc.score < score { - let mut mut_head = self.heap.peek_mut().expect("Top collector with size 0 is forbidden"); + let mut mut_head = self.heap + .peek_mut() + .expect("Top collector with size 0 is forbidden"); mut_head.score = score; - mut_head.doc_address = DocAddress(self.segment_id, doc); + mut_head.doc_address = DocAddress(self.segment_id, doc); } - } - else { + } else { let wrapped_doc = GlobalScoredDoc { score: score, - doc_address: DocAddress(self.segment_id, doc) + doc_address: DocAddress(self.segment_id, doc), }; self.heap.push(wrapped_doc); } @@ -147,13 +146,12 @@ mod tests { top_collector.collect(3, 0.2); top_collector.collect(5, 0.3); assert!(!top_collector.at_capacity()); - let score_docs: Vec<(Score, DocId)> = top_collector.score_docs() + let score_docs: Vec<(Score, DocId)> = top_collector + .score_docs() .into_iter() .map(|(score, doc_address)| (score, doc_address.doc())) .collect(); - assert_eq!(score_docs, vec!( - (0.8, 1), (0.3, 5), (0.2, 3), - )); + assert_eq!(score_docs, vec![(0.8, 1), (0.3, 5), (0.2, 3)]); } #[test] @@ -171,9 +169,7 @@ mod tests { .into_iter() .map(|(score, doc_address)| (score, doc_address.doc())) .collect(); - assert_eq!(score_docs, vec!( - (0.9, 7), (0.8, 1), (0.3, 5), (0.2, 3) - )); + assert_eq!(score_docs, vec![(0.9, 7), (0.8, 1), (0.3, 5), (0.2, 3)]); } { let docs: Vec = top_collector @@ -181,7 +177,7 @@ mod tests { .into_iter() .map(|doc_address| doc_address.doc()) .collect(); - assert_eq!(docs, vec!(7, 1, 5, 3)); + assert_eq!(docs, vec![7, 1, 5, 3]); } diff --git a/src/common/bitpacker.rs b/src/common/bitpacker.rs index f765a3956..e16df3967 100644 --- a/src/common/bitpacker.rs +++ b/src/common/bitpacker.rs @@ -6,19 +6,19 @@ use std::mem; /// Computes the number of bits that will be used for 
bitpacking. /// -/// In general the target is the minimum number of bits +/// In general the target is the minimum number of bits /// required to express the amplitude given in argument. /// /// e.g. If the amplitude is 10, we can store all ints on simply 4bits. -/// +/// /// The logic is slightly more convoluted here as for optimization /// reasons, we want to ensure that a value spawns over at most 8 bytes /// of aligns bytes. -/// -/// Spawning over 9 bytes is possible for instance, if we do +/// +/// Spawning over 9 bytes is possible for instance, if we do /// bitpacking with an amplitude of 63 bits. /// In this case, the second int will start on bit -/// 63 (which belongs to byte 7) and ends at byte 15; +/// 63 (which belongs to byte 7) and ends at byte 15; /// Hence 9 bytes (from byte 7 to byte 15 included). /// /// To avoid this, we force the number of bits to 64bits @@ -30,12 +30,7 @@ use std::mem; /// number of bits. pub fn compute_num_bits(amplitude: u64) -> u8 { let amplitude = (64u32 - amplitude.leading_zeros()) as u8; - if amplitude <= 64 - 8 { - amplitude - } - else { - 64 - } + if amplitude <= 64 - 8 { amplitude } else { 64 } } pub struct BitPacker { @@ -46,7 +41,6 @@ pub struct BitPacker { } impl BitPacker { - pub fn new(num_bits: usize) -> BitPacker { BitPacker { mini_buffer: 0u64, @@ -55,7 +49,7 @@ impl BitPacker { written_size: 0, } } - + pub fn write(&mut self, val: u64, output: &mut TWrite) -> io::Result<()> { let val_u64 = val as u64; if self.mini_buffer_written + self.num_bits > 64 { @@ -63,30 +57,29 @@ impl BitPacker { self.written_size += self.mini_buffer.serialize(output)?; self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32); self.mini_buffer_written = self.mini_buffer_written + (self.num_bits as usize) - 64; - } - else { + } else { self.mini_buffer |= val_u64 << self.mini_buffer_written; self.mini_buffer_written += self.num_bits; if self.mini_buffer_written == 64 { self.written_size += 
self.mini_buffer.serialize(output)?; self.mini_buffer_written = 0; self.mini_buffer = 0u64; - } + } } Ok(()) } - - fn flush(&mut self, output: &mut TWrite) -> io::Result<()>{ + + fn flush(&mut self, output: &mut TWrite) -> io::Result<()> { if self.mini_buffer_written > 0 { let num_bytes = (self.mini_buffer_written + 7) / 8; - let arr: [u8; 8] = unsafe { mem::transmute::(self.mini_buffer) }; + let arr: [u8; 8] = unsafe { mem::transmute::(self.mini_buffer) }; output.write_all(&arr[..num_bytes])?; self.written_size += num_bytes; self.mini_buffer_written = 0; } Ok(()) } - + pub fn close(&mut self, output: &mut TWrite) -> io::Result { self.flush(output)?; Ok(self.written_size) @@ -99,26 +92,24 @@ pub struct BitUnpacker { num_bits: usize, mask: u64, data_ptr: *const u8, - data_len: usize, + data_len: usize, } impl BitUnpacker { pub fn new(data: &[u8], num_bits: usize) -> BitUnpacker { - let mask: u64 = - if num_bits == 64 { - !0u64 - } - else { - (1u64 << num_bits) - 1u64 - }; + let mask: u64 = if num_bits == 64 { + !0u64 + } else { + (1u64 << num_bits) - 1u64 + }; BitUnpacker { num_bits: num_bits, mask: mask, data_ptr: data.as_ptr(), - data_len: data.len() + data_len: data.len(), } } - + pub fn get(&self, idx: usize) -> u64 { if self.num_bits == 0 { return 0; @@ -127,13 +118,13 @@ impl BitUnpacker { let bit_shift = idx * self.num_bits - addr * 8; let val_unshifted_unmasked: u64; if addr + 8 <= self.data_len { - val_unshifted_unmasked = unsafe { * (self.data_ptr.offset(addr as isize) as *const u64) }; - } - else { + val_unshifted_unmasked = + unsafe { *(self.data_ptr.offset(addr as isize) as *const u64) }; + } else { let mut arr = [0u8; 8]; if addr < self.data_len { for i in 0..self.data_len - addr { - arr[i] = unsafe { *self.data_ptr.offset( (addr + i) as isize) }; + arr[i] = unsafe { *self.data_ptr.offset((addr + i) as isize) }; } } val_unshifted_unmasked = unsafe { mem::transmute::<[u8; 8], u64>(arr) }; @@ -141,7 +132,6 @@ impl BitUnpacker { let val_shifted = 
(val_unshifted_unmasked >> bit_shift) as u64; (val_shifted & self.mask) } - } @@ -150,7 +140,7 @@ impl BitUnpacker { #[cfg(test)] mod test { use super::{BitPacker, BitUnpacker, compute_num_bits}; - + #[test] fn test_compute_num_bits() { assert_eq!(compute_num_bits(1), 1u8); @@ -162,31 +152,26 @@ mod test { assert_eq!(compute_num_bits(256), 9u8); assert_eq!(compute_num_bits(5_000_000_000), 33u8); } - + fn test_bitpacker_util(len: usize, num_bits: usize) { let mut data = Vec::new(); let mut bitpacker = BitPacker::new(num_bits); let max_val: u64 = (1 << num_bits) - 1; - let vals: Vec = (0u64..len as u64).map(|i| { - if max_val == 0 { - 0 - } - else { - i % max_val - } - }).collect(); + let vals: Vec = (0u64..len as u64) + .map(|i| if max_val == 0 { 0 } else { i % max_val }) + .collect(); for &val in &vals { bitpacker.write(val, &mut data).unwrap(); } let num_bytes = bitpacker.close(&mut data).unwrap(); - assert_eq!(num_bytes, (num_bits * len + 7) / 8); + assert_eq!(num_bytes, (num_bits * len + 7) / 8); assert_eq!(data.len(), num_bytes); let bitunpacker = BitUnpacker::new(&data, num_bits); for (i, val) in vals.iter().enumerate() { assert_eq!(bitunpacker.get(i), *val); } } - + #[test] fn test_bitpacker() { test_bitpacker_util(10, 3); @@ -195,4 +180,4 @@ mod test { test_bitpacker_util(6, 14); test_bitpacker_util(1000, 14); } -} \ No newline at end of file +} diff --git a/src/common/mod.rs b/src/common/mod.rs index ae9f56794..4629ed95f 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -19,10 +19,10 @@ pub fn make_io_err(msg: String) -> io::Error { /// Has length trait pub trait HasLen { /// Return length - fn len(&self,) -> usize; - + fn len(&self) -> usize; + /// Returns true iff empty. - fn is_empty(&self,) -> bool { + fn is_empty(&self) -> bool { self.len() == 0 } } @@ -32,13 +32,13 @@ const HIGHEST_BIT: u64 = 1 << 63; /// Maps `i64` to `u64` so that /// `-2^63 .. 2^63-1` is mapped -/// to +/// to /// `0 .. 2^64` /// in that order. 
/// /// This is more suited than simply casting (`val as u64`) /// because of bitpacking. -/// +/// /// Imagine a list of `i64` ranging from -10 to 10. /// When casting negative values, the negative values are projected /// to values over 2^63, and all values end up requiring 64 bits. @@ -47,7 +47,7 @@ pub fn i64_to_u64(val: i64) -> u64 { (val as u64) ^ HIGHEST_BIT } -/// Reverse the mapping given by +/// Reverse the mapping given by /// `i64_to_u64`. #[inline(always)] pub fn u64_to_i64(val: u64) -> i64 { @@ -76,4 +76,4 @@ mod test { test_i64_converter_helper(i); } } -} \ No newline at end of file +} diff --git a/src/common/serialize.rs b/src/common/serialize.rs index 7e4fb49e5..471ac3a9c 100644 --- a/src/common/serialize.rs +++ b/src/common/serialize.rs @@ -6,7 +6,7 @@ use std::io::Read; use std::io; use common::VInt; -pub trait BinarySerializable : fmt::Debug + Sized { +pub trait BinarySerializable: fmt::Debug + Sized { fn serialize(&self, writer: &mut Write) -> io::Result; fn deserialize(reader: &mut Read) -> io::Result; } @@ -45,14 +45,13 @@ impl BinarySerializable for Ok(try!(self.0.serialize(write)) + try!(self.1.serialize(write))) } fn deserialize(reader: &mut Read) -> io::Result { - Ok( (try!(Left::deserialize(reader)), try!(Right::deserialize(reader))) ) + Ok((try!(Left::deserialize(reader)), try!(Right::deserialize(reader)))) } } impl BinarySerializable for u32 { fn serialize(&self, writer: &mut Write) -> io::Result { - writer.write_u32::(*self) - .map(|_| 4) + writer.write_u32::(*self).map(|_| 4) } fn deserialize(reader: &mut Read) -> io::Result { @@ -63,8 +62,7 @@ impl BinarySerializable for u32 { impl BinarySerializable for u64 { fn serialize(&self, writer: &mut Write) -> io::Result { - writer.write_u64::(*self) - .map(|_| 8) + writer.write_u64::(*self).map(|_| 8) } fn deserialize(reader: &mut Read) -> io::Result { reader.read_u64::() @@ -73,8 +71,7 @@ impl BinarySerializable for u64 { impl BinarySerializable for i64 { fn serialize(&self, writer: &mut 
Write) -> io::Result { - writer.write_i64::(*self) - .map(|_| 8) + writer.write_i64::(*self).map(|_| 8) } fn deserialize(reader: &mut Read) -> io::Result { reader.read_i64::() @@ -104,7 +101,9 @@ impl BinarySerializable for String { fn deserialize(reader: &mut Read) -> io::Result { let string_length = try!(VInt::deserialize(reader)).val() as usize; let mut result = String::with_capacity(string_length); - try!(reader.take(string_length as u64).read_to_string(&mut result)); + try!(reader + .take(string_length as u64) + .read_to_string(&mut result)); Ok(result) } } @@ -122,8 +121,7 @@ mod test { if num_bytes != 0 { assert_eq!(v.serialize(&mut buffer).unwrap(), num_bytes); assert_eq!(buffer.len(), num_bytes); - } - else { + } else { v.serialize(&mut buffer).unwrap(); } let mut cursor = &buffer[..]; @@ -147,15 +145,15 @@ mod test { #[test] fn test_serialize_string() { serialize_test(String::from(""), 1); - serialize_test(String::from("ぽよぽよ"), 1 + 3*4); - serialize_test(String::from("富士さん見える。"), 1 + 3*8); + serialize_test(String::from("ぽよぽよ"), 1 + 3 * 4); + serialize_test(String::from("富士さん見える。"), 1 + 3 * 8); } #[test] fn test_serialize_vec() { let v: Vec = Vec::new(); serialize_test(v, 1); - serialize_test(vec!(1u32, 3u32), 1 + 4*2); + serialize_test(vec![1u32, 3u32], 1 + 4 * 2); } #[test] diff --git a/src/common/timer.rs b/src/common/timer.rs index e28d1af6b..035bd65de 100644 --- a/src/common/timer.rs +++ b/src/common/timer.rs @@ -10,7 +10,7 @@ pub struct OpenTimer<'a> { impl<'a> OpenTimer<'a> { /// Starts timing a new named subtask /// - /// The timer is stopped automatically + /// The timer is stopped automatically /// when the `OpenTimer` is dropped. 
pub fn open(&mut self, name: &'static str) -> OpenTimer { OpenTimer { @@ -23,12 +23,17 @@ impl<'a> OpenTimer<'a> { } impl<'a> Drop for OpenTimer<'a> { - fn drop(&mut self,) { - self.timer_tree.timings.push(Timing { - name: self.name, - duration: self.start.to(PreciseTime::now()).num_microseconds().unwrap(), - depth: self.depth, - }); + fn drop(&mut self) { + self.timer_tree + .timings + .push(Timing { + name: self.name, + duration: self.start + .to(PreciseTime::now()) + .num_microseconds() + .unwrap(), + depth: self.depth, + }); } } @@ -47,12 +52,11 @@ pub struct TimerTree { } impl TimerTree { - - /// Returns the total time elapsed in microseconds - pub fn total_time(&self,) -> i64 { + /// Returns the total time elapsed in microseconds + pub fn total_time(&self) -> i64 { self.timings.last().unwrap().duration } - + /// Open a new named subtask pub fn open(&mut self, name: &'static str) -> OpenTimer { OpenTimer { @@ -66,9 +70,7 @@ impl TimerTree { impl Default for TimerTree { fn default() -> TimerTree { - TimerTree { - timings: Vec::new(), - } + TimerTree { timings: Vec::new() } } } diff --git a/src/common/vint.rs b/src/common/vint.rs index 70b1f9589..0563d8f8e 100644 --- a/src/common/vint.rs +++ b/src/common/vint.rs @@ -5,12 +5,12 @@ use std::io::Read; -/// Wrapper over a `u64` that serializes as a variable int. +/// Wrapper over a `u64` that serializes as a variable int. 
#[derive(Debug, Eq, PartialEq)] pub struct VInt(pub u64); impl VInt { - pub fn val(&self,) -> u64 { + pub fn val(&self) -> u64 { self.0 } } @@ -27,8 +27,7 @@ impl BinarySerializable for VInt { buffer[written] = next_byte | 128u8; written += 1; break; - } - else { + } else { buffer[written] = next_byte; written += 1; } @@ -50,12 +49,9 @@ impl BinarySerializable for VInt { } shift += 7; } - _ => { - return Err(io::Error::new(io::ErrorKind::InvalidData, "Reach end of buffer")) - } + _ => return Err(io::Error::new(io::ErrorKind::InvalidData, "Reach end of buffer")), } } Ok(VInt(result)) } } - diff --git a/src/compression/composite.rs b/src/compression/composite.rs index a92023405..57fae249c 100644 --- a/src/compression/composite.rs +++ b/src/compression/composite.rs @@ -8,38 +8,40 @@ pub struct CompositeEncoder { } impl CompositeEncoder { - pub fn new() -> CompositeEncoder { CompositeEncoder { block_encoder: BlockEncoder::new(), output: Vec::with_capacity(500_000), } } - + pub fn compress_sorted(&mut self, vals: &[u32]) -> &[u8] { self.output.clear(); let num_blocks = vals.len() / NUM_DOCS_PER_BLOCK; let mut offset = 0u32; for i in 0..num_blocks { - let vals_slice = &vals[i * NUM_DOCS_PER_BLOCK .. 
(i + 1) * NUM_DOCS_PER_BLOCK]; + let vals_slice = &vals[i * NUM_DOCS_PER_BLOCK..(i + 1) * NUM_DOCS_PER_BLOCK]; let block_compressed = self.block_encoder.compress_block_sorted(vals_slice, offset); offset = vals_slice[NUM_DOCS_PER_BLOCK - 1]; self.output.extend_from_slice(block_compressed); } - let vint_compressed = self.block_encoder.compress_vint_sorted(&vals[num_blocks * NUM_DOCS_PER_BLOCK..], offset); + let vint_compressed = + self.block_encoder + .compress_vint_sorted(&vals[num_blocks * NUM_DOCS_PER_BLOCK..], offset); self.output.extend_from_slice(vint_compressed); &self.output } - + pub fn compress_unsorted(&mut self, vals: &[u32]) -> &[u8] { self.output.clear(); let num_blocks = vals.len() / NUM_DOCS_PER_BLOCK; for i in 0..num_blocks { - let vals_slice = &vals[i * NUM_DOCS_PER_BLOCK .. (i + 1) * NUM_DOCS_PER_BLOCK]; + let vals_slice = &vals[i * NUM_DOCS_PER_BLOCK..(i + 1) * NUM_DOCS_PER_BLOCK]; let block_compressed = self.block_encoder.compress_block_unsorted(vals_slice); self.output.extend_from_slice(block_compressed); } - let vint_compressed = self.block_encoder.compress_vint_unsorted(&vals[num_blocks * NUM_DOCS_PER_BLOCK..]); + let vint_compressed = self.block_encoder + .compress_vint_unsorted(&vals[num_blocks * NUM_DOCS_PER_BLOCK..]); self.output.extend_from_slice(vint_compressed); &self.output } @@ -57,10 +59,13 @@ impl CompositeDecoder { CompositeDecoder { block_decoder: BlockDecoder::new(), vals: Vec::with_capacity(500_000), - } + } } - - pub fn uncompress_sorted(&mut self, mut compressed_data: &[u8], uncompressed_len: usize) -> &[u32] { + + pub fn uncompress_sorted(&mut self, + mut compressed_data: &[u8], + uncompressed_len: usize) + -> &[u32] { if uncompressed_len > self.vals.capacity() { let extra_capacity = uncompressed_len - self.vals.capacity(); self.vals.reserve(extra_capacity); @@ -69,24 +74,37 @@ impl CompositeDecoder { self.vals.clear(); let num_blocks = uncompressed_len / NUM_DOCS_PER_BLOCK; for _ in 0..num_blocks { - compressed_data = 
self.block_decoder.uncompress_block_sorted(compressed_data, offset); + compressed_data = self.block_decoder + .uncompress_block_sorted(compressed_data, offset); offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1); - self.vals.extend_from_slice(self.block_decoder.output_array()); + self.vals + .extend_from_slice(self.block_decoder.output_array()); } - self.block_decoder.uncompress_vint_sorted(compressed_data, offset, uncompressed_len % NUM_DOCS_PER_BLOCK); - self.vals.extend_from_slice(self.block_decoder.output_array()); + self.block_decoder + .uncompress_vint_sorted(compressed_data, + offset, + uncompressed_len % NUM_DOCS_PER_BLOCK); + self.vals + .extend_from_slice(self.block_decoder.output_array()); &self.vals } - - pub fn uncompress_unsorted(&mut self, mut compressed_data: &[u8], uncompressed_len: usize) -> &[u32] { + + pub fn uncompress_unsorted(&mut self, + mut compressed_data: &[u8], + uncompressed_len: usize) + -> &[u32] { self.vals.clear(); let num_blocks = uncompressed_len / NUM_DOCS_PER_BLOCK; for _ in 0..num_blocks { - compressed_data = self.block_decoder.uncompress_block_unsorted(compressed_data); - self.vals.extend_from_slice(self.block_decoder.output_array()); + compressed_data = self.block_decoder + .uncompress_block_unsorted(compressed_data); + self.vals + .extend_from_slice(self.block_decoder.output_array()); } - self.block_decoder.uncompress_vint_unsorted(compressed_data, uncompressed_len % NUM_DOCS_PER_BLOCK); - self.vals.extend_from_slice(self.block_decoder.output_array()); + self.block_decoder + .uncompress_vint_unsorted(compressed_data, uncompressed_len % NUM_DOCS_PER_BLOCK); + self.vals + .extend_from_slice(self.block_decoder.output_array()); &self.vals } } @@ -100,7 +118,7 @@ impl Into> for CompositeDecoder { #[cfg(test)] pub mod tests { - + use test::Bencher; use super::*; use compression::tests::generate_array; @@ -115,7 +133,7 @@ pub mod tests { let result = decoder.uncompress_unsorted(&compressed, data.len()); for i in 
0..data.len() { assert_eq!(data[i], result[i]); - } + } } #[test] @@ -128,32 +146,25 @@ pub mod tests { let result = decoder.uncompress_sorted(&compressed, data.len()); for i in 0..data.len() { assert_eq!(data[i], result[i]); - } + } } - - + + const BENCH_NUM_INTS: usize = 99_968; - + #[bench] fn bench_compress(b: &mut Bencher) { let mut encoder = CompositeEncoder::new(); let data = generate_array(BENCH_NUM_INTS, 0.1); - b.iter(|| { - encoder.compress_sorted(&data); - }); + b.iter(|| { encoder.compress_sorted(&data); }); } - + #[bench] fn bench_uncompress(b: &mut Bencher) { let mut encoder = CompositeEncoder::new(); let data = generate_array(BENCH_NUM_INTS, 0.1); let compressed = encoder.compress_sorted(&data); - let mut decoder = CompositeDecoder::new(); - b.iter(|| { - decoder.uncompress_sorted(compressed, BENCH_NUM_INTS); - }); + let mut decoder = CompositeDecoder::new(); + b.iter(|| { decoder.uncompress_sorted(compressed, BENCH_NUM_INTS); }); } } - - - diff --git a/src/compression/mod.rs b/src/compression/mod.rs index 722521c2c..3331f6b08 100644 --- a/src/compression/mod.rs +++ b/src/compression/mod.rs @@ -38,39 +38,44 @@ pub trait VIntEncoder { } pub trait VIntDecoder { - fn uncompress_vint_sorted<'a>(&mut self, compressed_data: &'a [u8], offset: u32, num_els: usize) -> &'a [u8]; - fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> &'a [u8]; + fn uncompress_vint_sorted<'a>(&mut self, + compressed_data: &'a [u8], + offset: u32, + num_els: usize) + -> &'a [u8]; + fn uncompress_vint_unsorted<'a>(&mut self, + compressed_data: &'a [u8], + num_els: usize) + -> &'a [u8]; } impl VIntEncoder for BlockEncoder { - fn compress_vint_sorted(&mut self, input: &[u32], offset: u32) -> &[u8] { vint::compress_sorted(input, &mut self.output, offset) } - + fn compress_vint_unsorted(&mut self, input: &[u32]) -> &[u8] { vint::compress_unsorted(input, &mut self.output) } } impl VIntDecoder for BlockDecoder { - - fn uncompress_vint_sorted<'a>( - 
&mut self, - compressed_data: &'a [u8], - offset: u32, - num_els: usize) -> &'a [u8] { + fn uncompress_vint_sorted<'a>(&mut self, + compressed_data: &'a [u8], + offset: u32, + num_els: usize) + -> &'a [u8] { self.output_len = num_els; vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset) } - - fn uncompress_vint_unsorted<'a>( - &mut self, - compressed_data: &'a [u8], - num_els: usize) -> &'a [u8] { + + fn uncompress_vint_unsorted<'a>(&mut self, + compressed_data: &'a [u8], + num_els: usize) + -> &'a [u8] { self.output_len = num_els; vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els]) - } + } } @@ -84,12 +89,12 @@ pub mod tests { use rand::XorShiftRng; use super::*; use test::Bencher; - + fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec { let seed: &[u32; 4] = &[1, 2, 3, seed_val]; let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed); (0..u32::max_value()) - .filter(|_| rng.next_f32()< ratio) + .filter(|_| rng.next_f32() < ratio) .take(n) .collect() } @@ -100,12 +105,12 @@ pub mod tests { #[test] fn test_encode_sorted_block() { - let vals: Vec = (0u32..128u32).map(|i| i*7).collect(); + let vals: Vec = (0u32..128u32).map(|i| i * 7).collect(); let mut encoder = BlockEncoder::new(); let compressed_data = encoder.compress_block_sorted(&vals, 0); let mut decoder = BlockDecoder::new(); { - let remaining_data = decoder.uncompress_block_sorted(compressed_data, 0); + let remaining_data = decoder.uncompress_block_sorted(compressed_data, 0); assert_eq!(remaining_data.len(), 0); } for i in 0..128 { @@ -115,31 +120,31 @@ pub mod tests { #[test] fn test_encode_sorted_block_with_offset() { - let vals: Vec = (0u32..128u32).map(|i| 11 + i*7).collect(); + let vals: Vec = (0u32..128u32).map(|i| 11 + i * 7).collect(); let mut encoder = BlockEncoder::new(); let compressed_data = encoder.compress_block_sorted(&vals, 10); let mut decoder = BlockDecoder::new(); { - let remaining_data = 
decoder.uncompress_block_sorted(compressed_data, 10); + let remaining_data = decoder.uncompress_block_sorted(compressed_data, 10); assert_eq!(remaining_data.len(), 0); } for i in 0..128 { assert_eq!(vals[i], decoder.output(i)); } } - + #[test] fn test_encode_sorted_block_with_junk() { let mut compressed: Vec = Vec::new(); let n = 128; - let vals: Vec = (0..n).map(|i| 11u32 + (i as u32)*7u32).collect(); + let vals: Vec = (0..n).map(|i| 11u32 + (i as u32) * 7u32).collect(); let mut encoder = BlockEncoder::new(); let compressed_data = encoder.compress_block_sorted(&vals, 10); compressed.extend_from_slice(compressed_data); compressed.push(173u8); let mut decoder = BlockDecoder::new(); { - let remaining_data = decoder.uncompress_block_sorted(&compressed, 10); + let remaining_data = decoder.uncompress_block_sorted(&compressed, 10); assert_eq!(remaining_data.len(), 1); assert_eq!(remaining_data[0], 173u8); } @@ -152,14 +157,14 @@ pub mod tests { fn test_encode_unsorted_block_with_junk() { let mut compressed: Vec = Vec::new(); let n = 128; - let vals: Vec = (0..n).map(|i| 11u32 + (i as u32)*7u32 % 12).collect(); + let vals: Vec = (0..n).map(|i| 11u32 + (i as u32) * 7u32 % 12).collect(); let mut encoder = BlockEncoder::new(); let compressed_data = encoder.compress_block_unsorted(&vals); compressed.extend_from_slice(compressed_data); compressed.push(173u8); let mut decoder = BlockDecoder::new(); { - let remaining_data = decoder.uncompress_block_unsorted(&compressed); + let remaining_data = decoder.uncompress_block_unsorted(&compressed); assert_eq!(remaining_data.len(), 1); assert_eq!(remaining_data[0], 173u8); } @@ -167,22 +172,20 @@ pub mod tests { assert_eq!(vals[i], decoder.output(i)); } } - - + + #[test] fn test_encode_vint() { { let expected_length = 154; let mut encoder = BlockEncoder::new(); - let input: Vec = (0u32..123u32) - .map(|i| 4 + i * 7 / 2) - .into_iter() - .collect(); + let input: Vec = (0u32..123u32).map(|i| 4 + i * 7 / 2).into_iter().collect(); for offset 
in &[0u32, 1u32, 2u32] { let encoded_data = encoder.compress_vint_sorted(&input, *offset); assert!(encoded_data.len() <= expected_length); let mut decoder = BlockDecoder::new(); - let remaining_data = decoder.uncompress_vint_sorted(&encoded_data, *offset, input.len()); + let remaining_data = + decoder.uncompress_vint_sorted(&encoded_data, *offset, input.len()); assert_eq!(0, remaining_data.len()); assert_eq!(input, decoder.output_array()); } @@ -194,20 +197,16 @@ pub mod tests { fn bench_compress(b: &mut Bencher) { let mut encoder = BlockEncoder::new(); let data = generate_array(NUM_DOCS_PER_BLOCK, 0.1); - b.iter(|| { - encoder.compress_block_sorted(&data, 0u32); - }); + b.iter(|| { encoder.compress_block_sorted(&data, 0u32); }); } - + #[bench] fn bench_uncompress(b: &mut Bencher) { let mut encoder = BlockEncoder::new(); let data = generate_array(NUM_DOCS_PER_BLOCK, 0.1); let compressed = encoder.compress_block_sorted(&data, 0u32); - let mut decoder = BlockDecoder::new(); - b.iter(|| { - decoder.uncompress_block_sorted(compressed, 0u32); - }); + let mut decoder = BlockDecoder::new(); + b.iter(|| { decoder.uncompress_block_sorted(compressed, 0u32); }); } @@ -217,20 +216,16 @@ pub mod tests { fn bench_compress_vint(b: &mut Bencher) { let mut encoder = BlockEncoder::new(); let data = generate_array(NUM_INTS_BENCH_VINT, 0.001); - b.iter(|| { - encoder.compress_vint_sorted(&data, 0u32); - }); + b.iter(|| { encoder.compress_vint_sorted(&data, 0u32); }); } - + #[bench] fn bench_uncompress_vint(b: &mut Bencher) { let mut encoder = BlockEncoder::new(); let data = generate_array(NUM_INTS_BENCH_VINT, 0.001); let compressed = encoder.compress_vint_sorted(&data, 0u32); - let mut decoder = BlockDecoder::new(); - b.iter(|| { - decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT); - }); + let mut decoder = BlockDecoder::new(); + b.iter(|| { decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT); }); } } diff --git 
a/src/compression/pack/compression_pack_nosimd.rs b/src/compression/pack/compression_pack_nosimd.rs index 979cd4eb8..4086688d2 100644 --- a/src/compression/pack/compression_pack_nosimd.rs +++ b/src/compression/pack/compression_pack_nosimd.rs @@ -4,10 +4,10 @@ use std::cmp; use std::io::Write; use super::super::NUM_DOCS_PER_BLOCK; -const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1; +const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1; pub fn compress_sorted(vals: &mut [u32], mut output: &mut [u8], offset: u32) -> usize { - let mut max_delta = 0; + let mut max_delta = 0; { let mut local_offset = offset; for i in 0..NUM_DOCS_PER_BLOCK { @@ -24,7 +24,10 @@ pub fn compress_sorted(vals: &mut [u32], mut output: &mut [u8], offset: u32) -> for val in vals { bit_packer.write(*val, &mut output).unwrap(); } - 1 + bit_packer.close(&mut output).expect("packing in memory should never fail") + 1 + + bit_packer + .close(&mut output) + .expect("packing in memory should never fail") } @@ -36,36 +39,40 @@ pub struct BlockEncoder { } impl BlockEncoder { - pub fn new() -> BlockEncoder { BlockEncoder { output: [0u8; COMPRESSED_BLOCK_MAX_SIZE], output_len: 0, input_buffer: [0u32; NUM_DOCS_PER_BLOCK], - } + } } - + pub fn compress_block_sorted(&mut self, vals: &[u32], offset: u32) -> &[u8] { self.input_buffer.clone_from_slice(vals); let compressed_size = compress_sorted(&mut self.input_buffer, &mut self.output, offset); &self.output[..compressed_size] } - - pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] { + + pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] { let compressed_size: usize = { - let mut output: &mut [u8] = &mut self.output; - let max = vals.iter().cloned().max().expect("compress unsorted called with an empty array"); + let mut output: &mut [u8] = &mut self.output; + let max = vals.iter() + .cloned() + .max() + .expect("compress unsorted called with an empty array"); let num_bits = compute_num_bits(max); 
output.write_all(&[num_bits]).unwrap(); let mut bit_packer = BitPacker::new(num_bits as usize); for val in vals { bit_packer.write(*val, &mut output).unwrap(); } - 1 + bit_packer.close(&mut output).expect("packing in memory should never fail") + 1 + + bit_packer + .close(&mut output) + .expect("packing in memory should never fail") }; &self.output[..compressed_size] } - } pub struct BlockDecoder { @@ -78,15 +85,18 @@ impl BlockDecoder { pub fn new() -> BlockDecoder { BlockDecoder::with_val(0u32) } - + pub fn with_val(val: u32) -> BlockDecoder { BlockDecoder { output: [val; COMPRESSED_BLOCK_MAX_SIZE], output_len: 0, } } - - pub fn uncompress_block_sorted<'a>(&mut self, compressed_data: &'a [u8], mut offset: u32) -> &'a[u8] { + + pub fn uncompress_block_sorted<'a>(&mut self, + compressed_data: &'a [u8], + mut offset: u32) + -> &'a [u8] { let consumed_size = { let num_bits = compressed_data[0]; let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize); @@ -96,13 +106,13 @@ impl BlockDecoder { self.output[i] = val; offset = val; } - 1 + (num_bits as usize * NUM_DOCS_PER_BLOCK + 7) / 8 + 1 + (num_bits as usize * NUM_DOCS_PER_BLOCK + 7) / 8 }; self.output_len = NUM_DOCS_PER_BLOCK; &compressed_data[consumed_size..] } - - pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> &'a[u8] { + + pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> &'a [u8] { let num_bits = compressed_data[0]; let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize); for i in 0..NUM_DOCS_PER_BLOCK { @@ -112,16 +122,14 @@ impl BlockDecoder { self.output_len = NUM_DOCS_PER_BLOCK; &compressed_data[consumed_size..] 
} - + #[inline] - pub fn output_array(&self,) -> &[u32] { + pub fn output_array(&self) -> &[u32] { &self.output[..self.output_len] } - + #[inline] pub fn output(&self, idx: usize) -> u32 { self.output[idx] } } - - diff --git a/src/compression/pack/compression_pack_simd.rs b/src/compression/pack/compression_pack_simd.rs index 4351e7c16..78cf58c37 100644 --- a/src/compression/pack/compression_pack_simd.rs +++ b/src/compression/pack/compression_pack_simd.rs @@ -1,28 +1,21 @@ use super::super::NUM_DOCS_PER_BLOCK; -const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1; +const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1; mod simdcomp { use libc::size_t; - extern { - pub fn compress_sorted( - data: *const u32, - output: *mut u8, - offset: u32) -> size_t; + extern "C" { + pub fn compress_sorted(data: *const u32, output: *mut u8, offset: u32) -> size_t; - pub fn uncompress_sorted( - compressed_data: *const u8, - output: *mut u32, - offset: u32) -> size_t; - - pub fn compress_unsorted( - data: *const u32, - output: *mut u8) -> size_t; + pub fn uncompress_sorted(compressed_data: *const u8, + output: *mut u32, + offset: u32) + -> size_t; - pub fn uncompress_unsorted( - compressed_data: *const u8, - output: *mut u32) -> size_t; + pub fn compress_unsorted(data: *const u32, output: *mut u8) -> size_t; + + pub fn uncompress_unsorted(compressed_data: *const u8, output: *mut u32) -> size_t; } } @@ -49,24 +42,22 @@ pub struct BlockEncoder { } impl BlockEncoder { - pub fn new() -> BlockEncoder { BlockEncoder { output: [0u8; COMPRESSED_BLOCK_MAX_SIZE], output_len: 0, - } + } } - + pub fn compress_block_sorted(&mut self, vals: &[u32], offset: u32) -> &[u8] { let compressed_size = compress_sorted(vals, &mut self.output, offset); &self.output[..compressed_size] } - + pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] { let compressed_size = compress_unsorted(vals, &mut self.output); &self.output[..compressed_size] } - } pub struct BlockDecoder 
{ @@ -79,31 +70,34 @@ impl BlockDecoder { pub fn new() -> BlockDecoder { BlockDecoder::with_val(0u32) } - + pub fn with_val(val: u32) -> BlockDecoder { BlockDecoder { output: [val; COMPRESSED_BLOCK_MAX_SIZE], output_len: 0, } } - - pub fn uncompress_block_sorted<'a>(&mut self, compressed_data: &'a [u8], offset: u32) -> &'a[u8] { + + pub fn uncompress_block_sorted<'a>(&mut self, + compressed_data: &'a [u8], + offset: u32) + -> &'a [u8] { let consumed_size = uncompress_sorted(compressed_data, &mut self.output, offset); self.output_len = NUM_DOCS_PER_BLOCK; &compressed_data[consumed_size..] } - - pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> &'a[u8] { + + pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> &'a [u8] { let consumed_size = uncompress_unsorted(compressed_data, &mut self.output); self.output_len = NUM_DOCS_PER_BLOCK; &compressed_data[consumed_size..] } - + #[inline] - pub fn output_array(&self,) -> &[u32] { + pub fn output_array(&self) -> &[u32] { &self.output[..self.output_len] } - + #[inline] pub fn output(&self, idx: usize) -> u32 { self.output[idx] @@ -123,4 +117,4 @@ mod tests { let compressed = encoder.compress_block_sorted(&data, 0u32); assert_eq!(compressed.len(), 17); } -} \ No newline at end of file +} diff --git a/src/compression/vint/compression_vint_nosimd.rs b/src/compression/vint/compression_vint_nosimd.rs index b53dee1c7..a3af5e489 100644 --- a/src/compression/vint/compression_vint_nosimd.rs +++ b/src/compression/vint/compression_vint_nosimd.rs @@ -12,8 +12,7 @@ pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32) output[byte_written] = next_byte | 128u8; byte_written += 1; break; - } - else { + } else { output[byte_written] = next_byte; byte_written += 1; } @@ -34,8 +33,7 @@ pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] { output[byte_written] = next_byte | 128u8; byte_written += 1; break; - } - else { + } else { 
output[byte_written] = next_byte; byte_written += 1; } @@ -45,10 +43,10 @@ pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] { } #[inline(always)] -pub fn uncompress_sorted<'a>( - compressed_data: &'a [u8], - output: &mut [u32], - offset: u32) -> &'a [u8] { +pub fn uncompress_sorted<'a>(compressed_data: &'a [u8], + output: &mut [u32], + offset: u32) + -> &'a [u8] { let mut read_byte = 0; let mut result = offset; let num_els = output.len(); @@ -69,9 +67,7 @@ pub fn uncompress_sorted<'a>( } #[inline(always)] -pub fn uncompress_unsorted<'a>( - compressed_data: &'a [u8], - output: &mut [u32]) -> &'a [u8] { +pub fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> &'a [u8] { let mut read_byte = 0; let num_els = output.len(); for i in 0..num_els { diff --git a/src/compression/vint/compression_vint_simd.rs b/src/compression/vint/compression_vint_simd.rs index 9d7264994..dbeca660c 100644 --- a/src/compression/vint/compression_vint_simd.rs +++ b/src/compression/vint/compression_vint_simd.rs @@ -3,28 +3,25 @@ mod streamvbyte { use libc::size_t; - extern { - pub fn streamvbyte_delta_encode( - data: *const u32, - num_els: u32, - output: *mut u8, - offset: u32) -> size_t; + extern "C" { + pub fn streamvbyte_delta_encode(data: *const u32, + num_els: u32, + output: *mut u8, + offset: u32) + -> size_t; - pub fn streamvbyte_delta_decode( - compressed_data: *const u8, - output: *mut u32, - num_els: u32, - offset: u32) -> size_t; - - pub fn streamvbyte_encode( - data: *const u32, - num_els: u32, - output: *mut u8) -> size_t; - - pub fn streamvbyte_decode( - compressed_data: *const u8, - output: *mut u32, - num_els: usize) -> size_t; + pub fn streamvbyte_delta_decode(compressed_data: *const u8, + output: *mut u32, + num_els: u32, + offset: u32) + -> size_t; + + pub fn streamvbyte_encode(data: *const u32, num_els: u32, output: *mut u8) -> size_t; + + pub fn streamvbyte_decode(compressed_data: *const u8, + output: *mut u32, + num_els: 
usize) + -> size_t; } } @@ -32,11 +29,10 @@ mod streamvbyte { #[inline(always)] pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], offset: u32) -> &'a [u8] { let compress_length = unsafe { - streamvbyte::streamvbyte_delta_encode( - input.as_ptr(), - input.len() as u32, - output.as_mut_ptr(), - offset) + streamvbyte::streamvbyte_delta_encode(input.as_ptr(), + input.len() as u32, + output.as_mut_ptr(), + offset) }; &output[..compress_length] } @@ -44,39 +40,29 @@ pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], offset: u32) -> #[inline(always)] pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] { let compress_length = unsafe { - streamvbyte::streamvbyte_encode( - input.as_ptr(), - input.len() as u32, - output.as_mut_ptr()) - }; + streamvbyte::streamvbyte_encode(input.as_ptr(), input.len() as u32, output.as_mut_ptr()) + }; &output[..compress_length] } #[inline(always)] -pub fn uncompress_sorted<'a>( - compressed_data: &'a [u8], - output: &mut [u32], - offset: u32) -> &'a [u8] { +pub fn uncompress_sorted<'a>(compressed_data: &'a [u8], + output: &mut [u32], + offset: u32) + -> &'a [u8] { let consumed_bytes = unsafe { - streamvbyte::streamvbyte_delta_decode( - compressed_data.as_ptr(), - output.as_mut_ptr(), - output.len() as u32, - offset) + streamvbyte::streamvbyte_delta_decode(compressed_data.as_ptr(), + output.as_mut_ptr(), + output.len() as u32, + offset) }; &compressed_data[consumed_bytes..] } #[inline(always)] -pub fn uncompress_unsorted<'a>( - compressed_data: &'a [u8], - output: &mut [u32]) -> &'a [u8] { +pub fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> &'a [u8] { let consumed_bytes = unsafe { - streamvbyte::streamvbyte_decode( - compressed_data.as_ptr(), - output.as_mut_ptr(), - output.len()) + streamvbyte::streamvbyte_decode(compressed_data.as_ptr(), output.as_mut_ptr(), output.len()) }; &compressed_data[consumed_bytes..] 
} - diff --git a/src/core/index.rs b/src/core/index.rs index 0210f5ad0..0e0fdcf76 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -153,11 +153,10 @@ impl Index { /// Returns the list of segments that are searchable pub fn searchable_segments(&self) -> Result> { - Ok(self - .searchable_segment_metas()? - .into_iter() - .map(|segment_meta| self.segment(segment_meta)) - .collect()) + Ok(self.searchable_segment_metas()? + .into_iter() + .map(|segment_meta| self.segment(segment_meta)) + .collect()) } #[doc(hidden)] @@ -186,13 +185,13 @@ impl Index { pub fn searchable_segment_metas(&self) -> Result> { Ok(load_metas(self.directory())?.segments) } - + /// Returns the list of segment ids that are searchable. pub fn searchable_segment_ids(&self) -> Result> { Ok(self.searchable_segment_metas()? .iter() .map(|segment_meta| segment_meta.id()) - .collect()) + .collect()) } /// Creates a new generation of searchers after @@ -203,9 +202,9 @@ impl Index { pub fn load_searchers(&self) -> Result<()> { let searchable_segments = self.searchable_segments()?; let segment_readers: Vec = try!(searchable_segments - .into_iter() - .map(SegmentReader::open) - .collect()); + .into_iter() + .map(SegmentReader::open) + .collect()); let searchers = (0..NUM_SEARCHERS) .map(|_| Searcher::from(segment_readers.clone())) .collect(); diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs index 849d21041..785846a0d 100644 --- a/src/core/index_meta.rs +++ b/src/core/index_meta.rs @@ -2,9 +2,9 @@ use schema::Schema; use core::SegmentMeta; /// Meta information about the `Index`. -/// +/// /// This object is serialized on disk in the `meta.json` file. 
-/// It keeps information about +/// It keeps information about /// * the searchable segments, /// * the index docstamp /// * the schema @@ -19,7 +19,7 @@ pub struct IndexMeta { impl IndexMeta { pub fn with_schema(schema: Schema) -> IndexMeta { IndexMeta { - segments: vec!(), + segments: vec![], schema: schema, opstamp: 0u64, } diff --git a/src/core/mod.rs b/src/core/mod.rs index 6f7cb9edc..719246f9a 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -40,4 +40,4 @@ lazy_static! { /// /// If the process is killed and this file remains, it is safe to remove it manually. pub static ref LOCKFILE_FILEPATH: PathBuf = PathBuf::from(".tantivy-indexer.lock"); -} \ No newline at end of file +} diff --git a/src/core/pool.rs b/src/core/pool.rs index eca549139..805ea3467 100644 --- a/src/core/pool.rs +++ b/src/core/pool.rs @@ -13,20 +13,17 @@ pub struct GenerationItem { // See https://github.com/crossbeam-rs/crossbeam/issues/91 struct NonLeakingMsQueue { - underlying_queue: MsQueue + underlying_queue: MsQueue, } impl Default for NonLeakingMsQueue { fn default() -> NonLeakingMsQueue { - NonLeakingMsQueue { - underlying_queue: MsQueue::new(), - } + NonLeakingMsQueue { underlying_queue: MsQueue::new() } } } impl NonLeakingMsQueue { - - fn pop(&self,) -> T { + fn pop(&self) -> T { self.underlying_queue.pop() } @@ -48,7 +45,6 @@ pub struct Pool { } impl Pool { - pub fn new() -> Pool { Pool { queue: Arc::default(), @@ -68,23 +64,24 @@ impl Pool { } self.advertise_generation(next_generation); } - - /// At the exit of this method, + + /// At the exit of this method, /// - freshest_generation has a value greater or equal than generation /// - freshest_generation has a value that has been advertised - /// - freshest_generation has + /// - freshest_generation has fn advertise_generation(&self, generation: usize) { - // not optimal at all but the easiest to read proof. + // not optimal at all but the easiest to read proof. 
loop { let former_generation = self.freshest_generation.load(Ordering::Acquire); if former_generation >= generation { break; } - self.freshest_generation.compare_and_swap(former_generation, generation, Ordering::SeqCst); - } + self.freshest_generation + .compare_and_swap(former_generation, generation, Ordering::SeqCst); + } } - - fn generation(&self,) -> usize { + + fn generation(&self) -> usize { self.freshest_generation.load(Ordering::Acquire) } @@ -94,19 +91,16 @@ impl Pool { let gen_item = self.queue.pop(); if gen_item.generation >= generation { return LeasedItem { - gen_item: Some(gen_item), - recycle_queue: self.queue.clone(), - } - } - else { + gen_item: Some(gen_item), + recycle_queue: self.queue.clone(), + }; + } else { // this searcher is obsolete, // removing it from the pool. } } - + } - - } pub struct LeasedItem { @@ -115,23 +109,29 @@ pub struct LeasedItem { } impl Deref for LeasedItem { - type Target = T; fn deref(&self) -> &T { - &self.gen_item.as_ref().expect("Unwrapping a leased item should never fail").item // unwrap is safe here + &self.gen_item + .as_ref() + .expect("Unwrapping a leased item should never fail") + .item // unwrap is safe here } } impl DerefMut for LeasedItem { fn deref_mut(&mut self) -> &mut T { - &mut self.gen_item.as_mut().expect("Unwrapping a mut leased item should never fail").item // unwrap is safe here + &mut self.gen_item + .as_mut() + .expect("Unwrapping a mut leased item should never fail") + .item // unwrap is safe here } } impl Drop for LeasedItem { fn drop(&mut self) { - let gen_item: GenerationItem = mem::replace(&mut self.gen_item, None).expect("Unwrapping a leased item should never fail"); + let gen_item: GenerationItem = mem::replace(&mut self.gen_item, None) + .expect("Unwrapping a leased item should never fail"); self.recycle_queue.push(gen_item); } } @@ -158,4 +158,4 @@ mod tests { assert_eq!(*pool.acquire(), 11); } } -} \ No newline at end of file +} diff --git a/src/core/searcher.rs b/src/core/searcher.rs 
index fe54d1a5f..2faa919f3 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -13,36 +13,35 @@ use std::fmt; /// Holds a list of `SegmentReader`s ready for search. /// -/// It guarantees that the `Segment` will not be removed before +/// It guarantees that the `Segment` will not be removed before /// the destruction of the `Searcher`. -/// +/// pub struct Searcher { segment_readers: Vec, } impl Searcher { - /// Fetches a document from tantivy's store given a `DocAddress`. /// /// The searcher uses the segment ordinal to route the - /// the request to the right `Segment`. + /// the request to the right `Segment`. pub fn doc(&self, doc_address: &DocAddress) -> Result { let DocAddress(segment_local_id, doc_id) = *doc_address; let segment_reader = &self.segment_readers[segment_local_id as usize]; segment_reader.doc(doc_id) } - + /// Returns the overall number of documents in the index. - pub fn num_docs(&self,) -> DocId { + pub fn num_docs(&self) -> DocId { self.segment_readers .iter() .map(|segment_reader| segment_reader.num_docs()) .fold(0u32, |acc, val| acc + val) } - + /// Return the overall number of documents containing - /// the given term. + /// the given term. 
pub fn doc_freq(&self, term: &Term) -> u32 { self.segment_readers .iter() @@ -63,15 +62,15 @@ impl Searcher { } /// Return the list of segment readers - pub fn segment_readers(&self,) -> &[SegmentReader] { + pub fn segment_readers(&self) -> &[SegmentReader] { &self.segment_readers } - + /// Returns the segment_reader associated with the given segment_ordinal pub fn segment_reader(&self, segment_ord: u32) -> &SegmentReader { &self.segment_readers[segment_ord as usize] } - + /// Runs a query on the segment readers wrapped by the searcher pub fn search(&self, query: &Query, collector: &mut C) -> Result { query.search(self, collector) @@ -81,9 +80,7 @@ impl Searcher { impl From> for Searcher { fn from(segment_readers: Vec) -> Searcher { - Searcher { - segment_readers: segment_readers, - } + Searcher { segment_readers: segment_readers } } } @@ -95,4 +92,4 @@ impl fmt::Debug for Searcher { .collect::>(); write!(f, "Searcher({:?})", segment_ids) } -} \ No newline at end of file +} diff --git a/src/core/segment.rs b/src/core/segment.rs index c99d36e85..6d6d07db5 100644 --- a/src/core/segment.rs +++ b/src/core/segment.rs @@ -26,8 +26,8 @@ impl fmt::Debug for Segment { } /// Creates a new segment given an `Index` and a `SegmentId` -/// -/// The function is here to make it private outside `tantivy`. +/// +/// The function is here to make it private outside `tantivy`. pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment { Segment { index: index, @@ -36,9 +36,8 @@ pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment { } impl Segment { - /// Returns our index's schema. - pub fn schema(&self,) -> Schema { + pub fn schema(&self) -> Schema { self.index.schema() } @@ -53,13 +52,13 @@ impl Segment { } /// Returns the segment's id. - pub fn id(&self,) -> SegmentId { + pub fn id(&self) -> SegmentId { self.meta.id() } /// Returns the relative path of a component of our segment. 
- /// - /// It just joins the segment id with the extension + /// + /// It just joins the segment id with the extension /// associated to a segment component. pub fn relative_path(&self, component: SegmentComponent) -> PathBuf { self.meta.relative_path(component) @@ -77,14 +76,18 @@ impl Segment { } /// Open one of the component file for a *regular* read. - pub fn open_read(&self, component: SegmentComponent) -> result::Result { + pub fn open_read(&self, + component: SegmentComponent) + -> result::Result { let path = self.relative_path(component); let source = try!(self.index.directory().open_read(&path)); Ok(source) } /// Open one of the component file for *regular* write. - pub fn open_write(&mut self, component: SegmentComponent) -> result::Result { + pub fn open_write(&mut self, + component: SegmentComponent) + -> result::Result { let path = self.relative_path(component); let write = try!(self.index.directory_mut().open_write(&path)); Ok(write) @@ -114,10 +117,10 @@ mod tests { let mut index = Index::create_in_ram(SchemaBuilder::new().build()); let segment = index.new_segment(); let path = segment.relative_path(SegmentComponent::POSTINGS); - + let directory = index.directory_mut(); - directory.atomic_write(&*path, &vec!(0u8)).unwrap(); - + directory.atomic_write(&*path, &vec![0u8]).unwrap(); + let living_files = HashSet::new(); { let _file_protection = segment.protect_from_delete(SegmentComponent::POSTINGS); @@ -130,4 +133,4 @@ mod tests { assert!(!directory.exists(&*path)); } -} \ No newline at end of file +} diff --git a/src/core/segment_component.rs b/src/core/segment_component.rs index 6f85c4031..3b01e892b 100644 --- a/src/core/segment_component.rs +++ b/src/core/segment_component.rs @@ -6,22 +6,18 @@ pub enum SegmentComponent { FIELDNORMS, TERMS, STORE, - DELETE + DELETE, } impl SegmentComponent { - - pub fn iterator() -> impl Iterator { - static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [ - SegmentComponent::POSTINGS, - SegmentComponent::POSITIONS, - 
SegmentComponent::FASTFIELDS, - SegmentComponent::FIELDNORMS, - SegmentComponent::TERMS, - SegmentComponent::STORE, - SegmentComponent::DELETE - ]; + pub fn iterator() -> impl Iterator { + static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [SegmentComponent::POSTINGS, + SegmentComponent::POSITIONS, + SegmentComponent::FASTFIELDS, + SegmentComponent::FIELDNORMS, + SegmentComponent::TERMS, + SegmentComponent::STORE, + SegmentComponent::DELETE]; SEGMENT_COMPONENTS.into_iter() } - -} \ No newline at end of file +} diff --git a/src/core/segment_id.rs b/src/core/segment_id.rs index a0914ceb4..52978c152 100644 --- a/src/core/segment_id.rs +++ b/src/core/segment_id.rs @@ -7,7 +7,7 @@ use std::sync::atomic; /// Tantivy SegmentId. /// -/// Tantivy's segment are identified +/// Tantivy's segment are identified /// by a UUID which is used to prefix the filenames /// of all of the file associated with the segment. /// @@ -52,14 +52,14 @@ impl SegmentId { /// We are using UUID4, so only 6 bits are fixed, /// and the rest is random. /// - /// Picking the first 8 chars is ok to identify + /// Picking the first 8 chars is ok to identify /// segments in a display message. - pub fn short_uuid_string(&self,) -> String { + pub fn short_uuid_string(&self) -> String { (&self.0.simple().to_string()[..8]).to_string() } /// Returns a segment uuid string. - pub fn uuid_string(&self,) -> String { + pub fn uuid_string(&self) -> String { self.0.simple().to_string() } } diff --git a/src/core/segment_meta.rs b/src/core/segment_meta.rs index 1a91123ca..5c9194e6e 100644 --- a/src/core/segment_meta.rs +++ b/src/core/segment_meta.rs @@ -17,12 +17,11 @@ struct DeleteMeta { pub struct SegmentMeta { segment_id: SegmentId, max_doc: u32, - deletes: Option, + deletes: Option, } impl SegmentMeta { - - /// Creates a new segment meta for + /// Creates a new segment meta for /// a segment with no deletes and no documents. 
pub fn new(segment_id: SegmentId) -> SegmentMeta { SegmentMeta { @@ -53,28 +52,28 @@ impl SegmentMeta { /// and are not used by any segment anymore. pub fn list_files(&self) -> HashSet { SegmentComponent::iterator() - .map(|component| { - self.relative_path(*component) - }) + .map(|component| self.relative_path(*component)) .collect::>() - + } /// Returns the relative path of a component of our segment. - /// - /// It just joins the segment id with the extension + /// + /// It just joins the segment id with the extension /// associated to a segment component. pub fn relative_path(&self, component: SegmentComponent) -> PathBuf { let mut path = self.id().uuid_string(); path.push_str(&*match component { - SegmentComponent::POSITIONS => ".pos".to_string(), - SegmentComponent::POSTINGS => ".idx".to_string(), - SegmentComponent::TERMS => ".term".to_string(), - SegmentComponent::STORE => ".store".to_string(), - SegmentComponent::FASTFIELDS => ".fast".to_string(), - SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(), - SegmentComponent::DELETE => {format!(".{}.del", self.delete_opstamp().unwrap_or(0))}, - }); + SegmentComponent::POSITIONS => ".pos".to_string(), + SegmentComponent::POSTINGS => ".idx".to_string(), + SegmentComponent::TERMS => ".term".to_string(), + SegmentComponent::STORE => ".store".to_string(), + SegmentComponent::FASTFIELDS => ".fast".to_string(), + SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(), + SegmentComponent::DELETE => { + format!(".{}.del", self.delete_opstamp().unwrap_or(0)) + } + }); PathBuf::from(path) } @@ -95,9 +94,7 @@ impl SegmentMeta { /// Returns the opstamp of the last delete operation /// taken in account in this segment. 
pub fn delete_opstamp(&self) -> Option { - self.deletes - .as_ref() - .map(|delete_meta| delete_meta.opstamp) + self.deletes.as_ref().map(|delete_meta| delete_meta.opstamp) } /// Returns true iff the segment meta contains @@ -114,8 +111,8 @@ impl SegmentMeta { #[doc(hidden)] pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) { self.deletes = Some(DeleteMeta { - num_deleted_docs: num_deleted_docs, - opstamp: opstamp, - }); + num_deleted_docs: num_deleted_docs, + opstamp: opstamp, + }); } } diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 7b4387904..f34acb4d8 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -60,7 +60,7 @@ impl SegmentReader { pub fn max_doc(&self) -> DocId { self.segment_meta.max_doc() } - + /// Returns the number of documents. /// Deleted documents are not counted. /// @@ -69,7 +69,7 @@ impl SegmentReader { pub fn num_docs(&self) -> DocId { self.segment_meta.num_docs() } - + /// Return the number of documents that have been /// deleted in the segment. pub fn num_deleted_docs(&self) -> DocId { @@ -91,39 +91,39 @@ impl SegmentReader { /// /// # Panics /// May panic if the index is corrupted. - pub fn get_fast_field_reader(&self, field: Field) -> fastfield::Result { + pub fn get_fast_field_reader + (&self, + field: Field) + -> fastfield::Result { let field_entry = self.schema.get_field_entry(field); if !TFastFieldReader::is_enabled(field_entry.field_type()) { Err(FastFieldNotAvailableError::new(field_entry)) - } - else { - Ok( - self.fast_fields_reader - .open_reader(field) - .expect("Fast field file corrupted.") - ) + } else { + Ok(self.fast_fields_reader + .open_reader(field) + .expect("Fast field file corrupted.")) } } - + /// Accessor to the segment's `Field norms`'s reader. /// /// Field norms are the length (in tokens) of the fields. /// It is used in the computation of the [TfIdf](https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html). 
/// - /// They are simply stored as a fast field, serialized in - /// the `.fieldnorm` file of the segment. + /// They are simply stored as a fast field, serialized in + /// the `.fieldnorm` file of the segment. pub fn get_fieldnorms_reader(&self, field: Field) -> Option { - self.fieldnorms_reader.open_reader(field) + self.fieldnorms_reader.open_reader(field) } - + /// Returns the number of documents containing the term. pub fn doc_freq(&self, term: &Term) -> u32 { match self.get_term_info(term) { Some(term_info) => term_info.doc_freq, None => 0, } - } - + } + /// Accessor to the segment's `StoreReader`. pub fn get_store_reader(&self) -> &StoreReader { &self.store_reader @@ -136,46 +136,44 @@ impl SegmentReader { let term_infos = try!(FstMap::from_source(source)); let store_reader = StoreReader::from(try!(segment.open_read(SegmentComponent::STORE))); let postings_shared_mmap = try!(segment.open_read(SegmentComponent::POSTINGS)); - + let fast_field_data = try!(segment.open_read(SegmentComponent::FASTFIELDS)); let fast_fields_reader = try!(FastFieldsReader::open(fast_field_data)); - + let fieldnorms_data = try!(segment.open_read(SegmentComponent::FIELDNORMS)); let fieldnorms_reader = try!(FastFieldsReader::open(fieldnorms_data)); - + let positions_data = segment .open_read(SegmentComponent::POSITIONS) .unwrap_or_else(|_| ReadOnlySource::empty()); - - let delete_bitset = - if segment.meta().has_deletes() { - let delete_data = segment.open_read(SegmentComponent::DELETE)?; - DeleteBitSet::open(delete_data) - } - else { - DeleteBitSet::empty() - }; - + + let delete_bitset = if segment.meta().has_deletes() { + let delete_data = segment.open_read(SegmentComponent::DELETE)?; + DeleteBitSet::open(delete_data) + } else { + DeleteBitSet::empty() + }; + let schema = segment.schema(); Ok(SegmentReader { - segment_meta: segment.meta().clone(), - postings_data: postings_shared_mmap, - term_infos: Arc::new(term_infos), - segment_id: segment.id(), - store_reader: store_reader, - 
fast_fields_reader: Arc::new(fast_fields_reader), - fieldnorms_reader: Arc::new(fieldnorms_reader), - delete_bitset: delete_bitset, - positions_data: positions_data, - schema: schema, - }) + segment_meta: segment.meta().clone(), + postings_data: postings_shared_mmap, + term_infos: Arc::new(term_infos), + segment_id: segment.id(), + store_reader: store_reader, + fast_fields_reader: Arc::new(fast_fields_reader), + fieldnorms_reader: Arc::new(fieldnorms_reader), + delete_bitset: delete_bitset, + positions_data: positions_data, + schema: schema, + }) } - + /// Return the term dictionary datastructure. pub fn term_infos(&self) -> &FstMap { &self.term_infos } - + /// Returns the document (or to be accurate, its stored field) /// bearing the given doc id. /// This method is slow and should seldom be called from @@ -186,15 +184,18 @@ impl SegmentReader { /// Returns the segment postings associated with the term, and with the given option, - /// or `None` if the term has never been encounterred and indexed. - /// - /// If the field was not indexed with the indexing options that cover + /// or `None` if the term has never been encounterred and indexed. + /// + /// If the field was not indexed with the indexing options that cover /// the requested options, the returned `SegmentPostings` the method does not fail /// and returns a `SegmentPostings` with as much information as possible. /// /// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a `TextIndexingOptions` /// that does not index position will return a `SegmentPostings` with `DocId`s and frequencies. 
- pub fn read_postings(&self, term: &Term, option: SegmentPostingsOption) -> Option { + pub fn read_postings(&self, + term: &Term, + option: SegmentPostingsOption) + -> Option { let field = term.field(); let field_entry = self.schema.get_field_entry(field); let term_info = get!(self.get_term_info(&term)); @@ -204,44 +205,40 @@ impl SegmentReader { FieldType::Str(ref options) => { let indexing_options = options.get_indexing_options(); match option { - SegmentPostingsOption::NoFreq => { - FreqHandler::new_without_freq() - } + SegmentPostingsOption::NoFreq => FreqHandler::new_without_freq(), SegmentPostingsOption::Freq => { if indexing_options.is_termfreq_enabled() { FreqHandler::new_with_freq() - } - else { + } else { FreqHandler::new_without_freq() } } SegmentPostingsOption::FreqAndPositions => { if indexing_options == TextIndexingOptions::TokenizedWithFreqAndPosition { - let offseted_position_data = &self.positions_data[term_info.positions_offset as usize ..]; + let offseted_position_data = &self.positions_data[term_info.positions_offset as + usize..]; FreqHandler::new_with_freq_and_position(offseted_position_data) - } - else if indexing_options.is_termfreq_enabled() - { + } else if indexing_options.is_termfreq_enabled() { FreqHandler::new_with_freq() - } - else { + } else { FreqHandler::new_without_freq() } } } } - _ => { - FreqHandler::new_without_freq() - } + _ => FreqHandler::new_without_freq(), }; - Some(SegmentPostings::from_data(term_info.doc_freq, postings_data, &self.delete_bitset, freq_handler)) + Some(SegmentPostings::from_data(term_info.doc_freq, + postings_data, + &self.delete_bitset, + freq_handler)) } - + /// Returns the posting list associated with a term. /// /// If the term is not found, return None. - /// Even when non-null, because of deletes, the posting object + /// Even when non-null, because of deletes, the posting object /// returned by this method may contain no documents. 
pub fn read_postings_all_info(&self, term: &Term) -> Option { let field_entry = self.schema.get_field_entry(term.field()); @@ -249,15 +246,18 @@ impl SegmentReader { FieldType::Str(ref text_options) => { match text_options.get_indexing_options() { TextIndexingOptions::TokenizedWithFreq => SegmentPostingsOption::Freq, - TextIndexingOptions::TokenizedWithFreqAndPosition => SegmentPostingsOption::FreqAndPositions, + TextIndexingOptions::TokenizedWithFreqAndPosition => { + SegmentPostingsOption::FreqAndPositions + } _ => SegmentPostingsOption::NoFreq, } } - FieldType::U64(_) | FieldType::I64(_) => SegmentPostingsOption::NoFreq + FieldType::U64(_) | + FieldType::I64(_) => SegmentPostingsOption::NoFreq, }; self.read_postings(term, segment_posting_option) } - + /// Returns the term info associated with the term. pub fn get_term_info(&self, term: &Term) -> Option { self.term_infos.get(term.as_slice()) diff --git a/src/core/term_iterator.rs b/src/core/term_iterator.rs index 63bb53c9a..143e8aa6b 100644 --- a/src/core/term_iterator.rs +++ b/src/core/term_iterator.rs @@ -64,14 +64,15 @@ impl<'a> TermIterator<'a> { loop { match self.heap.peek() { Some(&ref next_heap_it) if next_heap_it.term == self.current_term => {} - _ => { break; } + _ => { + break; + } } let next_heap_it = self.heap.pop().unwrap(); // safe : we peeked beforehand self.current_segment_ords.push(next_heap_it.segment_ord); } true - } - else { + } else { false } } @@ -92,17 +93,18 @@ impl<'a> TermIterator<'a> { /// This method may be called /// iff advance() has been called before /// and "true" was returned. - pub fn segment_ords(&self) -> &[usize]{ + pub fn segment_ords(&self) -> &[usize] { &self.current_segment_ords[..] } fn advance_segments(&mut self) { for segment_ord in self.current_segment_ords.drain(..) 
{ if let Some(term) = self.key_streams[segment_ord].next() { - self.heap.push(HeapItem { - term: Term::from_bytes(term), - segment_ord: segment_ord, - }); + self.heap + .push(HeapItem { + term: Term::from_bytes(term), + segment_ord: segment_ord, + }); } } } @@ -114,8 +116,7 @@ impl<'a, 'f> Streamer<'a> for TermIterator<'f> { fn next(&'a mut self) -> Option { if self.advance() { Some(&self.current_term) - } - else { + } else { None } } @@ -123,12 +124,10 @@ impl<'a, 'f> Streamer<'a> for TermIterator<'f> { impl<'a> From<&'a [SegmentReader]> for TermIterator<'a> { fn from(segment_readers: &'a [SegmentReader]) -> TermIterator<'a> { - TermIterator::new( - segment_readers - .iter() - .map(|reader| reader.term_infos().keys()) - .collect() - ) + TermIterator::new(segment_readers + .iter() + .map(|reader| reader.term_infos().keys()) + .collect()) } } @@ -180,4 +179,4 @@ mod tests { assert_eq!(terms, "abcdef"); } -} \ No newline at end of file +} diff --git a/src/datastruct/fstmap.rs b/src/datastruct/fstmap.rs index adb3e6e35..c79c65b67 100644 --- a/src/datastruct/fstmap.rs +++ b/src/datastruct/fstmap.rs @@ -20,18 +20,17 @@ pub struct FstMapBuilder { } impl FstMapBuilder { - pub fn new(w: W) -> io::Result> { let fst_builder = try!(fst::MapBuilder::new(w).map_err(convert_fst_error)); Ok(FstMapBuilder { - fst_builder: fst_builder, - data: Vec::new(), - _phantom_: PhantomData, - }) + fst_builder: fst_builder, + data: Vec::new(), + _phantom_: PhantomData, + }) } /// Horribly unsafe, nobody should ever do that... except me :) - /// + /// /// If used, it must be used by systematically alternating calls /// to insert_key and insert_value. /// @@ -39,8 +38,8 @@ impl FstMapBuilder { /// in a nice way. 
pub fn insert_key(&mut self, key: &[u8]) -> io::Result<()> { try!(self.fst_builder - .insert(key, self.data.len() as u64) - .map_err(convert_fst_error)); + .insert(key, self.data.len() as u64) + .map_err(convert_fst_error)); Ok(()) } @@ -53,17 +52,14 @@ impl FstMapBuilder { #[cfg(test)] pub fn insert(&mut self, key: &[u8], value: &V) -> io::Result<()> { try!(self.fst_builder - .insert(key, self.data.len() as u64) - .map_err(convert_fst_error)); + .insert(key, self.data.len() as u64) + .map_err(convert_fst_error)); try!(value.serialize(&mut self.data)); Ok(()) } - pub fn finish(self,) -> io::Result { - let mut file = try!( - self.fst_builder - .into_inner() - .map_err(convert_fst_error)); + pub fn finish(self) -> io::Result { + let mut file = try!(self.fst_builder.into_inner().map_err(convert_fst_error)); let footer_size = self.data.len() as u32; try!(file.write_all(&self.data)); try!((footer_size as u32).serialize(&mut file)); @@ -81,31 +77,35 @@ pub struct FstMap { fn open_fst_index(source: ReadOnlySource) -> io::Result { Ok(fst::Map::from(match source { - ReadOnlySource::Anonymous(data) => try!(Fst::from_shared_bytes(data.data, data.start, data.len).map_err(convert_fst_error)), - ReadOnlySource::Mmap(mmap_readonly) => try!(Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)), - })) + ReadOnlySource::Anonymous(data) => { + try!(Fst::from_shared_bytes(data.data, data.start, data.len) + .map_err(convert_fst_error)) + } + ReadOnlySource::Mmap(mmap_readonly) => { + try!(Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)) + } + })) } impl FstMap { - - pub fn keys(&self,) -> fst::map::Keys { + pub fn keys(&self) -> fst::map::Keys { self.fst_index.keys() } - pub fn from_source(source: ReadOnlySource) -> io::Result> { + pub fn from_source(source: ReadOnlySource) -> io::Result> { let total_len = source.len(); let length_offset = total_len - 4; let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..]; - let footer_size = try!(u32::deserialize(&mut 
split_len_buffer)) as usize; + let footer_size = try!(u32::deserialize(&mut split_len_buffer)) as usize; let split_len = length_offset - footer_size; let fst_source = source.slice(0, split_len); let values_source = source.slice(split_len, length_offset); let fst_index = try!(open_fst_index(fst_source)); Ok(FstMap { - fst_index: fst_index, - values_mmap: values_source, - _phantom_: PhantomData, - }) + fst_index: fst_index, + values_mmap: values_source, + _phantom_: PhantomData, + }) } fn read_value(&self, offset: u64) -> V { diff --git a/src/datastruct/skip/mod.rs b/src/datastruct/skip/mod.rs index 9c27b6283..3907ffade 100644 --- a/src/datastruct/skip/mod.rs +++ b/src/datastruct/skip/mod.rs @@ -114,9 +114,9 @@ mod tests { let mut skip_list: SkipList<()> = SkipList::from(output.as_slice()); assert_eq!(skip_list.next().unwrap(), (0, ())); skip_list.seek(431); - assert_eq!(skip_list.next().unwrap(), (431,()) ); + assert_eq!(skip_list.next().unwrap(), (431, ())); skip_list.seek(1003); - assert_eq!(skip_list.next().unwrap(), (1004,()) ); + assert_eq!(skip_list.next().unwrap(), (1004, ())); assert_eq!(skip_list.next(), None); } diff --git a/src/datastruct/skip/skiplist.rs b/src/datastruct/skip/skiplist.rs index 16e843b43..3cdfab759 100644 --- a/src/datastruct/skip/skiplist.rs +++ b/src/datastruct/skip/skiplist.rs @@ -13,14 +13,12 @@ struct Layer<'a, T> { } impl<'a, T: BinarySerializable> Iterator for Layer<'a, T> { - type Item = (DocId, T); - fn next(&mut self,)-> Option<(DocId, T)> { + fn next(&mut self) -> Option<(DocId, T)> { if self.next_id == u32::max_value() { None - } - else { + } else { let cur_val = T::deserialize(&mut self.cursor).unwrap(); let cur_id = self.next_id; self.next_id = u32::deserialize(&mut self.cursor).unwrap_or(u32::max_value()); @@ -31,7 +29,7 @@ impl<'a, T: BinarySerializable> Iterator for Layer<'a, T> { impl<'a, T: BinarySerializable> From<&'a [u8]> for Layer<'a, T> { fn from(data: &'a [u8]) -> Layer<'a, T> { - let mut cursor = data; + let mut 
cursor = data; let next_id = u32::deserialize(&mut cursor).unwrap_or(u32::max_value()); Layer { data: data, @@ -43,7 +41,6 @@ impl<'a, T: BinarySerializable> From<&'a [u8]> for Layer<'a, T> { } impl<'a, T: BinarySerializable> Layer<'a, T> { - fn empty() -> Layer<'a, T> { Layer { data: &EMPTY, @@ -53,11 +50,11 @@ impl<'a, T: BinarySerializable> Layer<'a, T> { } } - fn seek_offset(&mut self, offset: usize) { + fn seek_offset(&mut self, offset: usize) { self.cursor = &self.data[offset..]; self.next_id = u32::deserialize(&mut self.cursor).unwrap_or(u32::max_value()); } - + // Returns the last element (key, val) // such that (key < doc_id) // @@ -67,8 +64,12 @@ impl<'a, T: BinarySerializable> Layer<'a, T> { let mut val = None; while self.next_id < doc_id { match self.next() { - None => { break; }, - v => { val = v; } + None => { + break; + } + v => { + val = v; + } } } val @@ -82,16 +83,14 @@ pub struct SkipList<'a, T: BinarySerializable> { } impl<'a, T: BinarySerializable> Iterator for SkipList<'a, T> { - type Item = (DocId, T); - fn next(&mut self,)-> Option<(DocId, T)> { + fn next(&mut self) -> Option<(DocId, T)> { self.data_layer.next() } } impl<'a, T: BinarySerializable> SkipList<'a, T> { - pub fn seek(&mut self, doc_id: DocId) -> Option<(DocId, T)> { let mut next_layer_skip: Option<(DocId, u32)> = None; for skip_layer in &mut self.skip_layers { @@ -99,39 +98,33 @@ impl<'a, T: BinarySerializable> SkipList<'a, T> { skip_layer.seek_offset(offset as usize); } next_layer_skip = skip_layer.seek(doc_id); - } - if let Some((_, offset)) = next_layer_skip { - self.data_layer.seek_offset(offset as usize); - } - self.data_layer.seek(doc_id) + } + if let Some((_, offset)) = next_layer_skip { + self.data_layer.seek_offset(offset as usize); + } + self.data_layer.seek(doc_id) } - - } impl<'a, T: BinarySerializable> From<&'a [u8]> for SkipList<'a, T> { - fn from(mut data: &'a [u8]) -> SkipList<'a, T> { let offsets: Vec = Vec::deserialize(&mut data).unwrap(); let num_layers = 
offsets.len(); let layers_data: &[u8] = data; - let data_layer: Layer<'a, T> = - if num_layers == 0 { Layer::empty() } - else { - let first_layer_data: &[u8] = &layers_data[..offsets[0] as usize]; - Layer::from(first_layer_data) - }; + let data_layer: Layer<'a, T> = if num_layers == 0 { + Layer::empty() + } else { + let first_layer_data: &[u8] = &layers_data[..offsets[0] as usize]; + Layer::from(first_layer_data) + }; let skip_layers = (0..max(1, num_layers) - 1) .map(|i| (offsets[i] as usize, offsets[i + 1] as usize)) - .map(|(start, stop)| { - Layer::from(&layers_data[start..stop]) - }) + .map(|(start, stop)| Layer::from(&layers_data[start..stop])) .collect(); SkipList { skip_layers: skip_layers, data_layer: data_layer, } } - } diff --git a/src/datastruct/skip/skiplist_builder.rs b/src/datastruct/skip/skiplist_builder.rs index 9806a69af..34c5d8a48 100644 --- a/src/datastruct/skip/skiplist_builder.rs +++ b/src/datastruct/skip/skiplist_builder.rs @@ -13,8 +13,7 @@ struct LayerBuilder { } impl LayerBuilder { - - fn written_size(&self,) -> usize { + fn written_size(&self) -> usize { self.buffer.len() } @@ -42,8 +41,9 @@ impl LayerBuilder { Ok(if self.remaining == 0 { self.remaining = self.period; Some((doc_id, offset)) - } - else { None }) + } else { + None + }) } } @@ -56,7 +56,6 @@ pub struct SkipListBuilder { impl SkipListBuilder { - pub fn new(period: usize) -> SkipListBuilder { SkipListBuilder { period: period, @@ -78,11 +77,13 @@ impl SkipListBuilder { let mut skip_pointer = try!(self.data_layer.insert(doc_id, dest)); loop { skip_pointer = match skip_pointer { - Some((skip_doc_id, skip_offset)) => - try!(self - .get_skip_layer(layer_id) - .insert(skip_doc_id, &skip_offset)), - None => { return Ok(()); } + Some((skip_doc_id, skip_offset)) => { + try!(self.get_skip_layer(layer_id) + .insert(skip_doc_id, &skip_offset)) + } + None => { + return Ok(()); + } }; layer_id += 1; } diff --git a/src/datastruct/stacker/expull.rs b/src/datastruct/stacker/expull.rs index 
93aa80b08..68c4e61bd 100644 --- a/src/datastruct/stacker/expull.rs +++ b/src/datastruct/stacker/expull.rs @@ -9,7 +9,7 @@ pub fn is_power_of_2(val: u32) -> bool { #[inline] pub fn jump_needed(val: u32) -> bool { - val > 3 && is_power_of_2(val) + val > 3 && is_power_of_2(val) } @@ -24,7 +24,6 @@ pub struct ExpUnrolledLinkedList { } impl ExpUnrolledLinkedList { - pub fn iter<'a>(&self, addr: u32, heap: &'a Heap) -> ExpUnrolledLinkedListIterator<'a> { ExpUnrolledLinkedListIterator { heap: heap, @@ -42,10 +41,10 @@ impl ExpUnrolledLinkedList { // the next block as a size of (length so far), // and we need to add 1u32 to store the pointer // to the next element. - let new_block_size: usize = (self.len as usize + 1) * mem::size_of::(); + let new_block_size: usize = (self.len as usize + 1) * mem::size_of::(); let new_block_addr: u32 = heap.allocate_space(new_block_size); heap.set(self.end, &new_block_addr); - self.end = new_block_addr; + self.end = new_block_addr; } heap.set(self.end, &val); self.end += mem::size_of::() as u32; @@ -77,23 +76,21 @@ pub struct ExpUnrolledLinkedListIterator<'a> { impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> { type Item = u32; - fn next(&mut self,) -> Option { + fn next(&mut self) -> Option { if self.consumed == self.len { None - } - else { + } else { let addr: u32; self.consumed += 1; if jump_needed(self.consumed) { addr = *self.heap.get_mut_ref(self.addr); - } - else { + } else { addr = self.addr; } self.addr = addr + mem::size_of::() as u32; Some(*self.heap.get_mut_ref(addr)) - } - + } + } } @@ -103,7 +100,7 @@ impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> { #[cfg(test)] mod tests { - + use super::*; use super::super::heap::Heap; use test::Bencher; @@ -147,7 +144,7 @@ mod tests { #[bench] fn bench_push_stack(bench: &mut Bencher) { - let heap = Heap::with_capacity(64_000_000); + let heap = Heap::with_capacity(64_000_000); bench.iter(|| { let mut stacks = Vec::with_capacity(100); for _ in 0..NUM_STACK { @@ -163,4 +160,4 
@@ mod tests { heap.clear(); }); } -} \ No newline at end of file +} diff --git a/src/datastruct/stacker/hashmap.rs b/src/datastruct/stacker/hashmap.rs index 55a6dc12c..aadba8230 100644 --- a/src/datastruct/stacker/hashmap.rs +++ b/src/datastruct/stacker/hashmap.rs @@ -7,7 +7,7 @@ use super::heap::{Heap, HeapAllocable, BytesRef}; /// dbj2 hash function fn djb2(key: &[u8]) -> u64 { - let mut state: u64 = 5381; + let mut state: u64 = 5381; for &b in key { state = (state << 5).wrapping_add(state).wrapping_add(b as u64); } @@ -29,7 +29,7 @@ impl Default for BytesRef { /// /// The key and the value are actually stored contiguously. /// For this reason, the (start, stop) information is actually redundant -/// and can be simplified in the future +/// and can be simplified in the future #[derive(Copy, Clone, Default)] struct KeyValue { key: BytesRef, @@ -37,7 +37,7 @@ struct KeyValue { } impl KeyValue { - fn is_empty(&self,) -> bool { + fn is_empty(&self) -> bool { self.key.stop == 0u32 } } @@ -49,7 +49,7 @@ pub enum Entry { /// Customized `HashMap` with string keys -/// +/// /// This `HashMap` takes String as keys. Keys are /// stored in a user defined heap. /// @@ -57,7 +57,9 @@ pub enum Entry { /// the computation of the hash of the key twice, /// or copying the key as long as there is no insert. 
/// -pub struct HashMap<'a, V> where V: HeapAllocable { +pub struct HashMap<'a, V> + where V: HeapAllocable +{ table: Box<[KeyValue]>, heap: &'a Heap, _phantom: PhantomData, @@ -65,13 +67,12 @@ pub struct HashMap<'a, V> where V: HeapAllocable { occupied: Vec, } -impl<'a, V> HashMap<'a, V> where V: HeapAllocable { - +impl<'a, V> HashMap<'a, V> + where V: HeapAllocable +{ pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> HashMap<'a, V> { let table_size = 1 << num_bucket_power_of_2; - let table: Vec = iter::repeat(KeyValue::default()) - .take(table_size) - .collect(); + let table: Vec = iter::repeat(KeyValue::default()).take(table_size).collect(); HashMap { table: table.into_boxed_slice(), heap: heap, @@ -99,23 +100,23 @@ impl<'a, V> HashMap<'a, V> where V: HeapAllocable { }; addr } - - pub fn iter<'b: 'a>(&'b self,) -> impl Iterator + 'b { + + pub fn iter<'b: 'a>(&'b self) -> impl Iterator + 'b { let heap: &'a Heap = self.heap; let table: &'b [KeyValue] = &self.table; self.occupied .iter() .cloned() .map(move |bucket: usize| { - let kv = table[bucket]; - let addr = kv.value_addr; - let v: &V = heap.get_mut_ref::(addr); - (heap.get_slice(kv.key), (addr, v)) - }) - // .map(move |addr: u32| (heap.get_mut_ref::(addr)) ) + let kv = table[bucket]; + let addr = kv.value_addr; + let v: &V = heap.get_mut_ref::(addr); + (heap.get_slice(kv.key), (addr, v)) + }) + // .map(move |addr: u32| (heap.get_mut_ref::(addr)) ) } - pub fn values_mut<'b: 'a>(&'b self,) -> impl Iterator + 'b { + pub fn values_mut<'b: 'a>(&'b self) -> impl Iterator + 'b { let heap: &'a Heap = self.heap; let table: &'b [KeyValue] = &self.table; self.occupied @@ -128,9 +129,7 @@ impl<'a, V> HashMap<'a, V> where V: HeapAllocable { pub fn get_or_create>(&mut self, key: S) -> &mut V { let entry = self.lookup(key.as_ref()); match entry { - Entry::Occupied(addr) => { - self.heap.get_mut_ref(addr) - } + Entry::Occupied(addr) => self.heap.get_mut_ref(addr), Entry::Vacant(bucket) => { let (addr, val): (u32, 
&mut V) = self.heap.allocate_object(); self.set_bucket(key.as_ref(), bucket, addr); @@ -138,7 +137,7 @@ impl<'a, V> HashMap<'a, V> where V: HeapAllocable { } } } - + pub fn lookup>(&self, key: S) -> Entry { let key_bytes: &[u8] = key.as_ref(); let mut bucket = self.bucket(key_bytes); @@ -150,7 +149,7 @@ impl<'a, V> HashMap<'a, V> where V: HeapAllocable { if self.get_key(kv.key) == key_bytes { return Entry::Occupied(kv.value_addr); } - bucket = (bucket + 1) & self.mask; + bucket = (bucket + 1) & self.mask; } } } @@ -158,7 +157,7 @@ impl<'a, V> HashMap<'a, V> where V: HeapAllocable { #[cfg(test)] mod tests { - + use super::*; use super::super::heap::{Heap, HeapAllocable}; use super::djb2; @@ -186,10 +185,10 @@ mod tests { let mut hash_map: HashMap = HashMap::new(18, &heap); { { - let v: &mut TestValue = hash_map.get_or_create("abc"); - assert_eq!(v.val, 0u32); - v.val = 3u32; - + let v: &mut TestValue = hash_map.get_or_create("abc"); + assert_eq!(v.val, 0u32); + v.val = 3u32; + } } { @@ -214,20 +213,17 @@ mod tests { #[bench] fn bench_djb2(bench: &mut Bencher) { let v = String::from("abwer"); - bench.iter(|| { - djb2(v.as_bytes()) - }); + bench.iter(|| djb2(v.as_bytes())); } #[bench] fn bench_siphasher(bench: &mut Bencher) { let v = String::from("abwer"); bench.iter(|| { - let mut h = DefaultHasher::new(); - h.write(v.as_bytes()); - h.finish() - }); + let mut h = DefaultHasher::new(); + h.write(v.as_bytes()); + h.finish() + }); } } - diff --git a/src/datastruct/stacker/heap.rs b/src/datastruct/stacker/heap.rs index c158ddd3c..a38a24d10 100644 --- a/src/datastruct/stacker/heap.rs +++ b/src/datastruct/stacker/heap.rs @@ -22,17 +22,13 @@ pub struct Heap { impl Heap { /// Creates a new heap with a given capacity pub fn with_capacity(num_bytes: usize) -> Heap { - Heap { - inner: UnsafeCell::new( - InnerHeap::with_capacity(num_bytes) - ), - } + Heap { inner: UnsafeCell::new(InnerHeap::with_capacity(num_bytes)) } } - fn inner(&self,) -> &mut InnerHeap { - unsafe { &mut 
*self.inner.get() } + fn inner(&self) -> &mut InnerHeap { + unsafe { &mut *self.inner.get() } } - + /// Clears the heap. All the underlying data is lost. /// /// This heap does not support deallocation. @@ -40,19 +36,19 @@ impl Heap { pub fn clear(&self) { self.inner().clear(); } - + /// Return the heap capacity. - pub fn capacity(&self,) -> u32 { + pub fn capacity(&self) -> u32 { self.inner().capacity() } - - /// Return the amount of memory that has been allocated so far. - pub fn len(&self,) -> u32 { + + /// Return the amount of memory that has been allocated so far. + pub fn len(&self) -> u32 { self.inner().len() } - + /// Return amount of free space, in bytes. - pub fn num_free_bytes(&self,) -> u32 { + pub fn num_free_bytes(&self) -> u32 { self.inner().num_free_bytes() } @@ -61,31 +57,31 @@ impl Heap { pub fn allocate_space(&self, num_bytes: usize) -> u32 { self.inner().allocate_space(num_bytes) } - + /// Allocate an object in the heap - pub fn allocate_object(&self,) -> (u32, &mut V) { + pub fn allocate_object(&self) -> (u32, &mut V) { let addr = self.inner().allocate_space(mem::size_of::()); let v: V = V::with_addr(addr); self.inner().set(addr, &v); (addr, self.inner().get_mut_ref(addr)) } - + /// Stores a `&[u8]` in the heap and returns the destination BytesRef. pub fn allocate_and_set(&self, data: &[u8]) -> BytesRef { self.inner().allocate_and_set(data) } - + /// Fetches the `&[u8]` stored on the slice defined by the `BytesRef` /// given as argumetn pub fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] { self.inner().get_slice(bytes_ref.start, bytes_ref.stop) } - + /// Stores an item's data in the heap, at the given `address`. pub fn set(&self, addr: u32, val: &Item) { self.inner().set(addr, val); } - + /// Returns a mutable reference for an object at a given Item. 
pub fn get_mut_ref(&self, addr: u32) -> &mut Item { self.inner().get_mut_ref(addr) @@ -106,7 +102,6 @@ struct InnerHeap { impl InnerHeap { - pub fn with_capacity(num_bytes: usize) -> InnerHeap { let buffer: Vec = vec![0u8; num_bytes]; InnerHeap { @@ -122,23 +117,22 @@ impl InnerHeap { self.next_heap = None; } - pub fn capacity(&self,) -> u32 { + pub fn capacity(&self) -> u32 { self.buffer.len() as u32 } - pub fn len(&self,) -> u32 { + pub fn len(&self) -> u32 { self.used } - + // Returns the number of free bytes. If the buffer // has reached it's capacity and overflowed to another buffer, return 0. - pub fn num_free_bytes(&self,) -> u32 { + pub fn num_free_bytes(&self) -> u32 { if self.next_heap.is_some() { 0u32 - } - else { + } else { self.buffer_len - self.used - } + } } pub fn allocate_space(&mut self, num_bytes: usize) -> u32 { @@ -146,32 +140,35 @@ impl InnerHeap { self.used += num_bytes as u32; if self.used <= self.buffer_len { addr - } - else { + } else { if self.next_heap.is_none() { - warn!("Exceeded heap size. The margin was apparently unsufficient. The segment will be committed right after indexing this very last document."); + warn!("Exceeded heap size. The margin was apparently unsufficient. 
The segment will be committed right after indexing this very last document.",); self.next_heap = Some(Box::new(InnerHeap::with_capacity(self.buffer_len as usize))); } self.next_heap.as_mut().unwrap().allocate_space(num_bytes) + self.buffer_len } - - + + } - + fn get_slice(&self, start: u32, stop: u32) -> &[u8] { if start >= self.buffer_len { - self.next_heap.as_ref().unwrap().get_slice(start - self.buffer_len, stop - self.buffer_len) - } - else { + self.next_heap + .as_ref() + .unwrap() + .get_slice(start - self.buffer_len, stop - self.buffer_len) + } else { &self.buffer[start as usize..stop as usize] } } - + fn get_mut_slice(&mut self, start: u32, stop: u32) -> &mut [u8] { if start >= self.buffer_len { - self.next_heap.as_mut().unwrap().get_mut_slice(start - self.buffer_len, stop - self.buffer_len) - } - else { + self.next_heap + .as_mut() + .unwrap() + .get_mut_slice(start - self.buffer_len, stop - self.buffer_len) + } else { &mut self.buffer[start as usize..stop as usize] } } @@ -188,9 +185,11 @@ impl InnerHeap { fn get_mut(&mut self, addr: u32) -> *mut u8 { if addr >= self.buffer_len { - self.next_heap.as_mut().unwrap().get_mut(addr - self.buffer_len) - } - else { + self.next_heap + .as_mut() + .unwrap() + .get_mut(addr - self.buffer_len) + } else { let addr_isize = addr as isize; unsafe { self.buffer.as_mut_ptr().offset(addr_isize) } } @@ -200,9 +199,11 @@ impl InnerHeap { fn get_mut_ref(&mut self, addr: u32) -> &mut Item { if addr >= self.buffer_len { - self.next_heap.as_mut().unwrap().get_mut_ref(addr - self.buffer_len) - } - else { + self.next_heap + .as_mut() + .unwrap() + .get_mut_ref(addr - self.buffer_len) + } else { let v_ptr_u8 = self.get_mut(addr) as *mut u8; let v_ptr = v_ptr_u8 as *mut Item; unsafe { &mut *v_ptr } @@ -211,9 +212,11 @@ impl InnerHeap { fn set(&mut self, addr: u32, val: &Item) { if addr >= self.buffer_len { - self.next_heap.as_mut().unwrap().set(addr - self.buffer_len, val); - } - else { + self.next_heap + .as_mut() + .unwrap() + 
.set(addr - self.buffer_len, val); + } else { let v_ptr: *const Item = val as *const Item; let v_ptr_u8: *const u8 = v_ptr as *const u8; debug_assert!(addr + mem::size_of::() as u32 <= self.used); @@ -223,4 +226,4 @@ impl InnerHeap { } } } -} \ No newline at end of file +} diff --git a/src/datastruct/stacker/mod.rs b/src/datastruct/stacker/mod.rs index 66aaee2d0..01bd24d12 100644 --- a/src/datastruct/stacker/mod.rs +++ b/src/datastruct/stacker/mod.rs @@ -17,12 +17,12 @@ fn test_unrolled_linked_list() { let mut ks: Vec = (1..5).map(|k| k * 100).collect(); ks.push(2); ks.push(3); - for k in (1..5).map(|k| k * 100) { + for k in (1..5).map(|k| k * 100) { let mut hashmap: HashMap = HashMap::new(10, &heap); for j in 0..k { for i in 0..500 { let mut list = hashmap.get_or_create(i.to_string()); - list.push(i*j, &heap); + list.push(i * j, &heap); } } for i in 0..500 { @@ -31,7 +31,7 @@ fn test_unrolled_linked_list() { let v: &mut ExpUnrolledLinkedList = heap.get_mut_ref(addr); let mut it = v.iter(addr, &heap); for j in 0..k { - assert_eq!(it.next().unwrap(), i*j); + assert_eq!(it.next().unwrap(), i * j); } assert!(!it.next().is_some()); } @@ -41,6 +41,6 @@ fn test_unrolled_linked_list() { } } } - + } -} \ No newline at end of file +} diff --git a/src/directory/directory.rs b/src/directory/directory.rs index b555efbcc..f14e20af6 100644 --- a/src/directory/directory.rs +++ b/src/directory/directory.rs @@ -8,31 +8,30 @@ use std::io; use std::marker::Sync; /// Write-once read many (WORM) abstraction for where -/// tantivy's data should be stored. +/// tantivy's data should be stored. /// /// There are currently two implementations of `Directory` -/// +/// /// - The [`MMapDirectory`](struct.MmapDirectory.html), this -/// should be your default choice. -/// - The [`RAMDirectory`](struct.RAMDirectory.html), which +/// should be your default choice. +/// - The [`RAMDirectory`](struct.RAMDirectory.html), which /// should be used mostly for tests. 
-/// +/// pub trait Directory: fmt::Debug + Send + Sync + 'static { - /// Opens a virtual file for read. - /// + /// /// Once a virtual file is open, its data may not /// change. /// /// Specifically, subsequent writes or flushes should - /// have no effect on the returned `ReadOnlySource` object. + /// have no effect on the returned `ReadOnlySource` object. fn open_read(&self, path: &Path) -> result::Result; /// Removes a file /// /// Removing a file will not affect an eventual /// existing ReadOnlySource pointing to it. - /// + /// /// Removing a nonexistent file, yields a /// `DeleteError::DoesNotExist`. fn delete(&self, path: &Path) -> result::Result<(), DeleteError>; @@ -40,18 +39,18 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static { /// Returns true iff the file exists fn exists(&self, path: &Path) -> bool; - /// Opens a writer for the *virtual file* associated with + /// Opens a writer for the *virtual file* associated with /// a Path. /// /// Right after this call, the file should be created - /// and any subsequent call to `open_read` for the + /// and any subsequent call to `open_read` for the /// same path should return a `ReadOnlySource`. - /// + /// /// Write operations may be aggressively buffered. /// The client of this trait is responsible for calling flush - /// to ensure that subsequent `read` operations + /// to ensure that subsequent `read` operations /// will take into account preceding `write` operations. - /// + /// /// Flush operation should also be persistent. /// /// The user shall not rely on `Drop` triggering `flush`. @@ -60,7 +59,7 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static { /// /// The file may not previously exist. fn open_write(&mut self, path: &Path) -> Result; - + /// Reads the full content file that has been written using /// atomic_write. 
/// @@ -68,17 +67,13 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static { fn atomic_read(&self, path: &Path) -> Result, OpenReadError>; /// Atomically replace the content of a file with data. - /// + /// /// This calls ensure that reads can never *observe* /// a partially written file. - /// + /// /// The file may or may not previously exist. fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()>; - - /// Clones the directory and boxes the clone + + /// Clones the directory and boxes the clone fn box_clone(&self) -> Box; - } - - - diff --git a/src/directory/error.rs b/src/directory/error.rs index b2a7f24f1..2bc2b6ffe 100644 --- a/src/directory/error.rs +++ b/src/directory/error.rs @@ -4,7 +4,7 @@ use std::io; /// Error that may occur when opening a directory #[derive(Debug)] pub enum OpenDirectoryError { - /// The underlying directory does not exists. + /// The underlying directory does not exists. DoesNotExist(PathBuf), /// The path exists but is not a directory. NotADirectory(PathBuf), @@ -14,9 +14,9 @@ pub enum OpenDirectoryError { #[derive(Debug)] pub enum OpenWriteError { /// Our directory is WORM, writing an existing file is forbidden. - /// Checkout the `Directory` documentation. + /// Checkout the `Directory` documentation. FileAlreadyExists(PathBuf), - /// Any kind of IO error that happens when + /// Any kind of IO error that happens when /// writing in the underlying IO device. IOError(io::Error), } @@ -32,7 +32,7 @@ impl From for OpenWriteError { pub enum OpenReadError { /// The file does not exists. FileDoesNotExist(PathBuf), - /// Any kind of IO error that happens when + /// Any kind of IO error that happens when /// interacting with the underlying IO device. IOError(io::Error), } @@ -43,10 +43,10 @@ pub enum OpenReadError { pub enum DeleteError { /// The file does not exists. 
FileDoesNotExist(PathBuf), - /// Any kind of IO error that happens when + /// Any kind of IO error that happens when /// interacting with the underlying IO device. IOError(io::Error), - /// The file may not be deleted because it is + /// The file may not be deleted because it is /// protected. FileProtected(PathBuf), } diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index e22653fa7..bc3e42185 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -18,7 +18,7 @@ use Error; /// Wrapper of directories that keeps track of files created by Tantivy. /// /// A managed directory is just a wrapper of a directory -/// that keeps a (persisted) list of the files that +/// that keeps a (persisted) list of the files that /// have been created (and not deleted) by tantivy so far. /// /// Thanks to this list, it implements a `garbage_collect` method @@ -46,19 +46,18 @@ pub struct FileProtection { } fn unprotect_file_from_delete(directory: &ManagedDirectory, path: &Path) { - let mut meta_informations_wlock = directory.meta_informations + let mut meta_informations_wlock = directory + .meta_informations .write() .expect("Managed file lock poisoned"); - if let Some(counter_ref_mut) = meta_informations_wlock - .protected_files - .get_mut(path) { + if let Some(counter_ref_mut) = meta_informations_wlock.protected_files.get_mut(path) { (*counter_ref_mut) -= 1; } } impl fmt::Debug for FileProtection { fn fmt(&self, formatter: &mut fmt::Formatter) -> result::Result<(), fmt::Error> { - write!(formatter, "FileProtectionFor({:?})", self.path) + write!(formatter, "FileProtectionFor({:?})", self.path) } } @@ -70,7 +69,9 @@ impl Drop for FileProtection { /// Saves the file containing the list of existing files /// that were created by tantivy. 
-fn save_managed_paths(directory: &mut Directory, wlock: &RwLockWriteGuard) -> io::Result<()> { +fn save_managed_paths(directory: &mut Directory, + wlock: &RwLockWriteGuard) + -> io::Result<()> { let mut w = serde_json::to_vec(&wlock.managed_paths)?; write!(&mut w, "\n")?; directory.atomic_write(&MANAGED_FILEPATH, &w[..])?; @@ -78,32 +79,30 @@ fn save_managed_paths(directory: &mut Directory, wlock: &RwLockWriteGuard(directory: Dir) -> Result { match directory.atomic_read(&MANAGED_FILEPATH) { Ok(data) => { let managed_files_json = String::from_utf8_lossy(&data); - let managed_files: HashSet = serde_json::from_str(&managed_files_json) - .map_err(|e| Error::CorruptedFile(MANAGED_FILEPATH.clone(), Box::new(e)))?; + let managed_files: HashSet = + serde_json::from_str(&managed_files_json) + .map_err(|e| Error::CorruptedFile(MANAGED_FILEPATH.clone(), Box::new(e)))?; Ok(ManagedDirectory { - directory: box directory, - meta_informations: Arc::new(RwLock::new( - MetaInformation { - managed_paths: managed_files, - protected_files: HashMap::default() - })), - }) + directory: box directory, + meta_informations: Arc::new(RwLock::new(MetaInformation { + managed_paths: managed_files, + protected_files: + HashMap::default(), + })), + }) } Err(OpenReadError::FileDoesNotExist(_)) => { Ok(ManagedDirectory { - directory: box directory, - meta_informations: Arc::default(), - }) - } - Err(OpenReadError::IOError(e)) => { - Err(From::from(e)) + directory: box directory, + meta_informations: Arc::default(), + }) } + Err(OpenReadError::IOError(e)) => Err(From::from(e)), } } @@ -111,7 +110,7 @@ impl ManagedDirectory { /// /// Removes the files that were created by `tantivy` and are not /// used by any segment anymore. - /// + /// /// * `living_files` - List of files that are still used by the index. /// /// This method does not panick nor returns errors. @@ -119,19 +118,21 @@ impl ManagedDirectory { /// an error is simply logged, and the file remains in the list of managed /// files. 
pub fn garbage_collect(&mut self, living_files: HashSet) { - let mut files_to_delete = vec!(); - { // releasing the lock as .delete() will use it too. - let meta_informations_rlock = self.meta_informations - .read() - .expect("Managed directory rlock poisoned in garbage collect."); + let mut files_to_delete = vec![]; + { + // releasing the lock as .delete() will use it too. + let meta_informations_rlock = + self.meta_informations + .read() + .expect("Managed directory rlock poisoned in garbage collect."); for managed_path in &meta_informations_rlock.managed_paths { if !living_files.contains(managed_path) { files_to_delete.push(managed_path.clone()); } } } - - let mut deleted_files = vec!(); + + let mut deleted_files = vec![]; { for file_to_delete in files_to_delete { match self.delete(&file_to_delete) { @@ -155,7 +156,7 @@ impl ManagedDirectory { // this is expected. } } - + } } } @@ -163,7 +164,7 @@ impl ManagedDirectory { if !deleted_files.is_empty() { - // update the list of managed files by removing + // update the list of managed files by removing // the file that were removed. let mut meta_informations_wlock = self.meta_informations .write() @@ -186,7 +187,7 @@ impl ManagedDirectory { /// /// The method returns a `FileProtection` object. /// The file will not be garbage collected as long as the - /// `FileProtection` object is kept alive. + /// `FileProtection` object is kept alive. 
pub fn protect_file_from_delete(&self, path: &Path) -> FileProtection { let pathbuf = path.to_owned(); { @@ -194,9 +195,9 @@ impl ManagedDirectory { .write() .expect("Managed file lock poisoned on protect"); *meta_informations_wlock - .protected_files - .entry(pathbuf.clone()) - .or_insert(0) += 1; + .protected_files + .entry(pathbuf.clone()) + .or_insert(0) += 1; } FileProtection { directory: self.clone(), @@ -205,16 +206,16 @@ impl ManagedDirectory { } /// Registers a file as managed - /// - /// This method must be called before the file is + /// + /// This method must be called before the file is /// actually created to ensure that a failure between /// registering the filepath and creating the file - /// will not lead to garbage files that will + /// will not lead to garbage files that will /// never get removed. fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> { let mut meta_wlock = self.meta_informations - .write() - .expect("Managed file lock poisoned"); + .write() + .expect("Managed file lock poisoned"); let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned()); if has_changed { save_managed_paths(self.directory.as_mut(), &meta_wlock)?; @@ -224,7 +225,6 @@ impl ManagedDirectory { } impl Directory for ManagedDirectory { - fn open_read(&self, path: &Path) -> result::Result { self.directory.open_read(path) } @@ -250,7 +250,7 @@ impl Directory for ManagedDirectory { .expect("poisoned lock in managed directory meta"); if let Some(counter) = metas_rlock.protected_files.get(path) { if *counter > 0 { - return Err(DeleteError::FileProtected(path.to_owned())) + return Err(DeleteError::FileProtected(path.to_owned())); } } } @@ -260,11 +260,10 @@ impl Directory for ManagedDirectory { fn exists(&self, path: &Path) -> bool { self.directory.exists(path) } - + fn box_clone(&self) -> Box { box self.clone() } - } impl Clone for ManagedDirectory { @@ -284,10 +283,10 @@ mod tests { use super::*; use directory::MmapDirectory; - use 
std::path::Path; + use std::path::Path; use std::io::Write; use tempdir::TempDir; - + lazy_static! { static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test"); static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2"); @@ -305,17 +304,17 @@ mod tests { write_file.flush().unwrap(); } { - managed_directory.atomic_write(*TEST_PATH2, &vec!(0u8,1u8)).unwrap(); + managed_directory + .atomic_write(*TEST_PATH2, &vec![0u8, 1u8]) + .unwrap(); } { assert!(managed_directory.exists(*TEST_PATH1)); assert!(managed_directory.exists(*TEST_PATH2)); } { - let living_files: HashSet = [TEST_PATH1.to_owned()] - .into_iter() - .cloned() - .collect(); + let living_files: HashSet = + [TEST_PATH1.to_owned()].into_iter().cloned().collect(); managed_directory.garbage_collect(living_files); } { @@ -338,7 +337,7 @@ mod tests { assert!(!managed_directory.exists(*TEST_PATH1)); assert!(!managed_directory.exists(*TEST_PATH2)); } - } + } } #[test] @@ -349,10 +348,12 @@ mod tests { let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap(); let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap(); - managed_directory.atomic_write(*TEST_PATH1, &vec!(0u8,1u8)).unwrap(); + managed_directory + .atomic_write(*TEST_PATH1, &vec![0u8, 1u8]) + .unwrap(); assert!(managed_directory.exists(*TEST_PATH1)); - let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap(); + let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap(); managed_directory.garbage_collect(living_files.clone()); if cfg!(target_os = "windows") { // On Windows, gc should try and fail the file as it is mmapped. @@ -363,8 +364,7 @@ mod tests { // eventually be deleted once mmap is released. 
managed_directory.garbage_collect(living_files); assert!(!managed_directory.exists(*TEST_PATH1)); - } - else { + } else { assert!(!managed_directory.exists(*TEST_PATH1)); } @@ -379,7 +379,9 @@ mod tests { let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap(); let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap(); - managed_directory.atomic_write(*TEST_PATH1, &vec!(0u8,1u8)).unwrap(); + managed_directory + .atomic_write(*TEST_PATH1, &vec![0u8, 1u8]) + .unwrap(); assert!(managed_directory.exists(*TEST_PATH1)); { @@ -390,7 +392,7 @@ mod tests { managed_directory.garbage_collect(living_files.clone()); assert!(!managed_directory.exists(*TEST_PATH1)); - + } diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs index 25c76a0e9..a64ff715c 100644 --- a/src/directory/mmap_directory.rs +++ b/src/directory/mmap_directory.rs @@ -24,33 +24,24 @@ use std::sync::Weak; use tempdir::TempDir; fn open_mmap(full_path: &PathBuf) -> result::Result>, OpenReadError> { - let convert_file_error = |err: io::Error| { - if err.kind() == io::ErrorKind::NotFound { - OpenReadError::FileDoesNotExist(full_path.clone()) - } - else { - OpenReadError::IOError(err) - } + let convert_file_error = |err: io::Error| if err.kind() == io::ErrorKind::NotFound { + OpenReadError::FileDoesNotExist(full_path.clone()) + } else { + OpenReadError::IOError(err) }; let file = File::open(&full_path).map_err(convert_file_error)?; - let meta_data = file - .metadata() - .map_err(|e| OpenReadError::IOError(e))?; + let meta_data = file.metadata().map_err(|e| OpenReadError::IOError(e))?; if meta_data.len() == 0 { - // if the file size is 0, it will not be possible + // if the file size is 0, it will not be possible // to mmap the file, so we return an anonymous mmap_cache // instead. 
- return Ok(None) + return Ok(None); } match Mmap::open(&file, Protection::Read) { - Ok(mmap) => { - Ok(Some(Arc::new(mmap))) - } - Err(e) => { - Err(OpenReadError::IOError(e)) - } + Ok(mmap) => Ok(Some(Arc::new(mmap))), + Err(e) => Err(OpenReadError::IOError(e)), } - + } #[derive(Default,Clone,Debug,Serialize,Deserialize)] @@ -91,8 +82,7 @@ impl Default for MmapCache { impl MmapCache { - - fn cleanup(&mut self) { + fn cleanup(&mut self) { let previous_cache_size = self.cache.len(); let mut new_cache = HashMap::new(); mem::swap(&mut new_cache, &mut self.cache); @@ -107,9 +97,7 @@ impl MmapCache { fn get_info(&mut self) -> CacheInfo { self.cleanup(); - let paths: Vec = self.cache.keys() - .cloned() - .collect(); + let paths: Vec = self.cache.keys().cloned().collect(); CacheInfo { counters: self.counters.clone(), mmapped: paths, @@ -123,68 +111,63 @@ impl MmapCache { self.cleanup(); } Ok(match self.cache.entry(full_path.clone()) { - HashMapEntry::Occupied(mut occupied_entry) => { - if let Some(mmap_arc) = occupied_entry.get().upgrade() { - self.counters.hit += 1; - Some(mmap_arc.clone()) - } - else { - // The entry exists but the weak ref has been destroyed. - self.counters.miss_weak += 1; - if let Some(mmap_arc) = open_mmap(&full_path)? { - occupied_entry.insert(Arc::downgrade(&mmap_arc)); - Some(mmap_arc) - } - else { - None - } - } - } - HashMapEntry::Vacant(vacant_entry) => { - self.counters.miss_empty += 1; - if let Some(mmap_arc) = open_mmap(&full_path)? { - vacant_entry.insert(Arc::downgrade(&mmap_arc)); - Some(mmap_arc) - } - else { - None - } - } - }) + HashMapEntry::Occupied(mut occupied_entry) => { + if let Some(mmap_arc) = occupied_entry.get().upgrade() { + self.counters.hit += 1; + Some(mmap_arc.clone()) + } else { + // The entry exists but the weak ref has been destroyed. + self.counters.miss_weak += 1; + if let Some(mmap_arc) = open_mmap(&full_path)? 
{ + occupied_entry.insert(Arc::downgrade(&mmap_arc)); + Some(mmap_arc) + } else { + None + } + } + } + HashMapEntry::Vacant(vacant_entry) => { + self.counters.miss_empty += 1; + if let Some(mmap_arc) = open_mmap(&full_path)? { + vacant_entry.insert(Arc::downgrade(&mmap_arc)); + Some(mmap_arc) + } else { + None + } + } + }) } } /// Directory storing data in files, read via mmap. /// -/// The Mmap object are cached to limit the -/// system calls. +/// The Mmap object are cached to limit the +/// system calls. #[derive(Clone)] pub struct MmapDirectory { root_path: PathBuf, mmap_cache: Arc>, _temp_directory: Arc>, - } impl fmt::Debug for MmapDirectory { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "MmapDirectory({:?})", self.root_path) - } + write!(f, "MmapDirectory({:?})", self.root_path) + } } impl MmapDirectory { - /// Creates a new MmapDirectory in a temporary directory. /// /// This is mostly useful to test the MmapDirectory itself. - /// For your unit tests, prefer the RAMDirectory. + /// For your unit tests, prefer the RAMDirectory. 
pub fn create_from_tempdir() -> io::Result { let tempdir = try!(TempDir::new("index")); let tempdir_path = PathBuf::from(tempdir.path()); let directory = MmapDirectory { root_path: PathBuf::from(tempdir_path), mmap_cache: Arc::new(RwLock::new(MmapCache::default())), - _temp_directory: Arc::new(Some(tempdir)) + _temp_directory: Arc::new(Some(tempdir)), }; Ok(directory) } @@ -196,16 +179,14 @@ impl MmapDirectory { pub fn open(directory_path: &Path) -> Result { if !directory_path.exists() { Err(OpenDirectoryError::DoesNotExist(PathBuf::from(directory_path))) - } - else if !directory_path.is_dir() { + } else if !directory_path.is_dir() { Err(OpenDirectoryError::NotADirectory(PathBuf::from(directory_path))) - } - else { + } else { Ok(MmapDirectory { - root_path: PathBuf::from(directory_path), - mmap_cache: Arc::new(RwLock::new(MmapCache::default())), - _temp_directory: Arc::new(None) - }) + root_path: PathBuf::from(directory_path), + mmap_cache: Arc::new(RwLock::new(MmapCache::default())), + _temp_directory: Arc::new(None), + }) } } @@ -232,7 +213,8 @@ impl MmapDirectory { use std::os::windows::fs::OpenOptionsExt; use winapi::winbase; - open_opts.write(true) + open_opts + .write(true) .custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS); } @@ -242,8 +224,8 @@ impl MmapDirectory { } /// Returns some statistical information /// about the Mmap cache. - /// - /// The `MmapDirectory` embeds a `MmapDirectory` + /// + /// The `MmapDirectory` embeds a `MmapDirectory` /// to avoid multiplying the `mmap` system calls. pub fn get_cache_info(&mut self) -> CacheInfo { self.mmap_cache @@ -251,12 +233,10 @@ impl MmapDirectory { .expect("Mmap cache lock is poisoned.") .get_info() } - - } -/// This Write wraps a File, but has the specificity of -/// call `sync_all` on flush. +/// This Write wraps a File, but has the specificity of +/// call `sync_all` on flush. 
struct SafeFileWriter(File); impl SafeFileWriter { @@ -266,7 +246,6 @@ impl SafeFileWriter { } impl Write for SafeFileWriter { - fn write(&mut self, buf: &[u8]) -> io::Result { self.0.write(buf) } @@ -285,51 +264,46 @@ impl Seek for SafeFileWriter { impl Directory for MmapDirectory { - fn open_read(&self, path: &Path) -> result::Result { debug!("Open Read {:?}", path); let full_path = self.resolve_path(path); - + let mut mmap_cache = self.mmap_cache .write() .map_err(|_| OpenReadError::IOError( make_io_err(format!("Failed to acquired write lock on mmap cache while reading {:?}", path)) ))?; - - Ok(mmap_cache.get_mmap(full_path)? - .map(MmapReadOnly::from) - .map(ReadOnlySource::Mmap) - .unwrap_or(ReadOnlySource::Anonymous(SharedVecSlice::empty())) - ) + + Ok(mmap_cache + .get_mmap(full_path)? + .map(MmapReadOnly::from) + .map(ReadOnlySource::Mmap) + .unwrap_or(ReadOnlySource::Anonymous(SharedVecSlice::empty()))) } - + fn open_write(&mut self, path: &Path) -> Result { debug!("Open Write {:?}", path); let full_path = self.resolve_path(path); - + let open_res = OpenOptions::new() .write(true) .create_new(true) .open(full_path); - - let mut file = try!( - open_res.map_err(|err| { - if err.kind() == io::ErrorKind::AlreadyExists { - OpenWriteError::FileAlreadyExists(PathBuf::from(path)) - } - else { - OpenWriteError::IOError(err) - } - }) - ); - + + let mut file = try!(open_res.map_err(|err| if err.kind() == + io::ErrorKind::AlreadyExists { + OpenWriteError::FileAlreadyExists(PathBuf::from(path)) + } else { + OpenWriteError::IOError(err) + })); + // making sure the file is created. try!(file.flush()); - + // Apparetntly, on some filesystem syncing the parent // directory is required. try!(self.sync_directory()); - + let writer = SafeFileWriter::new(file); Ok(BufWriter::new(Box::new(writer))) } @@ -347,15 +321,11 @@ impl Directory for MmapDirectory { // when the last reference is gone. 
mmap_cache.cache.remove(&full_path); match fs::remove_file(&full_path) { - Ok(_) => { - self.sync_directory() - .map_err(|e| DeleteError::IOError(e)) - } + Ok(_) => self.sync_directory().map_err(|e| DeleteError::IOError(e)), Err(e) => { if e.kind() == io::ErrorKind::NotFound { Err(DeleteError::FileDoesNotExist(path.to_owned())) - } - else { + } else { Err(DeleteError::IOError(e)) } } @@ -379,26 +349,23 @@ impl Directory for MmapDirectory { Err(e) => { if e.kind() == io::ErrorKind::NotFound { Err(OpenReadError::FileDoesNotExist(path.to_owned())) - } - else { + } else { Err(OpenReadError::IOError(e)) } } } - + } fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> { debug!("Atomic Write {:?}", path); let full_path = self.resolve_path(path); let meta_file = atomicwrites::AtomicFile::new(full_path, atomicwrites::AllowOverwrite); - try!(meta_file.write(|f| { - f.write_all(data) - })); + try!(meta_file.write(|f| f.write_all(data))); Ok(()) } - fn box_clone(&self,) -> Box { + fn box_clone(&self) -> Box { Box::new(self.clone()) } } @@ -457,9 +424,9 @@ mod tests { } } assert_eq!(mmap_directory.get_cache_info().counters.miss_empty, 10); - - - { + + + { // test weak miss // the first pass create the weak refs. for path in &paths { @@ -475,7 +442,7 @@ mod tests { } { - let mut saved_readmmaps = vec!(); + let mut saved_readmmaps = vec![]; // Keeps reference alive for (i, path) in paths.iter().enumerate() { let r = mmap_directory.open_read(path).unwrap(); @@ -494,7 +461,7 @@ mod tests { } assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0); - + } } diff --git a/src/directory/mod.rs b/src/directory/mod.rs index 09f61da3e..f47cfdcbb 100644 --- a/src/directory/mod.rs +++ b/src/directory/mod.rs @@ -31,7 +31,7 @@ pub type WritePtr = BufWriter>; mod tests { use super::*; - use std::path::Path; + use std::path::Path; use std::io::{Write, Seek, SeekFrom}; lazy_static! 
{ @@ -65,7 +65,7 @@ mod tests { assert!(directory.exists(*TEST_PATH)); write_file.write_all(&[4]).unwrap(); write_file.write_all(&[3]).unwrap(); - write_file.write_all(&[7,3,5]).unwrap(); + write_file.write_all(&[7, 3, 5]).unwrap(); write_file.flush().unwrap(); } let read_file = directory.open_read(*TEST_PATH).unwrap(); @@ -81,9 +81,9 @@ mod tests { { { let mut write_file = directory.open_write(*TEST_PATH).unwrap(); - write_file.write_all(&[4, 3, 7,3,5]).unwrap(); + write_file.write_all(&[4, 3, 7, 3, 5]).unwrap(); write_file.seek(SeekFrom::Start(0)).unwrap(); - write_file.write_all(&[3,1]).unwrap(); + write_file.write_all(&[3, 1]).unwrap(); write_file.flush().unwrap(); } let read_file = directory.open_read(*TEST_PATH).unwrap(); @@ -98,7 +98,7 @@ mod tests { { directory.open_write(*TEST_PATH).unwrap(); assert!(directory.exists(*TEST_PATH)); - + } { assert!(directory.open_write(*TEST_PATH).is_err()); diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index ddb97080b..4a1df936e 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -11,14 +11,14 @@ use directory::WritePtr; use super::shared_vec_slice::SharedVecSlice; /// Writer associated with the `RAMDirectory` -/// +/// /// The Writer just writes a buffer. /// /// # Panics /// /// On drop, if the writer was left in a *dirty* state. /// That is, if flush was not called after the last call -/// to write. +/// to write. /// struct VecWriter { path: PathBuf, @@ -40,8 +40,9 @@ impl VecWriter { impl Drop for VecWriter { fn drop(&mut self) { - if !self.is_flushed { - panic!("You forgot to flush {:?} before its writter got Drop. Do not rely on drop.", self.path) + if !self.is_flushed { + panic!("You forgot to flush {:?} before its writter got Drop. 
Do not rely on drop.", + self.path) } } } @@ -61,7 +62,8 @@ impl Write for VecWriter { fn flush(&mut self) -> io::Result<()> { self.is_flushed = true; - try!(self.shared_directory.write(self.path.clone(), self.data.get_ref())); + try!(self.shared_directory + .write(self.path.clone(), self.data.get_ref())); Ok(()) } } @@ -72,22 +74,22 @@ struct InnerDirectory(Arc>>>>); impl InnerDirectory { - fn new() -> InnerDirectory { InnerDirectory(Arc::new(RwLock::new(HashMap::new()))) } fn write(&self, path: PathBuf, data: &[u8]) -> io::Result { - let mut map = try!( - self.0 - .write() - .map_err(|_| make_io_err(format!("Failed to lock the directory, when trying to write {:?}", path))) - ); + let mut map = try!(self.0 + .write() + .map_err(|_| { + make_io_err(format!("Failed to lock the directory, when trying to write {:?}", + path)) + })); let prev_value = map.insert(path, Arc::new(Vec::from(data))); Ok(prev_value.is_some()) } - fn open_read(&self, path: &Path) -> Result { + fn open_read(&self, path: &Path) -> Result { self.0 .read() .map_err(|_| { @@ -129,13 +131,12 @@ impl InnerDirectory { .expect("Failed to get read lock directory.") .contains_key(path) } - } impl fmt::Debug for RAMDirectory { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "RAMDirectory") - } + write!(f, "RAMDirectory") + } } @@ -150,12 +151,9 @@ pub struct RAMDirectory { } impl RAMDirectory { - /// Constructor pub fn create() -> RAMDirectory { - RAMDirectory { - fs: InnerDirectory::new() - } + RAMDirectory { fs: InnerDirectory::new() } } } @@ -163,15 +161,14 @@ impl Directory for RAMDirectory { fn open_read(&self, path: &Path) -> result::Result { self.fs.open_read(path) } - + fn open_write(&mut self, path: &Path) -> Result { let path_buf = PathBuf::from(path); let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone()); // force the creation of the file to mimic the MMap directory. 
if try!(self.fs.write(path_buf.clone(), &Vec::new())) { Err(OpenWriteError::FileAlreadyExists(path_buf)) - } - else { + } else { Ok(BufWriter::new(Box::new(vec_writer))) } } @@ -180,15 +177,14 @@ impl Directory for RAMDirectory { self.fs.delete(path) } - + fn exists(&self, path: &Path) -> bool { self.fs.exists(path) } fn atomic_read(&self, path: &Path) -> Result, OpenReadError> { let read = self.open_read(path)?; - Ok(read.as_slice() - .to_owned()) + Ok(read.as_slice().to_owned()) } fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> { @@ -200,8 +196,7 @@ impl Directory for RAMDirectory { Ok(()) } - fn box_clone(&self,) -> Box { + fn box_clone(&self) -> Box { Box::new(self.clone()) } - } diff --git a/src/directory/read_only_source.rs b/src/directory/read_only_source.rs index b7a98c288..9e5559f62 100644 --- a/src/directory/read_only_source.rs +++ b/src/directory/read_only_source.rs @@ -5,7 +5,7 @@ use common::HasLen; /// Read object that represents files in tantivy. -/// +/// /// These read objects are only in charge to deliver /// the data in the form of a constant read-only `&[u8]`. /// Whatever happens to the directory file, the data @@ -13,12 +13,11 @@ use common::HasLen; pub enum ReadOnlySource { /// Mmap source of data Mmap(MmapReadOnly), - /// Wrapping a `Vec` + /// Wrapping a `Vec` Anonymous(SharedVecSlice), } impl Deref for ReadOnlySource { - type Target = [u8]; fn deref(&self) -> &[u8] { @@ -27,35 +26,30 @@ impl Deref for ReadOnlySource { } impl ReadOnlySource { - /// Creates an empty ReadOnlySource pub fn empty() -> ReadOnlySource { ReadOnlySource::Anonymous(SharedVecSlice::empty()) } /// Returns the data underlying the ReadOnlySource object. 
- pub fn as_slice(&self,) -> &[u8] { + pub fn as_slice(&self) -> &[u8] { match *self { - ReadOnlySource::Mmap(ref mmap_read_only) => unsafe { - mmap_read_only.as_slice() - }, - ReadOnlySource::Anonymous(ref shared_vec) => { - shared_vec.as_slice() - }, + ReadOnlySource::Mmap(ref mmap_read_only) => unsafe { mmap_read_only.as_slice() }, + ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(), } } - /// Creates a ReadOnlySource that is just a + /// Creates a ReadOnlySource that is just a /// view over a slice of the data. - /// + /// /// Keep in mind that any living slice extends /// the lifetime of the original ReadOnlySource, - /// + /// /// For instance, if `ReadOnlySource` wraps 500MB /// worth of data in anonymous memory, and only a - /// 1KB slice is remaining, the whole `500MBs` + /// 1KB slice is remaining, the whole `500MBs` /// are retained in memory. - pub fn slice(&self, from_offset:usize, to_offset:usize) -> ReadOnlySource { + pub fn slice(&self, from_offset: usize, to_offset: usize) -> ReadOnlySource { match *self { ReadOnlySource::Mmap(ref mmap_read_only) => { let sliced_mmap = mmap_read_only.range(from_offset, to_offset - from_offset); @@ -63,13 +57,13 @@ impl ReadOnlySource { } ReadOnlySource::Anonymous(ref shared_vec) => { ReadOnlySource::Anonymous(shared_vec.slice(from_offset, to_offset)) - }, + } } } } impl HasLen for ReadOnlySource { - fn len(&self,) -> usize { + fn len(&self) -> usize { self.as_slice().len() } } diff --git a/src/directory/shared_vec_slice.rs b/src/directory/shared_vec_slice.rs index b4cec5ecb..b534e6029 100644 --- a/src/directory/shared_vec_slice.rs +++ b/src/directory/shared_vec_slice.rs @@ -4,12 +4,11 @@ use std::sync::Arc; #[derive(Clone)] pub struct SharedVecSlice { pub data: Arc>, - pub start: usize, - pub len: usize + pub start: usize, + pub len: usize, } impl SharedVecSlice { - pub fn empty() -> SharedVecSlice { SharedVecSlice::new(Arc::new(Vec::new())) } @@ -23,11 +22,11 @@ impl SharedVecSlice { } } - pub fn 
as_slice(&self,) -> &[u8] { + pub fn as_slice(&self) -> &[u8] { &self.data[self.start..self.start + self.len] } - pub fn slice(&self, from_offset: usize, to_offset:usize) -> SharedVecSlice { + pub fn slice(&self, from_offset: usize, to_offset: usize) -> SharedVecSlice { SharedVecSlice { data: self.data.clone(), start: self.start + from_offset, diff --git a/src/error.rs b/src/error.rs index aacc14653..471d21c57 100644 --- a/src/error.rs +++ b/src/error.rs @@ -38,7 +38,7 @@ pub enum Error { /// An Error appeared related to the lack of a field. SchemaError(String), /// Tried to access a fastfield reader for a field not configured accordingly. - FastFieldError(FastFieldNotAvailableError) + FastFieldError(FastFieldNotAvailableError), } impl From for Error { @@ -83,10 +83,8 @@ impl From for Error { impl From for Error { fn from(error: OpenWriteError) -> Error { match error { - OpenWriteError::FileAlreadyExists(filepath) => - Error::FileAlreadyExists(filepath), - OpenWriteError::IOError(io_error) => - Error::IOError(io_error), + OpenWriteError::FileAlreadyExists(filepath) => Error::FileAlreadyExists(filepath), + OpenWriteError::IOError(io_error) => Error::IOError(io_error), } } } @@ -94,10 +92,12 @@ impl From for Error { impl From for Error { fn from(error: OpenDirectoryError) -> Error { match error { - OpenDirectoryError::DoesNotExist(directory_path) => - Error::PathDoesNotExist(directory_path), - OpenDirectoryError::NotADirectory(directory_path) => - Error::InvalidArgument(format!("{:?} is not a directory", directory_path)), + OpenDirectoryError::DoesNotExist(directory_path) => { + Error::PathDoesNotExist(directory_path) + } + OpenDirectoryError::NotADirectory(directory_path) => { + Error::InvalidArgument(format!("{:?} is not a directory", directory_path)) + } } } } @@ -106,4 +106,4 @@ impl From for Error { fn from(error: serde_json::Error) -> Error { Error::IOError(error.into()) } -} \ No newline at end of file +} diff --git a/src/fastfield/delete.rs 
b/src/fastfield/delete.rs index e4985b471..08ad41e8e 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/delete.rs @@ -21,8 +21,7 @@ pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io: writer.write_all(&[byte])?; shift = 0; byte = 0; - } - else { + } else { shift += 1; } } @@ -36,15 +35,14 @@ pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io: #[derive(Clone)] pub struct DeleteBitSet { data: ReadOnlySource, - len: usize, + len: usize, } impl DeleteBitSet { /// Opens a delete bitset given its data source. pub fn open(data: ReadOnlySource) -> DeleteBitSet { - let num_deleted: usize = data - .as_slice() + let num_deleted: usize = data.as_slice() .iter() .map(|b| b.count_ones() as usize) .sum(); @@ -71,15 +69,13 @@ impl DeleteBitSet { pub fn is_deleted(&self, doc: DocId) -> bool { if self.len == 0 { false - } - else { + } else { let byte_offset = doc / 8u32; let b: u8 = (*self.data)[byte_offset as usize]; let shift = (doc & 7u32) as u8; - b & (1u8 << shift) != 0 + b & (1u8 << shift) != 0 } } - } @@ -132,4 +128,4 @@ mod tests { test_delete_bitset_helper(&bitset); } } -} \ No newline at end of file +} diff --git a/src/fastfield/error.rs b/src/fastfield/error.rs index 50776a179..88902833b 100644 --- a/src/fastfield/error.rs +++ b/src/fastfield/error.rs @@ -1,7 +1,7 @@ use std::result; use schema::FieldEntry; -/// FastFieldNotAvailableError is returned when the +/// FastFieldNotAvailableError is returned when the /// user requested for a fast field reader, and the field was not /// defined in the schema as a fast field. #[derive(Debug)] @@ -10,17 +10,14 @@ pub struct FastFieldNotAvailableError { } impl FastFieldNotAvailableError { - /// Creates a `FastFieldNotAvailable` error. - /// `field_entry` is the configuration of the field + /// `field_entry` is the configuration of the field /// for which fast fields are not available. 
pub fn new(field_entry: &FieldEntry) -> FastFieldNotAvailableError { - FastFieldNotAvailableError { - field_name: field_entry.name().to_string(), - } + FastFieldNotAvailableError { field_name: field_entry.name().to_string() } } } /// Result when trying to access a fast field reader. -pub type Result = result::Result; +pub type Result = result::Result; diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index a8d956576..71f44ba85 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -3,20 +3,20 @@ //! Fast fields are the equivalent of `DocValues` in `Lucene`. //! Fast fields is a non-compressed column-oriented fashion storage //! of `tantivy`. -//! +//! //! It is designed for the fast random access of some document //! fields given a document id. //! //! `FastField` are useful when a field is required for all or most of //! the `DocSet` : for instance for scoring, grouping, filtering, or facetting. -//! -//! +//! +//! //! Fields have to be declared as `FAST` in the schema. //! Currently only 64-bits integers (signed or unsigned) are //! supported. //! //! They are stored in a bitpacked fashion so that their -//! memory usage is directly linear with the amplitude of the +//! memory usage is directly linear with the amplitude of the //! values stored. //! //! Read access performance is comparable to that of an array lookup. 
@@ -67,13 +67,13 @@ mod tests { doc.add_u64(field, value); fast_field_writers.add_document(&doc); } - + #[test] pub fn test_fastfield() { - let test_fastfield = U64FastFieldReader::from(vec!(100,200,300)); + let test_fastfield = U64FastFieldReader::from(vec![100, 200, 300]); assert_eq!(test_fastfield.get(0), 100); assert_eq!(test_fastfield.get(1), 200); - assert_eq!(test_fastfield.get(2), 300); + assert_eq!(test_fastfield.get(2), 300); } #[test] @@ -96,7 +96,8 @@ mod tests { } { let fast_field_readers = FastFieldsReader::open(source).unwrap(); - let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap(); + let fast_field_reader: U64FastFieldReader = + fast_field_readers.open_reader(*FIELD).unwrap(); assert_eq!(fast_field_reader.get(0), 13u64); assert_eq!(fast_field_reader.get(1), 14u64); assert_eq!(fast_field_reader.get(2), 2u64); @@ -129,7 +130,8 @@ mod tests { } { let fast_field_readers = FastFieldsReader::open(source).unwrap(); - let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap(); + let fast_field_reader: U64FastFieldReader = + fast_field_readers.open_reader(*FIELD).unwrap(); assert_eq!(fast_field_reader.get(0), 4u64); assert_eq!(fast_field_reader.get(1), 14_082_001u64); assert_eq!(fast_field_reader.get(2), 3_052u64); @@ -141,9 +143,9 @@ mod tests { assert_eq!(fast_field_reader.get(8), 215u64); } } - - #[test] - fn test_intfastfield_null_amplitude() { + + #[test] + fn test_intfastfield_null_amplitude() { let path = Path::new("test"); let mut directory: RAMDirectory = RAMDirectory::create(); @@ -164,18 +166,19 @@ mod tests { } { let fast_field_readers = FastFieldsReader::open(source).unwrap(); - let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap(); + let fast_field_reader: U64FastFieldReader = + fast_field_readers.open_reader(*FIELD).unwrap(); for doc in 0..10_000 { assert_eq!(fast_field_reader.get(doc), 100_000u64); } } } - #[test] - fn 
test_intfastfield_large_numbers() { + #[test] + fn test_intfastfield_large_numbers() { let path = Path::new("test"); let mut directory: RAMDirectory = RAMDirectory::create(); - + { let write: WritePtr = directory.open_write(Path::new("test")).unwrap(); let mut serializer = FastFieldSerializer::new(write).unwrap(); @@ -183,7 +186,9 @@ mod tests { // forcing the amplitude to be high add_single_field_doc(&mut fast_field_writers, *FIELD, 0u64); for i in 0u64..10_000u64 { - add_single_field_doc(&mut fast_field_writers, *FIELD, 5_000_000_000_000_000_000u64 + i); + add_single_field_doc(&mut fast_field_writers, + *FIELD, + 5_000_000_000_000_000_000u64 + i); } fast_field_writers.serialize(&mut serializer).unwrap(); serializer.close().unwrap(); @@ -194,10 +199,12 @@ mod tests { } { let fast_field_readers = FastFieldsReader::open(source).unwrap(); - let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap(); + let fast_field_reader: U64FastFieldReader = + fast_field_readers.open_reader(*FIELD).unwrap(); assert_eq!(fast_field_reader.get(0), 0u64); for doc in 1..10_001 { - assert_eq!(fast_field_reader.get(doc), 5_000_000_000_000_000_000u64 + doc as u64 - 1u64); + assert_eq!(fast_field_reader.get(doc), + 5_000_000_000_000_000_000u64 + doc as u64 - 1u64); } } } @@ -229,7 +236,8 @@ mod tests { } { let fast_field_readers = FastFieldsReader::open(source).unwrap(); - let fast_field_reader: I64FastFieldReader = fast_field_readers.open_reader(i64_field).unwrap(); + let fast_field_reader: I64FastFieldReader = + fast_field_readers.open_reader(i64_field).unwrap(); assert_eq!(fast_field_reader.min_value(), -100i64); assert_eq!(fast_field_reader.max_value(), 9_999i64); for (doc, i) in (-100i64..10_000i64).enumerate() { @@ -255,11 +263,12 @@ mod tests { fast_field_writers.serialize(&mut serializer).unwrap(); serializer.close().unwrap(); } - + let source = directory.open_read(&path).unwrap(); { let fast_field_readers = FastFieldsReader::open(source).unwrap(); 
- let fast_field_reader: I64FastFieldReader = fast_field_readers.open_reader(i64_field).unwrap(); + let fast_field_reader: I64FastFieldReader = + fast_field_readers.open_reader(i64_field).unwrap(); assert_eq!(fast_field_reader.get(0u32), 0i64); } } @@ -291,10 +300,14 @@ mod tests { let source = directory.open_read(&path).unwrap(); { let fast_field_readers = FastFieldsReader::open(source).unwrap(); - let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap(); + let fast_field_reader: U64FastFieldReader = + fast_field_readers.open_reader(*FIELD).unwrap(); let mut a = 0u64; for _ in 0..n { - println!("i {}=> {} {}", a, fast_field_reader.get(a as u32), permutation[a as usize]); + println!("i {}=> {} {}", + a, + fast_field_reader.get(a as u32), + permutation[a as usize]); assert_eq!(fast_field_reader.get(a as u32), permutation[a as usize]); a = fast_field_reader.get(a as u32); } @@ -305,26 +318,26 @@ mod tests { fn bench_intfastfield_linear_veclookup(b: &mut Bencher) { let permutation = generate_permutation(); b.iter(|| { - let n = test::black_box(7000u32); - let mut a = 0u64; - for i in (0u32..n).step_by(7) { - a ^= permutation[i as usize]; - } - a - }); + let n = test::black_box(7000u32); + let mut a = 0u64; + for i in (0u32..n).step_by(7) { + a ^= permutation[i as usize]; + } + a + }); } #[bench] fn bench_intfastfield_veclookup(b: &mut Bencher) { let permutation = generate_permutation(); b.iter(|| { - let n = test::black_box(1000u32); - let mut a = 0u64; - for _ in 0u32..n { - a = permutation[a as usize]; - } - a - }); + let n = test::black_box(1000u32); + let mut a = 0u64; + for _ in 0u32..n { + a = permutation[a as usize]; + } + a + }); } #[bench] @@ -345,15 +358,16 @@ mod tests { let source = directory.open_read(&path).unwrap(); { let fast_field_readers = FastFieldsReader::open(source).unwrap(); - let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap(); + let fast_field_reader: U64FastFieldReader 
= + fast_field_readers.open_reader(*FIELD).unwrap(); b.iter(|| { - let n = test::black_box(7000u32); - let mut a = 0u64; - for i in (0u32..n).step_by(7) { - a ^= fast_field_reader.get(i); - } - a - }); + let n = test::black_box(7000u32); + let mut a = 0u64; + for i in (0u32..n).step_by(7) { + a ^= fast_field_reader.get(i); + } + a + }); } } @@ -375,15 +389,16 @@ mod tests { let source = directory.open_read(&path).unwrap(); { let fast_field_readers = FastFieldsReader::open(source).unwrap(); - let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap(); + let fast_field_reader: U64FastFieldReader = + fast_field_readers.open_reader(*FIELD).unwrap(); b.iter(|| { - let n = test::black_box(1000u32); - let mut a = 0u32; - for _ in 0u32..n { - a = fast_field_reader.get(a) as u32; - } - a - }); + let n = test::black_box(1000u32); + let mut a = 0u32; + for _ in 0u32..n { + a = fast_field_reader.get(a) as u32; + } + a + }); } } } diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 23229184c..74b69cb9e 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -19,7 +19,6 @@ use common; /// Depending on the field type, a different /// fast field is required. pub trait FastFieldReader: Sized { - /// Type of the value stored in the fastfield. type ValueType; @@ -33,7 +32,7 @@ pub trait FastFieldReader: Sized { fn open(source: ReadOnlySource) -> Self; /// Returns true iff the given field_type makes - /// it possible to access the field values via a + /// it possible to access the field values via a /// fastfield. fn is_enabled(field_type: &FieldType) -> bool; } @@ -47,37 +46,35 @@ pub struct U64FastFieldReader { } impl U64FastFieldReader { - /// Returns the minimum value for this fast field. /// /// The min value does not take in account of possible - /// deleted document, and should be considered as a lower bound + /// deleted document, and should be considered as a lower bound /// of the actual minimum value. 
- pub fn min_value(&self,) -> u64 { + pub fn min_value(&self) -> u64 { self.min_value - } + } /// Returns the maximum value for this fast field. /// /// The max value does not take in account of possible - /// deleted document, and should be considered as an upper bound + /// deleted document, and should be considered as an upper bound /// of the actual maximum value. - pub fn max_value(&self,) -> u64 { + pub fn max_value(&self) -> u64 { self.max_value } } impl FastFieldReader for U64FastFieldReader { type ValueType = u64; - + fn get(&self, doc: DocId) -> u64 { self.min_value + self.bit_unpacker.get(doc as usize) } fn is_enabled(field_type: &FieldType) -> bool { match field_type { - &FieldType::U64(ref integer_options) => - integer_options.is_fast(), + &FieldType::U64(ref integer_options) => integer_options.is_fast(), _ => false, } } @@ -90,11 +87,13 @@ impl FastFieldReader for U64FastFieldReader { let min_value: u64; let max_value: u64; let bit_unpacker: BitUnpacker; - + { let mut cursor: &[u8] = data.as_slice(); - min_value = u64::deserialize(&mut cursor).expect("Failed to read the min_value of fast field."); - let amplitude = u64::deserialize(&mut cursor).expect("Failed to read the amplitude of fast field."); + min_value = u64::deserialize(&mut cursor) + .expect("Failed to read the min_value of fast field."); + let amplitude = u64::deserialize(&mut cursor) + .expect("Failed to read the amplitude of fast field."); max_value = min_value + amplitude; let num_bits = compute_num_bits(amplitude); bit_unpacker = BitUnpacker::new(cursor, num_bits as usize) @@ -107,7 +106,6 @@ impl FastFieldReader for U64FastFieldReader { max_value: max_value, } } - } @@ -132,7 +130,7 @@ impl From> for U64FastFieldReader { let source = directory.open_read(&path).unwrap(); let fast_field_readers = FastFieldsReader::open(source).unwrap(); fast_field_readers.open_reader(field).unwrap() - } + } } /// FastFieldReader for signed 64-bits integers. 
@@ -144,37 +142,35 @@ impl I64FastFieldReader { /// Returns the minimum value for this fast field. /// /// The min value does not take in account of possible - /// deleted document, and should be considered as a lower bound + /// deleted document, and should be considered as a lower bound /// of the actual minimum value. - pub fn min_value(&self,) -> i64 { + pub fn min_value(&self) -> i64 { common::u64_to_i64(self.underlying.min_value()) } /// Returns the maximum value for this fast field. /// /// The max value does not take in account of possible - /// deleted document, and should be considered as an upper bound + /// deleted document, and should be considered as an upper bound /// of the actual maximum value. - pub fn max_value(&self,) -> i64 { + pub fn max_value(&self) -> i64 { common::u64_to_i64(self.underlying.max_value()) } } impl FastFieldReader for I64FastFieldReader { type ValueType = i64; - + fn get(&self, doc: DocId) -> i64 { common::u64_to_i64(self.underlying.get(doc)) } - + /// Opens a new fast field reader given a read only source. /// /// # Panics /// Panics if the data is corrupted. fn open(data: ReadOnlySource) -> I64FastFieldReader { - I64FastFieldReader { - underlying: U64FastFieldReader::open(data) - } + I64FastFieldReader { underlying: U64FastFieldReader::open(data) } } fn is_enabled(field_type: &FieldType) -> bool { @@ -182,15 +178,13 @@ impl FastFieldReader for I64FastFieldReader { &FieldType::I64(ref integer_options) => { if integer_options.is_fast() { true - } - else { + } else { false } - }, + } _ => false, } } - } @@ -198,7 +192,7 @@ impl FastFieldReader for I64FastFieldReader { /// The FastFieldsReader` is the datastructure containing /// all of the fast fields' data. /// -/// It contains a mapping that associated these fields to +/// It contains a mapping that associated these fields to /// the proper slice in the fastfield reader file. 
pub struct FastFieldsReader { source: ReadOnlySource, @@ -206,11 +200,10 @@ pub struct FastFieldsReader { } impl FastFieldsReader { - /// Opens the `FastFieldsReader` file /// /// When opening the fast field reader, the - /// the list of the offset is read (as a footer of the + /// the list of the offset is read (as a footer of the /// data file). pub fn open(source: ReadOnlySource) -> io::Result { let header_offset; @@ -223,23 +216,21 @@ impl FastFieldsReader { } { let mut cursor = &buffer[header_offset as usize..]; - field_offsets = Vec::deserialize(&mut cursor)?; + field_offsets = Vec::deserialize(&mut cursor)?; } } - let mut end_offsets: Vec = field_offsets - .iter() - .map(|&(_, offset)| offset) - .collect(); + let mut end_offsets: Vec = field_offsets.iter().map(|&(_, offset)| offset).collect(); end_offsets.push(header_offset); let mut field_offsets_map: HashMap = HashMap::new(); - for (field_start_offsets, stop_offset) in field_offsets.iter().zip(end_offsets.iter().skip(1)) { + for (field_start_offsets, stop_offset) in + field_offsets.iter().zip(end_offsets.iter().skip(1)) { let (field, start_offset) = *field_start_offsets; field_offsets_map.insert(field, (start_offset, *stop_offset)); } Ok(FastFieldsReader { - field_offsets: field_offsets_map, - source: source, - }) + field_offsets: field_offsets_map, + source: source, + }) } /// Returns the u64 fast value reader if the field @@ -254,8 +245,8 @@ impl FastFieldsReader { self.field_offsets .get(&field) .map(|&(start, stop)| { - let field_source = self.source.slice(start as usize, stop as usize); - FFReader::open(field_source) - }) + let field_source = self.source.slice(start as usize, stop as usize); + FFReader::open(field_source) + }) } } diff --git a/src/fastfield/serializer.rs b/src/fastfield/serializer.rs index a68e03a38..7f97b3b28 100644 --- a/src/fastfield/serializer.rs +++ b/src/fastfield/serializer.rs @@ -7,9 +7,9 @@ use std::io::{self, Write, Seek, SeekFrom}; /// `FastFieldSerializer` is in charge of 
serializing /// fastfields on disk. -/// +/// /// Fast fields are encoded using bit-packing. -/// +/// /// `FastFieldWriter`s are in charge of pushing the data to /// the serializer. /// The serializer expects to receive the following calls. @@ -41,17 +41,21 @@ impl FastFieldSerializer { // just making room for the pointer to header. let written_size: usize = try!(0u32.serialize(&mut write)); Ok(FastFieldSerializer { - write: write, - written_size: written_size, - fields: Vec::new(), - min_value: 0, - field_open: false, - bit_packer: BitPacker::new(0), - }) + write: write, + written_size: written_size, + fields: Vec::new(), + min_value: 0, + field_open: false, + bit_packer: BitPacker::new(0), + }) } - + /// Start serializing a new u64 fast field - pub fn new_u64_fast_field(&mut self, field: Field, min_value: u64, max_value: u64) -> io::Result<()> { + pub fn new_u64_fast_field(&mut self, + field: Field, + min_value: u64, + max_value: u64) + -> io::Result<()> { if self.field_open { return Err(io::Error::new(io::ErrorKind::Other, "Previous field not closed")); } @@ -68,15 +72,15 @@ impl FastFieldSerializer { } - /// Pushes a new value to the currently open u64 fast field. + /// Pushes a new value to the currently open u64 fast field. pub fn add_val(&mut self, val: u64) -> io::Result<()> { let val_to_write: u64 = val - self.min_value; self.bit_packer.write(val_to_write, &mut self.write)?; Ok(()) } - - /// Close the u64 fast field. - pub fn close_field(&mut self,) -> io::Result<()> { + + /// Close the u64 fast field. + pub fn close_field(&mut self) -> io::Result<()> { if !self.field_open { return Err(io::Error::new(io::ErrorKind::Other, "Current field is already closed")); } @@ -87,12 +91,12 @@ impl FastFieldSerializer { self.written_size += self.bit_packer.close(&mut self.write)?; Ok(()) } - - + + /// Closes the serializer - /// + /// /// After this call the data must be persistently save on disk. 
- pub fn close(mut self,) -> io::Result { + pub fn close(mut self) -> io::Result { if self.field_open { return Err(io::Error::new(io::ErrorKind::Other, "Last field not closed")); } diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 396f03606..55db0419f 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -12,7 +12,6 @@ pub struct FastFieldsWriter { } impl FastFieldsWriter { - /// Create all `FastFieldWriter` required by the schema. pub fn from_schema(schema: &Schema) -> FastFieldsWriter { let field_writers: Vec = schema @@ -27,40 +26,33 @@ impl FastFieldsWriter { let mut fast_field_writer = IntFastFieldWriter::new(field); fast_field_writer.set_val_if_missing(common::i64_to_u64(0i64)); Some(fast_field_writer) - } - else { + } else { None } } &FieldType::U64(ref int_options) => { if int_options.is_fast() { Some(IntFastFieldWriter::new(field)) - } - else { + } else { None } } - _ => None + _ => None, } - }) + }) .collect(); - FastFieldsWriter { - field_writers: field_writers, - } + FastFieldsWriter { field_writers: field_writers } } - - /// Returns a `FastFieldsWriter` - /// with a `IntFastFieldWriter` for each + + /// Returns a `FastFieldsWriter` + /// with a `IntFastFieldWriter` for each /// of the field given in argument. pub fn new(fields: Vec) -> FastFieldsWriter { FastFieldsWriter { - field_writers: fields - .into_iter() - .map(IntFastFieldWriter::new) - .collect(), + field_writers: fields.into_iter().map(IntFastFieldWriter::new).collect(), } } - + /// Get the `FastFieldWriter` associated to a field. pub fn get_field_writer(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> { // TODO optimize @@ -68,7 +60,7 @@ impl FastFieldsWriter { .iter_mut() .find(|field_writer| field_writer.field == field) } - + /// Indexes all of the fastfields of a new document. 
pub fn add_document(&mut self, doc: &Document) { @@ -77,7 +69,7 @@ impl FastFieldsWriter { } } - /// Serializes all of the `FastFieldWriter`s by pushing them in + /// Serializes all of the `FastFieldWriter`s by pushing them in /// order to the fast field serializer. pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> { for field_writer in &self.field_writers { @@ -85,10 +77,10 @@ impl FastFieldsWriter { } Ok(()) } - + /// Ensures all of the fast field writers have /// reached `doc`. (included) - /// + /// /// The missing values will be filled with 0. pub fn fill_val_up_to(&mut self, doc: DocId) { for field_writer in &mut self.field_writers { @@ -99,16 +91,16 @@ impl FastFieldsWriter { /// Fast field writer for ints. /// The fast field writer just keeps the values in memory. -/// +/// /// Only when the segment writer can be closed and -/// persisted on disc, the fast field writer is +/// persisted on disc, the fast field writer is /// sent to a `FastFieldSerializer` via the `.serialize(...)` /// method. /// -/// We cannot serialize earlier as the values are -/// bitpacked and the number of bits required for bitpacking +/// We cannot serialize earlier as the values are +/// bitpacked and the number of bits required for bitpacking /// can only been known once we have seen all of the values. -/// +/// /// Both u64, and i64 use the same writer. /// i64 are just remapped to the `0..2^64 - 1` /// using `common::i64_to_u64`. @@ -119,7 +111,6 @@ pub struct IntFastFieldWriter { } impl IntFastFieldWriter { - /// Creates a new `IntFastFieldWriter` pub fn new(field: Field) -> IntFastFieldWriter { IntFastFieldWriter { @@ -128,10 +119,10 @@ impl IntFastFieldWriter { val_if_missing: 0u64, } } - + /// Sets the default value. /// - /// This default value is recorded for documents if + /// This default value is recorded for documents if /// a document does not have any value. 
fn set_val_if_missing(&mut self, val_if_missing: u64) { self.val_if_missing = val_if_missing; @@ -139,7 +130,7 @@ impl IntFastFieldWriter { /// Ensures all of the fast field writer have /// reached `doc`. (included) - /// + /// /// The missing values will be filled with 0. fn fill_val_up_to(&mut self, doc: DocId) { let target = doc as usize + 1; @@ -158,9 +149,9 @@ impl IntFastFieldWriter { pub fn add_val(&mut self, val: u64) { self.vals.push(val); } - - /// Extract the value associated to the fast field for + + /// Extract the value associated to the fast field for /// this document. /// /// i64 are remapped to u64 using the logic @@ -174,14 +165,12 @@ impl IntFastFieldWriter { match doc.get_first(self.field) { Some(v) => { match *v { - Value::U64(ref val) => { *val }, + Value::U64(ref val) => *val, Value::I64(ref val) => common::i64_to_u64(*val), - _ => { panic!("Expected a u64field, got {:?} ", v) } + _ => panic!("Expected a u64field, got {:?} ", v), } - }, - None => { - self.val_if_missing - } + } + None => self.val_if_missing, } } @@ -204,8 +193,3 @@ impl IntFastFieldWriter { serializer.close_field() } } - - - - - diff --git a/src/functional_test.rs b/src/functional_test.rs index 4d187e081..73723dcc4 100644 --- a/src/functional_test.rs +++ b/src/functional_test.rs @@ -18,7 +18,7 @@ fn test_indexing() { let mut schema_builder = SchemaBuilder::default(); let id_field = schema_builder.add_u64_field("id", INT_INDEXED); - let multiples_field = schema_builder.add_u64_field("multiples", INT_INDEXED); + let multiples_field = schema_builder.add_u64_field("multiples", INT_INDEXED); let schema = schema_builder.build(); let index = Index::create_from_tempdir(schema).unwrap(); @@ -41,14 +41,11 @@ fn test_indexing() { let searcher = index.searcher(); // check that everything is correct. 
check_index_content(&searcher, &committed_docs); - } - else { - if committed_docs.remove(&random_val) || - uncommitted_docs.remove(&random_val) { + } else { + if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) { let doc_id_term = Term::from_field_u64(id_field, random_val); index_writer.delete_term(doc_id_term); - } - else { + } else { uncommitted_docs.insert(random_val); let mut doc = Document::new(); doc.add_u64(id_field, random_val); diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index 1998b761d..ae20d9866 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -6,17 +6,17 @@ use std::ops::DerefMut; // The DeleteQueue is similar in conceptually to a multiple // consumer single producer broadcast channel. -// +// // All consumer will receive all messages. -// +// // Consumer of the delete queue are holding a `DeleteCursor`, // which points to a specific place of the `DeleteQueue`. -// +// // New consumer can be created in two ways -// - calling `delete_queue.cursor()` returns a cursor, that +// - calling `delete_queue.cursor()` returns a cursor, that // will include all future delete operation (and no past operations). // - cloning an existing cursor returns a new cursor, that -// is at the exact same position, and can now advance independantly +// is at the exact same position, and can now advance independantly // from the original cursor. #[derive(Default)] struct InnerDeleteQueue { @@ -31,32 +31,27 @@ pub struct DeleteQueue { impl DeleteQueue { - // Creates a new delete queue. 
pub fn new() -> DeleteQueue { - - let delete_queue = DeleteQueue { - inner: Arc::default(), - }; - + + let delete_queue = DeleteQueue { inner: Arc::default() }; + let next_block = NextBlock::from(delete_queue.clone()); { let mut delete_queue_wlock = delete_queue.inner.write().unwrap(); - delete_queue_wlock.last_block = Some( - Arc::new(Block { - operations: Arc::default(), - next: next_block, - }) - ); + delete_queue_wlock.last_block = Some(Arc::new(Block { + operations: Arc::default(), + next: next_block, + })); } delete_queue } - - // Creates a new cursor that makes it possible to + + // Creates a new cursor that makes it possible to // consume future delete operations. - // + // // Past delete operations are not accessible. pub fn cursor(&self) -> DeleteCursor { let last_block = self.inner @@ -85,40 +80,37 @@ impl DeleteQueue { // DeleteQueue is a linked list of blocks of // delete operations. - // + // // Writing happens by simply appending to a vec. // `.flush()` takes this pending delete operations vec - // creates a new read-only block from it, + // creates a new read-only block from it, // and appends it to the linked list. - // - // `.flush()` happens when, for instance, + // + // `.flush()` happens when, for instance, // a consumer reaches the last read-only operations. - // It then ask the delete queue if there happen to + // It then ask the delete queue if there happen to // be some unflushed operations. 
// fn flush(&self) -> Option> { - let mut self_wlock = self - .inner + let mut self_wlock = self.inner .write() .expect("Failed to acquire write lock on delete queue writer"); - + let delete_operations; { let writer: &mut Vec = &mut self_wlock.writer; if writer.is_empty() { return None; } - delete_operations = mem::replace(writer, vec!()); + delete_operations = mem::replace(writer, vec![]); } let next_block = NextBlock::from(self.clone()); { - self_wlock.last_block = Some( - Arc::new(Block { - operations: Arc::new(delete_operations), - next: next_block, - }) - ); + self_wlock.last_block = Some(Arc::new(Block { + operations: Arc::new(delete_operations), + next: next_block, + })); } self_wlock.last_block.clone() } @@ -137,7 +129,7 @@ impl From for NextBlock { } } -impl NextBlock { +impl NextBlock { fn next_block(&self) -> Option> { { let next_read_lock = self.0 @@ -171,7 +163,7 @@ impl NextBlock { } } *next_write_lock.deref_mut() = InnerNextBlock::Closed(next_block.clone()); - return Some(next_block) + return Some(next_block); } } } @@ -189,10 +181,9 @@ pub struct DeleteCursor { } -impl DeleteCursor { - +impl DeleteCursor { /// Skips operations and position it so that - /// - either all of the delete operation currently in the + /// - either all of the delete operation currently in the /// queue are consume and the next get will return None. /// - the next get will return the first operation with an /// `opstamp >= target_opstamp`. @@ -203,18 +194,17 @@ impl DeleteCursor { if operation.opstamp >= target_opstamp { break; } - } - else { + } else { break; } self.advance(); } } - /// If the current block has been entirely + /// If the current block has been entirely /// consumed, try to load the next one. - /// - /// Return `true`, if after this attempt, + /// + /// Return `true`, if after this attempt, /// the cursor is on a block that has not /// been entirely consumed. /// Return `false`, if we have reached the end of the queue. 
@@ -229,24 +219,20 @@ impl DeleteCursor { self.pos = 0; true } - None => { - false - } + None => false, } - } - else { + } else { true } } - + /// Advance to the next delete operation. /// Returns true iff there is such an operation. pub fn advance(&mut self) -> bool { if self.load_block_if_required() { self.pos += 1; true - } - else { + } else { false } } @@ -256,12 +242,10 @@ impl DeleteCursor { pub fn get(&mut self) -> Option<&DeleteOperation> { if self.load_block_if_required() { Some(&self.block.operations[self.pos]) - } - else { + } else { None } } - } @@ -278,12 +262,12 @@ mod tests { #[test] fn test_deletequeue() { let delete_queue = DeleteQueue::new(); - + let make_op = |i: usize| { let field = Field(1u32); DeleteOperation { opstamp: i as u64, - term: Term::from_field_u64(field, i as u64) + term: Term::from_field_u64(field, i as u64), } }; @@ -299,7 +283,7 @@ mod tests { operations_it.advance(); assert!(operations_it.get().is_none()); operations_it.advance(); - + let mut snapshot2 = delete_queue.cursor(); assert!(snapshot2.get().is_none()); delete_queue.push(make_op(3)); @@ -310,7 +294,7 @@ mod tests { assert!(operations_it.get().is_none()); operations_it.advance(); } - { + { let mut operations_it = snapshot.clone(); assert_eq!(operations_it.get().unwrap().opstamp, 1); operations_it.advance(); @@ -320,6 +304,6 @@ mod tests { operations_it.advance(); assert!(operations_it.get().is_none()); } - + } -} \ No newline at end of file +} diff --git a/src/indexer/directory_lock.rs b/src/indexer/directory_lock.rs index db149a297..ce63b7643 100644 --- a/src/indexer/directory_lock.rs +++ b/src/indexer/directory_lock.rs @@ -26,4 +26,4 @@ impl Drop for DirectoryLock { error!("Failed to remove the lock file. 
{:?}", e); } } -} \ No newline at end of file +} diff --git a/src/indexer/doc_opstamp_mapping.rs b/src/indexer/doc_opstamp_mapping.rs index 16eb1ff28..4928da45e 100644 --- a/src/indexer/doc_opstamp_mapping.rs +++ b/src/indexer/doc_opstamp_mapping.rs @@ -4,7 +4,7 @@ use DocId; // Doc to opstamp is used to identify which // document should be deleted. -// +// // Since the docset matching the query of a delete operation // is not computed right when the delete operation is received, // we need to find a way to evaluate, for each document, @@ -14,13 +14,13 @@ use DocId; // // The doc to opstamp mapping stores precisely an array // indexed by doc id and storing the opstamp of the document. -// +// // This mapping is (for the moment) stricly increasing // because of the way document id are allocated. #[derive(Clone)] pub enum DocToOpstampMapping { WithMap(Arc>), - None + None, } impl From> for DocToOpstampMapping { @@ -30,9 +30,8 @@ impl From> for DocToOpstampMapping { } impl DocToOpstampMapping { - /// Given an opstamp return the limit doc id L - /// such that all doc id D such that + /// such that all doc id D such that // D >= L iff opstamp(D) >= than `target_opstamp`. 
// // The edge case opstamp = some doc opstamp is in practise @@ -58,23 +57,24 @@ mod tests { #[test] fn test_doc_to_opstamp_mapping_none() { let doc_to_opstamp_mapping = DocToOpstampMapping::None; - assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(1), u32::max_value()); + assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(1), + u32::max_value()); } #[test] fn test_doc_to_opstamp_mapping_complex() { { - let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec!()); + let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec![]); assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0); assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(2u64), 0); } { - let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec!(1u64)); + let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec![1u64]); assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0); assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(2u64), 1); } { - let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec!(1u64, 12u64, 17u64, 23u64)); + let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec![1u64, 12u64, 17u64, 23u64]); assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0); for i in 2u64..13u64 { assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 1); @@ -90,4 +90,4 @@ mod tests { } } } -} \ No newline at end of file +} diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index e6868943e..d6b20d7bf 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -27,7 +27,7 @@ use schema::Document; use schema::Schema; use schema::Term; use std::mem; -use std::mem::swap; +use std::mem::swap; use std::thread::JoinHandle; use super::directory_lock::DirectoryLock; use super::operation::AddOperation; @@ -54,11 +54,10 @@ type DocumentReceiver = chan::Receiver; /// Each indexing thread builds its own independant `Segment`, via /// a `SegmentWriter` object. 
pub struct IndexWriter { - - // the lock is just used to bind the + // the lock is just used to bind the // lifetime of the lock with that of the IndexWriter. - _directory_lock: DirectoryLock, - + _directory_lock: DirectoryLock, + index: Index, heap_size_in_bytes_per_thread: usize, @@ -102,36 +101,34 @@ impl !Sync for IndexWriter {} /// If the lockfile already exists, returns `Error::FileAlreadyExists`. /// # Panics /// If the heap size per thread is too small, panics. -pub fn open_index_writer( - index: &Index, - num_threads: usize, - heap_size_in_bytes_per_thread: usize) -> Result { +pub fn open_index_writer(index: &Index, + num_threads: usize, + heap_size_in_bytes_per_thread: usize) + -> Result { if heap_size_in_bytes_per_thread <= HEAP_SIZE_LIMIT as usize { panic!(format!("The heap size per thread needs to be at least {}.", - HEAP_SIZE_LIMIT)); + HEAP_SIZE_LIMIT)); } - + let directory_lock = try!(DirectoryLock::lock(index.directory().box_clone())); - + let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) = chan::sync(PIPELINE_MAX_SIZE_IN_DOCS); let delete_queue = DeleteQueue::new(); - + let current_opstamp = index.opstamp(); let stamper = Stamper::new(current_opstamp); - let segment_updater = SegmentUpdater::new(index.clone(), - stamper.clone(), - delete_queue.cursor())?; - + let segment_updater = + SegmentUpdater::new(index.clone(), stamper.clone(), delete_queue.cursor())?; + let mut index_writer = IndexWriter { - _directory_lock: directory_lock, - + heap_size_in_bytes_per_thread: heap_size_in_bytes_per_thread, index: index.clone(), @@ -140,7 +137,7 @@ pub fn open_index_writer( segment_updater: segment_updater, - workers_join_handle: vec!(), + workers_join_handle: vec![], num_threads: num_threads, delete_queue: delete_queue, @@ -158,28 +155,28 @@ pub fn open_index_writer( -pub fn compute_deleted_bitset( - delete_bitset: &mut BitSet, - segment_reader: &SegmentReader, - delete_cursor: &mut DeleteCursor, - doc_opstamps: 
DocToOpstampMapping, - target_opstamp: u64) -> Result { - +pub fn compute_deleted_bitset(delete_bitset: &mut BitSet, + segment_reader: &SegmentReader, + delete_cursor: &mut DeleteCursor, + doc_opstamps: DocToOpstampMapping, + target_opstamp: u64) + -> Result { + let mut might_have_changed = false; - + loop { if let Some(delete_op) = delete_cursor.get() { if delete_op.opstamp > target_opstamp { break; - } - else { + } else { // A delete operation should only affect // document that were inserted after it. - // + // // Limit doc helps identify the first document // that may be affected by the delete operation. let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp); - if let Some(mut docset) = segment_reader.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) { + if let Some(mut docset) = + segment_reader.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) { while docset.advance() { let deleted_doc = docset.doc(); if deleted_doc < limit_doc { @@ -189,8 +186,7 @@ pub fn compute_deleted_bitset( } } } - } - else { + } else { break; } delete_cursor.advance(); @@ -200,10 +196,10 @@ pub fn compute_deleted_bitset( /// Advance delete for the given segment up /// to the target opstamp. 
-pub fn advance_deletes( - mut segment: Segment, - segment_entry: &mut SegmentEntry, - target_opstamp: u64) -> Result> { +pub fn advance_deletes(mut segment: Segment, + segment_entry: &mut SegmentEntry, + target_opstamp: u64) + -> Result> { let mut file_protect: Option = None; @@ -216,24 +212,20 @@ pub fn advance_deletes( } let segment_reader = SegmentReader::open(segment.clone())?; let max_doc = segment_reader.max_doc(); - - let mut delete_bitset: BitSet = - match segment_entry.delete_bitset() { - Some(ref previous_delete_bitset) => - (*previous_delete_bitset).clone(), - None => - BitSet::with_capacity(max_doc as usize) - }; - + + let mut delete_bitset: BitSet = match segment_entry.delete_bitset() { + Some(ref previous_delete_bitset) => (*previous_delete_bitset).clone(), + None => BitSet::with_capacity(max_doc as usize), + }; + let delete_cursor = segment_entry.delete_cursor(); - compute_deleted_bitset( - &mut delete_bitset, - &segment_reader, - delete_cursor, - DocToOpstampMapping::None, - target_opstamp)?; - + compute_deleted_bitset(&mut delete_bitset, + &segment_reader, + delete_cursor, + DocToOpstampMapping::None, + target_opstamp)?; + for doc in 0u32..max_doc { if segment_reader.is_deleted(doc) { delete_bitset.insert(doc as usize); @@ -257,7 +249,7 @@ fn index_documents(heap: &mut Heap, segment: Segment, schema: &Schema, generation: usize, - document_iterator: &mut Iterator, + document_iterator: &mut Iterator, segment_updater: &mut SegmentUpdater, mut delete_cursor: DeleteCursor) -> Result { @@ -273,10 +265,10 @@ fn index_documents(heap: &mut Heap, } } let num_docs = segment_writer.max_doc(); - + // this is ensured by the call to peek before starting // the worker thread. 
- assert!(num_docs > 0); + assert!(num_docs > 0); let doc_opstamps: Vec = segment_writer.finalize()?; @@ -284,58 +276,54 @@ fn index_documents(heap: &mut Heap, segment_meta.set_max_doc(num_docs); let last_docstamp: u64 = *(doc_opstamps.last().unwrap()); - + let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps); let segment_reader = SegmentReader::open(segment)?; let mut deleted_bitset = BitSet::with_capacity(num_docs as usize); - let may_have_deletes = compute_deleted_bitset( - &mut deleted_bitset, - &segment_reader, - &mut delete_cursor, - doc_to_opstamps, - last_docstamp, - )?; + let may_have_deletes = compute_deleted_bitset(&mut deleted_bitset, + &segment_reader, + &mut delete_cursor, + doc_to_opstamps, + last_docstamp)?; - let segment_entry = SegmentEntry::new( - segment_meta, - delete_cursor, - { if may_have_deletes { Some(deleted_bitset) } - else { None } } - ); - - Ok( - segment_updater - .add_segment(generation, segment_entry) - ) - -} + let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, { + if may_have_deletes { + Some(deleted_bitset) + } else { + None + } + }); + + Ok(segment_updater.add_segment(generation, segment_entry)) + +} impl IndexWriter { /// The index writer pub fn wait_merging_threads(mut self) -> Result<()> { - + // this will stop the indexing thread, // dropping the last reference to the segment_updater. drop(self.document_sender); - - - let former_workers_handles = mem::replace(&mut self.workers_join_handle, vec!()); + + + let former_workers_handles = mem::replace(&mut self.workers_join_handle, vec![]); for join_handle in former_workers_handles { - try!(join_handle.join() - .expect("Indexing Worker thread panicked") - .map_err(|e| { - Error::ErrorInThread(format!("Error in indexing worker thread. {:?}", e)) - })); + try!(join_handle + .join() + .expect("Indexing Worker thread panicked") + .map_err(|e| { + Error::ErrorInThread(format!("Error in indexing worker thread. 
{:?}", e)) + })); } drop(self.workers_join_handle); - let result = self.segment_updater - .wait_merging_thread() - .map_err(|_| - Error::ErrorInThread("Failed to join merging thread.".to_string()) - ); - + let result = + self.segment_updater + .wait_merging_thread() + .map_err(|_| Error::ErrorInThread("Failed to join merging thread.".to_string())); + if let &Err(ref e) = &result { error!("Some merging thread failed {:?}", e); } @@ -351,37 +339,34 @@ impl IndexWriter { let document_receiver_clone = self.document_receiver.clone(); let mut segment_updater = self.segment_updater.clone(); let mut heap = Heap::with_capacity(self.heap_size_in_bytes_per_thread); - + let generation = self.generation; - + let mut delete_cursor = self.delete_queue.cursor(); - let join_handle: JoinHandle> = - thread::Builder::new() + let join_handle: JoinHandle> = thread::Builder::new() .name(format!("indexing thread {} for gen {}", self.worker_id, generation)) .spawn(move || { - + loop { - let mut document_iterator = document_receiver_clone.clone() - .into_iter() - .peekable(); - + let mut document_iterator = + document_receiver_clone.clone().into_iter().peekable(); + // the peeking here is to avoid // creating a new segment's files // if no document are available. // - // this is a valid guarantee as the + // this is a valid guarantee as the // peeked document now belongs to // our local iterator. if let Some(operation) = document_iterator.peek() { delete_cursor.skip_to(operation.opstamp); - } - else { + } else { // No more documents. // Happens when there is a commit, or if the `IndexWriter` // was dropped. 
- return Ok(()) + return Ok(()); } let segment = segment_updater.new_segment(); index_documents(&mut heap, @@ -391,7 +376,7 @@ impl IndexWriter { &mut document_iterator, &mut segment_updater, delete_cursor.clone())?; - + } })?; self.worker_id += 1; @@ -408,7 +393,7 @@ impl IndexWriter { pub fn set_merge_policy(&self, merge_policy: Box) { self.segment_updater.set_merge_policy(merge_policy); } - + fn start_workers(&mut self) -> Result<()> { for _ in 0..self.num_threads { try!(self.add_indexing_worker()); @@ -423,7 +408,9 @@ impl IndexWriter { } /// Merges a given list of segments - pub fn merge(&mut self, segment_ids: &[SegmentId]) -> impl Future { + pub fn merge(&mut self, + segment_ids: &[SegmentId]) + -> impl Future { self.segment_updater.start_merge(segment_ids) } @@ -436,7 +423,8 @@ impl IndexWriter { /// /// Returns the former segment_ready channel. fn recreate_document_channel(&mut self) -> DocumentReceiver { - let (mut document_sender, mut document_receiver): (DocumentSender, DocumentReceiver) = + let (mut document_sender, mut document_receiver): (DocumentSender, + DocumentReceiver) = chan::sync(PIPELINE_MAX_SIZE_IN_DOCS); swap(&mut self.document_sender, &mut document_sender); swap(&mut self.document_receiver, &mut document_receiver); @@ -464,12 +452,9 @@ impl IndexWriter { let heap_size_in_bytes_per_thread = self.heap_size_in_bytes_per_thread; drop(self); for _ in receiver_clone {} - - let index_writer = open_index_writer( - &index, - num_threads, - heap_size_in_bytes_per_thread)?; - + + let index_writer = open_index_writer(&index, num_threads, heap_size_in_bytes_per_thread)?; + Ok(index_writer) } @@ -511,23 +496,24 @@ impl IndexWriter { let mut former_workers_join_handle = Vec::new(); swap(&mut former_workers_join_handle, &mut self.workers_join_handle); - + for worker_handle in former_workers_join_handle { - let indexing_worker_result = try!(worker_handle.join() - .map_err(|e| Error::ErrorInThread(format!("{:?}", e)))); + let indexing_worker_result = + 
try!(worker_handle + .join() + .map_err(|e| Error::ErrorInThread(format!("{:?}", e)))); try!(indexing_worker_result); // add a new worker for the next generation. try!(self.add_indexing_worker()); } - + // wait for the segment update thread to have processed the info - self.segment_updater - .commit(self.committed_opstamp)?; - + self.segment_updater.commit(self.committed_opstamp)?; + Ok(self.committed_opstamp) - } + } /// Delete all documents containing a given term. /// @@ -535,7 +521,7 @@ impl IndexWriter { /// were added in previous commits, and documents /// that were added previously in the same commit. /// - /// Like adds, the deletion itself will be visible + /// Like adds, the deletion itself will be visible /// only after calling `commit()`. pub fn delete_term(&mut self, term: Term) -> u64 { let opstamp = self.stamper.stamp(); @@ -548,7 +534,7 @@ impl IndexWriter { } /// Returns the opstamp of the last successful commit. - /// + /// /// This is, for instance, the opstamp the index will /// rollback to if there is a failure like a power surge. 
/// @@ -602,16 +588,18 @@ mod tests { _ => panic!("Expected FileAlreadyExists error"), } } - + #[test] fn test_set_merge_policy() { let schema_builder = schema::SchemaBuilder::default(); let index = Index::create_in_ram(schema_builder.build()); let index_writer = index.writer(40_000_000).unwrap(); - assert_eq!(format!("{:?}", index_writer.get_merge_policy()), "LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, level_log_size: 0.75 }"); + assert_eq!(format!("{:?}", index_writer.get_merge_policy()), + "LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, level_log_size: 0.75 }"); let merge_policy = box NoMergePolicy::default(); index_writer.set_merge_policy(merge_policy); - assert_eq!(format!("{:?}", index_writer.get_merge_policy()), "NoMergePolicy"); + assert_eq!(format!("{:?}", index_writer.get_merge_policy()), + "NoMergePolicy"); } #[test] @@ -620,12 +608,12 @@ mod tests { let index = Index::create_in_ram(schema_builder.build()); { let _index_writer = index.writer(40_000_000).unwrap(); - // the lock should be released when the + // the lock should be released when the // index_writer leaves the scope. } let _index_writer_two = index.writer(40_000_000).unwrap(); } - + #[test] fn test_commit_and_rollback() { let mut schema_builder = schema::SchemaBuilder::default(); @@ -648,7 +636,7 @@ mod tests { } index_writer = index_writer.rollback().unwrap(); - + assert_eq!(index_writer.commit_opstamp(), 0u64); assert_eq!(num_docs_containing("a"), 0); @@ -701,12 +689,14 @@ mod tests { } // this should create 8 segments and trigger a merge. 
index_writer.commit().expect("commit failed"); - index_writer.wait_merging_threads().expect("waiting merging thread failed"); + index_writer + .wait_merging_threads() + .expect("waiting merging thread failed"); index.load_searchers().unwrap(); - + assert_eq!(num_docs_containing("a"), 200); assert!(index.searchable_segments().unwrap().len() < 8); - + } } diff --git a/src/indexer/log_merge_policy.rs b/src/indexer/log_merge_policy.rs index 951af8c31..0fea3d3ee 100644 --- a/src/indexer/log_merge_policy.rs +++ b/src/indexer/log_merge_policy.rs @@ -9,7 +9,7 @@ const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000; const DEFAULT_MIN_MERGE_SIZE: usize = 8; -/// LogMergePolicy tries tries to merge segments that have a similar number of +/// LogMergePolicy tries tries to merge segments that have a similar number of /// documents. #[derive(Debug, Clone)] pub struct LogMergePolicy { @@ -24,7 +24,7 @@ impl LogMergePolicy { } /// Set the minimum number of segment that may be merge together. - pub fn set_min_merge_size(&mut self, min_merge_size: usize) { + pub fn set_min_merge_size(&mut self, min_merge_size: usize) { self.min_merge_size = min_merge_size; } @@ -52,14 +52,16 @@ impl MergePolicy for LogMergePolicy { return Vec::new(); } - let mut size_sorted_tuples = segments.iter() + let mut size_sorted_tuples = segments + .iter() .map(|x| x.num_docs()) .enumerate() .collect::>(); size_sorted_tuples.sort_by(|x, y| y.cmp(x)); - let size_sorted_log_tuples: Vec<_> = size_sorted_tuples.into_iter() + let size_sorted_log_tuples: Vec<_> = size_sorted_tuples + .into_iter() .map(|(ind, num_docs)| (ind, (self.clip_min_size(num_docs) as f64).log2())) .collect(); @@ -77,14 +79,10 @@ impl MergePolicy for LogMergePolicy { levels .iter() .filter(|level| level.len() >= self.min_merge_size) - .map(|ind_vec| { - MergeCandidate(ind_vec.iter() - .map(|&ind| segments[ind].id()) - .collect()) - }) + .map(|ind_vec| MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect())) .collect() } - + fn 
box_clone(&self) -> Box { box self.clone() } @@ -128,9 +126,7 @@ mod tests { #[test] fn test_log_merge_policy_pair() { - let test_input = vec![seg_meta(10), - seg_meta(10), - seg_meta(10)]; + let test_input = vec![seg_meta(10), seg_meta(10), seg_meta(10)]; let result_list = test_merge_policy().compute_merge_candidates(&test_input); assert_eq!(result_list.len(), 1); } diff --git a/src/indexer/merge_policy.rs b/src/indexer/merge_policy.rs index ecab510d7..69a958b39 100644 --- a/src/indexer/merge_policy.rs +++ b/src/indexer/merge_policy.rs @@ -4,26 +4,26 @@ use std::marker; use std::fmt::Debug; -/// Set of segment suggested for a merge. +/// Set of segment suggested for a merge. #[derive(Debug, Clone)] pub struct MergeCandidate(pub Vec); -/// The Merge policy defines which segments should be merged. -/// +/// The Merge policy defines which segments should be merged. +/// /// Every time a the list of segments changes, the segment updater /// asks the merge policy if some segments should be merged. pub trait MergePolicy: marker::Send + marker::Sync + Debug { - /// Given the list of segment metas, returns the list of merge candidates. + /// Given the list of segment metas, returns the list of merge candidates. /// - /// This call happens on the segment updater thread, and will block - /// other segment updates, so all implementations should happen rapidly. + /// This call happens on the segment updater thread, and will block + /// other segment updates, so all implementations should happen rapidly. fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec; /// Returns a boxed clone of the MergePolicy. fn box_clone(&self) -> Box; } -/// Never merge segments. +/// Never merge segments. 
#[derive(Debug)] pub struct NoMergePolicy; @@ -37,7 +37,7 @@ impl MergePolicy for NoMergePolicy { fn compute_merge_candidates(&self, _segments: &[SegmentMeta]) -> Vec { Vec::new() } - + fn box_clone(&self) -> Box { box NoMergePolicy } @@ -66,15 +66,14 @@ pub mod tests { .map(|segment_meta| segment_meta.id()) .collect::>(); if segment_ids.len() > 1 { - vec!(MergeCandidate(segment_ids)) - } - else { - vec!() + vec![MergeCandidate(segment_ids)] + } else { + vec![] } } - + fn box_clone(&self) -> Box { box MergeWheneverPossible } } -} \ No newline at end of file +} diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 71c1fa7c7..4bb52587a 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -31,9 +31,7 @@ struct DeltaPositionComputer { impl DeltaPositionComputer { fn new() -> DeltaPositionComputer { - DeltaPositionComputer { - buffer: vec![0u32; 512] - } + DeltaPositionComputer { buffer: vec![0u32; 512] } } fn compute_delta_positions(&mut self, positions: &[u32]) -> &[u32] { @@ -50,16 +48,17 @@ impl DeltaPositionComputer { } -fn compute_min_max_val(u64_reader: &U64FastFieldReader, max_doc: DocId, delete_bitset: &DeleteBitSet) -> Option<(u64, u64)> { +fn compute_min_max_val(u64_reader: &U64FastFieldReader, + max_doc: DocId, + delete_bitset: &DeleteBitSet) + -> Option<(u64, u64)> { if max_doc == 0 { None - } - else if !delete_bitset.has_deletes() { - // no deleted documents, + } else if !delete_bitset.has_deletes() { + // no deleted documents, // we can use the previous min_val, max_val. 
Some((u64_reader.min_value(), u64_reader.max_value())) - } - else { + } else { // some deleted documents, // we need to recompute the max / min (0..max_doc) @@ -70,19 +69,21 @@ fn compute_min_max_val(u64_reader: &U64FastFieldReader, max_doc: DocId, delete_b } } -fn extract_fieldnorm_reader(segment_reader: &SegmentReader, field: Field) -> Option { +fn extract_fieldnorm_reader(segment_reader: &SegmentReader, + field: Field) + -> Option { segment_reader.get_fieldnorms_reader(field) } -fn extract_fast_field_reader(segment_reader: &SegmentReader, field: Field) -> Option { - segment_reader - .fast_fields_reader() - .open_reader(field) +fn extract_fast_field_reader(segment_reader: &SegmentReader, + field: Field) + -> Option { + segment_reader.fast_fields_reader().open_reader(field) } impl IndexMerger { pub fn open(schema: Schema, segments: &[Segment]) -> Result { - let mut readers = vec!(); + let mut readers = vec![]; let mut max_doc: u32 = 0u32; for segment in segments { if segment.meta().num_docs() > 0 { @@ -92,65 +93,75 @@ impl IndexMerger { } } Ok(IndexMerger { - schema: schema, - readers: readers, - max_doc: max_doc, - }) + schema: schema, + readers: readers, + max_doc: max_doc, + }) } - fn write_fieldnorms(&self, - fast_field_serializer: &mut FastFieldSerializer) -> Result<()> { + fn write_fieldnorms(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> { let fieldnorm_fastfields: Vec = self.schema - .fields() - .iter() - .enumerate() - .filter(|&(_, field_entry)| field_entry.is_indexed()) - .map(|(field_id, _)| Field(field_id as u32)) - .collect(); - self.generic_write_fast_field(fieldnorm_fastfields, &extract_fieldnorm_reader, fast_field_serializer) + .fields() + .iter() + .enumerate() + .filter(|&(_, field_entry)| field_entry.is_indexed()) + .map(|(field_id, _)| Field(field_id as u32)) + .collect(); + self.generic_write_fast_field(fieldnorm_fastfields, + &extract_fieldnorm_reader, + fast_field_serializer) } fn write_fast_fields(&self, 
fast_field_serializer: &mut FastFieldSerializer) -> Result<()> { let fast_fields: Vec = self.schema - .fields() - .iter() - .enumerate() - .filter(|&(_, field_entry)| field_entry.is_int_fast()) - .map(|(field_id, _)| Field(field_id as u32)) - .collect(); - self.generic_write_fast_field(fast_fields, &extract_fast_field_reader, fast_field_serializer) + .fields() + .iter() + .enumerate() + .filter(|&(_, field_entry)| field_entry.is_int_fast()) + .map(|(field_id, _)| Field(field_id as u32)) + .collect(); + self.generic_write_fast_field(fast_fields, + &extract_fast_field_reader, + fast_field_serializer) } // used both to merge field norms and regular u64 fast fields. fn generic_write_fast_field(&self, - fields: Vec, - field_reader_extractor: &Fn(&SegmentReader, Field) -> Option, - fast_field_serializer: &mut FastFieldSerializer) -> Result<()> { - + fields: Vec, + field_reader_extractor: &Fn(&SegmentReader, Field) + -> Option, + fast_field_serializer: &mut FastFieldSerializer) + -> Result<()> { + for field in fields { - - let mut u64_readers = vec!(); + + let mut u64_readers = vec![]; let mut min_val = u64::max_value(); let mut max_val = u64::min_value(); - + for reader in &self.readers { match field_reader_extractor(reader, field) { Some(u64_reader) => { - if let Some((seg_min_val, seg_max_val)) = compute_min_max_val(&u64_reader, reader.max_doc(), reader.delete_bitset()) { + if let Some((seg_min_val, seg_max_val)) = + compute_min_max_val(&u64_reader, + reader.max_doc(), + reader.delete_bitset()) { // the segment has some non-deleted documents min_val = min(min_val, seg_min_val); max_val = max(max_val, seg_max_val); - u64_readers.push((reader.max_doc(), u64_reader, reader.delete_bitset())); - } + u64_readers + .push((reader.max_doc(), u64_reader, reader.delete_bitset())); + } } None => { - let error_msg = format!("Failed to find a u64_reader for field {:?}", field); + let error_msg = format!("Failed to find a u64_reader for field {:?}", + field); error!("{}", error_msg); 
- return Err(Error::SchemaError(error_msg)) + return Err(Error::SchemaError(error_msg)); } } - + } if u64_readers.is_empty() { @@ -160,7 +171,7 @@ impl IndexMerger { } assert!(min_val <= max_val); - + try!(fast_field_serializer.new_u64_fast_field(field, min_val, max_val)); for (max_doc, u64_reader, delete_bitset) in u64_readers { for doc_id in 0..max_doc { @@ -176,32 +187,29 @@ impl IndexMerger { Ok(()) } - fn write_postings( - &self, - postings_serializer: &mut PostingsSerializer) -> Result<()> { - + fn write_postings(&self, postings_serializer: &mut PostingsSerializer) -> Result<()> { + let mut merged_terms = TermIterator::from(&self.readers[..]); let mut delta_position_computer = DeltaPositionComputer::new(); - + let mut max_doc = 0; // map from segment doc ids to the resulting merged segment doc id. let mut merged_doc_id_map: Vec>> = Vec::with_capacity(self.readers.len()); - + for reader in &self.readers { let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize); for doc_id in 0..reader.max_doc() { if reader.is_deleted(doc_id) { segment_local_map.push(None); - } - else { + } else { segment_local_map.push(Some(max_doc)); max_doc += 1u32; } } merged_doc_id_map.push(segment_local_map); } - + while merged_terms.advance() { // Create the total list of doc ids // by stacking the doc ids from the different segment. @@ -215,27 +223,28 @@ impl IndexMerger { let term = merged_terms.term(); let mut term_written = false; let segment_postings = merged_terms - .segment_ords() - .iter() - .cloned() - .flat_map(|segment_ord| { - self.readers[segment_ord] - .read_postings_all_info(&term) - .map(|segment_postings| (segment_ord, segment_postings)) - }) - .collect::>(); + .segment_ords() + .iter() + .cloned() + .flat_map(|segment_ord| { + self.readers[segment_ord] + .read_postings_all_info(&term) + .map(|segment_postings| (segment_ord, segment_postings)) + }) + .collect::>(); // We can remove the term if all documents which // contained it have been deleted. 
if segment_postings.len() > 0 { - + // We can now serialize this postings, by pushing each document to the - // postings serializer. - + // postings serializer. + for (segment_ord, mut segment_postings) in segment_postings { let old_to_new_doc_id = &merged_doc_id_map[segment_ord]; while segment_postings.advance() { - if let Some(remapped_doc_id) = old_to_new_doc_id[segment_postings.doc() as usize] { + if let Some(remapped_doc_id) = + old_to_new_doc_id[segment_postings.doc() as usize] { if !term_written { // we make sure to only write the term iff // there is at least one document. @@ -243,11 +252,11 @@ impl IndexMerger { term_written = true; } let delta_positions: &[u32] = - delta_position_computer.compute_delta_positions(segment_postings.positions()); - try!(postings_serializer.write_doc( - remapped_doc_id, - segment_postings.term_freq(), - delta_positions)); + delta_position_computer + .compute_delta_positions(segment_postings.positions()); + try!(postings_serializer.write_doc(remapped_doc_id, + segment_postings.term_freq(), + delta_positions)); } } } @@ -256,7 +265,7 @@ impl IndexMerger { try!(postings_serializer.close_term()); } } - + } Ok(()) } @@ -267,12 +276,10 @@ impl IndexMerger { for doc_id in 0..reader.max_doc() { if !reader.is_deleted(doc_id) { let doc = try!(store_reader.get(doc_id)); - let field_values: Vec<&FieldValue> = doc.field_values() - .iter() - .collect(); + let field_values: Vec<&FieldValue> = doc.field_values().iter().collect(); try!(store_writer.store(&field_values)); } - } + } } Ok(()) } @@ -311,8 +318,8 @@ mod tests { fn test_index_merger_no_deletes() { let mut schema_builder = schema::SchemaBuilder::default(); let text_fieldtype = schema::TextOptions::default() - .set_indexing_options(TextIndexingOptions::TokenizedWithFreq) - .set_stored(); + .set_indexing_options(TextIndexingOptions::TokenizedWithFreq) + .set_stored(); let text_field = schema_builder.add_text_field("text", text_fieldtype); let score_fieldtype = 
schema::IntOptions::default().set_fast(); let score_field = schema_builder.add_u64_field("score", score_fieldtype); @@ -361,11 +368,14 @@ mod tests { } } { - let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed."); + let segment_ids = index + .searchable_segment_ids() + .expect("Searchable segments failed."); let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); - index_writer.merge(&segment_ids) - .wait() - .expect("Merging failed"); + index_writer + .merge(&segment_ids) + .wait() + .expect("Merging failed"); index_writer.wait_merging_threads().unwrap(); } { @@ -379,13 +389,13 @@ mod tests { }; { assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "a")]), - vec!(1, 2, 4,)); + vec![1, 2, 4]); assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "af")]), - vec!(0, 3,)); + vec![0, 3]); assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "g")]), - vec!(4,)); + vec![4]); assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b")]), - vec!(0, 1, 2, 3, 4,)); + vec![0, 1, 2, 3, 4]); } { let doc = searcher.doc(&DocAddress(0, 0)).unwrap(); @@ -415,12 +425,12 @@ mod tests { collector.vals() }; assert_eq!(get_fast_vals(vec![Term::from_field_text(text_field, "a")]), - vec!(5, 7, 13,)); + vec![5, 7, 13]); } } } - fn search_term(searcher: &Searcher, term: Term) -> Vec { + fn search_term(searcher: &Searcher, term: Term) -> Vec { let mut collector = FastFieldTestCollector::for_field(Field(1)); let term_query = TermQuery::new(term, SegmentPostingsOption::NoFreq); searcher.search(&term_query, &mut collector).unwrap(); @@ -430,8 +440,7 @@ mod tests { #[test] fn test_index_merger_with_deletes() { let mut schema_builder = schema::SchemaBuilder::default(); - let text_fieldtype = schema::TextOptions - ::default() + let text_fieldtype = schema::TextOptions::default() .set_indexing_options(TextIndexingOptions::TokenizedWithFreq) .set_stored(); let text_field = schema_builder.add_text_field("text", 
text_fieldtype); @@ -441,21 +450,19 @@ mod tests { let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let empty_vec = Vec::::new(); - - { // a first commit - index_writer.add_document( - doc!( + + { + // a first commit + index_writer.add_document(doc!( text_field => "a b d", score_field => 1u64 )); - index_writer.add_document( - doc!( + index_writer.add_document(doc!( text_field => "b c", score_field => 2u64 )); index_writer.delete_term(Term::from_field_text(text_field, "c")); - index_writer.add_document( - doc!( + index_writer.add_document(doc!( text_field => "c d", score_field => 3u64 )); @@ -465,31 +472,32 @@ mod tests { assert_eq!(searcher.num_docs(), 2); assert_eq!(searcher.segment_readers()[0].num_docs(), 2); assert_eq!(searcher.segment_readers()[0].max_doc(), 3); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), vec!(1)); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), vec!(1)); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!(3)); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!(1, 3)); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), + vec![1]); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), + vec![1]); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), + vec![3]); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), + vec![1, 3]); } - { // a second commit - index_writer.add_document( - doc!( + { + // a second commit + index_writer.add_document(doc!( text_field => "a d e", score_field => 4_000u64 )); - index_writer.add_document( - doc!( + index_writer.add_document(doc!( text_field => "e f", score_field => 5_000u64 )); index_writer.delete_term(Term::from_field_text(text_field, "a")); index_writer.delete_term(Term::from_field_text(text_field, "f")); - index_writer.add_document( - doc!( + 
index_writer.add_document(doc!( text_field => "f g", score_field => 6_000u64 )); - index_writer.add_document( - doc!( + index_writer.add_document(doc!( text_field => "g h", score_field => 7_000u64 )); @@ -503,71 +511,112 @@ mod tests { assert_eq!(searcher.segment_readers()[0].max_doc(), 3); assert_eq!(searcher.segment_readers()[1].num_docs(), 2); assert_eq!(searcher.segment_readers()[1].max_doc(), 4); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!(3)); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!(3)); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000)); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000)); - - let score_field_reader: U64FastFieldReader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap(); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), + empty_vec); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), + empty_vec); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), + vec![3]); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), + vec![3]); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), + empty_vec); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), + vec![6_000]); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), + vec![6_000, 7_000]); + + let score_field_reader: U64FastFieldReader = searcher + .segment_reader(0) + .get_fast_field_reader(score_field) + .unwrap(); assert_eq!(score_field_reader.min_value(), 1); 
assert_eq!(score_field_reader.max_value(), 3); - let score_field_reader: U64FastFieldReader = searcher.segment_reader(1).get_fast_field_reader(score_field).unwrap(); + let score_field_reader: U64FastFieldReader = searcher + .segment_reader(1) + .get_fast_field_reader(score_field) + .unwrap(); assert_eq!(score_field_reader.min_value(), 4000); assert_eq!(score_field_reader.max_value(), 7000); } - { // merging the segments - let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed."); - index_writer.merge(&segment_ids) - .wait() - .expect("Merging failed"); + { + // merging the segments + let segment_ids = index + .searchable_segment_ids() + .expect("Searchable segments failed."); + index_writer + .merge(&segment_ids) + .wait() + .expect("Merging failed"); index.load_searchers().unwrap(); let ref searcher = *index.searcher(); assert_eq!(searcher.segment_readers().len(), 1); assert_eq!(searcher.num_docs(), 3); assert_eq!(searcher.segment_readers()[0].num_docs(), 3); assert_eq!(searcher.segment_readers()[0].max_doc(), 3); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!(3)); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!(3)); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000)); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000)); - let score_field_reader: U64FastFieldReader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap(); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), + empty_vec); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), + empty_vec); + 
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), + vec![3]); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), + vec![3]); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), + empty_vec); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), + vec![6_000]); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), + vec![6_000, 7_000]); + let score_field_reader: U64FastFieldReader = searcher + .segment_reader(0) + .get_fast_field_reader(score_field) + .unwrap(); assert_eq!(score_field_reader.min_value(), 3); assert_eq!(score_field_reader.max_value(), 7000); } - { + { // test a commit with only deletes index_writer.delete_term(Term::from_field_text(text_field, "c")); index_writer.commit().unwrap(); - + index.load_searchers().unwrap(); let ref searcher = *index.searcher(); assert_eq!(searcher.segment_readers().len(), 1); assert_eq!(searcher.num_docs(), 2); assert_eq!(searcher.segment_readers()[0].num_docs(), 2); assert_eq!(searcher.segment_readers()[0].max_doc(), 3); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000)); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000)); - let score_field_reader: U64FastFieldReader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap(); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), + empty_vec); + assert_eq!(search_term(&searcher, 
Term::from_field_text(text_field, "b")), + empty_vec); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), + empty_vec); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), + empty_vec); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), + empty_vec); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), + vec![6_000]); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), + vec![6_000, 7_000]); + let score_field_reader: U64FastFieldReader = searcher + .segment_reader(0) + .get_fast_field_reader(score_field) + .unwrap(); assert_eq!(score_field_reader.min_value(), 3); assert_eq!(score_field_reader.max_value(), 7000); } - { // Test merging a single segment in order to remove deletes. - let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed."); - index_writer.merge(&segment_ids) - .wait() - .expect("Merging failed"); + { + // Test merging a single segment in order to remove deletes. 
+ let segment_ids = index + .searchable_segment_ids() + .expect("Searchable segments failed."); + index_writer + .merge(&segment_ids) + .wait() + .expect("Merging failed"); index.load_searchers().unwrap(); let ref searcher = *index.searcher(); @@ -575,31 +624,45 @@ mod tests { assert_eq!(searcher.num_docs(), 2); assert_eq!(searcher.segment_readers()[0].num_docs(), 2); assert_eq!(searcher.segment_readers()[0].max_doc(), 2); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), empty_vec); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000)); - assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000)); - let score_field_reader: U64FastFieldReader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap(); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), + empty_vec); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), + empty_vec); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), + empty_vec); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), + empty_vec); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), + empty_vec); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), + vec![6_000]); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), + vec![6_000, 7_000]); + let score_field_reader: U64FastFieldReader = searcher + .segment_reader(0) + .get_fast_field_reader(score_field) + .unwrap(); assert_eq!(score_field_reader.min_value(), 
6000); assert_eq!(score_field_reader.max_value(), 7000); } - { // Test removing all docs + { + // Test removing all docs index_writer.delete_term(Term::from_field_text(text_field, "g")); - let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed."); - index_writer.merge(&segment_ids) - .wait() - .expect("Merging failed"); + let segment_ids = index + .searchable_segment_ids() + .expect("Searchable segments failed."); + index_writer + .merge(&segment_ids) + .wait() + .expect("Merging failed"); index.load_searchers().unwrap(); let ref searcher = *index.searcher(); assert_eq!(searcher.segment_readers().len(), 1); assert_eq!(searcher.num_docs(), 0); } - - + + } } diff --git a/src/indexer/segment_entry.rs b/src/indexer/segment_entry.rs index 86673d517..728cf2e55 100644 --- a/src/indexer/segment_entry.rs +++ b/src/indexer/segment_entry.rs @@ -8,11 +8,11 @@ use std::fmt; #[derive(Clone, Copy, PartialEq, Eq, Debug)] pub enum SegmentState { Ready, - InMerge, + InMerge, } impl SegmentState { - pub fn letter_code(&self,) -> char { + pub fn letter_code(&self) -> char { match *self { SegmentState::InMerge => 'M', SegmentState::Ready => 'R', @@ -21,12 +21,12 @@ impl SegmentState { } -/// A segment entry describes the state of +/// A segment entry describes the state of /// a given segment, at a given instant. 
/// /// In addition to segment meta, /// it contains a few transient states -/// - state expresses whether the segment is already in the +/// - state expresses whether the segment is already in the /// middle of a merge /// - delete_bitset is a bitset describing /// documents that were deleted during the commit @@ -40,16 +40,14 @@ pub struct SegmentEntry { state: SegmentState, delete_bitset: Option, delete_cursor: DeleteCursor, - } impl SegmentEntry { - - /// Create a new `SegmentEntry` - pub fn new(segment_meta: SegmentMeta, + pub fn new(segment_meta: SegmentMeta, delete_cursor: DeleteCursor, - delete_bitset: Option) -> SegmentEntry { + delete_bitset: Option) + -> SegmentEntry { SegmentEntry { meta: segment_meta, state: SegmentState::Ready, @@ -62,7 +60,7 @@ impl SegmentEntry { /// Return a reference to the segment entry deleted bitset. /// /// `DocId` in this bitset are flagged as deleted. - pub fn delete_bitset(&self,) -> Option<&BitSet> { + pub fn delete_bitset(&self) -> Option<&BitSet> { self.delete_bitset.as_ref() } @@ -77,7 +75,7 @@ impl SegmentEntry { &mut self.delete_cursor } - /// Return the `SegmentEntry`. + /// Return the `SegmentEntry`. /// /// The state describes whether the segment is available for /// a merge or not. @@ -89,7 +87,7 @@ impl SegmentEntry { pub fn segment_id(&self) -> SegmentId { self.meta.id() } - + /// Accessor to the `SegmentMeta` pub fn meta(&self) -> &SegmentMeta { @@ -99,9 +97,9 @@ impl SegmentEntry { /// Mark the `SegmentEntry` as in merge. /// - /// Only segments that are not already + /// Only segments that are not already /// in a merge are elligible for future merge. - pub fn start_merge(&mut self,) { + pub fn start_merge(&mut self) { self.state = SegmentState::InMerge; } @@ -110,14 +108,14 @@ impl SegmentEntry { /// If a merge fails, it is important to switch /// the segment back to a idle state, so that it /// may be elligible for future merges. 
- pub fn cancel_merge(&mut self,) { + pub fn cancel_merge(&mut self) { self.state = SegmentState::Ready; } /// Returns true iff a segment should /// be considered for a merge. - pub fn is_ready(&self,) -> bool { + pub fn is_ready(&self) -> bool { self.state == SegmentState::Ready } } diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index ef42303d1..2fea73b25 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -14,7 +14,7 @@ use indexer::delete_queue::DeleteCursor; struct SegmentRegisters { uncommitted: SegmentRegister, committed: SegmentRegister, - writing: HashSet, + writing: HashSet, } @@ -22,7 +22,7 @@ struct SegmentRegisters { /// The segment manager stores the list of segments /// as well as their state. /// -/// It guarantees the atomicity of the +/// It guarantees the atomicity of the /// changes (merges especially) #[derive(Default)] pub struct SegmentManager { @@ -32,43 +32,43 @@ pub struct SegmentManager { impl Debug for SegmentManager { fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { let lock = self.read(); - write!(f, "{{ uncommitted: {:?}, committed: {:?} }}", lock.uncommitted, lock.committed) + write!(f, + "{{ uncommitted: {:?}, committed: {:?} }}", + lock.uncommitted, + lock.committed) } } -pub fn get_mergeable_segments(segment_manager: &SegmentManager,) -> (Vec, Vec) { +pub fn get_mergeable_segments(segment_manager: &SegmentManager) + -> (Vec, Vec) { let registers_lock = segment_manager.read(); (registers_lock.committed.get_mergeable_segments(), registers_lock.uncommitted.get_mergeable_segments()) } impl SegmentManager { - - pub fn from_segments(segment_metas: Vec, delete_cursor: DeleteCursor) -> SegmentManager { + pub fn from_segments(segment_metas: Vec, + delete_cursor: DeleteCursor) + -> SegmentManager { SegmentManager { registers: RwLock::new(SegmentRegisters { - uncommitted: SegmentRegister::default(), - committed: SegmentRegister::new(segment_metas, delete_cursor), - 
writing: HashSet::new(), - }), + uncommitted: SegmentRegister::default(), + committed: SegmentRegister::new(segment_metas, + delete_cursor), + writing: HashSet::new(), + }), } } /// Returns all of the segment entries (committed or uncommitted) - pub fn segment_entries(&self,) -> Vec { - let mut segment_entries = self.read() - .uncommitted - .segment_entries(); - segment_entries.extend( - self.read() - .committed - .segment_entries() - ); + pub fn segment_entries(&self) -> Vec { + let mut segment_entries = self.read().uncommitted.segment_entries(); + segment_entries.extend(self.read().committed.segment_entries()); segment_entries } /// Returns the overall number of segments in the `SegmentManager` - pub fn num_segments(&self,) -> usize { + pub fn num_segments(&self) -> usize { let registers_lock = self.read(); registers_lock.committed.len() + registers_lock.uncommitted.len() } @@ -78,19 +78,14 @@ impl SegmentManager { let mut files = HashSet::new(); files.insert(META_FILEPATH.clone()); files.insert(LOCKFILE_FILEPATH.clone()); - - let segment_metas: Vec = - registers_lock.committed - .get_all_segments() - .into_iter() - .chain(registers_lock.uncommitted - .get_all_segments() - .into_iter()) - .chain(registers_lock.writing - .iter() - .cloned() - .map(SegmentMeta::new)) - .collect(); + + let segment_metas: Vec = registers_lock + .committed + .get_all_segments() + .into_iter() + .chain(registers_lock.uncommitted.get_all_segments().into_iter()) + .chain(registers_lock.writing.iter().cloned().map(SegmentMeta::new)) + .collect(); for segment_meta in segment_metas { files.extend(segment_meta.list_files()); } @@ -102,18 +97,22 @@ impl SegmentManager { registers .committed .segment_entry(segment_id) - .or_else(|| registers.uncommitted.segment_entry(segment_id)) + .or_else(|| registers.uncommitted.segment_entry(segment_id)) } // Lock poisoning should never happen : // The lock is acquired and released within this class, - // and the operations cannot panic. 
- fn read(&self,) -> RwLockReadGuard { - self.registers.read().expect("Failed to acquire read lock on SegmentManager.") + // and the operations cannot panic. + fn read(&self) -> RwLockReadGuard { + self.registers + .read() + .expect("Failed to acquire read lock on SegmentManager.") } - fn write(&self,) -> RwLockWriteGuard { - self.registers.write().expect("Failed to acquire write lock on SegmentManager.") + fn write(&self) -> RwLockWriteGuard { + self.registers + .write() + .expect("Failed to acquire write lock on SegmentManager.") } pub fn commit(&self, segment_entries: Vec) { @@ -124,42 +123,42 @@ impl SegmentManager { registers_lock.committed.add_segment_entry(segment_entry); } } - + pub fn start_merge(&self, segment_ids: &[SegmentId]) { let mut registers_lock = self.write(); if registers_lock.uncommitted.contains_all(segment_ids) { for segment_id in segment_ids { registers_lock.uncommitted.start_merge(segment_id); } - } - else if registers_lock.committed.contains_all(segment_ids) { + } else if registers_lock.committed.contains_all(segment_ids) { for segment_id in segment_ids { registers_lock.committed.start_merge(segment_id); } - } - else { + } else { error!("Merge operation sent for segments that are not all uncommited or commited."); } } pub fn cancel_merge(&self, - before_merge_segment_ids: &[SegmentId], - after_merge_segment_id: SegmentId) { - + before_merge_segment_ids: &[SegmentId], + after_merge_segment_id: SegmentId) { + let mut registers_lock = self.write(); - + // we mark all segments are ready for merge. 
{ let target_segment_register: &mut SegmentRegister; target_segment_register = { - if registers_lock.uncommitted.contains_all(&before_merge_segment_ids) { + if registers_lock + .uncommitted + .contains_all(&before_merge_segment_ids) { &mut registers_lock.uncommitted - } - else if registers_lock.committed.contains_all(&before_merge_segment_ids) { + } else if registers_lock + .committed + .contains_all(&before_merge_segment_ids) { &mut registers_lock.committed - } - else { + } else { warn!("couldn't find segment in SegmentManager"); return; } @@ -185,19 +184,24 @@ impl SegmentManager { registers_lock.writing.remove(&segment_entry.segment_id()); registers_lock.uncommitted.add_segment_entry(segment_entry); } - + pub fn end_merge(&self, - before_merge_segment_ids: &[SegmentId], - after_merge_segment_entry: SegmentEntry) { - + before_merge_segment_ids: &[SegmentId], + after_merge_segment_entry: SegmentEntry) { + let mut registers_lock = self.write(); - registers_lock.writing.remove(&after_merge_segment_entry.segment_id()); - + registers_lock + .writing + .remove(&after_merge_segment_entry.segment_id()); + let mut target_register: &mut SegmentRegister = { - if registers_lock.uncommitted.contains_all(&before_merge_segment_ids) { + if registers_lock + .uncommitted + .contains_all(&before_merge_segment_ids) { &mut registers_lock.uncommitted - } - else if registers_lock.committed.contains_all(&before_merge_segment_ids) { + } else if registers_lock + .committed + .contains_all(&before_merge_segment_ids) { &mut registers_lock.committed } else { warn!("couldn't find segment in SegmentManager"); @@ -209,12 +213,12 @@ impl SegmentManager { } target_register.add_segment_entry(after_merge_segment_entry); - - + + } - pub fn committed_segment_metas(&self,) -> Vec { + pub fn committed_segment_metas(&self) -> Vec { let registers_lock = self.read(); registers_lock.committed.segment_metas() } diff --git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index 
f8f7c7d64..9b8fc8427 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -9,14 +9,14 @@ use indexer::delete_queue::DeleteCursor; /// The segment register keeps track /// of the list of segment, their size as well /// as the state they are in. -/// -/// It is consumed by indexes to get the list of +/// +/// It is consumed by indexes to get the list of /// segments that are currently searchable, -/// and by the index merger to identify +/// and by the index merger to identify /// merge candidates. #[derive(Default)] pub struct SegmentRegister { - segment_states: HashMap, + segment_states: HashMap, } @@ -33,8 +33,7 @@ impl Debug for SegmentRegister { impl SegmentRegister { - - pub fn clear(&mut self,) { + pub fn clear(&mut self) { self.segment_states.clear(); } @@ -42,29 +41,26 @@ impl SegmentRegister { self.segment_states.len() } - pub fn get_all_segments(&self,) -> Vec { + pub fn get_all_segments(&self) -> Vec { self.segment_states .values() .map(|segment_entry| segment_entry.meta().clone()) .collect() } - - pub fn get_mergeable_segments(&self,) -> Vec { + + pub fn get_mergeable_segments(&self) -> Vec { self.segment_states .values() .filter(|segment_entry| segment_entry.is_ready()) .map(|segment_entry| segment_entry.meta().clone()) .collect() } - - pub fn segment_entries(&self,) -> Vec { - self.segment_states - .values() - .cloned() - .collect() + + pub fn segment_entries(&self) -> Vec { + self.segment_states.values().cloned().collect() } - - pub fn segment_metas(&self,) -> Vec { + + pub fn segment_metas(&self) -> Vec { let mut segment_ids: Vec = self.segment_states .values() .map(|segment_entry| segment_entry.meta().clone()) @@ -72,28 +68,28 @@ impl SegmentRegister { segment_ids.sort_by_key(|meta| meta.id()); segment_ids } - + pub fn segment_entry(&self, segment_id: &SegmentId) -> Option { self.segment_states .get(&segment_id) .map(|segment_entry| segment_entry.clone()) } - + pub fn contains_all(&mut self, segment_ids: 
&[SegmentId]) -> bool { segment_ids .iter() .all(|segment_id| self.segment_states.contains_key(segment_id)) } - + pub fn add_segment_entry(&mut self, segment_entry: SegmentEntry) { let segment_id = segment_entry.segment_id(); self.segment_states.insert(segment_id, segment_entry); } - + pub fn remove_segment(&mut self, segment_id: &SegmentId) { self.segment_states.remove(segment_id); - } - + } + pub fn cancel_merge(&mut self, segment_id: &SegmentId) { self.segment_states .get_mut(segment_id) @@ -106,21 +102,16 @@ impl SegmentRegister { .get_mut(segment_id) .expect("Received a merge notification for a segment that is not registered") .start_merge(); - } - + } + pub fn new(segment_metas: Vec, delete_cursor: DeleteCursor) -> SegmentRegister { let mut segment_states = HashMap::new(); for segment_meta in segment_metas { let segment_id = segment_meta.id(); - let segment_entry = SegmentEntry::new( - segment_meta, - delete_cursor.clone(), - None); + let segment_entry = SegmentEntry::new(segment_meta, delete_cursor.clone(), None); segment_states.insert(segment_id, segment_entry); } - SegmentRegister { - segment_states: segment_states - } + SegmentRegister { segment_states: segment_states } } } @@ -140,7 +131,7 @@ mod tests { .map(|segment_meta| segment_meta.id()) .collect() } - + #[test] fn test_segment_register() { let delete_queue = DeleteQueue::new(); @@ -149,32 +140,48 @@ mod tests { let segment_id_a = SegmentId::generate_random(); let segment_id_b = SegmentId::generate_random(); let segment_id_merged = SegmentId::generate_random(); - + { let segment_meta = SegmentMeta::new(segment_id_a); - let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None); + let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None); segment_register.add_segment_entry(segment_entry); } - assert_eq!(segment_register.segment_entry(&segment_id_a).unwrap().state(), SegmentState::Ready); - assert_eq!(segment_ids(&segment_register), vec!(segment_id_a)); + 
assert_eq!(segment_register + .segment_entry(&segment_id_a) + .unwrap() + .state(), + SegmentState::Ready); + assert_eq!(segment_ids(&segment_register), vec![segment_id_a]); { let segment_meta = SegmentMeta::new(segment_id_b); - let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None); + let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None); segment_register.add_segment_entry(segment_entry); } - assert_eq!(segment_register.segment_entry(&segment_id_b).unwrap().state(), SegmentState::Ready); + assert_eq!(segment_register + .segment_entry(&segment_id_b) + .unwrap() + .state(), + SegmentState::Ready); segment_register.start_merge(&segment_id_a); segment_register.start_merge(&segment_id_b); - assert_eq!(segment_register.segment_entry(&segment_id_a).unwrap().state(), SegmentState::InMerge); - assert_eq!(segment_register.segment_entry(&segment_id_b).unwrap().state(), SegmentState::InMerge); + assert_eq!(segment_register + .segment_entry(&segment_id_a) + .unwrap() + .state(), + SegmentState::InMerge); + assert_eq!(segment_register + .segment_entry(&segment_id_b) + .unwrap() + .state(), + SegmentState::InMerge); segment_register.remove_segment(&segment_id_a); segment_register.remove_segment(&segment_id_b); { let segment_meta_merged = SegmentMeta::new(segment_id_merged); - let segment_entry = SegmentEntry::new(segment_meta_merged, delete_queue.cursor(), None); - segment_register.add_segment_entry(segment_entry); + let segment_entry = SegmentEntry::new(segment_meta_merged, delete_queue.cursor(), None); + segment_register.add_segment_entry(segment_entry); } - assert_eq!(segment_ids(&segment_register), vec!(segment_id_merged)); + assert_eq!(segment_ids(&segment_register), vec![segment_id_merged]); } - -} \ No newline at end of file + +} diff --git a/src/indexer/segment_serializer.rs b/src/indexer/segment_serializer.rs index e84a24c33..7d800f708 100644 --- a/src/indexer/segment_serializer.rs +++ 
b/src/indexer/segment_serializer.rs @@ -29,11 +29,11 @@ impl SegmentSerializer { let postings_serializer = try!(PostingsSerializer::open(segment)); Ok(SegmentSerializer { - postings_serializer: postings_serializer, - store_writer: StoreWriter::new(store_write), - fast_field_serializer: fast_field_serializer, - fieldnorms_serializer: fieldnorms_serializer, - }) + postings_serializer: postings_serializer, + store_writer: StoreWriter::new(store_write), + fast_field_serializer: fast_field_serializer, + fieldnorms_serializer: fieldnorms_serializer, + }) } /// Accessor to the `PostingsSerializer`. diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 2428026b2..2a7108ff7 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -49,11 +49,8 @@ use super::segment_manager::{SegmentManager, get_mergeable_segments}; /// and flushed. /// /// This method is not part of tantivy's public API -pub fn save_new_metas(schema: Schema, - opstamp: u64, - directory: &mut Directory) - -> Result<()> { - save_metas(vec!(), schema, opstamp, directory) +pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) -> Result<()> { + save_metas(vec![], schema, opstamp, directory) } @@ -82,7 +79,7 @@ pub fn save_metas(segment_metas: Vec, let res = directory.atomic_write(&META_FILEPATH, &w[..])?; debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas)); Ok(res) - + } @@ -90,7 +87,7 @@ pub fn save_metas(segment_metas: Vec, // of the `SegmentUpdate`s. // // All this processing happens on a single thread -// consuming a common queue. +// consuming a common queue. #[derive(Clone)] pub struct SegmentUpdater(Arc); @@ -99,56 +96,56 @@ pub struct SegmentUpdater(Arc); fn perform_merge(segment_ids: &[SegmentId], segment_updater: &SegmentUpdater, mut merged_segment: Segment, - target_opstamp: u64) -> Result { + target_opstamp: u64) + -> Result { // first we need to apply deletes to our segment. 
info!("Start merge: {:?}", segment_ids); - + let ref index = segment_updater.0.index; let schema = index.schema(); - let mut segment_entries = vec!(); + let mut segment_entries = vec![]; - let mut file_protections: Vec = vec!(); + let mut file_protections: Vec = vec![]; for segment_id in segment_ids { - if let Some(mut segment_entry) = segment_updater.0 - .segment_manager - .segment_entry(segment_id) { + if let Some(mut segment_entry) = + segment_updater.0.segment_manager.segment_entry(segment_id) { let segment = index.segment(segment_entry.meta().clone()); - if let Some(file_protection) = advance_deletes(segment, &mut segment_entry, target_opstamp)? { + if let Some(file_protection) = + advance_deletes(segment, &mut segment_entry, target_opstamp)? { file_protections.push(file_protection); } segment_entries.push(segment_entry); - } - else { + } else { error!("Error, had to abort merge as some of the segment is not managed anymore.a"); - return Err(Error::InvalidArgument(format!("Segment {:?} requested for merge is not managed.", segment_id))); + return Err(Error::InvalidArgument(format!("Segment {:?} requested for merge is not managed.", + segment_id))); } } - + let delete_cursor = segment_entries[0].delete_cursor().clone(); let segments: Vec = segment_entries .iter() - .map(|segment_entry| { - index.segment(segment_entry.meta().clone()) - }) + .map(|segment_entry| index.segment(segment_entry.meta().clone())) .collect(); - - + + // An IndexMerger is like a "view" of our merged segments. let merger: IndexMerger = IndexMerger::open(schema, &segments[..])?; // ... we just serialize this index merger in our new segment // to merge the two segments. 
- let segment_serializer = - SegmentSerializer::for_segment(&mut merged_segment) + let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment) .expect("Creating index serializer failed"); - let num_docs = merger.write(segment_serializer).expect("Serializing merged index failed"); + let num_docs = merger + .write(segment_serializer) + .expect("Serializing merged index failed"); let mut segment_meta = SegmentMeta::new(merged_segment.id()); segment_meta.set_max_doc(num_docs); - + let after_merge_segment_entry = SegmentEntry::new(segment_meta.clone(), delete_cursor, None); Ok(after_merge_segment_entry) } @@ -162,30 +159,28 @@ struct InnerSegmentUpdater { merging_thread_id: AtomicUsize, merging_threads: RwLock>>>, generation: AtomicUsize, - killed: AtomicBool, + killed: AtomicBool, stamper: Stamper, } impl SegmentUpdater { - pub fn new(index: Index, stamper: Stamper, - delete_cursor: DeleteCursor) -> Result { + delete_cursor: DeleteCursor) + -> Result { let segments = index.searchable_segment_metas()?; let segment_manager = SegmentManager::from_segments(segments, delete_cursor); - Ok( - SegmentUpdater(Arc::new(InnerSegmentUpdater { - pool: CpuPool::new(1), - index: index, - segment_manager: segment_manager, - merge_policy: RwLock::new(box DefaultMergePolicy::default()), - merging_thread_id: AtomicUsize::default(), - merging_threads: RwLock::new(HashMap::new()), - generation: AtomicUsize::default(), - killed: AtomicBool::new(false), - stamper: stamper, - })) - ) + Ok(SegmentUpdater(Arc::new(InnerSegmentUpdater { + pool: CpuPool::new(1), + index: index, + segment_manager: segment_manager, + merge_policy: RwLock::new(box DefaultMergePolicy::default()), + merging_thread_id: AtomicUsize::default(), + merging_threads: RwLock::new(HashMap::new()), + generation: AtomicUsize::default(), + killed: AtomicBool::new(false), + stamper: stamper, + }))) } pub fn new_segment(&self) -> Segment { @@ -200,40 +195,41 @@ impl SegmentUpdater { } pub fn set_merge_policy(&self, 
merge_policy: Box) { - *self.0.merge_policy.write().unwrap()= merge_policy; + *self.0.merge_policy.write().unwrap() = merge_policy; } fn get_merging_thread_id(&self) -> usize { self.0.merging_thread_id.fetch_add(1, Ordering::SeqCst) } - fn run_async T>(&self, f: F) -> CpuFuture { + fn run_async T> + (&self, + f: F) + -> CpuFuture { let me_clone = self.clone(); - self.0.pool.spawn_fn(move || { - Ok(f(me_clone)) - }) + self.0.pool.spawn_fn(move || Ok(f(me_clone))) } pub fn add_segment(&self, generation: usize, segment_entry: SegmentEntry) -> bool { if generation >= self.0.generation.load(Ordering::Acquire) { self.run_async(|segment_updater| { - segment_updater.0.segment_manager.add_segment(segment_entry); - segment_updater.consider_merge_options(); - true - }).forget(); + segment_updater.0.segment_manager.add_segment(segment_entry); + segment_updater.consider_merge_options(); + true + }) + .forget(); true - } - else { + } else { false } } - pub fn kill(&mut self,) { + pub fn kill(&mut self) { self.0.killed.store(true, Ordering::Release); } - fn is_alive(&self,) -> bool { + fn is_alive(&self) -> bool { !self.0.killed.load(Ordering::Acquire) } @@ -243,77 +239,80 @@ impl SegmentUpdater { /// Tne method returns copies of the segment entries, /// updated with the delete information. 
fn purge_deletes(&self, target_opstamp: u64) -> Result> { - let mut segment_entries = self.0.segment_manager.segment_entries(); + let mut segment_entries = self.0.segment_manager.segment_entries(); for segment_entry in &mut segment_entries { let segment = self.0.index.segment(segment_entry.meta().clone()); advance_deletes(segment, segment_entry, target_opstamp)?; } Ok(segment_entries) - + } pub fn save_metas(&self, opstamp: u64) { if self.is_alive() { let index = &self.0.index; let directory = index.directory(); - save_metas( - self.0.segment_manager.committed_segment_metas(), - index.schema(), - opstamp, - directory.box_clone().borrow_mut()).expect("Could not save metas."); + save_metas(self.0.segment_manager.committed_segment_metas(), + index.schema(), + opstamp, + directory.box_clone().borrow_mut()) + .expect("Could not save metas."); } } pub fn garbage_collect_files(&self) -> Result<()> { - self.run_async(move |segment_updater| { - segment_updater.garbage_collect_files_exec(); - }).wait() + self.run_async(move |segment_updater| { segment_updater.garbage_collect_files_exec(); }) + .wait() } - fn garbage_collect_files_exec(&self) { + fn garbage_collect_files_exec(&self) { let living_files = self.0.segment_manager.list_files(); let mut index = self.0.index.clone(); index.directory_mut().garbage_collect(living_files); } pub fn commit(&self, opstamp: u64) -> Result<()> { - self.run_async(move |segment_updater| { - if segment_updater.is_alive() { - let segment_entries = segment_updater - .purge_deletes(opstamp) - .expect("Failed purge deletes"); - segment_updater.0.segment_manager.commit(segment_entries); - segment_updater.save_metas(opstamp); - segment_updater.garbage_collect_files_exec(); - segment_updater.consider_merge_options(); - } - }).wait() + self.run_async(move |segment_updater| if segment_updater.is_alive() { + let segment_entries = segment_updater + .purge_deletes(opstamp) + .expect("Failed purge deletes"); + 
segment_updater.0.segment_manager.commit(segment_entries); + segment_updater.save_metas(opstamp); + segment_updater.garbage_collect_files_exec(); + segment_updater.consider_merge_options(); + }) + .wait() } - pub fn start_merge(&self, segment_ids: &[SegmentId]) -> impl Future { - + pub fn start_merge(&self, + segment_ids: &[SegmentId]) + -> impl Future { + self.0.segment_manager.start_merge(segment_ids); let segment_updater_clone = self.clone(); - - let segment_ids_vec = segment_ids.to_vec(); - + + let segment_ids_vec = segment_ids.to_vec(); + let merging_thread_id = self.get_merging_thread_id(); let (merging_future_send, merging_future_recv) = oneshot(); - + if segment_ids.is_empty() { return merging_future_recv; } - + let target_opstamp = self.0.stamper.stamp(); let merging_join_handle = thread::spawn(move || { - + // first we need to apply deletes to our segment. info!("Start merge: {:?}", segment_ids_vec); - - let merged_segment = segment_updater_clone.new_segment(); + + let merged_segment = segment_updater_clone.new_segment(); let merged_segment_id = merged_segment.id(); - let merge_result = perform_merge(&segment_ids_vec, &segment_updater_clone, merged_segment, target_opstamp); + let merge_result = perform_merge(&segment_ids_vec, + &segment_updater_clone, + merged_segment, + target_opstamp); match merge_result { Ok(after_merge_segment_entry) => { @@ -321,11 +320,11 @@ impl SegmentUpdater { segment_updater_clone .end_merge(segment_ids_vec, after_merge_segment_entry) .expect("Segment updater thread is corrupted."); - - // the future may fail if the listener of the oneshot future + + // the future may fail if the listener of the oneshot future // has been destroyed. // - // This is not a problem here, so we just ignore any + // This is not a problem here, so we just ignore any // possible error. 
let _merging_future_res = merging_future_send.send(merged_segment_meta); } @@ -339,16 +338,26 @@ impl SegmentUpdater { // merging_future_send will be dropped, sending an error to the future. } } - segment_updater_clone.0.merging_threads.write().unwrap().remove(&merging_thread_id); + segment_updater_clone + .0 + .merging_threads + .write() + .unwrap() + .remove(&merging_thread_id); Ok(()) }); - self.0.merging_threads.write().unwrap().insert(merging_thread_id, merging_join_handle); + self.0 + .merging_threads + .write() + .unwrap() + .insert(merging_thread_id, merging_join_handle); merging_future_recv } fn consider_merge_options(&self) { - let (committed_segments, uncommitted_segments) = get_mergeable_segments(&self.0.segment_manager); + let (committed_segments, uncommitted_segments) = + get_mergeable_segments(&self.0.segment_manager); // Committed segments cannot be merged with uncommitted_segments. // We therefore consider merges using these two sets of segments independently. let merge_policy = self.get_merge_policy(); @@ -360,17 +369,20 @@ impl SegmentUpdater { } } - fn cancel_merge(&self, - before_merge_segment_ids: &[SegmentId], - after_merge_segment_entry: SegmentId) { - self.0.segment_manager.cancel_merge(&before_merge_segment_ids, after_merge_segment_entry); + fn cancel_merge(&self, + before_merge_segment_ids: &[SegmentId], + after_merge_segment_entry: SegmentId) { + self.0 + .segment_manager + .cancel_merge(&before_merge_segment_ids, after_merge_segment_entry); } - - fn end_merge(&self, - before_merge_segment_ids: Vec, - mut after_merge_segment_entry: SegmentEntry) -> Result<()> { - + + fn end_merge(&self, + before_merge_segment_ids: Vec, + mut after_merge_segment_entry: SegmentEntry) + -> Result<()> { + self.run_async(move |segment_updater| { debug!("End merge {:?}", after_merge_segment_entry.meta()); let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone(); @@ -406,22 +418,22 @@ impl SegmentUpdater { /// /// Upon termination of the 
current merging threads, /// merge opportunity may appear. - // + // /// We keep waiting until the merge policy judges that /// no opportunity is available. /// - /// Note that it is not required to call this + /// Note that it is not required to call this /// method in your application. - /// Terminating your application without letting + /// Terminating your application without letting /// merge terminate is perfectly safe. - /// + /// /// Obsolete files will eventually be cleaned up /// by the directory garbage collector. pub fn wait_merging_thread(&self) -> Result<()> { let mut num_segments: usize; loop { - + num_segments = self.0.segment_manager.num_segments(); let mut new_merging_threads = HashMap::new(); @@ -434,9 +446,7 @@ impl SegmentUpdater { merging_thread_handle .join() .map(|_| ()) - .map_err(|_| { - Error::ErrorInThread("Merging thread failed.".to_string()) - })? + .map_err(|_| Error::ErrorInThread("Merging thread failed.".to_string()))? } // Our merging thread may have queued their completed self.run_async(move |_| {}).wait()?; @@ -446,10 +456,9 @@ impl SegmentUpdater { if new_num_segments >= num_segments { break; } - } + } Ok(()) } - } @@ -469,7 +478,7 @@ mod tests { let schema = schema_builder.build(); let index = Index::create_in_ram(schema); - + // writing the segment let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); index_writer.set_merge_policy(box MergeWheneverPossible); @@ -481,7 +490,7 @@ mod tests { } assert!(index_writer.commit().is_ok()); } - + { for _ in 0..100 { index_writer.add_document(doc!(text_field=>"c")); @@ -489,7 +498,7 @@ mod tests { } assert!(index_writer.commit().is_ok()); } - + { index_writer.add_document(doc!(text_field=>"e")); index_writer.add_document(doc!(text_field=>"f")); @@ -506,8 +515,9 @@ mod tests { assert_eq!(index.searcher().num_docs(), 302); { - index_writer.wait_merging_threads() - .expect( "waiting for merging threads"); + index_writer + .wait_merging_threads() + .expect("waiting 
for merging threads"); } index.load_searchers().unwrap(); diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index fcba8ed54..c56b1131c 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -1,7 +1,7 @@ use Result; use DocId; use std::io; -use schema::Schema; +use schema::Schema; use schema::Term; use core::Segment; use core::SerializableSegment; @@ -22,220 +22,219 @@ use super::operation::AddOperation; /// A `SegmentWriter` is in charge of creating segment index from a /// documents. -/// +/// /// They creates the postings list in anonymous memory. /// The segment is layed on disk when the segment gets `finalized`. pub struct SegmentWriter<'a> { - heap: &'a Heap, + heap: &'a Heap, max_doc: DocId, - per_field_postings_writers: Vec>, - segment_serializer: SegmentSerializer, - fast_field_writers: FastFieldsWriter, - fieldnorms_writer: FastFieldsWriter, - doc_opstamps: Vec, + per_field_postings_writers: Vec>, + segment_serializer: SegmentSerializer, + fast_field_writers: FastFieldsWriter, + fieldnorms_writer: FastFieldsWriter, + doc_opstamps: Vec, } fn create_fieldnorms_writer(schema: &Schema) -> FastFieldsWriter { - let u64_fields: Vec = schema.fields() - .iter() - .enumerate() - .filter(|&(_, field_entry)| field_entry.is_indexed()) - .map(|(field_id, _)| Field(field_id as u32)) - .collect(); - FastFieldsWriter::new(u64_fields) + let u64_fields: Vec = schema + .fields() + .iter() + .enumerate() + .filter(|&(_, field_entry)| field_entry.is_indexed()) + .map(|(field_id, _)| Field(field_id as u32)) + .collect(); + FastFieldsWriter::new(u64_fields) } -fn posting_from_field_entry<'a>(field_entry: &FieldEntry, heap: &'a Heap) -> Box { - match *field_entry.field_type() { - FieldType::Str(ref text_options) => { - match text_options.get_indexing_options() { - TextIndexingOptions::TokenizedWithFreq => { - SpecializedPostingsWriter::::new_boxed(heap) - } - TextIndexingOptions::TokenizedWithFreqAndPosition => { - 
SpecializedPostingsWriter::::new_boxed(heap) - } - _ => { - SpecializedPostingsWriter::::new_boxed(heap) - } - } - } - FieldType::U64(_) => { - SpecializedPostingsWriter::::new_boxed(heap) - } - FieldType::I64(_) => { - SpecializedPostingsWriter::::new_boxed(heap) - } - } +fn posting_from_field_entry<'a>(field_entry: &FieldEntry, + heap: &'a Heap) + -> Box { + match *field_entry.field_type() { + FieldType::Str(ref text_options) => { + match text_options.get_indexing_options() { + TextIndexingOptions::TokenizedWithFreq => { + SpecializedPostingsWriter::::new_boxed(heap) + } + TextIndexingOptions::TokenizedWithFreqAndPosition => { + SpecializedPostingsWriter::::new_boxed(heap) + } + _ => SpecializedPostingsWriter::::new_boxed(heap), + } + } + FieldType::U64(_) => SpecializedPostingsWriter::::new_boxed(heap), + FieldType::I64(_) => SpecializedPostingsWriter::::new_boxed(heap), + } } impl<'a> SegmentWriter<'a> { - - /// Creates a new `SegmentWriter` - /// - /// The arguments are defined as follows - /// - /// - heap: most of the segment writer data (terms, and postings lists recorders) - /// is stored in a user-defined heap object. 
This makes it possible for the user to define - /// the flushing behavior as a buffer limit - /// - segment: The segment being written - /// - schema - pub fn for_segment(heap: &'a Heap, - mut segment: Segment, - schema: &Schema) -> Result> { - let segment_serializer = try!(SegmentSerializer::for_segment(&mut segment)); - let mut per_field_postings_writers: Vec> = Vec::new(); - for field_entry in schema.fields() { - let postings_writer: Box = posting_from_field_entry(field_entry, heap); - per_field_postings_writers.push(postings_writer); - } - Ok(SegmentWriter { - heap: heap, - max_doc: 0, - per_field_postings_writers: per_field_postings_writers, - fieldnorms_writer: create_fieldnorms_writer(schema), - segment_serializer: segment_serializer, - fast_field_writers: FastFieldsWriter::from_schema(schema), - doc_opstamps: Vec::with_capacity(1_000), - }) - } - - /// Lay on disk the current content of the `SegmentWriter` - /// - /// Finalize consumes the `SegmentWriter`, so that it cannot - /// be used afterwards. - pub fn finalize(mut self) -> Result> { - for per_field_postings_writer in &mut self.per_field_postings_writers { - per_field_postings_writer.close(self.heap); - } - write(&self.per_field_postings_writers, - &self.fast_field_writers, - &self.fieldnorms_writer, - self.segment_serializer, - self.heap)?; - Ok(self.doc_opstamps) - } - - /// Returns true iff the segment writer's buffer has reached capacity. - /// - /// The limit is defined as `the user defined heap size - an arbitrary margin of 10MB` - /// The `Segment` is `finalize`d when the buffer gets full. - /// - /// Because, we cannot cut through a document, the margin is there to ensure that we rarely - /// exceeds the heap size. - pub fn is_buffer_full(&self,) -> bool { - self.heap.num_free_bytes() <= MARGIN_IN_BYTES - } - - /// Indexes a new document - /// - /// As a user, you should rather use `IndexWriter`'s add_document. 
- pub fn add_document(&mut self, add_operation: &AddOperation, schema: &Schema) -> io::Result<()> { - let doc_id = self.max_doc; - let doc = &add_operation.document; - self.doc_opstamps.push(add_operation.opstamp); - for (field, field_values) in doc.get_sorted_field_values() { - let field_posting_writer: &mut Box = &mut self.per_field_postings_writers[field.0 as usize]; - let field_options = schema.get_field_entry(field); - match *field_options.field_type() { - FieldType::Str(ref text_options) => { - let num_tokens: u32 = - if text_options.get_indexing_options().is_tokenized() { - field_posting_writer.index_text(doc_id, field, &field_values, self.heap) - } - else { - let num_field_values = field_values.len() as u32; - for field_value in field_values { - let term = Term::from_field_text(field, field_value.value().text()); - field_posting_writer.suscribe(doc_id, 0, &term, self.heap); - } - num_field_values - }; - self.fieldnorms_writer - .get_field_writer(field) - .map(|field_norms_writer| { - field_norms_writer.add_val(num_tokens as u64) - }); - } - FieldType::U64(ref int_option) => { - if int_option.is_indexed() { - for field_value in field_values { - let term = Term::from_field_u64(field_value.field(), field_value.value().u64_value()); - field_posting_writer.suscribe(doc_id, 0, &term, self.heap); - } - } - } - FieldType::I64(ref int_option) => { - if int_option.is_indexed() { - for field_value in field_values { - let term = Term::from_field_i64(field_value.field(), field_value.value().i64_value()); - field_posting_writer.suscribe(doc_id, 0, &term, self.heap); - } - } - } - } - } - self.fieldnorms_writer.fill_val_up_to(doc_id); - self.fast_field_writers.add_document(&doc); - let stored_fieldvalues: Vec<&FieldValue> = doc - .field_values() - .iter() - .filter(|field_value| schema.get_field_entry(field_value.field()).is_stored()) - .collect(); - let doc_writer = self.segment_serializer.get_store_writer(); - try!(doc_writer.store(&stored_fieldvalues)); - self.max_doc 
+= 1; - Ok(()) + /// Creates a new `SegmentWriter` + /// + /// The arguments are defined as follows + /// + /// - heap: most of the segment writer data (terms, and postings lists recorders) + /// is stored in a user-defined heap object. This makes it possible for the user to define + /// the flushing behavior as a buffer limit + /// - segment: The segment being written + /// - schema + pub fn for_segment(heap: &'a Heap, + mut segment: Segment, + schema: &Schema) + -> Result> { + let segment_serializer = try!(SegmentSerializer::for_segment(&mut segment)); + let mut per_field_postings_writers: Vec> = Vec::new(); + for field_entry in schema.fields() { + let postings_writer: Box = posting_from_field_entry(field_entry, + heap); + per_field_postings_writers.push(postings_writer); + } + Ok(SegmentWriter { + heap: heap, + max_doc: 0, + per_field_postings_writers: per_field_postings_writers, + fieldnorms_writer: create_fieldnorms_writer(schema), + segment_serializer: segment_serializer, + fast_field_writers: FastFieldsWriter::from_schema(schema), + doc_opstamps: Vec::with_capacity(1_000), + }) } - - - /// Max doc is - /// - the number of documents in the segment assuming there is no deletes - /// - the maximum document id (including deleted documents) + 1 - /// - /// Currently, **tantivy** does not handle deletes anyway, - /// so `max_doc == num_docs` - pub fn max_doc(&self,) -> u32 { - self.max_doc - } - - /// Number of documents in the index. - /// Deleted documents are not counted. - /// - /// Currently, **tantivy** does not handle deletes anyway, - /// so `max_doc == num_docs` - #[allow(dead_code)] - pub fn num_docs(&self,) -> u32 { - self.max_doc - } + /// Lay on disk the current content of the `SegmentWriter` + /// + /// Finalize consumes the `SegmentWriter`, so that it cannot + /// be used afterwards. 
+ pub fn finalize(mut self) -> Result> { + for per_field_postings_writer in &mut self.per_field_postings_writers { + per_field_postings_writer.close(self.heap); + } + write(&self.per_field_postings_writers, + &self.fast_field_writers, + &self.fieldnorms_writer, + self.segment_serializer, + self.heap)?; + Ok(self.doc_opstamps) + } + + /// Returns true iff the segment writer's buffer has reached capacity. + /// + /// The limit is defined as `the user defined heap size - an arbitrary margin of 10MB` + /// The `Segment` is `finalize`d when the buffer gets full. + /// + /// Because, we cannot cut through a document, the margin is there to ensure that we rarely + /// exceeds the heap size. + pub fn is_buffer_full(&self) -> bool { + self.heap.num_free_bytes() <= MARGIN_IN_BYTES + } + + /// Indexes a new document + /// + /// As a user, you should rather use `IndexWriter`'s add_document. + pub fn add_document(&mut self, + add_operation: &AddOperation, + schema: &Schema) + -> io::Result<()> { + let doc_id = self.max_doc; + let doc = &add_operation.document; + self.doc_opstamps.push(add_operation.opstamp); + for (field, field_values) in doc.get_sorted_field_values() { + let field_posting_writer: &mut Box = + &mut self.per_field_postings_writers[field.0 as usize]; + let field_options = schema.get_field_entry(field); + match *field_options.field_type() { + FieldType::Str(ref text_options) => { + let num_tokens: u32 = if text_options.get_indexing_options().is_tokenized() { + field_posting_writer.index_text(doc_id, field, &field_values, self.heap) + } else { + let num_field_values = field_values.len() as u32; + for field_value in field_values { + let term = Term::from_field_text(field, field_value.value().text()); + field_posting_writer.suscribe(doc_id, 0, &term, self.heap); + } + num_field_values + }; + self.fieldnorms_writer + .get_field_writer(field) + .map(|field_norms_writer| field_norms_writer.add_val(num_tokens as u64)); + } + FieldType::U64(ref int_option) => { + if 
int_option.is_indexed() { + for field_value in field_values { + let term = Term::from_field_u64(field_value.field(), + field_value.value().u64_value()); + field_posting_writer.suscribe(doc_id, 0, &term, self.heap); + } + } + } + FieldType::I64(ref int_option) => { + if int_option.is_indexed() { + for field_value in field_values { + let term = Term::from_field_i64(field_value.field(), + field_value.value().i64_value()); + field_posting_writer.suscribe(doc_id, 0, &term, self.heap); + } + } + } + } + } + self.fieldnorms_writer.fill_val_up_to(doc_id); + self.fast_field_writers.add_document(&doc); + let stored_fieldvalues: Vec<&FieldValue> = doc.field_values() + .iter() + .filter(|field_value| schema.get_field_entry(field_value.field()).is_stored()) + .collect(); + let doc_writer = self.segment_serializer.get_store_writer(); + try!(doc_writer.store(&stored_fieldvalues)); + self.max_doc += 1; + Ok(()) + } + + + /// Max doc is + /// - the number of documents in the segment assuming there is no deletes + /// - the maximum document id (including deleted documents) + 1 + /// + /// Currently, **tantivy** does not handle deletes anyway, + /// so `max_doc == num_docs` + pub fn max_doc(&self) -> u32 { + self.max_doc + } + + /// Number of documents in the index. + /// Deleted documents are not counted. 
+ /// + /// Currently, **tantivy** does not handle deletes anyway, + /// so `max_doc == num_docs` + #[allow(dead_code)] + pub fn num_docs(&self) -> u32 { + self.max_doc + } } // This method is used as a trick to workaround the borrow checker fn write<'a>(per_field_postings_writers: &[Box], - fast_field_writers: &FastFieldsWriter, - fieldnorms_writer: &FastFieldsWriter, - mut serializer: SegmentSerializer, - heap: &'a Heap,) -> Result<()> { - for per_field_postings_writer in per_field_postings_writers { - try!(per_field_postings_writer.serialize(serializer.get_postings_serializer(), heap)); - } - try!(fast_field_writers.serialize(serializer.get_fast_field_serializer())); - try!(fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer())); - try!(serializer.close()); - Ok(()) + fast_field_writers: &FastFieldsWriter, + fieldnorms_writer: &FastFieldsWriter, + mut serializer: SegmentSerializer, + heap: &'a Heap) + -> Result<()> { + for per_field_postings_writer in per_field_postings_writers { + try!(per_field_postings_writer.serialize(serializer.get_postings_serializer(), heap)); + } + try!(fast_field_writers.serialize(serializer.get_fast_field_serializer())); + try!(fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer())); + try!(serializer.close()); + Ok(()) } impl<'a> SerializableSegment for SegmentWriter<'a> { - fn write(&self, serializer: SegmentSerializer) -> Result { - let max_doc = self.max_doc; - write(&self.per_field_postings_writers, - &self.fast_field_writers, - &self.fieldnorms_writer, - serializer, - self.heap)?; - Ok(max_doc) - } + fn write(&self, serializer: SegmentSerializer) -> Result { + let max_doc = self.max_doc; + write(&self.per_field_postings_writers, + &self.fast_field_writers, + &self.fieldnorms_writer, + serializer, + self.heap)?; + Ok(max_doc) + } } diff --git a/src/indexer/stamper.rs b/src/indexer/stamper.rs index 816eb6dc4..0046caba4 100644 --- a/src/indexer/stamper.rs +++ b/src/indexer/stamper.rs @@ -6,12 +6,11 @@ use 
std::sync::Arc; pub struct Stamper(Arc); impl Stamper { - pub fn new(first_opstamp: u64) -> Stamper { Stamper(Arc::new(AtomicU64::new(first_opstamp))) } - pub fn stamp(&self,) -> u64 { + pub fn stamp(&self) -> u64 { self.0.fetch_add(1u64, Ordering::SeqCst) } -} \ No newline at end of file +} diff --git a/src/lib.rs b/src/lib.rs index 7f889691d..9d4bd9ef5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,7 +16,7 @@ //! # `tantivy` //! -//! Tantivy is a search engine library. +//! Tantivy is a search engine library. //! Think `Lucene`, but in Rust. //! //! A good place for you to get started is to check out @@ -64,8 +64,10 @@ extern crate libc; #[cfg(windows)] extern crate winapi; -#[cfg(test)] extern crate test; -#[cfg(test)] extern crate rand; +#[cfg(test)] +extern crate test; +#[cfg(test)] +extern crate rand; #[cfg(test)] @@ -137,15 +139,14 @@ pub use core::TermIterator; /// Expose the current version of tantivy, as well /// whether it was compiled with the simd compression. pub fn version() -> &'static str { - if cfg!(feature="simdcompression") { - concat!(version!(), "-simd") - } - else { - concat!(version!(), "-nosimd") + if cfg!(feature = "simdcompression") { + concat!(version!(), "-simd") + } else { + concat!(version!(), "-nosimd") } } -/// Tantivy's makes it possible to personalize when +/// Tantivy's makes it possible to personalize when /// the indexer should merge its segments pub mod merge_policy { pub use indexer::MergePolicy; @@ -167,28 +168,27 @@ pub type Score = f32; pub type SegmentLocalId = u32; impl DocAddress { - /// Return the segment ordinal. /// The segment ordinal is an id identifying the segment /// hosting the document. It is only meaningful, in the context /// of a searcher. 
- pub fn segment_ord(&self,) -> SegmentLocalId { + pub fn segment_ord(&self) -> SegmentLocalId { self.0 } - + /// Return the segment local `DocId` - pub fn doc(&self,) -> DocId { + pub fn doc(&self) -> DocId { self.1 } } -/// `DocAddress` contains all the necessary information +/// `DocAddress` contains all the necessary information /// to identify a document given a `Searcher` object. -/// -/// It consists in an id identifying its segment, and +/// +/// It consists in an id identifying its segment, and /// its segment-local `DocId`. -/// +/// /// The id used for the segment is actually an ordinal /// in the list of segment hold by a `Searcher`. #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] @@ -275,8 +275,8 @@ mod tests { assert_eq!(searcher.doc_freq(&term_d), 0); } } - - + + #[test] fn test_fieldnorm() { let mut schema_builder = SchemaBuilder::default(); @@ -319,19 +319,23 @@ mod tests { { // writing the segment let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); - { // 0 + { + // 0 let doc = doc!(text_field=>"a b"); index_writer.add_document(doc); } - { // 1 + { + // 1 let doc = doc!(text_field=>" a c"); index_writer.add_document(doc); } - { // 2 + { + // 2 let doc = doc!(text_field=>" b c"); index_writer.add_document(doc); } - { // 3 + { + // 3 let doc = doc!(text_field=>" b d"); index_writer.add_document(doc); } @@ -341,11 +345,13 @@ mod tests { { index_writer.delete_term(Term::from_field_text(text_field, "a")); } - { // 4 + { + // 4 let doc = doc!(text_field=>" b c"); index_writer.add_document(doc); } - { // 5 + { + // 5 let doc = doc!(text_field=>" a"); index_writer.add_document(doc); } @@ -355,15 +361,21 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); - assert!(reader.read_postings_all_info(&Term::from_field_text(text_field, "abcd")).is_none()); + assert!(reader + .read_postings_all_info(&Term::from_field_text(text_field, "abcd")) + 
.is_none()); { - let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "a")).unwrap(); + let mut postings = reader + .read_postings_all_info(&Term::from_field_text(text_field, "a")) + .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 5); assert!(!postings.advance()); } { - let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "b")).unwrap(); + let mut postings = reader + .read_postings_all_info(&Term::from_field_text(text_field, "b")) + .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); assert!(postings.advance()); @@ -374,11 +386,13 @@ mod tests { { // writing the segment let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); - { // 0 + { + // 0 let doc = doc!(text_field=>"a b"); index_writer.add_document(doc); } - { // 1 + { + // 1 index_writer.delete_term(Term::from_field_text(text_field, "c")); } index_writer.rollback().unwrap(); @@ -387,15 +401,21 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); - assert!(reader.read_postings_all_info(&Term::from_field_text(text_field, "abcd")).is_none()); + assert!(reader + .read_postings_all_info(&Term::from_field_text(text_field, "abcd")) + .is_none()); { - let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "a")).unwrap(); + let mut postings = reader + .read_postings_all_info(&Term::from_field_text(text_field, "a")) + .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 5); assert!(!postings.advance()); } { - let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "b")).unwrap(); + let mut postings = reader + .read_postings_all_info(&Term::from_field_text(text_field, "b")) + .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); assert!(postings.advance()); @@ -406,14 +426,14 @@ mod tests { { // writing the segment let mut index_writer = 
index.writer_with_num_threads(1, 40_000_000).unwrap(); - { + { let doc = doc!(text_field=>"a b"); index_writer.add_document(doc); } - { + { index_writer.delete_term(Term::from_field_text(text_field, "c")); } - index_writer = index_writer.rollback().unwrap(); + index_writer = index_writer.rollback().unwrap(); index_writer.delete_term(Term::from_field_text(text_field, "a")); index_writer.commit().unwrap(); } @@ -421,13 +441,19 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); - assert!(reader.read_postings_all_info(&Term::from_field_text(text_field, "abcd")).is_none()); + assert!(reader + .read_postings_all_info(&Term::from_field_text(text_field, "abcd")) + .is_none()); { - let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "a")).unwrap(); + let mut postings = reader + .read_postings_all_info(&Term::from_field_text(text_field, "a")) + .unwrap(); assert!(!postings.advance()); } { - let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "b")).unwrap(); + let mut postings = reader + .read_postings_all_info(&Term::from_field_text(text_field, "b")) + .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); assert!(postings.advance()); @@ -435,7 +461,9 @@ mod tests { assert!(!postings.advance()); } { - let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "c")).unwrap(); + let mut postings = reader + .read_postings_all_info(&Term::from_field_text(text_field, "c")) + .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 4); assert!(!postings.advance()); @@ -449,17 +477,18 @@ mod tests { let mut schema_builder = SchemaBuilder::default(); let field = schema_builder.add_u64_field("value", INT_INDEXED); let schema = schema_builder.build(); - + let index = Index::create_in_ram(schema); let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); - index_writer.add_document( 
- doc!(field=>1u64) - ); + index_writer.add_document(doc!(field=>1u64)); index_writer.commit().unwrap(); index.load_searchers().unwrap(); let searcher = index.searcher(); let term = Term::from_field_u64(field, 1u64); - let mut postings = searcher.segment_reader(0).read_postings(&term, SegmentPostingsOption::NoFreq).unwrap(); + let mut postings = searcher + .segment_reader(0) + .read_postings(&term, SegmentPostingsOption::NoFreq) + .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 0); assert!(!postings.advance()); @@ -470,20 +499,19 @@ mod tests { let mut schema_builder = SchemaBuilder::default(); let value_field = schema_builder.add_i64_field("value", INT_INDEXED); let schema = schema_builder.build(); - + let index = Index::create_in_ram(schema); let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let negative_val = -1i64; - index_writer.add_document( - doc!(value_field => negative_val) - ); + index_writer.add_document(doc!(value_field => negative_val)); index_writer.commit().unwrap(); index.load_searchers().unwrap(); let searcher = index.searcher(); let term = Term::from_field_i64(value_field, negative_val); let mut postings = searcher .segment_reader(0) - .read_postings(&term, SegmentPostingsOption::NoFreq).unwrap(); + .read_postings(&term, SegmentPostingsOption::NoFreq) + .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 0); assert!(!postings.advance()); @@ -495,15 +523,15 @@ mod tests { let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); - + // writing the segment let mut index_writer = index.writer_with_num_threads(2, 40_000_000).unwrap(); - + let add_document = |index_writer: &mut IndexWriter, val: &'static str| { let doc = doc!(text_field=>val); index_writer.add_document(doc); }; - + let remove_document = |index_writer: &mut IndexWriter, val: &'static str| { let delterm = Term::from_field_text(text_field, val); 
index_writer.delete_term(delterm); @@ -544,8 +572,12 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); - assert!(reader.read_postings_all_info(&Term::from_field_text(text_field, "abcd")).is_none()); - let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "af")).unwrap(); + assert!(reader + .read_postings_all_info(&Term::from_field_text(text_field, "abcd")) + .is_none()); + let mut postings = reader + .read_postings_all_info(&Term::from_field_text(text_field, "af")) + .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 0); assert_eq!(postings.term_freq(), 3); @@ -587,35 +619,29 @@ mod tests { collector.docs() }; { - assert_eq!( - get_doc_ids(vec!(Term::from_field_text(text_field, "a"))), - vec!(1, 2)); + assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "a")]), + vec![1, 2]); } { - assert_eq!( - get_doc_ids(vec!(Term::from_field_text(text_field, "af"))), - vec!(0)); + assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "af")]), + vec![0]); } { - assert_eq!( - get_doc_ids(vec!(Term::from_field_text(text_field, "b"))), - vec!(0, 1, 2)); + assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b")]), + vec![0, 1, 2]); } { - assert_eq!( - get_doc_ids(vec!(Term::from_field_text(text_field, "c"))), - vec!(1, 2)); + assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "c")]), + vec![1, 2]); } { - assert_eq!( - get_doc_ids(vec!(Term::from_field_text(text_field, "d"))), - vec!(2)); + assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "d")]), + vec![2]); } { - assert_eq!( - get_doc_ids(vec!(Term::from_field_text(text_field, "b"), - Term::from_field_text(text_field, "a"), )), - vec!(0, 1, 2)); + assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b"), + Term::from_field_text(text_field, "a")]), + vec![0, 1, 2]); } } } @@ -684,30 +710,35 @@ mod tests { let searcher = index.searcher(); let 
segment_reader: &SegmentReader = searcher.segment_reader(0); { - let fast_field_reader_res = segment_reader.get_fast_field_reader::(text_field); + let fast_field_reader_res = + segment_reader.get_fast_field_reader::(text_field); assert!(fast_field_reader_res.is_err()); } { - let fast_field_reader_res = segment_reader.get_fast_field_reader::(stored_int_field); + let fast_field_reader_res = + segment_reader.get_fast_field_reader::(stored_int_field); assert!(fast_field_reader_res.is_err()); } { - let fast_field_reader_res = segment_reader.get_fast_field_reader::(fast_field_signed); + let fast_field_reader_res = + segment_reader.get_fast_field_reader::(fast_field_signed); assert!(fast_field_reader_res.is_err()); } { - let fast_field_reader_res = segment_reader.get_fast_field_reader::(fast_field_signed); + let fast_field_reader_res = + segment_reader.get_fast_field_reader::(fast_field_signed); assert!(fast_field_reader_res.is_ok()); let fast_field_reader = fast_field_reader_res.unwrap(); assert_eq!(fast_field_reader.get(0), 4i64) } - + { - let fast_field_reader_res = segment_reader.get_fast_field_reader::(fast_field_signed); + let fast_field_reader_res = + segment_reader.get_fast_field_reader::(fast_field_signed); assert!(fast_field_reader_res.is_ok()); let fast_field_reader = fast_field_reader_res.unwrap(); assert_eq!(fast_field_reader.get(0), 4i64) } - + } } diff --git a/src/postings/freq_handler.rs b/src/postings/freq_handler.rs index 30efa08ac..627261696 100644 --- a/src/postings/freq_handler.rs +++ b/src/postings/freq_handler.rs @@ -123,4 +123,4 @@ impl FreqHandler { } } } -} \ No newline at end of file +} diff --git a/src/postings/mod.rs b/src/postings/mod.rs index b48811fc3..fd748fb1e 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -34,7 +34,7 @@ pub use common::HasLen; #[cfg(test)] mod tests { - + use super::*; use schema::{Document, TEXT, STRING, SchemaBuilder, Term}; use core::SegmentComponent; @@ -49,8 +49,8 @@ mod tests { use test::Bencher; use 
indexer::operation::AddOperation; use rand::{XorShiftRng, Rng, SeedableRng}; - - + + #[test] pub fn test_position_write() { let mut schema_builder = SchemaBuilder::default(); @@ -62,7 +62,7 @@ mod tests { let term = Term::from_field_text(text_field, "abc"); posting_serializer.new_term(&term).unwrap(); for doc_id in 0u32..3u32 { - let positions = vec!(1,2,3,2); + let positions = vec![1, 2, 3, 2]; posting_serializer.write_doc(doc_id, 2, &positions).unwrap(); } posting_serializer.close_term().unwrap(); @@ -70,7 +70,7 @@ mod tests { let read = segment.open_read(SegmentComponent::POSITIONS).unwrap(); assert!(read.len() <= 16); } - + #[test] pub fn test_position_and_fieldnorm() { let mut schema_builder = SchemaBuilder::default(); @@ -80,14 +80,15 @@ mod tests { let segment = index.new_segment(); let heap = Heap::with_capacity(10_000_000); { - let mut segment_writer = SegmentWriter::for_segment(&heap, segment.clone(), &schema).unwrap(); + let mut segment_writer = SegmentWriter::for_segment(&heap, segment.clone(), &schema) + .unwrap(); { let mut doc = Document::default(); doc.add_text(text_field, "a b a c a d a a."); doc.add_text(text_field, "d d d d a"); // checking that position works if the field has two values. let op = AddOperation { opstamp: 0u64, - document: doc, + document: doc, }; segment_writer.add_document(&op, &schema).unwrap(); } @@ -96,7 +97,7 @@ mod tests { doc.add_text(text_field, "b a"); let op = AddOperation { opstamp: 1u64, - document: doc, + document: doc, }; segment_writer.add_document(&op, &schema).unwrap(); } @@ -107,7 +108,7 @@ mod tests { doc.add_text(text_field, &text); let op = AddOperation { opstamp: 2u64, - document: doc, + document: doc, }; segment_writer.add_document(&op, &schema).unwrap(); } @@ -119,7 +120,7 @@ mod tests { let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field).unwrap(); assert_eq!(fieldnorm_reader.get(0), 8 + 5); assert_eq!(fieldnorm_reader.get(1), 2); - for i in 2 .. 
1000 { + for i in 2..1000 { assert_eq!(fieldnorm_reader.get(i), (i + 1) as u64); } } @@ -138,7 +139,7 @@ mod tests { assert!(postings_a.advance()); assert_eq!(postings_a.doc(), 1u32); assert_eq!(postings_a.term_freq(), 1); - for i in 2u32 .. 1000u32 { + for i in 2u32..1000u32 { assert!(postings_a.advance()); assert_eq!(postings_a.term_freq(), 1); assert_eq!(postings_a.positions(), [i]); @@ -150,7 +151,7 @@ mod tests { let term_e = Term::from_field_text(text_field, "e"); let mut postings_e = segment_reader.read_postings_all_info(&term_e).unwrap(); assert_eq!(postings_e.len(), 1000 - 2); - for i in 2u32 .. 1000u32 { + for i in 2u32..1000u32 { assert!(postings_e.advance()); assert_eq!(postings_e.term_freq(), i); let positions = postings_e.positions(); @@ -164,7 +165,7 @@ mod tests { } } } - + #[test] pub fn test_position_and_fieldnorm2() { let mut schema_builder = SchemaBuilder::default(); @@ -186,7 +187,8 @@ mod tests { assert!(index_writer.commit().is_ok()); } index.load_searchers().unwrap(); - let term_query = TermQuery::new(Term::from_field_text(text_field, "a"), SegmentPostingsOption::NoFreq); + let term_query = TermQuery::new(Term::from_field_text(text_field, "a"), + SegmentPostingsOption::NoFreq); let searcher = index.searcher(); let mut term_weight = term_query.specialized_weight(&*searcher); term_weight.segment_postings_options = SegmentPostingsOption::FreqAndPositions; @@ -196,13 +198,13 @@ mod tests { assert_eq!(term_scorer.doc(), 1u32); assert_eq!(term_scorer.postings().positions(), &[1u32, 4]); } - + #[test] fn test_intersection() { { - let left = VecPostings::from(vec!(1, 3, 9)); - let right = VecPostings::from(vec!(3, 4, 9, 18)); - let mut intersection = IntersectionDocSet::from(vec!(left, right)); + let left = VecPostings::from(vec![1, 3, 9]); + let right = VecPostings::from(vec![3, 4, 9, 18]); + let mut intersection = IntersectionDocSet::from(vec![left, right]); assert!(intersection.advance()); assert_eq!(intersection.doc(), 3); 
assert!(intersection.advance()); @@ -210,17 +212,17 @@ mod tests { assert!(!intersection.advance()); } { - let a = VecPostings::from(vec!(1, 3, 9)); - let b = VecPostings::from(vec!(3, 4, 9, 18)); - let c = VecPostings::from(vec!(1, 5, 9, 111)); - let mut intersection = IntersectionDocSet::from(vec!(a, b, c)); + let a = VecPostings::from(vec![1, 3, 9]); + let b = VecPostings::from(vec![3, 4, 9, 18]); + let c = VecPostings::from(vec![1, 5, 9, 111]); + let mut intersection = IntersectionDocSet::from(vec![a, b, c]); assert!(intersection.advance()); assert_eq!(intersection.doc(), 9); assert!(!intersection.advance()); } } - - + + lazy_static! { static ref TERM_A: Term = { let field = Field(0); @@ -266,27 +268,34 @@ mod tests { index }; } - + #[bench] fn bench_segment_postings(b: &mut Bencher) { let searcher = INDEX.searcher(); let segment_reader = searcher.segment_reader(0); - + b.iter(|| { - let mut segment_postings = segment_reader.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq).unwrap(); - while segment_postings.advance() {} - }); - } - + let mut segment_postings = segment_reader + .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) + .unwrap(); + while segment_postings.advance() {} + }); + } + #[bench] fn bench_segment_intersection(b: &mut Bencher) { let searcher = INDEX.searcher(); let segment_reader = searcher.segment_reader(0); b.iter(|| { - let segment_postings_a = segment_reader.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq).unwrap(); - let segment_postings_b = segment_reader.read_postings(&*TERM_B, SegmentPostingsOption::NoFreq).unwrap(); - let mut intersection = IntersectionDocSet::from(vec!(segment_postings_a, segment_postings_b)); + let segment_postings_a = segment_reader + .read_postings(&*TERM_A, SegmentPostingsOption::NoFreq) + .unwrap(); + let segment_postings_b = segment_reader + .read_postings(&*TERM_B, SegmentPostingsOption::NoFreq) + .unwrap(); + let mut intersection = IntersectionDocSet::from(vec![segment_postings_a, + 
segment_postings_b]); while intersection.advance() {} }); - } + } } diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 3a3219730..88e81699c 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -22,7 +22,7 @@ pub trait PostingsWriter { /// * heap - heap used to store the postings informations as well as the terms /// in the hashmap. fn suscribe(&mut self, doc: DocId, pos: u32, term: &Term, heap: &Heap); - + /// Serializes the postings on disk. /// The actual serialization format is handled by the `PostingsSerializer`. fn serialize(&self, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>; @@ -115,9 +115,7 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<' } fn serialize(&self, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()> { - let mut term_offsets: Vec<(&[u8], (u32, &Rec))> = self.term_index - .iter() - .collect(); + let mut term_offsets: Vec<(&[u8], (u32, &Rec))> = self.term_index.iter().collect(); term_offsets.sort_by_key(|&(k, _v)| k); let mut term = unsafe { Term::with_capacity(100) }; for (term_bytes, (addr, recorder)) in term_offsets { @@ -138,4 +136,4 @@ fn test_hashmap_size() { assert_eq!(hashmap_size_in_bits(0), 10); assert_eq!(hashmap_size_in_bits(100_000), 11); assert_eq!(hashmap_size_in_bits(300_000_000), 23); -} \ No newline at end of file +} diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index ff11a74e4..fc21e5555 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -26,14 +26,19 @@ impl<'a> SegmentPostings<'a> { fn load_next_block(&mut self) { let num_remaining_docs = self.len - self.cur.0; if num_remaining_docs >= NUM_DOCS_PER_BLOCK { - self.remaining_data = self.block_decoder - .uncompress_block_sorted(self.remaining_data, self.doc_offset); + self.remaining_data = + self.block_decoder + .uncompress_block_sorted(self.remaining_data, 
self.doc_offset); self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data); self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1); } else { - self.remaining_data = self.block_decoder - .uncompress_vint_sorted(self.remaining_data, self.doc_offset, num_remaining_docs); - self.freq_handler.read_freq_vint(self.remaining_data, num_remaining_docs); + self.remaining_data = + self.block_decoder + .uncompress_vint_sorted(self.remaining_data, + self.doc_offset, + num_remaining_docs); + self.freq_handler + .read_freq_vint(self.remaining_data, num_remaining_docs); } } @@ -46,7 +51,8 @@ impl<'a> SegmentPostings<'a> { pub fn from_data(len: u32, data: &'a [u8], delete_bitset: &'a DeleteBitSet, - freq_handler: FreqHandler) -> SegmentPostings<'a> { + freq_handler: FreqHandler) + -> SegmentPostings<'a> { SegmentPostings { len: len as usize, doc_offset: 0, diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index f12417f37..9941ab848 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -75,22 +75,22 @@ impl PostingsSerializer { -> Result { let terms_fst_builder = try!(FstMapBuilder::new(terms_write)); Ok(PostingsSerializer { - terms_fst_builder: terms_fst_builder, - postings_write: postings_write, - positions_write: positions_write, - written_bytes_postings: 0, - written_bytes_positions: 0, - last_doc_id_encoded: 0u32, - positions_encoder: CompositeEncoder::new(), - block_encoder: BlockEncoder::new(), - doc_ids: Vec::new(), - term_freqs: Vec::new(), - position_deltas: Vec::new(), - schema: schema, - text_indexing_options: TextIndexingOptions::Unindexed, - term_open: false, - current_term_info: TermInfo::default(), - }) + terms_fst_builder: terms_fst_builder, + postings_write: postings_write, + positions_write: positions_write, + written_bytes_postings: 0, + written_bytes_positions: 0, + last_doc_id_encoded: 0u32, + positions_encoder: CompositeEncoder::new(), + block_encoder: BlockEncoder::new(), + doc_ids: 
Vec::new(), + term_freqs: Vec::new(), + position_deltas: Vec::new(), + schema: schema, + text_indexing_options: TextIndexingOptions::Unindexed, + term_open: false, + current_term_info: TermInfo::default(), + }) } @@ -155,7 +155,8 @@ impl PostingsSerializer { pub fn close_term(&mut self) -> io::Result<()> { if self.term_open { - self.terms_fst_builder.insert_value(&self.current_term_info)?; + self.terms_fst_builder + .insert_value(&self.current_term_info)?; if !self.doc_ids.is_empty() { // we have doc ids waiting to be written @@ -165,8 +166,9 @@ impl PostingsSerializer { // In that case, the remaining part is encoded // using variable int encoding. { - let block_encoded = self.block_encoder - .compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded); + let block_encoded = + self.block_encoder + .compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded); self.written_bytes_postings += block_encoded.len(); try!(self.postings_write.write_all(block_encoded)); self.doc_ids.clear(); @@ -186,7 +188,7 @@ impl PostingsSerializer { // end of the term, at which point they are compressed and written. 
if self.text_indexing_options.is_position_enabled() { self.written_bytes_positions += try!(VInt(self.position_deltas.len() as u64) - .serialize(&mut self.positions_write)); + .serialize(&mut self.positions_write)); let positions_encoded: &[u8] = self.positions_encoder .compress_unsorted(&self.position_deltas[..]); try!(self.positions_write.write_all(positions_encoded)); @@ -224,8 +226,9 @@ impl PostingsSerializer { if self.doc_ids.len() == NUM_DOCS_PER_BLOCK { { // encode the doc ids - let block_encoded: &[u8] = self.block_encoder - .compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded); + let block_encoded: &[u8] = + self.block_encoder + .compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded); self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1]; try!(self.postings_write.write_all(block_encoded)); self.written_bytes_postings += block_encoded.len(); diff --git a/src/postings/term_info.rs b/src/postings/term_info.rs index c268ca850..fbcf9e05a 100644 --- a/src/postings/term_info.rs +++ b/src/postings/term_info.rs @@ -2,21 +2,21 @@ use common::BinarySerializable; use std::io; -/// `TermInfo` contains all of the information +/// `TermInfo` contains all of the information /// associated to terms in the `.term` file. -/// +/// /// It consists of /// * `doc_freq` : the number of document in the segment /// containing this term. It is also the length of the /// posting list associated to this term -/// * `postings_offset` : an offset in the `.idx` file +/// * `postings_offset` : an offset in the `.idx` file /// addressing the start of the posting list associated /// to this term. #[derive(Debug,Default,Ord,PartialOrd,Eq,PartialEq,Clone)] pub struct TermInfo { /// Number of documents in the segment containing the term pub doc_freq: u32, - /// Offset within the postings (`.idx`) file. + /// Offset within the postings (`.idx`) file. pub postings_offset: u32, /// Offset within the position (`.pos`) file. 
pub positions_offset: u32, @@ -25,20 +25,17 @@ pub struct TermInfo { impl BinarySerializable for TermInfo { fn serialize(&self, writer: &mut io::Write) -> io::Result { - Ok( - try!(self.doc_freq.serialize(writer)) + - try!(self.postings_offset.serialize(writer)) + - try!(self.positions_offset.serialize(writer)) - ) + Ok(try!(self.doc_freq.serialize(writer)) + try!(self.postings_offset.serialize(writer)) + + try!(self.positions_offset.serialize(writer))) } fn deserialize(reader: &mut io::Read) -> io::Result { let doc_freq = try!(u32::deserialize(reader)); let postings_offset = try!(u32::deserialize(reader)); let positions_offset = try!(u32::deserialize(reader)); Ok(TermInfo { - doc_freq: doc_freq, - postings_offset: postings_offset, - positions_offset: positions_offset, - }) + doc_freq: doc_freq, + postings_offset: postings_offset, + positions_offset: positions_offset, + }) } } diff --git a/src/query/boolean_query/boolean_query.rs b/src/query/boolean_query/boolean_query.rs index 524293fb8..caa47dba1 100644 --- a/src/query/boolean_query/boolean_query.rs +++ b/src/query/boolean_query/boolean_query.rs @@ -37,10 +37,11 @@ impl Query for BooleanQuery { } fn weight(&self, searcher: &Searcher) -> Result> { - let sub_weights = try!(self.subqueries - .iter() - .map(|&(ref _occur, ref subquery)| subquery.weight(searcher)) - .collect()); + let sub_weights = + try!(self.subqueries + .iter() + .map(|&(ref _occur, ref subquery)| subquery.weight(searcher)) + .collect()); let occurs: Vec = self.subqueries .iter() .map(|&(ref occur, ref _subquery)| *occur) @@ -54,12 +55,14 @@ impl BooleanQuery { /// Helper method to create a boolean query matching a given list of terms. /// The resulting query is a disjunction of the terms. 
pub fn new_multiterms_query(terms: Vec) -> BooleanQuery { - let occur_term_queries: Vec<(Occur, Box)> = terms.into_iter() + let occur_term_queries: Vec<(Occur, Box)> = terms + .into_iter() .map(|term| { - let term_query: Box = box TermQuery::new(term, SegmentPostingsOption::Freq); - (Occur::Should, term_query) - }) + let term_query: Box = box TermQuery::new(term, + SegmentPostingsOption::Freq); + (Occur::Should, term_query) + }) .collect(); BooleanQuery::from(occur_term_queries) } -} \ No newline at end of file +} diff --git a/src/query/boolean_query/boolean_scorer.rs b/src/query/boolean_query/boolean_scorer.rs index 46300c7a6..8e1bf5950 100644 --- a/src/query/boolean_query/boolean_scorer.rs +++ b/src/query/boolean_query/boolean_scorer.rs @@ -10,7 +10,7 @@ use query::boolean_query::ScoreCombiner; /// Each `HeapItem` represents the head of /// one of scorer being merged. /// -/// * `doc` - is the current doc id for the given segment postings +/// * `doc` - is the current doc id for the given segment postings /// * `ord` - is the ordinal used to identify to which segment postings /// this heap item belong to. 
#[derive(Eq, PartialEq)] @@ -27,8 +27,8 @@ impl PartialOrd for HeapItem { } impl Ord for HeapItem { - fn cmp(&self, other:&Self) -> Ordering { - (other.doc).cmp(&self.doc) + fn cmp(&self, other: &Self) -> Ordering { + (other.doc).cmp(&self.doc) } } @@ -41,9 +41,7 @@ pub struct BooleanScorer { } impl BooleanScorer { - - pub fn new(scorers: Vec, - occur_filter: OccurFilter) -> BooleanScorer { + pub fn new(scorers: Vec, occur_filter: OccurFilter) -> BooleanScorer { let score_combiner = ScoreCombiner::default_for_num_scorers(scorers.len()); let mut non_empty_scorers: Vec = Vec::new(); for mut posting in scorers { @@ -57,11 +55,11 @@ impl BooleanScorer { .map(|posting| posting.doc()) .enumerate() .map(|(ord, doc)| { - HeapItem { - doc: doc, - ord: ord as u32 - } - }) + HeapItem { + doc: doc, + ord: ord as u32, + } + }) .collect(); BooleanScorer { scorers: non_empty_scorers, @@ -69,20 +67,19 @@ impl BooleanScorer { doc: 0u32, score_combiner: score_combiner, occur_filter: occur_filter, - } } - + /// Advances the head of our heap (the segment posting with the lowest doc) /// It will also update the new current `DocId` as well as the term frequency /// associated with the segment postings. - /// + /// /// After advancing the `SegmentPosting`, the postings is removed from the heap /// if it has been entirely consumed, or pushed back into the heap. - /// + /// /// # Panics /// This method will panic if the head `SegmentPostings` is not empty. 
- fn advance_head(&mut self,) { + fn advance_head(&mut self) { { let mut mutable_head = self.queue.peek_mut().unwrap(); let cur_scorers = &mut self.scorers[mutable_head.ord as usize]; @@ -96,7 +93,7 @@ impl BooleanScorer { } impl DocSet for BooleanScorer { - fn advance(&mut self,) -> bool { + fn advance(&mut self) -> bool { loop { self.score_combiner.clear(); let mut ord_bitset = 0u64; @@ -106,40 +103,37 @@ impl DocSet for BooleanScorer { self.doc = heap_item.doc; let score = self.scorers[ord].score(); self.score_combiner.update(score); - ord_bitset |= 1 << ord; + ord_bitset |= 1 << ord; } None => { return false; } } self.advance_head(); - while let Some(&HeapItem {doc, ord}) = self.queue.peek() { + while let Some(&HeapItem { doc, ord }) = self.queue.peek() { if doc == self.doc { let ord = ord as usize; let score = self.scorers[ord].score(); self.score_combiner.update(score); ord_bitset |= 1 << ord; - } - else { + } else { break; } self.advance_head(); - } + } if self.occur_filter.accept(ord_bitset) { return true; } } - } - - fn doc(&self,) -> DocId { + } + + fn doc(&self) -> DocId { self.doc } } impl Scorer for BooleanScorer { - - fn score(&self,) -> f32 { + fn score(&self) -> f32 { self.score_combiner.score() } } - diff --git a/src/query/boolean_query/boolean_weight.rs b/src/query/boolean_query/boolean_weight.rs index 830f85edf..cb3f2f4f6 100644 --- a/src/query/boolean_query/boolean_weight.rs +++ b/src/query/boolean_query/boolean_weight.rs @@ -23,9 +23,9 @@ impl BooleanWeight { impl Weight for BooleanWeight { fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result> { let sub_scorers: Vec> = try!(self.weights - .iter() - .map(|weight| weight.scorer(reader)) - .collect()); + .iter() + .map(|weight| weight.scorer(reader)) + .collect()); let boolean_scorer = BooleanScorer::new(sub_scorers, self.occur_filter); Ok(box boolean_scorer) } diff --git a/src/query/boolean_query/mod.rs b/src/query/boolean_query/mod.rs index c03c0fbc4..21ea30575 100644 --- 
a/src/query/boolean_query/mod.rs +++ b/src/query/boolean_query/mod.rs @@ -11,7 +11,7 @@ pub use self::score_combiner::ScoreCombiner; #[cfg(test)] mod tests { - + use super::*; use postings::{DocSet, VecPostings}; use query::Scorer; @@ -23,12 +23,12 @@ mod tests { use collector::tests::TestCollector; use Index; use schema::*; - use fastfield::{U64FastFieldReader}; + use fastfield::U64FastFieldReader; use postings::SegmentPostingsOption; fn abs_diff(left: f32, right: f32) -> f32 { (right - left).abs() - } + } #[test] @@ -64,7 +64,8 @@ mod tests { } let make_term_query = |text: &str| { - let term_query = TermQuery::new(Term::from_field_text(text_field, text), SegmentPostingsOption::NoFreq); + let term_query = TermQuery::new(Term::from_field_text(text_field, text), + SegmentPostingsOption::NoFreq); let query: Box = box term_query; query }; @@ -78,58 +79,59 @@ mod tests { test_collector.docs() }; { - let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")) ]); - assert_eq!(matching_docs(&boolean_query), vec!(0, 1, 3)); + let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a"))]); + assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]); } { - let boolean_query = BooleanQuery::from(vec![(Occur::Should, make_term_query("a")) ]); - assert_eq!(matching_docs(&boolean_query), vec!(0, 1, 3)); + let boolean_query = BooleanQuery::from(vec![(Occur::Should, make_term_query("a"))]); + assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]); } { - let boolean_query = BooleanQuery::from(vec![(Occur::Should, make_term_query("a")), (Occur::Should, make_term_query("b"))]); - assert_eq!(matching_docs(&boolean_query), vec!(0, 1, 2, 3)); + let boolean_query = BooleanQuery::from(vec![(Occur::Should, make_term_query("a")), + (Occur::Should, make_term_query("b"))]); + assert_eq!(matching_docs(&boolean_query), vec![0, 1, 2, 3]); } { - let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")), (Occur::Should, 
make_term_query("b"))]); - assert_eq!(matching_docs(&boolean_query), vec!(0, 1, 3)); + let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")), + (Occur::Should, make_term_query("b"))]); + assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]); } { - let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")), + let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")), (Occur::Should, make_term_query("b")), - (Occur::MustNot, make_term_query("d")), - ]); - assert_eq!(matching_docs(&boolean_query), vec!(0, 1)); + (Occur::MustNot, make_term_query("d"))]); + assert_eq!(matching_docs(&boolean_query), vec![0, 1]); } { - let boolean_query = BooleanQuery::from(vec![(Occur::MustNot, make_term_query("d")),]); + let boolean_query = BooleanQuery::from(vec![(Occur::MustNot, make_term_query("d"))]); assert_eq!(matching_docs(&boolean_query), Vec::::new()); } } #[test] pub fn test_boolean_scorer() { - let occurs = vec!(Occur::Should, Occur::Should); + let occurs = vec![Occur::Should, Occur::Should]; let occur_filter = OccurFilter::new(&occurs); - - let left_fieldnorms = U64FastFieldReader::from(vec!(100,200,300)); - - let left = VecPostings::from(vec!(1, 2, 3)); + + let left_fieldnorms = U64FastFieldReader::from(vec![100, 200, 300]); + + let left = VecPostings::from(vec![1, 2, 3]); let left_scorer = TermScorer { idf: 1f32, fieldnorm_reader_opt: Some(left_fieldnorms), postings: left, }; - - let right_fieldnorms = U64FastFieldReader::from(vec!(15,25,35)); - let right = VecPostings::from(vec!(1, 3, 8)); - + + let right_fieldnorms = U64FastFieldReader::from(vec![15, 25, 35]); + let right = VecPostings::from(vec![1, 3, 8]); + let right_scorer = TermScorer { idf: 4f32, fieldnorm_reader_opt: Some(right_fieldnorms), postings: right, }; - let mut boolean_scorer = BooleanScorer::new(vec!(left_scorer, right_scorer), occur_filter); + let mut boolean_scorer = BooleanScorer::new(vec![left_scorer, right_scorer], occur_filter); 
assert_eq!(boolean_scorer.next(), Some(1u32)); assert!(abs_diff(boolean_scorer.score(), 0.8707107) < 0.001); assert_eq!(boolean_scorer.next(), Some(2u32)); @@ -139,7 +141,7 @@ mod tests { assert!(abs_diff(boolean_scorer.score(), 0.5163978) < 0.001f32); assert!(!boolean_scorer.advance()); } - - + + } diff --git a/src/query/boolean_query/score_combiner.rs b/src/query/boolean_query/score_combiner.rs index 204c57c23..c3d3e648b 100644 --- a/src/query/boolean_query/score_combiner.rs +++ b/src/query/boolean_query/score_combiner.rs @@ -7,26 +7,25 @@ pub struct ScoreCombiner { } impl ScoreCombiner { - pub fn update(&mut self, score: Score) { self.score += score; self.num_fields += 1; } - pub fn clear(&mut self,) { + pub fn clear(&mut self) { self.score = 0f32; self.num_fields = 0; } - + /// Compute the coord term - fn coord(&self,) -> f32 { + fn coord(&self) -> f32 { self.coords[self.num_fields] } - - pub fn score(&self, ) -> Score { + + pub fn score(&self) -> Score { self.score * self.coord() } - + pub fn default_for_num_scorers(num_scorers: usize) -> ScoreCombiner { let query_coords: Vec = (0..num_scorers + 1) .map(|i| (i as Score) / (num_scorers as Score)) @@ -43,4 +42,4 @@ impl From> for ScoreCombiner { score: 0f32, } } -} \ No newline at end of file +} diff --git a/src/query/mod.rs b/src/query/mod.rs index 75ef5845a..94cce7304 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -1,5 +1,5 @@ /// Query module -/// +/// /// The query module regroups all of tantivy's query objects /// diff --git a/src/query/occur_filter.rs b/src/query/occur_filter.rs index 53280fa6c..42246770d 100644 --- a/src/query/occur_filter.rs +++ b/src/query/occur_filter.rs @@ -5,21 +5,20 @@ use query::Occur; // at most 64 elements. /// /// It wraps some simple bitmask to compute the filter -/// rapidly. +/// rapidly. #[derive(Clone, Copy)] pub struct OccurFilter { and_mask: u64, - result: u64, + result: u64, } impl OccurFilter { - /// Returns true if the bitset is matching the occur list. 
pub fn accept(&self, ord_set: u64) -> bool { (self.and_mask & ord_set) == self.result } - - /// Builds an `OccurFilter` from a list of `Occur`. + + /// Builds an `OccurFilter` from a list of `Occur`. pub fn new(occurs: &[Occur]) -> OccurFilter { let mut and_mask = 0u64; let mut result = 0u64; @@ -29,16 +28,16 @@ impl OccurFilter { Occur::Must => { and_mask |= shift; result |= shift; - }, + } Occur::MustNot => { and_mask |= shift; - }, - Occur::Should => {}, + } + Occur::Should => {} } } OccurFilter { and_mask: and_mask, - result: result + result: result, } } } diff --git a/src/query/phrase_query/mod.rs b/src/query/phrase_query/mod.rs index 05765149b..1345ee2eb 100644 --- a/src/query/phrase_query/mod.rs +++ b/src/query/phrase_query/mod.rs @@ -9,7 +9,7 @@ pub use self::phrase_scorer::PhraseScorer; #[cfg(test)] mod tests { - + use super::*; use core::Index; use schema::FieldValue; @@ -18,30 +18,35 @@ mod tests { #[test] pub fn test_phrase_query() { - + let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); { let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); - { // 0 + { + // 0 let doc = doc!(text_field=>"b b b d c g c"); index_writer.add_document(doc); } - { // 1 + { + // 1 let doc = doc!(text_field=>"a b b d c g c"); index_writer.add_document(doc); } - { // 2 + { + // 2 let doc = doc!(text_field=>"a b a b c"); index_writer.add_document(doc); } - { // 3 + { + // 3 let doc = doc!(text_field=>"c a b a d ga a"); index_writer.add_document(doc); } - { // 4 + { + // 4 let doc = doc!(text_field=>"a b c"); index_writer.add_document(doc); } @@ -57,17 +62,19 @@ mod tests { .map(|text| Term::from_field_text(text_field, text)) .collect(); let phrase_query = PhraseQuery::from(terms); - searcher.search(&phrase_query, &mut test_collector).expect("search should succeed"); + searcher + .search(&phrase_query, &mut 
test_collector) + .expect("search should succeed"); test_collector.docs() }; let empty_vec = Vec::::new(); - assert_eq!(test_query(vec!("a", "b", "c")), vec!(2, 4)); - assert_eq!(test_query(vec!("a", "b")), vec!(1, 2, 3, 4)); - assert_eq!(test_query(vec!("b", "b")), vec!(0, 1)); - assert_eq!(test_query(vec!("g", "ewrwer")), empty_vec); - assert_eq!(test_query(vec!("g", "a")), empty_vec); + assert_eq!(test_query(vec!["a", "b", "c"]), vec![2, 4]); + assert_eq!(test_query(vec!["a", "b"]), vec![1, 2, 3, 4]); + assert_eq!(test_query(vec!["b", "b"]), vec![0, 1]); + assert_eq!(test_query(vec!["g", "ewrwer"]), empty_vec); + assert_eq!(test_query(vec!["g", "a"]), empty_vec); } - + } diff --git a/src/query/phrase_query/phrase_query.rs b/src/query/phrase_query/phrase_query.rs index d44bdceb0..3d23ef6b1 100644 --- a/src/query/phrase_query/phrase_query.rs +++ b/src/query/phrase_query/phrase_query.rs @@ -8,12 +8,12 @@ use Result; /// `PhraseQuery` matches a specific sequence of word. -/// For instance the phrase query for `"part time"` will match +/// For instance the phrase query for `"part time"` will match /// the sentence -/// +/// /// **Alan just got a part time job.** /// -/// On the other hand it will not match the sentence. +/// On the other hand it will not match the sentence. /// /// **This is my favorite part of the job.** /// @@ -22,12 +22,10 @@ use Result; /// #[derive(Debug)] pub struct PhraseQuery { - phrase_terms: Vec, + phrase_terms: Vec, } impl Query for PhraseQuery { - - /// Used to make it possible to cast Box /// into a specific type. This is mostly useful for unit tests. 
fn as_any(&self) -> &Any { @@ -40,15 +38,12 @@ impl Query for PhraseQuery { fn weight(&self, _searcher: &Searcher) -> Result> { Ok(box PhraseWeight::from(self.phrase_terms.clone())) } - } impl From> for PhraseQuery { fn from(phrase_terms: Vec) -> PhraseQuery { assert!(phrase_terms.len() > 1); - PhraseQuery { - phrase_terms: phrase_terms, - } + PhraseQuery { phrase_terms: phrase_terms } } } diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index d2a6a645f..6a41330e9 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -15,18 +15,16 @@ impl<'a> PhraseScorer<'a> { let mut positions_arr: Vec<&[u32]> = self.intersection_docset .docsets() .iter() - .map(|posting| { - posting.positions() - }) + .map(|posting| posting.positions()) .collect(); - + let num_postings = positions_arr.len() as u32; - + let mut ord = 1u32; - let mut pos_candidate = positions_arr[0][0]; + let mut pos_candidate = positions_arr[0][0]; positions_arr[0] = &(positions_arr[0])[1..]; let mut count_matching = 1; - + 'outer: loop { let target = pos_candidate + ord; let positions = positions_arr[ord as usize]; @@ -40,11 +38,10 @@ impl<'a> PhraseScorer<'a> { if count_matching == num_postings { return true; } - } - else if pos_i > target { + } else if pos_i > target { count_matching = 1; pos_candidate = positions[i] - ord; - positions_arr[ord as usize] = &(positions_arr[ord as usize])[(i+1)..]; + positions_arr[ord as usize] = &(positions_arr[ord as usize])[(i + 1)..]; } ord += 1; if ord == num_postings { @@ -58,7 +55,7 @@ impl<'a> PhraseScorer<'a> { } impl<'a> DocSet for PhraseScorer<'a> { - fn advance(&mut self,) -> bool { + fn advance(&mut self) -> bool { while self.intersection_docset.advance() { if self.phrase_match() { return true; @@ -67,15 +64,14 @@ impl<'a> DocSet for PhraseScorer<'a> { false } - fn doc(&self,) -> DocId { + fn doc(&self) -> DocId { self.intersection_docset.doc() } } impl<'a> Scorer for 
PhraseScorer<'a> { - fn score(&self,) -> f32 { + fn score(&self) -> f32 { 1f32 } - } diff --git a/src/query/phrase_query/phrase_weight.rs b/src/query/phrase_query/phrase_weight.rs index d2a384183..a171b4160 100644 --- a/src/query/phrase_query/phrase_weight.rs +++ b/src/query/phrase_query/phrase_weight.rs @@ -14,26 +14,22 @@ pub struct PhraseWeight { impl From> for PhraseWeight { fn from(phrase_terms: Vec) -> PhraseWeight { - PhraseWeight { - phrase_terms: phrase_terms - } + PhraseWeight { phrase_terms: phrase_terms } } } impl Weight for PhraseWeight { fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result> { let mut term_postings_list = Vec::new(); - for term in &self.phrase_terms { - let term_postings_option = reader.read_postings(term, SegmentPostingsOption::FreqAndPositions); + for term in &self.phrase_terms { + let term_postings_option = + reader.read_postings(term, SegmentPostingsOption::FreqAndPositions); if let Some(term_postings) = term_postings_option { term_postings_list.push(term_postings); - } - else { + } else { return Ok(box EmptyScorer); } } - Ok(box PhraseScorer { - intersection_docset: IntersectionDocSet::from(term_postings_list), - }) + Ok(box PhraseScorer { intersection_docset: IntersectionDocSet::from(term_postings_list) }) } } diff --git a/src/query/query.rs b/src/query/query.rs index f4bcfa3c2..ad530120f 100644 --- a/src/query/query.rs +++ b/src/query/query.rs @@ -9,36 +9,35 @@ use std::any::Any; /// Query trait are in charge of defining : -/// +/// /// - a set of documents /// - a way to score these documents /// /// When performing a [search](#method.search), these documents will then /// be pushed to a [Collector](../collector/trait.Collector.html), -/// which will in turn be in charge of deciding what to do with them. -/// +/// which will in turn be in charge of deciding what to do with them. +/// /// Concretely, this scored docset is represented by the /// [`Scorer`](./trait.Scorer.html) trait. 
/// -/// Because our index is actually split into segments, the +/// Because our index is actually split into segments, the /// query does not actually directly creates `DocSet` object. /// Instead, the query creates a [`Weight`](./trait.Weight.html) -/// object for a given searcher. -/// -/// The weight object, in turn, makes it possible to create +/// object for a given searcher. +/// +/// The weight object, in turn, makes it possible to create /// a scorer for a specific [`SegmentReader`](../struct.SegmentReader.html). -/// +/// /// So to sum it up : /// - a `Query` is recipe to define a set of documents as well the way to score them. -/// - a `Weight` is this recipe tied to a specific `Searcher`. It may for instance +/// - a `Weight` is this recipe tied to a specific `Searcher`. It may for instance /// hold statistics about the different term of the query. It is created by the query. -/// - a `Scorer` is a cursor over the set of matching documents, for a specific +/// - a `Scorer` is a cursor over the set of matching documents, for a specific /// [`SegmentReader`](../struct.SegmentReader.html). It is created by the [`Weight`](./trait.Weight.html). /// /// When implementing a new type of `Query`, it is normal to implement a /// dedicated `Query`, `Weight` and `Scorer`. pub trait Query: fmt::Debug { - /// Used to make it possible to cast Box /// into a specific type. This is mostly useful for unit tests. fn as_any(&self) -> &Any; @@ -47,24 +46,21 @@ pub trait Query: fmt::Debug { /// /// See [Weight](./trait.Weight.html). fn weight(&self, searcher: &Searcher) -> Result>; - + /// Search works as follows : /// /// First the weight object associated to the query is created. - /// + /// /// Then, the query loops over the segments and for each segment : /// - setup the collector and informs it that the segment being processed has changed. 
/// - creates a `Scorer` object associated for this segment /// - iterate throw the matched documents and push them to the collector. /// - fn search( - &self, - searcher: &Searcher, - collector: &mut Collector) -> Result { - - let mut timer_tree = TimerTree::default(); + fn search(&self, searcher: &Searcher, collector: &mut Collector) -> Result { + + let mut timer_tree = TimerTree::default(); let weight = try!(self.weight(searcher)); - + { let mut search_timer = timer_tree.open("search"); for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() { diff --git a/src/query/query_parser/logical_ast.rs b/src/query/query_parser/logical_ast.rs index 47d15d0e8..16b93eace 100644 --- a/src/query/query_parser/logical_ast.rs +++ b/src/query/query_parser/logical_ast.rs @@ -9,9 +9,9 @@ pub enum LogicalLiteral { } #[derive(Clone)] -pub enum LogicalAST{ +pub enum LogicalAST { Clause(Vec<(Occur, LogicalAST)>), - Leaf(Box) + Leaf(Box), } fn occur_letter(occur: Occur) -> &'static str { @@ -28,8 +28,7 @@ impl fmt::Debug for LogicalAST { LogicalAST::Clause(ref clause) => { if clause.is_empty() { try!(write!(formatter, "")); - } - else { + } else { let (ref occur, ref subquery) = clause[0]; try!(write!(formatter, "({}{:?}", occur_letter(*occur), subquery)); for &(ref occur, ref subquery) in &clause[1..] 
{ @@ -39,9 +38,7 @@ impl fmt::Debug for LogicalAST { } Ok(()) } - LogicalAST::Leaf(ref literal) => { - write!(formatter, "{:?}", literal) - } + LogicalAST::Leaf(ref literal) => write!(formatter, "{:?}", literal), } } } @@ -55,12 +52,8 @@ impl From for LogicalAST { impl fmt::Debug for LogicalLiteral { fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { match *self { - LogicalLiteral::Term(ref term) => { - write!(formatter, "{:?}", term) - }, - LogicalLiteral::Phrase(ref terms) => { - write!(formatter, "\"{:?}\"", terms) - } + LogicalLiteral::Term(ref term) => write!(formatter, "{:?}", term), + LogicalLiteral::Phrase(ref terms) => write!(formatter, "\"{:?}\"", terms), } } } diff --git a/src/query/query_parser/mod.rs b/src/query/query_parser/mod.rs index 147374ccc..91bc7d172 100644 --- a/src/query/query_parser/mod.rs +++ b/src/query/query_parser/mod.rs @@ -4,4 +4,4 @@ mod user_input_ast; pub mod logical_ast; pub use self::query_parser::QueryParser; -pub use self::query_parser::QueryParserError; \ No newline at end of file +pub use self::query_parser::QueryParserError; diff --git a/src/query/query_parser/query_grammar.rs b/src/query/query_parser/query_grammar.rs index 26b629050..ec45a56ee 100644 --- a/src/query/query_parser/query_grammar.rs +++ b/src/query/query_parser/query_grammar.rs @@ -11,31 +11,27 @@ fn literal(input: I) -> ParseResult phrase.or(word) }; - let negative_numbers = - (char('-'), many1(satisfy(|c: char| c.is_numeric()))) + let negative_numbers = (char('-'), many1(satisfy(|c: char| c.is_numeric()))) .map(|(s1, s2): (char, String)| format!("{}{}", s1, s2)); - - let field = - ( - letter(), - many(satisfy(|c: char| c.is_alphanumeric() || c == '_')) - ) + + let field = (letter(), many(satisfy(|c: char| c.is_alphanumeric() || c == '_'))) .map(|(s1, s2): (char, String)| format!("{}{}", s1, s2)); let term_val_with_field = negative_numbers.or(term_val()); let term_query = (field, char(':'), term_val_with_field).map(|(field_name, _, 
phrase)| { - UserInputLiteral { - field_name: Some(field_name), - phrase: phrase, - } - }); + UserInputLiteral { + field_name: + Some(field_name), + phrase: phrase, + } + }); let term_default_field = term_val().map(|phrase| { - UserInputLiteral { - field_name: None, - phrase: phrase, - } - }); + UserInputLiteral { + field_name: None, + phrase: phrase, + } + }); try(term_query) .or(term_default_field) .map(|query_literal| UserInputAST::from(query_literal)) @@ -58,13 +54,11 @@ pub fn parse_to_ast(input: I) -> ParseResult where I: Stream { sep_by(parser(leaf), spaces()) - .map(|subqueries: Vec| { - if subqueries.len() == 1 { - subqueries.into_iter().next().unwrap() - } else { - UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect()) - } - }) + .map(|subqueries: Vec| if subqueries.len() == 1 { + subqueries.into_iter().next().unwrap() + } else { + UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect()) + }) .parse_stream(input) } diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 6d76e95b1..49fa5d3cd 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -92,10 +92,10 @@ impl QueryParser { analyzer: box SimpleTokenizer, } } - + /// Set the default way to compose queries to a conjunction. /// - /// By default a , + /// By default a , pub fn set_conjunction_by_default(&mut self) { self.conjunction_by_default = true; } @@ -114,11 +114,11 @@ impl QueryParser { let logical_ast = self.parse_query_to_logical_ast(query)?; Ok(convert_to_query(logical_ast)) } - + /// Parse the user query into an AST. 
fn parse_query_to_logical_ast(&self, query: &str) -> Result { - let (user_input_ast, _remaining) = - parse_to_ast(query).map_err(|_| QueryParserError::SyntaxError)?; + let (user_input_ast, _remaining) = parse_to_ast(query) + .map_err(|_| QueryParserError::SyntaxError)?; self.compute_logical_ast(user_input_ast) } @@ -127,10 +127,10 @@ impl QueryParser { .get_field(field_name) .ok_or_else(|| QueryParserError::FieldDoesNotExist(String::from(field_name))) } - + fn compute_logical_ast(&self, - user_input_ast: UserInputAST) - -> Result { + user_input_ast: UserInputAST) + -> Result { let (occur, ast) = self.compute_logical_ast_with_occur(user_input_ast)?; if occur == Occur::MustNot { return Err(QueryParserError::AllButQueryForbidden); @@ -172,21 +172,19 @@ impl QueryParser { break; } } - } - else { + } else { terms.push(Term::from_field_text(field, phrase)); } if terms.is_empty() { return Ok(None); - } - else if terms.len() == 1 { - return Ok(Some(LogicalLiteral::Term(terms.into_iter().next().unwrap()))) + } else if terms.len() == 1 { + return Ok(Some(LogicalLiteral::Term(terms.into_iter().next().unwrap()))); } else { - return Ok(Some(LogicalLiteral::Phrase(terms))) + return Ok(Some(LogicalLiteral::Phrase(terms))); } } } - + } fn default_occur(&self) -> Occur { @@ -198,8 +196,8 @@ impl QueryParser { } fn compute_logical_ast_with_occur(&self, - user_input_ast: UserInputAST) - -> Result<(Occur, LogicalAST), QueryParserError> { + user_input_ast: UserInputAST) + -> Result<(Occur, LogicalAST), QueryParserError> { match user_input_ast { UserInputAST::Clause(sub_queries) => { let default_occur = self.default_occur(); @@ -244,17 +242,14 @@ impl QueryParser { asts.push(LogicalAST::Leaf(box ast)); } } - let result_ast = - if asts.len() == 0 { - // this should never happen - return Err(QueryParserError::SyntaxError); - } else if asts.len() == 1 { - asts[0].clone() - } else { - LogicalAST::Clause(asts.into_iter() - .map(|ast| (Occur::Should, ast)) - .collect()) - }; + let result_ast 
= if asts.len() == 0 { + // this should never happen + return Err(QueryParserError::SyntaxError); + } else if asts.len() == 1 { + asts[0].clone() + } else { + LogicalAST::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect()) + }; Ok((Occur::Should, result_ast)) } } @@ -292,7 +287,8 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box { fn convert_to_query(logical_ast: LogicalAST) -> Box { match logical_ast { LogicalAST::Clause(clause) => { - let occur_subqueries = clause.into_iter() + let occur_subqueries = clause + .into_iter() .map(|(occur, subquery)| (occur, convert_to_query(subquery))) .collect::>(); box BooleanQuery::from(occur_subqueries) @@ -311,7 +307,7 @@ mod test { use super::QueryParser; use super::QueryParserError; use super::super::logical_ast::*; - + fn make_query_parser() -> QueryParser { let mut schema_builder = SchemaBuilder::default(); let title = schema_builder.add_text_field("title", TEXT); @@ -331,7 +327,7 @@ mod test { fn parse_query_to_logical_ast(query: &str, default_conjunction: bool) -> Result { - let mut query_parser = make_query_parser(); + let mut query_parser = make_query_parser(); if default_conjunction { query_parser.set_conjunction_by_default(); } @@ -345,40 +341,33 @@ mod test { let query_str = format!("{:?}", query); assert_eq!(query_str, expected); } - - + + #[test] pub fn test_parse_query_simple() { let query_parser = make_query_parser(); - assert!(query_parser.parse_query("toto").is_ok()); + assert!(query_parser.parse_query("toto").is_ok()); } #[test] pub fn test_parse_nonindexed_field_yields_error() { let query_parser = make_query_parser(); - + let is_not_indexed_err = |query: &str| { let result: Result, QueryParserError> = query_parser.parse_query(query); if let Err(QueryParserError::FieldNotIndexed(field_name)) = result { Some(field_name.clone()) - } - else { + } else { None } }; - assert_eq!( - is_not_indexed_err("notindexed_text:titi"), - Some(String::from("notindexed_text")) - ); - 
assert_eq!( - is_not_indexed_err("notindexed_u64:23424"), - Some(String::from("notindexed_u64")) - ); - assert_eq!( - is_not_indexed_err("notindexed_i64:-234324"), - Some(String::from("notindexed_i64")) - ); + assert_eq!(is_not_indexed_err("notindexed_text:titi"), + Some(String::from("notindexed_text"))); + assert_eq!(is_not_indexed_err("notindexed_u64:23424"), + Some(String::from("notindexed_u64"))); + assert_eq!(is_not_indexed_err("notindexed_i64:-234324"), + Some(String::from("notindexed_i64"))); } @@ -392,25 +381,32 @@ mod test { #[test] pub fn test_parse_query_ints() { let query_parser = make_query_parser(); - assert!(query_parser.parse_query("signed:2324").is_ok()); - assert!(query_parser.parse_query("signed:\"22\"").is_ok()); + assert!(query_parser.parse_query("signed:2324").is_ok()); + assert!(query_parser.parse_query("signed:\"22\"").is_ok()); assert!(query_parser.parse_query("signed:\"-2234\"").is_ok()); - assert!(query_parser.parse_query("signed:\"-9999999999999\"").is_ok()); + assert!(query_parser + .parse_query("signed:\"-9999999999999\"") + .is_ok()); assert!(query_parser.parse_query("signed:\"a\"").is_err()); assert!(query_parser.parse_query("signed:\"2a\"").is_err()); - assert!(query_parser.parse_query("signed:\"18446744073709551615\"").is_err()); + assert!(query_parser + .parse_query("signed:\"18446744073709551615\"") + .is_err()); assert!(query_parser.parse_query("unsigned:\"2\"").is_ok()); assert!(query_parser.parse_query("unsigned:\"-2\"").is_err()); - assert!(query_parser.parse_query("unsigned:\"18446744073709551615\"").is_ok()); + assert!(query_parser + .parse_query("unsigned:\"18446744073709551615\"") + .is_ok()); test_parse_query_to_logical_ast_helper("unsigned:2324", "Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])", false); test_parse_query_to_logical_ast_helper("signed:-2324", - &format!("{:?}", Term::from_field_i64(Field(2u32), -2324)), + &format!("{:?}", + Term::from_field_i64(Field(2u32), -2324)), false); } - + #[test] pub fn 
test_parse_query_to_ast_disjunction() { @@ -424,7 +420,9 @@ mod test { "(+Term([0, 0, 0, 0, 116, 111, 116, 111]) -(Term([0, 0, 0, 0, 116, \ 105, 116, 105]) Term([0, 0, 0, 1, 116, 105, 116, 105])))", false); - assert_eq!(parse_query_to_logical_ast("-title:toto", false).err().unwrap(), + assert_eq!(parse_query_to_logical_ast("-title:toto", false) + .err() + .unwrap(), QueryParserError::AllButQueryForbidden); test_parse_query_to_logical_ast_helper("title:a b", "(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) Term([0, 0, 0, 1, 98])))", @@ -436,7 +434,9 @@ mod test { #[test] pub fn test_parse_query_to_ast_conjunction() { - test_parse_query_to_logical_ast_helper("title:toto", "Term([0, 0, 0, 0, 116, 111, 116, 111])", true); + test_parse_query_to_logical_ast_helper("title:toto", + "Term([0, 0, 0, 0, 116, 111, 116, 111])", + true); test_parse_query_to_logical_ast_helper("+title:toto", "Term([0, 0, 0, 0, 116, 111, 116, 111])", true); @@ -444,7 +444,9 @@ mod test { "(+Term([0, 0, 0, 0, 116, 111, 116, 111]) -(Term([0, 0, 0, 0, 116, \ 105, 116, 105]) Term([0, 0, 0, 1, 116, 105, 116, 105])))", true); - assert_eq!(parse_query_to_logical_ast("-title:toto", true).err().unwrap(), + assert_eq!(parse_query_to_logical_ast("-title:toto", true) + .err() + .unwrap(), QueryParserError::AllButQueryForbidden); test_parse_query_to_logical_ast_helper("title:a b", "(+Term([0, 0, 0, 0, 97]) +(Term([0, 0, 0, 0, 98]) Term([0, 0, 0, 1, 98])))", diff --git a/src/query/query_parser/user_input_ast.rs b/src/query/query_parser/user_input_ast.rs index 4068075b4..26c05f628 100644 --- a/src/query/query_parser/user_input_ast.rs +++ b/src/query/query_parser/user_input_ast.rs @@ -2,18 +2,14 @@ use std::fmt; pub struct UserInputLiteral { pub field_name: Option, - pub phrase: String, + pub phrase: String, } impl fmt::Debug for UserInputLiteral { fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { match self.field_name { - Some(ref field_name) => { - write!(formatter, "{}:\"{}\"", 
field_name, self.phrase) - } - None => { - write!(formatter, "\"{}\"", self.phrase) - } + Some(ref field_name) => write!(formatter, "{}:\"{}\"", field_name, self.phrase), + None => write!(formatter, "\"{}\"", self.phrase), } } } @@ -22,41 +18,33 @@ pub enum UserInputAST { Clause(Vec>), Not(Box), Must(Box), - Leaf(Box) - + Leaf(Box), } impl From for UserInputAST { fn from(literal: UserInputLiteral) -> UserInputAST { UserInputAST::Leaf(box literal) } - } +} impl fmt::Debug for UserInputAST { fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { match *self { - UserInputAST::Must(ref subquery) => { - write!(formatter, "+({:?})", subquery) - }, + UserInputAST::Must(ref subquery) => write!(formatter, "+({:?})", subquery), UserInputAST::Clause(ref subqueries) => { if subqueries.is_empty() { try!(write!(formatter, "")); - } - else { + } else { try!(write!(formatter, "{:?}", &subqueries[0])); for subquery in &subqueries[1..] { try!(write!(formatter, " {:?}", subquery)); } } Ok(()) - - }, - UserInputAST::Not(ref subquery) => { - write!(formatter, "-({:?})", subquery) - } - UserInputAST::Leaf(ref subquery) => { - write!(formatter, "{:?}", subquery) + } + UserInputAST::Not(ref subquery) => write!(formatter, "-({:?})", subquery), + UserInputAST::Leaf(ref subquery) => write!(formatter, "{:?}", subquery), } } } diff --git a/src/query/scorer.rs b/src/query/scorer.rs index ae8c33d66..4ac05cd83 100644 --- a/src/query/scorer.rs +++ b/src/query/scorer.rs @@ -5,30 +5,29 @@ use collector::Collector; use std::ops::{Deref, DerefMut}; /// Scored set of documents matching a query within a specific segment. -/// +/// /// See [Query](./trait.Query.html). pub trait Scorer: DocSet { - /// Returns the score. - /// + /// /// This method will perform a bit of computation and is not cached. - fn score(&self,) -> Score; - + fn score(&self) -> Score; + /// Consumes the complete `DocSet` and - /// push the scored documents to the collector. 
+ /// push the scored documents to the collector. fn collect(&mut self, collector: &mut Collector) { while self.advance() { collector.collect(self.doc(), self.score()); } } -} +} impl<'a> Scorer for Box { - fn score(&self,) -> Score { + fn score(&self) -> Score { self.deref().score() } - + fn collect(&mut self, collector: &mut Collector) { let scorer = self.deref_mut(); while scorer.advance() { @@ -38,22 +37,22 @@ impl<'a> Scorer for Box { } /// EmptyScorer is a dummy Scorer in which no document matches. -/// +/// /// It is useful for tests and handling edge cases. pub struct EmptyScorer; impl DocSet for EmptyScorer { - fn advance(&mut self,) -> bool { + fn advance(&mut self) -> bool { false } - fn doc(&self,) -> DocId { + fn doc(&self) -> DocId { DocId::max_value() } } impl Scorer for EmptyScorer { - fn score(&self,) -> Score { + fn score(&self) -> Score { 0f32 } } diff --git a/src/query/term_query/mod.rs b/src/query/term_query/mod.rs index 709dcfbe7..9670e73e2 100644 --- a/src/query/term_query/mod.rs +++ b/src/query/term_query/mod.rs @@ -9,7 +9,7 @@ pub use self::term_scorer::TermScorer; #[cfg(test)] mod tests { - + use postings::{DocSet, VecPostings}; use query::Scorer; use query::term_query::TermScorer; @@ -23,7 +23,7 @@ mod tests { fn abs_diff(left: f32, right: f32) -> f32 { (right - left).abs() - } + } #[test] @@ -44,7 +44,8 @@ mod tests { index.load_searchers().unwrap(); let searcher = index.searcher(); - let term_query = TermQuery::new(Term::from_field_text(text_field, "a"), SegmentPostingsOption::NoFreq); + let term_query = TermQuery::new(Term::from_field_text(text_field, "a"), + SegmentPostingsOption::NoFreq); let term_weight = term_query.weight(&searcher).unwrap(); let segment_reader = searcher.segment_reader(0); let mut term_scorer = term_weight.scorer(segment_reader).unwrap(); @@ -53,13 +54,13 @@ mod tests { assert_eq!(term_scorer.score(), 0.30685282); } - + #[test] pub fn test_term_scorer() { - let left_fieldnorms = U64FastFieldReader::from(vec!(10, 
4)); + let left_fieldnorms = U64FastFieldReader::from(vec![10, 4]); assert_eq!(left_fieldnorms.get(0), 10); assert_eq!(left_fieldnorms.get(1), 4); - let left = VecPostings::from(vec!(1)); + let left = VecPostings::from(vec![1]); let mut left_scorer = TermScorer { idf: 0.30685282, fieldnorm_reader_opt: Some(left_fieldnorms), @@ -69,4 +70,4 @@ mod tests { assert!(abs_diff(left_scorer.score(), 0.15342641) < 0.001f32); } -} \ No newline at end of file +} diff --git a/src/query/term_query/term_query.rs b/src/query/term_query/term_query.rs index 05115a9b2..330138edc 100644 --- a/src/query/term_query/term_query.rs +++ b/src/query/term_query/term_query.rs @@ -13,7 +13,7 @@ use std::any::Any; /// The score associated is defined as /// `idf` * sqrt(`term_freq` / `field norm`) /// in which : -/// * idf - inverse document frequency. +/// * idf - inverse document frequency. /// * term_freq - number of occurrences of the term in the field /// * field norm - number of tokens in the field. #[derive(Debug)] @@ -31,9 +31,9 @@ impl TermQuery { } } - + /// Returns a weight object. - /// + /// /// While `.weight(...)` returns a boxed trait object, /// this method return a specific implementation. /// This is useful for optimization purpose. 
@@ -55,5 +55,4 @@ impl Query for TermQuery { fn weight(&self, searcher: &Searcher) -> Result> { Ok(box self.specialized_weight(searcher)) } - } diff --git a/src/query/term_query/term_scorer.rs b/src/query/term_query/term_scorer.rs index 6294a8744..0819aeb58 100644 --- a/src/query/term_query/term_scorer.rs +++ b/src/query/term_query/term_scorer.rs @@ -6,41 +6,46 @@ use query::Scorer; use postings::Postings; use fastfield::FastFieldReader; -pub struct TermScorer where TPostings: Postings { +pub struct TermScorer + where TPostings: Postings +{ pub idf: Score, pub fieldnorm_reader_opt: Option, pub postings: TPostings, } -impl TermScorer where TPostings: Postings { +impl TermScorer + where TPostings: Postings +{ pub fn postings(&self) -> &TPostings { &self.postings } } -impl DocSet for TermScorer where TPostings: Postings { - fn advance(&mut self,) -> bool { +impl DocSet for TermScorer + where TPostings: Postings +{ + fn advance(&mut self) -> bool { self.postings.advance() } - - fn doc(&self,) -> DocId { + + fn doc(&self) -> DocId { self.postings.doc() } } -impl Scorer for TermScorer where TPostings: Postings { - fn score(&self,) -> Score { +impl Scorer for TermScorer + where TPostings: Postings +{ + fn score(&self) -> Score { let doc = self.postings.doc(); let tf = match self.fieldnorm_reader_opt { Some(ref fieldnorm_reader) => { let field_norm = fieldnorm_reader.get(doc); (self.postings.term_freq() as f32 / field_norm as f32) } - None => { - self.postings.term_freq() as f32 - } + None => self.postings.term_freq() as f32, }; self.idf * tf.sqrt() - } + } } - diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index 48a000755..e781ebdbd 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -16,40 +16,35 @@ pub struct TermWeight { impl Weight for TermWeight { - fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result> { let specialized_scorer = try!(self.specialized_scorer(reader)); Ok(box 
specialized_scorer) } - } impl TermWeight { - fn idf(&self) -> f32 { 1.0 + (self.num_docs as f32 / (self.doc_freq as f32 + 1.0)).ln() } - pub fn specialized_scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result>> { + pub fn specialized_scorer<'a>(&'a self, + reader: &'a SegmentReader) + -> Result>> { let field = self.term.field(); let fieldnorm_reader_opt = reader.get_fieldnorms_reader(field); - Ok( - reader - .read_postings(&self.term, self.segment_postings_options) - .map(|segment_postings| - TermScorer { - idf: self.idf(), - fieldnorm_reader_opt: fieldnorm_reader_opt, - postings: segment_postings, - } - ) - .unwrap_or( - TermScorer { - idf: 1f32, - fieldnorm_reader_opt: None, - postings: SegmentPostings::empty() + Ok(reader + .read_postings(&self.term, self.segment_postings_options) + .map(|segment_postings| { + TermScorer { + idf: self.idf(), + fieldnorm_reader_opt: fieldnorm_reader_opt, + postings: segment_postings, + } }) - ) + .unwrap_or(TermScorer { + idf: 1f32, + fieldnorm_reader_opt: None, + postings: SegmentPostings::empty(), + })) } - -} \ No newline at end of file +} diff --git a/src/query/weight.rs b/src/query/weight.rs index db583a3e4..0164eb797 100644 --- a/src/query/weight.rs +++ b/src/query/weight.rs @@ -8,9 +8,7 @@ use core::SegmentReader; /// /// See [Query](./trait.Query.html). pub trait Weight { - /// Returns the scorer for the given segment. /// See [Query](./trait.Query.html). fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result>; - } diff --git a/src/schema/document.rs b/src/schema/document.rs index 6496a1889..0f89266dc 100644 --- a/src/schema/document.rs +++ b/src/schema/document.rs @@ -1,13 +1,13 @@ use super::*; -use itertools::Itertools; +use itertools::Itertools; /// Tantivy's Document is the object that can -/// be indexed and then searched for. -/// +/// be indexed and then searched for. +/// /// Documents are fundamentally a collection of unordered couple `(field, value)`. 
/// In this list, one field may appear more than once. -/// -/// +/// +/// /// Documents are really just a list of couple `(field, value)`. /// In this list, one field may appear more than once. @@ -30,25 +30,24 @@ impl PartialEq for Document { impl Eq for Document {} impl Document { - /// Creates a new, empty document object pub fn new() -> Document { Document::default() } /// Returns the number of `(field, value)` pairs. - pub fn len(&self,) -> usize { + pub fn len(&self) -> usize { self.field_values.len() } - + /// Returns true iff the document contains no fields. - pub fn is_empty(&self,) -> bool { + pub fn is_empty(&self) -> bool { self.field_values.is_empty() } - + /// Add a text field. pub fn add_text(&mut self, field: Field, text: &str) { - let value = Value::Str(String::from(text)); + let value = Value::Str(String::from(text)); self.add(FieldValue::new(field, value)); } @@ -66,29 +65,27 @@ impl Document { pub fn add(&mut self, field_value: FieldValue) { self.field_values.push(field_value); } - + /// field_values accessor pub fn field_values(&self) -> &[FieldValue] { &self.field_values } - + /// Sort and groups the field_values by field. /// - /// The result of this method is not cached and is + /// The result of this method is not cached and is /// computed on the fly when this method is called. 
pub fn get_sorted_field_values(&self) -> Vec<(Field, Vec<&FieldValue>)> { - let mut field_values: Vec<&FieldValue> = self.field_values().iter().collect(); - field_values.sort_by_key(|field_value| field_value.field()); - field_values + let mut field_values: Vec<&FieldValue> = self.field_values().iter().collect(); + field_values.sort_by_key(|field_value| field_value.field()); + field_values .into_iter() .group_by(|field_value| field_value.field()) .into_iter() - .map(|(key, group)| { - (key, group.into_iter().collect()) - }) + .map(|(key, group)| (key, group.into_iter().collect())) .collect::)>>() } - + /// Returns all of the `FieldValue`s associated the given field pub fn get_all(&self, field: Field) -> Vec<&Value> { self.field_values @@ -110,9 +107,7 @@ impl Document { impl From> for Document { fn from(field_values: Vec) -> Document { - Document { - field_values: field_values - } + Document { field_values: field_values } } } @@ -121,7 +116,7 @@ impl From> for Document { mod tests { use schema::*; - + #[test] fn test_doc() { let mut schema_builder = SchemaBuilder::default(); @@ -130,5 +125,5 @@ mod tests { doc.add_text(text_field, "My title"); assert_eq!(doc.field_values().len(), 1); } - -} \ No newline at end of file + +} diff --git a/src/schema/field.rs b/src/schema/field.rs index 650ac0fd2..d73a66b34 100644 --- a/src/schema/field.rs +++ b/src/schema/field.rs @@ -7,7 +7,7 @@ use common::BinarySerializable; /// `Field` is actually a `u8` identifying a `Field` /// The schema is in charge of holding mapping between field names /// to `Field` objects. -/// +/// /// Because the field id is a `u8`, tantivy can only have at most `255` fields. /// Value 255 is reserved. 
#[derive(Copy, Clone, Debug, PartialEq,PartialOrd,Eq,Ord,Hash, Serialize, Deserialize)] @@ -22,4 +22,3 @@ impl BinarySerializable for Field { u32::deserialize(reader).map(Field) } } - diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index 45e34489c..1df77f5cc 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -9,10 +9,10 @@ use schema::FieldType; /// A `FieldEntry` represents a field and its configuration. /// `Schema` are a collection of `FieldEntry` -/// -/// It consists of -/// - a field name -/// - a field type, itself wrapping up options describing +/// +/// It consists of +/// - a field name +/// - a field type, itself wrapping up options describing /// how the field should be indexed. #[derive(Clone, Debug)] pub struct FieldEntry { @@ -21,7 +21,6 @@ pub struct FieldEntry { } impl FieldEntry { - /// Creates a new u64 field entry in the schema, given /// a name, and some options. pub fn new_text(field_name: String, field_type: TextOptions) -> FieldEntry { @@ -30,7 +29,7 @@ impl FieldEntry { field_type: FieldType::Str(field_type), } } - + /// Creates a new u64 field entry in the schema, given /// a name, and some options. pub fn new_u64(field_name: String, field_type: IntOptions) -> FieldEntry { @@ -39,7 +38,7 @@ impl FieldEntry { field_type: FieldType::U64(field_type), } } - + /// Creates a new i64 field entry in the schema, given /// a name, and some options. 
pub fn new_i64(field_name: String, field_type: IntOptions) -> FieldEntry { @@ -48,48 +47,42 @@ impl FieldEntry { field_type: FieldType::I64(field_type), } } - + /// Returns the name of the field - pub fn name(&self,) -> &str { + pub fn name(&self) -> &str { &self.name } - + /// Returns the field type - pub fn field_type(&self,) -> &FieldType { + pub fn field_type(&self) -> &FieldType { &self.field_type } - + /// Returns true iff the field is indexed - pub fn is_indexed(&self,) -> bool { + pub fn is_indexed(&self) -> bool { match self.field_type { FieldType::Str(ref options) => options.get_indexing_options().is_indexed(), FieldType::U64(ref options) => options.is_indexed(), FieldType::I64(ref options) => options.is_indexed(), } } - + /// Returns true iff the field is a int (signed or unsigned) fast field - pub fn is_int_fast(&self,) -> bool { + pub fn is_int_fast(&self) -> bool { match self.field_type { FieldType::U64(ref options) => options.is_fast(), FieldType::I64(ref options) => options.is_fast(), _ => false, } } - + /// Returns true iff the field is stored - pub fn is_stored(&self,) -> bool { + pub fn is_stored(&self) -> bool { match self.field_type { - FieldType::U64(ref options) => { - options.is_stored() - } - FieldType::I64(ref options) => { - options.is_stored() - } - FieldType::Str(ref options) => { - options.is_stored() - } + FieldType::U64(ref options) => options.is_stored(), + FieldType::I64(ref options) => options.is_stored(), + FieldType::Str(ref options) => options.is_stored(), } } } @@ -105,17 +98,17 @@ impl Serialize for FieldEntry { FieldType::Str(ref options) => { s.serialize_field("type", "text")?; s.serialize_field("options", options)?; - }, + } FieldType::U64(ref options) => { s.serialize_field("type", "u64")?; s.serialize_field("options", options)?; - }, + } FieldType::I64(ref options) => { s.serialize_field("type", "i64")?; s.serialize_field("options", options)?; } } - + s.end() } } @@ -126,7 +119,11 @@ impl<'de> Deserialize<'de> for 
FieldEntry { { #[derive(Deserialize)] #[serde(field_identifier, rename_all = "lowercase")] - enum Field { Name, Type, Options }; + enum Field { + Name, + Type, + Options, + }; const FIELDS: &'static [&'static str] = &["name", "type", "options"]; @@ -161,13 +158,24 @@ impl<'de> Deserialize<'de> for FieldEntry { } Field::Options => { match ty { - None => return Err(de::Error::custom("The `type` field must be specified before `options`")), + None => { + return Err(de::Error::custom("The `type` field must be specified before `options`",),) + } Some(ty) => { match ty { - "text" => field_type = Some(FieldType::Str(map.next_value()?)), - "u64" => field_type = Some(FieldType::U64(map.next_value()?)), - "i64" => field_type = Some(FieldType::I64(map.next_value()?)), - _ => return Err(de::Error::custom(format!("Unrecognised type {}", ty))) + "text" => { + field_type = Some(FieldType::Str(map.next_value()?)) + } + "u64" => { + field_type = Some(FieldType::U64(map.next_value()?)) + } + "i64" => { + field_type = Some(FieldType::I64(map.next_value()?)) + } + _ => { + return Err(de::Error::custom(format!("Unrecognised type {}", + ty))) + } } } } @@ -177,12 +185,13 @@ impl<'de> Deserialize<'de> for FieldEntry { let name = name.ok_or_else(|| de::Error::missing_field("name"))?; ty.ok_or_else(|| de::Error::missing_field("ty"))?; - let field_type = field_type.ok_or_else(|| de::Error::missing_field("options"))?; + let field_type = field_type + .ok_or_else(|| de::Error::missing_field("options"))?; Ok(FieldEntry { - name: name, - field_type: field_type, - }) + name: name, + field_type: field_type, + }) } } @@ -197,7 +206,7 @@ mod tests { use super::*; use schema::TEXT; use serde_json; - + #[test] fn test_json_serialization() { let field_value = FieldEntry::new_text(String::from("title"), TEXT); @@ -217,10 +226,10 @@ mod tests { let field_value: FieldEntry = serde_json::from_str(expected).unwrap(); assert_eq!("title", field_value.name); - + match field_value.field_type { FieldType::Str(_) 
=> assert!(true), - _ => panic!("expected FieldType::Str") + _ => panic!("expected FieldType::Str"), } } } diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 70f6509f8..2d3ec0bb3 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -11,12 +11,12 @@ pub enum ValueParsingError { /// Encounterred a numerical value that overflows or underflow its integer type. OverflowError(String), /// The json node is not of the correct type. (e.g. 3 for a `Str` type or `"abc"` for a u64 type) - /// Tantivy will try to autocast values. + /// Tantivy will try to autocast values. TypeError(String), } -/// A `FieldType` describes the type (text, u64) of a field as well as +/// A `FieldType` describes the type (text, u64) of a field as well as /// how it should be handled by tantivy. #[derive(Clone, Debug)] pub enum FieldType { @@ -29,36 +29,29 @@ pub enum FieldType { } impl FieldType { - /// returns true iff the field is indexed. pub fn is_indexed(&self) -> bool { match self { - &FieldType::Str(ref text_options) => { - text_options.get_indexing_options().is_indexed() - } - &FieldType::U64(ref int_options) => { - int_options.is_indexed() - } - &FieldType::I64(ref int_options) => { - int_options.is_indexed() - } + &FieldType::Str(ref text_options) => text_options.get_indexing_options().is_indexed(), + &FieldType::U64(ref int_options) => int_options.is_indexed(), + &FieldType::I64(ref int_options) => int_options.is_indexed(), } } /// Parses a field value from json, given the target FieldType. /// /// Tantivy will not try to cast values. - /// For instance, If the json value is the integer `3` and the - /// target field is a `Str`, this method will return an Error. + /// For instance, If the json value is the integer `3` and the + /// target field is a `Str`, this method will return an Error. 
pub fn value_from_json(&self, json: &JsonValue) -> Result { match *json { JsonValue::String(ref field_text) => { match *self { - FieldType::Str(_) => { - Ok(Value::Str(field_text.clone())) - } - FieldType::U64(_) | FieldType::I64(_) => { - Err(ValueParsingError::TypeError(format!("Expected an integer, got {:?}", json))) + FieldType::Str(_) => Ok(Value::Str(field_text.clone())), + FieldType::U64(_) | + FieldType::I64(_) => { + Err(ValueParsingError::TypeError(format!("Expected an integer, got {:?}", + json))) } } } @@ -67,27 +60,30 @@ impl FieldType { FieldType::I64(_) => { if let Some(field_val_i64) = field_val_num.as_i64() { Ok(Value::I64(field_val_i64)) - } - else { - Err(ValueParsingError::OverflowError(format!("Expected an i64 int, got {:?}", json))) + } else { + Err(ValueParsingError::OverflowError(format!("Expected an i64 int, got {:?}", + json))) } } FieldType::U64(_) => { if let Some(field_val_u64) = field_val_num.as_u64() { Ok(Value::U64(field_val_u64)) - } - else { - Err(ValueParsingError::OverflowError(format!("Expected an u64 int, got {:?}", json))) + } else { + Err(ValueParsingError::OverflowError(format!("Expected an u64 int, got {:?}", + json))) } } FieldType::Str(_) => { - Err(ValueParsingError::TypeError(format!("Expected a string, got {:?}", json))) + Err(ValueParsingError::TypeError(format!("Expected a string, got {:?}", + json))) } } } _ => { - Err(ValueParsingError::TypeError(format!("Json value not supported error {:?}. Expected {:?}", json, self))) + Err(ValueParsingError::TypeError(format!("Json value not supported error {:?}. 
Expected {:?}", + json, + self))) } } } -} \ No newline at end of file +} diff --git a/src/schema/field_value.rs b/src/schema/field_value.rs index 594172daf..8202cc5ca 100644 --- a/src/schema/field_value.rs +++ b/src/schema/field_value.rs @@ -14,7 +14,6 @@ pub struct FieldValue { } impl FieldValue { - /// Constructor pub fn new(field: Field, value: Value) -> FieldValue { FieldValue { @@ -22,22 +21,21 @@ impl FieldValue { value: value, } } - - /// Field accessor + + /// Field accessor pub fn field(&self) -> Field { self.field } /// Value accessor - pub fn value(&self,) -> &Value { + pub fn value(&self) -> &Value { &self.value } } impl BinarySerializable for FieldValue { fn serialize(&self, writer: &mut Write) -> io::Result { - Ok(self.field.serialize(writer)? + - self.value.serialize(writer)?) + Ok(self.field.serialize(writer)? + self.value.serialize(writer)?) } fn deserialize(reader: &mut Read) -> io::Result { @@ -46,4 +44,3 @@ impl BinarySerializable for FieldValue { Ok(FieldValue::new(field, value)) } } - diff --git a/src/schema/int_options.rs b/src/schema/int_options.rs index 2f4812e13..acbb767aa 100644 --- a/src/schema/int_options.rs +++ b/src/schema/int_options.rs @@ -9,48 +9,47 @@ pub struct IntOptions { } impl IntOptions { - /// Returns true iff the value is stored. - pub fn is_stored(&self,) -> bool { + pub fn is_stored(&self) -> bool { self.stored } - - + + /// Returns true iff the value is indexed. - pub fn is_indexed(&self,) -> bool { + pub fn is_indexed(&self) -> bool { self.indexed } - - /// Returns true iff the value is a fast field. - pub fn is_fast(&self,) -> bool { + + /// Returns true iff the value is a fast field. + pub fn is_fast(&self) -> bool { self.fast } - + /// Set the u64 options as stored. /// /// Only the fields that are set as *stored* are /// persisted into the Tantivy's store. - pub fn set_stored(mut self,) -> IntOptions { + pub fn set_stored(mut self) -> IntOptions { self.stored = true; self } - + /// Set the u64 options as indexed. 
/// /// Setting an integer as indexed will generate /// a posting list for each value taken by the integer. - pub fn set_indexed(mut self,) -> IntOptions { + pub fn set_indexed(mut self) -> IntOptions { self.indexed = true; self } - + /// Set the u64 options as a fast field. /// /// Fast fields are designed for random access. - /// Access time are similar to a random lookup in an array. + /// Access time are similar to a random lookup in an array. /// If more than one value is associated to a fast field, only the last one is /// kept. - pub fn set_fast(mut self,) -> IntOptions { + pub fn set_fast(mut self) -> IntOptions { self.fast = true; self } @@ -63,7 +62,7 @@ impl Default for IntOptions { indexed: false, stored: false, } - } + } } @@ -85,7 +84,7 @@ pub const INT_INDEXED: IntOptions = IntOptions { fast: false, }; -/// Shortcut for a u64 stored field. +/// Shortcut for a u64 stored field. /// /// Such a shortcut can be composed as follows `STORED | FAST | INT_INDEXED` pub const INT_STORED: IntOptions = IntOptions { @@ -96,7 +95,6 @@ pub const INT_STORED: IntOptions = IntOptions { impl BitOr for IntOptions { - type Output = IntOptions; fn bitor(self, other: IntOptions) -> IntOptions { @@ -106,4 +104,4 @@ impl BitOr for IntOptions { res.fast = self.fast | other.fast; res } -} \ No newline at end of file +} diff --git a/src/schema/mod.rs b/src/schema/mod.rs index f0ae29c84..daa3f15b4 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -93,7 +93,8 @@ u64 that are indexed as fast will be stored in a special data structure that wil make it possible to access the u64 value given the doc id rapidly. This is useful if the value of the field is required during scoring or collection for instance. 
-*/ +*/ + mod schema; mod term; @@ -166,4 +167,4 @@ mod tests { assert!(is_valid_field_name("my_text_field")); } -} \ No newline at end of file +} diff --git a/src/schema/named_field_document.rs b/src/schema/named_field_document.rs index 3a28be243..f78730de1 100644 --- a/src/schema/named_field_document.rs +++ b/src/schema/named_field_document.rs @@ -3,9 +3,9 @@ use schema::Value; -/// Internal representation of a document used for JSON +/// Internal representation of a document used for JSON /// serialization. -/// +/// /// A `NamedFieldDocument` is a simple representation of a document /// as a `BTreeMap>`. /// diff --git a/src/schema/schema.rs b/src/schema/schema.rs index 3bfd803e0..62359f50d 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -37,8 +37,6 @@ pub struct SchemaBuilder { impl SchemaBuilder { - - /// Create a new `SchemaBuilder` pub fn new() -> SchemaBuilder { SchemaBuilder::default() @@ -49,15 +47,12 @@ impl SchemaBuilder { /// /// # Caution /// - /// Appending two fields with the same name - /// will result in the shadowing of the first + /// Appending two fields with the same name + /// will result in the shadowing of the first /// by the second one. - /// The first field will get a field id - /// but only the second one will be indexed - pub fn add_u64_field( - &mut self, - field_name_str: &str, - field_options: IntOptions) -> Field { + /// The first field will get a field id + /// but only the second one will be indexed + pub fn add_u64_field(&mut self, field_name_str: &str, field_options: IntOptions) -> Field { let field_name = String::from(field_name_str); let field_entry = FieldEntry::new_u64(field_name, field_options); self.add_field(field_entry) @@ -68,15 +63,12 @@ impl SchemaBuilder { /// /// # Caution /// - /// Appending two fields with the same name - /// will result in the shadowing of the first + /// Appending two fields with the same name + /// will result in the shadowing of the first /// by the second one. 
- /// The first field will get a field id + /// The first field will get a field id /// but only the second one will be indexed - pub fn add_i64_field( - &mut self, - field_name_str: &str, - field_options: IntOptions) -> Field { + pub fn add_i64_field(&mut self, field_name_str: &str, field_options: IntOptions) -> Field { let field_name = String::from(field_name_str); let field_entry = FieldEntry::new_i64(field_name, field_options); self.add_field(field_entry) @@ -87,21 +79,18 @@ impl SchemaBuilder { /// /// # Caution /// - /// Appending two fields with the same name - /// will result in the shadowing of the first + /// Appending two fields with the same name + /// will result in the shadowing of the first /// by the second one. - /// The first field will get a field id - /// but only the second one will be indexed - pub fn add_text_field( - &mut self, - field_name_str: &str, - field_options: TextOptions) -> Field { + /// The first field will get a field id + /// but only the second one will be indexed + pub fn add_text_field(&mut self, field_name_str: &str, field_options: TextOptions) -> Field { let field_name = String::from(field_name_str); let field_entry = FieldEntry::new_text(field_name, field_options); self.add_field(field_entry) } - - + + /// Adds a field entry to the schema in build. 
fn add_field(&mut self, field_entry: FieldEntry) -> Field { let field = Field(self.fields.len() as u32); @@ -110,15 +99,15 @@ impl SchemaBuilder { self.fields_map.insert(field_name, field); field } - - + + /// Finalize the creation of a `Schema` /// This will consume your `SchemaBuilder` - pub fn build(self,) -> Schema { + pub fn build(self) -> Schema { Schema(Arc::new(InnerSchema { - fields: self.fields, - fields_map: self.fields_map, - })) + fields: self.fields, + fields_map: self.fields_map, + })) } } @@ -135,7 +124,7 @@ impl Default for SchemaBuilder { #[derive(Debug)] struct InnerSchema { fields: Vec, - fields_map: HashMap, // transient + fields_map: HashMap, // transient } @@ -164,22 +153,21 @@ struct InnerSchema { pub struct Schema(Arc); impl Schema { - /// Return the `FieldEntry` associated to a `Field`. pub fn get_field_entry(&self, field: Field) -> &FieldEntry { &self.0.fields[field.0 as usize] } - + /// Return the field name for a given `Field`. pub fn get_field_name(&self, field: Field) -> &str { self.get_field_entry(field).name() } - + /// Return the list of all the `Field`s. - pub fn fields(&self,) -> &[FieldEntry] { + pub fn fields(&self) -> &[FieldEntry] { &self.0.fields } - + /// Returns the field options associated with a given name. /// /// # Panics @@ -192,7 +180,7 @@ impl Schema { pub fn get_field(&self, field_name: &str) -> Option { self.0.fields_map.get(field_name).cloned() } - + /// Create a named document off the doc. pub fn to_named_doc(&self, doc: &Document) -> NamedFieldDocument { let mut field_map = BTreeMap::new(); @@ -200,34 +188,33 @@ impl Schema { let field_name = self.get_field_name(field); let values: Vec = field_values .into_iter() - .map(|field_val| field_val.value() ) + .map(|field_val| field_val.value()) .cloned() .collect(); field_map.insert(field_name.to_string(), values); } NamedFieldDocument(field_map) } - - - /// Encode the schema in JSON. + + + /// Encode the schema in JSON. /// /// Encoding a document cannot fail. 
pub fn to_json(&self, doc: &Document) -> String { serde_json::to_string(&self.to_named_doc(doc)).expect("doc encoding failed. This is a bug") } - /// Build a document object from a json-object. + /// Build a document object from a json-object. pub fn parse_document(&self, doc_json: &str) -> Result { - let json_obj: JsonObject = serde_json::from_str(doc_json).map_err(|_| { - let doc_json_sample: String = - if doc_json.len() < 20 { - String::from(doc_json) - } - else { - format!("{:?}...", &doc_json[0..20]) - }; - DocParsingError::NotJSON(doc_json_sample) - })?; + let json_obj: JsonObject = serde_json::from_str(doc_json) + .map_err(|_| { + let doc_json_sample: String = if doc_json.len() < 20 { + String::from(doc_json) + } else { + format!("{:?}...", &doc_json[0..20]) + }; + DocParsingError::NotJSON(doc_json_sample) + })?; let mut doc = Document::default(); for (field_name, json_value) in json_obj.iter() { @@ -238,31 +225,29 @@ impl Schema { match *json_value { JsonValue::Array(ref json_items) => { for json_item in json_items { - let value = try!( - field_type - .value_from_json(&json_item) - .map_err(|e| DocParsingError::ValueError(field_name.clone(), e)) - ); + let value = try!(field_type + .value_from_json(&json_item) + .map_err(|e| { + DocParsingError::ValueError(field_name.clone(), e) + })); doc.add(FieldValue::new(field, value)); } } _ => { - let value = try!( - field_type - .value_from_json(&json_value) - .map_err(|e| DocParsingError::ValueError(field_name.clone(), e)) - ); + let value = try!(field_type + .value_from_json(&json_value) + .map_err(|e| { + DocParsingError::ValueError(field_name.clone(), e) + })); doc.add(FieldValue::new(field, value)); } - + } } - None => { - return Err(DocParsingError::NoSuchFieldInSchema(field_name.clone())) - } + None => return Err(DocParsingError::NoSuchFieldInSchema(field_name.clone())), } } - Ok(doc) + Ok(doc) } } @@ -284,15 +269,13 @@ impl Serialize for Schema { } } -impl<'de> Deserialize<'de> for Schema -{ +impl<'de> 
Deserialize<'de> for Schema { fn deserialize(deserializer: D) -> Result where D: Deserializer<'de> { struct SchemaVisitor; - impl<'de> Visitor<'de> for SchemaVisitor - { + impl<'de> Visitor<'de> for SchemaVisitor { type Value = Schema; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { @@ -314,7 +297,7 @@ impl<'de> Deserialize<'de> for Schema Ok(schema.build()) } } - + deserializer.deserialize_map(SchemaVisitor) } } @@ -330,7 +313,7 @@ impl From for Schema { -/// Error that may happen when deserializing +/// Error that may happen when deserializing /// a document from JSON. #[derive(Debug)] pub enum DocParsingError { @@ -338,24 +321,24 @@ pub enum DocParsingError { NotJSON(String), /// One of the value node could not be parsed. ValueError(String, ValueParsingError), - /// The json-document contains a field that is not declared in the schema. + /// The json-document contains a field that is not declared in the schema. NoSuchFieldInSchema(String), } #[cfg(test)] mod tests { - + use schema::*; use serde_json; use schema::field_type::ValueParsingError; use schema::schema::DocParsingError::NotJSON; - + #[test] pub fn test_schema_serialization() { let mut schema_builder = SchemaBuilder::default(); - let count_options = IntOptions::default().set_stored().set_fast(); - let popularity_options = IntOptions::default().set_stored().set_fast(); + let count_options = IntOptions::default().set_stored().set_fast(); + let popularity_options = IntOptions::default().set_stored().set_fast(); schema_builder.add_text_field("title", TEXT); schema_builder.add_text_field("author", STRING); schema_builder.add_u64_field("count", count_options); @@ -400,8 +383,8 @@ mod tests { ]"#; println!("{}", schema_json); println!("{}", expected); - assert_eq!(schema_json, expected); - + assert_eq!(schema_json, expected); + let schema: Schema = serde_json::from_str(expected).unwrap(); let mut fields = schema.fields().iter(); @@ -413,11 +396,11 @@ mod tests { } - + #[test] pub fn 
test_document_to_json() { let mut schema_builder = SchemaBuilder::default(); - let count_options = IntOptions::default().set_stored().set_fast(); + let count_options = IntOptions::default().set_stored().set_fast(); schema_builder.add_text_field("title", TEXT); schema_builder.add_text_field("author", STRING); schema_builder.add_u64_field("count", count_options); @@ -432,11 +415,11 @@ mod tests { let doc_serdeser = schema.parse_document(&schema.to_json(&doc)).unwrap(); assert_eq!(doc, doc_serdeser); } - + #[test] pub fn test_parse_document() { let mut schema_builder = SchemaBuilder::default(); - let count_options = IntOptions::default().set_stored().set_fast(); + let count_options = IntOptions::default().set_stored().set_fast(); let popularity_options = IntOptions::default().set_stored().set_fast(); let title_field = schema_builder.add_text_field("title", TEXT); let author_field = schema_builder.add_text_field("author", STRING); @@ -448,12 +431,14 @@ mod tests { assert!(doc.field_values().is_empty()); } { - let doc = schema.parse_document(r#"{ + let doc = schema + .parse_document(r#"{ "title": "my title", "author": "fulmicoton", "count": 4, "popularity": 10 - }"#).unwrap(); + }"#) + .unwrap(); assert_eq!(doc.get_first(title_field).unwrap().text(), "my title"); assert_eq!(doc.get_first(author_field).unwrap().text(), "fulmicoton"); assert_eq!(doc.get_first(count_field).unwrap().u64_value(), 4); @@ -535,7 +520,7 @@ mod tests { match json_err { Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => { assert!(true); - }, + } _ => { panic!("expected 9223372036854775808 to overflow i64, but it didn't"); } @@ -550,7 +535,7 @@ mod tests { match json_err { Err(NotJSON(_)) => { assert!(true); - }, + } _ => { panic!("expected invalid JSON to fail parsing, but it didn't"); } diff --git a/src/schema/term.rs b/src/schema/term.rs index 7abbbd9dc..d475768bf 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -16,21 +16,20 @@ const INT_TERM_LEN: usize = 4 
+ 8; pub struct Term(Vec); impl Term { - /// Set the content of the term. pub(crate) fn set_content(&mut self, content: &[u8]) { assert!(content.len() >= 4); self.0.resize(content.len(), 0u8); (&mut self.0[..]).clone_from_slice(content); } - + /// Returns the field id. - fn field_id(&self,) -> u32 { + fn field_id(&self) -> u32 { BigEndian::read_u32(&self.0[..4]) } /// Returns the field. - pub fn field(&self,) -> Field { + pub fn field(&self) -> Field { Field(self.field_id()) } @@ -46,7 +45,7 @@ impl Term { /// /// Assuming the term has a field id of 1, and a u64 value of 3234, /// the Term will have 8 bytes. - /// + /// /// The first four byte are dedicated to storing the field id as a u64. /// The 4 following bytes are encoding the u64 value. pub fn from_field_u64(field: Field, val: u64) -> Term { @@ -57,33 +56,33 @@ impl Term { } /// Sets a u64 value in the term. - /// + /// /// U64 are serialized using (8-byte) BigEndian /// representation. /// The use of BigEndian has the benefit of preserving - /// the natural order of the values. + /// the natural order of the values. pub fn set_u64(&mut self, val: u64) { self.0.resize(INT_TERM_LEN, 0u8); BigEndian::write_u64(&mut self.0[4..], val); } - + /// Builds a term given a field, and a u64-value /// /// Assuming the term has a field id of 1, and a u64 value of 3234, /// the Term will have 8 bytes. - /// + /// /// The first four byte are dedicated to storing the field id as a u64. /// The 4 following bytes are encoding the u64 value. pub fn from_field_i64(field: Field, val: i64) -> Term { let val_u64: u64 = common::i64_to_u64(val); Term::from_field_u64(field, val_u64) } - + /// Builds a term given a field, and a string value /// /// Assuming the term has a field id of 2, and a text value of "abc", /// the Term will have 4 bytes. - /// The first byte is 2, and the three following bytes are the utf-8 + /// The first byte is 2, and the three following bytes are the utf-8 /// representation of "abc". 
pub fn from_field_text(field: Field, text: &str) -> Term { let buffer = Vec::with_capacity(4 + text.len()); @@ -93,7 +92,7 @@ impl Term { term } - /// Creates a new Term with an empty buffer, + /// Creates a new Term with an empty buffer, /// but with a given capacity. /// /// It is declared unsafe, as the term content @@ -109,7 +108,7 @@ impl Term { pub fn get_u64(&self) -> u64 { BigEndian::read_u64(&self.0[4..]) } - + /// Builds a term from its byte representation. /// /// If you want to build a field for a given `str`, @@ -123,7 +122,7 @@ impl Term { /// /// If the term is a string, its value is utf-8 encoded. /// If the term is a u64, its value is encoded according - /// to `byteorder::LittleEndian`. + /// to `byteorder::LittleEndian`. pub fn value(&self) -> &[u8] { &self.0[4..] } @@ -132,20 +131,20 @@ impl Term { /// /// # Panics /// If the value is not valid utf-8. This may happen - /// if the index is corrupted or if you try to + /// if the index is corrupted or if you try to /// call this method on a non-string type. pub fn text(&self) -> &str { str::from_utf8(self.value()).expect("Term does not contain valid utf-8.") } - /// Set the texts only, keeping the field untouched. + /// Set the texts only, keeping the field untouched. 
pub fn set_text(&mut self, text: &str) { self.0.resize(4, 0u8); self.0.extend(text.as_bytes()); } - - /// Returns the underlying `&[u8]` - pub fn as_slice(&self,)->&[u8] { + + /// Returns the underlying `&[u8]` + pub fn as_slice(&self) -> &[u8] { &self.0 } } @@ -165,7 +164,7 @@ impl fmt::Debug for Term { #[cfg(test)] mod tests { - + use schema::*; #[test] @@ -177,7 +176,7 @@ mod tests { { let term = Term::from_field_text(title_field, "test"); assert_eq!(term.field(), title_field); - assert_eq!(&term.as_slice()[0..4], &[0u8,0u8,0u8,1u8]); + assert_eq!(&term.as_slice()[0..4], &[0u8, 0u8, 0u8, 1u8]); assert_eq!(&term.as_slice()[4..], "test".as_bytes()); } { @@ -194,6 +193,6 @@ mod tests { assert_eq!(term.as_slice()[10], (933u64 / 256u64) as u8); assert_eq!(term.as_slice()[11], (983u64 % 256u64) as u8); } - + } -} \ No newline at end of file +} diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs index 31550cd7f..36e8fd993 100644 --- a/src/schema/text_options.rs +++ b/src/schema/text_options.rs @@ -9,29 +9,27 @@ pub struct TextOptions { } impl TextOptions { - /// Returns the indexing options. - pub fn get_indexing_options(&self,) -> TextIndexingOptions { + pub fn get_indexing_options(&self) -> TextIndexingOptions { self.indexing } /// Returns true iff the text is to be stored. - pub fn is_stored(&self,) -> bool { + pub fn is_stored(&self) -> bool { self.stored } /// Sets the field as stored - pub fn set_stored(mut self,) -> TextOptions { + pub fn set_stored(mut self) -> TextOptions { self.stored = true; self } - - /// Sets the field as indexed, with the specific indexing options. + + /// Sets the field as indexed, with the specific indexing options. 
pub fn set_indexing_options(mut self, indexing: TextIndexingOptions) -> TextOptions { self.indexing = indexing; self } - } impl Default for TextOptions { @@ -53,14 +51,14 @@ pub enum TextIndexingOptions { #[serde(rename="unindexed")] Unindexed, /// Untokenized means that the field text will not be split into tokens before being indexed. - /// A field with the value "Hello world", will have the document suscribe to one single + /// A field with the value "Hello world", will have the document suscribe to one single /// postings, the postings associated to the string "Hello world". /// /// It will **not** be searchable if the user enter "hello" for instance. - /// This can be useful for tags, or ids for instance. + /// This can be useful for tags, or ids for instance. #[serde(rename="untokenized")] Untokenized, - /// TokenizedNoFreq will tokenize the field value, and append the document doc id + /// TokenizedNoFreq will tokenize the field value, and append the document doc id /// to the posting lists associated to all of the tokens. /// The frequence of appearance of the term in the document however will be lost. /// The term frequency used in the TfIdf formula will always be 1. @@ -69,9 +67,8 @@ pub enum TextIndexingOptions { /// TokenizedWithFreq will tokenize the field value, and encode /// both the docid and the term frequency in the posting lists associated to all #[serde(rename="freq")] - // of the tokens. TokenizedWithFreq, - /// Like TokenizedWithFreq, but also encodes the positions of the + /// Like TokenizedWithFreq, but also encodes the positions of the /// terms in a separate file. This option is required for phrase queries. /// Don't use this if you are certain you won't need it, the term positions file can be very big. #[serde(rename="position")] @@ -79,37 +76,36 @@ pub enum TextIndexingOptions { } impl TextIndexingOptions { - /// Returns true iff the term frequency will be encoded. 
pub fn is_termfreq_enabled(&self) -> bool { match *self { - TextIndexingOptions::TokenizedWithFreq - | TextIndexingOptions::TokenizedWithFreqAndPosition => true, + TextIndexingOptions::TokenizedWithFreq | + TextIndexingOptions::TokenizedWithFreqAndPosition => true, _ => false, } } - + /// Returns true iff the term is tokenized before being indexed - pub fn is_tokenized(&self,) -> bool { + pub fn is_tokenized(&self) -> bool { match *self { - TextIndexingOptions::TokenizedNoFreq - | TextIndexingOptions::TokenizedWithFreq - | TextIndexingOptions::TokenizedWithFreqAndPosition=> true, + TextIndexingOptions::TokenizedNoFreq | + TextIndexingOptions::TokenizedWithFreq | + TextIndexingOptions::TokenizedWithFreqAndPosition => true, _ => false, } } - - + + /// Returns true iff the term will generate some posting lists. - pub fn is_indexed(&self,) -> bool { + pub fn is_indexed(&self) -> bool { match *self { TextIndexingOptions::Unindexed => false, _ => true, } - } - - /// Returns true iff the term positions within the document are stored as well. - pub fn is_position_enabled(&self,) -> bool { + } + + /// Returns true iff the term positions within the document are stored as well. 
+ pub fn is_position_enabled(&self) -> bool { match *self { TextIndexingOptions::TokenizedWithFreqAndPosition => true, _ => false, @@ -119,17 +115,15 @@ impl TextIndexingOptions { impl BitOr for TextIndexingOptions { - type Output = TextIndexingOptions; + type Output = TextIndexingOptions; fn bitor(self, other: TextIndexingOptions) -> TextIndexingOptions { use super::TextIndexingOptions::*; if self == Unindexed { other - } - else if other == Unindexed || self == other { + } else if other == Unindexed || self == other { self - } - else { + } else { // make it possible panic!(format!("Combining {:?} and {:?} is ambiguous", self, other)); } @@ -161,7 +155,6 @@ pub const STORED: TextOptions = TextOptions { impl BitOr for TextOptions { - type Output = TextOptions; fn bitor(self, other: TextOptions) -> TextOptions { @@ -176,7 +169,7 @@ impl BitOr for TextOptions { #[cfg(test)] mod tests { use schema::*; - + #[test] fn test_field_options() { { diff --git a/src/schema/value.rs b/src/schema/value.rs index bf75b9405..7d89cec72 100644 --- a/src/schema/value.rs +++ b/src/schema/value.rs @@ -11,7 +11,7 @@ pub enum Value { /// Unsigned 64-bits Integer `u64` U64(u64), /// Signed 64-bits Integer `i64` - I64(i64) + I64(i64), } impl Serialize for Value { @@ -26,15 +26,13 @@ impl Serialize for Value { } } -impl<'de> Deserialize<'de> for Value -{ +impl<'de> Deserialize<'de> for Value { fn deserialize(deserializer: D) -> Result where D: Deserializer<'de> { struct ValueVisitor; - impl<'de> Visitor<'de> for ValueVisitor - { + impl<'de> Visitor<'de> for ValueVisitor { type Value = Value; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { @@ -57,7 +55,7 @@ impl<'de> Deserialize<'de> for Value Ok(Value::Str(v)) } } - + deserializer.deserialize_any(ValueVisitor) } } @@ -66,45 +64,33 @@ impl Value { /// Returns the text value, provided the value is of the `Str` type. 
/// /// # Panics - /// If the value is not of type `Str` + /// If the value is not of type `Str` pub fn text(&self) -> &str { match *self { - Value::Str(ref text) => { - text - } - _ => { - panic!("This is not a text field.") - } + Value::Str(ref text) => text, + _ => panic!("This is not a text field."), } } - + /// Returns the u64-value, provided the value is of the `U64` type. /// /// # Panics - /// If the value is not of type `U64` + /// If the value is not of type `U64` pub fn u64_value(&self) -> u64 { match *self { - Value::U64(ref value) => { - *value - } - _ => { - panic!("This is not a text field.") - } + Value::U64(ref value) => *value, + _ => panic!("This is not a text field."), } } /// Returns the i64-value, provided the value is of the `I64` type. /// /// # Panics - /// If the value is not of type `I64` + /// If the value is not of type `I64` pub fn i64_value(&self) -> i64 { match *self { - Value::I64(ref value) => { - *value - } - _ => { - panic!("This is not a text field.") - } + Value::I64(ref value) => *value, + _ => panic!("This is not a text field."), } } } @@ -150,15 +136,15 @@ mod binary_serialize { Value::Str(ref text) => { written_size += try!(TEXT_CODE.serialize(writer)); written_size += try!(text.serialize(writer)); - }, + } Value::U64(ref val) => { written_size += try!(U64_CODE.serialize(writer)); written_size += try!(val.serialize(writer)); - }, + } Value::I64(ref val) => { written_size += try!(I64_CODE.serialize(writer)); written_size += try!(val.serialize(writer)); - }, + } } Ok(written_size) } @@ -178,9 +164,11 @@ mod binary_serialize { Ok(Value::I64(value)) } _ => { - Err(io::Error::new(io::ErrorKind::InvalidData, format!("No field type is associated with code {:?}", type_code))) + Err(io::Error::new(io::ErrorKind::InvalidData, + format!("No field type is associated with code {:?}", + type_code))) } - } + } } } } diff --git a/src/store/mod.rs b/src/store/mod.rs index 5dcde7266..ff5a274bb 100644 --- a/src/store/mod.rs +++ 
b/src/store/mod.rs @@ -18,9 +18,10 @@ mod tests { fn write_lorem_ipsum_store(writer: WritePtr, num_docs: usize) -> Schema { let mut schema_builder = SchemaBuilder::default(); let field_body = schema_builder.add_text_field("body", TextOptions::default().set_stored()); - let field_title = schema_builder.add_text_field("title", TextOptions::default().set_stored()); + let field_title = schema_builder + .add_text_field("title", TextOptions::default().set_stored()); let schema = schema_builder.build(); - let lorem = String::from("Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."); + let lorem = String::from("Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.",); { let mut store_writer = StoreWriter::new(writer); for i in 0..num_docs { @@ -41,7 +42,7 @@ mod tests { } schema } - + #[test] fn test_store() { let path = Path::new("store"); @@ -52,18 +53,19 @@ mod tests { let store_source = directory.open_read(path).unwrap(); let store = StoreReader::from(store_source); for i in 0..1_000 { - assert_eq!(*store.get(i).unwrap().get_first(field_title).unwrap().text(), format!("Doc {}", i)); + assert_eq!(*store.get(i).unwrap().get_first(field_title).unwrap().text(), + format!("Doc {}", i)); } } - + #[bench] fn bench_store_encode(b: &mut Bencher) { let mut directory = MmapDirectory::create_from_tempdir().unwrap(); let path = Path::new("store"); b.iter(|| { - write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000); - directory.delete(path).unwrap(); - }); + write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000); + directory.delete(path).unwrap(); + }); } @@ -74,9 +76,7 @@ mod tests { write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000); let store_source = directory.open_read(path).unwrap(); let store = StoreReader::from(store_source); - b.iter(|| { - store.get(12).unwrap(); - }); + b.iter(|| { store.get(12).unwrap(); }); } } diff --git a/src/store/reader.rs b/src/store/reader.rs index 3a9918f9d..79eb4d4dd 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -34,14 +34,14 @@ impl StoreReader { let total_buffer = self.data.as_slice(); let mut cursor = &total_buffer[block_offset..]; let block_length = u32::deserialize(&mut cursor).unwrap(); - let block_array: &[u8] = - &total_buffer[(block_offset + 4 as usize)..(block_offset + 4 + block_length as usize)]; + let block_array: &[u8] = &total_buffer[(block_offset + 4 as usize).. 
+ (block_offset + 4 + block_length as usize)]; let mut lz4_decoder = try!(lz4::Decoder::new(block_array)); *self.current_block_offset.borrow_mut() = usize::max_value(); try!(lz4_decoder.read_to_end(&mut current_block_mut).map(|_| ())); *self.current_block_offset.borrow_mut() = block_offset; } - Ok(()) + Ok(()) } pub fn get(&self, doc_id: DocId) -> Result { diff --git a/src/store/writer.rs b/src/store/writer.rs index 426648381..1221a46ec 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -67,7 +67,8 @@ impl StoreWriter { try!(self.write_and_compress_block()); } let header_offset: u64 = self.written; - try!(self.offset_index_writer.write::>(&mut self.writer)); + try!(self.offset_index_writer + .write::>(&mut self.writer)); try!(header_offset.serialize(&mut self.writer)); try!(self.doc.serialize(&mut self.writer)); self.writer.flush() From 03564214e73d556a87ef78353abf4f3b8eaae3c0 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 15 May 2017 22:46:43 +0900 Subject: [PATCH 16/51] Added check for rustfmt in travis --- .travis.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.travis.yml b/.travis.yml index cbfbc222b..e07842219 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,13 +22,16 @@ before_script: - | pip install 'travis-cargo<0.2' --user && export PATH=$HOME/.local/bin:$PATH + - (cargo install rustfmt || true) script: + - cargo fmt -- --write-mode=diff - | travis-cargo build && travis-cargo test && travis-cargo bench && travis-cargo doc - cargo run --example simple_search + - cargo rust after_success: - bash ./script/build-doc.sh - travis-cargo doc-upload From 0606a8ae735216b0264cd9a70cb4d5defa766b55 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 16 May 2017 00:22:11 +0900 Subject: [PATCH 17/51] Bugfix in travis yml --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index e07842219..4bca1ec5a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -31,7 +31,6 @@ script: travis-cargo bench && travis-cargo 
doc - cargo run --example simple_search - - cargo rust after_success: - bash ./script/build-doc.sh - travis-cargo doc-upload From 4d90d8fc1d0a61d8550f043cf6bc20f6d27b213f Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Mon, 15 May 2017 13:18:29 +0300 Subject: [PATCH 18/51] Move the random sampling helpers to the tests module --- src/compression/composite.rs | 38 ++++++++++---------- src/compression/mod.rs | 57 +++++++++++------------------ src/lib.rs | 69 ++++++++++++++++++++++++------------ 3 files changed, 87 insertions(+), 77 deletions(-) diff --git a/src/compression/composite.rs b/src/compression/composite.rs index a92023405..9f0f52290 100644 --- a/src/compression/composite.rs +++ b/src/compression/composite.rs @@ -8,14 +8,14 @@ pub struct CompositeEncoder { } impl CompositeEncoder { - + pub fn new() -> CompositeEncoder { CompositeEncoder { block_encoder: BlockEncoder::new(), output: Vec::with_capacity(500_000), } } - + pub fn compress_sorted(&mut self, vals: &[u32]) -> &[u8] { self.output.clear(); let num_blocks = vals.len() / NUM_DOCS_PER_BLOCK; @@ -30,7 +30,7 @@ impl CompositeEncoder { self.output.extend_from_slice(vint_compressed); &self.output } - + pub fn compress_unsorted(&mut self, vals: &[u32]) -> &[u8] { self.output.clear(); let num_blocks = vals.len() / NUM_DOCS_PER_BLOCK; @@ -57,9 +57,9 @@ impl CompositeDecoder { CompositeDecoder { block_decoder: BlockDecoder::new(), vals: Vec::with_capacity(500_000), - } + } } - + pub fn uncompress_sorted(&mut self, mut compressed_data: &[u8], uncompressed_len: usize) -> &[u32] { if uncompressed_len > self.vals.capacity() { let extra_capacity = uncompressed_len - self.vals.capacity(); @@ -77,7 +77,7 @@ impl CompositeDecoder { self.vals.extend_from_slice(self.block_decoder.output_array()); &self.vals } - + pub fn uncompress_unsorted(&mut self, mut compressed_data: &[u8], uncompressed_len: usize) -> &[u32] { self.vals.clear(); let num_blocks = uncompressed_len / NUM_DOCS_PER_BLOCK; @@ -100,14 +100,14 @@ impl 
Into> for CompositeDecoder { #[cfg(test)] pub mod tests { - + use test::Bencher; use super::*; - use compression::tests::generate_array; + use tests; #[test] fn test_composite_unsorted() { - let data = generate_array(10_000, 0.1); + let data = tests::generate_array(10_000, 0.1); let mut encoder = CompositeEncoder::new(); let compressed = encoder.compress_unsorted(&data); assert!(compressed.len() <= 19_794); @@ -115,12 +115,12 @@ pub mod tests { let result = decoder.uncompress_unsorted(&compressed, data.len()); for i in 0..data.len() { assert_eq!(data[i], result[i]); - } + } } #[test] fn test_composite_sorted() { - let data = generate_array(10_000, 0.1); + let data = tests::generate_array(10_000, 0.1); let mut encoder = CompositeEncoder::new(); let compressed = encoder.compress_sorted(&data); assert!(compressed.len() <= 7_826); @@ -128,27 +128,27 @@ pub mod tests { let result = decoder.uncompress_sorted(&compressed, data.len()); for i in 0..data.len() { assert_eq!(data[i], result[i]); - } + } } - - + + const BENCH_NUM_INTS: usize = 99_968; - + #[bench] fn bench_compress(b: &mut Bencher) { let mut encoder = CompositeEncoder::new(); - let data = generate_array(BENCH_NUM_INTS, 0.1); + let data = tests::generate_array(BENCH_NUM_INTS, 0.1); b.iter(|| { encoder.compress_sorted(&data); }); } - + #[bench] fn bench_uncompress(b: &mut Bencher) { let mut encoder = CompositeEncoder::new(); - let data = generate_array(BENCH_NUM_INTS, 0.1); + let data = tests::generate_array(BENCH_NUM_INTS, 0.1); let compressed = encoder.compress_sorted(&data); - let mut decoder = CompositeDecoder::new(); + let mut decoder = CompositeDecoder::new(); b.iter(|| { decoder.uncompress_sorted(compressed, BENCH_NUM_INTS); }); diff --git a/src/compression/mod.rs b/src/compression/mod.rs index 722521c2c..e59f2a497 100644 --- a/src/compression/mod.rs +++ b/src/compression/mod.rs @@ -43,18 +43,18 @@ pub trait VIntDecoder { } impl VIntEncoder for BlockEncoder { - + fn compress_vint_sorted(&mut self, input: 
&[u32], offset: u32) -> &[u8] { vint::compress_sorted(input, &mut self.output, offset) } - + fn compress_vint_unsorted(&mut self, input: &[u32]) -> &[u8] { vint::compress_unsorted(input, &mut self.output) } } impl VIntDecoder for BlockDecoder { - + fn uncompress_vint_sorted<'a>( &mut self, compressed_data: &'a [u8], @@ -63,14 +63,14 @@ impl VIntDecoder for BlockDecoder { self.output_len = num_els; vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset) } - + fn uncompress_vint_unsorted<'a>( &mut self, compressed_data: &'a [u8], num_els: usize) -> &'a [u8] { self.output_len = num_els; vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els]) - } + } } @@ -79,24 +79,9 @@ pub const NUM_DOCS_PER_BLOCK: usize = 128; //< should be a power of 2 to let the #[cfg(test)] pub mod tests { - use rand::Rng; - use rand::SeedableRng; - use rand::XorShiftRng; use super::*; + use tests; use test::Bencher; - - fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec { - let seed: &[u32; 4] = &[1, 2, 3, seed_val]; - let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed); - (0..u32::max_value()) - .filter(|_| rng.next_f32()< ratio) - .take(n) - .collect() - } - - pub fn generate_array(n: usize, ratio: f32) -> Vec { - generate_array_with_seed(n, ratio, 4) - } #[test] fn test_encode_sorted_block() { @@ -105,7 +90,7 @@ pub mod tests { let compressed_data = encoder.compress_block_sorted(&vals, 0); let mut decoder = BlockDecoder::new(); { - let remaining_data = decoder.uncompress_block_sorted(compressed_data, 0); + let remaining_data = decoder.uncompress_block_sorted(compressed_data, 0); assert_eq!(remaining_data.len(), 0); } for i in 0..128 { @@ -120,14 +105,14 @@ pub mod tests { let compressed_data = encoder.compress_block_sorted(&vals, 10); let mut decoder = BlockDecoder::new(); { - let remaining_data = decoder.uncompress_block_sorted(compressed_data, 10); + let remaining_data = decoder.uncompress_block_sorted(compressed_data, 10); 
assert_eq!(remaining_data.len(), 0); } for i in 0..128 { assert_eq!(vals[i], decoder.output(i)); } } - + #[test] fn test_encode_sorted_block_with_junk() { let mut compressed: Vec = Vec::new(); @@ -139,7 +124,7 @@ pub mod tests { compressed.push(173u8); let mut decoder = BlockDecoder::new(); { - let remaining_data = decoder.uncompress_block_sorted(&compressed, 10); + let remaining_data = decoder.uncompress_block_sorted(&compressed, 10); assert_eq!(remaining_data.len(), 1); assert_eq!(remaining_data[0], 173u8); } @@ -159,7 +144,7 @@ pub mod tests { compressed.push(173u8); let mut decoder = BlockDecoder::new(); { - let remaining_data = decoder.uncompress_block_unsorted(&compressed); + let remaining_data = decoder.uncompress_block_unsorted(&compressed); assert_eq!(remaining_data.len(), 1); assert_eq!(remaining_data[0], 173u8); } @@ -167,8 +152,8 @@ pub mod tests { assert_eq!(vals[i], decoder.output(i)); } } - - + + #[test] fn test_encode_vint() { { @@ -193,18 +178,18 @@ pub mod tests { #[bench] fn bench_compress(b: &mut Bencher) { let mut encoder = BlockEncoder::new(); - let data = generate_array(NUM_DOCS_PER_BLOCK, 0.1); + let data = tests::generate_array(NUM_DOCS_PER_BLOCK, 0.1); b.iter(|| { encoder.compress_block_sorted(&data, 0u32); }); } - + #[bench] fn bench_uncompress(b: &mut Bencher) { let mut encoder = BlockEncoder::new(); - let data = generate_array(NUM_DOCS_PER_BLOCK, 0.1); + let data = tests::generate_array(NUM_DOCS_PER_BLOCK, 0.1); let compressed = encoder.compress_block_sorted(&data, 0u32); - let mut decoder = BlockDecoder::new(); + let mut decoder = BlockDecoder::new(); b.iter(|| { decoder.uncompress_block_sorted(compressed, 0u32); }); @@ -216,18 +201,18 @@ pub mod tests { #[bench] fn bench_compress_vint(b: &mut Bencher) { let mut encoder = BlockEncoder::new(); - let data = generate_array(NUM_INTS_BENCH_VINT, 0.001); + let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001); b.iter(|| { encoder.compress_vint_sorted(&data, 0u32); }); } - + #[bench] 
fn bench_uncompress_vint(b: &mut Bencher) { let mut encoder = BlockEncoder::new(); - let data = generate_array(NUM_INTS_BENCH_VINT, 0.001); + let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001); let compressed = encoder.compress_vint_sorted(&data, 0u32); - let mut decoder = BlockDecoder::new(); + let mut decoder = BlockDecoder::new(); b.iter(|| { decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT); }); diff --git a/src/lib.rs b/src/lib.rs index 7f889691d..35ff7b1a1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,7 +16,7 @@ //! # `tantivy` //! -//! Tantivy is a search engine library. +//! Tantivy is a search engine library. //! Think `Lucene`, but in Rust. //! //! A good place for you to get started is to check out @@ -138,14 +138,14 @@ pub use core::TermIterator; /// whether it was compiled with the simd compression. pub fn version() -> &'static str { if cfg!(feature="simdcompression") { - concat!(version!(), "-simd") + concat!(version!(), "-simd") } else { - concat!(version!(), "-nosimd") + concat!(version!(), "-nosimd") } } -/// Tantivy's makes it possible to personalize when +/// Tantivy's makes it possible to personalize when /// the indexer should merge its segments pub mod merge_policy { pub use indexer::MergePolicy; @@ -167,7 +167,7 @@ pub type Score = f32; pub type SegmentLocalId = u32; impl DocAddress { - + /// Return the segment ordinal. /// The segment ordinal is an id identifying the segment /// hosting the document. It is only meaningful, in the context @@ -175,7 +175,7 @@ impl DocAddress { pub fn segment_ord(&self,) -> SegmentLocalId { self.0 } - + /// Return the segment local `DocId` pub fn doc(&self,) -> DocId { self.1 @@ -183,12 +183,12 @@ impl DocAddress { } -/// `DocAddress` contains all the necessary information +/// `DocAddress` contains all the necessary information /// to identify a document given a `Searcher` object. 
-/// -/// It consists in an id identifying its segment, and +/// +/// It consists in an id identifying its segment, and /// its segment-local `DocId`. -/// +/// /// The id used for the segment is actually an ordinal /// in the list of segment hold by a `Searcher`. #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] @@ -208,7 +208,32 @@ mod tests { use IndexWriter; use fastfield::{FastFieldReader, U64FastFieldReader, I64FastFieldReader}; use Postings; + use rand::{XorShiftRng, Rng, SeedableRng}; + fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec { + let seed: &[u32; 4] = &[1, 2, 3, seed_val]; + let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed); + (0..u32::max_value()) + .filter(|_| rng.next_f32()< ratio) + .take(n) + .collect() + } + + pub fn generate_array(n: usize, ratio: f32) -> Vec { + generate_array_with_seed(n, ratio, 4) + } + + fn sample_with_seed(n: u32, ratio: f32, seed_val: u32) -> Vec { + let seed: &[u32; 4] = &[1, 2, 3, seed_val]; + let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed); + (0..n) + .filter(|_| rng.next_f32() < ratio) + .collect() + } + + pub fn sample(n: u32, ratio: f32) -> Vec { + sample_with_seed(n, ratio, 4) + } #[test] fn test_indexing() { @@ -275,8 +300,8 @@ mod tests { assert_eq!(searcher.doc_freq(&term_d), 0); } } - - + + #[test] fn test_fieldnorm() { let mut schema_builder = SchemaBuilder::default(); @@ -406,14 +431,14 @@ mod tests { { // writing the segment let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); - { + { let doc = doc!(text_field=>"a b"); index_writer.add_document(doc); } - { + { index_writer.delete_term(Term::from_field_text(text_field, "c")); } - index_writer = index_writer.rollback().unwrap(); + index_writer = index_writer.rollback().unwrap(); index_writer.delete_term(Term::from_field_text(text_field, "a")); index_writer.commit().unwrap(); } @@ -449,7 +474,7 @@ mod tests { let mut schema_builder = SchemaBuilder::default(); let field = 
schema_builder.add_u64_field("value", INT_INDEXED); let schema = schema_builder.build(); - + let index = Index::create_in_ram(schema); let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); index_writer.add_document( @@ -470,7 +495,7 @@ mod tests { let mut schema_builder = SchemaBuilder::default(); let value_field = schema_builder.add_i64_field("value", INT_INDEXED); let schema = schema_builder.build(); - + let index = Index::create_in_ram(schema); let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let negative_val = -1i64; @@ -495,15 +520,15 @@ mod tests { let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); - + // writing the segment let mut index_writer = index.writer_with_num_threads(2, 40_000_000).unwrap(); - + let add_document = |index_writer: &mut IndexWriter, val: &'static str| { let doc = doc!(text_field=>val); index_writer.add_document(doc); }; - + let remove_document = |index_writer: &mut IndexWriter, val: &'static str| { let delterm = Term::from_field_text(text_field, val); index_writer.delete_term(delterm); @@ -701,13 +726,13 @@ mod tests { let fast_field_reader = fast_field_reader_res.unwrap(); assert_eq!(fast_field_reader.get(0), 4i64) } - + { let fast_field_reader_res = segment_reader.get_fast_field_reader::(fast_field_signed); assert!(fast_field_reader_res.is_ok()); let fast_field_reader = fast_field_reader_res.unwrap(); assert_eq!(fast_field_reader.get(0), 4i64) } - + } } From 2cc826adc7654c1d2e11966bf4eb3977f5b52bb6 Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Mon, 15 May 2017 14:52:19 +0300 Subject: [PATCH 19/51] Add a bench for SegmentPostings::SkipNext --- src/postings/mod.rs | 83 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 65 insertions(+), 18 deletions(-) diff --git a/src/postings/mod.rs b/src/postings/mod.rs index b48811fc3..89ac1ab20 100644 --- a/src/postings/mod.rs +++ 
b/src/postings/mod.rs @@ -34,7 +34,7 @@ pub use common::HasLen; #[cfg(test)] mod tests { - + use super::*; use schema::{Document, TEXT, STRING, SchemaBuilder, Term}; use core::SegmentComponent; @@ -48,9 +48,10 @@ mod tests { use schema::Field; use test::Bencher; use indexer::operation::AddOperation; + use tests; use rand::{XorShiftRng, Rng, SeedableRng}; - - + + #[test] pub fn test_position_write() { let mut schema_builder = SchemaBuilder::default(); @@ -70,7 +71,7 @@ mod tests { let read = segment.open_read(SegmentComponent::POSITIONS).unwrap(); assert!(read.len() <= 16); } - + #[test] pub fn test_position_and_fieldnorm() { let mut schema_builder = SchemaBuilder::default(); @@ -87,7 +88,7 @@ mod tests { doc.add_text(text_field, "d d d d a"); // checking that position works if the field has two values. let op = AddOperation { opstamp: 0u64, - document: doc, + document: doc, }; segment_writer.add_document(&op, &schema).unwrap(); } @@ -96,7 +97,7 @@ mod tests { doc.add_text(text_field, "b a"); let op = AddOperation { opstamp: 1u64, - document: doc, + document: doc, }; segment_writer.add_document(&op, &schema).unwrap(); } @@ -107,7 +108,7 @@ mod tests { doc.add_text(text_field, &text); let op = AddOperation { opstamp: 2u64, - document: doc, + document: doc, }; segment_writer.add_document(&op, &schema).unwrap(); } @@ -164,7 +165,7 @@ mod tests { } } } - + #[test] pub fn test_position_and_fieldnorm2() { let mut schema_builder = SchemaBuilder::default(); @@ -196,7 +197,7 @@ mod tests { assert_eq!(term_scorer.doc(), 1u32); assert_eq!(term_scorer.postings().positions(), &[1u32, 4]); } - + #[test] fn test_intersection() { { @@ -219,8 +220,8 @@ mod tests { assert!(!intersection.advance()); } } - - + + lazy_static! 
{ static ref TERM_A: Term = { let field = Field(0); @@ -234,10 +235,10 @@ mod tests { let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", STRING); let schema = schema_builder.build(); - + let seed: &[u32; 4] = &[1, 2, 3, 4]; let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed); - + let index = Index::create_in_ram(schema); let mut count_a = 0; let mut count_b = 0; @@ -266,18 +267,18 @@ mod tests { index }; } - + #[bench] fn bench_segment_postings(b: &mut Bencher) { let searcher = INDEX.searcher(); let segment_reader = searcher.segment_reader(0); - + b.iter(|| { let mut segment_postings = segment_reader.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq).unwrap(); while segment_postings.advance() {} }); - } - + } + #[bench] fn bench_segment_intersection(b: &mut Bencher) { let searcher = INDEX.searcher(); @@ -288,5 +289,51 @@ mod tests { let mut intersection = IntersectionDocSet::from(vec!(segment_postings_a, segment_postings_b)); while intersection.advance() {} }); - } + } + + fn bench_skip_next(p: f32, b: &mut Bencher) { + let searcher = INDEX.searcher(); + let segment_reader = searcher.segment_reader(0); + let docs = tests::sample(segment_reader.num_docs(), p); + + let mut segment_postings = segment_reader.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq).unwrap(); + let mut existing_docs = Vec::new(); + for doc in &docs { + if *doc >= segment_postings.doc() { + existing_docs.push(*doc); + if segment_postings.skip_next(*doc) == SkipResult::End { + break; + } + } + } + + b.iter(|| { + let mut segment_postings = segment_reader.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq).unwrap(); + for doc in &existing_docs { + if segment_postings.skip_next(*doc) == SkipResult::End { + break; + } + } + }); + } + + #[bench] + fn bench_skip_next_p01(b: &mut Bencher) { + bench_skip_next(0.001, b); + } + + #[bench] + fn bench_skip_next_p1(b: &mut Bencher) { + bench_skip_next(0.01, b); + } + + #[bench] + fn 
bench_skip_next_p10(b: &mut Bencher) { + bench_skip_next(0.1, b); + } + + #[bench] + fn bench_skip_next_p90(b: &mut Bencher) { + bench_skip_next(0.9, b); + } } From e21913ecdc894f66716f2f52b0949f7d7684d7d6 Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Sun, 14 May 2017 20:57:16 +0300 Subject: [PATCH 20/51] Use binary search for SegmentPostings::skip_next --- src/postings/segment_postings.rs | 89 +++++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 1 deletion(-) diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index ff11a74e4..a1efe1242 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -1,6 +1,6 @@ use compression::{NUM_DOCS_PER_BLOCK, BlockDecoder, VIntDecoder}; use DocId; -use postings::{Postings, FreqHandler, DocSet, HasLen}; +use postings::{Postings, FreqHandler, DocSet, HasLen, SkipResult}; use std::num::Wrapping; use fastfield::DeleteBitSet; @@ -14,6 +14,7 @@ const EMPTY_DATA: [u8; 0] = [0u8; 0]; /// Positions on the other hand, are optionally entirely decoded upfront. 
pub struct SegmentPostings<'a> { len: usize, + block_len: usize, doc_offset: u32, block_decoder: BlockDecoder, freq_handler: FreqHandler, @@ -30,10 +31,12 @@ impl<'a> SegmentPostings<'a> { .uncompress_block_sorted(self.remaining_data, self.doc_offset); self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data); self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1); + self.block_len = NUM_DOCS_PER_BLOCK; } else { self.remaining_data = self.block_decoder .uncompress_vint_sorted(self.remaining_data, self.doc_offset, num_remaining_docs); self.freq_handler.read_freq_vint(self.remaining_data, num_remaining_docs); + self.block_len = num_remaining_docs; } } @@ -49,6 +52,7 @@ impl<'a> SegmentPostings<'a> { freq_handler: FreqHandler) -> SegmentPostings<'a> { SegmentPostings { len: len as usize, + block_len: len as usize, doc_offset: 0, block_decoder: BlockDecoder::new(), freq_handler: freq_handler, @@ -62,6 +66,7 @@ impl<'a> SegmentPostings<'a> { pub fn empty() -> SegmentPostings<'static> { SegmentPostings { len: 0, + block_len: 0, doc_offset: 0, block_decoder: BlockDecoder::new(), freq_handler: FreqHandler::new_without_freq(), @@ -76,6 +81,13 @@ impl<'a> SegmentPostings<'a> { fn index_within_block(&self) -> usize { self.cur.0 % NUM_DOCS_PER_BLOCK } + + /// Sets the current position to a location relative + /// to the current block + #[inline] + fn set_within_block(&mut self, inner_pos: usize) { + self.cur = Wrapping(self.cur.0 & !(NUM_DOCS_PER_BLOCK - 1)) + Wrapping(inner_pos) + } } @@ -98,6 +110,81 @@ impl<'a> DocSet for SegmentPostings<'a> { } } + fn skip_next(&mut self, target: DocId) -> SkipResult { + if !self.advance() { + return SkipResult::End; + } + + let mut pos = self.index_within_block(); + // skip blocks until one that might contain the target + loop { + // check if we need to go to the next block + if target > self.block_decoder.output(self.block_len - 1) { + self.cur += Wrapping(self.block_len - pos); + self.load_next_block(); + 
pos = 0; + + // there was no more data + if self.cur.0 == self.len { + return SkipResult::End; + } + } else if target < self.block_decoder.output(pos) { + // We've overpassed the target after the first `advance` call + // or we're at the beginning of a block. + // Either way, we're on the first `DocId` greater than `target` + return SkipResult::OverStep; + } else { + break; + } + } + + debug_assert!(target >= self.block_decoder.output(pos)); + debug_assert!(target <= self.block_decoder.output(self.block_len - 1)); + + // we're in the right block, do a binary search + let mut start = pos; + let mut count = self.block_len - start; + while count > 0 { + let step = count / 2; + let mid = start + step; + let doc = self.block_decoder.output(mid); + if doc < target { + start = mid + 1; + count -= step + 1; + } else if doc > target { + count = step; + } else { + self.set_within_block(mid); + + if !self.delete_bitset.is_deleted(doc) { + return SkipResult::Reached; + } + + if self.advance() { + return SkipResult::OverStep; + } + return SkipResult::End; + } + } + + // `doc` is now >= `target` + let doc = self.block_decoder.output(start); + self.set_within_block(start); + + if !self.delete_bitset.is_deleted(doc) { + if doc == target { + return SkipResult::Reached; + } else { + return SkipResult::OverStep; + } + } + + if self.advance() { + return SkipResult::OverStep; + } + return SkipResult::End; + } + #[inline] fn doc(&self) -> DocId { self.block_decoder.output(self.index_within_block()) From 55905377398385ddcd82fa98a00645406f1f5b0e Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Mon, 15 May 2017 21:18:06 +0300 Subject: [PATCH 21/51] Disable early exit --- src/postings/segment_postings.rs | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 72decad88..ad1e2715a 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -157,19 +157,8 @@ 
impl<'a> DocSet for SegmentPostings<'a> { if doc < target { start = mid + 1; count -= step + 1; - } else if doc > target { - count = step; } else { - self.set_within_block(mid); - - if !self.delete_bitset.is_deleted(doc) { - return SkipResult::Reached; - } - - if self.advance() { - return SkipResult::OverStep; - } - return SkipResult::End; + count = step; } } From 1dabe263955dba7f853d77a437f8d31f9731a8b4 Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Mon, 15 May 2017 21:26:28 +0300 Subject: [PATCH 22/51] Add comment about block_len --- src/postings/segment_postings.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index ad1e2715a..6fa370549 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -14,6 +14,8 @@ const EMPTY_DATA: [u8; 0] = [0u8; 0]; /// Positions on the other hand, are optionally entirely decoded upfront. pub struct SegmentPostings<'a> { len: usize, + // Removing this makes the code slower + // See https://github.com/tantivy-search/tantivy/issues/89 block_len: usize, doc_offset: u32, block_decoder: BlockDecoder, From 3dde748b254161e282fc4659632ea37d0d114242 Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Mon, 15 May 2017 22:41:19 +0300 Subject: [PATCH 23/51] Make rustfmt happy --- examples/simple_search.rs | 52 +++++++++++++++++--------- src/core/index.rs | 7 +++- src/core/mod.rs | 2 +- src/core/segment_reader.rs | 12 +++--- src/datastruct/stacker/heap.rs | 3 +- src/directory/mmap_directory.rs | 32 +++++++++------- src/directory/ram_directory.rs | 40 ++++++++++---------- src/fastfield/mod.rs | 2 +- src/fastfield/writer.rs | 2 +- src/indexer/index_writer.rs | 8 ++-- src/indexer/merger.rs | 3 +- src/indexer/segment_updater.rs | 23 ++++++++---- src/indexer/segment_writer.rs | 2 +- src/lib.rs | 8 +++- src/postings/mod.rs | 3 +- src/postings/serializer.rs | 2 +- src/query/query.rs | 3 +- src/query/query_parser/query_parser.rs | 28 
+++++++++----- src/schema/field_entry.rs | 8 ++-- src/schema/field_type.rs | 22 ++++++----- src/schema/mod.rs | 29 +++++++------- src/schema/schema.rs | 4 +- src/schema/text_options.rs | 3 +- src/schema/value.rs | 4 +- src/store/mod.rs | 9 ++++- 25 files changed, 188 insertions(+), 123 deletions(-) diff --git a/examples/simple_search.rs b/examples/simple_search.rs index f62cf3e47..0d35f0e42 100644 --- a/examples/simple_search.rs +++ b/examples/simple_search.rs @@ -1,6 +1,9 @@ extern crate tantivy; extern crate tempdir; +#[macro_use] +extern crate serde_json; + use std::path::Path; use tempdir::TempDir; use tantivy::Index; @@ -63,8 +66,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { // // This will actually just save a meta.json // with our schema in the directory. - let index = try!(Index::create(index_path, schema.clone())); - + let index = Index::create(index_path, schema.clone())?; // To insert document we need an index writer. @@ -74,7 +76,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { // // Here we use a buffer of 50MB per thread. Using a bigger // heap for the indexer can increase its throughput. - let mut index_writer = try!(index.writer(50_000_000)); + let mut index_writer = index.writer(50_000_000)?; // Let's index our documents! // We first need a handle on the title and the body field. @@ -98,23 +100,37 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { // ### Create a document directly from json. // - // Alternatively, we can use our schema to parse - // a document object directly from json. - - let mice_and_men_doc = try!(schema.parse_document(r#"{ + // Alternatively, we can use our schema to parse a + // document object directly from json. + // The document is a string, but we use the `json` macro + // from `serde_json` for the convenience of multi-line support. 
+ let json = json!({ "title": "Of Mice and Men", - "body": "few miles south of Soledad, the Salinas River drops in close to the hillside bank and runs deep and green. The water is warm too, for it has slipped twinkling over the yellow sands in the sunlight before reaching the narrow pool. On one side of the river the golden foothill slopes curve up to the strong and rocky Gabilan Mountains, but on the valley side the water is lined with trees—willows fresh and green with every spring, carrying in their lower leaf junctures the debris of the winter’s flooding; and sycamores with mottled, white,recumbent limbs and branches that arch over the pool" - }"#)); + "body": "A few miles south of Soledad, the Salinas River drops in close to the hillside \ + bank and runs deep and green. The water is warm too, for it has slipped twinkling \ + over the yellow sands in the sunlight before reaching the narrow pool. On one \ + side of the river the golden foothill slopes curve up to the strong and rocky \ + Gabilan Mountains, but on the valley side the water is lined with trees—willows \ + fresh and green with every spring, carrying in their lower leaf junctures the \ + debris of the winter’s flooding; and sycamores with mottled, white, recumbent \ + limbs and branches that arch over the pool" + }); + let mice_and_men_doc = schema.parse_document(&json.to_string())?; index_writer.add_document(mice_and_men_doc); // Multi-valued field are allowed, they are // expressed in JSON by an array. // The following document has two titles. - let frankenstein_doc = try!(schema.parse_document(r#"{ - "title": ["Frankenstein", "The Modern Promotheus"], - "body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking." 
- }"#)); + let json = json!({ + "title": ["Frankenstein", "The Modern Prometheus"], + "body": "You will rejoice to hear that no disaster has accompanied the commencement of an \ + enterprise which you have regarded with such evil forebodings. I arrived here \ + yesterday, and my first task is to assure my dear sister of my welfare and \ + increasing confidence in the success of my undertaking." + }); + let frankenstein_doc = schema.parse_document(&json.to_string())?; + index_writer.add_document(frankenstein_doc); // This is an example, so we will only index 3 documents @@ -135,7 +151,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { // the existence of new documents. // // This call is blocking. - try!(index_writer.commit()); + index_writer.commit()?; // If `.commit()` returns correctly, then all of the // documents that have been added are guaranteed to be @@ -151,7 +167,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { // Let's search our index. Start by reloading // searchers in the index. This should be done // after every commit(). - try!(index.load_searchers()); + index.load_searchers()?; // Afterwards create one (or more) searchers. // @@ -168,7 +184,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { // QueryParser may fail if the query is not in the right // format. For user facing applications, this can be a problem. // A ticket has been opened regarding this problem. - let query = try!(query_parser.parse_query("sea whale")); + let query = query_parser.parse_query("sea whale")?; // A query defines a set of documents, as @@ -186,7 +202,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { let mut top_collector = TopCollector::with_limit(10); // We can now perform our query. - try!(searcher.search(&*query, &mut top_collector)); + searcher.search(&*query, &mut top_collector)?; // Our top collector now contains the 10 // most relevant doc ids... 
@@ -200,7 +216,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { // a title. for doc_address in doc_addresses { - let retrieved_doc = try!(searcher.doc(&doc_address)); + let retrieved_doc = searcher.doc(&doc_address)?; println!("{}", schema.to_json(&retrieved_doc)); } diff --git a/src/core/index.rs b/src/core/index.rs index 0e0fdcf76..e43cfc7da 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -48,8 +48,11 @@ impl Index { /// This should only be used for unit tests. pub fn create_in_ram(schema: Schema) -> Index { let ram_directory = RAMDirectory::create(); - let directory = ManagedDirectory::new(ram_directory).expect("Creating a managed directory from a brand new RAM directory should never fail."); - Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail") // unwrap is ok here + // unwrap is ok here + let directory = ManagedDirectory::new(ram_directory) + .expect("Creating a managed directory from a brand new RAM directory \ + should never fail."); + Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail") } /// Creates a new index in a given filepath. diff --git a/src/core/mod.rs b/src/core/mod.rs index 719246f9a..6f29254d9 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -27,7 +27,7 @@ lazy_static! { /// The meta file contains all the information about the list of segments and the schema /// of the index. pub static ref META_FILEPATH: PathBuf = PathBuf::from("meta.json"); - + /// The managed file contains a list of files that were created by the tantivy /// and will therefore be garbage collected when they are deemed useless by tantivy. /// diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index f34acb4d8..ab947a93c 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -108,7 +108,8 @@ impl SegmentReader { /// Accessor to the segment's `Field norms`'s reader. /// /// Field norms are the length (in tokens) of the fields. 
- /// It is used in the computation of the [TfIdf](https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html). + /// It is used in the computation of the [TfIdf] + /// (https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html). /// /// They are simply stored as a fast field, serialized in /// the `.fieldnorm` file of the segment. @@ -190,8 +191,9 @@ impl SegmentReader { /// the requested options, the returned `SegmentPostings` the method does not fail /// and returns a `SegmentPostings` with as much information as possible. /// - /// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a `TextIndexingOptions` - /// that does not index position will return a `SegmentPostings` with `DocId`s and frequencies. + /// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a + /// `TextIndexingOptions` that does not index position will return a `SegmentPostings` + /// with `DocId`s and frequencies. pub fn read_postings(&self, term: &Term, option: SegmentPostingsOption) @@ -215,8 +217,8 @@ impl SegmentReader { } SegmentPostingsOption::FreqAndPositions => { if indexing_options == TextIndexingOptions::TokenizedWithFreqAndPosition { - let offseted_position_data = &self.positions_data[term_info.positions_offset as - usize..]; + let offset = term_info.positions_offset as usize; + let offseted_position_data = &self.positions_data[offset..]; FreqHandler::new_with_freq_and_position(offseted_position_data) } else if indexing_options.is_termfreq_enabled() { FreqHandler::new_with_freq() diff --git a/src/datastruct/stacker/heap.rs b/src/datastruct/stacker/heap.rs index a38a24d10..f7ea070f7 100644 --- a/src/datastruct/stacker/heap.rs +++ b/src/datastruct/stacker/heap.rs @@ -142,7 +142,8 @@ impl InnerHeap { addr } else { if self.next_heap.is_none() { - warn!("Exceeded heap size. The margin was apparently unsufficient. The segment will be committed right after indexing this very last document.",); + warn!("Exceeded heap size. 
The margin was apparently unsufficient. The segment \ + will be committed right after indexing this very last document."); self.next_heap = Some(Box::new(InnerHeap::with_capacity(self.buffer_len as usize))); } self.next_heap.as_mut().unwrap().allocate_space(num_bytes) + self.buffer_len diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs index a64ff715c..16b180845 100644 --- a/src/directory/mmap_directory.rs +++ b/src/directory/mmap_directory.rs @@ -270,9 +270,12 @@ impl Directory for MmapDirectory { let mut mmap_cache = self.mmap_cache .write() - .map_err(|_| OpenReadError::IOError( - make_io_err(format!("Failed to acquired write lock on mmap cache while reading {:?}", path)) - ))?; + .map_err(|_| { + let msg = format!("Failed to acquired write lock \ + on mmap cache while reading {:?}", + path); + OpenReadError::IOError(make_io_err(msg)) + })?; Ok(mmap_cache .get_mmap(full_path)? @@ -290,12 +293,12 @@ impl Directory for MmapDirectory { .create_new(true) .open(full_path); - let mut file = try!(open_res.map_err(|err| if err.kind() == - io::ErrorKind::AlreadyExists { - OpenWriteError::FileAlreadyExists(PathBuf::from(path)) - } else { - OpenWriteError::IOError(err) - })); + let mut file = open_res + .map_err(|err| if err.kind() == io::ErrorKind::AlreadyExists { + OpenWriteError::FileAlreadyExists(PathBuf::from(path)) + } else { + OpenWriteError::IOError(err) + })?; // making sure the file is created. 
try!(file.flush()); @@ -311,11 +314,14 @@ impl Directory for MmapDirectory { fn delete(&self, path: &Path) -> result::Result<(), DeleteError> { debug!("Deleting file {:?}", path); let full_path = self.resolve_path(path); - let mut mmap_cache = try!(self.mmap_cache + let mut mmap_cache = self.mmap_cache .write() - .map_err(|_| - DeleteError::IOError(make_io_err(format!("Failed to acquired write lock on mmap cache while deleting {:?}", path)))) - ); + .map_err(|_| { + let msg = format!("Failed to acquired write lock \ + on mmap cache while deleting {:?}", + path); + DeleteError::IOError(make_io_err(msg)) + })?; // Removing the entry in the MMap cache. // The munmap will appear on Drop, // when the last reference is gone. diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index 4a1df936e..656eb739d 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -93,16 +93,17 @@ impl InnerDirectory { self.0 .read() .map_err(|_| { - let io_err = make_io_err(format!("Failed to acquire read lock for the directory, when trying to read {:?}", path)); - OpenReadError::IOError(io_err) - }) + let msg = format!("Failed to acquire read lock for the \ + directory when trying to read {:?}", + path); + let io_err = make_io_err(msg); + OpenReadError::IOError(io_err) + }) .and_then(|readable_map| { readable_map - .get(path) - .ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path))) - .map(|data| { - ReadOnlySource::Anonymous(SharedVecSlice::new(data.clone())) - }) + .get(path) + .ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path))) + .map(|data| ReadOnlySource::Anonymous(SharedVecSlice::new(data.clone()))) }) } @@ -110,19 +111,16 @@ impl InnerDirectory { self.0 .write() .map_err(|_| { - let io_err = make_io_err(format!("Failed to acquire write lock for the directory, when trying to delete {:?}", path)); - DeleteError::IOError(io_err) - }) - .and_then(|mut writable_map| { - match writable_map.remove(path) { - 
Some(_) => { - Ok(()) - }, - None => { - Err(DeleteError::FileDoesNotExist(PathBuf::from(path))) - } - } - }) + let msg = format!("Failed to acquire write lock for the \ + directory when trying to delete {:?}", + path); + let io_err = make_io_err(msg); + DeleteError::IOError(io_err) + }) + .and_then(|mut writable_map| match writable_map.remove(path) { + Some(_) => Ok(()), + None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))), + }) } fn exists(&self, path: &Path) -> bool { diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 71f44ba85..d470f041f 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -57,7 +57,7 @@ mod tests { schema_builder.add_u64_field("field", FAST); schema_builder.build() }; - static ref FIELD: Field = { + static ref FIELD: Field = { SCHEMA.get_field("field").unwrap() }; } diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 55db0419f..a827c7f36 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -170,7 +170,7 @@ impl IntFastFieldWriter { _ => panic!("Expected a u64field, got {:?} ", v), } } - None => self.val_if_missing, + None => self.val_if_missing, } } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index d6b20d7bf..1c8afa8f7 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -35,13 +35,14 @@ use super::segment_updater::SegmentUpdater; use std::thread; // Size of the margin for the heap. A segment is closed when the remaining memory -// in the heap goes below MARGIN_IN_BYTES. +// in the heap goes below `MARGIN_IN_BYTES`. pub const MARGIN_IN_BYTES: u32 = 10_000_000u32; // We impose the memory per thread to be at least 30 MB. 
pub const HEAP_SIZE_LIMIT: u32 = MARGIN_IN_BYTES * 3u32; -// Add document will block if the number of docs waiting in the queue to be indexed reaches PIPELINE_MAX_SIZE_IN_DOCS +// Add document will block if the number of docs waiting in the queue to be indexed +// reaches `PIPELINE_MAX_SIZE_IN_DOCS` const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000; type DocumentSender = chan::Sender; @@ -595,7 +596,8 @@ mod tests { let index = Index::create_in_ram(schema_builder.build()); let index_writer = index.writer(40_000_000).unwrap(); assert_eq!(format!("{:?}", index_writer.get_merge_policy()), - "LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, level_log_size: 0.75 }"); + "LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \ + level_log_size: 0.75 }"); let merge_policy = box NoMergePolicy::default(); index_writer.set_merge_policy(merge_policy); assert_eq!(format!("{:?}", index_writer.get_merge_policy()), diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 4bb52587a..154cc2780 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -218,7 +218,8 @@ impl IndexMerger { // segment are stacked so that : // - Segment 0's doc ids become doc id [0, seg.max_doc] // - Segment 1's doc ids become [seg0.max_doc, seg0.max_doc + seg.max_doc] - // - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc, seg0.max_doc + seg1.max_doc + seg2.max_doc] + // - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc, + // seg0.max_doc + seg1.max_doc + seg2.max_doc] // ... 
let term = merged_terms.term(); let mut term_written = false; diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 2a7108ff7..6c4548618 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -117,9 +117,10 @@ fn perform_merge(segment_ids: &[SegmentId], } segment_entries.push(segment_entry); } else { - error!("Error, had to abort merge as some of the segment is not managed anymore.a"); - return Err(Error::InvalidArgument(format!("Segment {:?} requested for merge is not managed.", - segment_id))); + error!("Error, had to abort merge as some of the segment is not managed anymore."); + let msg = format!("Segment {:?} requested for merge is not managed.", + segment_id); + return Err(Error::InvalidArgument(msg)); } } @@ -390,24 +391,30 @@ impl SegmentUpdater { if let Some(delete_operation) = delete_cursor.get() { let committed_opstamp = segment_updater.0.index.opstamp(); if delete_operation.opstamp < committed_opstamp { - let segment = segment_updater.0.index.segment(after_merge_segment_entry.meta().clone()); - match advance_deletes(segment, &mut after_merge_segment_entry, committed_opstamp) { + let index = &segment_updater.0.index; + let segment = index.segment(after_merge_segment_entry.meta().clone()); + match advance_deletes(segment, + &mut after_merge_segment_entry, + committed_opstamp) { Ok(file_protection_opt_res) => { _file_protection_opt = file_protection_opt_res; } Err(e) => { - error!("Merge of {:?} was cancelled (advancing deletes failed): {:?}", before_merge_segment_ids, e); + error!("Merge of {:?} was cancelled (advancing deletes failed): {:?}", + before_merge_segment_ids, e); // ... 
cancel merge if cfg!(test) { panic!("Merge failed."); } - segment_updater.cancel_merge(&before_merge_segment_ids, after_merge_segment_entry.segment_id()); + segment_updater.cancel_merge(&before_merge_segment_ids, + after_merge_segment_entry.segment_id()); return; } } } } - segment_updater.0.segment_manager.end_merge(&before_merge_segment_ids, after_merge_segment_entry); + segment_updater.0.segment_manager.end_merge(&before_merge_segment_ids, + after_merge_segment_entry); segment_updater.consider_merge_options(); segment_updater.save_metas(segment_updater.0.index.opstamp()); }).wait() diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index c56b1131c..871cd64ac 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -62,7 +62,7 @@ fn posting_from_field_entry<'a>(field_entry: &FieldEntry, } _ => SpecializedPostingsWriter::::new_boxed(heap), } - } + } FieldType::U64(_) => SpecializedPostingsWriter::::new_boxed(heap), FieldType::I64(_) => SpecializedPostingsWriter::::new_boxed(heap), } diff --git a/src/lib.rs b/src/lib.rs index e13ad28c9..db9f277dc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,7 +20,9 @@ //! Think `Lucene`, but in Rust. //! //! A good place for you to get started is to check out -//! the example code ( [literate programming](http://fulmicoton.com/tantivy-examples/simple_search.html) / [source code](https://github.com/fulmicoton/tantivy/blob/master/examples/simple_search.rs)) +//! the example code ( +//! [literate programming](http://fulmicoton.com/tantivy-examples/simple_search.html) / +//! 
[source code](https://github.com/fulmicoton/tantivy/blob/master/examples/simple_search.rs)) #[macro_use] extern crate lazy_static; @@ -701,7 +703,9 @@ mod tests { let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", TEXT); let other_text_field = schema_builder.add_text_field("text2", TEXT); - let document = doc!(text_field => "tantivy", text_field => "some other value", other_text_field => "short"); + let document = doc!(text_field => "tantivy", + text_field => "some other value", + other_text_field => "short"); assert_eq!(document.len(), 3); let values = document.get_all(text_field); assert_eq!(values.len(), 2); diff --git a/src/postings/mod.rs b/src/postings/mod.rs index de592576a..fd3fd869f 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -85,8 +85,9 @@ mod tests { .unwrap(); { let mut doc = Document::default(); + // checking that position works if the field has two values doc.add_text(text_field, "a b a c a d a a."); - doc.add_text(text_field, "d d d d a"); // checking that position works if the field has two values. + doc.add_text(text_field, "d d d d a"); let op = AddOperation { opstamp: 0u64, document: doc, diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 9941ab848..ab941d983 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -122,7 +122,7 @@ impl PostingsSerializer { } else { TextIndexingOptions::Untokenized } - } + } }; } diff --git a/src/query/query.rs b/src/query/query.rs index ad530120f..c73ea01d2 100644 --- a/src/query/query.rs +++ b/src/query/query.rs @@ -33,7 +33,8 @@ use std::any::Any; /// - a `Weight` is this recipe tied to a specific `Searcher`. It may for instance /// hold statistics about the different term of the query. It is created by the query. /// - a `Scorer` is a cursor over the set of matching documents, for a specific -/// [`SegmentReader`](../struct.SegmentReader.html). 
It is created by the [`Weight`](./trait.Weight.html). +/// [`SegmentReader`](../struct.SegmentReader.html). It is created by the +/// [`Weight`](./trait.Weight.html). /// /// When implementing a new type of `Query`, it is normal to implement a /// dedicated `Query`, `Weight` and `Scorer`. diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 49fa5d3cd..f9d204d1d 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -273,7 +273,7 @@ fn compose_occur(left: Occur, right: Occur) -> Occur { } else { Occur::MustNot } - } + } } } @@ -374,7 +374,8 @@ mod test { #[test] pub fn test_parse_query_untokenized() { test_parse_query_to_logical_ast_helper("nottokenized:\"wordone wordtwo\"", - "Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, 101, 32, 119, 111, 114, 100, 116, 119, 111])", + "Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, \ + 101, 32, 119, 111, 114, 100, 116, 119, 111])", false); } @@ -417,18 +418,21 @@ mod test { "Term([0, 0, 0, 0, 116, 111, 116, 111])", false); test_parse_query_to_logical_ast_helper("+title:toto -titi", - "(+Term([0, 0, 0, 0, 116, 111, 116, 111]) -(Term([0, 0, 0, 0, 116, \ - 105, 116, 105]) Term([0, 0, 0, 1, 116, 105, 116, 105])))", + "(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \ + -(Term([0, 0, 0, 0, 116, 105, 116, 105]) \ + Term([0, 0, 0, 1, 116, 105, 116, 105])))", false); assert_eq!(parse_query_to_logical_ast("-title:toto", false) .err() .unwrap(), QueryParserError::AllButQueryForbidden); test_parse_query_to_logical_ast_helper("title:a b", - "(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) Term([0, 0, 0, 1, 98])))", + "(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) \ + Term([0, 0, 0, 1, 98])))", false); test_parse_query_to_logical_ast_helper("title:\"a b\"", - "\"[Term([0, 0, 0, 0, 97]), Term([0, 0, 0, 0, 98])]\"", + "\"[Term([0, 0, 0, 0, 97]), \ + Term([0, 0, 0, 0, 98])]\"", false); } @@ -441,18 +445,22 @@ mod test { "Term([0, 0, 0, 0, 116, 111, 116, 
111])", true); test_parse_query_to_logical_ast_helper("+title:toto -titi", - "(+Term([0, 0, 0, 0, 116, 111, 116, 111]) -(Term([0, 0, 0, 0, 116, \ - 105, 116, 105]) Term([0, 0, 0, 1, 116, 105, 116, 105])))", + "(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \ + -(Term([0, 0, 0, 0, 116, 105, 116, 105]) \ + Term([0, 0, 0, 1, 116, 105, 116, 105])))", true); assert_eq!(parse_query_to_logical_ast("-title:toto", true) .err() .unwrap(), QueryParserError::AllButQueryForbidden); test_parse_query_to_logical_ast_helper("title:a b", - "(+Term([0, 0, 0, 0, 97]) +(Term([0, 0, 0, 0, 98]) Term([0, 0, 0, 1, 98])))", + "(+Term([0, 0, 0, 0, 97]) \ + +(Term([0, 0, 0, 0, 98]) \ + Term([0, 0, 0, 1, 98])))", true); test_parse_query_to_logical_ast_helper("title:\"a b\"", - "\"[Term([0, 0, 0, 0, 97]), Term([0, 0, 0, 0, 98])]\"", + "\"[Term([0, 0, 0, 0, 97]), \ + Term([0, 0, 0, 0, 98])]\"", true); } } diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index 1df77f5cc..4f658ac33 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -159,7 +159,9 @@ impl<'de> Deserialize<'de> for FieldEntry { Field::Options => { match ty { None => { - return Err(de::Error::custom("The `type` field must be specified before `options`",),) + let msg = "The `type` field must be \ + specified before `options`"; + return Err(de::Error::custom(msg)); } Some(ty) => { match ty { @@ -173,8 +175,8 @@ impl<'de> Deserialize<'de> for FieldEntry { field_type = Some(FieldType::I64(map.next_value()?)) } _ => { - return Err(de::Error::custom(format!("Unrecognised type {}", - ty))) + let msg = format!("Unrecognised type {}", ty); + return Err(de::Error::custom(msg)); } } } diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 2d3ec0bb3..8f8e4046e 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -10,7 +10,8 @@ use schema::Value; pub enum ValueParsingError { /// Encounterred a numerical value that overflows or underflow its integer type. 
OverflowError(String), - /// The json node is not of the correct type. (e.g. 3 for a `Str` type or `"abc"` for a u64 type) + /// The json node is not of the correct type. + /// (e.g. 3 for a `Str` type or `"abc"` for a u64 type) /// Tantivy will try to autocast values. TypeError(String), } @@ -61,28 +62,29 @@ impl FieldType { if let Some(field_val_i64) = field_val_num.as_i64() { Ok(Value::I64(field_val_i64)) } else { - Err(ValueParsingError::OverflowError(format!("Expected an i64 int, got {:?}", - json))) + let msg = format!("Expected an i64 int, got {:?}", json); + Err(ValueParsingError::OverflowError(msg)) } } FieldType::U64(_) => { if let Some(field_val_u64) = field_val_num.as_u64() { Ok(Value::U64(field_val_u64)) } else { - Err(ValueParsingError::OverflowError(format!("Expected an u64 int, got {:?}", - json))) + let msg = format!("Expected a u64 int, got {:?}", json); + Err(ValueParsingError::OverflowError(msg)) } } FieldType::Str(_) => { - Err(ValueParsingError::TypeError(format!("Expected a string, got {:?}", - json))) + let msg = format!("Expected a string, got {:?}", json); + Err(ValueParsingError::TypeError(msg)) } } } _ => { - Err(ValueParsingError::TypeError(format!("Json value not supported error {:?}. Expected {:?}", - json, - self))) + let msg = format!("Json value not supported error {:?}. Expected {:?}", + json, + self); + Err(ValueParsingError::TypeError(msg)) } } } diff --git a/src/schema/mod.rs b/src/schema/mod.rs index daa3f15b4..0c99b0eb5 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -4,12 +4,12 @@ # Schema definition Tantivy has a very strict schema. -The schema defines information about the fields your index contains, that is, for each field : +The schema defines information about the fields your index contains, that is, for each field: * the field name (may only contain letters `[a-zA-Z]`, number `[0-9]`, and `_`) * the type of the field (currently only `text` and `u64` are supported) * how the field should be indexed / stored. 
- + This very last point is critical as it will enable / disable some of the functionality for your index. @@ -38,15 +38,18 @@ let schema = schema_builder.build(); We can split the problem of generating a search result page into two phases : * identifying the list of 10 or so documents to be displayed (Conceptually `query -> doc_ids[]`) -* for each of these documents, retrieving the information required to generate the serp page. (`doc_ids[] -> Document[]`) +* for each of these documents, retrieving the information required to generate the serp page. + (`doc_ids[] -> Document[]`) -In the first phase, the ability to search for documents by the given field is determined by the [`TextIndexingOptions`](enum.TextIndexingOptions.html) of our -[`TextOptions`](struct.TextOptions.html). +In the first phase, the ability to search for documents by the given field is determined by the +[`TextIndexingOptions`](enum.TextIndexingOptions.html) of our [`TextOptions`] +(struct.TextOptions.html). -The effect of each possible setting is described more in detail [`TextIndexingOptions`](enum.TextIndexingOptions.html). +The effect of each possible setting is described more in detail [`TextIndexingOptions`] +(enum.TextIndexingOptions.html). -On the other hand setting the field as stored or not determines whether the field should be returned when [`searcher.doc(doc_address)`](../struct.Searcher.html#method.doc) -is called. +On the other hand setting the field as stored or not determines whether the field should be returned +when [`searcher.doc(doc_address)`](../struct.Searcher.html#method.doc) is called. 
### Shortcuts @@ -60,7 +63,7 @@ use tantivy::schema::*; let mut schema_builder = SchemaBuilder::default(); schema_builder.add_text_field("title_options", TEXT | STORED); let schema = schema_builder.build(); -``` +``` @@ -80,13 +83,13 @@ let schema = schema_builder.build(); Just like for Text fields (see above), setting the field as stored defines whether the field will be -returned when [`searcher.doc(doc_address)`](../struct.Searcher.html#method.doc) is called, +returned when [`searcher.doc(doc_address)`](../struct.Searcher.html#method.doc) is called, and setting the field as indexed means that we will be able perform queries such as `num_stars:10`. -Note that unlike text fields, u64 can only be indexed in one way for the moment. +Note that unlike text fields, u64 can only be indexed in one way for the moment. This may change when we will start supporting range queries. -The `fast` option on the other hand is specific to u64 fields, and is only relevant -if you are implementing your own queries. This functionality is somewhat similar to Lucene's +The `fast` option on the other hand is specific to u64 fields, and is only relevant +if you are implementing your own queries. This functionality is somewhat similar to Lucene's `DocValues`. 
u64 that are indexed as fast will be stored in a special data structure that will diff --git a/src/schema/schema.rs b/src/schema/schema.rs index 62359f50d..871c11985 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -450,7 +450,7 @@ mod tests { "author": "fulmicoton", "count": 4, "popularity": 10, - "jambon": "bayonne" + "jambon": "bayonne" }"#); match json_err { Err(DocParsingError::NoSuchFieldInSchema(field_name)) => { @@ -467,7 +467,7 @@ mod tests { "author": "fulmicoton", "count": "5", "popularity": "10", - "jambon": "bayonne" + "jambon": "bayonne" }"#); match json_err { Err(DocParsingError::ValueError(_, ValueParsingError::TypeError(_))) => { diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs index 36e8fd993..472bd3e1e 100644 --- a/src/schema/text_options.rs +++ b/src/schema/text_options.rs @@ -70,7 +70,8 @@ pub enum TextIndexingOptions { TokenizedWithFreq, /// Like TokenizedWithFreq, but also encodes the positions of the /// terms in a separate file. This option is required for phrase queries. - /// Don't use this if you are certain you won't need it, the term positions file can be very big. + /// Don't use this if you are certain you won't need it, the term positions file + /// can be very big. 
#[serde(rename="position")] TokenizedWithFreqAndPosition, } diff --git a/src/schema/value.rs b/src/schema/value.rs index 7d89cec72..8e5ee4153 100644 --- a/src/schema/value.rs +++ b/src/schema/value.rs @@ -140,11 +140,11 @@ mod binary_serialize { Value::U64(ref val) => { written_size += try!(U64_CODE.serialize(writer)); written_size += try!(val.serialize(writer)); - } + } Value::I64(ref val) => { written_size += try!(I64_CODE.serialize(writer)); written_size += try!(val.serialize(writer)); - } + } } Ok(written_size) } diff --git a/src/store/mod.rs b/src/store/mod.rs index ff5a274bb..dd7fa9390 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -21,7 +21,14 @@ mod tests { let field_title = schema_builder .add_text_field("title", TextOptions::default().set_stored()); let schema = schema_builder.build(); - let lorem = String::from("Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.",); + let lorem = String::from("Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \ + do eiusmod tempor incididunt ut labore et dolore magna aliqua. \ + Ut enim ad minim veniam, quis nostrud exercitation ullamco \ + laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure \ + dolor in reprehenderit in voluptate velit esse cillum dolore eu \ + fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non \ + proident, sunt in culpa qui officia deserunt mollit anim id est \ + laborum."); { let mut store_writer = StoreWriter::new(writer); for i in 0..num_docs { From 2bf93e9e51875e3b651afd069c1acfe82aad3d21 Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Tue, 16 May 2017 00:30:06 +0300 Subject: [PATCH 24/51] Avoid rebuilding simdcomp when running tests --- build.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/build.rs b/build.rs index 5fea0b408..b97dd250a 100644 --- a/build.rs +++ b/build.rs @@ -46,6 +46,8 @@ mod build { if !cfg!(debug_assertions) && cfg!(target_env = "msvc") { println!("cargo:rustc-link-lib=dylib=simdcomp"); } + + println!("cargo:rerun-if-changed=cpp"); } } From f64ff774249e3d2f4a43d52823e9d3ffbdb01181 Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Mon, 15 May 2017 23:34:18 +0300 Subject: [PATCH 25/51] Use an exponential search --- src/postings/segment_postings.rs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 6fa370549..095cfc8d3 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -1,6 +1,7 @@ use compression::{NUM_DOCS_PER_BLOCK, BlockDecoder, VIntDecoder}; use DocId; use postings::{Postings, FreqHandler, DocSet, HasLen, SkipResult}; +use std::cmp; use std::num::Wrapping; use fastfield::DeleteBitSet; @@ -149,9 +150,23 @@ impl<'a> DocSet for SegmentPostings<'a> { debug_assert!(target >= self.block_decoder.output(pos)); debug_assert!(target <= self.block_decoder.output(self.block_len - 1)); - // we're in the right block, do a binary search + // we're in the right block now, start with an exponential search let mut start = pos; - let mut count = self.block_len - start; + let mut end = self.block_len; + let mut count = 1; + loop { + let new = start + count; + if new < end && self.block_decoder.output(new) < target { + start = new; + count *= 2; + } 
else { + break; + } + } + end = cmp::min(start + count, end); + + // now do a binary search + let mut count = end - start; while count > 0 { let step = count / 2; let mid = start + step; From 49dbe4722f7f684b5e710862afb8ac2fe8d43e68 Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Tue, 16 May 2017 10:53:22 +0300 Subject: [PATCH 26/51] Add a test for SegmentPostings::skip_len --- src/postings/mod.rs | 167 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 166 insertions(+), 1 deletion(-) diff --git a/src/postings/mod.rs b/src/postings/mod.rs index fd3fd869f..671efe0ae 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -36,7 +36,7 @@ pub use common::HasLen; mod tests { use super::*; - use schema::{Document, TEXT, STRING, SchemaBuilder, Term}; + use schema::{Document, INT_INDEXED, TEXT, STRING, SchemaBuilder, Term}; use core::SegmentComponent; use indexer::SegmentWriter; use core::SegmentReader; @@ -201,6 +201,171 @@ mod tests { assert_eq!(term_scorer.postings().positions(), &[1u32, 4]); } + #[test] + fn test_skip_next() { + let term_0 = Term::from_field_u64(Field(0), 0); + let term_1 = Term::from_field_u64(Field(0), 1); + let term_2 = Term::from_field_u64(Field(0), 2); + + let num_docs = 300u32; + + let index = { + let mut schema_builder = SchemaBuilder::default(); + let value_field = schema_builder.add_u64_field("value", INT_INDEXED); + let schema = schema_builder.build(); + + let index = Index::create_in_ram(schema); + { + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + for i in 0..num_docs { + let mut doc = Document::default(); + doc.add_u64(value_field, 2); + doc.add_u64(value_field, (i % 2) as u64); + + index_writer.add_document(doc); + } + assert!(index_writer.commit().is_ok()); + } + index.load_searchers().unwrap(); + + index + }; + let searcher = index.searcher(); + let segment_reader = searcher.segment_reader(0); + + // check that the basic usage works + for i in 0..num_docs - 1 { + for j in i + 
1..num_docs { + let mut segment_postings = segment_reader + .read_postings(&term_2, SegmentPostingsOption::NoFreq) + .unwrap(); + + assert_eq!(segment_postings.skip_next(i), SkipResult::Reached); + assert_eq!(segment_postings.doc(), i); + + assert_eq!(segment_postings.skip_next(j), SkipResult::Reached); + assert_eq!(segment_postings.doc(), j); + } + } + + { + let mut segment_postings = segment_reader + .read_postings(&term_2, SegmentPostingsOption::NoFreq) + .unwrap(); + + // check that `skip_next` advances the iterator + assert!(segment_postings.advance()); + assert_eq!(segment_postings.doc(), 0); + + assert_eq!(segment_postings.skip_next(1), SkipResult::Reached); + assert_eq!(segment_postings.doc(), 1); + + assert_eq!(segment_postings.skip_next(1), SkipResult::OverStep); + assert_eq!(segment_postings.doc(), 2); + + // check that going beyond the end is handled + assert_eq!(segment_postings.skip_next(num_docs), SkipResult::End); + } + + // check that filtering works + { + let mut segment_postings = segment_reader + .read_postings(&term_0, SegmentPostingsOption::NoFreq) + .unwrap(); + + for i in 0..num_docs / 2 { + assert_eq!(segment_postings.skip_next(i * 2), SkipResult::Reached); + assert_eq!(segment_postings.doc(), i * 2); + } + + let mut segment_postings = segment_reader + .read_postings(&term_0, SegmentPostingsOption::NoFreq) + .unwrap(); + + for i in 0..num_docs / 2 - 1 { + assert_eq!(segment_postings.skip_next(i * 2 + 1), SkipResult::OverStep); + assert_eq!(segment_postings.doc(), (i + 1) * 2); + } + } + + // delete some of the documents + { + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + index_writer.delete_term(term_0); + + assert!(index_writer.commit().is_ok()); + } + index.load_searchers().unwrap(); + + let searcher = index.searcher(); + let segment_reader = searcher.segment_reader(0); + + // make sure seeking still works + for i in 0..num_docs { + let mut segment_postings = segment_reader + .read_postings(&term_2, 
SegmentPostingsOption::NoFreq) + .unwrap(); + + if i % 2 == 0 { + assert_eq!(segment_postings.skip_next(i), SkipResult::OverStep); + assert_eq!(segment_postings.doc(), i + 1); + } else { + assert_eq!(segment_postings.skip_next(i), SkipResult::Reached); + assert_eq!(segment_postings.doc(), i); + } + } + + // now try with a longer sequence + { + let mut segment_postings = segment_reader + .read_postings(&term_2, SegmentPostingsOption::NoFreq) + .unwrap(); + + let mut last = 2; // start from 5 to avoid seeking to 3 twice + let mut cur = 3; + loop { + match segment_postings.skip_next(cur) { + SkipResult::End => break, + SkipResult::Reached => assert_eq!(segment_postings.doc(), cur), + SkipResult::OverStep => assert_eq!(segment_postings.doc(), cur + 1), + } + + let next = cur + last; + last = cur; + cur = next; + } + + assert_eq!(cur, 377); + } + + // delete everything else + { + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + index_writer.delete_term(term_1); + + assert!(index_writer.commit().is_ok()); + } + index.load_searchers().unwrap(); + + let searcher = index.searcher(); + let segment_reader = searcher.segment_reader(0); + + // finally, check that it's empty + { + let mut segment_postings = segment_reader + .read_postings(&term_2, SegmentPostingsOption::NoFreq) + .unwrap(); + + assert_eq!(segment_postings.skip_next(0), SkipResult::End); + + let mut segment_postings = segment_reader + .read_postings(&term_2, SegmentPostingsOption::NoFreq) + .unwrap(); + + assert_eq!(segment_postings.skip_next(num_docs), SkipResult::End); + } + } + #[test] fn test_intersection() { { From e5c7c0b8b9542ad8006b89c64f2095edc4703127 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 16 May 2017 21:13:33 +0900 Subject: [PATCH 27/51] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9c45eb005..03fd7e369 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ Tantivy 0.4.0 
========================== - Raise the limit of number of fields (previously 256 fields) - Removed u32 fields. They are replaced by u64 and i64 fields (#65) +- Optimized skip in SegmentPostings (#130) (@lnicola) - Replacing rustc_serialize by serde. Kudos to @KodrAus and @lnicola - QueryParser: - Explicit error returned when searched for a term that is not indexed From ac02c76b1e23019ec2a464bc846f59be26ab8990 Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Tue, 16 May 2017 19:13:00 +0300 Subject: [PATCH 28/51] clippy: fix doc_markdown warnings --- src/core/segment_id.rs | 4 ++-- src/core/segment_meta.rs | 2 +- src/fastfield/delete.rs | 2 +- src/fastfield/error.rs | 2 +- src/fastfield/reader.rs | 6 +++--- src/indexer/index_writer.rs | 2 +- src/indexer/log_merge_policy.rs | 2 +- src/indexer/merge_policy.rs | 4 ++-- src/indexer/mod.rs | 2 +- src/indexer/segment_entry.rs | 10 +++++----- src/query/occur_filter.rs | 2 +- src/query/scorer.rs | 2 +- src/query/term_query/term_query.rs | 6 +++--- 13 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/core/segment_id.rs b/src/core/segment_id.rs index 52978c152..b9612b46a 100644 --- a/src/core/segment_id.rs +++ b/src/core/segment_id.rs @@ -5,13 +5,13 @@ use std::cmp::{Ordering, Ord}; #[cfg(test)] use std::sync::atomic; -/// Tantivy SegmentId. +/// Tantivy `SegmentId`. /// /// Tantivy's segment are identified /// by a UUID which is used to prefix the filenames /// of all of the file associated with the segment. /// -/// In unit test, for reproducability, the SegmentId are +/// In unit test, for reproducibility, the `SegmentId` are /// simply generated in an autoincrement fashion. 
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct SegmentId(Uuid); diff --git a/src/core/segment_meta.rs b/src/core/segment_meta.rs index 5c9194e6e..623b22442 100644 --- a/src/core/segment_meta.rs +++ b/src/core/segment_meta.rs @@ -9,7 +9,7 @@ struct DeleteMeta { opstamp: u64, } -/// SegmentMeta contains simple meta information about a segment. +/// `SegmentMeta` contains simple meta information about a segment. /// /// For instance the number of docs it contains, /// how many are deleted, etc. diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs index 08ad41e8e..8923437c8 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/delete.rs @@ -6,7 +6,7 @@ use directory::ReadOnlySource; use DocId; use common::HasLen; -/// Write a delete BitSet +/// Write a delete `BitSet` /// /// where `delete_bitset` is the set of deleted `DocId`. pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io::Result<()> { diff --git a/src/fastfield/error.rs b/src/fastfield/error.rs index 88902833b..e28474702 100644 --- a/src/fastfield/error.rs +++ b/src/fastfield/error.rs @@ -1,7 +1,7 @@ use std::result; use schema::FieldEntry; -/// FastFieldNotAvailableError is returned when the +/// `FastFieldNotAvailableError` is returned when the /// user requested for a fast field reader, and the field was not /// defined in the schema as a fast field. #[derive(Debug)] diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 74b69cb9e..ad281e377 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -37,7 +37,7 @@ pub trait FastFieldReader: Sized { fn is_enabled(field_type: &FieldType) -> bool; } -/// FastFieldReader for unsigned 64-bits integers. +/// `FastFieldReader` for unsigned 64-bits integers. pub struct U64FastFieldReader { _data: ReadOnlySource, bit_unpacker: BitUnpacker, @@ -133,7 +133,7 @@ impl From> for U64FastFieldReader { } } -/// FastFieldReader for signed 64-bits integers. 
+/// `FastFieldReader` for signed 64-bits integers. pub struct I64FastFieldReader { underlying: U64FastFieldReader, } @@ -189,7 +189,7 @@ impl FastFieldReader for I64FastFieldReader { -/// The FastFieldsReader` is the datastructure containing +/// The `FastFieldsReader` is the datastructure containing /// all of the fast fields' data. /// /// It contains a mapping that associated these fields to diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 1c8afa8f7..5c0fa7ddf 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -96,7 +96,7 @@ impl !Sync for IndexWriter {} /// `IndexWriter` on the system is accessing the index directory, /// it is safe to manually delete the lockfile. /// -/// num_threads specifies the number of indexing workers that +/// `num_threads` specifies the number of indexing workers that /// should work at the same time. /// # Errors /// If the lockfile already exists, returns `Error::FileAlreadyExists`. diff --git a/src/indexer/log_merge_policy.rs b/src/indexer/log_merge_policy.rs index 0fea3d3ee..47f496998 100644 --- a/src/indexer/log_merge_policy.rs +++ b/src/indexer/log_merge_policy.rs @@ -9,7 +9,7 @@ const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000; const DEFAULT_MIN_MERGE_SIZE: usize = 8; -/// LogMergePolicy tries tries to merge segments that have a similar number of +/// `LogMergePolicy` tries to merge segments that have a similar number of /// documents. #[derive(Debug, Clone)] pub struct LogMergePolicy { diff --git a/src/indexer/merge_policy.rs b/src/indexer/merge_policy.rs index 69a958b39..9f8b59748 100644 --- a/src/indexer/merge_policy.rs +++ b/src/indexer/merge_policy.rs @@ -9,7 +9,7 @@ use std::fmt::Debug; pub struct MergeCandidate(pub Vec); -/// The Merge policy defines which segments should be merged. +/// The `MergePolicy` defines which segments should be merged. 
/// /// Every time a the list of segments changes, the segment updater /// asks the merge policy if some segments should be merged. @@ -52,7 +52,7 @@ pub mod tests { use core::SegmentMeta; - /// Merge policy useful for test purposes. + /// `MergePolicy` useful for test purposes. /// /// Everytime there is more than one segment, /// it will suggest to merge them. diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index 4dcd9fe12..f76477567 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -22,5 +22,5 @@ pub use self::log_merge_policy::LogMergePolicy; pub use self::merge_policy::{NoMergePolicy, MergeCandidate, MergePolicy}; pub use self::segment_manager::SegmentManager; -/// Alias for the default merge policy, which is the LogMergePolicy. +/// Alias for the default merge policy, which is the `LogMergePolicy`. pub type DefaultMergePolicy = LogMergePolicy; diff --git a/src/indexer/segment_entry.rs b/src/indexer/segment_entry.rs index 728cf2e55..082f9e1c1 100644 --- a/src/indexer/segment_entry.rs +++ b/src/indexer/segment_entry.rs @@ -24,16 +24,16 @@ impl SegmentState { /// A segment entry describes the state of /// a given segment, at a given instant. /// -/// In addition to segment meta, +/// In addition to segment `meta`, /// it contains a few transient states -/// - state expresses whether the segment is already in the +/// - `state` expresses whether the segment is already in the /// middle of a merge -/// - delete_bitset is a bitset describing +/// - `delete_bitset` is a bitset describing /// documents that were deleted during the commit /// itself. -/// - Delete cursor, is the position in the delete queue. +/// - `delete_cursor` is the position in the delete queue. /// Deletes happening before the cursor are reflected either -/// in the .del file or in the delete_bitset. +/// in the .del file or in the `delete_bitset`. 
#[derive(Clone)] pub struct SegmentEntry { meta: SegmentMeta, diff --git a/src/query/occur_filter.rs b/src/query/occur_filter.rs index 42246770d..1b21dea6e 100644 --- a/src/query/occur_filter.rs +++ b/src/query/occur_filter.rs @@ -1,7 +1,7 @@ use query::Occur; -/// An OccurFilter represents a filter over a bitset of +/// An `OccurFilter` represents a filter over a bitset of // at most 64 elements. /// /// It wraps some simple bitmask to compute the filter diff --git a/src/query/scorer.rs b/src/query/scorer.rs index 4ac05cd83..20b6e7e72 100644 --- a/src/query/scorer.rs +++ b/src/query/scorer.rs @@ -36,7 +36,7 @@ impl<'a> Scorer for Box { } } -/// EmptyScorer is a dummy Scorer in which no document matches. +/// `EmptyScorer` is a dummy `Scorer` in which no document matches. /// /// It is useful for tests and handling edge cases. pub struct EmptyScorer; diff --git a/src/query/term_query/term_query.rs b/src/query/term_query/term_query.rs index 330138edc..5c468c442 100644 --- a/src/query/term_query/term_query.rs +++ b/src/query/term_query/term_query.rs @@ -13,9 +13,9 @@ use std::any::Any; /// The score associated is defined as /// `idf` * sqrt(`term_freq` / `field norm`) /// in which : -/// * idf - inverse document frequency. -/// * term_freq - number of occurrences of the term in the field -/// * field norm - number of tokens in the field. +/// * `idf` - inverse document frequency. +/// * `term_freq` - number of occurrences of the term in the field +/// * `field norm` - number of tokens in the field. 
#[derive(Debug)] pub struct TermQuery { term: Term, From 3e2ad7542d040845b4116c2ce0979ed22127d9c8 Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Tue, 16 May 2017 19:30:57 +0300 Subject: [PATCH 29/51] clippy: fix needless_return warnings --- src/indexer/delete_queue.rs | 2 +- src/postings/segment_postings.rs | 5 +++-- src/query/query_parser/query_parser.rs | 10 +++++----- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index ae20d9866..5dd8aad81 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -163,7 +163,7 @@ impl NextBlock { } } *next_write_lock.deref_mut() = InnerNextBlock::Closed(next_block.clone()); - return Some(next_block); + Some(next_block) } } } diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 095cfc8d3..debc423c1 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -192,9 +192,10 @@ impl<'a> DocSet for SegmentPostings<'a> { } if self.advance() { - return SkipResult::OverStep; + SkipResult::OverStep + } else { + SkipResult::End } - return SkipResult::End; } #[inline] diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index f9d204d1d..a38962dfe 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -153,12 +153,12 @@ impl QueryParser { &FieldType::I64(_) => { let val: i64 = i64::from_str(phrase)?; let term = Term::from_field_i64(field, val); - return Ok(Some(LogicalLiteral::Term(term))); + Ok(Some(LogicalLiteral::Term(term))) } &FieldType::U64(_) => { let val: u64 = u64::from_str(phrase)?; let term = Term::from_field_u64(field, val); - return Ok(Some(LogicalLiteral::Term(term))); + Ok(Some(LogicalLiteral::Term(term))) } &FieldType::Str(ref str_options) => { let mut terms: Vec = Vec::new(); @@ -176,11 +176,11 @@ impl QueryParser { terms.push(Term::from_field_text(field, phrase)); } if 
terms.is_empty() { - return Ok(None); + Ok(None) } else if terms.len() == 1 { - return Ok(Some(LogicalLiteral::Term(terms.into_iter().next().unwrap()))); + Ok(Some(LogicalLiteral::Term(terms.into_iter().next().unwrap()))) } else { - return Ok(Some(LogicalLiteral::Phrase(terms))); + Ok(Some(LogicalLiteral::Phrase(terms))) } } } From feec2e262023298715109c54448965de39be6f61 Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Tue, 16 May 2017 19:33:14 +0300 Subject: [PATCH 30/51] clippy: fix needless_bool warnings --- src/fastfield/reader.rs | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index ad281e377..b4b761e3a 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -175,13 +175,7 @@ impl FastFieldReader for I64FastFieldReader { fn is_enabled(field_type: &FieldType) -> bool { match field_type { - &FieldType::I64(ref integer_options) => { - if integer_options.is_fast() { - true - } else { - false - } - } + &FieldType::I64(ref integer_options) => integer_options.is_fast(), _ => false, } } From 1af1f7e0d14aa712e5813a6882dffa159a3c3357 Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Tue, 16 May 2017 19:36:19 +0300 Subject: [PATCH 31/51] clippy: fix if_let_redundant_pattern_matching warnings --- src/directory/managed_directory.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index bc3e42185..7b87bddb7 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -175,7 +175,7 @@ impl ManagedDirectory { managed_paths_write.remove(delete_file); } } - if let Err(_) = save_managed_paths(self.directory.as_mut(), &meta_informations_wlock) { + if save_managed_paths(self.directory.as_mut(), &meta_informations_wlock).is_err() { error!("Failed to save the list of managed files."); } } From 6ae34d2a77e2891b67dc4a7490d194a0524023ff Mon Sep 17 00:00:00 2001 From: 
Laurentiu Nicola Date: Tue, 16 May 2017 19:40:54 +0300 Subject: [PATCH 32/51] clippy: fix toplevel_ref_arg warnings --- src/indexer/segment_updater.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 6c4548618..1718840b4 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -101,7 +101,7 @@ fn perform_merge(segment_ids: &[SegmentId], // first we need to apply deletes to our segment. info!("Start merge: {:?}", segment_ids); - let ref index = segment_updater.0.index; + let index = &segment_updater.0.index; let schema = index.schema(); let mut segment_entries = vec![]; From 92f383fa518bc74f6f603b965dee44642b0c7ded Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Tue, 16 May 2017 19:42:44 +0300 Subject: [PATCH 33/51] clippy: fix let_unit_value warnings --- src/indexer/segment_updater.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 1718840b4..56bb25b84 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -56,7 +56,7 @@ pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) - /// Save the index meta file. 
-/// This operation is atomic : +/// This operation is atomic: /// Either // - it fails, in which case an error is returned, /// and the `meta.json` remains untouched, @@ -76,10 +76,9 @@ pub fn save_metas(segment_metas: Vec, }; let mut w = try!(serde_json::to_vec_pretty(&metas)); try!(write!(&mut w, "\n")); - let res = directory.atomic_write(&META_FILEPATH, &w[..])?; + directory.atomic_write(&META_FILEPATH, &w[..])?; debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas)); - Ok(res) - + Ok(()) } From 1cd0b378fb041b676e1c747ff6a648cad69f3847 Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Tue, 16 May 2017 19:44:41 +0300 Subject: [PATCH 34/51] clippy: fix map_clone warnings --- src/indexer/segment_register.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index 9b8fc8427..2ffc3a5b0 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -70,9 +70,7 @@ impl SegmentRegister { } pub fn segment_entry(&self, segment_id: &SegmentId) -> Option { - self.segment_states - .get(&segment_id) - .map(|segment_entry| segment_entry.clone()) + self.segment_states.get(&segment_id).cloned() } pub fn contains_all(&mut self, segment_ids: &[SegmentId]) -> bool { From 3965b26cd290d97d0a7716636067ca3621036a48 Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Tue, 16 May 2017 19:50:41 +0300 Subject: [PATCH 35/51] clippy: fix useless_let_if_seq warnings --- src/common/bitpacker.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/common/bitpacker.rs b/src/common/bitpacker.rs index e16df3967..14a96312f 100644 --- a/src/common/bitpacker.rs +++ b/src/common/bitpacker.rs @@ -117,9 +117,8 @@ impl BitUnpacker { let addr = (idx * self.num_bits) / 8; let bit_shift = idx * self.num_bits - addr * 8; let val_unshifted_unmasked: u64; - if addr + 8 <= self.data_len { - val_unshifted_unmasked = - unsafe { *(self.data_ptr.offset(addr as isize) 
as *const u64) }; + val_unshifted_unmasked = if addr + 8 <= self.data_len { + unsafe { *(self.data_ptr.offset(addr as isize) as *const u64) } } else { let mut arr = [0u8; 8]; if addr < self.data_len { @@ -127,8 +126,8 @@ impl BitUnpacker { arr[i] = unsafe { *self.data_ptr.offset((addr + i) as isize) }; } } - val_unshifted_unmasked = unsafe { mem::transmute::<[u8; 8], u64>(arr) }; - } + unsafe { mem::transmute::<[u8; 8], u64>(arr) } + }; let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64; (val_shifted & self.mask) } From 103ba6ba351b93764c62e79385d8b7b84553401d Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Tue, 16 May 2017 19:55:44 +0300 Subject: [PATCH 36/51] clippy: fix match_ref_pats warnings --- src/fastfield/reader.rs | 8 ++++---- src/fastfield/writer.rs | 6 +++--- src/indexer/index_writer.rs | 2 +- src/query/query_parser/query_parser.rs | 8 ++++---- src/schema/field_type.rs | 8 ++++---- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index b4b761e3a..982c9daef 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -73,8 +73,8 @@ impl FastFieldReader for U64FastFieldReader { } fn is_enabled(field_type: &FieldType) -> bool { - match field_type { - &FieldType::U64(ref integer_options) => integer_options.is_fast(), + match *field_type { + FieldType::U64(ref integer_options) => integer_options.is_fast(), _ => false, } } @@ -174,8 +174,8 @@ impl FastFieldReader for I64FastFieldReader { } fn is_enabled(field_type: &FieldType) -> bool { - match field_type { - &FieldType::I64(ref integer_options) => integer_options.is_fast(), + match *field_type { + FieldType::I64(ref integer_options) => integer_options.is_fast(), _ => false, } } diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index a827c7f36..4b663a93b 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -20,8 +20,8 @@ impl FastFieldsWriter { .enumerate() .flat_map(|(field_id, 
field_entry)| { let field = Field(field_id as u32); - match field_entry.field_type() { - &FieldType::I64(ref int_options) => { + match *field_entry.field_type() { + FieldType::I64(ref int_options) => { if int_options.is_fast() { let mut fast_field_writer = IntFastFieldWriter::new(field); fast_field_writer.set_val_if_missing(common::i64_to_u64(0i64)); @@ -30,7 +30,7 @@ impl FastFieldsWriter { None } } - &FieldType::U64(ref int_options) => { + FieldType::U64(ref int_options) => { if int_options.is_fast() { Some(IntFastFieldWriter::new(field)) } else { diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 5c0fa7ddf..07f8494ab 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -325,7 +325,7 @@ impl IndexWriter { .wait_merging_thread() .map_err(|_| Error::ErrorInThread("Failed to join merging thread.".to_string())); - if let &Err(ref e) = &result { + if let Err(ref e) = result { error!("Some merging thread failed {:?}", e); } diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index a38962dfe..a16d5892e 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -149,18 +149,18 @@ impl QueryParser { let field_name = field_entry.name().to_string(); return Err(QueryParserError::FieldNotIndexed(field_name)); } - match field_type { - &FieldType::I64(_) => { + match *field_type { + FieldType::I64(_) => { let val: i64 = i64::from_str(phrase)?; let term = Term::from_field_i64(field, val); Ok(Some(LogicalLiteral::Term(term))) } - &FieldType::U64(_) => { + FieldType::U64(_) => { let val: u64 = u64::from_str(phrase)?; let term = Term::from_field_u64(field, val); Ok(Some(LogicalLiteral::Term(term))) } - &FieldType::Str(ref str_options) => { + FieldType::Str(ref str_options) => { let mut terms: Vec = Vec::new(); if str_options.get_indexing_options().is_tokenized() { let mut token_iter = self.analyzer.tokenize(phrase); diff --git 
a/src/schema/field_type.rs b/src/schema/field_type.rs index 8f8e4046e..1b20b304d 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -32,10 +32,10 @@ pub enum FieldType { impl FieldType { /// returns true iff the field is indexed. pub fn is_indexed(&self) -> bool { - match self { - &FieldType::Str(ref text_options) => text_options.get_indexing_options().is_indexed(), - &FieldType::U64(ref int_options) => int_options.is_indexed(), - &FieldType::I64(ref int_options) => int_options.is_indexed(), + match *self { + FieldType::Str(ref text_options) => text_options.get_indexing_options().is_indexed(), + FieldType::U64(ref int_options) => int_options.is_indexed(), + FieldType::I64(ref int_options) => int_options.is_indexed(), } } From 8e407bb314b440b4c4d474886eaa1488e0037ba0 Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Tue, 16 May 2017 20:01:35 +0300 Subject: [PATCH 37/51] clippy: fix needless_borrow warnings --- src/core/segment_reader.rs | 2 +- src/fastfield/reader.rs | 2 +- src/indexer/index_writer.rs | 6 +++--- src/indexer/merger.rs | 4 ++-- src/indexer/segment_manager.rs | 8 ++++---- src/indexer/segment_register.rs | 2 +- src/indexer/segment_updater.rs | 2 +- src/indexer/segment_writer.rs | 2 +- src/query/query.rs | 2 +- src/query/query_parser/query_parser.rs | 2 +- src/schema/schema.rs | 4 ++-- 11 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index ab947a93c..33b8f5a06 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -200,7 +200,7 @@ impl SegmentReader { -> Option { let field = term.field(); let field_entry = self.schema.get_field_entry(field); - let term_info = get!(self.get_term_info(&term)); + let term_info = get!(self.get_term_info(term)); let offset = term_info.postings_offset as usize; let postings_data = &self.postings_data[offset..]; let freq_handler = match *field_entry.field_type() { diff --git a/src/fastfield/reader.rs 
b/src/fastfield/reader.rs index 982c9daef..7f4684fda 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -127,7 +127,7 @@ impl From> for U64FastFieldReader { fast_field_writers.serialize(&mut serializer).unwrap(); serializer.close().unwrap(); } - let source = directory.open_read(&path).unwrap(); + let source = directory.open_read(path).unwrap(); let fast_field_readers = FastFieldsReader::open(source).unwrap(); fast_field_readers.open_reader(field).unwrap() } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 07f8494ab..9ee0fab7a 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -215,7 +215,7 @@ pub fn advance_deletes(mut segment: Segment, let max_doc = segment_reader.max_doc(); let mut delete_bitset: BitSet = match segment_entry.delete_bitset() { - Some(ref previous_delete_bitset) => (*previous_delete_bitset).clone(), + Some(previous_delete_bitset) => (*previous_delete_bitset).clone(), None => BitSet::with_capacity(max_doc as usize), }; @@ -256,9 +256,9 @@ fn index_documents(heap: &mut Heap, -> Result { heap.clear(); let segment_id = segment.id(); - let mut segment_writer = SegmentWriter::for_segment(heap, segment.clone(), &schema)?; + let mut segment_writer = SegmentWriter::for_segment(heap, segment.clone(), schema)?; for doc in document_iterator { - try!(segment_writer.add_document(&doc, &schema)); + try!(segment_writer.add_document(&doc, schema)); if segment_writer.is_buffer_full() { info!("Buffer limit reached, flushing segment with maxdoc={}.", segment_writer.max_doc()); diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 154cc2780..1dda7d580 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -229,7 +229,7 @@ impl IndexMerger { .cloned() .flat_map(|segment_ord| { self.readers[segment_ord] - .read_postings_all_info(&term) + .read_postings_all_info(term) .map(|segment_postings| (segment_ord, segment_postings)) }) .collect::>(); @@ -249,7 +249,7 @@ impl IndexMerger 
{ if !term_written { // we make sure to only write the term iff // there is at least one document. - postings_serializer.new_term(&term)?; + postings_serializer.new_term(term)?; term_written = true; } let delta_positions: &[u32] = diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index 2fea73b25..35c264cdc 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -152,11 +152,11 @@ impl SegmentManager { target_segment_register = { if registers_lock .uncommitted - .contains_all(&before_merge_segment_ids) { + .contains_all(before_merge_segment_ids) { &mut registers_lock.uncommitted } else if registers_lock .committed - .contains_all(&before_merge_segment_ids) { + .contains_all(before_merge_segment_ids) { &mut registers_lock.committed } else { warn!("couldn't find segment in SegmentManager"); @@ -197,11 +197,11 @@ impl SegmentManager { let mut target_register: &mut SegmentRegister = { if registers_lock .uncommitted - .contains_all(&before_merge_segment_ids) { + .contains_all(before_merge_segment_ids) { &mut registers_lock.uncommitted } else if registers_lock .committed - .contains_all(&before_merge_segment_ids) { + .contains_all(before_merge_segment_ids) { &mut registers_lock.committed } else { warn!("couldn't find segment in SegmentManager"); diff --git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index 2ffc3a5b0..af7e778d1 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -70,7 +70,7 @@ impl SegmentRegister { } pub fn segment_entry(&self, segment_id: &SegmentId) -> Option { - self.segment_states.get(&segment_id).cloned() + self.segment_states.get(segment_id).cloned() } pub fn contains_all(&mut self, segment_ids: &[SegmentId]) -> bool { diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 56bb25b84..ec8c0a7a0 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -374,7 +374,7 @@ impl SegmentUpdater 
{ after_merge_segment_entry: SegmentId) { self.0 .segment_manager - .cancel_merge(&before_merge_segment_ids, after_merge_segment_entry); + .cancel_merge(before_merge_segment_ids, after_merge_segment_entry); } diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 871cd64ac..8db52c960 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -178,7 +178,7 @@ impl<'a> SegmentWriter<'a> { } } self.fieldnorms_writer.fill_val_up_to(doc_id); - self.fast_field_writers.add_document(&doc); + self.fast_field_writers.add_document(doc); let stored_fieldvalues: Vec<&FieldValue> = doc.field_values() .iter() .filter(|field_value| schema.get_field_entry(field_value.field()).is_stored()) diff --git a/src/query/query.rs b/src/query/query.rs index c73ea01d2..d9f7cccc5 100644 --- a/src/query/query.rs +++ b/src/query/query.rs @@ -68,7 +68,7 @@ pub trait Query: fmt::Debug { let mut segment_search_timer = search_timer.open("segment_search"); { let _ = segment_search_timer.open("set_segment"); - try!(collector.set_segment(segment_ord as SegmentLocalId, &segment_reader)); + try!(collector.set_segment(segment_ord as SegmentLocalId, segment_reader)); } let mut scorer = try!(weight.scorer(segment_reader)); { diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index a16d5892e..e5be6274f 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -222,7 +222,7 @@ impl QueryParser { UserInputAST::Leaf(literal) => { let term_phrases: Vec<(Field, String)> = match literal.field_name { Some(ref field_name) => { - let field = try!(self.resolve_field_name(&field_name)); + let field = try!(self.resolve_field_name(field_name)); vec![(field, literal.phrase.clone())] } None => { diff --git a/src/schema/schema.rs b/src/schema/schema.rs index 871c11985..fcf4c655a 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -226,7 +226,7 @@ impl Schema { JsonValue::Array(ref 
json_items) => { for json_item in json_items { let value = try!(field_type - .value_from_json(&json_item) + .value_from_json(json_item) .map_err(|e| { DocParsingError::ValueError(field_name.clone(), e) })); @@ -235,7 +235,7 @@ impl Schema { } _ => { let value = try!(field_type - .value_from_json(&json_value) + .value_from_json(json_value) .map_err(|e| { DocParsingError::ValueError(field_name.clone(), e) })); From 5c831530356f45e241ffbe2a5c1a7260447d50a8 Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Tue, 16 May 2017 20:04:54 +0300 Subject: [PATCH 38/51] clippy: fix or_fun_call warnings --- src/collector/top_collector.rs | 2 +- src/directory/mmap_directory.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/collector/top_collector.rs b/src/collector/top_collector.rs index a02141fca..7d3c33c9e 100644 --- a/src/collector/top_collector.rs +++ b/src/collector/top_collector.rs @@ -27,7 +27,7 @@ impl Ord for GlobalScoredDoc { other .score .partial_cmp(&self.score) - .unwrap_or(other.doc_address.cmp(&self.doc_address)) + .unwrap_or_else(|| other.doc_address.cmp(&self.doc_address)) } } diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs index 16b180845..311169837 100644 --- a/src/directory/mmap_directory.rs +++ b/src/directory/mmap_directory.rs @@ -281,7 +281,7 @@ impl Directory for MmapDirectory { .get_mmap(full_path)? 
.map(MmapReadOnly::from) .map(ReadOnlySource::Mmap) - .unwrap_or(ReadOnlySource::Anonymous(SharedVecSlice::empty()))) + .unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty()))) } fn open_write(&mut self, path: &Path) -> Result { From 36f51e289ec491362cf42a0aa736be4c5400e4ba Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Tue, 16 May 2017 20:12:54 +0300 Subject: [PATCH 39/51] clippy: fix match_same_arms warnings --- src/indexer/doc_opstamp_mapping.rs | 3 +-- src/indexer/segment_writer.rs | 2 +- src/postings/serializer.rs | 8 +------- src/schema/field_entry.rs | 6 +++--- src/schema/field_type.rs | 2 +- 5 files changed, 7 insertions(+), 14 deletions(-) diff --git a/src/indexer/doc_opstamp_mapping.rs b/src/indexer/doc_opstamp_mapping.rs index 4928da45e..03556ef17 100644 --- a/src/indexer/doc_opstamp_mapping.rs +++ b/src/indexer/doc_opstamp_mapping.rs @@ -40,8 +40,7 @@ impl DocToOpstampMapping { match *self { DocToOpstampMapping::WithMap(ref doc_opstamps) => { match doc_opstamps.binary_search(&target_opstamp) { - Ok(doc_id) => doc_id as DocId, - Err(doc_id) => doc_id as DocId, + Ok(doc_id) | Err(doc_id) => doc_id as DocId, } } DocToOpstampMapping::None => DocId::max_value(), diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 8db52c960..2c626bdfe 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -63,7 +63,7 @@ fn posting_from_field_entry<'a>(field_entry: &FieldEntry, _ => SpecializedPostingsWriter::::new_boxed(heap), } } - FieldType::U64(_) => SpecializedPostingsWriter::::new_boxed(heap), + FieldType::U64(_) | FieldType::I64(_) => SpecializedPostingsWriter::::new_boxed(heap), } } diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index ab941d983..4baeb869f 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -109,13 +109,7 @@ impl PostingsSerializer { let field_entry: &FieldEntry = self.schema.get_field_entry(field); self.text_indexing_options = 
match *field_entry.field_type() { FieldType::Str(ref text_options) => text_options.get_indexing_options(), - FieldType::U64(ref int_options) => { - if int_options.is_indexed() { - TextIndexingOptions::Unindexed - } else { - TextIndexingOptions::Untokenized - } - } + FieldType::U64(ref int_options) | FieldType::I64(ref int_options) => { if int_options.is_indexed() { TextIndexingOptions::Unindexed diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index 4f658ac33..883dc49ff 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -63,7 +63,7 @@ impl FieldEntry { pub fn is_indexed(&self) -> bool { match self.field_type { FieldType::Str(ref options) => options.get_indexing_options().is_indexed(), - FieldType::U64(ref options) => options.is_indexed(), + FieldType::U64(ref options) | FieldType::I64(ref options) => options.is_indexed(), } } @@ -71,7 +71,7 @@ impl FieldEntry { /// Returns true iff the field is a int (signed or unsigned) fast field pub fn is_int_fast(&self) -> bool { match self.field_type { - FieldType::U64(ref options) => options.is_fast(), + FieldType::U64(ref options) | FieldType::I64(ref options) => options.is_fast(), _ => false, } @@ -80,7 +80,7 @@ impl FieldEntry { /// Returns true iff the field is stored pub fn is_stored(&self) -> bool { match self.field_type { - FieldType::U64(ref options) => options.is_stored(), + FieldType::U64(ref options) | FieldType::I64(ref options) => options.is_stored(), FieldType::Str(ref options) => options.is_stored(), } diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 1b20b304d..67a0d42ff 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -34,7 +34,7 @@ impl FieldType { pub fn is_indexed(&self) -> bool { match *self { FieldType::Str(ref text_options) => text_options.get_indexing_options().is_indexed(), - FieldType::U64(ref int_options) => int_options.is_indexed(), + FieldType::U64(ref int_options) | FieldType::I64(ref int_options) => 
int_options.is_indexed(), } } From 39958ec476cf9d4561061e1bce4619ce1d0ba1cc Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Tue, 16 May 2017 20:14:33 +0300 Subject: [PATCH 40/51] clippy: fix single_match warnings --- src/indexer/delete_queue.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index 5dd8aad81..2c9b92017 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -135,11 +135,8 @@ impl NextBlock { let next_read_lock = self.0 .read() .expect("Failed to acquire write lock in delete queue"); - match *next_read_lock { - InnerNextBlock::Closed(ref block) => { - return Some(block.clone()); - } - _ => {} + if let InnerNextBlock::Closed(ref block) = *next_read_lock { + return Some(block.clone()); } } let next_block; From 6fea510869c0211d209a9c2c0021f2b0d04ae331 Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Tue, 16 May 2017 20:18:18 +0300 Subject: [PATCH 41/51] clippy: fix redundant_closure warnings --- src/directory/mmap_directory.rs | 6 +++--- src/query/query_parser/query_grammar.rs | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs index 311169837..f01c813bd 100644 --- a/src/directory/mmap_directory.rs +++ b/src/directory/mmap_directory.rs @@ -30,7 +30,7 @@ fn open_mmap(full_path: &PathBuf) -> result::Result>, OpenReadE OpenReadError::IOError(err) }; let file = File::open(&full_path).map_err(convert_file_error)?; - let meta_data = file.metadata().map_err(|e| OpenReadError::IOError(e))?; + let meta_data = file.metadata().map_err(OpenReadError::IOError)?; if meta_data.len() == 0 { // if the file size is 0, it will not be possible // to mmap the file, so we return an anonymous mmap_cache @@ -327,7 +327,7 @@ impl Directory for MmapDirectory { // when the last reference is gone. 
mmap_cache.cache.remove(&full_path); match fs::remove_file(&full_path) { - Ok(_) => self.sync_directory().map_err(|e| DeleteError::IOError(e)), + Ok(_) => self.sync_directory().map_err(DeleteError::IOError), Err(e) => { if e.kind() == io::ErrorKind::NotFound { Err(DeleteError::FileDoesNotExist(path.to_owned())) @@ -349,7 +349,7 @@ impl Directory for MmapDirectory { match File::open(&full_path) { Ok(mut file) => { file.read_to_end(&mut buffer) - .map_err(|e| OpenReadError::IOError(e))?; + .map_err(OpenReadError::IOError)?; Ok(buffer) } Err(e) => { diff --git a/src/query/query_parser/query_grammar.rs b/src/query/query_parser/query_grammar.rs index ec45a56ee..08f167b25 100644 --- a/src/query/query_parser/query_grammar.rs +++ b/src/query/query_parser/query_grammar.rs @@ -34,7 +34,7 @@ fn literal(input: I) -> ParseResult }); try(term_query) .or(term_default_field) - .map(|query_literal| UserInputAST::from(query_literal)) + .map(UserInputAST::from) .parse_stream(input) } From 4e48bbf0ea249b81a39bef389b4dbe0d89ba9a3e Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Tue, 16 May 2017 20:19:36 +0300 Subject: [PATCH 42/51] clippy: fix needless_lifetimes warnings --- src/core/searcher.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 2faa919f3..a869fa69c 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -57,7 +57,7 @@ impl Searcher { /// /// # Warning /// This API is very likely to change in the future. 
- pub fn terms<'a>(&'a self) -> TermIterator<'a> { + pub fn terms(&self) -> TermIterator { TermIterator::from(self.segment_readers()) } From f5fb29422aa3441517971625e87d8b4a70598707 Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Tue, 16 May 2017 20:27:58 +0300 Subject: [PATCH 43/51] clippy: fix while_let_loop warnings --- src/indexer/delete_queue.rs | 1 + src/indexer/index_writer.rs | 1 + src/query/query_parser/query_parser.rs | 10 +++------- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index 2c9b92017..fc22dbc84 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -186,6 +186,7 @@ impl DeleteCursor { /// `opstamp >= target_opstamp`. pub fn skip_to(&mut self, target_opstamp: u64) { // TODO Can be optimize as we work with block. + #[cfg_attr(feature = "cargo-clippy", allow(while_let_loop))] loop { if let Some(operation) = self.get() { if operation.opstamp >= target_opstamp { diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 9ee0fab7a..3db8aa339 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -165,6 +165,7 @@ pub fn compute_deleted_bitset(delete_bitset: &mut BitSet, let mut might_have_changed = false; + #[cfg_attr(feature = "cargo-clippy", allow(while_let_loop))] loop { if let Some(delete_op) = delete_cursor.get() { if delete_op.opstamp > target_opstamp { diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index e5be6274f..5bb25ffcb 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -164,13 +164,9 @@ impl QueryParser { let mut terms: Vec = Vec::new(); if str_options.get_indexing_options().is_tokenized() { let mut token_iter = self.analyzer.tokenize(phrase); - loop { - if let Some(token) = token_iter.next() { - let term = Term::from_field_text(field, token); - terms.push(term); - } else { - break; - } + while let 
Some(token) = token_iter.next() { + let term = Term::from_field_text(field, token); + terms.push(term); } } else { terms.push(Term::from_field_text(field, phrase)); From a67caee141b8753eb03cae8593d8292be7f3648c Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Tue, 16 May 2017 20:29:59 +0300 Subject: [PATCH 44/51] clippy: fix len_zero warnings --- src/indexer/merger.rs | 2 +- src/query/query_parser/query_parser.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 1dda7d580..248a59baa 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -236,7 +236,7 @@ impl IndexMerger { // We can remove the term if all documents which // contained it have been deleted. - if segment_postings.len() > 0 { + if !segment_postings.is_empty() { // We can now serialize this postings, by pushing each document to the // postings serializer. diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 5bb25ffcb..0b6b43efe 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -222,7 +222,7 @@ impl QueryParser { vec![(field, literal.phrase.clone())] } None => { - if self.default_fields.len() == 0 { + if self.default_fields.is_empty() { return Err(QueryParserError::NoDefaultFieldDeclared); } else { self.default_fields @@ -238,7 +238,7 @@ impl QueryParser { asts.push(LogicalAST::Leaf(box ast)); } } - let result_ast = if asts.len() == 0 { + let result_ast = if asts.is_empty() { // this should never happen return Err(QueryParserError::SyntaxError); } else if asts.len() == 1 { From 0404df3fd5791bbeeccad5c8084efe42bdcf8f60 Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Tue, 16 May 2017 20:56:43 +0300 Subject: [PATCH 45/51] Fix typo in docstring --- src/common/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/mod.rs b/src/common/mod.rs index 4629ed95f..9c9fa41ce 100644 --- 
a/src/common/mod.rs +++ b/src/common/mod.rs @@ -33,7 +33,7 @@ const HIGHEST_BIT: u64 = 1 << 63; /// Maps `i64` to `u64` so that /// `-2^63 .. 2^63-1` is mapped /// to -/// `0 .. 2^64` +/// `0 .. 2^64-1` /// in that order. /// /// This is more suited than simply casting (`val as u64`) From 0d5ea981324bfb6e08f431a507f1dbce37b90433 Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Tue, 16 May 2017 20:57:31 +0300 Subject: [PATCH 46/51] clippy: fix inline_always warnings --- src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lib.rs b/src/lib.rs index db9f277dc..d37ff3b44 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,7 @@ #![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")] #![allow(unknown_lints)] // for the clippy lint options #![allow(module_inception)] +#![cfg_attr(feature = "cargo-clippy", allow(inline_always))] #![feature(box_syntax)] #![feature(optin_builtin_traits)] From c0538dbe9af23787d68a26343d7f376801200503 Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Tue, 16 May 2017 20:58:40 +0300 Subject: [PATCH 47/51] clippy: fix mut_from_ref warnings --- src/datastruct/stacker/heap.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/datastruct/stacker/heap.rs b/src/datastruct/stacker/heap.rs index f7ea070f7..4339b98d0 100644 --- a/src/datastruct/stacker/heap.rs +++ b/src/datastruct/stacker/heap.rs @@ -19,6 +19,7 @@ pub struct Heap { inner: UnsafeCell, } +#[cfg_attr(feature = "cargo-clippy", allow(mut_from_ref))] impl Heap { /// Creates a new heap with a given capacity pub fn with_capacity(num_bytes: usize) -> Heap { From 1352b95b074014a86c9861ea9c9799957c435ccd Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Tue, 16 May 2017 20:58:21 +0300 Subject: [PATCH 48/51] clippy: fix never_loop warnings --- src/postings/recorder.rs | 12 +++++------- src/query/phrase_query/phrase_scorer.rs | 1 + 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/postings/recorder.rs b/src/postings/recorder.rs index 
05586858a..d0948fd33 100644 --- a/src/postings/recorder.rs +++ b/src/postings/recorder.rs @@ -120,14 +120,12 @@ impl Recorder for TermFrequencyRecorder { heap: &Heap) -> io::Result<()> { let mut doc_iter = self.stack.iter(self_addr, heap); - loop { - if let Some(doc) = doc_iter.next() { - if let Some(term_freq) = doc_iter.next() { - try!(serializer.write_doc(doc, term_freq, &EMPTY_ARRAY)); - continue; - } + while let Some(doc) = doc_iter.next() { + if let Some(term_freq) = doc_iter.next() { + serializer.write_doc(doc, term_freq, &EMPTY_ARRAY)?; + } else { + break; } - break; } Ok(()) } diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index 6a41330e9..f80416712 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -25,6 +25,7 @@ impl<'a> PhraseScorer<'a> { positions_arr[0] = &(positions_arr[0])[1..]; let mut count_matching = 1; + #[cfg_attr(feature = "cargo-clippy", allow(never_loop))] 'outer: loop { let target = pos_candidate + ord; let positions = positions_arr[ord as usize]; From 113917c5217b5e0593bd8c1fc6dab38726bb0849 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 17 May 2017 10:01:25 +0900 Subject: [PATCH 49/51] Making clippy happy. + Simplifying bitpacking by adding a 7 byte padding. + Bugfix in a unit test. --- src/common/bitpacker.rs | 20 +++++++------------- src/fastfield/mod.rs | 10 +++++----- src/indexer/index_writer.rs | 6 +++--- src/query/boolean_query/mod.rs | 10 +++++----- src/query/phrase_query/phrase_scorer.rs | 3 +-- src/store/reader.rs | 2 +- 6 files changed, 22 insertions(+), 29 deletions(-) diff --git a/src/common/bitpacker.rs b/src/common/bitpacker.rs index 14a96312f..1a8b8208d 100644 --- a/src/common/bitpacker.rs +++ b/src/common/bitpacker.rs @@ -82,6 +82,9 @@ impl BitPacker { pub fn close(&mut self, output: &mut TWrite) -> io::Result { self.flush(output)?; + // Padding the write file to simplify reads. 
+ output.write_all(&[0u8; 7])?; + self.written_size += 7; Ok(self.written_size) } } @@ -117,17 +120,8 @@ impl BitUnpacker { let addr = (idx * self.num_bits) / 8; let bit_shift = idx * self.num_bits - addr * 8; let val_unshifted_unmasked: u64; - val_unshifted_unmasked = if addr + 8 <= self.data_len { - unsafe { *(self.data_ptr.offset(addr as isize) as *const u64) } - } else { - let mut arr = [0u8; 8]; - if addr < self.data_len { - for i in 0..self.data_len - addr { - arr[i] = unsafe { *self.data_ptr.offset((addr + i) as isize) }; - } - } - unsafe { mem::transmute::<[u8; 8], u64>(arr) } - }; + debug_assert!(addr + 8 <= self.data_len, "The fast field field should have been padded with 7 bytes."); + val_unshifted_unmasked = unsafe { *(self.data_ptr.offset(addr as isize) as *const u64) }; let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64; (val_shifted & self.mask) } @@ -135,7 +129,7 @@ impl BitUnpacker { - + #[cfg(test)] mod test { use super::{BitPacker, BitUnpacker, compute_num_bits}; @@ -163,7 +157,7 @@ mod test { bitpacker.write(val, &mut data).unwrap(); } let num_bytes = bitpacker.close(&mut data).unwrap(); - assert_eq!(num_bytes, (num_bits * len + 7) / 8); + assert_eq!(num_bytes, (num_bits * len + 7) / 8 + 7); assert_eq!(data.len(), num_bytes); let bitunpacker = BitUnpacker::new(&data, num_bits); for (i, val) in vals.iter().enumerate() { diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index d470f041f..061ed3910 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -92,7 +92,7 @@ mod tests { } let source = directory.open_read(&path).unwrap(); { - assert_eq!(source.len(), 31 as usize); + assert_eq!(source.len(), 38 as usize); } { let fast_field_readers = FastFieldsReader::open(source).unwrap(); @@ -126,7 +126,7 @@ mod tests { } let source = directory.open_read(&path).unwrap(); { - assert_eq!(source.len(), 56 as usize); + assert_eq!(source.len(), 63 as usize); } { let fast_field_readers = FastFieldsReader::open(source).unwrap(); @@ 
-162,7 +162,7 @@ mod tests { } let source = directory.open_read(&path).unwrap(); { - assert_eq!(source.len(), 29 as usize); + assert_eq!(source.len(), 36 as usize); } { let fast_field_readers = FastFieldsReader::open(source).unwrap(); @@ -195,7 +195,7 @@ mod tests { } let source = directory.open_read(&path).unwrap(); { - assert_eq!(source.len(), 80037 as usize); + assert_eq!(source.len(), 80044 as usize); } { let fast_field_readers = FastFieldsReader::open(source).unwrap(); @@ -232,7 +232,7 @@ mod tests { } let source = directory.open_read(&path).unwrap(); { - assert_eq!(source.len(), 17704 as usize); + assert_eq!(source.len(), 17711 as usize); } { let fast_field_readers = FastFieldsReader::open(source).unwrap(); diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 3db8aa339..94d9d9567 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -159,7 +159,7 @@ pub fn open_index_writer(index: &Index, pub fn compute_deleted_bitset(delete_bitset: &mut BitSet, segment_reader: &SegmentReader, delete_cursor: &mut DeleteCursor, - doc_opstamps: DocToOpstampMapping, + doc_opstamps: &DocToOpstampMapping, target_opstamp: u64) -> Result { @@ -225,7 +225,7 @@ pub fn advance_deletes(mut segment: Segment, compute_deleted_bitset(&mut delete_bitset, &segment_reader, delete_cursor, - DocToOpstampMapping::None, + &DocToOpstampMapping::None, target_opstamp)?; for doc in 0u32..max_doc { @@ -285,7 +285,7 @@ fn index_documents(heap: &mut Heap, let may_have_deletes = compute_deleted_bitset(&mut deleted_bitset, &segment_reader, &mut delete_cursor, - doc_to_opstamps, + &doc_to_opstamps, last_docstamp)?; let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, { diff --git a/src/query/boolean_query/mod.rs b/src/query/boolean_query/mod.rs index 21ea30575..c9dfdfb53 100644 --- a/src/query/boolean_query/mod.rs +++ b/src/query/boolean_query/mod.rs @@ -113,7 +113,7 @@ mod tests { let occurs = vec![Occur::Should, Occur::Should]; let occur_filter 
= OccurFilter::new(&occurs); - let left_fieldnorms = U64FastFieldReader::from(vec![100, 200, 300]); + let left_fieldnorms = U64FastFieldReader::from((0u64..9u64).map(|doc| doc*3).collect::>()); let left = VecPostings::from(vec![1, 2, 3]); let left_scorer = TermScorer { @@ -122,7 +122,7 @@ mod tests { postings: left, }; - let right_fieldnorms = U64FastFieldReader::from(vec![15, 25, 35]); + let right_fieldnorms = U64FastFieldReader::from((0u64..9u64).map(|doc| doc*5).collect::>()); let right = VecPostings::from(vec![1, 3, 8]); let right_scorer = TermScorer { @@ -133,12 +133,12 @@ mod tests { let mut boolean_scorer = BooleanScorer::new(vec![left_scorer, right_scorer], occur_filter); assert_eq!(boolean_scorer.next(), Some(1u32)); - assert!(abs_diff(boolean_scorer.score(), 0.8707107) < 0.001); + assert!(abs_diff(boolean_scorer.score(), 2.3662047) < 0.001); assert_eq!(boolean_scorer.next(), Some(2u32)); - assert!(abs_diff(boolean_scorer.score(), 0.028867513) < 0.001f32); + assert!(abs_diff(boolean_scorer.score(), 0.20412415) < 0.001f32); assert_eq!(boolean_scorer.next(), Some(3u32)); assert_eq!(boolean_scorer.next(), Some(8u32)); - assert!(abs_diff(boolean_scorer.score(), 0.5163978) < 0.001f32); + assert!(abs_diff(boolean_scorer.score(), 0.31622776) < 0.001f32); assert!(!boolean_scorer.advance()); } diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index f80416712..23721037b 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -29,8 +29,7 @@ impl<'a> PhraseScorer<'a> { 'outer: loop { let target = pos_candidate + ord; let positions = positions_arr[ord as usize]; - for i in 0..positions.len() { - let pos_i = positions[i]; + for (i, pos_i) in positions.iter().cloned().enumerate() { if pos_i < target { continue; } diff --git a/src/store/reader.rs b/src/store/reader.rs index 79eb4d4dd..060be4525 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -64,7 +64,7 @@ impl 
StoreReader { } } - +#[allow(needless_pass_by_value)] fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId) { let data_len = data.len(); let footer_offset = data_len - size_of::() - size_of::(); From a13122d392002faa71a68a2a1617b1a605901719 Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Wed, 17 May 2017 08:34:08 +0300 Subject: [PATCH 50/51] use explicit drop instead of suppression --- src/common/bitpacker.rs | 5 +++-- src/query/boolean_query/mod.rs | 6 ++++-- src/store/reader.rs | 5 +++-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/common/bitpacker.rs b/src/common/bitpacker.rs index 1a8b8208d..fe947453d 100644 --- a/src/common/bitpacker.rs +++ b/src/common/bitpacker.rs @@ -120,7 +120,8 @@ impl BitUnpacker { let addr = (idx * self.num_bits) / 8; let bit_shift = idx * self.num_bits - addr * 8; let val_unshifted_unmasked: u64; - debug_assert!(addr + 8 <= self.data_len, "The fast field field should have been padded with 7 bytes."); + debug_assert!(addr + 8 <= self.data_len, + "The fast field field should have been padded with 7 bytes."); val_unshifted_unmasked = unsafe { *(self.data_ptr.offset(addr as isize) as *const u64) }; let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64; (val_shifted & self.mask) @@ -129,7 +130,7 @@ impl BitUnpacker { - + #[cfg(test)] mod test { use super::{BitPacker, BitUnpacker, compute_num_bits}; diff --git a/src/query/boolean_query/mod.rs b/src/query/boolean_query/mod.rs index c9dfdfb53..01ef9e824 100644 --- a/src/query/boolean_query/mod.rs +++ b/src/query/boolean_query/mod.rs @@ -113,7 +113,8 @@ mod tests { let occurs = vec![Occur::Should, Occur::Should]; let occur_filter = OccurFilter::new(&occurs); - let left_fieldnorms = U64FastFieldReader::from((0u64..9u64).map(|doc| doc*3).collect::>()); + let left_fieldnorms = + U64FastFieldReader::from((0u64..9u64).map(|doc| doc * 3).collect::>()); let left = VecPostings::from(vec![1, 2, 3]); let left_scorer = TermScorer { @@ -122,7 
+123,8 @@ mod tests { postings: left, }; - let right_fieldnorms = U64FastFieldReader::from((0u64..9u64).map(|doc| doc*5).collect::>()); + let right_fieldnorms = + U64FastFieldReader::from((0u64..9u64).map(|doc| doc * 5).collect::>()); let right = VecPostings::from(vec![1, 3, 8]); let right_scorer = TermScorer { diff --git a/src/store/reader.rs b/src/store/reader.rs index 060be4525..5569a11a5 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -64,7 +64,6 @@ impl StoreReader { } } -#[allow(needless_pass_by_value)] fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId) { let data_len = data.len(); let footer_offset = data_len - size_of::() - size_of::(); @@ -73,7 +72,9 @@ fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId) let offset = u64::deserialize(&mut serialized_offset_buf).unwrap(); let offset = offset as usize; let max_doc = u32::deserialize(&mut serialized_offset_buf).unwrap(); - (data.slice(0, offset), data.slice(offset, footer_offset), max_doc) + let res = (data.slice(0, offset), data.slice(offset, footer_offset), max_doc); + drop(data); + res } From b3f39f234317aba78733111a260be6d7d03b7e94 Mon Sep 17 00:00:00 2001 From: Laurentiu Nicola Date: Wed, 17 May 2017 09:03:08 +0300 Subject: [PATCH 51/51] Remove unneeded suppressions, make clippy lints explicit --- src/datastruct/fstmap.rs | 2 -- src/error.rs | 2 -- src/indexer/segment_updater.rs | 2 -- src/lib.rs | 3 +-- 4 files changed, 1 insertion(+), 8 deletions(-) diff --git a/src/datastruct/fstmap.rs b/src/datastruct/fstmap.rs index c79c65b67..79c35a4e2 100644 --- a/src/datastruct/fstmap.rs +++ b/src/datastruct/fstmap.rs @@ -1,5 +1,3 @@ -#![allow(should_implement_trait)] - use std::io; use std::io::Write; use fst; diff --git a/src/error.rs b/src/error.rs index 471d21c57..83077c633 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,5 +1,3 @@ -#![allow(enum_variant_names)] - /// Definition of Tantivy's error and result. 
use std::io; diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index ec8c0a7a0..3a17d093c 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -1,5 +1,3 @@ -#![allow(for_kv_map)] - use core::Index; use core::IndexMeta; use core::META_FILEPATH; diff --git a/src/lib.rs b/src/lib.rs index d37ff3b44..5566bcc59 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,5 @@ #![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")] -#![allow(unknown_lints)] // for the clippy lint options -#![allow(module_inception)] +#![cfg_attr(feature = "cargo-clippy", allow(module_inception))] #![cfg_attr(feature = "cargo-clippy", allow(inline_always))] #![feature(box_syntax)]