From 4867be3d3b0c1e47d1ee85238c879dcdf72d7faf Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 10 Jul 2019 19:24:54 +0900 Subject: [PATCH] Kompass master (#590) * Use once_cell in place of lazy_static * Minor changes --- Cargo.toml | 2 +- src/core/index.rs | 2 +- src/core/mod.rs | 24 ++++----- src/core/segment_id.rs | 12 +++-- src/core/segment_meta.rs | 5 +- src/directory/directory_lock.rs | 51 +++++++++--------- src/directory/managed_directory.rs | 9 ++-- src/directory/tests.rs | 5 +- src/fastfield/mod.rs | 16 +++--- src/indexer/segment_manager.rs | 2 +- src/lib.rs | 3 -- src/postings/mod.rs | 86 +++++++++++++++--------------- src/query/fuzzy_query.rs | 23 ++++---- src/schema/facet.rs | 5 +- src/schema/mod.rs | 6 +-- 15 files changed, 121 insertions(+), 130 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 02926ffd0..fd802f0a7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,7 @@ edition = "2018" [dependencies] base64 = "0.10.0" byteorder = "1.0" -lazy_static = "1" +once_cell = "0.2" regex = "1.0" tantivy-fst = "0.1" memmap = {version = "0.7", optional=true} diff --git a/src/core/index.rs b/src/core/index.rs index b31142b53..ad55c53bf 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -38,7 +38,7 @@ fn load_metas(directory: &dyn Directory) -> Result { serde_json::from_str(&meta_string) .map_err(|e| { DataCorruption::new( - META_FILEPATH.clone(), + META_FILEPATH.to_path_buf(), format!("Meta file cannot be deserialized. {:?}.", e), ) }) diff --git a/src/core/mod.rs b/src/core/mod.rs index 9e5717afa..fa2f0dd6a 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -21,18 +21,16 @@ pub use self::segment_id::SegmentId; pub use self::segment_meta::SegmentMeta; pub use self::segment_reader::SegmentReader; -use std::path::PathBuf; +use once_cell::sync::Lazy; +use std::path::Path; -lazy_static! { +/// The meta file contains all the information about the list of segments and the schema +/// of the index. 
+pub static META_FILEPATH: Lazy<&'static Path> = Lazy::new(|| Path::new("meta.json")); - /// The meta file contains all the information about the list of segments and the schema - /// of the index. - pub static ref META_FILEPATH: PathBuf = PathBuf::from("meta.json"); - - /// The managed file contains a list of files that were created by the tantivy - /// and will therefore be garbage collected when they are deemed useless by tantivy. - /// - /// Removing this file is safe, but will prevent the garbage collection of all of the file that - /// are currently in the directory - pub static ref MANAGED_FILEPATH: PathBuf = PathBuf::from(".managed.json"); -} +/// The managed file contains a list of files that were created by tantivy +/// and will therefore be garbage collected when they are deemed useless by tantivy. +/// +/// Removing this file is safe, but will prevent the garbage collection of all of the files that +/// are currently in the directory +pub static MANAGED_FILEPATH: Lazy<&'static Path> = Lazy::new(|| Path::new(".managed.json")); diff --git a/src/core/segment_id.rs b/src/core/segment_id.rs index e86d08535..dda71bb8b 100644 --- a/src/core/segment_id.rs +++ b/src/core/segment_id.rs @@ -2,6 +2,8 @@ use std::cmp::{Ord, Ordering}; use std::fmt; use uuid::Uuid; +#[cfg(test)] +use once_cell::sync::Lazy; #[cfg(test)] use std::sync::atomic; @@ -17,10 +19,10 @@ use std::sync::atomic; pub struct SegmentId(Uuid); #[cfg(test)] -lazy_static! { - static ref AUTO_INC_COUNTER: atomic::AtomicUsize = atomic::AtomicUsize::default(); - static ref ZERO_ARRAY: [u8; 8] = [0u8; 8]; -} +static AUTO_INC_COUNTER: Lazy<atomic::AtomicUsize> = Lazy::new(|| atomic::AtomicUsize::default()); + +#[cfg(test)] +const ZERO_ARRAY: [u8; 8] = [0u8; 8]; // During tests, we generate the segment id in a autoincrement manner // for consistency of segment id between run. @@ -30,7 +32,7 @@ lazy_static! 
{ #[cfg(test)] fn create_uuid() -> Uuid { let new_auto_inc_id = (*AUTO_INC_COUNTER).fetch_add(1, atomic::Ordering::SeqCst); - Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &*ZERO_ARRAY).unwrap() + Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &ZERO_ARRAY).unwrap() } #[cfg(not(test))] diff --git a/src/core/segment_meta.rs b/src/core/segment_meta.rs index 1834c5514..7adfaa3bc 100644 --- a/src/core/segment_meta.rs +++ b/src/core/segment_meta.rs @@ -2,14 +2,13 @@ use super::SegmentComponent; use crate::core::SegmentId; use crate::Opstamp; use census::{Inventory, TrackedObject}; +use once_cell::sync::Lazy; use serde; use std::collections::HashSet; use std::fmt; use std::path::PathBuf; -lazy_static! { - static ref INVENTORY: Inventory = { Inventory::new() }; -} +static INVENTORY: Lazy> = Lazy::new(Inventory::new); #[derive(Clone, Debug, Serialize, Deserialize)] struct DeleteMeta { diff --git a/src/directory/directory_lock.rs b/src/directory/directory_lock.rs index 67c2585dd..b726aa087 100644 --- a/src/directory/directory_lock.rs +++ b/src/directory/directory_lock.rs @@ -1,3 +1,4 @@ +use once_cell::sync::Lazy; use std::path::PathBuf; /// A directory lock. @@ -28,29 +29,27 @@ pub struct Lock { pub is_blocking: bool, } -lazy_static! { - /// Only one process should be able to write tantivy's index at a time. - /// This lock file, when present, is in charge of preventing other processes to open an IndexWriter. - /// - /// If the process is killed and this file remains, it is safe to remove it manually. - /// - /// Failing to acquire this lock usually means a misuse of tantivy's API, - /// (creating more than one instance of the `IndexWriter`), are a spurious - /// lock file remaining after a crash. In the latter case, removing the file after - /// checking no process running tantivy is running is safe. 
- pub static ref INDEX_WRITER_LOCK: Lock = Lock { - filepath: PathBuf::from(".tantivy-writer.lock"), - is_blocking: false - }; - /// The meta lock file is here to protect the segment files being opened by - /// `IndexReader::reload()` from being garbage collected. - /// It makes it possible for another process to safely consume - /// our index in-writing. Ideally, we may have prefered `RWLock` semantics - /// here, but it is difficult to achieve on Windows. - /// - /// Opening segment readers is a very fast process. - pub static ref META_LOCK: Lock = Lock { - filepath: PathBuf::from(".tantivy-meta.lock"), - is_blocking: true - }; -} +/// Only one process should be able to write tantivy's index at a time. +/// This lock file, when present, is in charge of preventing other processes from opening an IndexWriter. +/// +/// If the process is killed and this file remains, it is safe to remove it manually. +/// +/// Failing to acquire this lock usually means a misuse of tantivy's API, +/// (creating more than one instance of the `IndexWriter`), or a spurious +/// lock file remaining after a crash. In the latter case, removing the file after +/// checking no process running tantivy is running is safe. +pub static INDEX_WRITER_LOCK: Lazy<Lock> = Lazy::new(|| Lock { + filepath: PathBuf::from(".tantivy-writer.lock"), + is_blocking: false, +}); +/// The meta lock file is here to protect the segment files being opened by +/// `IndexReader::reload()` from being garbage collected. +/// It makes it possible for another process to safely consume +/// our index in-writing. Ideally, we may have preferred `RWLock` semantics +/// here, but it is difficult to achieve on Windows. +/// +/// Opening segment readers is a very fast process. 
+pub static META_LOCK: Lazy = Lazy::new(|| Lock { + filepath: PathBuf::from(".tantivy-meta.lock"), + is_blocking: true, +}); diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index e70a0d342..6a9fd09a9 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -69,7 +69,7 @@ impl ManagedDirectory { let managed_files: HashSet = serde_json::from_str(&managed_files_json) .map_err(|e| { DataCorruption::new( - MANAGED_FILEPATH.clone(), + MANAGED_FILEPATH.to_path_buf(), format!("Managed file cannot be deserialized: {:?}. ", e), ) })?; @@ -264,13 +264,12 @@ mod tests { mod mmap_specific { use super::super::*; + use once_cell::sync::Lazy; use std::path::Path; use tempdir::TempDir; - lazy_static! { - static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test"); - static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2"); - } + static TEST_PATH1: Lazy<&'static Path> = Lazy::new(|| Path::new("some_path_for_test")); + static TEST_PATH2: Lazy<&'static Path> = Lazy::new(|| Path::new("some_path_for_test2")); use crate::directory::MmapDirectory; use std::io::Write; diff --git a/src/directory/tests.rs b/src/directory/tests.rs index f27dc9113..2ad857e4c 100644 --- a/src/directory/tests.rs +++ b/src/directory/tests.rs @@ -1,4 +1,5 @@ use super::*; +use once_cell::sync::Lazy; use std::io::Write; use std::mem; use std::path::{Path, PathBuf}; @@ -9,9 +10,7 @@ use std::thread; use std::time; use std::time::Duration; -lazy_static! 
{ - static ref TEST_PATH: &'static Path = Path::new("some_path_for_test"); -} +static TEST_PATH: Lazy<&'static Path> = Lazy::new(|| Path::new("some_path_for_test")); #[test] fn test_ram_directory() { diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 7b47447a1..aa5104d1f 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -133,20 +133,20 @@ mod tests { use crate::schema::Field; use crate::schema::Schema; use crate::schema::FAST; + use once_cell::sync::Lazy; use rand::prelude::SliceRandom; use rand::rngs::StdRng; use rand::SeedableRng; use std::collections::HashMap; use std::path::Path; - lazy_static! { - pub static ref SCHEMA: Schema = { - let mut schema_builder = Schema::builder(); - schema_builder.add_u64_field("field", FAST); - schema_builder.build() - }; - pub static ref FIELD: Field = { SCHEMA.get_field("field").unwrap() }; - } + pub static SCHEMA: Lazy = Lazy::new(|| { + let mut schema_builder = Schema::builder(); + schema_builder.add_u64_field("field", FAST); + schema_builder.build() + }); + + pub static FIELD: Lazy = Lazy::new(|| SCHEMA.get_field("field").unwrap()); #[test] pub fn test_fastfield() { diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index e4113a45f..5b551155e 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -81,7 +81,7 @@ impl SegmentManager { /// but have not yet been deleted by the garbage collector. pub fn list_files(&self) -> HashSet { let mut files = HashSet::new(); - files.insert(META_FILEPATH.clone()); + files.insert(META_FILEPATH.to_path_buf()); for segment_meta in SegmentMeta::all() { files.extend(segment_meta.list_files()); } diff --git a/src/lib.rs b/src/lib.rs index ebae6e1f0..8eefccb3e 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -108,9 +108,6 @@ //! [literate programming](http://fulmicoton.com/tantivy-examples/simple_search.html) / //! 
[source code](https://github.com/fulmicoton/tantivy/blob/master/examples/simple_search.rs)) -#[macro_use] -extern crate lazy_static; - #[macro_use] extern crate serde_derive; diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 0981e872b..b8b511064 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -63,6 +63,7 @@ pub mod tests { use crate::tokenizer::{SimpleTokenizer, MAX_TOKEN_LEN}; use crate::DocId; use crate::Score; + use once_cell::sync::Lazy; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use std::iter; @@ -509,53 +510,52 @@ pub mod tests { } } - lazy_static! { - pub static ref TERM_A: Term = { - let field = Field(0); - Term::from_field_text(field, "a") - }; - pub static ref TERM_B: Term = { - let field = Field(0); - Term::from_field_text(field, "b") - }; - pub static ref TERM_C: Term = { - let field = Field(0); - Term::from_field_text(field, "c") - }; - pub static ref TERM_D: Term = { - let field = Field(0); - Term::from_field_text(field, "d") - }; - pub static ref INDEX: Index = { - let mut schema_builder = Schema::builder(); - let text_field = schema_builder.add_text_field("text", STRING); - let schema = schema_builder.build(); + pub static TERM_A: Lazy = Lazy::new(|| { + let field = Field(0); + Term::from_field_text(field, "a") + }); + pub static TERM_B: Lazy = Lazy::new(|| { + let field = Field(0); + Term::from_field_text(field, "b") + }); + pub static TERM_C: Lazy = Lazy::new(|| { + let field = Field(0); + Term::from_field_text(field, "c") + }); + pub static TERM_D: Lazy = Lazy::new(|| { + let field = Field(0); + Term::from_field_text(field, "d") + }); - let mut rng: StdRng = StdRng::from_seed([1u8; 32]); + pub static INDEX: Lazy = Lazy::new(|| { + let mut schema_builder = Schema::builder(); + let text_field = schema_builder.add_text_field("text", STRING); + let schema = schema_builder.build(); - let index = Index::create_in_ram(schema); - let posting_list_size = 1_000_000; - { - let mut index_writer = 
index.writer_with_num_threads(1, 3_000_000).unwrap(); - for _ in 0..posting_list_size { - let mut doc = Document::default(); - if rng.gen_bool(1f64 / 15f64) { - doc.add_text(text_field, "a"); - } - if rng.gen_bool(1f64 / 10f64) { - doc.add_text(text_field, "b"); - } - if rng.gen_bool(1f64 / 5f64) { - doc.add_text(text_field, "c"); - } - doc.add_text(text_field, "d"); - index_writer.add_document(doc); + let mut rng: StdRng = StdRng::from_seed([1u8; 32]); + + let index = Index::create_in_ram(schema); + let posting_list_size = 1_000_000; + { + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + for _ in 0..posting_list_size { + let mut doc = Document::default(); + if rng.gen_bool(1f64 / 15f64) { + doc.add_text(text_field, "a"); } - assert!(index_writer.commit().is_ok()); + if rng.gen_bool(1f64 / 10f64) { + doc.add_text(text_field, "b"); + } + if rng.gen_bool(1f64 / 5f64) { + doc.add_text(text_field, "c"); + } + doc.add_text(text_field, "d"); + index_writer.add_document(doc); } - index - }; - } + assert!(index_writer.commit().is_ok()); + } + index + }); /// Wraps a given docset, and forward alls call but the /// `.skip_next(...)`. This is useful to test that a specialized diff --git a/src/query/fuzzy_query.rs b/src/query/fuzzy_query.rs index 982031b7c..cffdb0502 100644 --- a/src/query/fuzzy_query.rs +++ b/src/query/fuzzy_query.rs @@ -3,21 +3,20 @@ use crate::schema::Term; use crate::Result; use crate::Searcher; use levenshtein_automata::{LevenshteinAutomatonBuilder, DFA}; +use once_cell::sync::Lazy; use std::collections::HashMap; -lazy_static! 
{ - static ref LEV_BUILDER: HashMap<(u8, bool), LevenshteinAutomatonBuilder> = { - let mut lev_builder_cache = HashMap::new(); - // TODO make population lazy on a `(distance, val)` basis - for distance in 0..3 { - for &transposition in &[false, true] { - let lev_automaton_builder = LevenshteinAutomatonBuilder::new(distance, transposition); - lev_builder_cache.insert((distance, transposition), lev_automaton_builder); - } +static LEV_BUILDER: Lazy> = Lazy::new(|| { + let mut lev_builder_cache = HashMap::new(); + // TODO make population lazy on a `(distance, val)` basis + for distance in 0..3 { + for &transposition in &[false, true] { + let lev_automaton_builder = LevenshteinAutomatonBuilder::new(distance, transposition); + lev_builder_cache.insert((distance, transposition), lev_automaton_builder); } - lev_builder_cache - }; -} + } + lev_builder_cache +}); /// A Fuzzy Query matches all of the documents /// containing a specific term that is within diff --git a/src/schema/facet.rs b/src/schema/facet.rs index 28d2ea81d..4477982ee 100644 --- a/src/schema/facet.rs +++ b/src/schema/facet.rs @@ -1,4 +1,5 @@ use crate::common::BinarySerializable; +use once_cell::sync::Lazy; use regex::Regex; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use std::borrow::Borrow; @@ -183,9 +184,7 @@ impl Display for Facet { } fn escape_slashes(s: &str) -> Cow<'_, str> { - lazy_static! { - static ref SLASH_PTN: Regex = Regex::new(r"[\\/]").unwrap(); - } + static SLASH_PTN: Lazy = Lazy::new(|| Regex::new(r"[\\/]").unwrap()); SLASH_PTN.replace_all(s, "\\/") } diff --git a/src/schema/mod.rs b/src/schema/mod.rs index a1d90bef9..7ac3a1448 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -146,6 +146,7 @@ pub use self::flags::{FAST, INDEXED, STORED}; pub use self::int_options::Cardinality; pub use self::int_options::IntOptions; +use once_cell::sync::Lazy; use regex::Regex; /// Validator for a potential `field_name`. 
@@ -154,9 +155,8 @@ use regex::Regex; /// A field name must start by a letter `[a-zA-Z]`. /// The other characters can be any alphanumic character `[a-ZA-Z0-9]` or `_`. pub fn is_valid_field_name(field_name: &str) -> bool { - lazy_static! { - static ref FIELD_NAME_PTN: Regex = Regex::new("^[a-zA-Z][_a-zA-Z0-9]*$").unwrap(); - } + static FIELD_NAME_PTN: Lazy = + Lazy::new(|| Regex::new("^[a-zA-Z][_a-zA-Z0-9]*$").unwrap()); FIELD_NAME_PTN.is_match(field_name) }