From cc50bdb06abd4c8549ac12ac28d36f59621bc894 Mon Sep 17 00:00:00 2001 From: Dru Sellers Date: Sat, 18 Aug 2018 18:07:54 -0500 Subject: [PATCH 01/62] Add a basic faceted search example (#383) * Add a basic faceted search example * quieting the compiler --- examples/faceted_search.rs | 81 +++++++++++++++++++++++++++++++++++ examples/working_with_json.rs | 8 ++-- 2 files changed, 84 insertions(+), 5 deletions(-) create mode 100644 examples/faceted_search.rs diff --git a/examples/faceted_search.rs b/examples/faceted_search.rs new file mode 100644 index 000000000..76d167778 --- /dev/null +++ b/examples/faceted_search.rs @@ -0,0 +1,81 @@ +// # Basic Example +// +// This example covers the basic functionalities of +// tantivy. +// +// We will : +// - define our schema +// = create an index in a directory +// - index few documents in our index +// - search for the best document matchings "sea whale" +// - retrieve the best document original content. + +extern crate tempdir; + +// --- +// Importing tantivy... +#[macro_use] +extern crate tantivy; +use tantivy::collector::FacetCollector; +use tantivy::query::AllQuery; +use tantivy::schema::*; +use tantivy::Index; + +fn main() -> tantivy::Result<()> { + // Let's create a temporary directory for the + // sake of this example + let index_path = TempDir::new("tantivy_facet_example_dir")?; + let mut schema_builder = SchemaBuilder::default(); + + schema_builder.add_text_field("name", TEXT | STORED); + + // this is our faceted field + schema_builder.add_facet_field("tags"); + + let schema = schema_builder.build(); + + let index = Index::create_in_dir(&index_path, schema.clone())?; + + let mut index_writer = index.writer(50_000_000)?; + + let name = schema.get_field("name").unwrap(); + let tags = schema.get_field("tags").unwrap(); + + // For convenience, tantivy also comes with a macro to + // reduce the boilerplate above. + index_writer.add_document(doc!( + name => "the ditch", + tags => Facet::from("/pools/north") + )); + + index_writer.add_document(doc!( + name => "little stacey", + tags => Facet::from("/pools/south") + )); + + index_writer.commit()?; + + index.load_searchers()?; + + let searcher = index.searcher(); + + let mut facet_collector = FacetCollector::for_field(tags); + facet_collector.add_facet("/pools"); + + searcher.search(&AllQuery, &mut facet_collector).unwrap(); + + let counts = facet_collector.harvest(); + // This lists all of the facet counts + let facets: Vec<(&Facet, u64)> = counts.get("/pools").collect(); + assert_eq!( + facets, + vec![ + (&Facet::from("/pools/north"), 1), + (&Facet::from("/pools/south"), 1) + ] + ); + + Ok(()) +} + +use tempdir::TempDir; diff --git a/examples/working_with_json.rs b/examples/working_with_json.rs index 5de285df2..3c8e3c1ca 100644 --- a/examples/working_with_json.rs +++ b/examples/working_with_json.rs @@ -1,7 +1,6 @@ extern crate tantivy; use tantivy::schema::*; - // # Document from json // // For convenience, `Document` can be parsed directly from json. @@ -23,8 +22,8 @@ fn main() -> tantivy::Result<()> { }"#; // We can parse our document - let mice_and_men_doc = schema.parse_document(&mice_and_men_doc_json)?; - + let _mice_and_men_doc = schema.parse_document(&mice_and_men_doc_json)?; + // Multi-valued field are allowed, they are // expressed in JSON by an array. // The following document has two titles. @@ -32,8 +31,7 @@ fn main() -> tantivy::Result<()> { "title": ["Frankenstein", "The Modern Prometheus"], "year": 1818 }"#; - let frankenstein_doc = schema.parse_document(&frankenstein_json)?; - + let _frankenstein_doc = schema.parse_document(&frankenstein_json)?; // Note that the schema is saved in your index directory. // From 0feeef26843e17d731cd06080d8f2a8f7819bd90 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" Date: Mon, 20 Aug 2018 09:08:11 +0900 Subject: [PATCH 02/62] Update owning_ref requirement from 0.3 to 0.4 (#379) Updates the requirements on [owning_ref](https://github.com/Kimundi/owning-ref-rs) to permit the latest version. - [Release notes](https://github.com/Kimundi/owning-ref-rs/releases) - [Commits](https://github.com/Kimundi/owning-ref-rs/commits) Signed-off-by: dependabot[bot] --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index d852f17a7..e0ba03e6a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,7 +39,7 @@ crossbeam-channel = "0.2" futures = "0.1" futures-cpupool = "0.1" error-chain = "0.8" -owning_ref = "0.3" +owning_ref = "0.4" stable_deref_trait = "1.0.0" rust-stemmers = "1" downcast = { version="0.9" } From a0a284fe91c0ca16961875959300fcf917327c43 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 20 Aug 2018 09:21:32 +0900 Subject: [PATCH 03/62] Added a full fledge empty query and relyign on it in QueryParser, instead of using an empty clause. --- src/query/empty_query.rs | 81 ++++++++++++++++++++++++++ src/query/mod.rs | 3 +- src/query/query_parser/query_parser.rs | 33 +++++++++-- src/query/scorer.rs | 45 -------------- 4 files changed, 111 insertions(+), 51 deletions(-) create mode 100644 src/query/empty_query.rs diff --git a/src/query/empty_query.rs b/src/query/empty_query.rs new file mode 100644 index 000000000..06c15c3f3 --- /dev/null +++ b/src/query/empty_query.rs @@ -0,0 +1,81 @@ +use super::Scorer; +use DocSet; +use Score; +use DocId; +use query::Query; +use Result; +use Searcher; +use query::Weight; +use SegmentReader; + +/// `EmptyQuery` is a dummy `Query` in which no document matches. +/// +/// It is useful for tests and handling edge cases. +#[derive(Clone, Debug)] +pub struct EmptyQuery; + +impl Query for EmptyQuery { + fn weight(&self, _searcher: &Searcher, _scoring_enabled: bool) -> Result> { + Ok(Box::new(EmptyWeight)) + } + + fn count(&self, _searcher: &Searcher) -> Result { + Ok(0) + } +} + +/// `EmptyWeight` is a dummy `Weight` in which no document matches. +/// +/// It is useful for tests and handling edge cases. +pub struct EmptyWeight; +impl Weight for EmptyWeight { + fn scorer(&self, _reader: &SegmentReader) -> Result> { + Ok(Box::new(EmptyScorer)) + } +} + +/// `EmptyScorer` is a dummy `Scorer` in which no document matches. +/// +/// It is useful for tests and handling edge cases. +pub struct EmptyScorer; + +impl DocSet for EmptyScorer { + fn advance(&mut self) -> bool { + false + } + + fn doc(&self) -> DocId { + panic!( + "You may not call .doc() on a scorer \ + where the last call to advance() did not return true." + ); + } + + fn size_hint(&self) -> u32 { + 0 + } +} + +impl Scorer for EmptyScorer { + fn score(&mut self) -> Score { + 0f32 + } +} + +#[cfg(test)] +mod tests { + use query::EmptyScorer; + use DocSet; + + #[test] + fn test_empty_scorer() { + let mut empty_scorer = EmptyScorer; + assert!(!empty_scorer.advance()); + } + + #[test] + #[should_panic] + fn test_empty_scorer_panic_on_doc_call() { + EmptyScorer.doc(); + } +} diff --git a/src/query/mod.rs b/src/query/mod.rs index 7b6368c00..7546465fb 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -3,6 +3,7 @@ Query */ mod all_query; +mod empty_query; mod automaton_weight; mod bitset; mod bm25; @@ -34,6 +35,7 @@ pub use self::union::Union; pub use self::vec_docset::VecDocSet; pub use self::all_query::{AllQuery, AllScorer, AllWeight}; +pub use self::empty_query::{EmptyQuery, EmptyWeight, EmptyScorer}; pub use self::automaton_weight::AutomatonWeight; pub use self::bitset::BitSetDocSet; pub use self::boolean_query::BooleanQuery; @@ -49,7 +51,6 @@ pub use self::range_query::RangeQuery; pub use self::regex_query::RegexQuery; pub use self::reqopt_scorer::RequiredOptionalScorer; pub use self::scorer::ConstScorer; -pub use self::scorer::EmptyScorer; pub use self::scorer::Scorer; pub use self::term_query::TermQuery; pub use self::weight::Weight; diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 0b413345b..abe6b404f 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -18,6 +18,7 @@ use std::ops::Bound; use std::str::FromStr; use tokenizer::TokenizerManager; use combine::Parser; +use query::EmptyQuery; /// Possible error that may happen when parsing a query. @@ -438,11 +439,15 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box { fn convert_to_query(logical_ast: LogicalAST) -> Box { match logical_ast { LogicalAST::Clause(clause) => { - let occur_subqueries = clause - .into_iter() - .map(|(occur, subquery)| (occur, convert_to_query(subquery))) - .collect::>(); - Box::new(BooleanQuery::from(occur_subqueries)) + if clause.is_empty() { + Box::new(EmptyQuery) + } else { + let occur_subqueries = clause + .into_iter() + .map(|(occur, subquery)| (occur, convert_to_query(subquery))) + .collect::>(); + Box::new(BooleanQuery::from(occur_subqueries)) + } } LogicalAST::Leaf(logical_literal) => convert_literal_to_query(*logical_literal), } @@ -541,6 +546,24 @@ mod test { ); } + #[test] + pub fn test_parse_query_empty() { + test_parse_query_to_logical_ast_helper( + "", + "", + false, + ); + test_parse_query_to_logical_ast_helper( + " ", + "", + false, + ); + let query_parser = make_query_parser(); + let query_result = query_parser.parse_query(""); + let query = query_result.unwrap(); + assert_eq!(format!("{:?}", query), "EmptyQuery"); + } + #[test] pub fn test_parse_query_ints() { let query_parser = make_query_parser(); diff --git a/src/query/scorer.rs b/src/query/scorer.rs index a94b03a5b..186e75a22 100644 --- a/src/query/scorer.rs +++ b/src/query/scorer.rs @@ -50,33 +50,6 @@ impl Scorer for Box { } } -/// `EmptyScorer` is a dummy `Scorer` in which no document matches. -/// -/// It is useful for tests and handling edge cases. -pub struct EmptyScorer; - -impl DocSet for EmptyScorer { - fn advance(&mut self) -> bool { - false - } - - fn doc(&self) -> DocId { - panic!( - "You may not call .doc() on a scorer \ - where the last call to advance() did not return true." - ); - } - - fn size_hint(&self) -> u32 { - 0 - } -} - -impl Scorer for EmptyScorer { - fn score(&mut self) -> Score { - 0f32 - } -} /// Wraps a `DocSet` and simply returns a constant `Scorer`. /// The `ConstScorer` is useful if you have a `DocSet` where @@ -135,21 +108,3 @@ impl Scorer for ConstScorer { 1f32 } } - -#[cfg(test)] -mod tests { - use super::EmptyScorer; - use DocSet; - - #[test] - fn test_empty_scorer() { - let mut empty_scorer = EmptyScorer; - assert!(!empty_scorer.advance()); - } - - #[test] - #[should_panic] - fn test_empty_scorer_panic_on_doc_call() { - EmptyScorer.doc(); - } -} From ef3a16a129699ce1286fccc12ebd0eb5387ae5ab Mon Sep 17 00:00:00 2001 From: Dru Sellers Date: Sun, 19 Aug 2018 19:40:45 -0500 Subject: [PATCH 04/62] Switch from error-chain to failure crate (#376) * Switch from error-chain to failure crate * Added deprecated alias for * Started editing the changeld --- CHANGELOG.md | 8 ++ Cargo.toml | 2 +- src/core/index.rs | 7 +- src/core/segment_reader.rs | 10 +- src/directory/managed_directory.rs | 5 +- src/error.rs | 151 +++++++++++-------------- src/fastfield/error.rs | 3 +- src/indexer/index_writer.rs | 19 ++-- src/indexer/merger.rs | 2 +- src/indexer/segment_manager.rs | 6 +- src/indexer/segment_updater.rs | 14 +-- src/lib.rs | 23 ++-- src/query/phrase_query/mod.rs | 7 +- src/query/phrase_query/phrase_query.rs | 23 ++-- src/query/range_query.rs | 4 +- src/query/regex_query.rs | 4 +- 16 files changed, 142 insertions(+), 146 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0be47a0b7..ef956ac52 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ + +Tantivy 0.7 +===================== +- Skip data for doc ids and positions (@fulmicoton), + greatly improving performance +- Tantivy error now rely on the failure crate (@drusellers) + + Tantivy 0.6.1 ========================= - Bugfix #324. GC removing was removing file that were still in useful diff --git a/Cargo.toml b/Cargo.toml index e0ba03e6a..ab767d3fd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,7 +38,6 @@ crossbeam = "0.4" crossbeam-channel = "0.2" futures = "0.1" futures-cpupool = "0.1" -error-chain = "0.8" owning_ref = "0.4" stable_deref_trait = "1.0.0" rust-stemmers = "1" @@ -48,6 +47,7 @@ bitpacking = "0.5" census = "0.1" fnv = "1.0.6" owned-read = "0.4" +failure = "0.1" [target.'cfg(windows)'.dependencies] winapi = "0.2" diff --git a/src/core/index.rs b/src/core/index.rs index c88292ba1..efdfedc5f 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -1,5 +1,5 @@ use core::SegmentId; -use error::{ErrorKind, ResultExt}; +use error::TantivyError; use schema::Schema; use serde_json; use std::borrow::BorrowMut; @@ -17,10 +17,10 @@ use core::IndexMeta; use core::SegmentMeta; use core::SegmentReader; use core::META_FILEPATH; -use directory::{ManagedDirectory, DirectoryClone}; #[cfg(feature = "mmap")] use directory::MmapDirectory; use directory::{Directory, RAMDirectory}; +use directory::{DirectoryClone, ManagedDirectory}; use indexer::index_writer::open_index_writer; use indexer::index_writer::HEAP_SIZE_MIN; use indexer::segment_updater::save_new_metas; @@ -33,7 +33,8 @@ use IndexWriter; fn load_metas(directory: &Directory) -> Result { let meta_data = directory.atomic_read(&META_FILEPATH)?; let meta_string = String::from_utf8_lossy(&meta_data); - serde_json::from_str(&meta_string).chain_err(|| ErrorKind::CorruptedFile(META_FILEPATH.clone())) + serde_json::from_str(&meta_string) + .map_err(|_| TantivyError::CorruptedFile(META_FILEPATH.clone())) } /// Search Index diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index f0edb86b3..37b950332 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -5,7 +5,7 @@ use core::Segment; use core::SegmentComponent; use core::SegmentId; use core::SegmentMeta; -use error::ErrorKind; +use error::TantivyError; use fastfield::DeleteBitSet; use fastfield::FacetReader; use fastfield::FastFieldReader; @@ -171,7 +171,7 @@ impl SegmentReader { pub fn facet_reader(&self, field: Field) -> Result { let field_entry = self.schema.get_field_entry(field); if field_entry.field_type() != &FieldType::HierarchicalFacet { - return Err(ErrorKind::InvalidArgument(format!( + return Err(TantivyError::InvalidArgument(format!( "The field {:?} is not a \ hierarchical facet.", field_entry @@ -179,7 +179,7 @@ impl SegmentReader { } let term_ords_reader = self.multi_fast_field_reader(field)?; let termdict_source = self.termdict_composite.open_read(field).ok_or_else(|| { - ErrorKind::InvalidArgument(format!( + TantivyError::InvalidArgument(format!( "The field \"{}\" is a hierarchical \ but this segment does not seem to have the field term \ dictionary.", @@ -462,9 +462,7 @@ mod test { index.load_searchers().unwrap(); let searcher = index.searcher(); - let docs: Vec = searcher.segment_reader(0) - .doc_ids_alive() - .collect(); + let docs: Vec = searcher.segment_reader(0).doc_ids_alive().collect(); assert_eq!(vec![0u32, 2u32], docs); } } diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index 2977337c6..cf59b9ace 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -1,7 +1,7 @@ use core::MANAGED_FILEPATH; use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError}; use directory::{ReadOnlySource, WritePtr}; -use error::{ErrorKind, Result, ResultExt}; +use error::TantivyError; use serde_json; use std::collections::HashSet; use std::io; @@ -11,6 +11,7 @@ use std::result; use std::sync::RwLockWriteGuard; use std::sync::{Arc, RwLock}; use Directory; +use Result; /// Wrapper of directories that keeps track of files created by Tantivy. /// @@ -51,7 +52,7 @@ impl ManagedDirectory { Ok(data) => { let managed_files_json = String::from_utf8_lossy(&data); let managed_files: HashSet = serde_json::from_str(&managed_files_json) - .chain_err(|| ErrorKind::CorruptedFile(MANAGED_FILEPATH.clone()))?; + .map_err(|_| TantivyError::CorruptedFile(MANAGED_FILEPATH.clone()))?; Ok(ManagedDirectory { directory: Box::new(directory), meta_informations: Arc::new(RwLock::new(MetaInformation { diff --git a/src/error.rs b/src/error.rs index 4ec4bfe25..8fa5cb1ce 100644 --- a/src/error.rs +++ b/src/error.rs @@ -10,129 +10,114 @@ use serde_json; use std::path::PathBuf; use std::sync::PoisonError; -error_chain!( - errors { - /// Path does not exist. - PathDoesNotExist(buf: PathBuf) { - description("path does not exist") - display("path does not exist: '{:?}'", buf) - } - /// File already exists, this is a problem when we try to write into a new file. - FileAlreadyExists(buf: PathBuf) { - description("file already exists") - display("file already exists: '{:?}'", buf) - } - /// IO Error. - IOError(err: IOError) { - description("an IO error occurred") - display("an IO error occurred: '{}'", err) - } - /// The data within is corrupted. - /// - /// For instance, it contains invalid JSON. - CorruptedFile(buf: PathBuf) { - description("file contains corrupted data") - display("file contains corrupted data: '{:?}'", buf) - } - /// A thread holding the locked panicked and poisoned the lock. - Poisoned { - description("a thread holding the locked panicked and poisoned the lock") - } - /// Invalid argument was passed by the user. - InvalidArgument(arg: String) { - description("an invalid argument was passed") - display("an invalid argument was passed: '{}'", arg) - } - /// An Error happened in one of the thread. - ErrorInThread(err: String) { - description("an error occurred in a thread") - display("an error occurred in a thread: '{}'", err) - } - /// An Error appeared related to the schema. - SchemaError(message: String) { - description("the schema is not matching expectations.") - display("Schema error: '{}'", message) - } - /// Tried to access a fastfield reader for a field not configured accordingly. - FastFieldError(err: FastFieldNotAvailableError) { - description("fast field not available") - display("fast field not available: '{:?}'", err) - } - } -); +/// The library's failure based error enum +#[derive(Debug, Fail)] +pub enum TantivyError { + /// Path does not exist. + #[fail(display = "path does not exist: '{:?}'", _0)] + PathDoesNotExist(PathBuf), + /// File already exists, this is a problem when we try to write into a new file. + #[fail(display = "file already exists: '{:?}'", _0)] + FileAlreadyExists(PathBuf), + /// IO Error. + #[fail(display = "an IO error occurred: '{}'", _0)] + IOError(#[cause] IOError), + /// The data within is corrupted. + /// + /// For instance, it contains invalid JSON. + #[fail(display = "file contains corrupted data: '{:?}'", _0)] + CorruptedFile(PathBuf), + /// A thread holding the locked panicked and poisoned the lock. + #[fail(display = "a thread holding the locked panicked and poisoned the lock")] + Poisoned, + /// Invalid argument was passed by the user. + #[fail(display = "an invalid argument was passed: '{}'", _0)] + InvalidArgument(String), + /// An Error happened in one of the thread. + #[fail(display = "an error occurred in a thread: '{}'", _0)] + ErrorInThread(String), + /// An Error appeared related to the schema. + #[fail(display = "Schema error: '{}'", _0)] + SchemaError(String), + /// Tried to access a fastfield reader for a field not configured accordingly. + #[fail(display = "fast field not available: '{:?}'", _0)] + FastFieldError(#[cause] FastFieldNotAvailableError), +} -impl From for Error { - fn from(fastfield_error: FastFieldNotAvailableError) -> Error { - ErrorKind::FastFieldError(fastfield_error).into() +impl From for TantivyError { + fn from(fastfield_error: FastFieldNotAvailableError) -> TantivyError { + TantivyError::FastFieldError(fastfield_error).into() } } -impl From for Error { - fn from(io_error: IOError) -> Error { - ErrorKind::IOError(io_error).into() +impl From for TantivyError { + fn from(io_error: IOError) -> TantivyError { + TantivyError::IOError(io_error).into() } } -impl From for Error { - fn from(io_error: io::Error) -> Error { - ErrorKind::IOError(io_error.into()).into() +impl From for TantivyError { + fn from(io_error: io::Error) -> TantivyError { + TantivyError::IOError(io_error.into()).into() } } -impl From for Error { - fn from(parsing_error: query::QueryParserError) -> Error { - ErrorKind::InvalidArgument(format!("Query is invalid. {:?}", parsing_error)).into() +impl From for TantivyError { + fn from(parsing_error: query::QueryParserError) -> TantivyError { + TantivyError::InvalidArgument(format!("Query is invalid. {:?}", parsing_error)).into() } } -impl From> for Error { - fn from(_: PoisonError) -> Error { - ErrorKind::Poisoned.into() +impl From> for TantivyError { + fn from(_: PoisonError) -> TantivyError { + TantivyError::Poisoned.into() } } -impl From for Error { - fn from(error: OpenReadError) -> Error { +impl From for TantivyError { + fn from(error: OpenReadError) -> TantivyError { match error { OpenReadError::FileDoesNotExist(filepath) => { - ErrorKind::PathDoesNotExist(filepath).into() + TantivyError::PathDoesNotExist(filepath).into() } - OpenReadError::IOError(io_error) => ErrorKind::IOError(io_error).into(), + OpenReadError::IOError(io_error) => TantivyError::IOError(io_error).into(), } } } -impl From for Error { - fn from(error: schema::DocParsingError) -> Error { - ErrorKind::InvalidArgument(format!("Failed to parse document {:?}", error)).into() +impl From for TantivyError { + fn from(error: schema::DocParsingError) -> TantivyError { + TantivyError::InvalidArgument(format!("Failed to parse document {:?}", error)).into() } } -impl From for Error { - fn from(error: OpenWriteError) -> Error { +impl From for TantivyError { + fn from(error: OpenWriteError) -> TantivyError { match error { - OpenWriteError::FileAlreadyExists(filepath) => ErrorKind::FileAlreadyExists(filepath), - OpenWriteError::IOError(io_error) => ErrorKind::IOError(io_error), + OpenWriteError::FileAlreadyExists(filepath) => { + TantivyError::FileAlreadyExists(filepath) + } + OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error), }.into() } } -impl From for Error { - fn from(error: OpenDirectoryError) -> Error { +impl From for TantivyError { + fn from(error: OpenDirectoryError) -> TantivyError { match error { OpenDirectoryError::DoesNotExist(directory_path) => { - ErrorKind::PathDoesNotExist(directory_path).into() + TantivyError::PathDoesNotExist(directory_path).into() } - OpenDirectoryError::NotADirectory(directory_path) => ErrorKind::InvalidArgument( + OpenDirectoryError::NotADirectory(directory_path) => TantivyError::InvalidArgument( format!("{:?} is not a directory", directory_path), ).into(), } } } -impl From for Error { - fn from(error: serde_json::Error) -> Error { +impl From for TantivyError { + fn from(error: serde_json::Error) -> TantivyError { let io_err = io::Error::from(error); - ErrorKind::IOError(io_err.into()).into() + TantivyError::IOError(io_err.into()).into() } } diff --git a/src/fastfield/error.rs b/src/fastfield/error.rs index a05ef2284..df6c2febe 100644 --- a/src/fastfield/error.rs +++ b/src/fastfield/error.rs @@ -4,7 +4,8 @@ use std::result; /// `FastFieldNotAvailableError` is returned when the /// user requested for a fast field reader, and the field was not /// defined in the schema as a fast field. -#[derive(Debug)] +#[derive(Debug, Fail)] +#[fail(display = "field not available: '{:?}'", field_name)] pub struct FastFieldNotAvailableError { field_name: String, } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 59cfb6661..982140fbc 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -2,15 +2,15 @@ use super::operation::AddOperation; use super::segment_updater::SegmentUpdater; use super::PreparedCommit; use bit_set::BitSet; -use crossbeam_channel as channel; use core::Index; use core::Segment; use core::SegmentComponent; use core::SegmentId; use core::SegmentMeta; use core::SegmentReader; +use crossbeam_channel as channel; use docset::DocSet; -use error::{Error, ErrorKind, Result, ResultExt}; +use error::TantivyError; use fastfield::write_delete_bitset; use futures::sync::oneshot::Receiver; use indexer::delete_queue::{DeleteCursor, DeleteQueue}; @@ -29,6 +29,7 @@ use std::mem; use std::mem::swap; use std::thread; use std::thread::JoinHandle; +use Result; // Size of the margin for the heap. A segment is closed when the remaining memory // in the heap goes below MARGIN_IN_BYTES. @@ -122,11 +123,11 @@ pub fn open_index_writer( "The heap size per thread needs to be at least {}.", HEAP_SIZE_MIN ); - bail!(ErrorKind::InvalidArgument(err_msg)); + return Err(TantivyError::InvalidArgument(err_msg)); } if heap_size_in_bytes_per_thread >= HEAP_SIZE_MAX { let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX); - bail!(ErrorKind::InvalidArgument(err_msg)); + return Err(TantivyError::InvalidArgument(err_msg)); } let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) = channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS); @@ -334,13 +335,15 @@ impl IndexWriter { join_handle .join() .expect("Indexing Worker thread panicked") - .chain_err(|| ErrorKind::ErrorInThread("Error in indexing worker thread.".into()))?; + .map_err(|_| { + TantivyError::ErrorInThread("Error in indexing worker thread.".into()) + })?; } drop(self.workers_join_handle); let result = self.segment_updater .wait_merging_thread() - .chain_err(|| ErrorKind::ErrorInThread("Failed to join merging thread.".into())); + .map_err(|_| TantivyError::ErrorInThread("Failed to join merging thread.".into())); if let Err(ref e) = result { error!("Some merging thread failed {:?}", e); @@ -559,7 +562,7 @@ impl IndexWriter { for worker_handle in former_workers_join_handle { let indexing_worker_result = worker_handle .join() - .map_err(|e| Error::from_kind(ErrorKind::ErrorInThread(format!("{:?}", e))))?; + .map_err(|e| TantivyError::ErrorInThread(format!("{:?}", e)))?; indexing_worker_result?; // add a new worker for the next generation. @@ -654,7 +657,7 @@ mod tests { let index = Index::create_in_ram(schema_builder.build()); let _index_writer = index.writer(40_000_000).unwrap(); match index.writer(40_000_000) { - Err(Error(ErrorKind::FileAlreadyExists(_), _)) => {} + Err(TantivyError::FileAlreadyExists(_)) => {} _ => panic!("Expected FileAlreadyExists error"), } } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 1a5d4c026..e79551a4c 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -2,7 +2,6 @@ use core::Segment; use core::SegmentReader; use core::SerializableSegment; use docset::DocSet; -use error::Result; use fastfield::DeleteBitSet; use fastfield::FastFieldReader; use fastfield::FastFieldSerializer; @@ -23,6 +22,7 @@ use store::StoreWriter; use termdict::TermMerger; use termdict::TermOrdinal; use DocId; +use Result; fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 { let mut total_tokens = 0u64; diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index b82af0823..18175c774 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -2,8 +2,7 @@ use super::segment_register::SegmentRegister; use core::SegmentId; use core::SegmentMeta; use core::{LOCKFILE_FILEPATH, META_FILEPATH}; -use error::ErrorKind; -use error::Result as TantivyResult; +use error::TantivyError; use indexer::delete_queue::DeleteCursor; use indexer::SegmentEntry; use std::collections::hash_set::HashSet; @@ -11,6 +10,7 @@ use std::fmt::{self, Debug, Formatter}; use std::path::PathBuf; use std::sync::RwLock; use std::sync::{RwLockReadGuard, RwLockWriteGuard}; +use Result as TantivyResult; #[derive(Default)] struct SegmentRegisters { @@ -141,7 +141,7 @@ impl SegmentManager { let error_msg = "Merge operation sent for segments that are not \ all uncommited or commited." .to_string(); - bail!(ErrorKind::InvalidArgument(error_msg)) + return Err(TantivyError::InvalidArgument(error_msg)); } Ok(segment_entries) } diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index faaef38c0..732270ea1 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -7,7 +7,7 @@ use core::SegmentMeta; use core::SerializableSegment; use core::META_FILEPATH; use directory::{Directory, DirectoryClone}; -use error::{Error, ErrorKind, Result, ResultExt}; +use error::TantivyError; use futures::oneshot; use futures::sync::oneshot::Receiver; use futures::Future; @@ -34,6 +34,7 @@ use std::sync::Arc; use std::sync::RwLock; use std::thread; use std::thread::JoinHandle; +use Result; /// Save the index meta file. /// This operation is atomic : @@ -114,12 +115,9 @@ fn perform_merge( // ... we just serialize this index merger in our new segment // to merge the two segments. - let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment) - .chain_err(|| "Creating index serializer failed")?; + let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment)?; - let num_docs = merger - .write(segment_serializer) - .chain_err(|| "Serializing merged index failed")?; + let num_docs = merger.write(segment_serializer)?; let segment_meta = SegmentMeta::new(merged_segment.id(), num_docs); @@ -186,7 +184,7 @@ impl SegmentUpdater { fn run_async T>( &self, f: F, - ) -> CpuFuture { + ) -> CpuFuture { let me_clone = self.clone(); self.0.pool.spawn_fn(move || Ok(f(me_clone))) } @@ -463,7 +461,7 @@ impl SegmentUpdater { merging_thread_handle .join() .map(|_| ()) - .map_err(|_| ErrorKind::ErrorInThread("Merging thread failed.".into()))?; + .map_err(|_| TantivyError::ErrorInThread("Merging thread failed.".into()))?; } // Our merging thread may have queued their completed self.run_async(move |_| {}).wait()?; diff --git a/src/lib.rs b/src/lib.rs index c01226c55..985d68a84 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,7 +7,7 @@ #![allow(new_without_default)] #![allow(decimal_literal_representation)] #![warn(missing_docs)] -#![recursion_limit="80"] +#![recursion_limit = "80"] //! # `tantivy` //! @@ -124,7 +124,7 @@ extern crate serde_json; extern crate log; #[macro_use] -extern crate error_chain; +extern crate failure; #[cfg(feature = "mmap")] extern crate atomicwrites; @@ -179,13 +179,16 @@ mod functional_test; #[macro_use] mod macros; -pub use error::{Error, ErrorKind, ResultExt}; +pub use error::TantivyError; + +#[deprecated(since="0.7.0", note="please use `tantivy::TantivyError` instead")] +pub use error::TantivyError as Error; extern crate census; extern crate owned_read; /// Tantivy result. -pub type Result = std::result::Result; +pub type Result = std::result::Result; mod common; mod core; @@ -199,8 +202,8 @@ pub mod collector; pub mod directory; pub mod fastfield; pub mod fieldnorm; -pub mod postings; pub(crate) mod positions; +pub mod postings; pub mod query; pub mod schema; pub mod store; @@ -286,13 +289,13 @@ mod tests { use core::SegmentReader; use docset::DocSet; use query::BooleanQuery; + use rand::distributions::Bernoulli; use rand::distributions::Range; use rand::{Rng, SeedableRng, XorShiftRng}; use schema::*; use Index; use IndexWriter; use Postings; - use rand::distributions::Bernoulli; pub fn assert_nearly_equals(expected: f32, val: f32) { assert!( @@ -321,13 +324,7 @@ mod tests { .sample_iter(&Bernoulli::new(ratio)) .take(n as usize) .enumerate() - .filter_map(|(val, keep)| { - if keep { - Some(val as u32) - } else { - None - } - }) + .filter_map(|(val, keep)| if keep { Some(val as u32) } else { None }) .collect() } diff --git a/src/query/phrase_query/mod.rs b/src/query/phrase_query/mod.rs index f4f974388..303301b0d 100644 --- a/src/query/phrase_query/mod.rs +++ b/src/query/phrase_query/mod.rs @@ -12,7 +12,7 @@ mod tests { use super::*; use collector::tests::TestCollector; use core::Index; - use error::ErrorKind; + use error::TantivyError; use schema::{SchemaBuilder, Term, TEXT}; use tests::assert_nearly_equals; @@ -92,10 +92,9 @@ mod tests { Term::from_field_text(text_field, "b"), ]); let mut test_collector = TestCollector::default(); - if let &ErrorKind::SchemaError(ref msg) = searcher + if let TantivyError::SchemaError(ref msg) = searcher .search(&phrase_query, &mut test_collector) .unwrap_err() - .kind() { assert_eq!( "Applied phrase query on field \"text\", which does not have positions indexed", @@ -191,7 +190,7 @@ mod tests { let mut test_collector = TestCollector::default(); let terms: Vec<(usize, Term)> = texts .iter() - .map(|(offset, text)| (*offset, Term::from_field_text(text_field, text)) ) + .map(|(offset, text)| (*offset, Term::from_field_text(text_field, text))) .collect(); let phrase_query = PhraseQuery::new_with_offset(terms); searcher diff --git a/src/query/phrase_query/phrase_query.rs b/src/query/phrase_query/phrase_query.rs index 9cabe8cc4..e501711ed 100644 --- a/src/query/phrase_query/phrase_query.rs +++ b/src/query/phrase_query/phrase_query.rs @@ -1,6 +1,6 @@ use super::PhraseWeight; use core::searcher::Searcher; -use error::ErrorKind; +use error::TantivyError; use query::bm25::BM25Weight; use query::Query; use query::Weight; @@ -38,11 +38,10 @@ impl PhraseQuery { PhraseQuery::new_with_offset(terms_with_offset) } - /// Creates a new `PhraseQuery` given a list of terms and there offsets. /// /// Can be used to provide custom offset for each term. - pub fn new_with_offset(mut terms: Vec<(usize, Term)>) ->PhraseQuery { + pub fn new_with_offset(mut terms: Vec<(usize, Term)>) -> PhraseQuery { assert!( terms.len() > 1, "A phrase query is required to have strictly more than one term." @@ -66,9 +65,11 @@ impl PhraseQuery { /// `Term`s in the phrase without the associated offsets. pub fn phrase_terms(&self) -> Vec { - self.phrase_terms.iter().map(|(_, term)| term.clone()).collect::>() - } - + self.phrase_terms + .iter() + .map(|(_, term)| term.clone()) + .collect::>() + } } impl Query for PhraseQuery { @@ -85,15 +86,19 @@ impl Query for PhraseQuery { .unwrap_or(false); if !has_positions { let field_name = field_entry.name(); - bail!(ErrorKind::SchemaError(format!( + return Err(TantivyError::SchemaError(format!( "Applied phrase query on field {:?}, which does not have positions indexed", field_name - ))) + ))); } if scoring_enabled { let terms = self.phrase_terms(); let bm25_weight = BM25Weight::for_terms(searcher, &terms); - Ok(Box::new(PhraseWeight::new(self.phrase_terms.clone(), bm25_weight, true))) + Ok(Box::new(PhraseWeight::new( + self.phrase_terms.clone(), + bm25_weight, + true, + ))) } else { Ok(Box::new(PhraseWeight::new( self.phrase_terms.clone(), diff --git a/src/query/range_query.rs b/src/query/range_query.rs index 17d09657f..23efe1995 100644 --- a/src/query/range_query.rs +++ b/src/query/range_query.rs @@ -1,7 +1,7 @@ use common::BitSet; use core::Searcher; use core::SegmentReader; -use error::ErrorKind; +use error::TantivyError; use query::BitSetDocSet; use query::ConstScorer; use query::{Query, Scorer, Weight}; @@ -239,7 +239,7 @@ impl Query for RangeQuery { "Create a range query of the type {:?}, when the field given was of type {:?}", self.value_type, value_type ); - bail!(ErrorKind::SchemaError(err_msg)) + return Err(TantivyError::SchemaError(err_msg)); } Ok(Box::new(RangeWeight { field: self.field, diff --git a/src/query/regex_query.rs b/src/query/regex_query.rs index 8b930212a..9b02fc7cf 100644 --- a/src/query/regex_query.rs +++ b/src/query/regex_query.rs @@ -1,4 +1,4 @@ -use error::ErrorKind; +use error::TantivyError; use fst_regex::Regex; use query::{AutomatonWeight, Query, Weight}; use schema::Field; @@ -80,7 +80,7 @@ impl RegexQuery { fn specialized_weight(&self) -> Result> { let automaton = Regex::new(&self.regex_pattern) - .map_err(|_| ErrorKind::InvalidArgument(self.regex_pattern.clone()))?; + .map_err(|_| TantivyError::InvalidArgument(self.regex_pattern.clone()))?; Ok(AutomatonWeight::new(self.field.clone(), automaton)) } From c0641c2b47bf25a6f4714f93037f1b369ce011fc Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 21 Aug 2018 08:26:46 +0900 Subject: [PATCH 05/62] Remove generate html script. It moved to tantivy-search.github.io --- examples/generate_html.sh | 6 ------ 1 file changed, 6 deletions(-) delete mode 100755 examples/generate_html.sh diff --git a/examples/generate_html.sh b/examples/generate_html.sh deleted file mode 100755 index ec07322e2..000000000 --- a/examples/generate_html.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash - -for example in $(ls *.rs) -do - docco $example -o html -done From 3a8e524f77e96c281ed884aa64092969fdec9fba Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 21 Aug 2018 09:36:13 +0900 Subject: [PATCH 06/62] Added example to show how to access the inverted list directly --- examples/iterating_docs_and_positions.rs | 139 +++++++++++++++++++++++ src/core/inverted_index_reader.rs | 22 +++- src/macros.rs | 4 - 3 files changed, 157 insertions(+), 8 deletions(-) create mode 100644 examples/iterating_docs_and_positions.rs diff --git a/examples/iterating_docs_and_positions.rs b/examples/iterating_docs_and_positions.rs new file mode 100644 index 000000000..9d3937617 --- /dev/null +++ b/examples/iterating_docs_and_positions.rs @@ -0,0 +1,139 @@ +// # Iterating docs and positioms. +// +// At its core of tantivy, relies on a data structure +// called an inverted index. +// +// This example shows how to manually iterate through +// the list of documents containing a term, getting +// its term frequency, and accessing its positions. + + +// --- +// Importing tantivy... +#[macro_use] +extern crate tantivy; +use tantivy::schema::*; +use tantivy::Index; +use tantivy::{DocSet, DocId, Postings}; + +fn main() -> tantivy::Result<()> { + + + // We first create a schema for the sake of the + // example. Check the `basic_search` example for more information. + let mut schema_builder = SchemaBuilder::default(); + + // For this example, we need to make sure to index positions for our title + // field. `TEXT` precisely does this. + let title = schema_builder.add_text_field("title", TEXT | STORED); + let schema = schema_builder.build(); + + let index = Index::create_in_ram(schema.clone()); + + let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?; + index_writer.add_document(doc!(title => "The Old Man and the Sea")); + index_writer.add_document(doc!(title => "Of Mice and Men")); + index_writer.add_document(doc!(title => "The modern Promotheus")); + index_writer.commit()?; + + index.load_searchers()?; + + let searcher = index.searcher(); + + // A tantivy index is actually a collection of segments. + // Similarly, a searcher just wraps a list `segment_reader`. + // + // (Because we indexed a very small number of documents over one thread + // there is actually only one segment here, but let's iterate through the list + // anyway) + for segment_reader in searcher.segment_readers() { + + // A segment contains different data structure. + // Inverted index stands for the combination of + // - the term dictionary + // - the inverted lists associated to each terms and their positions + let inverted_index = segment_reader.inverted_index(title); + + // A `Term` is a text token associated with a field. + // Let's go through all docs containing the term `title:the` and access their position + let term_the = Term::from_field_text(title, "the"); + + + // This segment posting object is like a cursor over the documents matching the term. + // The `IndexRecordOption` arguments tells tantivy we will be interested in both term frequencies + // and positions. + // + // If you don't need all this information, you may get better performance by decompressing less + // information. + if let Some(mut segment_postings) = inverted_index.read_postings(&term_the, IndexRecordOption::WithFreqsAndPositions) { + + // this buffer will be used to request for positions + let mut positions: Vec = Vec::with_capacity(100); + while segment_postings.advance() { + + // the number of time the term appears in the document. + let doc_id: DocId = segment_postings.doc(); //< do not try to access this before calling advance once. + + // This MAY contains deleted documents as well. + if segment_reader.is_deleted(doc_id) { + continue; + } + + // the number of time the term appears in the document. + let term_freq: u32 = segment_postings.term_freq(); + // accessing positions is slightly expensive and lazy, do not request + // for them if you don't need them for some documents. + segment_postings.positions(&mut positions); + + // By definition we should have `term_freq` positions. + assert_eq!(positions.len(), term_freq as usize); + + // This prints: + // ``` + // Doc 0: TermFreq 2: [0, 4] + // Doc 2: TermFreq 1: [0] + // ``` + println!("Doc {}: TermFreq {}: {:?}", doc_id, term_freq, positions); + } + } + } + + + // A `Term` is a text token associated with a field. + // Let's go through all docs containing the term `title:the` and access their position + let term_the = Term::from_field_text(title, "the"); + + // Some other powerful operations (especially `.skip_to`) may be useful to consume these + // posting lists rapidly. + // You can check for them in the [`DocSet`](https://docs.rs/tantivy/~0/tantivy/trait.DocSet.html) trait + // and the [`Postings`](https://docs.rs/tantivy/~0/tantivy/trait.Postings.html) trait + + // Also, for some VERY specific high performance use case like an OLAP analysis of logs, + // you can get better performance by accessing directly the blocks of doc ids. + for segment_reader in searcher.segment_readers() { + + // A segment contains different data structure. + // Inverted index stands for the combination of + // - the term dictionary + // - the inverted lists associated to each terms and their positions + let inverted_index = segment_reader.inverted_index(title); + + // This segment posting object is like a cursor over the documents matching the term. + // The `IndexRecordOption` arguments tells tantivy we will be interested in both term frequencies + // and positions. + // + // If you don't need all this information, you may get better performance by decompressing less + // information. + if let Some(mut block_segment_postings) = inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic) { + while block_segment_postings.advance() { + // Once again these docs MAY contains deleted documents as well. + let docs = block_segment_postings.docs(); + // Prints `Docs [0, 2].` + println!("Docs {:?}", docs); + } + } + } + + Ok(()) +} + diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs index 57716748a..b919e09b0 100644 --- a/src/core/inverted_index_reader.rs +++ b/src/core/inverted_index_reader.rs @@ -100,6 +100,20 @@ impl InvertedIndexReader { block_postings.reset(term_info.doc_freq, postings_reader); } + + /// Returns a block postings given a `Term`. + /// This method is for an advanced usage only. + /// + /// Most user should prefer using `read_postings` instead. + pub fn read_block_postings( + &self, + term: &Term, + option: IndexRecordOption, + ) -> Option { + self.get_term_info(term) + .map(move|term_info| self.read_block_postings_from_terminfo(&term_info, option)) + } + /// Returns a block postings given a `term_info`. /// This method is for an advanced usage only. /// @@ -159,8 +173,8 @@ impl InvertedIndexReader { /// `TextIndexingOptions` that does not index position will return a `SegmentPostings` /// with `DocId`s and frequencies. pub fn read_postings(&self, term: &Term, option: IndexRecordOption) -> Option { - let term_info = get!(self.get_term_info(term)); - Some(self.read_postings_from_terminfo(&term_info, option)) + self.get_term_info(term) + .map(move |term_info| self.read_postings_from_terminfo(&term_info, option)) } pub(crate) fn read_postings_no_deletes( @@ -168,8 +182,8 @@ impl InvertedIndexReader { term: &Term, option: IndexRecordOption, ) -> Option { - let term_info = get!(self.get_term_info(term)); - Some(self.read_postings_from_terminfo(&term_info, option)) + self.get_term_info(term) + .map(|term_info| self.read_postings_from_terminfo(&term_info, option)) } /// Returns the number of documents containing the term. diff --git a/src/macros.rs b/src/macros.rs index 5e3d9b023..87d4d926e 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -1,7 +1,3 @@ -macro_rules! get( - ($e:expr) => (match $e { Some(e) => e, None => return None }) -); - /// `doc!` is a shortcut that helps building `Document` /// objects. /// From 3d73c0c2401102ee032cd2ed55ce320ed3a63e09 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 21 Aug 2018 10:59:08 +0900 Subject: [PATCH 07/62] Update issue templates --- .github/ISSUE_TEMPLATE/bug_report.md | 19 +++++++++++++++++++ .github/ISSUE_TEMPLATE/feature_request.md | 14 ++++++++++++++ .github/ISSUE_TEMPLATE/question.md | 7 +++++++ 3 files changed, 40 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md create mode 100644 .github/ISSUE_TEMPLATE/question.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 000000000..b1a5aece3 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,19 @@ +--- +name: Bug report +about: Create a report to help us improve + +--- + +**Describe the bug** +- What did you do? +- What happened? +- What was expected? + +**Which version of tantivy are you using?** +If "master", ideally give the specific sha1 revision. + +**To Reproduce** + +If your bug is deterministic, can you give a minimal reproducing code? +Some bugs are not deterministic. Can you describe with precision in which context it happened? +If this is possible, can you share your code? diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 000000000..3affc3c24 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,14 @@ +--- +name: Feature request +about: Suggest an idea for this project + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**[Optional] describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md new file mode 100644 index 000000000..e00e9a1b0 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/question.md @@ -0,0 +1,7 @@ +--- +name: Question +about: Ask any question about tantivy's usage... + +--- + +Try to be specific about your use case... From af593b1116be8946c1dbce434a7118d70c962cce Mon Sep 17 00:00:00 2001 From: Dru Sellers Date: Tue, 21 Aug 2018 20:49:39 -0500 Subject: [PATCH 08/62] Add default EN stopwords to the default analyzer (#381) * Add a default list of en stopwords * Add the default en stopword filter to the standard tokenizers * code review feedback --- src/tokenizer/stop_word_filter.rs | 16 ++++++++++++++++ src/tokenizer/tokenizer_manager.rs | 1 + 2 files changed, 17 insertions(+) diff --git a/src/tokenizer/stop_word_filter.rs b/src/tokenizer/stop_word_filter.rs index f94ec632f..45691d470 100644 --- a/src/tokenizer/stop_word_filter.rs +++ b/src/tokenizer/stop_word_filter.rs @@ -39,6 +39,16 @@ impl StopWordFilter { StopWordFilter { words: set } } + + fn english() -> StopWordFilter { + let words: [&'static str; 33] = [ + "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", + "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", + "there", "these", "they", "this", "to", "was", "will", "with", + ]; + + StopWordFilter::remove(words.iter().map(|s| s.to_string()).collect()) + } } pub struct StopWordFilterStream @@ -98,3 +108,9 @@ where false } } + +impl Default for StopWordFilter { + fn default() -> StopWordFilter { + StopWordFilter::english() + } +} diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs index cbb46af3b..410e7f30b 100644 --- a/src/tokenizer/tokenizer_manager.rs +++ b/src/tokenizer/tokenizer_manager.rs @@ -8,6 +8,7 @@ use tokenizer::RawTokenizer; use tokenizer::RemoveLongFilter; use tokenizer::SimpleTokenizer; use tokenizer::Stemmer; +use tokenizer::StopWordFilter; use tokenizer::Tokenizer; /// The tokenizer manager serves as a store for From 537fc2723159d0ee108478248548c65e8ce1a3fa Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 23 Aug 2018 08:55:13 +0900 Subject: [PATCH 09/62] Added bench line in features --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b3eede677..b8d95b6ee 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ Tantivy is, in fact, strongly inspired by Lucene's design. # Features - Full-text search +- Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/)) :sparkles: - Tiny startup time (<10ms), perfect for command line tools - BM25 scoring (the same as lucene) - Basic query language (`+michael +jackson`) From d71fa43ca3788e88a7e356d921abec776a14c588 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 23 Aug 2018 08:59:11 +0900 Subject: [PATCH 10/62] Moving emoticon on the right side of the parenthesis --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b8d95b6ee..57b637020 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ Tantivy is, in fact, strongly inspired by Lucene's design. # Features - Full-text search -- Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/)) :sparkles: +- Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:) - Tiny startup time (<10ms), perfect for command line tools - BM25 scoring (the same as lucene) - Basic query language (`+michael +jackson`) From 948758ad78df961495672d5d6360d144171461b0 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 27 Aug 2018 09:49:49 +0900 Subject: [PATCH 11/62] First commit for the documentation --- doc/.gitignore | 1 + doc/book.toml | 5 +++++ doc/src/SUMMARY.md | 14 ++++++++++++++ doc/src/avant-propos.md | 31 +++++++++++++++++++++++++++++++ doc/src/basis.md | 1 + doc/src/examples.md | 1 + doc/src/facetting.md | 5 +++++ doc/src/faq.md | 0 doc/src/innerworkings.md | 1 + doc/src/inverted_index.md | 1 + doc/src/schema.md | 1 + 11 files changed, 61 insertions(+) create mode 100644 doc/.gitignore create mode 100644 doc/book.toml create mode 100644 doc/src/SUMMARY.md create mode 100644 doc/src/avant-propos.md create mode 100644 doc/src/basis.md create mode 100644 doc/src/examples.md create mode 100644 doc/src/facetting.md create mode 100644 doc/src/faq.md create mode 100644 doc/src/innerworkings.md create mode 100644 doc/src/inverted_index.md create mode 100644 doc/src/schema.md diff --git a/doc/.gitignore b/doc/.gitignore new file mode 100644 index 000000000..7585238ef --- /dev/null +++ b/doc/.gitignore @@ -0,0 +1 @@ +book diff --git a/doc/book.toml b/doc/book.toml new file mode 100644 index 000000000..a8c8ec91a --- /dev/null +++ b/doc/book.toml @@ -0,0 +1,5 @@ +[book] +authors = ["Paul Masurel"] +multilingual = false +src = "src" +title = "Tantivy, the user guide" diff --git a/doc/src/SUMMARY.md b/doc/src/SUMMARY.md new file mode 100644 index 000000000..a845e4f7c --- /dev/null +++ b/doc/src/SUMMARY.md @@ -0,0 +1,14 @@ +# Summary + + + +[Avant Propos](./avant-propos.md) + +- [Fundamental concepts](./basis.md) +- [Defining your schema](./schema.md) +- [Facetting](./facetting.md) +- [Innerworkings](./innerworkings.md) + - [Inverted index](./inverted_index.md) + +[Frequently Asked Questions](./faq.md) +[Examples](./examples.md) diff --git a/doc/src/avant-propos.md b/doc/src/avant-propos.md new file mode 100644 index 000000000..aa50cd02b --- /dev/null +++ b/doc/src/avant-propos.md @@ -0,0 +1,31 @@ +# Foreword, what is the scope of tantivy? + +> Tantivy is a **search** engine **library** for Rust. + +If you are familiar with Lucene, tantivy is heavily inspired by Lucene's design and +they both have the same scope and targetted users. + +If you are not familiar with Lucene, let's break down our little tagline. + +- **Search** here means full-text search : fundamentally, tantivy is here to help you +identify efficiently what are the documents matching a given query in your corpus. +But modern search UI are so much more : text processing, facetting, autocomplete, fuzzy search, good +relevancy, collapsing, highlighting, spatial search. + + While some of these features are not available in tantivy yet, all of these are relevant + feature requests. Tantivy's objective is to offer a solid toolbox to create the best search + experience. But keep in mind this is just a toolbox. + Which bring us to the second keyword... + +- **Library** means that you will have to write code. tantivy is not an *all-in-one* server solution. + + Sometimes a functionality will not be available in tantivy because it is too specific to your use case. By design, tantivy should make it possible to extend + the available set of features using the existing rock-solid datastructures. + + Most frequently this will mean writing your own `Collector`, your own `Scorer` or your own + `Tokenizer/TokenFilter`... But some of your requirement may also be related to + architecture or operations. For instance, you may want to build a large corpus on Hadoop, + fine-tune the merge policy to keep your index sharded in a time-wise fashion, or you may want + to convert and existing index from a different format. + + Tantivy exposes its API to do all of these things. \ No newline at end of file diff --git a/doc/src/basis.md b/doc/src/basis.md new file mode 100644 index 000000000..dbb278946 --- /dev/null +++ b/doc/src/basis.md @@ -0,0 +1 @@ +# Basic concepts diff --git a/doc/src/examples.md b/doc/src/examples.md new file mode 100644 index 000000000..df635b4e6 --- /dev/null +++ b/doc/src/examples.md @@ -0,0 +1 @@ +# Examples diff --git a/doc/src/facetting.md b/doc/src/facetting.md new file mode 100644 index 000000000..a1d7dc061 --- /dev/null +++ b/doc/src/facetting.md @@ -0,0 +1,5 @@ +# Facetting + +wewew + +## weeewe diff --git a/doc/src/faq.md b/doc/src/faq.md new file mode 100644 index 000000000..e69de29bb diff --git a/doc/src/innerworkings.md b/doc/src/innerworkings.md new file mode 100644 index 000000000..f1de34898 --- /dev/null +++ b/doc/src/innerworkings.md @@ -0,0 +1 @@ +# Innerworkings diff --git a/doc/src/inverted_index.md b/doc/src/inverted_index.md new file mode 100644 index 000000000..f07f47e52 --- /dev/null +++ b/doc/src/inverted_index.md @@ -0,0 +1 @@ +# Inverted index diff --git a/doc/src/schema.md b/doc/src/schema.md new file mode 100644 index 000000000..eb661bd69 --- /dev/null +++ b/doc/src/schema.md @@ -0,0 +1 @@ +# Defining your schema From 4b7ff78c5aacc10a4e24c055b798b1c488bca29a Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 28 Aug 2018 08:09:27 +0900 Subject: [PATCH 12/62] Added fundamentalss --- doc/src/SUMMARY.md | 2 +- doc/src/basis.md | 49 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/doc/src/SUMMARY.md b/doc/src/SUMMARY.md index a845e4f7c..76dd29748 100644 --- a/doc/src/SUMMARY.md +++ b/doc/src/SUMMARY.md @@ -4,7 +4,7 @@ [Avant Propos](./avant-propos.md) -- [Fundamental concepts](./basis.md) +- [Segments](./basis.md) - [Defining your schema](./schema.md) - [Facetting](./facetting.md) - [Innerworkings](./innerworkings.md) diff --git a/doc/src/basis.md b/doc/src/basis.md index dbb278946..e52615f6d 100644 --- a/doc/src/basis.md +++ b/doc/src/basis.md @@ -1 +1,48 @@ -# Basic concepts +# Anatomy of an index + +## Straight from disk + +By default, tantivy accesses its data using its `MMapDirectory`. +While this design has some downsides, this greatly simplifies the source code of tantivy, +and entirely delegates the caching to the OS. + +`tantivy` works entirely (or almost) by directly reading the datastructures as they are layed on disk. +As a result, the act of opening an indexing does not involve loading different datastructures +from the disk into random access memory : starting a process, opening an index, and performing a query +can typically be done in a matter of milliseconds. + +This is an interesting property for a command line search engine, or for some multi-tenant log search engine. +Spawning a new process for each new query can be a perfectly sensible solution in some use case. + +In later chapters, we will discuss tantivy's inverted index data layout. +One key take away is that to achieve great performance, search indexes are extremely compact. +Of course this is crucial to reduce IO, and ensure that as much of our index can sit in RAM. + +Also, whenever possible the data is accessed sequentially. Of course, this is an amazing property when tantivy needs to access +the data from your spinning hard disk, but this is also a great property when working with `SSD` or `RAM`, +as it makes our read patterns very predictable for the CPU. + + +## Segments, and the log method + +That kind compact layout comes at one cost: it prevents our datastructures from being dynamic. +In fact, a trait called `Directory` is in charge of abstracting all of tantivy's data access +and its API does not even allow editing these file once they are written. + +To allow the addition / deletion of documents, and create the illusion that +your index is dynamic (i.e.: adding and deleting documents), tantivy uses a common database trick sometimes +referred to as the *log method*. + +Let's forget about deletes for a moment. As you add documents, these documents are processed and stored in +a dedicated datastructure, in a `RAM` buffer. This datastructure is designed to be dynamic but +cannot be accessed for search. As you add documents, this buffer will reach its capacity and tantivy will +transparently stop adding document to it and start converting this datastructure to its final +read-only format on disk. Once written, an brand empty buffer is available to resume adding documents. + +The resulting chunk of index obtained after this serialization is called a `Segment`. + +> A segment is a self-contained atomic piece of index. It is identified with a UUID, and all of its files +are identified using the naming scheme : `.*`. + + +> A tantivy `Index` is a collection of `Segments`. \ No newline at end of file From ede97eded65f8df7a0f1b42c52d8632a061310e0 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 28 Aug 2018 09:54:04 +0900 Subject: [PATCH 13/62] Removed use --- src/tokenizer/tokenizer_manager.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs index 410e7f30b..cbb46af3b 100644 --- a/src/tokenizer/tokenizer_manager.rs +++ b/src/tokenizer/tokenizer_manager.rs @@ -8,7 +8,6 @@ use tokenizer::RawTokenizer; use tokenizer::RemoveLongFilter; use tokenizer::SimpleTokenizer; use tokenizer::Stemmer; -use tokenizer::StopWordFilter; use tokenizer::Tokenizer; /// The tokenizer manager serves as a store for From 2649c8a7158e6b8c504aa6b43e5af98da8d2c420 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 28 Aug 2018 11:03:54 +0900 Subject: [PATCH 14/62] Issue/246 (#393) * Moving Range and All to Leaves * Parsing OR/AND * Simplify user input ast * AND and OR supported. Returning an error when mixing syntax Closes #246 * Added support for NOT * Updated changelog --- CHANGELOG.md | 1 + README.md | 4 +- src/query/occur.rs | 35 +++++ src/query/query_parser/query_grammar.rs | 157 ++++++++++++++++++++--- src/query/query_parser/query_parser.rs | 118 ++++++++--------- src/query/query_parser/user_input_ast.rs | 152 ++++++++++++++++++---- 6 files changed, 355 insertions(+), 112 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ef956ac52..d2256923a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ Tantivy 0.7 - Skip data for doc ids and positions (@fulmicoton), greatly improving performance - Tantivy error now rely on the failure crate (@drusellers) +- Added support for `AND`, `OR`, `NOT` syntax in addition to the `+`,`-` syntax Tantivy 0.6.1 diff --git a/README.md b/README.md index 57b637020..62638292f 100644 --- a/README.md +++ b/README.md @@ -32,8 +32,8 @@ Tantivy is, in fact, strongly inspired by Lucene's design. - Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:) - Tiny startup time (<10ms), perfect for command line tools - BM25 scoring (the same as lucene) -- Basic query language (`+michael +jackson`) -- Phrase queries search (\"michael jackson\"`) +- Natural query language `(michael AND jackson) OR "king of pop"` +- Phrase queries search (`"michael jackson"`) - Incremental indexing - Multithreaded indexing (indexing English Wikipedia takes < 3 minutes on my desktop) - Mmap directory diff --git a/src/query/occur.rs b/src/query/occur.rs index 9bcf02bc2..1a9396de0 100644 --- a/src/query/occur.rs +++ b/src/query/occur.rs @@ -12,3 +12,38 @@ pub enum Occur { /// search. MustNot, } + +impl Occur { + /// Returns the one-char prefix symbol for this `Occur`. + /// - `Should` => '?', + /// - `Must` => '+' + /// - `Not` => '-' + pub fn to_char(&self) -> char { + match *self { + Occur::Should => '?', + Occur::Must => '+', + Occur::MustNot => '-', + } + } +} + +/// Compose two occur values. +pub fn compose_occur(left: Occur, right: Occur) -> Occur { + match left { + Occur::Should => right, + Occur::Must => { + if right == Occur::MustNot { + Occur::MustNot + } else { + Occur::Must + } + } + Occur::MustNot => { + if right == Occur::MustNot { + Occur::Must + } else { + Occur::MustNot + } + } + } +} \ No newline at end of file diff --git a/src/query/query_parser/query_grammar.rs b/src/query/query_parser/query_grammar.rs index 352666e8a..557e38e24 100644 --- a/src/query/query_parser/query_grammar.rs +++ b/src/query/query_parser/query_grammar.rs @@ -1,6 +1,9 @@ use super::user_input_ast::*; use combine::char::*; use combine::*; +use combine::stream::StreamErrorFor; +use combine::error::StreamError; +use query::occur::Occur; use query::query_parser::user_input_ast::UserInputBound; parser! { @@ -17,18 +20,25 @@ parser! { fn word[I]()(I) -> String where [I: Stream] { many1(satisfy(|c: char| c.is_alphanumeric())) + .and_then(|s: String| { + match s.as_str() { + "OR" => Err(StreamErrorFor::::unexpected_static_message("OR")), + "AND" => Err(StreamErrorFor::::unexpected_static_message("AND")), + "NOT" => Err(StreamErrorFor::::unexpected_static_message("NOT")), + _ => Ok(s) + } + }) } } parser! { - fn literal[I]()(I) -> UserInputAST + fn literal[I]()(I) -> UserInputLeaf where [I: Stream] { let term_val = || { let phrase = (char('"'), many1(satisfy(|c| c != '"')), char('"')).map(|(_, s, _)| s); phrase.or(word()) }; - let term_val_with_field = negative_number().or(term_val()); let term_query = (field(), char(':'), term_val_with_field).map(|(field_name, _, phrase)| UserInputLiteral { @@ -41,7 +51,7 @@ parser! { }); try(term_query) .or(term_default_field) - .map(UserInputAST::from) + .map(UserInputLeaf::from) } } @@ -55,7 +65,14 @@ parser! { } parser! { - fn range[I]()(I) -> UserInputAST + fn spaces1[I]()(I) -> () + where [I: Stream] { + skip_many1(space()) + } +} + +parser! { + fn range[I]()(I) -> UserInputLeaf where [I: Stream] { let term_val = || { word().or(negative_number()).or(char('*').map(|_| "*".to_string())) @@ -77,7 +94,7 @@ parser! { string("TO"), spaces(), upper_bound, - ).map(|(field, lower, _, _, _, upper)| UserInputAST::Range { + ).map(|(field, lower, _, _, _, upper)| UserInputLeaf::Range { field, lower, upper @@ -88,13 +105,53 @@ parser! { parser! { fn leaf[I]()(I) -> UserInputAST where [I: Stream] { - (char('-'), leaf()) - .map(|(_, expr)| UserInputAST::Not(Box::new(expr))) - .or((char('+'), leaf()).map(|(_, expr)| UserInputAST::Must(Box::new(expr)))) + (char('-'), leaf()).map(|(_, expr)| expr.unary(Occur::MustNot) ) + .or((char('+'), leaf()).map(|(_, expr)| expr.unary(Occur::Must) )) .or((char('('), parse_to_ast(), char(')')).map(|(_, expr, _)| expr)) - .or(char('*').map(|_| UserInputAST::All)) - .or(try(range())) - .or(literal()) + .or(char('*').map(|_| UserInputAST::from(UserInputLeaf::All) )) + .or(try( + (string("NOT"), spaces1(), leaf()).map(|(_, _, expr)| expr.unary(Occur::MustNot))) + ) + .or( + try( + range() + .map(|leaf| UserInputAST::from(leaf)) + ) + ) + .or(literal().map(|leaf| UserInputAST::Leaf(Box::new(leaf)))) + } +} + +enum BinaryOperand { + Or, And +} + +parser! { + fn binary_operand[I]()(I) -> BinaryOperand + where [I: Stream] { + (spaces1(), + ( + string("AND").map(|_| BinaryOperand::And) + .or(string("OR").map(|_| BinaryOperand::Or)) + ), + spaces1()).map(|(_, op,_)| op) + } +} + + +enum Element { + SingleEl(UserInputAST), + NormalDisjunctive(Vec>) +} + +impl Element { + pub fn into_dnf(self) -> Vec> { + match self { + Element::NormalDisjunctive(conjunctions) => + conjunctions, + Element::SingleEl(el) => + vec!(vec!(el)), + } } } @@ -102,14 +159,56 @@ parser! { pub fn parse_to_ast[I]()(I) -> UserInputAST where [I: Stream] { - sep_by(leaf(), spaces()) - .map(|subqueries: Vec| { - if subqueries.len() == 1 { - subqueries.into_iter().next().unwrap() - } else { - UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect()) - } - }) + ( + try( + chainl1( + leaf().map(Element::SingleEl), + binary_operand().map(|op: BinaryOperand| + move |left: Element, right: Element| { + let mut dnf = left.into_dnf(); + if let Element::SingleEl(el) = right { + match op { + BinaryOperand::And => { + if let Some(last) = dnf.last_mut() { + last.push(el); + } + } + BinaryOperand::Or => { + dnf.push(vec!(el)); + } + } + } else { + unreachable!("Please report.") + } + Element::NormalDisjunctive(dnf) + } + ) + ) + .map(|el| el.into_dnf()) + .map(|fnd| { + if fnd.len() == 1 { + UserInputAST::and(fnd.into_iter().next().unwrap()) //< safe + } else { + let conjunctions = fnd + .into_iter() + .map(|conjunction| UserInputAST::and(conjunction)) + .collect(); + UserInputAST::or(conjunctions) + } + }) + ) + .or( + sep_by(leaf(), spaces()) + .map(|subqueries: Vec| { + if subqueries.len() == 1 { + subqueries.into_iter().next().unwrap() + } else { + UserInputAST::Clause(subqueries.into_iter().collect()) + } + }) + ) + ) + } } @@ -128,6 +227,26 @@ mod test { assert!(parse_to_ast().parse(query).is_err()); } + + #[test] + fn test_parse_query_to_ast_not_op() { + assert_eq!(format!("{:?}", parse_to_ast().parse("NOT")), "Err(UnexpectedParse)"); + test_parse_query_to_ast_helper("NOTa", "\"NOTa\""); + test_parse_query_to_ast_helper("NOT a", "-(\"a\")"); + } + + #[test] + fn test_parse_query_to_ast_binary_op() { + test_parse_query_to_ast_helper("a AND b", "(+(\"a\") +(\"b\"))"); + test_parse_query_to_ast_helper("a OR b", "(?(\"a\") ?(\"b\"))"); + test_parse_query_to_ast_helper("a OR b AND c", "(?(\"a\") ?((+(\"b\") +(\"c\"))))"); + test_parse_query_to_ast_helper("a AND b AND c", "(+(\"a\") +(\"b\") +(\"c\"))"); + assert_eq!(format!("{:?}", parse_to_ast().parse("a OR b aaa")), "Err(UnexpectedParse)"); + assert_eq!(format!("{:?}", parse_to_ast().parse("a AND b aaa")), "Err(UnexpectedParse)"); + assert_eq!(format!("{:?}", parse_to_ast().parse("aaa a OR b ")), "Err(UnexpectedParse)"); + assert_eq!(format!("{:?}", parse_to_ast().parse("aaa ccc a OR b ")), "Err(UnexpectedParse)"); + } + #[test] fn test_parse_query_to_ast() { test_parse_query_to_ast_helper("+(a b) +d", "(+((\"a\" \"b\")) +(\"d\"))"); diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index abe6b404f..f3a9f37c0 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -5,6 +5,7 @@ use core::Index; use query::AllQuery; use query::BooleanQuery; use query::Occur; +use query::occur::compose_occur; use query::PhraseQuery; use query::Query; use query::RangeQuery; @@ -79,12 +80,22 @@ impl From for QueryParserError { /// /// Switching to a default of `AND` can be done by calling `.set_conjunction_by_default()`. /// +/// +/// * boolean operators `AND`, `OR`. `AND` takes precedence over `OR`, so that `a AND b OR c` is interpreted +/// as `(a AND b) OR c`. +/// +/// * In addition to the boolean operators, the `-`, `+` can help define. These operators +/// are sufficient to axpress all queries using boolean operators. For instance `x AND y OR z` can +/// be written (`(+x +y) z`). In addition, these operators can help define "required optional" +/// queries. `(+x y)` matches the same document set as simply `x`, but `y` will help refining the score. +/// /// * negative terms: By prepending a term by a `-`, a term can be excluded /// from the search. This is useful for disambiguating a query. /// e.g. `apple -fruit` /// /// * must terms: By prepending a term by a `+`, a term can be made required for the search. /// +/// /// * phrase terms: Quoted terms become phrase searches on fields that have positions indexed. /// e.g., `title:"Barack Obama"` will only find documents that have "barack" immediately followed /// by "obama". @@ -315,56 +326,27 @@ impl QueryParser { let default_occur = self.default_occur(); let mut logical_sub_queries: Vec<(Occur, LogicalAST)> = Vec::new(); for sub_query in sub_queries { - let (occur, sub_ast) = self.compute_logical_ast_with_occur(*sub_query)?; + let (occur, sub_ast) = self.compute_logical_ast_with_occur(sub_query)?; let new_occur = compose_occur(default_occur, occur); logical_sub_queries.push((new_occur, sub_ast)); } Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries))) } - UserInputAST::Not(subquery) => { - let (occur, logical_sub_queries) = self.compute_logical_ast_with_occur(*subquery)?; - Ok((compose_occur(Occur::MustNot, occur), logical_sub_queries)) + UserInputAST::Unary(left_occur, subquery) => { + let (right_occur, logical_sub_queries) = self.compute_logical_ast_with_occur(*subquery)?; + Ok((compose_occur(left_occur, right_occur), logical_sub_queries)) } - UserInputAST::Must(subquery) => { - let (occur, logical_sub_queries) = self.compute_logical_ast_with_occur(*subquery)?; - Ok((compose_occur(Occur::Must, occur), logical_sub_queries)) - } - UserInputAST::Range { - field, - lower, - upper, - } => { - let fields = self.resolved_fields(&field)?; - let mut clauses = fields - .iter() - .map(|&field| { - let field_entry = self.schema.get_field_entry(field); - let value_type = field_entry.field_type().value_type(); - Ok(LogicalAST::Leaf(Box::new(LogicalLiteral::Range { - field, - value_type, - lower: self.resolve_bound(field, &lower)?, - upper: self.resolve_bound(field, &upper)?, - }))) - }) - .collect::, QueryParserError>>()?; - let result_ast = if clauses.len() == 1 { - clauses.pop().unwrap() - } else { - LogicalAST::Clause( - clauses - .into_iter() - .map(|clause| (Occur::Should, clause)) - .collect(), - ) - }; + UserInputAST::Leaf(leaf) => { + let result_ast = self.compute_logical_ast_from_leaf(*leaf)?; Ok((Occur::Should, result_ast)) } - UserInputAST::All => Ok(( - Occur::Should, - LogicalAST::Leaf(Box::new(LogicalLiteral::All)), - )), - UserInputAST::Leaf(literal) => { + } + } + + + fn compute_logical_ast_from_leaf(&self, leaf: UserInputLeaf) -> Result { + match leaf { + UserInputLeaf::Literal(literal) => { let term_phrases: Vec<(Field, String)> = match literal.field_name { Some(ref field_name) => { let field = self.resolve_field_name(field_name)?; @@ -395,30 +377,40 @@ impl QueryParser { } else { LogicalAST::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect()) }; - Ok((Occur::Should, result_ast)) + Ok(result_ast) + } + UserInputLeaf::All => { + Ok(LogicalAST::Leaf(Box::new(LogicalLiteral::All))) + } + UserInputLeaf::Range { field, lower, upper } => { + let fields = self.resolved_fields(&field)?; + let mut clauses = fields + .iter() + .map(|&field| { + let field_entry = self.schema.get_field_entry(field); + let value_type = field_entry.field_type().value_type(); + Ok(LogicalAST::Leaf(Box::new(LogicalLiteral::Range { + field, + value_type, + lower: self.resolve_bound(field, &lower)?, + upper: self.resolve_bound(field, &upper)?, + }))) + }) + .collect::, QueryParserError>>()?; + let result_ast = if clauses.len() == 1 { + clauses.pop().unwrap() + } else { + LogicalAST::Clause( + clauses + .into_iter() + .map(|clause| (Occur::Should, clause)) + .collect(), + ) + }; + Ok(result_ast) } } - } -} -/// Compose two occur values. -fn compose_occur(left: Occur, right: Occur) -> Occur { - match left { - Occur::Should => right, - Occur::Must => { - if right == Occur::MustNot { - Occur::MustNot - } else { - Occur::Must - } - } - Occur::MustNot => { - if right == Occur::MustNot { - Occur::Must - } else { - Occur::MustNot - } - } } } diff --git a/src/query/query_parser/user_input_ast.rs b/src/query/query_parser/user_input_ast.rs index 96606915d..37adb94be 100644 --- a/src/query/query_parser/user_input_ast.rs +++ b/src/query/query_parser/user_input_ast.rs @@ -1,4 +1,41 @@ use std::fmt; +use std::fmt::{Debug, Formatter}; + +use query::Occur; + +pub enum UserInputLeaf { + Literal(UserInputLiteral), + All, + Range { + field: Option, + lower: UserInputBound, + upper: UserInputBound, + }, +} + +impl Debug for UserInputLeaf { + fn fmt(&self, formatter: &mut Formatter) -> Result<(), fmt::Error> { + match self { + UserInputLeaf::Literal(literal) => { + literal.fmt(formatter) + } + UserInputLeaf::Range { + ref field, + ref lower, + ref upper, + } => { + if let &Some(ref field) = field { + write!(formatter, "{}:", field)?; + } + lower.display_lower(formatter)?; + write!(formatter, " TO ")?; + upper.display_upper(formatter)?; + Ok(()) + } + UserInputLeaf::All => write!(formatter, "*"), + } + } +} pub struct UserInputLiteral { pub field_name: Option, @@ -43,28 +80,99 @@ impl UserInputBound { } pub enum UserInputAST { - Clause(Vec>), - Not(Box), - Must(Box), - Range { - field: Option, - lower: UserInputBound, - upper: UserInputBound, - }, - All, - Leaf(Box), + Clause(Vec), + Unary(Occur, Box), +// Not(Box), +// Should(Box), +// Must(Box), + Leaf(Box), } -impl From for UserInputAST { - fn from(literal: UserInputLiteral) -> UserInputAST { - UserInputAST::Leaf(Box::new(literal)) + +impl UserInputAST { + pub fn unary(self, occur: Occur) -> UserInputAST { + UserInputAST::Unary(occur, Box::new(self)) + } + + fn compose(occur: Occur, asts: Vec) -> UserInputAST { + assert!(occur != Occur::MustNot); + assert!(!asts.is_empty()); + if asts.len() == 1 { + asts.into_iter().next().unwrap() //< safe + } else { + UserInputAST::Clause(asts + .into_iter() + .map(|ast: UserInputAST| + ast.unary(occur) + ) + .collect::>() + ) + } + } + + pub fn and(asts: Vec) -> UserInputAST { + UserInputAST::compose(Occur::Must, asts) + } + + pub fn or(asts: Vec) -> UserInputAST { + UserInputAST::compose(Occur::Should, asts) + } + +} + + + +/* +impl UserInputAST { + + fn compose_occur(self, occur: Occur) -> UserInputAST { + match self { + UserInputAST::Not(other) => { + let new_occur = compose_occur(Occur::MustNot, occur); + other.simplify() + } + _ => { + self + } + } + } + + pub fn simplify(self) -> UserInputAST { + match self { + UserInputAST::Clause(els) => { + if els.len() == 1 { + return els.into_iter().next().unwrap(); + } else { + return self; + } + } + UserInputAST::Not(els) => { + if els.len() == 1 { + return els.into_iter().next().unwrap(); + } else { + return self; + } + } + } + } +} +*/ + +impl From for UserInputLeaf { + fn from(literal: UserInputLiteral) -> UserInputLeaf { + UserInputLeaf::Literal(literal) + } +} + +impl From for UserInputAST { + fn from(leaf: UserInputLeaf) -> UserInputAST { + UserInputAST::Leaf(Box::new(leaf)) } } impl fmt::Debug for UserInputAST { fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { match *self { - UserInputAST::Must(ref subquery) => write!(formatter, "+({:?})", subquery), UserInputAST::Clause(ref subqueries) => { if subqueries.is_empty() { write!(formatter, "")?; @@ -78,21 +186,9 @@ impl fmt::Debug for UserInputAST { } Ok(()) } - UserInputAST::Not(ref subquery) => write!(formatter, "-({:?})", subquery), - UserInputAST::Range { - ref field, - ref lower, - ref upper, - } => { - if let &Some(ref field) = field { - write!(formatter, "{}:", field)?; - } - lower.display_lower(formatter)?; - write!(formatter, " TO ")?; - upper.display_upper(formatter)?; - Ok(()) + UserInputAST::Unary(ref occur, ref subquery) => { + write!(formatter, "{}({:?})", occur.to_char(), subquery) } - UserInputAST::All => write!(formatter, "*"), UserInputAST::Leaf(ref subquery) => write!(formatter, "{:?}", subquery), } } From 57e1f8ed281d3d673288d89ec805e76e4a380658 Mon Sep 17 00:00:00 2001 From: CJP10 <14205938+CJP10@users.noreply.github.com> Date: Tue, 28 Aug 2018 10:17:59 -0400 Subject: [PATCH 15/62] Missed a closing bracket (#397) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 62638292f..c1824b575 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ Tantivy is, in fact, strongly inspired by Lucene's design. - LZ4 compressed document store - Range queries - Faceted search -- Configurable indexing (optional term frequency and position indexing +- Configurable indexing (optional term frequency and position indexing) - Cheesy logo with a horse # Non-features From 19756bb7d6ae9414468059943edc6d8c0f45600c Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 15 Aug 2018 10:52:50 +0900 Subject: [PATCH 16/62] Getting started on #368 --- src/lib.rs | 2 ++ src/snippet/mod.rs | 88 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 src/snippet/mod.rs diff --git a/src/lib.rs b/src/lib.rs index 985d68a84..0d64752d9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -209,6 +209,8 @@ pub mod schema; pub mod store; pub mod termdict; +mod snippet; + mod docset; pub use self::docset::{DocSet, SkipResult}; diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs new file mode 100644 index 000000000..4cc1e41b9 --- /dev/null +++ b/src/snippet/mod.rs @@ -0,0 +1,88 @@ +use tokenizer::{TokenStream, Tokenizer}; +use std::collections::BTreeMap; +use Term; +use Document; +use Index; +use schema::FieldValue; +use schema::Value; +use tokenizer::BoxedTokenizer; + +pub struct HighlightSection { + start: usize, + stop: usize, +} + +impl HighlightSection { + fn new(start: usize, stop: usize) -> HighlightSection { + HighlightSection { + start, + stop + } + } +} + +pub struct FragmentCandidate { + score: f32, + start_offset: usize, + stop_offset: usize, + num_chars: usize, + highlighted: Vec, +} + +pub struct Snippet { + fragments: Vec, +} + +impl Snippet { + pub fn to_html() -> String { + unimplemented!(); + } +} + +/// Returns a non-empty list of "good" fragments. +/// +/// If no target term is within the text, then the function +/// should return an empty Vec. +/// +/// If a target term is within the text, then the returned +/// list is required to be non-empty. +/// +/// The returned list is non-empty and contain less +/// than 12 possibly overlapping fragments. +/// +/// All fragments should contain at least one target term +/// and have at most `max_num_chars` characters (not bytes). +/// +/// It is ok to emit non-overlapping fragments, for instance, +/// one short and one long containing the same keyword, in order +/// to leave optimization opportunity to the fragment selector +/// upstream. +/// +/// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\ +/// has to be a valid string. +fn search_fragments<'a>( + tokenizer: &BoxedTokenizer, + text: &'a str, + terms: BTreeMap, + max_num_chars: usize) -> Vec { + unimplemented!(); +} + +fn select_best_fragment_combination(fragments_candidate: Vec<(&str, Vec)>, max_num_chars: usize) -> Snippet { + unimplemented!(); +} + +pub fn generate_snippet<'a>( + doc: &'a [FieldValue], + index: &Index, + terms: Vec, + max_num_chars: usize) -> Snippet { + unimplemented!(); +} + + +#[cfg(test)] +mod tests { + #[test] + fn test_snippet() {} +} \ No newline at end of file From 835cdc2fe8f5ecceaaa1c65619af96fd8114b3c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vignesh=20Sarma=20K=20=28=E0=B4=B5=E0=B4=BF=E0=B4=98?= =?UTF-8?q?=E0=B5=8D=E0=B4=A8=E0=B5=87=E0=B4=B7=E0=B5=8D=20=E0=B4=B6?= =?UTF-8?q?=E0=B5=AA=E0=B4=AE=20=E0=B4=95=E0=B5=86=29?= Date: Mon, 27 Aug 2018 22:14:59 +0530 Subject: [PATCH 17/62] Initial version of snippet refer #368 --- src/snippet/mod.rs | 141 ++++++++++++++++++++++++++++++++++--- src/tokenizer/mod.rs | 2 +- src/tokenizer/tokenizer.rs | 2 +- 3 files changed, 132 insertions(+), 13 deletions(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 4cc1e41b9..4356e0a80 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -1,4 +1,4 @@ -use tokenizer::{TokenStream, Tokenizer}; +use tokenizer::{TokenStream, Tokenizer, Token}; use std::collections::BTreeMap; use Term; use Document; @@ -7,6 +7,7 @@ use schema::FieldValue; use schema::Value; use tokenizer::BoxedTokenizer; +#[derive(Debug)] pub struct HighlightSection { start: usize, stop: usize, @@ -21,6 +22,7 @@ impl HighlightSection { } } +#[derive(Debug)] pub struct FragmentCandidate { score: f32, start_offset: usize, @@ -29,13 +31,53 @@ pub struct FragmentCandidate { highlighted: Vec, } -pub struct Snippet { - fragments: Vec, +impl FragmentCandidate { + + fn new(start_offset: usize, end_offset: usize) -> FragmentCandidate { + FragmentCandidate{score: 0.0, + start_offset: start_offset, + stop_offset: end_offset, + num_chars: 0, + highlighted: vec![]} + } + + /// Updates `score` and `highlighted` fields of the objects. + /// + /// + fn calculate_score(&mut self, token: &Token, terms: &BTreeMap) { + if let Some(score) = terms.get(&token.text.to_lowercase()) { + self.score += score; + self.highlighted.push(HighlightSection{start: token.offset_from, + stop: token.offset_to}); + } + } } +#[derive(Debug)] +pub struct Snippet { + fragments: String, + highlighted: Vec, +} + +const HIGHLIGHTEN_PREFIX:&str = ""; +const HIGHLIGHTEN_POSTFIX:&str = ""; + impl Snippet { - pub fn to_html() -> String { - unimplemented!(); + + /// Returns a hignlightned html from the `Snippet`. + pub fn to_html(&self) -> String { + let mut html = String::new(); + let mut start_from: usize = 0; + + for item in self.highlighted.iter() { + html.push_str(&self.fragments[start_from..item.start]); + html.push_str(HIGHLIGHTEN_PREFIX); + html.push_str(&self.fragments[item.start..item.stop]); + html.push_str(HIGHLIGHTEN_POSTFIX); + start_from = item.stop; + } + html.push_str(&self.fragments[start_from..self.fragments.len()]); + html } } @@ -61,15 +103,61 @@ impl Snippet { /// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\ /// has to be a valid string. fn search_fragments<'a>( - tokenizer: &BoxedTokenizer, + tokenizer: Box, text: &'a str, terms: BTreeMap, max_num_chars: usize) -> Vec { - unimplemented!(); + let mut token_stream = tokenizer.token_stream(text); + let mut fragment = FragmentCandidate::new(0, 0); + let mut fragments:Vec = vec![]; + + loop { + if let Some(next) = token_stream.next() { + if (next.offset_to - fragment.start_offset) > max_num_chars { + let txt = &text[fragment.start_offset..fragment.stop_offset]; + if fragment.score > 0.0 { + fragments.push(fragment) + }; + fragment = FragmentCandidate::new(next.offset_from, next.offset_to); + } else { + fragment.calculate_score(next, &terms); + fragment.stop_offset = next.offset_to; + } + } else { + let txt = &text[fragment.start_offset..fragment.stop_offset]; + if fragment.score > 0.0 { + fragments.push(fragment) + }; + break; + } + } + + fragments } -fn select_best_fragment_combination(fragments_candidate: Vec<(&str, Vec)>, max_num_chars: usize) -> Snippet { - unimplemented!(); +/// Returns a Snippet +/// +/// Takes a vector of `FragmentCandidate`s and the text. +/// Figures out the best fragment from it and creates a snippet. +fn select_best_fragment_combination<'a>(fragments: Vec, + text: &'a str,) -> Snippet { + if let Some(init) = fragments.iter().nth(0) { + let fragment = fragments.iter().skip(1).fold(init, |acc, item| { + if item.score > init.score { item } else { init } + }); + let fragment_text = &text[fragment.start_offset..fragment.stop_offset]; + let highlighted = fragment.highlighted.iter().map(|item| { + HighlightSection{start: item.start-fragment.start_offset, + stop: item.stop-fragment.start_offset} + }).collect(); + Snippet{fragments: fragment_text.to_owned(), + highlighted: highlighted} + } else { + // when there no fragments to chose from, + // for now create a empty snippet + Snippet{fragments: String::new(), + highlighted: vec![]} + } } pub fn generate_snippet<'a>( @@ -83,6 +171,37 @@ pub fn generate_snippet<'a>( #[cfg(test)] mod tests { + use tokenizer::{SimpleTokenizer, box_tokenizer}; + use std::iter::Iterator; + use std::collections::BTreeMap; + use super::{search_fragments, select_best_fragment_combination}; + #[test] - fn test_snippet() {} -} \ No newline at end of file + fn test_snippet() { + let tokenizer = SimpleTokenizer; + + let t = box_tokenizer(tokenizer); + + let text = "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe, concurrent, practical language\", supporting functional and imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?], but its designers intend it to provide better memory safety while still maintaining performance. + +Rust is free and open-source software, released under an MIT License, or Apache License 2.0. Its designers have refined the language through the experiences of writing the Servo web browser layout engine[14] and the Rust compiler. A large proportion of current commits to the project are from community members.[15] + +Rust won first place for \"most loved programming language\" in the Stack Overflow Developer Survey in 2016, 2017, and 2018. +"; + + let mut terms = BTreeMap::new(); + terms.insert(String::from("rust"), 1.0); + terms.insert(String::from("language"), 0.9); + + let fragments = search_fragments(t, &text, terms, 100); + assert_eq!(fragments.len(), 7); + { + let first = fragments.iter().nth(0).unwrap(); + assert_eq!(first.score, 1.9); + assert_eq!(first.stop_offset, 89); + } + let snippet = select_best_fragment_combination(fragments, &text); + assert_eq!(snippet.fragments, "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe".to_owned()); + assert_eq!(snippet.to_html(), "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe".to_owned()) + } +} diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index fd0bfbbde..d4a735bd2 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -153,7 +153,7 @@ pub use self::simple_tokenizer::SimpleTokenizer; pub use self::stemmer::Stemmer; pub use self::stop_word_filter::StopWordFilter; pub(crate) use self::token_stream_chain::TokenStreamChain; -pub use self::tokenizer::BoxedTokenizer; +pub use self::tokenizer::{BoxedTokenizer, box_tokenizer}; pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer}; pub use self::tokenizer_manager::TokenizerManager; diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs index fcdf8f21b..e806b70d8 100644 --- a/src/tokenizer/tokenizer.rs +++ b/src/tokenizer/tokenizer.rs @@ -130,7 +130,7 @@ where } } -pub(crate) fn box_tokenizer(a: A) -> Box +pub fn box_tokenizer(a: A) -> Box where A: 'static + Send + Sync + for<'a> Tokenizer<'a>, { From 46decdb0ea60cc7c2e274b6bfaed180d1ecad0dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vignesh=20Sarma=20K=20=28=E0=B4=B5=E0=B4=BF=E0=B4=98?= =?UTF-8?q?=E0=B5=8D=E0=B4=A8=E0=B5=87=E0=B4=B7=E0=B5=8D=20=E0=B4=B6?= =?UTF-8?q?=E0=B5=AA=E0=B4=AE=20=E0=B4=95=E0=B5=86=29?= Date: Mon, 27 Aug 2018 22:16:47 +0530 Subject: [PATCH 18/62] compare against accumulator rather than init value --- src/snippet/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 4356e0a80..2429ac2e0 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -143,7 +143,7 @@ fn select_best_fragment_combination<'a>(fragments: Vec, text: &'a str,) -> Snippet { if let Some(init) = fragments.iter().nth(0) { let fragment = fragments.iter().skip(1).fold(init, |acc, item| { - if item.score > init.score { item } else { init } + if item.score > acc.score { item } else { acc } }); let fragment_text = &text[fragment.start_offset..fragment.stop_offset]; let highlighted = fragment.highlighted.iter().map(|item| { From b373f0084089cfd75b0967a5ef90d8feb0a6f5ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vignesh=20Sarma=20K=20=28=E0=B4=B5=E0=B4=BF=E0=B4=98?= =?UTF-8?q?=E0=B5=8D=E0=B4=A8=E0=B5=87=E0=B4=B7=E0=B5=8D=20=E0=B4=B6?= =?UTF-8?q?=E0=B5=AA=E0=B4=AE=20=E0=B4=95=E0=B5=86=29?= Date: Tue, 28 Aug 2018 15:06:09 +0530 Subject: [PATCH 19/62] add htmlescape and update to_html fn to use it. tests and imports also updated. --- Cargo.toml | 1 + src/lib.rs | 1 + src/snippet/mod.rs | 9 +++++---- 3 files changed, 7 insertions(+), 4 deletions(-) mode change 100644 => 100755 src/lib.rs diff --git a/Cargo.toml b/Cargo.toml index ab767d3fd..1ec1b65d5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,7 @@ census = "0.1" fnv = "1.0.6" owned-read = "0.4" failure = "0.1" +htmlescape = "0.3.1" [target.'cfg(windows)'.dependencies] winapi = "0.2" diff --git a/src/lib.rs b/src/lib.rs old mode 100644 new mode 100755 index 0d64752d9..4f4d364a0 --- a/src/lib.rs +++ b/src/lib.rs @@ -154,6 +154,7 @@ extern crate stable_deref_trait; extern crate tempdir; extern crate tempfile; extern crate uuid; +extern crate htmlescape; #[cfg(test)] #[macro_use] diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 2429ac2e0..7413b8bb8 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -6,6 +6,7 @@ use Index; use schema::FieldValue; use schema::Value; use tokenizer::BoxedTokenizer; +use htmlescape::encode_minimal; #[derive(Debug)] pub struct HighlightSection { @@ -70,13 +71,13 @@ impl Snippet { let mut start_from: usize = 0; for item in self.highlighted.iter() { - html.push_str(&self.fragments[start_from..item.start]); + html.push_str(&encode_minimal(&self.fragments[start_from..item.start])); html.push_str(HIGHLIGHTEN_PREFIX); - html.push_str(&self.fragments[item.start..item.stop]); + html.push_str(&encode_minimal(&self.fragments[item.start..item.stop])); html.push_str(HIGHLIGHTEN_POSTFIX); start_from = item.stop; } - html.push_str(&self.fragments[start_from..self.fragments.len()]); + html.push_str(&encode_minimal(&self.fragments[start_from..self.fragments.len()])); html } } @@ -202,6 +203,6 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl } let snippet = select_best_fragment_combination(fragments, &text); assert_eq!(snippet.fragments, "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe".to_owned()); - assert_eq!(snippet.to_html(), "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe".to_owned()) + assert_eq!(snippet.to_html(), "Rust is a systems programming language sponsored by Mozilla which describes it as a "safe".to_owned()) } } From 8438eda01a05792a29c22b2f1771285659bd25b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vignesh=20Sarma=20K=20=28=E0=B4=B5=E0=B4=BF=E0=B4=98?= =?UTF-8?q?=E0=B5=8D=E0=B4=A8=E0=B5=87=E0=B4=B7=E0=B5=8D=20=E0=B4=B6?= =?UTF-8?q?=E0=B5=AA=E0=B4=AE=20=E0=B4=95=E0=B5=86=29?= Date: Tue, 28 Aug 2018 15:11:50 +0530 Subject: [PATCH 20/62] use while let instead of loop and if. as per CR comment --- src/snippet/mod.rs | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 7413b8bb8..a8a7bb194 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -112,26 +112,20 @@ fn search_fragments<'a>( let mut fragment = FragmentCandidate::new(0, 0); let mut fragments:Vec = vec![]; - loop { - if let Some(next) = token_stream.next() { - if (next.offset_to - fragment.start_offset) > max_num_chars { - let txt = &text[fragment.start_offset..fragment.stop_offset]; - if fragment.score > 0.0 { - fragments.push(fragment) - }; - fragment = FragmentCandidate::new(next.offset_from, next.offset_to); - } else { - fragment.calculate_score(next, &terms); - fragment.stop_offset = next.offset_to; - } - } else { - let txt = &text[fragment.start_offset..fragment.stop_offset]; + while let Some(next) = token_stream.next() { + if (next.offset_to - fragment.start_offset) > max_num_chars { if fragment.score > 0.0 { fragments.push(fragment) }; - break; + fragment = FragmentCandidate::new(next.offset_from, next.offset_to); + } else { + fragment.calculate_score(next, &terms); + fragment.stop_offset = next.offset_to; } } + if fragment.score > 0.0 { + fragments.push(fragment) + } fragments } From e1bca6db9d80e9609eefdd6b2f66ff729d80bcfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vignesh=20Sarma=20K=20=28=E0=B4=B5=E0=B4=BF=E0=B4=98?= =?UTF-8?q?=E0=B5=8D=E0=B4=A8=E0=B5=87=E0=B4=B7=E0=B5=8D=20=E0=B4=B6?= =?UTF-8?q?=E0=B5=AA=E0=B4=AE=20=E0=B4=95=E0=B5=86=29?= Date: Tue, 28 Aug 2018 15:24:25 +0530 Subject: [PATCH 21/62] update `calculate_score` to `try_add_token` `try_add_token` will now update the stop_offset as well. `FragmentCandidate::new` now just takes `start_offset`, it expects `try_add_token` to be called to add a token. --- src/snippet/mod.rs | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index a8a7bb194..64d661acb 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -34,18 +34,27 @@ pub struct FragmentCandidate { impl FragmentCandidate { - fn new(start_offset: usize, end_offset: usize) -> FragmentCandidate { + /// Create a basic `FragmentCandidate` + /// + /// `score`, `num_chars` are set to 0 + /// and `highlighted` is set to empty vec + /// stop_offset is set to start_offset, which is taken as a param. + fn new(start_offset: usize) -> FragmentCandidate { FragmentCandidate{score: 0.0, start_offset: start_offset, - stop_offset: end_offset, + stop_offset: start_offset, num_chars: 0, highlighted: vec![]} } /// Updates `score` and `highlighted` fields of the objects. /// - /// - fn calculate_score(&mut self, token: &Token, terms: &BTreeMap) { + /// taking the token and terms, the token is added to the fragment. + /// if the token is one of the terms, the score + /// and highlighted fields are updated in the fragment. + fn try_add_token(&mut self, token: &Token, terms: &BTreeMap) { + self.stop_offset = token.offset_to; + if let Some(score) = terms.get(&token.text.to_lowercase()) { self.score += score; self.highlighted.push(HighlightSection{start: token.offset_from, @@ -109,7 +118,7 @@ fn search_fragments<'a>( terms: BTreeMap, max_num_chars: usize) -> Vec { let mut token_stream = tokenizer.token_stream(text); - let mut fragment = FragmentCandidate::new(0, 0); + let mut fragment = FragmentCandidate::new(0); let mut fragments:Vec = vec![]; while let Some(next) = token_stream.next() { @@ -117,10 +126,9 @@ fn search_fragments<'a>( if fragment.score > 0.0 { fragments.push(fragment) }; - fragment = FragmentCandidate::new(next.offset_from, next.offset_to); + fragment = FragmentCandidate::new(next.offset_from); } else { - fragment.calculate_score(next, &terms); - fragment.stop_offset = next.offset_to; + fragment.try_add_token(next, &terms); } } if fragment.score > 0.0 { From fb9b1c1f41549e889f33c99cbee9d585b5ef555c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vignesh=20Sarma=20K=20=28=E0=B4=B5=E0=B4=BF=E0=B4=98?= =?UTF-8?q?=E0=B5=8D=E0=B4=A8=E0=B5=87=E0=B4=B7=E0=B5=8D=20=E0=B4=B6?= =?UTF-8?q?=E0=B5=AA=E0=B4=AE=20=E0=B4=95=E0=B5=86=29?= Date: Tue, 28 Aug 2018 15:40:12 +0530 Subject: [PATCH 22/62] add a test and fix the bug of not calculating first token --- src/snippet/mod.rs | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 64d661acb..8f94a0a40 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -127,9 +127,8 @@ fn search_fragments<'a>( fragments.push(fragment) }; fragment = FragmentCandidate::new(next.offset_from); - } else { - fragment.try_add_token(next, &terms); } + fragment.try_add_token(next, &terms); } if fragment.score > 0.0 { fragments.push(fragment) @@ -183,7 +182,7 @@ mod tests { fn test_snippet() { let tokenizer = SimpleTokenizer; - let t = box_tokenizer(tokenizer); + let boxed_tokenizer = box_tokenizer(tokenizer); let text = "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe, concurrent, practical language\", supporting functional and imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?], but its designers intend it to provide better memory safety while still maintaining performance. @@ -196,7 +195,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl terms.insert(String::from("rust"), 1.0); terms.insert(String::from("language"), 0.9); - let fragments = search_fragments(t, &text, terms, 100); + let fragments = search_fragments(boxed_tokenizer, &text, terms, 100); assert_eq!(fragments.len(), 7); { let first = fragments.iter().nth(0).unwrap(); @@ -207,4 +206,30 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl assert_eq!(snippet.fragments, "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe".to_owned()); assert_eq!(snippet.to_html(), "Rust is a systems programming language sponsored by Mozilla which describes it as a "safe".to_owned()) } + + #[test] + fn test_snippet_in_second_fragment() { + let tokenizer = SimpleTokenizer; + + let boxed_tokenizer = box_tokenizer(tokenizer); + + let text = "a b c d e f g"; + + let mut terms = BTreeMap::new(); + terms.insert(String::from("c"), 1.0); + + let fragments = search_fragments(boxed_tokenizer, &text, terms, 3); + + assert_eq!(fragments.len(), 1); + { + let first = fragments.iter().nth(0).unwrap(); + assert_eq!(first.score, 1.0); + assert_eq!(first.start_offset, 4); + assert_eq!(first.stop_offset, 6); + } + + let snippet = select_best_fragment_combination(fragments, &text); + assert_eq!(snippet.fragments, "c d"); + assert_eq!(snippet.to_html(), "c d"); + } } From 96a313c6dd2540b6620f5285a15a8d250dae0403 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vignesh=20Sarma=20K=20=28=E0=B4=B5=E0=B4=BF=E0=B4=98?= =?UTF-8?q?=E0=B5=8D=E0=B4=A8=E0=B5=87=E0=B4=B7=E0=B5=8D=20=E0=B4=B6?= =?UTF-8?q?=E0=B5=AA=E0=B4=AE=20=E0=B4=95=E0=B5=86=29?= Date: Tue, 28 Aug 2018 20:26:45 +0530 Subject: [PATCH 23/62] add more tests --- src/snippet/mod.rs | 68 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 61 insertions(+), 7 deletions(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 8f94a0a40..344c44d82 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -178,11 +178,11 @@ mod tests { use std::collections::BTreeMap; use super::{search_fragments, select_best_fragment_combination}; + const TOKENIZER:SimpleTokenizer = SimpleTokenizer; + #[test] fn test_snippet() { - let tokenizer = SimpleTokenizer; - - let boxed_tokenizer = box_tokenizer(tokenizer); + let boxed_tokenizer = box_tokenizer(TOKENIZER); let text = "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe, concurrent, practical language\", supporting functional and imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?], but its designers intend it to provide better memory safety while still maintaining performance. @@ -209,9 +209,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl #[test] fn test_snippet_in_second_fragment() { - let tokenizer = SimpleTokenizer; - - let boxed_tokenizer = box_tokenizer(tokenizer); + let boxed_tokenizer = box_tokenizer(TOKENIZER); let text = "a b c d e f g"; @@ -225,11 +223,67 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl let first = fragments.iter().nth(0).unwrap(); assert_eq!(first.score, 1.0); assert_eq!(first.start_offset, 4); - assert_eq!(first.stop_offset, 6); + assert_eq!(first.stop_offset, 7); } let snippet = select_best_fragment_combination(fragments, &text); assert_eq!(snippet.fragments, "c d"); assert_eq!(snippet.to_html(), "c d"); } + + #[test] + fn test_snippet_with_term_at_the_end_of_fragment() { + let boxed_tokenizer = box_tokenizer(TOKENIZER); + + let text = "a b c d e f f g"; + + let mut terms = BTreeMap::new(); + terms.insert(String::from("f"), 1.0); + + let fragments = search_fragments(boxed_tokenizer, &text, terms, 3); + + assert_eq!(fragments.len(), 2); + { + let first = fragments.iter().nth(0).unwrap(); + assert_eq!(first.score, 1.0); + assert_eq!(first.stop_offset, 11); + assert_eq!(first.start_offset, 8); + } + + let snippet = select_best_fragment_combination(fragments, &text); + assert_eq!(snippet.fragments, "e f"); + assert_eq!(snippet.to_html(), "e f"); + } + + #[test] + fn test_snippet_with_term_not_in_text() { + let boxed_tokenizer = box_tokenizer(TOKENIZER); + + let text = "a b c d"; + + let mut terms = BTreeMap::new(); + terms.insert(String::from("z"), 1.0); + + let fragments = search_fragments(boxed_tokenizer, &text, terms, 3); + + assert_eq!(fragments.len(), 0); + + let snippet = select_best_fragment_combination(fragments, &text); + assert_eq!(snippet.fragments, ""); + assert_eq!(snippet.to_html(), ""); + } + + fn test_snippet_with_no_terms() { + let boxed_tokenizer = box_tokenizer(TOKENIZER); + + let text = "a b c d"; + + let mut terms = BTreeMap::new(); + let fragments = search_fragments(boxed_tokenizer, &text, terms, 3); + assert_eq!(fragments.len(), 0); + + let snippet = select_best_fragment_combination(fragments, &text); + assert_eq!(snippet.fragments, ""); + assert_eq!(snippet.to_html(), ""); + } } From 6a197e023e3ec77f4b4a52bdbe75e8005ca29f38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vignesh=20Sarma=20K=20=28=E0=B4=B5=E0=B4=BF=E0=B4=98?= =?UTF-8?q?=E0=B5=8D=E0=B4=A8=E0=B5=87=E0=B4=B7=E0=B5=8D=20=E0=B4=B6?= =?UTF-8?q?=E0=B5=AA=E0=B4=AE=20=E0=B4=95=E0=B5=86=29?= Date: Tue, 28 Aug 2018 20:34:55 +0530 Subject: [PATCH 24/62] ran rustfmt --- src/snippet/mod.rs | 105 ++++++++++++++++++++++++++------------------- 1 file changed, 62 insertions(+), 43 deletions(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 344c44d82..26b6be0f6 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -1,12 +1,12 @@ -use tokenizer::{TokenStream, Tokenizer, Token}; -use std::collections::BTreeMap; -use Term; -use Document; -use Index; +use htmlescape::encode_minimal; use schema::FieldValue; use schema::Value; +use std::collections::BTreeMap; use tokenizer::BoxedTokenizer; -use htmlescape::encode_minimal; +use tokenizer::{Token, TokenStream, Tokenizer}; +use Document; +use Index; +use Term; #[derive(Debug)] pub struct HighlightSection { @@ -16,10 +16,7 @@ pub struct HighlightSection { impl HighlightSection { fn new(start: usize, stop: usize) -> HighlightSection { - HighlightSection { - start, - stop - } + HighlightSection { start, stop } } } @@ -33,18 +30,19 @@ pub struct FragmentCandidate { } impl FragmentCandidate { - /// Create a basic `FragmentCandidate` /// /// `score`, `num_chars` are set to 0 /// and `highlighted` is set to empty vec /// stop_offset is set to start_offset, which is taken as a param. fn new(start_offset: usize) -> FragmentCandidate { - FragmentCandidate{score: 0.0, - start_offset: start_offset, - stop_offset: start_offset, - num_chars: 0, - highlighted: vec![]} + FragmentCandidate { + score: 0.0, + start_offset: start_offset, + stop_offset: start_offset, + num_chars: 0, + highlighted: vec![], + } } /// Updates `score` and `highlighted` fields of the objects. @@ -57,8 +55,10 @@ impl FragmentCandidate { if let Some(score) = terms.get(&token.text.to_lowercase()) { self.score += score; - self.highlighted.push(HighlightSection{start: token.offset_from, - stop: token.offset_to}); + self.highlighted.push(HighlightSection { + start: token.offset_from, + stop: token.offset_to, + }); } } } @@ -69,11 +69,10 @@ pub struct Snippet { highlighted: Vec, } -const HIGHLIGHTEN_PREFIX:&str = ""; -const HIGHLIGHTEN_POSTFIX:&str = ""; +const HIGHLIGHTEN_PREFIX: &str = ""; +const HIGHLIGHTEN_POSTFIX: &str = ""; impl Snippet { - /// Returns a hignlightned html from the `Snippet`. pub fn to_html(&self) -> String { let mut html = String::new(); @@ -86,7 +85,9 @@ impl Snippet { html.push_str(HIGHLIGHTEN_POSTFIX); start_from = item.stop; } - html.push_str(&encode_minimal(&self.fragments[start_from..self.fragments.len()])); + html.push_str(&encode_minimal( + &self.fragments[start_from..self.fragments.len()], + )); html } } @@ -116,10 +117,11 @@ fn search_fragments<'a>( tokenizer: Box, text: &'a str, terms: BTreeMap, - max_num_chars: usize) -> Vec { + max_num_chars: usize, +) -> Vec { let mut token_stream = tokenizer.token_stream(text); let mut fragment = FragmentCandidate::new(0); - let mut fragments:Vec = vec![]; + let mut fragments: Vec = vec![]; while let Some(next) = token_stream.next() { if (next.offset_to - fragment.start_offset) > max_num_chars { @@ -141,24 +143,41 @@ fn search_fragments<'a>( /// /// Takes a vector of `FragmentCandidate`s and the text. /// Figures out the best fragment from it and creates a snippet. -fn select_best_fragment_combination<'a>(fragments: Vec, - text: &'a str,) -> Snippet { +fn select_best_fragment_combination<'a>( + fragments: Vec, + text: &'a str, +) -> Snippet { if let Some(init) = fragments.iter().nth(0) { - let fragment = fragments.iter().skip(1).fold(init, |acc, item| { - if item.score > acc.score { item } else { acc } - }); + let fragment = + fragments.iter().skip(1).fold( + init, + |acc, item| { + if item.score > acc.score { + item + } else { + acc + } + }, + ); let fragment_text = &text[fragment.start_offset..fragment.stop_offset]; - let highlighted = fragment.highlighted.iter().map(|item| { - HighlightSection{start: item.start-fragment.start_offset, - stop: item.stop-fragment.start_offset} - }).collect(); - Snippet{fragments: fragment_text.to_owned(), - highlighted: highlighted} + let highlighted = fragment + .highlighted + .iter() + .map(|item| HighlightSection { + start: item.start - fragment.start_offset, + stop: item.stop - fragment.start_offset, + }).collect(); + Snippet { + fragments: fragment_text.to_owned(), + highlighted: highlighted, + } } else { // when there no fragments to chose from, // for now create a empty snippet - Snippet{fragments: String::new(), - highlighted: vec![]} + Snippet { + fragments: String::new(), + highlighted: vec![], + } } } @@ -166,19 +185,19 @@ pub fn generate_snippet<'a>( doc: &'a [FieldValue], index: &Index, terms: Vec, - max_num_chars: usize) -> Snippet { + max_num_chars: usize, +) -> Snippet { unimplemented!(); } - #[cfg(test)] mod tests { - use tokenizer::{SimpleTokenizer, box_tokenizer}; - use std::iter::Iterator; - use std::collections::BTreeMap; use super::{search_fragments, select_best_fragment_combination}; + use std::collections::BTreeMap; + use std::iter::Iterator; + use tokenizer::{box_tokenizer, SimpleTokenizer}; - const TOKENIZER:SimpleTokenizer = SimpleTokenizer; + const TOKENIZER: SimpleTokenizer = SimpleTokenizer; #[test] fn test_snippet() { From f247935bb9a84abc7ffa5ee60756501d8bb6a3f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vignesh=20Sarma=20K=20=28=E0=B4=B5=E0=B4=BF=E0=B4=98?= =?UTF-8?q?=E0=B5=8D=E0=B4=A8=E0=B5=87=E0=B4=B7=E0=B5=8D=20=E0=B4=B6?= =?UTF-8?q?=E0=B5=AA=E0=B4=AE=20=E0=B4=95=E0=B5=86=29?= Date: Tue, 28 Aug 2018 22:16:22 +0530 Subject: [PATCH 25/62] Use HighlightSection::new rather than just directly creating the object --- src/snippet/mod.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 26b6be0f6..cee3e79ab 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -55,10 +55,8 @@ impl FragmentCandidate { if let Some(score) = terms.get(&token.text.to_lowercase()) { self.score += score; - self.highlighted.push(HighlightSection { - start: token.offset_from, - stop: token.offset_to, - }); + self.highlighted + .push(HighlightSection::new(token.offset_from, token.offset_to)); } } } @@ -163,9 +161,11 @@ fn select_best_fragment_combination<'a>( let highlighted = fragment .highlighted .iter() - .map(|item| HighlightSection { - start: item.start - fragment.start_offset, - stop: item.stop - fragment.start_offset, + .map(|item| { + HighlightSection::new( + item.start - fragment.start_offset, + item.stop - fragment.start_offset, + ) }).collect(); Snippet { fragments: fragment_text.to_owned(), From 18814ba0c15e72dd2db09c589e647b863dbbea51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vignesh=20Sarma=20K=20=28=E0=B4=B5=E0=B4=BF=E0=B4=98?= =?UTF-8?q?=E0=B5=8D=E0=B4=A8=E0=B5=87=E0=B4=B7=E0=B5=8D=20=E0=B4=B6?= =?UTF-8?q?=E0=B5=AA=E0=B4=AE=20=E0=B4=95=E0=B5=86=29?= Date: Tue, 28 Aug 2018 22:27:56 +0530 Subject: [PATCH 26/62] add a test for second fragment having higher score --- src/snippet/mod.rs | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index cee3e79ab..8142c54a0 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -274,6 +274,31 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl assert_eq!(snippet.to_html(), "e f"); } + #[test] + fn test_snippet_with_second_fragment_has_the_highest_score() { + let boxed_tokenizer = box_tokenizer(TOKENIZER); + + let text = "a b c d e f g"; + + let mut terms = BTreeMap::new(); + terms.insert(String::from("f"), 1.0); + terms.insert(String::from("a"), 0.9); + + let fragments = search_fragments(boxed_tokenizer, &text, terms, 7); + + assert_eq!(fragments.len(), 2); + { + let first = fragments.iter().nth(0).unwrap(); + assert_eq!(first.score, 0.9); + assert_eq!(first.stop_offset, 7); + assert_eq!(first.start_offset, 0); + } + + let snippet = select_best_fragment_combination(fragments, &text); + assert_eq!(snippet.fragments, "e f g"); + assert_eq!(snippet.to_html(), "e f g"); + } + #[test] fn test_snippet_with_term_not_in_text() { let boxed_tokenizer = box_tokenizer(TOKENIZER); @@ -292,6 +317,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl assert_eq!(snippet.to_html(), ""); } + #[test] fn test_snippet_with_no_terms() { let boxed_tokenizer = box_tokenizer(TOKENIZER); From d15efd6635d26dafe985e7e5b01bdf51ecb72e97 Mon Sep 17 00:00:00 2001 From: petr-tik Date: Wed, 29 Aug 2018 00:26:59 +0100 Subject: [PATCH 27/62] Closes #235 - adds a new error type (#398) error message suggests possible causes Addressed code review 1 thread + smaller heap size --- src/error.rs | 11 ++++++++++- src/indexer/index_writer.rs | 18 +++++++++++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/src/error.rs b/src/error.rs index 8fa5cb1ce..db15e3c42 100644 --- a/src/error.rs +++ b/src/error.rs @@ -9,6 +9,7 @@ use schema; use serde_json; use std::path::PathBuf; use std::sync::PoisonError; +use core::LOCKFILE_FILEPATH; /// The library's failure based error enum #[derive(Debug, Fail)] @@ -19,6 +20,9 @@ pub enum TantivyError { /// File already exists, this is a problem when we try to write into a new file. #[fail(display = "file already exists: '{:?}'", _0)] FileAlreadyExists(PathBuf), + /// Lockfile already exists + #[fail(display = "Lockfile '{:?}' already exists. Possible causes: another IndexWriter instance or panic during previous lock drop.", _0)] + LockFileAlreadyExists(PathBuf), /// IO Error. #[fail(display = "an IO error occurred: '{}'", _0)] IOError(#[cause] IOError), @@ -95,7 +99,12 @@ impl From for TantivyError { fn from(error: OpenWriteError) -> TantivyError { match error { OpenWriteError::FileAlreadyExists(filepath) => { - TantivyError::FileAlreadyExists(filepath) + let lockfile_fname = LOCKFILE_FILEPATH.to_str().unwrap(); + if filepath.ends_with(lockfile_fname) { + TantivyError::LockFileAlreadyExists(filepath) + } else { + TantivyError::FileAlreadyExists(filepath) + } } OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error), }.into() diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 982140fbc..226fb7379 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -657,11 +657,27 @@ mod tests { let index = Index::create_in_ram(schema_builder.build()); let _index_writer = index.writer(40_000_000).unwrap(); match index.writer(40_000_000) { - Err(TantivyError::FileAlreadyExists(_)) => {} + Err(TantivyError::LockFileAlreadyExists(_)) => {} _ => panic!("Expected FileAlreadyExists error"), } } + #[test] + fn test_lockfile_already_exists_error_msg() { + let schema_builder = schema::SchemaBuilder::default(); + let index = Index::create_in_ram(schema_builder.build()); + let _index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + match index.writer_with_num_threads(1, 3_000_000) { + Err(err) => { + let err_msg = err.to_string(); + assert!(err_msg.contains("Lockfile")); + assert!(err_msg.contains("already exists")); + assert!(err_msg.contains("Possible causes:")) + }, + _ => panic!("Expected LockfileAlreadyExists error"), + } + } + #[test] fn test_set_merge_policy() { let schema_builder = schema::SchemaBuilder::default(); From ee681a4dd1eebb7902f1532861081435171f3ac4 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 29 Aug 2018 11:06:04 +0900 Subject: [PATCH 28/62] Added say thanks badge --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c1824b575..499a12464 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ [![Join the chat at https://gitter.im/tantivy-search/tantivy](https://badges.gitter.im/tantivy-search/tantivy.svg)](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![Build status](https://ci.appveyor.com/api/projects/status/r7nb13kj23u8m9pj/branch/master?svg=true)](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/master) +[![Say Thanks!](https://img.shields.io/badge/Say%20Thanks-!-1EAEDB.svg)](https://saythanks.io/to/fulmicoton) ![Tantivy](https://tantivy-search.github.io/logo/tantivy-logo.png) From a12d211330657931de2c972030504762cdbb8432 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 30 Aug 2018 09:23:34 +0900 Subject: [PATCH 29/62] Extracting terms matching query in the document --- examples/snippet.rs | 75 +++++++++++++++++++++++++++++ src/common/mod.rs | 21 +++++++- src/query/automaton_weight.rs | 47 ++++++++++++++++++ src/query/term_query/term_weight.rs | 22 +++++++++ src/query/weight.rs | 35 ++++++++++++++ src/snippet/mod.rs | 6 +-- 6 files changed, 201 insertions(+), 5 deletions(-) create mode 100644 examples/snippet.rs diff --git a/examples/snippet.rs b/examples/snippet.rs new file mode 100644 index 000000000..35e9e76bd --- /dev/null +++ b/examples/snippet.rs @@ -0,0 +1,75 @@ +// # Snippet example +// +// This example shows how to return a representative snippet of +// your hit result. +// Snippet are an extracted of a target document, and returned in HTML format. +// The keyword searched by the user are highlighted with a `` tag. +extern crate tempdir; + +// --- +// Importing tantivy... +#[macro_use] +extern crate tantivy; +use tantivy::collector::TopCollector; +use tantivy::query::QueryParser; +use tantivy::schema::*; +use tantivy::Index; + +fn main() -> tantivy::Result<()> { + // Let's create a temporary directory for the + // sake of this example + let index_path = TempDir::new("tantivy_example_dir")?; + + // # Defining the schema + let mut schema_builder = SchemaBuilder::default(); + schema_builder.add_text_field("body", TEXT); + let schema = schema_builder.build(); + + // # Indexing documents + let index = Index::create_in_dir(&index_path, schema.clone())?; + + let mut index_writer = index.writer(50_000_000)?; + + let title = schema.get_field("title").unwrap(); + let body = schema.get_field("body").unwrap(); + + let mut old_man_doc = Document::default(); + // we'll only need one doc for this example. + index_writer.add_document(doc!( + title => "Of Mice and Men", + body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ + bank and runs deep and green. The water is warm too, for it has slipped twinkling \ + over the yellow sands in the sunlight before reaching the narrow pool. On one \ + side of the river the golden foothill slopes curve up to the strong and rocky \ + Gabilan Mountains, but on the valley side the water is lined with trees—willows \ + fresh and green with every spring, carrying in their lower leaf junctures the \ + debris of the winter’s flooding; and sycamores with mottled, white, recumbent \ + limbs and branches that arch over the pool" + )); + // ... + index_writer.commit()?; + + index.load_searchers()?; + + let searcher = index.searcher(); + let query_parser = QueryParser::for_index(&index, vec![title, body]); + + let query = query_parser.parse_query("sycamore spring")?; + + let mut top_collector = TopCollector::with_limit(10); + + searcher.search(&*query, &mut top_collector)?; + + let doc_addresses = top_collector.docs(); + + for doc_address in doc_addresses { + let retrieved_doc = searcher.doc(&doc_address)?; + generate_snippet(&retrieved_doc, query + } + + + Ok(()) +} + + +use tempdir::TempDir; diff --git a/src/common/mod.rs b/src/common/mod.rs index 2942438b4..778f0476a 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -68,6 +68,17 @@ pub trait HasLen { } } + +pub fn is_stricly_sorted(els: &[T]) -> bool { + if els.is_empty() { + true + } else { + els.iter() + .zip(els[1..].iter()) + .all(|(left, right)| left < right) + } +} + const HIGHEST_BIT: u64 = 1 << 63; /// Maps a `i64` to `u64` @@ -105,12 +116,20 @@ pub fn u64_to_i64(val: u64) -> i64 { pub(crate) mod test { pub use super::serialize::test::fixed_size_test; - use super::{compute_num_bits, i64_to_u64, u64_to_i64}; + use super::{compute_num_bits, i64_to_u64, u64_to_i64, is_stricly_sorted}; fn test_i64_converter_helper(val: i64) { assert_eq!(u64_to_i64(i64_to_u64(val)), val); } + + #[test] + fn test_is_strictly_sorted() { + assert!(is_stricly_sorted::(&[])); + assert!(is_stricly_sorted(&[1])); + assert!(is_stricly_sorted(&[1, 2, 3])); + assert!(!is_stricly_sorted(&[1, 3, 2])); + } #[test] fn test_i64_converter() { assert_eq!(i64_to_u64(i64::min_value()), u64::min_value()); diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index b38e6592d..d1040eb85 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -7,6 +7,11 @@ use query::{Scorer, Weight}; use schema::{Field, IndexRecordOption}; use termdict::{TermDictionary, TermStreamer}; use Result; +use query::weight::MatchingTerms; +use SkipResult; +use Term; +use DocId; +use DocSet; /// A weight struct for Fuzzy Term and Regex Queries pub struct AutomatonWeight @@ -36,6 +41,48 @@ impl Weight for AutomatonWeight where A: Automaton, { + + fn matching_terms(&self, + reader: &SegmentReader, + matching_terms: &mut MatchingTerms) -> Result<()> { + let max_doc = reader.max_doc(); + let mut doc_bitset = BitSet::with_max_value(max_doc); + + let inverted_index = reader.inverted_index(self.field); + let term_dict = inverted_index.terms(); + let mut term_stream = self.automaton_stream(term_dict); + + let doc_ids = matching_terms.sorted_doc_ids(); + let mut docs_matching_current_term: Vec = vec![]; + + let mut term_buffer: Vec = vec![]; + + while term_stream.advance() { + docs_matching_current_term.clear(); + let term_info = term_stream.value(); + let mut segment_postings = inverted_index.read_postings_from_terminfo(term_info, IndexRecordOption::Basic); + for &doc_id in &doc_ids { + match segment_postings.skip_next(doc_id) { + SkipResult::Reached => { + docs_matching_current_term.push(doc_id); + } + SkipResult::OverStep => {} + SkipResult::End => {} + } + } + if !docs_matching_current_term.is_empty() { + term_buffer.clear(); + let term_ord = term_stream.term_ord(); + inverted_index.terms().ord_to_term(term_ord, &mut term_buffer); + let term = Term::from_field_bytes(self.field, &term_buffer[..]); + for &doc_id in &docs_matching_current_term { + matching_terms.add_term(doc_id, term.clone()); + } + } + } + Ok(()) + } + fn scorer(&self, reader: &SegmentReader) -> Result> { let max_doc = reader.max_doc(); let mut doc_bitset = BitSet::with_max_value(max_doc); diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index ba45a8042..1a9075b5a 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -8,6 +8,8 @@ use query::Weight; use schema::IndexRecordOption; use Result; use Term; +use SkipResult; +use query::weight::MatchingTerms; pub struct TermWeight { term: Term, @@ -38,6 +40,26 @@ impl Weight for TermWeight { } } + + fn matching_terms(&self, + reader: &SegmentReader, + matching_terms: &mut MatchingTerms) -> Result<()> { + let doc_ids = matching_terms.sorted_doc_ids(); + let mut scorer = self.scorer(reader)?; + for doc_id in doc_ids { + match scorer.skip_next(doc_id) { + SkipResult::Reached => { + matching_terms.add_term(doc_id, self.term.clone()); + } + SkipResult::OverStep => {} + SkipResult::End => { + break; + } + } + } + Ok(()) + } + fn count(&self, reader: &SegmentReader) -> Result { if reader.num_deleted_docs() == 0 { let field = self.term.field(); diff --git a/src/query/weight.rs b/src/query/weight.rs index d3d8b3520..51289c573 100644 --- a/src/query/weight.rs +++ b/src/query/weight.rs @@ -1,6 +1,37 @@ use super::Scorer; use core::SegmentReader; use Result; +use DocId; +use std::collections::HashSet; +use Term; +use std::collections::BTreeMap; + + +pub struct MatchingTerms { + doc_to_terms: BTreeMap> +} + +impl MatchingTerms { + pub fn from_doc_ids(doc_ids: &[DocId]) -> MatchingTerms { + MatchingTerms { + doc_to_terms: doc_ids + .iter() + .cloned() + .map(|doc_id| (doc_id, HashSet::default())) + .collect() + } + } + + pub fn sorted_doc_ids(&self) -> Vec { + self.doc_to_terms.keys().cloned().collect() + } + + pub fn add_term(&mut self, doc_id: DocId, term: Term) { + if let Some(terms) = self.doc_to_terms.get_mut(&doc_id) { + terms.insert(term); + } + } +} /// A Weight is the specialization of a Query /// for a given set of segments. @@ -11,6 +42,10 @@ pub trait Weight { /// See [`Query`](./trait.Query.html). fn scorer(&self, reader: &SegmentReader) -> Result>; + fn matching_terms(&self, reader: &SegmentReader, matching_terms: &mut MatchingTerms) -> Result<()> { + Ok(()) + } + /// Returns the number documents within the given `SegmentReader`. fn count(&self, reader: &SegmentReader) -> Result { Ok(self.scorer(reader)?.count()) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 8142c54a0..97c557e98 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -1,10 +1,8 @@ -use htmlescape::encode_minimal; + use htmlescape::encode_minimal; use schema::FieldValue; -use schema::Value; use std::collections::BTreeMap; use tokenizer::BoxedTokenizer; -use tokenizer::{Token, TokenStream, Tokenizer}; -use Document; +use tokenizer::{Token, TokenStream}; use Index; use Term; From 6704ab69877154a12f0b1f74a27a5cd3cdc894eb Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 30 Aug 2018 09:47:19 +0900 Subject: [PATCH 30/62] Added methods to extract the matching terms. First stab --- src/query/mod.rs | 1 + src/query/query.rs | 2 ++ src/query/weight.rs | 1 - src/snippet/mod.rs | 32 ++++++++++++++++++++++++++++---- 4 files changed, 31 insertions(+), 5 deletions(-) diff --git a/src/query/mod.rs b/src/query/mod.rs index 7546465fb..0b6ee2adb 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -27,6 +27,7 @@ mod weight; mod vec_docset; pub(crate) mod score_combiner; +pub use self::weight::MatchingTerms; pub use self::intersection::Intersection; pub use self::union::Union; diff --git a/src/query/query.rs b/src/query/query.rs index 51e068b92..7004768e4 100644 --- a/src/query/query.rs +++ b/src/query/query.rs @@ -5,6 +5,8 @@ use downcast; use std::fmt; use Result; use SegmentLocalId; +use DocAddress; +use query::weight::MatchingTerms; /// The `Query` trait defines a set of documents and a scoring method /// for those documents. diff --git a/src/query/weight.rs b/src/query/weight.rs index 51289c573..5b603ab1c 100644 --- a/src/query/weight.rs +++ b/src/query/weight.rs @@ -6,7 +6,6 @@ use std::collections::HashSet; use Term; use std::collections::BTreeMap; - pub struct MatchingTerms { doc_to_terms: BTreeMap> } diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 97c557e98..984c1a589 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -1,10 +1,17 @@ - use htmlescape::encode_minimal; +use htmlescape::encode_minimal; use schema::FieldValue; use std::collections::BTreeMap; +use itertools::Itertools; use tokenizer::BoxedTokenizer; use tokenizer::{Token, TokenStream}; use Index; +use Result; use Term; +use query::Query; +use DocAddress; +use DocId; +use Searcher; +use query::MatchingTerms; #[derive(Debug)] pub struct HighlightSection { @@ -179,12 +186,29 @@ fn select_best_fragment_combination<'a>( } } + + + +fn matching_terms(query: &Query, searcher: &Searcher, doc_addresses: &[DocAddress]) -> Result<()> { + let weight = query.weight(searcher, false)?; + let mut doc_groups = doc_addresses + .iter() + .group_by(|doc_address| doc_address.0); + for (segment_ord, doc_addrs) in doc_groups.into_iter() { + let doc_addrs_vec: Vec = doc_addrs.map(|doc_addr| doc_addr.1).collect(); + let mut matching_terms = MatchingTerms::from_doc_ids(&doc_addrs_vec[..]); + let segment_reader = searcher.segment_reader(segment_ord); + weight.matching_terms(segment_reader, &mut matching_terms)?; + } + Ok(()) +} + pub fn generate_snippet<'a>( - doc: &'a [FieldValue], + doc: &'a [DocAddress], index: &Index, + query: &Query, terms: Vec, - max_num_chars: usize, -) -> Snippet { + max_num_chars: usize) -> Snippet { unimplemented!(); } From f570fe37d491a9c5f669f45316dc8cceeb05bfe4 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 31 Aug 2018 09:03:44 +0900 Subject: [PATCH 31/62] small changes --- examples/snippet.rs | 3 +-- src/query/automaton_weight.rs | 1 - src/query/boolean_query/boolean_weight.rs | 9 +++++++++ src/query/phrase_query/phrase_weight.rs | 6 ++++++ src/query/range_query.rs | 6 ++++++ src/snippet/mod.rs | 3 ++- 6 files changed, 24 insertions(+), 4 deletions(-) diff --git a/examples/snippet.rs b/examples/snippet.rs index 35e9e76bd..4efea1e5a 100644 --- a/examples/snippet.rs +++ b/examples/snippet.rs @@ -64,10 +64,9 @@ fn main() -> tantivy::Result<()> { for doc_address in doc_addresses { let retrieved_doc = searcher.doc(&doc_address)?; - generate_snippet(&retrieved_doc, query + // generate_snippet(&retrieved_doc, query } - Ok(()) } diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index d1040eb85..54f8c5f8b 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -41,7 +41,6 @@ impl Weight for AutomatonWeight where A: Automaton, { - fn matching_terms(&self, reader: &SegmentReader, matching_terms: &mut MatchingTerms) -> Result<()> { diff --git a/src/query/boolean_query/boolean_weight.rs b/src/query/boolean_query/boolean_weight.rs index 575bc2991..2b3348a21 100644 --- a/src/query/boolean_query/boolean_weight.rs +++ b/src/query/boolean_query/boolean_weight.rs @@ -13,6 +13,7 @@ use query::Weight; use std::borrow::Borrow; use std::collections::HashMap; use Result; +use query::MatchingTerms; fn scorer_union(scorers: Vec>) -> Box where @@ -107,6 +108,14 @@ impl BooleanWeight { } impl Weight for BooleanWeight { + + fn matching_terms(&self, reader: &SegmentReader, matching_terms: &mut MatchingTerms) -> Result<()> { + for (_, weight) in &self.weights { + weight.matching_terms(reader, matching_terms)?; + } + Ok(()) + } + fn scorer(&self, reader: &SegmentReader) -> Result> { if self.weights.is_empty() { Ok(Box::new(EmptyScorer)) diff --git a/src/query/phrase_query/phrase_weight.rs b/src/query/phrase_query/phrase_weight.rs index de8eeb0d2..fbf43db20 100644 --- a/src/query/phrase_query/phrase_weight.rs +++ b/src/query/phrase_query/phrase_weight.rs @@ -7,6 +7,7 @@ use query::Weight; use schema::IndexRecordOption; use schema::Term; use Result; +use query::MatchingTerms; pub struct PhraseWeight { phrase_terms: Vec<(usize, Term)>, @@ -30,6 +31,11 @@ impl PhraseWeight { } impl Weight for PhraseWeight { + + fn matching_terms(&self, reader: &SegmentReader, matching_terms: &mut MatchingTerms) -> Result<()> { + unimplemented!(); + } + fn scorer(&self, reader: &SegmentReader) -> Result> { let similarity_weight = self.similarity_weight.clone(); let field = self.phrase_terms[0].1.field(); diff --git a/src/query/range_query.rs b/src/query/range_query.rs index 23efe1995..2b22e7cf8 100644 --- a/src/query/range_query.rs +++ b/src/query/range_query.rs @@ -11,6 +11,7 @@ use std::collections::Bound; use std::ops::Range; use termdict::{TermDictionary, TermStreamer}; use Result; +use query::MatchingTerms; fn map_bound TTo>( bound: &Bound, @@ -274,6 +275,11 @@ impl RangeWeight { } impl Weight for RangeWeight { + + fn matching_terms(&self, reader: &SegmentReader, matching_terms: &mut MatchingTerms) -> Result<()> { + unimplemented!(); + } + fn scorer(&self, reader: &SegmentReader) -> Result> { let max_doc = reader.max_doc(); let mut doc_bitset = BitSet::with_max_value(max_doc); diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 984c1a589..cd194e0d8 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -200,6 +200,7 @@ fn matching_terms(query: &Query, searcher: &Searcher, doc_addresses: &[DocAddres let segment_reader = searcher.segment_reader(segment_ord); weight.matching_terms(segment_reader, &mut matching_terms)?; } + let terms = HashSet<(DocId, Vec)>; Ok(()) } @@ -209,7 +210,7 @@ pub fn generate_snippet<'a>( query: &Query, terms: Vec, max_num_chars: usize) -> Snippet { - unimplemented!(); + search_fragments(boxed_tokenizer, &text, terms, 3); } #[cfg(test)] From b3b2421e8aaf523b97862942891ee5a01be33a03 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 4 Sep 2018 09:17:00 +0900 Subject: [PATCH 32/62] Issue/367 (#404) * First stab * Closes #367 --- src/core/index.rs | 9 +- src/core/mod.rs | 6 -- src/directory/managed_directory.rs | 59 +++++++++--- src/directory/ram_directory.rs | 1 - src/error.rs | 22 ++--- src/indexer/directory_lock.rs | 147 ++++++++++++++++++++++++++--- src/indexer/index_writer.rs | 3 +- src/indexer/mod.rs | 2 + src/indexer/segment_manager.rs | 7 +- 9 files changed, 201 insertions(+), 55 deletions(-) diff --git a/src/core/index.rs b/src/core/index.rs index efdfedc5f..2a818f222 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -7,7 +7,7 @@ use std::fmt; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; use Result; - +use indexer::LockType; use super::pool::LeasedItem; use super::pool::Pool; use super::segment::create_segment; @@ -20,11 +20,10 @@ use core::META_FILEPATH; #[cfg(feature = "mmap")] use directory::MmapDirectory; use directory::{Directory, RAMDirectory}; -use directory::{DirectoryClone, ManagedDirectory}; +use directory::{ManagedDirectory}; use indexer::index_writer::open_index_writer; use indexer::index_writer::HEAP_SIZE_MIN; use indexer::segment_updater::save_new_metas; -use indexer::DirectoryLock; use num_cpus; use std::path::Path; use tokenizer::TokenizerManager; @@ -156,7 +155,8 @@ impl Index { num_threads: usize, overall_heap_size_in_bytes: usize, ) -> Result { - let directory_lock = DirectoryLock::lock(self.directory().box_clone())?; + + let directory_lock = LockType::IndexWriterLock.acquire_lock(&self.directory)?; let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads; open_index_writer( self, @@ -249,6 +249,7 @@ impl Index { /// This needs to be called when a new segment has been /// published or after a merge. pub fn load_searchers(&self) -> Result<()> { + let _meta_lock = LockType::MetaLock.acquire_lock(self.directory())?; let searchable_segments = self.searchable_segments()?; let segment_readers: Vec = searchable_segments .iter() diff --git a/src/core/mod.rs b/src/core/mod.rs index 6d43685f8..062b537ee 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -33,10 +33,4 @@ lazy_static! { /// Removing this file is safe, but will prevent the garbage collection of all of the file that /// are currently in the directory pub static ref MANAGED_FILEPATH: PathBuf = PathBuf::from(".managed.json"); - - /// Only one process should be able to write tantivy's index at a time. - /// This file, when present, is in charge of preventing other processes to open an IndexWriter. - /// - /// If the process is killed and this file remains, it is safe to remove it manually. - pub static ref LOCKFILE_FILEPATH: PathBuf = PathBuf::from(".tantivy-indexer.lock"); } diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index cf59b9ace..e5510d113 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -12,6 +12,20 @@ use std::sync::RwLockWriteGuard; use std::sync::{Arc, RwLock}; use Directory; use Result; +use indexer::LockType; + + + +/// Returns true iff the file is "managed". +/// Non-managed file are not subject to garbage collection. +/// +/// Filenames that starts by a "." -typically locks- +/// are not managed. +fn is_managed(path: &Path) -> bool { + path.to_str() + .map(|p_str| !p_str.starts_with(".")) + .unwrap_or(true) +} /// Wrapper of directories that keeps track of files created by Tantivy. /// @@ -82,25 +96,34 @@ impl ManagedDirectory { pub fn garbage_collect HashSet>(&mut self, get_living_files: L) { info!("Garbage collect"); let mut files_to_delete = vec![]; + + // It is crucial to get the living files after acquiring the + // read lock of meta informations. That way, we + // avoid the following scenario. + // + // 1) we get the list of living files. + // 2) someone creates a new file. + // 3) we start garbage collection and remove this file + // even though it is a living file. + // + // releasing the lock as .delete() will use it too. { - // releasing the lock as .delete() will use it too. let meta_informations_rlock = self.meta_informations .read() .expect("Managed directory rlock poisoned in garbage collect."); - // It is crucial to get the living files after acquiring the - // read lock of meta informations. That way, we - // avoid the following scenario. - // - // 1) we get the list of living files. - // 2) someone creates a new file. - // 3) we start garbage collection and remove this file - // even though it is a living file. - let living_files = get_living_files(); - - for managed_path in &meta_informations_rlock.managed_paths { - if !living_files.contains(managed_path) { - files_to_delete.push(managed_path.clone()); + // The point of this second "file" lock is to enforce the following scenario + // 1) process B tries to load a new set of searcher. + // The list of segments is loaded + // 2) writer change meta.json (for instance after a merge or a commit) + // 3) gc kicks in. + // 4) gc removes a file that was useful for process B, before process B opened it. + if let Ok(_meta_lock) = LockType::MetaLock.acquire_lock(self) { + let living_files = get_living_files(); + for managed_path in &meta_informations_rlock.managed_paths { + if !living_files.contains(managed_path) { + files_to_delete.push(managed_path.clone()); + } } } } @@ -156,7 +179,15 @@ impl ManagedDirectory { /// registering the filepath and creating the file /// will not lead to garbage files that will /// never get removed. + /// + /// File starting by "." are reserved to locks. + /// They are not managed and cannot be subjected + /// to garbage collection. fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> { + // Files starting by "." (e.g. lock files) are not managed. + if !is_managed(filepath) { + return Ok(()); + } let mut meta_wlock = self.meta_informations .write() .expect("Managed file lock poisoned"); diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index d1a671cd1..4e55da537 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -173,7 +173,6 @@ impl Directory for RAMDirectory { let exists = self.fs .write(path_buf.clone(), &Vec::new()) .map_err(|err| IOError::with_path(path.to_owned(), err))?; - // force the creation of the file to mimic the MMap directory. if exists { Err(OpenWriteError::FileAlreadyExists(path_buf)) diff --git a/src/error.rs b/src/error.rs index db15e3c42..ddde26789 100644 --- a/src/error.rs +++ b/src/error.rs @@ -6,10 +6,10 @@ use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteErro use fastfield::FastFieldNotAvailableError; use query; use schema; +use indexer::LockType; use serde_json; use std::path::PathBuf; use std::sync::PoisonError; -use core::LOCKFILE_FILEPATH; /// The library's failure based error enum #[derive(Debug, Fail)] @@ -20,9 +20,9 @@ pub enum TantivyError { /// File already exists, this is a problem when we try to write into a new file. #[fail(display = "file already exists: '{:?}'", _0)] FileAlreadyExists(PathBuf), - /// Lockfile already exists - #[fail(display = "Lockfile '{:?}' already exists. Possible causes: another IndexWriter instance or panic during previous lock drop.", _0)] - LockFileAlreadyExists(PathBuf), + /// Failed to acquire file lock + #[fail(display = "Failed to acquire Lockfile: {:?}. Possible causes: another IndexWriter instance or panic during previous lock drop.", _0)] + LockFailure(LockType), /// IO Error. #[fail(display = "an IO error occurred: '{}'", _0)] IOError(#[cause] IOError), @@ -95,18 +95,14 @@ impl From for TantivyError { } } + impl From for TantivyError { fn from(error: OpenWriteError) -> TantivyError { match error { - OpenWriteError::FileAlreadyExists(filepath) => { - let lockfile_fname = LOCKFILE_FILEPATH.to_str().unwrap(); - if filepath.ends_with(lockfile_fname) { - TantivyError::LockFileAlreadyExists(filepath) - } else { - TantivyError::FileAlreadyExists(filepath) - } - } - OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error), + OpenWriteError::FileAlreadyExists(filepath) => + TantivyError::FileAlreadyExists(filepath), + OpenWriteError::IOError(io_error) => + TantivyError::IOError(io_error), }.into() } } diff --git a/src/indexer/directory_lock.rs b/src/indexer/directory_lock.rs index b152a3c58..4dbaa9ed4 100644 --- a/src/indexer/directory_lock.rs +++ b/src/indexer/directory_lock.rs @@ -1,26 +1,147 @@ -use core::LOCKFILE_FILEPATH; use directory::error::OpenWriteError; use Directory; +use TantivyError; +use std::path::{Path, PathBuf}; +use std::thread; +use std::time::Duration; +use std::io::Write; -/// The directory lock is a mechanism used to -/// prevent the creation of two [`IndexWriter`](struct.IndexWriter.html) -/// -/// Only one lock can exist at a time for a given directory. -/// The lock is release automatically on `Drop`. -pub struct DirectoryLock { - directory: Box, +#[derive(Debug, Clone, Copy)] +pub enum LockType { + /// Only one process should be able to write tantivy's index at a time. + /// This lock file, when present, is in charge of preventing other processes to open an IndexWriter. + /// + /// If the process is killed and this file remains, it is safe to remove it manually. + /// + /// Failing to acquire this lock usually means a misuse of tantivy's API, + /// (creating more than one instance of the `IndexWriter`), are a spurious + /// lock file remaining after a crash. In the latter case, removing the file after + /// checking no process running tantivy is running is safe. + IndexWriterLock, + /// The meta lock file is here to protect the segment files being opened by + /// `.load_searchers()` from being garbage collected. + /// It makes it possible for another process to safely consume + /// our index in-writing. Ideally, we may have prefered `RWLock` semantics + /// here, but it is difficult to achieve on Windows. + /// + /// Opening segment readers is a very fast process. + /// Right now if the lock cannot be acquire on the first attempt, the logic + /// is very simplistic. We retry after `100ms` until we effectively + /// acquire the lock. + /// This lock should not have much contention in normal usage. + MetaLock } -impl DirectoryLock { - pub fn lock(mut directory: Box) -> Result { - directory.open_write(&*LOCKFILE_FILEPATH)?; - Ok(DirectoryLock { directory }) + +/// Retry the logic of acquiring locks is pretty simple. +/// We just retry `n` times after a given `duratio`, both +/// depending on the type of lock. +struct RetryPolicy { + num_retries: usize, + wait_in_ms: u64, +} + +impl RetryPolicy { + fn no_retry() -> RetryPolicy { + RetryPolicy { + num_retries: 0, + wait_in_ms: 0, + } } + + fn wait_and_retry(&mut self,) -> bool { + if self.num_retries == 0 { + false + } else { + self.num_retries -= 1; + let wait_duration = Duration::from_millis(self.wait_in_ms); + thread::sleep(wait_duration); + true + } + + } +} + +impl LockType { + + fn retry_policy(&self) -> RetryPolicy { + match *self { + LockType::IndexWriterLock => + RetryPolicy::no_retry(), + LockType::MetaLock => + RetryPolicy { + num_retries: 100, + wait_in_ms: 100, + } + } + } + + fn try_acquire_lock(&self, directory: &mut Directory) -> Result { + let path = self.filename(); + let mut write = directory + .open_write(path) + .map_err(|e| + match e { + OpenWriteError::FileAlreadyExists(_) => + TantivyError::LockFailure(*self), + OpenWriteError::IOError(io_error) => + TantivyError::IOError(io_error), + })?; + write.flush()?; + Ok(DirectoryLock { + directory: directory.box_clone(), + path: path.to_owned(), + }) + } + + + /// Acquire a lock in the given directory. + pub fn acquire_lock(&self, directory: &Directory) -> Result { + let mut box_directory = directory.box_clone(); + let mut retry_policy = self.retry_policy(); + loop { + let lock_result = self.try_acquire_lock(&mut *box_directory); + match lock_result { + Ok(result) => { + return Ok(result); + } + Err(TantivyError::LockFailure(ref filepath)) => { + if !retry_policy.wait_and_retry() { + return Err(TantivyError::LockFailure(filepath.to_owned())); + } + } + Err(_) => { + } + } + } + } + + fn filename(&self) -> &Path { + match *self { + LockType::MetaLock => { + Path::new(".tantivy-meta.lock") + } + LockType::IndexWriterLock => { + Path::new(".tantivy-indexer.lock") + } + } + } +} + + +/// The `DirectoryLock` is an object that represents a file lock. +/// See [`LockType`](struct.LockType.html) +/// +/// It is transparently associated to a lock file, that gets deleted +/// on `Drop.` The lock is release automatically on `Drop`. +pub struct DirectoryLock { + directory: Box, + path: PathBuf, } impl Drop for DirectoryLock { fn drop(&mut self) { - if let Err(e) = self.directory.delete(&*LOCKFILE_FILEPATH) { + if let Err(e) = self.directory.delete(&*self.path) { error!("Failed to remove the lock file. {:?}", e); } } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 226fb7379..5af4ed694 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -657,7 +657,7 @@ mod tests { let index = Index::create_in_ram(schema_builder.build()); let _index_writer = index.writer(40_000_000).unwrap(); match index.writer(40_000_000) { - Err(TantivyError::LockFileAlreadyExists(_)) => {} + Err(TantivyError::LockFailure(_)) => {} _ => panic!("Expected FileAlreadyExists error"), } } @@ -671,7 +671,6 @@ mod tests { Err(err) => { let err_msg = err.to_string(); assert!(err_msg.contains("Lockfile")); - assert!(err_msg.contains("already exists")); assert!(err_msg.contains("Possible causes:")) }, _ => panic!("Expected LockfileAlreadyExists error"), diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index 783e787c8..3d29b38c0 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -16,6 +16,8 @@ mod segment_writer; mod stamper; pub(crate) use self::directory_lock::DirectoryLock; +pub use self::directory_lock::LockType; + pub use self::index_writer::IndexWriter; pub use self::log_merge_policy::LogMergePolicy; pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy}; diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index 18175c774..0e67d3b15 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -1,7 +1,7 @@ use super::segment_register::SegmentRegister; use core::SegmentId; use core::SegmentMeta; -use core::{LOCKFILE_FILEPATH, META_FILEPATH}; +use core::META_FILEPATH; use error::TantivyError; use indexer::delete_queue::DeleteCursor; use indexer::SegmentEntry; @@ -78,10 +78,13 @@ impl SegmentManager { registers_lock.committed.len() + registers_lock.uncommitted.len() } + /// List the files that are useful to the index. + /// + /// This does not include lock files, or files that are obsolete + /// but have not yet been deleted by the garbage collector. pub fn list_files(&self) -> HashSet { let mut files = HashSet::new(); files.insert(META_FILEPATH.clone()); - files.insert(LOCKFILE_FILEPATH.clone()); for segment_meta in SegmentMeta::all() { files.extend(segment_meta.list_files()); } From c64972e03966671c5e3983fe018758d20b9b32e9 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 5 Sep 2018 09:43:56 +0900 Subject: [PATCH 33/62] Apply unicode lowercasing. (#408) Checks if the str is ASCII, and uses a fast track if it is the case. If not, the std's definition of a lowercase character. Closes #406 --- src/tokenizer/lower_caser.rs | 66 ++++++++++++++++++++++++++++++++---- 1 file changed, 60 insertions(+), 6 deletions(-) diff --git a/src/tokenizer/lower_caser.rs b/src/tokenizer/lower_caser.rs index ebade3978..578678a4a 100644 --- a/src/tokenizer/lower_caser.rs +++ b/src/tokenizer/lower_caser.rs @@ -1,4 +1,5 @@ use super::{Token, TokenFilter, TokenStream}; +use std::mem; /// Token filter that lowercase terms. #[derive(Clone)] @@ -15,13 +16,22 @@ where } } -pub struct LowerCaserTokenStream -where - TailTokenStream: TokenStream, -{ +pub struct LowerCaserTokenStream { + buffer: String, tail: TailTokenStream, } +// writes a lowercased version of text into output. +fn to_lowercase_unicode(text: &mut String, output: &mut String) { + output.clear(); + for c in text.chars() { + // Contrary to the std, we do not take care of sigma special case. + // This will have an normalizationo effect, which is ok for search. + output.extend(c.to_lowercase()); + } +} + + impl TokenStream for LowerCaserTokenStream where TailTokenStream: TokenStream, @@ -36,7 +46,14 @@ where fn advance(&mut self) -> bool { if self.tail.advance() { - self.tail.token_mut().text.make_ascii_lowercase(); + if self.token_mut().text.is_ascii() { + // fast track for ascii. + self.token_mut().text.make_ascii_lowercase(); + } else { + to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer); + + mem::swap(&mut self.tail.token_mut().text, &mut self.buffer); + } true } else { false @@ -49,6 +66,43 @@ where TailTokenStream: TokenStream, { fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream { - LowerCaserTokenStream { tail } + LowerCaserTokenStream { + tail, + buffer: String::with_capacity(100) + } } } + +#[cfg(test)] +mod tests { + use tokenizer::Tokenizer; + use tokenizer::LowerCaser; + use tokenizer::TokenStream; + use tokenizer::SimpleTokenizer; + + #[test] + fn test_to_lower_case() { + assert_eq!(lowercase_helper("Русский текст"), + vec!["русский".to_string(), "текст".to_string()]); + } + + fn lowercase_helper(text: &str) -> Vec { + let mut tokens = vec![]; + let mut token_stream = SimpleTokenizer + .filter(LowerCaser) + .token_stream(text); + while token_stream.advance() { + let token_text = token_stream.token().text.clone(); + tokens.push(token_text); + } + tokens + } + + + #[test] + fn test_lowercaser() { + assert_eq!(lowercase_helper("Tree"), vec!["tree".to_string()]); + assert_eq!(lowercase_helper("Русский"), vec!["русский".to_string()]); + } + +} \ No newline at end of file From cec9956a01d922f6ce2e5dee3da003512597e3ed Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 6 Sep 2018 10:10:40 +0900 Subject: [PATCH 34/62] Issue/389 (#405) * Setting up the dependency. * Completed README --- Cargo.toml | 11 +++++- README.md | 4 ++ appveyor.yml | 4 +- ci/script.sh | 2 +- run-tests.sh | 2 + src/common/bitset.rs | 6 +-- src/core/segment_reader.rs | 3 ++ src/directory/ram_directory.rs | 3 ++ src/fastfield/mod.rs | 2 +- src/indexer/index_writer.rs | 71 +++++++++++++++++++++++++--------- src/lib.rs | 4 ++ 11 files changed, 86 insertions(+), 26 deletions(-) create mode 100755 run-tests.sh diff --git a/Cargo.toml b/Cargo.toml index ab767d3fd..098ab91c4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,7 @@ census = "0.1" fnv = "1.0.6" owned-read = "0.4" failure = "0.1" +fail = "0.2" [target.'cfg(windows)'.dependencies] winapi = "0.2" @@ -60,12 +61,20 @@ opt-level = 3 debug = false lto = true debug-assertions = false +overflow-checks = false + +[profile.test] +debug-assertions = true +overflow-checks = true [features] -default = ["mmap"] +# by default no-fail is disabled. We manually enable it when running test. +default = ["mmap", "no_fail"] mmap = ["fst/mmap", "atomicwrites"] lz4-compression = ["lz4"] +no_fail = ["fail/no_fail"] [badges] travis-ci = { repository = "tantivy-search/tantivy" } + diff --git a/README.md b/README.md index 499a12464..0ce522a7c 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,10 @@ To check out and run tests, you can simply run : cd tantivy cargo build +## Running tests + +Some tests will not run with just `cargo test` because of `fail-rs`. +To run the tests exhaustively, run `./run-tests.sh`. # Contribute diff --git a/appveyor.yml b/appveyor.yml index a3bd2ac04..685b04d3a 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -18,5 +18,5 @@ install: build: false test_script: - - REM SET RUST_LOG=tantivy,test & cargo test --verbose - - REM SET RUST_BACKTRACE=1 & cargo build --examples \ No newline at end of file + - REM SET RUST_LOG=tantivy,test & cargo test --verbose --no-default-features --features mmap -- --test-threads 1 + - REM SET RUST_BACKTRACE=1 & cargo build --examples diff --git a/ci/script.sh b/ci/script.sh index b56345753..0939344b0 100644 --- a/ci/script.sh +++ b/ci/script.sh @@ -16,7 +16,7 @@ main() { return fi echo "Test" - cross test --target $TARGET + cross test --target $TARGET --no-default-features --features mmap -- --test-threads 1 fi for example in $(ls examples/*.rs) do diff --git a/run-tests.sh b/run-tests.sh new file mode 100755 index 000000000..fc2944dd5 --- /dev/null +++ b/run-tests.sh @@ -0,0 +1,2 @@ +#!/bin/bash +cargo test --no-default-features --features mmap -- --test-threads 1 diff --git a/src/common/bitset.rs b/src/common/bitset.rs index 73f03c4f5..326e7cee8 100644 --- a/src/common/bitset.rs +++ b/src/common/bitset.rs @@ -266,14 +266,14 @@ mod tests { #[test] fn test_bitset_large() { - let arr = generate_nonunique_unsorted(1_000_000, 50_000); + let arr = generate_nonunique_unsorted(100_000, 5_000); let mut btreeset: BTreeSet = BTreeSet::new(); - let mut bitset = BitSet::with_max_value(1_000_000); + let mut bitset = BitSet::with_max_value(100_000); for el in arr { btreeset.insert(el); bitset.insert(el); } - for i in 0..1_000_000 { + for i in 0..100_000 { assert_eq!(btreeset.contains(&i), bitset.contains(i)); } assert_eq!(btreeset.len(), bitset.len()); diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 37b950332..517e153f6 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -225,6 +225,8 @@ impl SegmentReader { let store_source = segment.open_read(SegmentComponent::STORE)?; let store_reader = StoreReader::from_source(store_source); + fail_point!("SegmentReader::open#middle"); + let postings_source = segment.open_read(SegmentComponent::POSTINGS)?; let postings_composite = CompositeFile::open(&postings_source)?; @@ -432,6 +434,7 @@ mod test { use schema::{SchemaBuilder, Term, STORED, TEXT}; use DocId; + #[test] fn test_alive_docs_iterator() { let mut schema_builder = SchemaBuilder::new(); diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index 4e55da537..1b40970b4 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -195,6 +195,9 @@ impl Directory for RAMDirectory { } fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> { + fail_point!("RAMDirectory::atomic_write", |msg| { + Err(io::Error::new(io::ErrorKind::Other, msg.unwrap_or("Undefined".to_string()))) + }); let path_buf = PathBuf::from(path); let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone()); self.fs.write(path_buf, &Vec::new())?; diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index e3599bacf..fdb029432 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -370,7 +370,7 @@ mod tests { pub fn generate_permutation() -> Vec { let seed: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; let mut rng = XorShiftRng::from_seed(seed); - let mut permutation: Vec = (0u64..1_000_000u64).collect(); + let mut permutation: Vec = (0u64..100_000u64).collect(); rng.shuffle(&mut permutation); permutation } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 5af4ed694..3e11c4ce5 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -301,25 +301,31 @@ fn index_documents( let last_docstamp: u64 = *(doc_opstamps.last().unwrap()); - let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps); - let segment_reader = SegmentReader::open(segment)?; - let mut deleted_bitset = BitSet::with_capacity(num_docs as usize); - let may_have_deletes = compute_deleted_bitset( - &mut deleted_bitset, - &segment_reader, - &mut delete_cursor, - &doc_to_opstamps, - last_docstamp, - )?; - - let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, { - if may_have_deletes { - Some(deleted_bitset) - } else { - None - } - }); + let segment_entry: SegmentEntry; + if delete_cursor.get().is_some() { + let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps); + let segment_reader = SegmentReader::open(segment)?; + let mut deleted_bitset = BitSet::with_capacity(num_docs as usize); + let may_have_deletes = compute_deleted_bitset( + &mut deleted_bitset, + &segment_reader, + &mut delete_cursor, + &doc_to_opstamps, + last_docstamp, + )?; + segment_entry = SegmentEntry::new(segment_meta, delete_cursor, { + if may_have_deletes { + Some(deleted_bitset) + } else { + None + } + }); + } else { + // if there are no delete operation in the queue, no need + // to even open the segment. + segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None); + } Ok(segment_updater.add_segment(generation, segment_entry)) } @@ -858,4 +864,33 @@ mod tests { assert_eq!(initial_table_size(1_000_000_000), 19); } + + #[cfg(not(feature="no_fail"))] + #[test] + fn test_write_commit_fails() { + use fail; + let mut schema_builder = schema::SchemaBuilder::default(); + let text_field = schema_builder.add_text_field("text", schema::TEXT); + let index = Index::create_in_ram(schema_builder.build()); + + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + for _ in 0..100 { + index_writer.add_document(doc!(text_field => "a")); + } + index_writer.commit().unwrap(); + fail::cfg("RAMDirectory::atomic_write", "return(error_write_failed)").unwrap(); + for _ in 0..100 { + index_writer.add_document(doc!(text_field => "b")); + } + assert!(index_writer.commit().is_err()); + index.load_searchers().unwrap(); + let num_docs_containing = |s: &str| { + let searcher = index.searcher(); + let term_a = Term::from_field_text(text_field, s); + searcher.doc_freq(&term_a) + }; + assert_eq!(num_docs_containing("a"), 100); + assert_eq!(num_docs_containing("b"), 0); + fail::cfg("RAMDirectory::atomic_write", "off").unwrap(); + } } diff --git a/src/lib.rs b/src/lib.rs index 985d68a84..e5a75cd64 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -173,6 +173,9 @@ extern crate tinysegmenter; #[macro_use] extern crate downcast; +#[macro_use] +extern crate fail; + #[cfg(test)] mod functional_test; @@ -946,3 +949,4 @@ mod tests { } } } + From 98c7fbdc6f37469a981c3a3555727debc6780ca7 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 6 Sep 2018 10:11:54 +0900 Subject: [PATCH 35/62] Issue/378 (#392) * Added failing unit test * Closes #378. Handling queries that end up empty after going through the analyzer. * Fixed stop word example --- examples/stop_words.rs | 64 +++++++++----------- src/query/query_parser/query_parser.rs | 84 +++++++++++++++++++------- 2 files changed, 89 insertions(+), 59 deletions(-) diff --git a/examples/stop_words.rs b/examples/stop_words.rs index 950a42afd..b131d876c 100644 --- a/examples/stop_words.rs +++ b/examples/stop_words.rs @@ -23,7 +23,6 @@ use tantivy::Index; fn main() -> tantivy::Result<()> { // this example assumes you understand the content in `basic_search` - let index_path = TempDir::new("tantivy_stopwords_example_dir")?; let mut schema_builder = SchemaBuilder::default(); // This configures your custom options for how tantivy will @@ -31,36 +30,36 @@ fn main() -> tantivy::Result<()> { // to note is that we are setting the tokenizer to `stoppy` // which will be defined and registered below. let text_field_indexing = TextFieldIndexing::default() - .set_tokenizer("stoppy") - .set_index_option(IndexRecordOption::WithFreqsAndPositions); + .set_tokenizer("stoppy") + .set_index_option(IndexRecordOption::WithFreqsAndPositions); let text_options = TextOptions::default() - .set_indexing_options(text_field_indexing) - .set_stored(); + .set_indexing_options(text_field_indexing) + .set_stored(); // Our first field is title. schema_builder.add_text_field("title", text_options); // Our second field is body. let text_field_indexing = TextFieldIndexing::default() - .set_tokenizer("stoppy") - .set_index_option(IndexRecordOption::WithFreqsAndPositions); + .set_tokenizer("stoppy") + .set_index_option(IndexRecordOption::WithFreqsAndPositions); let text_options = TextOptions::default() - .set_indexing_options(text_field_indexing) - .set_stored(); + .set_indexing_options(text_field_indexing) + .set_stored(); schema_builder.add_text_field("body", text_options); let schema = schema_builder.build(); - let index = Index::create_in_dir(&index_path, schema.clone())?; + let index = Index::create_in_ram(schema.clone()); // This tokenizer lowers all of the text (to help with stop word matching) // then removes all instances of `the` and `and` from the corpus let tokenizer = SimpleTokenizer - .filter(LowerCaser) - .filter(StopWordFilter::remove(vec![ - "the".to_string(), - "and".to_string(), - ])); + .filter(LowerCaser) + .filter(StopWordFilter::remove(vec![ + "the".to_string(), + "and".to_string(), + ])); index.tokenizers().register("stoppy", tokenizer); @@ -76,16 +75,16 @@ fn main() -> tantivy::Result<()> { )); index_writer.add_document(doc!( - title => "Of Mice and Men", - body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ - bank and runs deep and green. The water is warm too, for it has slipped twinkling \ - over the yellow sands in the sunlight before reaching the narrow pool. On one \ - side of the river the golden foothill slopes curve up to the strong and rocky \ - Gabilan Mountains, but on the valley side the water is lined with trees—willows \ - fresh and green with every spring, carrying in their lower leaf junctures the \ - debris of the winter’s flooding; and sycamores with mottled, white, recumbent \ - limbs and branches that arch over the pool" - )); + title => "Of Mice and Men", + body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ + bank and runs deep and green. The water is warm too, for it has slipped twinkling \ + over the yellow sands in the sunlight before reaching the narrow pool. On one \ + side of the river the golden foothill slopes curve up to the strong and rocky \ + Gabilan Mountains, but on the valley side the water is lined with trees—willows \ + fresh and green with every spring, carrying in their lower leaf junctures the \ + debris of the winter’s flooding; and sycamores with mottled, white, recumbent \ + limbs and branches that arch over the pool" + )); index_writer.add_document(doc!( title => "Frankenstein", @@ -103,14 +102,9 @@ fn main() -> tantivy::Result<()> { let query_parser = QueryParser::for_index(&index, vec![title, body]); - // this will have NO hits because it was filtered out - // because the query is run through the analyzer you - // actually will get an error here because the query becomes - // empty - assert!(query_parser.parse_query("the").is_err()); - - // this will have hits - let query = query_parser.parse_query("is")?; + // stop words are applied on the query as well. + // The following will be equivalent to `title:frankenstein` + let query = query_parser.parse_query("title:\"the Frankenstein\"")?; let mut top_collector = TopCollector::with_limit(10); @@ -124,6 +118,4 @@ fn main() -> tantivy::Result<()> { } Ok(()) -} - -use tempdir::TempDir; +} \ No newline at end of file diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index f3a9f37c0..93deb48c1 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -20,6 +20,7 @@ use std::str::FromStr; use tokenizer::TokenizerManager; use combine::Parser; use query::EmptyQuery; +use query::query_parser::logical_ast::LogicalAST; /// Possible error that may happen when parsing a query. @@ -58,6 +59,27 @@ impl From for QueryParserError { } } + +/// Recursively remove empty clause from the AST +/// +/// Returns `None` iff the `logical_ast` ended up being empty. +fn trim_ast(logical_ast: LogicalAST) -> Option { + match logical_ast { + LogicalAST::Clause(children) => { + let trimmed_children = children.into_iter() + .flat_map(|(occur, child)| + trim_ast(child).map(|trimmed_child| (occur, trimmed_child)) ) + .collect::>(); + if trimmed_children.is_empty() { + None + } else { + Some(LogicalAST::Clause(trimmed_children)) + } + }, + _ => Some(logical_ast), + } +} + /// Tantivy's Query parser /// /// The language covered by the current parser is extremely simple. @@ -369,14 +391,15 @@ impl QueryParser { asts.push(LogicalAST::Leaf(Box::new(ast))); } } - let result_ast = if asts.is_empty() { - // this should never happen - return Err(QueryParserError::SyntaxError); - } else if asts.len() == 1 { - asts[0].clone() - } else { - LogicalAST::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect()) - }; + let result_ast: LogicalAST = + if asts.len() == 1 { + asts.into_iter().next().unwrap() + } else { + LogicalAST::Clause( + asts.into_iter() + .map(|ast| (Occur::Should, ast)) + .collect()) + }; Ok(result_ast) } UserInputLeaf::All => { @@ -429,19 +452,17 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box { } fn convert_to_query(logical_ast: LogicalAST) -> Box { - match logical_ast { - LogicalAST::Clause(clause) => { - if clause.is_empty() { - Box::new(EmptyQuery) - } else { - let occur_subqueries = clause - .into_iter() - .map(|(occur, subquery)| (occur, convert_to_query(subquery))) - .collect::>(); - Box::new(BooleanQuery::from(occur_subqueries)) - } - } - LogicalAST::Leaf(logical_literal) => convert_literal_to_query(*logical_literal), + match trim_ast(logical_ast) { + Some(LogicalAST::Clause(trimmed_clause)) => { + let occur_subqueries = trimmed_clause + .into_iter() + .map(|(occur, subquery)| (occur, convert_to_query(subquery))) + .collect::>(); + assert!(!occur_subqueries.is_empty(), "Should not be empty after trimming"); + Box::new(BooleanQuery::from(occur_subqueries)) + }, + Some(LogicalAST::Leaf(trimmed_logical_literal)) => convert_literal_to_query(*trimmed_logical_literal), + None => Box::new(EmptyQuery) } } @@ -454,12 +475,17 @@ mod test { use schema::Field; use schema::{IndexRecordOption, TextFieldIndexing, TextOptions}; use schema::{SchemaBuilder, Term, INT_INDEXED, STORED, STRING, TEXT}; - use tokenizer::SimpleTokenizer; - use tokenizer::TokenizerManager; + use tokenizer::{Tokenizer, SimpleTokenizer, LowerCaser, StopWordFilter, TokenizerManager}; use Index; fn make_query_parser() -> QueryParser { let mut schema_builder = SchemaBuilder::default(); + let text_field_indexing = TextFieldIndexing::default() + .set_tokenizer("en_with_stop_words") + .set_index_option(IndexRecordOption::WithFreqsAndPositions); + let text_options = TextOptions::default() + .set_indexing_options(text_field_indexing) + .set_stored(); let title = schema_builder.add_text_field("title", TEXT); let text = schema_builder.add_text_field("text", TEXT); schema_builder.add_i64_field("signed", INT_INDEXED); @@ -468,9 +494,14 @@ mod test { schema_builder.add_text_field("notindexed_u64", STORED); schema_builder.add_text_field("notindexed_i64", STORED); schema_builder.add_text_field("nottokenized", STRING); + schema_builder.add_text_field("with_stop_words", text_options); let schema = schema_builder.build(); let default_fields = vec![title, text]; let tokenizer_manager = TokenizerManager::default(); + tokenizer_manager.register("en_with_stop_words", SimpleTokenizer + .filter(LowerCaser) + .filter(StopWordFilter::remove(vec!["the".to_string()])) + ); QueryParser::new(schema, default_fields, tokenizer_manager) } @@ -739,6 +770,13 @@ mod test { ); } + #[test] + pub fn test_query_parser_not_empty_but_no_tokens() { + let query_parser = make_query_parser(); + assert!(query_parser.parse_query(" !, ").is_ok()); + assert!(query_parser.parse_query("with_stop_words:the").is_ok()); + } + #[test] pub fn test_parse_query_to_ast_conjunction() { test_parse_query_to_logical_ast_helper( From 934933582e5f6a9e3a8c3c899b9c1cd557f30d2c Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 6 Sep 2018 10:12:26 +0900 Subject: [PATCH 36/62] Closes #402 (#403) --- src/core/segment_reader.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 517e153f6..56a3a7b9e 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -4,7 +4,6 @@ use core::InvertedIndexReader; use core::Segment; use core::SegmentComponent; use core::SegmentId; -use core::SegmentMeta; use error::TantivyError; use fastfield::DeleteBitSet; use fastfield::FacetReader; @@ -44,7 +43,8 @@ pub struct SegmentReader { inv_idx_reader_cache: Arc>>>, segment_id: SegmentId, - segment_meta: SegmentMeta, + max_doc: DocId, + num_docs: DocId, termdict_composite: CompositeFile, postings_composite: CompositeFile, @@ -64,7 +64,7 @@ impl SegmentReader { /// Today, `tantivy` does not handle deletes, so it happens /// to also be the number of documents in the index. pub fn max_doc(&self) -> DocId { - self.segment_meta.max_doc() + self.max_doc } /// Returns the number of documents. @@ -73,7 +73,7 @@ impl SegmentReader { /// Today, `tantivy` does not handle deletes so max doc and /// num_docs are the same. pub fn num_docs(&self) -> DocId { - self.segment_meta.num_docs() + self.num_docs } /// Returns the schema of the index this segment belongs to. @@ -262,7 +262,8 @@ impl SegmentReader { let schema = segment.schema(); Ok(SegmentReader { inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())), - segment_meta: segment.meta().clone(), + max_doc: segment.meta().max_doc(), + num_docs: segment.meta().num_docs(), termdict_composite, postings_composite, fast_fields_composite, From 1d439e96f5e8845a79d83f3019e5bae33666b291 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 7 Sep 2018 08:43:44 +0900 Subject: [PATCH 37/62] Using sort unstable by key. --- src/postings/postings_writer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index fe56795e7..d90b322d9 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -98,7 +98,7 @@ impl MultiFieldPostingsWriter { .iter() .map(|(term_bytes, addr, bucket_id)| (term_bytes, addr, bucket_id as UnorderedTermId)) .collect(); - term_offsets.sort_by_key(|&(k, _, _)| k); + term_offsets.sort_unstable_by_key(|&(k, _, _)| k); let mut offsets: Vec<(Field, usize)> = vec![]; let term_offsets_it = term_offsets From 9101bf575343926256830ddfd9aa1b80004ab637 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 7 Sep 2018 09:57:12 +0900 Subject: [PATCH 38/62] Fragments --- src/core/index.rs | 21 +++++++++++- src/core/searcher.rs | 10 +++++- src/query/automaton_weight.rs | 2 +- src/query/term_query/term_weight.rs | 2 +- src/query/weight.rs | 13 +++++--- src/snippet/mod.rs | 50 +++++++++++++++++++++++------ 6 files changed, 80 insertions(+), 18 deletions(-) diff --git a/src/core/index.rs b/src/core/index.rs index efdfedc5f..c6f465eef 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -29,6 +29,9 @@ use num_cpus; use std::path::Path; use tokenizer::TokenizerManager; use IndexWriter; +use schema::FieldType; +use schema::Field; +use tokenizer::BoxedTokenizer; fn load_metas(directory: &Directory) -> Result { let meta_data = directory.atomic_read(&META_FILEPATH)?; @@ -113,6 +116,22 @@ impl Index { &self.tokenizers } + pub fn tokenizer_for_field(&self, field: Field) -> Option> { + let field_type = self.schema.get_field_entry(field).field_type(); + let tokenizer: &TokenizerManager = self.tokenizers(); + match field_type { + FieldType::Str(text_options) => { + text_options.get_indexing_options() + .map(|text_indexing_options| text_indexing_options.tokenizer()) + .and_then(|tokenizer_name| tokenizer.get(tokenizer_name)) + + }, + _ => { + None + } + } + } + /// Opens a new directory from an index path. #[cfg(feature = "mmap")] pub fn open_in_dir>(directory_path: P) -> Result { @@ -257,7 +276,7 @@ impl Index { let schema = self.schema(); let num_searchers: usize = self.num_searchers.load(Ordering::Acquire); let searchers = (0..num_searchers) - .map(|_| Searcher::new(schema.clone(), segment_readers.clone())) + .map(|_| Searcher::new(schema.clone(), self.clone(), segment_readers.clone())) .collect(); self.searcher_pool.publish_new_generation(searchers); Ok(()) diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 8f36b58ea..9de6c857c 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -10,6 +10,7 @@ use std::sync::Arc; use termdict::TermMerger; use DocAddress; use Result; +use Index; /// Holds a list of `SegmentReader`s ready for search. /// @@ -18,17 +19,24 @@ use Result; /// pub struct Searcher { schema: Schema, + index: Index, segment_readers: Vec, } impl Searcher { /// Creates a new `Searcher` - pub(crate) fn new(schema: Schema, segment_readers: Vec) -> Searcher { + pub(crate) fn new(schema: Schema, index: Index, segment_readers: Vec) -> Searcher { Searcher { schema, + index, segment_readers, } } + + pub fn index(&self) -> &Index { + &self.index + } + /// Fetches a document from tantivy's store given a `DocAddress`. /// /// The searcher uses the segment ordinal to route the diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index 54f8c5f8b..854ecb66e 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -75,7 +75,7 @@ where inverted_index.terms().ord_to_term(term_ord, &mut term_buffer); let term = Term::from_field_bytes(self.field, &term_buffer[..]); for &doc_id in &docs_matching_current_term { - matching_terms.add_term(doc_id, term.clone()); + matching_terms.add_term(doc_id, term.clone(), 1f32); } } } diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index 1a9075b5a..aa1b5e456 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -49,7 +49,7 @@ impl Weight for TermWeight { for doc_id in doc_ids { match scorer.skip_next(doc_id) { SkipResult::Reached => { - matching_terms.add_term(doc_id, self.term.clone()); + matching_terms.add_term(doc_id, self.term.clone(), 1f32); } SkipResult::OverStep => {} SkipResult::End => { diff --git a/src/query/weight.rs b/src/query/weight.rs index 5b603ab1c..8a12c01da 100644 --- a/src/query/weight.rs +++ b/src/query/weight.rs @@ -5,9 +5,10 @@ use DocId; use std::collections::HashSet; use Term; use std::collections::BTreeMap; +use std::collections::HashMap; pub struct MatchingTerms { - doc_to_terms: BTreeMap> + doc_to_terms: BTreeMap> } impl MatchingTerms { @@ -16,18 +17,22 @@ impl MatchingTerms { doc_to_terms: doc_ids .iter() .cloned() - .map(|doc_id| (doc_id, HashSet::default())) + .map(|doc_id| (doc_id, HashMap::default())) .collect() } } + pub fn terms_for_doc(&self, doc_id: DocId) -> Option<&HashMap> { + self.doc_to_terms.get(&doc_id) + } + pub fn sorted_doc_ids(&self) -> Vec { self.doc_to_terms.keys().cloned().collect() } - pub fn add_term(&mut self, doc_id: DocId, term: Term) { + pub fn add_term(&mut self, doc_id: DocId, term: Term, score: f32) { if let Some(terms) = self.doc_to_terms.get_mut(&doc_id) { - terms.insert(term); + terms.insert(term, score); } } } diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index cd194e0d8..c82777782 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -12,6 +12,9 @@ use DocAddress; use DocId; use Searcher; use query::MatchingTerms; +use schema::Field; +use std::collections::HashMap; +use SegmentLocalId; #[derive(Debug)] pub struct HighlightSection { @@ -189,28 +192,55 @@ fn select_best_fragment_combination<'a>( -fn matching_terms(query: &Query, searcher: &Searcher, doc_addresses: &[DocAddress]) -> Result<()> { +fn compute_matching_terms(query: &Query, searcher: &Searcher, doc_addresses: &[DocAddress]) -> Result> { let weight = query.weight(searcher, false)?; let mut doc_groups = doc_addresses .iter() .group_by(|doc_address| doc_address.0); + let mut matching_terms_per_segment: HashMap = HashMap::new(); for (segment_ord, doc_addrs) in doc_groups.into_iter() { let doc_addrs_vec: Vec = doc_addrs.map(|doc_addr| doc_addr.1).collect(); let mut matching_terms = MatchingTerms::from_doc_ids(&doc_addrs_vec[..]); let segment_reader = searcher.segment_reader(segment_ord); weight.matching_terms(segment_reader, &mut matching_terms)?; + matching_terms_per_segment.insert(segment_ord, matching_terms); } - let terms = HashSet<(DocId, Vec)>; - Ok(()) + Ok(matching_terms_per_segment) } -pub fn generate_snippet<'a>( - doc: &'a [DocAddress], - index: &Index, +pub fn generate_snippet( + doc_addresses: &[DocAddress], + fields: &[Field], + searcher: &Searcher, query: &Query, - terms: Vec, - max_num_chars: usize) -> Snippet { - search_fragments(boxed_tokenizer, &text, terms, 3); + max_num_chars: usize) -> Result> { + // TODO sort doc_addresses + let matching_terms_per_segment_local_id = compute_matching_terms(query, searcher, doc_addresses)?; + for doc_address in doc_addresses { + let doc = searcher.doc(doc_address)?; + for &field in fields { + let mut text = String::new(); + for value in doc.get_all(field) { + text.push_str(value.text()); + } + if let Some(tokenizer) = searcher.index().tokenizer_for_field(field) { + if let Some(matching_terms) = matching_terms_per_segment_local_id.get(&doc_address.segment_ord()) { + if let Some(terms) = matching_terms.terms_for_doc(doc_address.doc()) { + let terms: BTreeMap = terms + .iter() + .map(|(term, score)| (term.text().to_string(), *score)) + .collect(); + search_fragments(tokenizer, + &text, + terms, + max_num_chars); + } + } + } + } + } + // search_fragments(boxed_tokenizer, &text, terms, 3); + panic!("e"); } #[cfg(test)] @@ -346,7 +376,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl let text = "a b c d"; - let mut terms = BTreeMap::new(); + let terms = BTreeMap::new(); let fragments = search_fragments(boxed_tokenizer, &text, terms, 3); assert_eq!(fragments.len(), 0); From 2e44f0f09901664b91129189d4da02aa16537b78 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 9 Sep 2018 14:23:24 +0900 Subject: [PATCH 39/62] blop --- src/snippet/mod.rs | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index c82777782..65d50575c 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -209,32 +209,37 @@ fn compute_matching_terms(query: &Query, searcher: &Searcher, doc_addresses: &[D } pub fn generate_snippet( - doc_addresses: &[DocAddress], - fields: &[Field], searcher: &Searcher, + field: Field, query: &Query, + doc_addresses: &[DocAddress], max_num_chars: usize) -> Result> { + + let mut doc_address_ords: Vec = (0..doc_addresses.len()).collect(); + doc_address_ords.sort_by_key(|k| doc_addresses[*k]); + // TODO sort doc_addresses let matching_terms_per_segment_local_id = compute_matching_terms(query, searcher, doc_addresses)?; - for doc_address in doc_addresses { + for doc_address in doc_addresses { + let segment_ord: u32 = doc_address.segment_ord(); let doc = searcher.doc(doc_address)?; - for &field in fields { - let mut text = String::new(); - for value in doc.get_all(field) { - text.push_str(value.text()); - } + + let mut text = String::new(); + for value in doc.get_all(field) { + text.push_str(value.text()); + } + + if let Some(matching_terms) = matching_terms_per_segment_local_id.get(&segment_ord) { if let Some(tokenizer) = searcher.index().tokenizer_for_field(field) { - if let Some(matching_terms) = matching_terms_per_segment_local_id.get(&doc_address.segment_ord()) { - if let Some(terms) = matching_terms.terms_for_doc(doc_address.doc()) { - let terms: BTreeMap = terms - .iter() - .map(|(term, score)| (term.text().to_string(), *score)) - .collect(); - search_fragments(tokenizer, - &text, - terms, - max_num_chars); - } + if let Some(terms) = matching_terms.terms_for_doc(doc_address.doc()) { + let terms: BTreeMap = terms + .iter() + .map(|(term, score)| (term.text().to_string(), *score)) + .collect(); + let fragment_candidates = search_fragments(tokenizer, + &text, + terms, + max_num_chars); } } } From 7e5f697d0099081fa6d8aa1e89ed8f2cc1e9d771 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 9 Sep 2018 16:23:56 +0900 Subject: [PATCH 40/62] Closes #387 --- Cargo.toml | 1 - src/lib.rs | 2 - src/tokenizer/japanese_tokenizer.rs | 94 ----------------------------- src/tokenizer/mod.rs | 24 -------- src/tokenizer/tokenizer_manager.rs | 2 - 5 files changed, 123 deletions(-) delete mode 100644 src/tokenizer/japanese_tokenizer.rs diff --git a/Cargo.toml b/Cargo.toml index 098ab91c4..53a318fc2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,6 @@ keywords = ["search", "information", "retrieval"] base64 = "0.9.1" byteorder = "1.0" lazy_static = "1" -tinysegmenter = "0.1.0" regex = "1.0" fst = {version="0.3", default-features=false} fst-regex = { version="0.2" } diff --git a/src/lib.rs b/src/lib.rs index e5a75cd64..d6073eee1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -168,8 +168,6 @@ extern crate rand; #[cfg(all(test, feature = "unstable"))] extern crate test; -extern crate tinysegmenter; - #[macro_use] extern crate downcast; diff --git a/src/tokenizer/japanese_tokenizer.rs b/src/tokenizer/japanese_tokenizer.rs deleted file mode 100644 index 5b072e380..000000000 --- a/src/tokenizer/japanese_tokenizer.rs +++ /dev/null @@ -1,94 +0,0 @@ -use super::{Token, TokenStream, Tokenizer}; -use tinysegmenter; - -/// Simple japanese tokenizer based on the `tinysegmenter` crate. -#[derive(Clone)] -pub struct JapaneseTokenizer; - -#[derive(Eq, PartialEq)] -enum Cursor { - HasNotStarted, - Cursor(usize), - Terminated, -} - -pub struct JapaneseTokenizerStream { - tokens: Vec, - cursor: Cursor, -} - -impl<'a> Tokenizer<'a> for JapaneseTokenizer { - type TokenStreamImpl = JapaneseTokenizerStream; - - fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl { - let mut tokens = vec![]; - let mut offset_from; - let mut offset_to = 0; - for (pos, term) in tinysegmenter::tokenize(text).into_iter().enumerate() { - offset_from = offset_to; - offset_to = offset_from + term.len(); - if term.chars().all(char::is_alphanumeric) { - tokens.push(Token { - offset_from, - offset_to, - position: pos, - text: term, - position_length: 1 - }); - } - } - JapaneseTokenizerStream { - tokens, - cursor: Cursor::HasNotStarted, - } - } -} - -impl<'a> TokenStream for JapaneseTokenizerStream { - fn advance(&mut self) -> bool { - let new_cursor = match self.cursor { - Cursor::HasNotStarted => { - if self.tokens.is_empty() { - Cursor::Terminated - } else { - Cursor::Cursor(0) - } - } - Cursor::Cursor(pos) => { - let new_pos = pos + 1; - if new_pos >= self.tokens.len() { - Cursor::Terminated - } else { - Cursor::Cursor(new_pos) - } - } - Cursor::Terminated => Cursor::Terminated, - }; - self.cursor = new_cursor; - self.cursor != Cursor::Terminated - } - - fn token(&self) -> &Token { - match self.cursor { - Cursor::Terminated => { - panic!("You called .token(), after the end of the token stream has been reached"); - } - Cursor::Cursor(i) => &self.tokens[i], - Cursor::HasNotStarted => { - panic!("You called .token(), before having called `.advance()`."); - } - } - } - - fn token_mut(&mut self) -> &mut Token { - match self.cursor { - Cursor::Terminated => { - panic!("You called .token(), after the end of the token stream has been reached"); - } - Cursor::Cursor(i) => &mut self.tokens[i], - Cursor::HasNotStarted => { - panic!("You called .token(), before having called `.advance()`."); - } - } - } -} diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index fd0bfbbde..0b1c68339 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -130,7 +130,6 @@ //! mod alphanum_only; mod facet_tokenizer; -mod japanese_tokenizer; mod lower_caser; mod ngram_tokenizer; mod raw_tokenizer; @@ -144,7 +143,6 @@ mod tokenizer_manager; pub use self::alphanum_only::AlphaNumOnlyFilter; pub use self::facet_tokenizer::FacetTokenizer; -pub use self::japanese_tokenizer::JapaneseTokenizer; pub use self::lower_caser::LowerCaser; pub use self::ngram_tokenizer::NgramTokenizer; pub use self::raw_tokenizer::RawTokenizer; @@ -224,28 +222,6 @@ pub mod test { assert_token(&tokens[3], 3, "payer", 17, 22); } - #[test] - fn test_jp_tokenizer() { - let tokenizer_manager = TokenizerManager::default(); - let en_tokenizer = tokenizer_manager.get("ja").unwrap(); - - let mut tokens: Vec = vec![]; - { - let mut add_token = |token: &Token| { - tokens.push(token.clone()); - }; - en_tokenizer - .token_stream("野菜食べないとやばい!") - .process(&mut add_token); - } - assert_eq!(tokens.len(), 5); - assert_token(&tokens[0], 0, "野菜", 0, 6); - assert_token(&tokens[1], 1, "食べ", 6, 12); - assert_token(&tokens[2], 2, "ない", 12, 18); - assert_token(&tokens[3], 3, "と", 18, 21); - assert_token(&tokens[4], 4, "やばい", 21, 30); - } - #[test] fn test_ngram_tokenizer() { use super::{LowerCaser, NgramTokenizer}; diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs index cbb46af3b..981962a66 100644 --- a/src/tokenizer/tokenizer_manager.rs +++ b/src/tokenizer/tokenizer_manager.rs @@ -2,7 +2,6 @@ use std::collections::HashMap; use std::sync::{Arc, RwLock}; use tokenizer::tokenizer::box_tokenizer; use tokenizer::BoxedTokenizer; -use tokenizer::JapaneseTokenizer; use tokenizer::LowerCaser; use tokenizer::RawTokenizer; use tokenizer::RemoveLongFilter; @@ -74,7 +73,6 @@ impl Default for TokenizerManager { .filter(LowerCaser) .filter(Stemmer::new()), ); - manager.register("ja", JapaneseTokenizer.filter(RemoveLongFilter::limit(40))); manager } } From a78aa4c259dc77486d5653c1143d9a8e15d2fda5 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 9 Sep 2018 17:23:30 +0900 Subject: [PATCH 41/62] updating doc --- doc/src/SUMMARY.md | 1 + doc/src/avant-propos.md | 29 +++++++------ doc/src/basis.md | 81 +++++++++++++++++++++++++------------ doc/src/best_practise.md.rs | 0 doc/src/examples.md | 2 + 5 files changed, 74 insertions(+), 39 deletions(-) create mode 100644 doc/src/best_practise.md.rs diff --git a/doc/src/SUMMARY.md b/doc/src/SUMMARY.md index 76dd29748..a280d19b7 100644 --- a/doc/src/SUMMARY.md +++ b/doc/src/SUMMARY.md @@ -9,6 +9,7 @@ - [Facetting](./facetting.md) - [Innerworkings](./innerworkings.md) - [Inverted index](./inverted_index.md) +- [Best practise](./inverted_index.md) [Frequently Asked Questions](./faq.md) [Examples](./examples.md) diff --git a/doc/src/avant-propos.md b/doc/src/avant-propos.md index aa50cd02b..485afd178 100644 --- a/doc/src/avant-propos.md +++ b/doc/src/avant-propos.md @@ -2,8 +2,8 @@ > Tantivy is a **search** engine **library** for Rust. -If you are familiar with Lucene, tantivy is heavily inspired by Lucene's design and -they both have the same scope and targetted users. +If you are familiar with Lucene, it's an excellent approximation to consider tantivy as Lucene for rust. tantivy is heavily inspired by Lucene's design and +they both have the same scope and targetted use cases. If you are not familiar with Lucene, let's break down our little tagline. @@ -17,15 +17,18 @@ relevancy, collapsing, highlighting, spatial search. experience. But keep in mind this is just a toolbox. Which bring us to the second keyword... -- **Library** means that you will have to write code. tantivy is not an *all-in-one* server solution. - - Sometimes a functionality will not be available in tantivy because it is too specific to your use case. By design, tantivy should make it possible to extend - the available set of features using the existing rock-solid datastructures. +- **Library** means that you will have to write code. tantivy is not an *all-in-one* server solution like elastic search for instance. - Most frequently this will mean writing your own `Collector`, your own `Scorer` or your own - `Tokenizer/TokenFilter`... But some of your requirement may also be related to - architecture or operations. For instance, you may want to build a large corpus on Hadoop, - fine-tune the merge policy to keep your index sharded in a time-wise fashion, or you may want - to convert and existing index from a different format. - - Tantivy exposes its API to do all of these things. \ No newline at end of file + Sometimes a functionality will not be available in tantivy because it is too + specific to your use case. By design, tantivy should make it possible to extend + the available set of features using the existing rock-solid datastructures. + + Most frequently this will mean writing your own `Collector`, your own `Scorer` or your own + `TokenFilter`... Some of your requirements may also be related to + something closer to architecture or operations. For instance, you may + want to build a large corpus on Hadoop, fine-tune the merge policy to keep your + index sharded in a time-wise fashion, or you may want to convert and existing + index from a different format. + + Tantivy exposes a lot of low level API to do all of these things. + diff --git a/doc/src/basis.md b/doc/src/basis.md index e52615f6d..21dadb7fb 100644 --- a/doc/src/basis.md +++ b/doc/src/basis.md @@ -2,47 +2,76 @@ ## Straight from disk -By default, tantivy accesses its data using its `MMapDirectory`. -While this design has some downsides, this greatly simplifies the source code of tantivy, -and entirely delegates the caching to the OS. +Tantivy accesses its data using an abstracting trait called `Directory`. +In theory, one can come and override the data access logic. In practise, the +trait somewhat assumes that your data can be mapped to memory, and tantivy +seems deeply married to using `mmap` for its io [^1], and the only persisting +directory shipped with tantivy is the `MmapDirectory`. -`tantivy` works entirely (or almost) by directly reading the datastructures as they are layed on disk. -As a result, the act of opening an indexing does not involve loading different datastructures -from the disk into random access memory : starting a process, opening an index, and performing a query -can typically be done in a matter of milliseconds. +While this design has some downsides, this greatly simplifies the source code of +tantivy. Caching is also entirely delegated to the OS. -This is an interesting property for a command line search engine, or for some multi-tenant log search engine. -Spawning a new process for each new query can be a perfectly sensible solution in some use case. +`tantivy` works entirely (or almost) by directly reading the datastructures as they are layed on disk. As a result, the act of opening an indexing does not involve loading different datastructures from the disk into random access memory : starting a process, opening an index, and performing your first query can typically be done in a matter of milliseconds. + +This is an interesting property for a command line search engine, or for some multi-tenant log search engine : spawning a new process for each new query can be a perfectly sensible solution in some use case. In later chapters, we will discuss tantivy's inverted index data layout. -One key take away is that to achieve great performance, search indexes are extremely compact. +One key take away is that to achieve great performance, search indexes are extremely compact. Of course this is crucial to reduce IO, and ensure that as much of our index can sit in RAM. -Also, whenever possible the data is accessed sequentially. Of course, this is an amazing property when tantivy needs to access -the data from your spinning hard disk, but this is also a great property when working with `SSD` or `RAM`, -as it makes our read patterns very predictable for the CPU. +Also, whenever possible its data is accessed sequentially. Of course, this is an amazing property when tantivy needs to access the data from your spinning hard disk, but this is also +critical for performance, if your data is read from and an `SSD` or even already in your pagecache. ## Segments, and the log method -That kind compact layout comes at one cost: it prevents our datastructures from being dynamic. -In fact, a trait called `Directory` is in charge of abstracting all of tantivy's data access -and its API does not even allow editing these file once they are written. +That kind of compact layout comes at one cost: it prevents our datastructures from being dynamic. +In fact, the `Directory` trait does not even allow you to modify part of a file. To allow the addition / deletion of documents, and create the illusion that -your index is dynamic (i.e.: adding and deleting documents), tantivy uses a common database trick sometimes -referred to as the *log method*. +your index is dynamic (i.e.: adding and deleting documents), tantivy uses a common database trick sometimes referred to as the *log method*. -Let's forget about deletes for a moment. As you add documents, these documents are processed and stored in -a dedicated datastructure, in a `RAM` buffer. This datastructure is designed to be dynamic but -cannot be accessed for search. As you add documents, this buffer will reach its capacity and tantivy will -transparently stop adding document to it and start converting this datastructure to its final -read-only format on disk. Once written, an brand empty buffer is available to resume adding documents. +Let's forget about deletes for a moment. + +As you add documents, these documents are processed and stored in a dedicated datastructure, in a `RAM` buffer. This datastructure is not ready for search, but it is useful to receive your data and rearrange it very rapidly. + +As you add documents, this buffer will reach its capacity and tantivy will transparently stop adding document to it and start converting this datastructure to its final read-only format on disk. Once written, an brand empty buffer is available to resume adding documents. The resulting chunk of index obtained after this serialization is called a `Segment`. -> A segment is a self-contained atomic piece of index. It is identified with a UUID, and all of its files -are identified using the naming scheme : `.*`. +> A segment is a self-contained atomic piece of index. It is identified with a UUID, and all of its files are identified using the naming scheme : `.*`. + +Which brings us to the nature of a tantivy `Index`. + +> A tantivy `Index` is a collection of `Segments`. + +Physically, this really just means and index is a bunch of segment files in a given `Directory`, +linked together by a `meta.json` file. This transparency can become extremely handy +to get tantivy to fit your use case: + +*Example 1* You could for instance use hadoop to build a very large search index in a timely manner, copy all of the resulting segment files in the same directory and edit the `meta.json` to get a functional index.[^2] + +*Example 2* You could also disable your merge policy and enforce daily segments. Removing data after one week can then be done very efficiently by just editing the `meta.json` and deleting the files associated to segment `D-7`. -> A tantivy `Index` is a collection of `Segments`. \ No newline at end of file + + + +# Merging + +As you index more and more data, your index will accumulate more and more segments. +Having a lot of small segments is not really optimal. There is a bit of redundancy in having +all these term dictionary. Also when searching, we will need to do term lookups as many times as we have segments. It can hurt search performance a bit. + +That's where merging or compacting comes into place. Tantivy will continuously consider merge +opportunities and start merging segments in the background. + + +# Indexing throughput, number of indexing threads + + + + +[^1]: This may eventually change. + +[^2]: Be careful however. By default these files will not be considered as *managed* by tantivy. This means they will never be garbage collected by tantivy, regardless of whether they become obsolete or not. diff --git a/doc/src/best_practise.md.rs b/doc/src/best_practise.md.rs new file mode 100644 index 000000000..e69de29bb diff --git a/doc/src/examples.md b/doc/src/examples.md index df635b4e6..6ba4a8a4d 100644 --- a/doc/src/examples.md +++ b/doc/src/examples.md @@ -1 +1,3 @@ # Examples + +- [Basic search](/examples/basic_search.html) \ No newline at end of file From e32dba1a9747ee2ed68988d910c518d8e4318229 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 10 Sep 2018 09:26:33 +0900 Subject: [PATCH 42/62] Phrase weight --- src/query/automaton_weight.rs | 42 --------- src/query/boolean_query/boolean_query.rs | 8 ++ src/query/boolean_query/boolean_weight.rs | 9 -- src/query/mod.rs | 2 - src/query/phrase_query/phrase_query.rs | 7 ++ src/query/phrase_query/phrase_weight.rs | 5 - src/query/query.rs | 5 +- src/query/range_query.rs | 5 - src/query/term_query/term_query.rs | 4 + src/query/term_query/term_weight.rs | 21 ----- src/query/weight.rs | 34 ------- src/snippet/mod.rs | 110 ++++++++-------------- 12 files changed, 61 insertions(+), 191 deletions(-) diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index 854ecb66e..9ff7b8594 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -7,7 +7,6 @@ use query::{Scorer, Weight}; use schema::{Field, IndexRecordOption}; use termdict::{TermDictionary, TermStreamer}; use Result; -use query::weight::MatchingTerms; use SkipResult; use Term; use DocId; @@ -41,47 +40,6 @@ impl Weight for AutomatonWeight where A: Automaton, { - fn matching_terms(&self, - reader: &SegmentReader, - matching_terms: &mut MatchingTerms) -> Result<()> { - let max_doc = reader.max_doc(); - let mut doc_bitset = BitSet::with_max_value(max_doc); - - let inverted_index = reader.inverted_index(self.field); - let term_dict = inverted_index.terms(); - let mut term_stream = self.automaton_stream(term_dict); - - let doc_ids = matching_terms.sorted_doc_ids(); - let mut docs_matching_current_term: Vec = vec![]; - - let mut term_buffer: Vec = vec![]; - - while term_stream.advance() { - docs_matching_current_term.clear(); - let term_info = term_stream.value(); - let mut segment_postings = inverted_index.read_postings_from_terminfo(term_info, IndexRecordOption::Basic); - for &doc_id in &doc_ids { - match segment_postings.skip_next(doc_id) { - SkipResult::Reached => { - docs_matching_current_term.push(doc_id); - } - SkipResult::OverStep => {} - SkipResult::End => {} - } - } - if !docs_matching_current_term.is_empty() { - term_buffer.clear(); - let term_ord = term_stream.term_ord(); - inverted_index.terms().ord_to_term(term_ord, &mut term_buffer); - let term = Term::from_field_bytes(self.field, &term_buffer[..]); - for &doc_id in &docs_matching_current_term { - matching_terms.add_term(doc_id, term.clone(), 1f32); - } - } - } - Ok(()) - } - fn scorer(&self, reader: &SegmentReader) -> Result> { let max_doc = reader.max_doc(); let mut doc_bitset = BitSet::with_max_value(max_doc); diff --git a/src/query/boolean_query/boolean_query.rs b/src/query/boolean_query/boolean_query.rs index 286d9f449..b92a203eb 100644 --- a/src/query/boolean_query/boolean_query.rs +++ b/src/query/boolean_query/boolean_query.rs @@ -6,6 +6,7 @@ use query::Weight; use schema::IndexRecordOption; use schema::Term; use Result; +use std::collections::BTreeSet; use Searcher; /// The boolean query combines a set of queries @@ -40,6 +41,7 @@ impl From)>> for BooleanQuery { } impl Query for BooleanQuery { + fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result> { let sub_weights = self.subqueries .iter() @@ -49,6 +51,12 @@ impl Query for BooleanQuery { .collect::>()?; Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled))) } + + fn query_terms(&self, term_set: &mut BTreeSet) { + for (_occur, subquery) in &self.subqueries { + subquery.query_terms(term_set); + } + } } impl BooleanQuery { diff --git a/src/query/boolean_query/boolean_weight.rs b/src/query/boolean_query/boolean_weight.rs index 2b3348a21..575bc2991 100644 --- a/src/query/boolean_query/boolean_weight.rs +++ b/src/query/boolean_query/boolean_weight.rs @@ -13,7 +13,6 @@ use query::Weight; use std::borrow::Borrow; use std::collections::HashMap; use Result; -use query::MatchingTerms; fn scorer_union(scorers: Vec>) -> Box where @@ -108,14 +107,6 @@ impl BooleanWeight { } impl Weight for BooleanWeight { - - fn matching_terms(&self, reader: &SegmentReader, matching_terms: &mut MatchingTerms) -> Result<()> { - for (_, weight) in &self.weights { - weight.matching_terms(reader, matching_terms)?; - } - Ok(()) - } - fn scorer(&self, reader: &SegmentReader) -> Result> { if self.weights.is_empty() { Ok(Box::new(EmptyScorer)) diff --git a/src/query/mod.rs b/src/query/mod.rs index 0b6ee2adb..73a77174b 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -27,8 +27,6 @@ mod weight; mod vec_docset; pub(crate) mod score_combiner; -pub use self::weight::MatchingTerms; - pub use self::intersection::Intersection; pub use self::union::Union; diff --git a/src/query/phrase_query/phrase_query.rs b/src/query/phrase_query/phrase_query.rs index e501711ed..d103461c1 100644 --- a/src/query/phrase_query/phrase_query.rs +++ b/src/query/phrase_query/phrase_query.rs @@ -6,6 +6,7 @@ use query::Query; use query::Weight; use schema::{Field, Term}; use Result; +use std::collections::BTreeSet; /// `PhraseQuery` matches a specific sequence of words. /// @@ -107,4 +108,10 @@ impl Query for PhraseQuery { ))) } } + + fn query_terms(&self, term_set: &mut BTreeSet) { + for (_, query_term) in &self.phrase_terms { + term_set.insert(query_term.clone()); + } + } } diff --git a/src/query/phrase_query/phrase_weight.rs b/src/query/phrase_query/phrase_weight.rs index fbf43db20..69ab4e184 100644 --- a/src/query/phrase_query/phrase_weight.rs +++ b/src/query/phrase_query/phrase_weight.rs @@ -7,7 +7,6 @@ use query::Weight; use schema::IndexRecordOption; use schema::Term; use Result; -use query::MatchingTerms; pub struct PhraseWeight { phrase_terms: Vec<(usize, Term)>, @@ -32,10 +31,6 @@ impl PhraseWeight { impl Weight for PhraseWeight { - fn matching_terms(&self, reader: &SegmentReader, matching_terms: &mut MatchingTerms) -> Result<()> { - unimplemented!(); - } - fn scorer(&self, reader: &SegmentReader) -> Result> { let similarity_weight = self.similarity_weight.clone(); let field = self.phrase_terms[0].1.field(); diff --git a/src/query/query.rs b/src/query/query.rs index 7004768e4..a72c33d00 100644 --- a/src/query/query.rs +++ b/src/query/query.rs @@ -6,7 +6,8 @@ use std::fmt; use Result; use SegmentLocalId; use DocAddress; -use query::weight::MatchingTerms; +use std::collections::BTreeSet; +use Term; /// The `Query` trait defines a set of documents and a scoring method /// for those documents. @@ -60,6 +61,8 @@ pub trait Query: QueryClone + downcast::Any + fmt::Debug { Ok(result) } + fn query_terms(&self, term_set: &mut BTreeSet) {} + /// Search works as follows : /// /// First the weight object associated to the query is created. diff --git a/src/query/range_query.rs b/src/query/range_query.rs index 2b22e7cf8..06d98db66 100644 --- a/src/query/range_query.rs +++ b/src/query/range_query.rs @@ -11,7 +11,6 @@ use std::collections::Bound; use std::ops::Range; use termdict::{TermDictionary, TermStreamer}; use Result; -use query::MatchingTerms; fn map_bound TTo>( bound: &Bound, @@ -276,10 +275,6 @@ impl RangeWeight { impl Weight for RangeWeight { - fn matching_terms(&self, reader: &SegmentReader, matching_terms: &mut MatchingTerms) -> Result<()> { - unimplemented!(); - } - fn scorer(&self, reader: &SegmentReader) -> Result> { let max_doc = reader.max_doc(); let mut doc_bitset = BitSet::with_max_value(max_doc); diff --git a/src/query/term_query/term_query.rs b/src/query/term_query/term_query.rs index 9ba10b307..d6cd72288 100644 --- a/src/query/term_query/term_query.rs +++ b/src/query/term_query/term_query.rs @@ -6,6 +6,7 @@ use schema::IndexRecordOption; use Result; use Searcher; use Term; +use std::collections::BTreeSet; /// A Term query matches all of the documents /// containing a specific term. @@ -110,4 +111,7 @@ impl Query for TermQuery { fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result> { Ok(Box::new(self.specialized_weight(searcher, scoring_enabled))) } + fn query_terms(&self, term_set: &mut BTreeSet) { + term_set.insert(self.term.clone()); + } } diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index aa1b5e456..162abe519 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -9,7 +9,6 @@ use schema::IndexRecordOption; use Result; use Term; use SkipResult; -use query::weight::MatchingTerms; pub struct TermWeight { term: Term, @@ -40,26 +39,6 @@ impl Weight for TermWeight { } } - - fn matching_terms(&self, - reader: &SegmentReader, - matching_terms: &mut MatchingTerms) -> Result<()> { - let doc_ids = matching_terms.sorted_doc_ids(); - let mut scorer = self.scorer(reader)?; - for doc_id in doc_ids { - match scorer.skip_next(doc_id) { - SkipResult::Reached => { - matching_terms.add_term(doc_id, self.term.clone(), 1f32); - } - SkipResult::OverStep => {} - SkipResult::End => { - break; - } - } - } - Ok(()) - } - fn count(&self, reader: &SegmentReader) -> Result { if reader.num_deleted_docs() == 0 { let field = self.term.field(); diff --git a/src/query/weight.rs b/src/query/weight.rs index 8a12c01da..8bca9ad16 100644 --- a/src/query/weight.rs +++ b/src/query/weight.rs @@ -7,36 +7,6 @@ use Term; use std::collections::BTreeMap; use std::collections::HashMap; -pub struct MatchingTerms { - doc_to_terms: BTreeMap> -} - -impl MatchingTerms { - pub fn from_doc_ids(doc_ids: &[DocId]) -> MatchingTerms { - MatchingTerms { - doc_to_terms: doc_ids - .iter() - .cloned() - .map(|doc_id| (doc_id, HashMap::default())) - .collect() - } - } - - pub fn terms_for_doc(&self, doc_id: DocId) -> Option<&HashMap> { - self.doc_to_terms.get(&doc_id) - } - - pub fn sorted_doc_ids(&self) -> Vec { - self.doc_to_terms.keys().cloned().collect() - } - - pub fn add_term(&mut self, doc_id: DocId, term: Term, score: f32) { - if let Some(terms) = self.doc_to_terms.get_mut(&doc_id) { - terms.insert(term, score); - } - } -} - /// A Weight is the specialization of a Query /// for a given set of segments. /// @@ -46,10 +16,6 @@ pub trait Weight { /// See [`Query`](./trait.Query.html). fn scorer(&self, reader: &SegmentReader) -> Result>; - fn matching_terms(&self, reader: &SegmentReader, matching_terms: &mut MatchingTerms) -> Result<()> { - Ok(()) - } - /// Returns the number documents within the given `SegmentReader`. fn count(&self, reader: &SegmentReader) -> Result { Ok(self.scorer(reader)?.count()) diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index ffd6613e3..39d1ff89c 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -11,11 +11,11 @@ use query::Query; use DocAddress; use DocId; use Searcher; -use query::MatchingTerms; use schema::Field; use std::collections::HashMap; use SegmentLocalId; use error::TantivyError; +use std::collections::BTreeSet; #[derive(Debug)] pub struct HighlightSection { @@ -129,9 +129,9 @@ impl Snippet { /// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\ /// has to be a valid string. fn search_fragments<'a>( - tokenizer: Box, + tokenizer: &BoxedTokenizer, text: &'a str, - terms: BTreeMap, + terms: &BTreeMap, max_num_chars: usize, ) -> Vec { let mut token_stream = tokenizer.token_stream(text); @@ -199,75 +199,41 @@ fn select_best_fragment_combination<'a>( } +const DEFAULT_MAX_NUM_CHARS: usize = 150; - -fn compute_matching_terms(query: &Query, searcher: &Searcher, doc_addresses: &[DocAddress]) -> Result> { - let weight = query.weight(searcher, false)?; - let mut doc_groups = doc_addresses - .iter() - .group_by(|doc_address| doc_address.0); - let mut matching_terms_per_segment: HashMap = HashMap::new(); - for (segment_ord, doc_addrs) in doc_groups.into_iter() { - let doc_addrs_vec: Vec = doc_addrs.map(|doc_addr| doc_addr.1).collect(); - let mut matching_terms = MatchingTerms::from_doc_ids(&doc_addrs_vec[..]); - let segment_reader = searcher.segment_reader(segment_ord); - weight.matching_terms(segment_reader, &mut matching_terms)?; - matching_terms_per_segment.insert(segment_ord, matching_terms); - } - Ok(matching_terms_per_segment) +pub struct SnippetGenerator { + terms_text: BTreeMap, + tokenizer: Box, + max_num_chars: usize } -pub fn generate_snippet( - searcher: &Searcher, - query: &Query, - field: Field, - doc_addresses: &[DocAddress], - max_num_chars: usize) -> Result> { - - let mut doc_address_ords: Vec = (0..doc_addresses.len()).collect(); - doc_address_ords.sort_by_key(|k| doc_addresses[*k]); - - let mut snippets = vec![]; - let matching_terms_per_segment_local_id = compute_matching_terms(query, searcher, doc_addresses)?; - - for &doc_address_ord in &doc_address_ords { - let doc_address = doc_addresses[doc_address_ord]; - let segment_ord: u32 = doc_address.segment_ord(); - let doc = searcher.doc(&doc_address)?; - - let mut text = String::new(); - for value in doc.get_all(field) { - text.push_str(value.text()); - } - - - if let Some(matching_terms) = matching_terms_per_segment_local_id.get(&segment_ord) { - let tokenizer = searcher.index().tokenizer_for_field(field)?; - if let Some(terms) = matching_terms.terms_for_doc(doc_address.doc()) { - let terms: BTreeMap = terms - .iter() - .map(|(term, score)| (term.text().to_string(), *score)) - .collect(); - let fragment_candidates = search_fragments(tokenizer, - &text, - terms, - max_num_chars); - let snippet = select_best_fragment_combination(fragment_candidates, &text); - snippets.push(snippet); - } else { - snippets.push(Snippet::empty()); - } - } else { - - } +impl SnippetGenerator { + pub fn new(searcher: &Searcher, + query: &Query, + field: Field) -> Result { + let mut terms = BTreeSet::new(); + query.query_terms(&mut terms); + let terms_text: BTreeMap = terms.into_iter() + .filter(|term| term.field() == field) + .map(|term| (term.text().to_string(), 1f32)) + .collect(); + let tokenizer = searcher.index().tokenizer_for_field(field)?; + Ok(SnippetGenerator { + terms_text, + tokenizer, + max_num_chars: DEFAULT_MAX_NUM_CHARS + }) } - // reorder the snippets - for i in 0..doc_addresses.len() { - snippets.swap(i, doc_address_ords[i]); - } + pub fn snippet(&self, text: &str) -> Snippet { + let fragment_candidates = search_fragments(&*self.tokenizer, + &text, + &self.terms_text, + self.max_num_chars); + let snippet = select_best_fragment_combination(fragment_candidates, &text); + snippet - Ok(snippets) + } } #[cfg(test)] @@ -294,7 +260,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl terms.insert(String::from("rust"), 1.0); terms.insert(String::from("language"), 0.9); - let fragments = search_fragments(boxed_tokenizer, &text, terms, 100); + let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 100); assert_eq!(fragments.len(), 7); { let first = fragments.iter().nth(0).unwrap(); @@ -315,7 +281,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl let mut terms = BTreeMap::new(); terms.insert(String::from("c"), 1.0); - let fragments = search_fragments(boxed_tokenizer, &text, terms, 3); + let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3); assert_eq!(fragments.len(), 1); { @@ -339,7 +305,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl let mut terms = BTreeMap::new(); terms.insert(String::from("f"), 1.0); - let fragments = search_fragments(boxed_tokenizer, &text, terms, 3); + let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3); assert_eq!(fragments.len(), 2); { @@ -364,7 +330,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl terms.insert(String::from("f"), 1.0); terms.insert(String::from("a"), 0.9); - let fragments = search_fragments(boxed_tokenizer, &text, terms, 7); + let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 7); assert_eq!(fragments.len(), 2); { @@ -388,7 +354,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl let mut terms = BTreeMap::new(); terms.insert(String::from("z"), 1.0); - let fragments = search_fragments(boxed_tokenizer, &text, terms, 3); + let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3); assert_eq!(fragments.len(), 0); @@ -404,7 +370,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl let text = "a b c d"; let terms = BTreeMap::new(); - let fragments = search_fragments(boxed_tokenizer, &text, terms, 3); + let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3); assert_eq!(fragments.len(), 0); let snippet = select_best_fragment_combination(fragments, &text); From 644d8a3a10bd6ad292360562e6c6a302bb709f78 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 10 Sep 2018 16:39:45 +0900 Subject: [PATCH 43/62] Added snippet generator --- examples/snippet.rs | 10 ++- src/core/index.rs | 2 +- src/lib.rs | 1 + src/query/automaton_weight.rs | 4 -- src/query/query.rs | 3 +- src/query/term_query/term_weight.rs | 1 - src/query/weight.rs | 5 -- src/snippet/mod.rs | 104 +++++++++++++++++++--------- 8 files changed, 78 insertions(+), 52 deletions(-) diff --git a/examples/snippet.rs b/examples/snippet.rs index 4efea1e5a..3cded2bd1 100644 --- a/examples/snippet.rs +++ b/examples/snippet.rs @@ -14,6 +14,8 @@ use tantivy::collector::TopCollector; use tantivy::query::QueryParser; use tantivy::schema::*; use tantivy::Index; +use tantivy::SnippetGenerator; +use tempdir::TempDir; fn main() -> tantivy::Result<()> { // Let's create a temporary directory for the @@ -53,15 +55,14 @@ fn main() -> tantivy::Result<()> { let searcher = index.searcher(); let query_parser = QueryParser::for_index(&index, vec![title, body]); - let query = query_parser.parse_query("sycamore spring")?; let mut top_collector = TopCollector::with_limit(10); - searcher.search(&*query, &mut top_collector)?; - let doc_addresses = top_collector.docs(); + let snippet_generator = + let doc_addresses = top_collector.docs(); for doc_address in doc_addresses { let retrieved_doc = searcher.doc(&doc_address)?; // generate_snippet(&retrieved_doc, query @@ -69,6 +70,3 @@ fn main() -> tantivy::Result<()> { Ok(()) } - - -use tempdir::TempDir; diff --git a/src/core/index.rs b/src/core/index.rs index 6c236ff5f..f0df65b75 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -136,7 +136,7 @@ impl Index { Ok(tokenizer) } None => { - Err(TantivyError::SchemaError(format!("{:?} is not a text field.", field_entry.name()))) + Err(TantivyError:: SchemaError(format!("{:?} is not a text field.", field_entry.name()))) } } } diff --git a/src/lib.rs b/src/lib.rs index 5806d5f69..ef00ec4ee 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -214,6 +214,7 @@ pub mod store; pub mod termdict; mod snippet; +pub use self::snippet::SnippetGenerator; mod docset; pub use self::docset::{DocSet, SkipResult}; diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index 9ff7b8594..b38e6592d 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -7,10 +7,6 @@ use query::{Scorer, Weight}; use schema::{Field, IndexRecordOption}; use termdict::{TermDictionary, TermStreamer}; use Result; -use SkipResult; -use Term; -use DocId; -use DocSet; /// A weight struct for Fuzzy Term and Regex Queries pub struct AutomatonWeight diff --git a/src/query/query.rs b/src/query/query.rs index a72c33d00..9bf139b96 100644 --- a/src/query/query.rs +++ b/src/query/query.rs @@ -5,7 +5,6 @@ use downcast; use std::fmt; use Result; use SegmentLocalId; -use DocAddress; use std::collections::BTreeSet; use Term; @@ -61,7 +60,7 @@ pub trait Query: QueryClone + downcast::Any + fmt::Debug { Ok(result) } - fn query_terms(&self, term_set: &mut BTreeSet) {} + fn query_terms(&self, _term_set: &mut BTreeSet) {} /// Search works as follows : /// diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index 162abe519..ba45a8042 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -8,7 +8,6 @@ use query::Weight; use schema::IndexRecordOption; use Result; use Term; -use SkipResult; pub struct TermWeight { term: Term, diff --git a/src/query/weight.rs b/src/query/weight.rs index 8bca9ad16..d3d8b3520 100644 --- a/src/query/weight.rs +++ b/src/query/weight.rs @@ -1,11 +1,6 @@ use super::Scorer; use core::SegmentReader; use Result; -use DocId; -use std::collections::HashSet; -use Term; -use std::collections::BTreeMap; -use std::collections::HashMap; /// A Weight is the specialization of a Query /// for a given set of segments. diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 39d1ff89c..9842cdd00 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -1,21 +1,12 @@ use htmlescape::encode_minimal; -use schema::FieldValue; use std::collections::BTreeMap; -use itertools::Itertools; -use tokenizer::BoxedTokenizer; use tokenizer::{Token, TokenStream}; -use Index; use Result; -use Term; use query::Query; -use DocAddress; -use DocId; use Searcher; use schema::Field; -use std::collections::HashMap; -use SegmentLocalId; -use error::TantivyError; use std::collections::BTreeSet; +use tokenizer::BoxedTokenizer; #[derive(Debug)] pub struct HighlightSection { @@ -225,14 +216,16 @@ impl SnippetGenerator { }) } + pub fn set_max_num_chars(&mut self, max_num_chars: usize) { + self.max_num_chars = max_num_chars; + } + pub fn snippet(&self, text: &str) -> Snippet { let fragment_candidates = search_fragments(&*self.tokenizer, &text, &self.terms_text, self.max_num_chars); - let snippet = select_best_fragment_combination(fragment_candidates, &text); - snippet - + select_best_fragment_combination(fragment_candidates, &text) } } @@ -242,39 +235,47 @@ mod tests { use std::collections::BTreeMap; use std::iter::Iterator; use tokenizer::{box_tokenizer, SimpleTokenizer}; + use Index; + use schema::{SchemaBuilder, IndexRecordOption, TextOptions, TextFieldIndexing}; + use SnippetGenerator; + use query::QueryParser; - const TOKENIZER: SimpleTokenizer = SimpleTokenizer; + + const TEST_TEXT: &'static str = r#"Rust is a systems programming language sponsored by Mozilla which +describes it as a "safe, concurrent, practical language", supporting functional and +imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?], +but its designers intend it to provide better memory safety while still maintaining +performance. + +Rust is free and open-source software, released under an MIT License, or Apache License +2.0. Its designers have refined the language through the experiences of writing the Servo +web browser layout engine[14] and the Rust compiler. A large proportion of current commits +to the project are from community members.[15] + +Rust won first place for "most loved programming language" in the Stack Overflow Developer +Survey in 2016, 2017, and 2018."#; #[test] fn test_snippet() { - let boxed_tokenizer = box_tokenizer(TOKENIZER); - - let text = "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe, concurrent, practical language\", supporting functional and imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?], but its designers intend it to provide better memory safety while still maintaining performance. - -Rust is free and open-source software, released under an MIT License, or Apache License 2.0. Its designers have refined the language through the experiences of writing the Servo web browser layout engine[14] and the Rust compiler. A large proportion of current commits to the project are from community members.[15] - -Rust won first place for \"most loved programming language\" in the Stack Overflow Developer Survey in 2016, 2017, and 2018. -"; - + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); let mut terms = BTreeMap::new(); terms.insert(String::from("rust"), 1.0); terms.insert(String::from("language"), 0.9); - - let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 100); + let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 100); assert_eq!(fragments.len(), 7); { let first = fragments.iter().nth(0).unwrap(); assert_eq!(first.score, 1.9); assert_eq!(first.stop_offset, 89); } - let snippet = select_best_fragment_combination(fragments, &text); - assert_eq!(snippet.fragments, "Rust is a systems programming language sponsored by Mozilla which describes it as a \"safe".to_owned()); - assert_eq!(snippet.to_html(), "Rust is a systems programming language sponsored by Mozilla which describes it as a "safe".to_owned()) + let snippet = select_best_fragment_combination(fragments, &TEST_TEXT); + assert_eq!(snippet.fragments, "Rust is a systems programming language sponsored by Mozilla which\ndescribes it as a \"safe".to_owned()); + assert_eq!(snippet.to_html(), "Rust is a systems programming language sponsored by Mozilla which\ndescribes it as a "safe".to_owned()) } #[test] fn test_snippet_in_second_fragment() { - let boxed_tokenizer = box_tokenizer(TOKENIZER); + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); let text = "a b c d e f g"; @@ -298,7 +299,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl #[test] fn test_snippet_with_term_at_the_end_of_fragment() { - let boxed_tokenizer = box_tokenizer(TOKENIZER); + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); let text = "a b c d e f f g"; @@ -322,7 +323,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl #[test] fn test_snippet_with_second_fragment_has_the_highest_score() { - let boxed_tokenizer = box_tokenizer(TOKENIZER); + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); let text = "a b c d e f g"; @@ -347,7 +348,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl #[test] fn test_snippet_with_term_not_in_text() { - let boxed_tokenizer = box_tokenizer(TOKENIZER); + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); let text = "a b c d"; @@ -365,7 +366,7 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl #[test] fn test_snippet_with_no_terms() { - let boxed_tokenizer = box_tokenizer(TOKENIZER); + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); let text = "a b c d"; @@ -377,4 +378,41 @@ Rust won first place for \"most loved programming language\" in the Stack Overfl assert_eq!(snippet.fragments, ""); assert_eq!(snippet.to_html(), ""); } + + #[test] + fn test_snippet_generator() { + let mut schema_builder = SchemaBuilder::default (); + let text_options = TextOptions::default() + .set_indexing_options(TextFieldIndexing::default() + .set_tokenizer("en_stem") + .set_index_option(IndexRecordOption::Basic) + ); + let text_field = schema_builder.add_text_field("text", text_options); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + { + // writing the segment + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + { + let doc = doc ! (text_field => TEST_TEXT); + index_writer.add_document(doc); + } + index_writer.commit().unwrap(); + } + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let query_parser = QueryParser::for_index(&index, vec![text_field]); + let query = query_parser.parse_query("rust design").unwrap(); + let mut snippet_generator = SnippetGenerator::new(&*searcher, &*query, text_field).unwrap(); + { + let snippet = snippet_generator.snippet(TEST_TEXT); + assert_eq!(snippet.to_html(), "imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?],\nbut its designers intend it to provide better memory safety"); + } + { + snippet_generator.set_max_num_chars(90); + let snippet = snippet_generator.snippet(TEST_TEXT); + assert_eq!(snippet.to_html(), "Rust is syntactically similar to C++[according to whom?],\nbut its designers intend it to"); + } + + } } From 63868733a38fa57fccf3d2e6e52ae1c5462a01ba Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 11 Sep 2018 09:45:27 +0900 Subject: [PATCH 44/62] Added SnippetGenerator --- Cargo.toml | 1 - examples/snippet.rs | 9 ++-- src/common/mod.rs | 21 +--------- src/core/index.rs | 25 +++++++++++ src/core/searcher.rs | 1 + src/indexer/merger.rs | 10 ++--- src/lib.rs | 6 +-- src/query/query.rs | 2 + src/schema/schema.rs | 4 +- src/schema/value.rs | 6 +-- src/snippet/mod.rs | 67 ++++++++++++++++++++++++++++-- src/store/mod.rs | 2 +- src/tokenizer/mod.rs | 4 +- src/tokenizer/tokenizer.rs | 2 +- src/tokenizer/tokenizer_manager.rs | 2 +- 15 files changed, 117 insertions(+), 45 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 887f7a9b9..6a9b313f7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -62,7 +62,6 @@ opt-level = 3 debug = false lto = true debug-assertions = false -overflow-checks = false [profile.test] debug-assertions = true diff --git a/examples/snippet.rs b/examples/snippet.rs index 3cded2bd1..bc31a3e38 100644 --- a/examples/snippet.rs +++ b/examples/snippet.rs @@ -35,7 +35,6 @@ fn main() -> tantivy::Result<()> { let title = schema.get_field("title").unwrap(); let body = schema.get_field("body").unwrap(); - let mut old_man_doc = Document::default(); // we'll only need one doc for this example. index_writer.add_document(doc!( title => "Of Mice and Men", @@ -60,12 +59,14 @@ fn main() -> tantivy::Result<()> { let mut top_collector = TopCollector::with_limit(10); searcher.search(&*query, &mut top_collector)?; - let snippet_generator = + let snippet_generator = SnippetGenerator::new(&*searcher, &*query, body)?; let doc_addresses = top_collector.docs(); for doc_address in doc_addresses { - let retrieved_doc = searcher.doc(&doc_address)?; - // generate_snippet(&retrieved_doc, query + let doc = searcher.doc(&doc_address)?; + let snippet = snippet_generator.snippet_from_doc(&doc); + println!("title: {}", doc.get_first(title).unwrap().text().unwrap()); + println!("snippet: {}", snippet.to_html()); } Ok(()) diff --git a/src/common/mod.rs b/src/common/mod.rs index 778f0476a..2942438b4 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -68,17 +68,6 @@ pub trait HasLen { } } - -pub fn is_stricly_sorted(els: &[T]) -> bool { - if els.is_empty() { - true - } else { - els.iter() - .zip(els[1..].iter()) - .all(|(left, right)| left < right) - } -} - const HIGHEST_BIT: u64 = 1 << 63; /// Maps a `i64` to `u64` @@ -116,20 +105,12 @@ pub fn u64_to_i64(val: u64) -> i64 { pub(crate) mod test { pub use super::serialize::test::fixed_size_test; - use super::{compute_num_bits, i64_to_u64, u64_to_i64, is_stricly_sorted}; + use super::{compute_num_bits, i64_to_u64, u64_to_i64}; fn test_i64_converter_helper(val: i64) { assert_eq!(u64_to_i64(i64_to_u64(val)), val); } - - #[test] - fn test_is_strictly_sorted() { - assert!(is_stricly_sorted::(&[])); - assert!(is_stricly_sorted(&[1])); - assert!(is_stricly_sorted(&[1, 2, 3])); - assert!(!is_stricly_sorted(&[1, 3, 2])); - } #[test] fn test_i64_converter() { assert_eq!(i64_to_u64(i64::min_value()), u64::min_value()); diff --git a/src/core/index.rs b/src/core/index.rs index f0df65b75..da1744961 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -115,6 +115,8 @@ impl Index { &self.tokenizers } + + /// Helper to access the tokenizer associated to a specific field. pub fn tokenizer_for_field(&self, field: Field) -> Result> { let field_entry = self.schema.get_field_entry(field); let field_type = field_entry.field_type(); @@ -325,3 +327,26 @@ impl Clone for Index { } } } + + +#[cfg(test)] +mod tests { + use Index; + use schema::{SchemaBuilder, TEXT, INT_INDEXED}; + + #[test] + fn test_indexer_for_field() { + let mut schema_builder = SchemaBuilder::default(); + let num_likes_field = schema_builder.add_u64_field("num_likes", INT_INDEXED); + let body_field = schema_builder.add_text_field("body", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + assert!(index.tokenizer_for_field(body_field).is_ok()); + assert_eq!( + format!("{:?}", index.tokenizer_for_field(num_likes_field).err()), + "Some(SchemaError(\"\\\"num_likes\\\" is not a text field.\"))" + ); + } + + +} \ No newline at end of file diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 9de6c857c..f17df042f 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -33,6 +33,7 @@ impl Searcher { } } + /// Returns the `Index` associated to the `Searcher` pub fn index(&self) -> &Index { &self.index } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index e79551a4c..5d2e17c51 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -770,23 +770,23 @@ mod tests { } { let doc = searcher.doc(&DocAddress(0, 0)).unwrap(); - assert_eq!(doc.get_first(text_field).unwrap().text(), "af b"); + assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b")); } { let doc = searcher.doc(&DocAddress(0, 1)).unwrap(); - assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c"); + assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c")); } { let doc = searcher.doc(&DocAddress(0, 2)).unwrap(); - assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c d"); + assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c d")); } { let doc = searcher.doc(&DocAddress(0, 3)).unwrap(); - assert_eq!(doc.get_first(text_field).unwrap().text(), "af b"); + assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b")); } { let doc = searcher.doc(&DocAddress(0, 4)).unwrap(); - assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c g"); + assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c g")); } { let get_fast_vals = |terms: Vec| { diff --git a/src/lib.rs b/src/lib.rs index ef00ec4ee..33fb62eb8 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -899,11 +899,11 @@ mod tests { assert_eq!(document.len(), 3); let values = document.get_all(text_field); assert_eq!(values.len(), 2); - assert_eq!(values[0].text(), "tantivy"); - assert_eq!(values[1].text(), "some other value"); + assert_eq!(values[0].text(), Some("tantivy")); + assert_eq!(values[1].text(), Some("some other value")); let values = document.get_all(other_text_field); assert_eq!(values.len(), 1); - assert_eq!(values[0].text(), "short"); + assert_eq!(values[0].text(), Some("short")); } #[test] diff --git a/src/query/query.rs b/src/query/query.rs index 9bf139b96..6abbf35e0 100644 --- a/src/query/query.rs +++ b/src/query/query.rs @@ -60,6 +60,8 @@ pub trait Query: QueryClone + downcast::Any + fmt::Debug { Ok(result) } + /// Extract all of the terms associated to the query and insert them in the + /// term set given in arguments. fn query_terms(&self, _term_set: &mut BTreeSet) {} /// Search works as follows : diff --git a/src/schema/schema.rs b/src/schema/schema.rs index 6d4f6c949..d000ab9e2 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -443,8 +443,8 @@ mod tests { }"#, ) .unwrap(); - assert_eq!(doc.get_first(title_field).unwrap().text(), "my title"); - assert_eq!(doc.get_first(author_field).unwrap().text(), "fulmicoton"); + assert_eq!(doc.get_first(title_field).unwrap().text(), Some("my title")); + assert_eq!(doc.get_first(author_field).unwrap().text(), Some("fulmicoton")); assert_eq!(doc.get_first(count_field).unwrap().u64_value(), 4); assert_eq!(doc.get_first(popularity_field).unwrap().i64_value(), 10); } diff --git a/src/schema/value.rs b/src/schema/value.rs index f5ce151f1..64b0dc795 100644 --- a/src/schema/value.rs +++ b/src/schema/value.rs @@ -74,10 +74,10 @@ impl Value { /// /// # Panics /// If the value is not of type `Str` - pub fn text(&self) -> &str { + pub fn text(&self) -> Option<&str> { match *self { - Value::Str(ref text) => text, - _ => panic!("This is not a text field."), + Value::Str(ref text) => Some(text), + _ => None, } } diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 9842cdd00..6703d6411 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -7,6 +7,9 @@ use Searcher; use schema::Field; use std::collections::BTreeSet; use tokenizer::BoxedTokenizer; +use Document; + +const DEFAULT_MAX_NUM_CHARS: usize = 150; #[derive(Debug)] pub struct HighlightSection { @@ -189,16 +192,58 @@ fn select_best_fragment_combination<'a>( } } - -const DEFAULT_MAX_NUM_CHARS: usize = 150; - +/// `SnippetGenerator` +/// +/// # Example +/// +/// ```rust +/// # #[macro_use] +/// # extern crate tantivy; +/// # use tantivy::Index; +/// # use tantivy::schema::{SchemaBuilder, TEXT}; +/// # use tantivy::query::QueryParser; +/// use tantivy::SnippetGenerator; +/// +/// # fn main() -> tantivy::Result<()> { +/// # let mut schema_builder = SchemaBuilder::default(); +/// # let text_field = schema_builder.add_text_field("text", TEXT); +/// # let schema = schema_builder.build(); +/// # let index = Index::create_in_ram(schema); +/// # let mut index_writer = index.writer_with_num_threads(1, 30_000_000)?; +/// # let doc = doc!(text_field => r#"Comme je descendais des Fleuves impassibles, +/// # Je ne me sentis plus guidé par les haleurs : +/// # Des Peaux-Rouges criards les avaient pris pour cibles, +/// # Les ayant cloués nus aux poteaux de couleurs. +/// # +/// # J'étais insoucieux de tous les équipages, +/// # Porteur de blés flamands ou de cotons anglais. +/// # Quand avec mes haleurs ont fini ces tapages, +/// # Les Fleuves m'ont laissé descendre où je voulais. +/// # "#); +/// # index_writer.add_document(doc.clone()); +/// # index_writer.commit()?; +/// # let query_parser = QueryParser::for_index(&index, vec![text_field]); +/// // ... +/// let query = query_parser.parse_query("haleurs flamands").unwrap(); +/// # index.load_searchers()?; +/// # let searcher = index.searcher(); +/// let mut snippet_generator = SnippetGenerator::new(&*searcher, &*query, text_field)?; +/// snippet_generator.set_max_num_chars(100); +/// let snippet = snippet_generator.snippet_from_doc(&doc); +/// let snippet_html: String = snippet.to_html(); +/// assert_eq!(snippet_html, "Comme je descendais des Fleuves impassibles,\n Je ne me sentis plus guidé par les haleurs :\n Des"); +/// # Ok(()) +/// # } +/// ``` pub struct SnippetGenerator { terms_text: BTreeMap, tokenizer: Box, + field: Field, max_num_chars: usize } impl SnippetGenerator { + /// Creates a new snippet generator pub fn new(searcher: &Searcher, query: &Query, field: Field) -> Result { @@ -212,14 +257,30 @@ impl SnippetGenerator { Ok(SnippetGenerator { terms_text, tokenizer, + field, max_num_chars: DEFAULT_MAX_NUM_CHARS }) } + /// Sets a maximum number of chars. pub fn set_max_num_chars(&mut self, max_num_chars: usize) { self.max_num_chars = max_num_chars; } + /// Generates a snippet for the given `Document`. + /// + /// This method extract the text associated to the `SnippetGenerator`'s field + /// and computes a snippet. + pub fn snippet_from_doc(&self, doc: &Document) -> Snippet { + let text: String = doc.get_all(self.field) + .into_iter() + .flat_map(|val| val.text()) + .collect::>() + .join(" "); + self.snippet(&text) + } + + /// Generates a snippet for the given text. pub fn snippet(&self, text: &str) -> Snippet { let fragment_candidates = search_fragments(&*self.tokenizer, &text, diff --git a/src/store/mod.rs b/src/store/mod.rs index 5d71563e1..7bce9085d 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -109,7 +109,7 @@ pub mod tests { let store = StoreReader::from_source(store_source); for i in 0..1_000 { assert_eq!( - *store.get(i).unwrap().get_first(field_title).unwrap().text(), + *store.get(i).unwrap().get_first(field_title).unwrap().text().unwrap(), format!("Doc {}", i) ); } diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index d4a735bd2..e8bb3527f 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -153,7 +153,9 @@ pub use self::simple_tokenizer::SimpleTokenizer; pub use self::stemmer::Stemmer; pub use self::stop_word_filter::StopWordFilter; pub(crate) use self::token_stream_chain::TokenStreamChain; -pub use self::tokenizer::{BoxedTokenizer, box_tokenizer}; +pub use self::tokenizer::BoxedTokenizer; +pub(crate) use self::tokenizer::box_tokenizer; + pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer}; pub use self::tokenizer_manager::TokenizerManager; diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs index e806b70d8..fcdf8f21b 100644 --- a/src/tokenizer/tokenizer.rs +++ b/src/tokenizer/tokenizer.rs @@ -130,7 +130,7 @@ where } } -pub fn box_tokenizer(a: A) -> Box +pub(crate) fn box_tokenizer(a: A) -> Box where A: 'static + Send + Sync + for<'a> Tokenizer<'a>, { diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs index cbb46af3b..447dea303 100644 --- a/src/tokenizer/tokenizer_manager.rs +++ b/src/tokenizer/tokenizer_manager.rs @@ -1,6 +1,6 @@ use std::collections::HashMap; use std::sync::{Arc, RwLock}; -use tokenizer::tokenizer::box_tokenizer; +use tokenizer::box_tokenizer; use tokenizer::BoxedTokenizer; use tokenizer::JapaneseTokenizer; use tokenizer::LowerCaser; From cc23194c581a1b8b56b6e62fd932929701dd83b2 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 11 Sep 2018 10:05:15 +0900 Subject: [PATCH 45/62] Editing document --- CHANGELOG.md | 2 +- src/snippet/mod.rs | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d2256923a..718840223 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ Tantivy 0.7 greatly improving performance - Tantivy error now rely on the failure crate (@drusellers) - Added support for `AND`, `OR`, `NOT` syntax in addition to the `+`,`-` syntax - +- Added a snippet generator with highlight (@vigneshsarma, @fulmicoton) Tantivy 0.6.1 ========================= diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 6703d6411..a3d2c48e3 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -8,6 +8,7 @@ use schema::Field; use std::collections::BTreeSet; use tokenizer::BoxedTokenizer; use Document; +use std::cmp::Ordering; const DEFAULT_MAX_NUM_CHARS: usize = 150; @@ -156,18 +157,17 @@ fn select_best_fragment_combination<'a>( fragments: Vec, text: &'a str, ) -> Snippet { - if let Some(init) = fragments.iter().nth(0) { - let fragment = - fragments.iter().skip(1).fold( - init, - |acc, item| { - if item.score > acc.score { - item - } else { - acc - } - }, - ); + let best_fragment_opt = fragments + .iter() + .max_by(|left, right| { + let cmp_score = left.score.partial_cmp(&right.score).unwrap_or(Ordering::Equal); + if cmp_score == Ordering::Equal { + (right.start_offset, right.stop_offset).cmp(&(left.start_offset, left.stop_offset)) + } else { + cmp_score + } + }); + if let Some(fragment) = best_fragment_opt { let fragment_text = &text[fragment.start_offset..fragment.stop_offset]; let highlighted = fragment .highlighted @@ -179,7 +179,7 @@ fn select_best_fragment_combination<'a>( ) }).collect(); Snippet { - fragments: fragment_text.to_owned(), + fragments: fragment_text.to_string(), highlighted: highlighted, } } else { From 2104c0277c15679d86f9c17e0641f5a38fd4bbc1 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 13 Sep 2018 09:13:37 +0900 Subject: [PATCH 46/62] Updating uuid --- Cargo.toml | 2 +- src/core/segment_id.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 67bf67824..94b1d1bd7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ num_cpus = "1.2" itertools = "0.7" levenshtein_automata = {version="0.1", features=["fst_automaton"]} bit-set = "0.5" -uuid = { version = "0.6", features = ["v4", "serde"] } +uuid = { version = "0.7", features = ["v4", "serde"] } crossbeam = "0.4" crossbeam-channel = "0.2" futures = "0.1" diff --git a/src/core/segment_id.rs b/src/core/segment_id.rs index 75e76089d..64c5fee38 100644 --- a/src/core/segment_id.rs +++ b/src/core/segment_id.rs @@ -52,12 +52,12 @@ impl SegmentId { /// Picking the first 8 chars is ok to identify /// segments in a display message. pub fn short_uuid_string(&self) -> String { - (&self.0.simple().to_string()[..8]).to_string() + (&self.0.to_simple_ref().to_string()[..8]).to_string() } /// Returns a segment uuid string. pub fn uuid_string(&self) -> String { - self.0.simple().to_string() + self.0.to_simple_ref().to_string() } } From 82d25b83972dfc2faef6a8d25f6c1af07b314856 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 13 Sep 2018 12:39:42 +0900 Subject: [PATCH 47/62] Fixing snippet example --- examples/snippet.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/snippet.rs b/examples/snippet.rs index bc31a3e38..5e1fa27d1 100644 --- a/examples/snippet.rs +++ b/examples/snippet.rs @@ -24,7 +24,8 @@ fn main() -> tantivy::Result<()> { // # Defining the schema let mut schema_builder = SchemaBuilder::default(); - schema_builder.add_text_field("body", TEXT); + let title = schema_builder.add_text_field("title", TEXT | STORED); + let body = schema_builder.add_text_field("body", TEXT | STORED); let schema = schema_builder.build(); // # Indexing documents @@ -32,9 +33,6 @@ fn main() -> tantivy::Result<()> { let mut index_writer = index.writer(50_000_000)?; - let title = schema.get_field("title").unwrap(); - let body = schema.get_field("body").unwrap(); - // we'll only need one doc for this example. index_writer.add_document(doc!( title => "Of Mice and Men", @@ -68,6 +66,6 @@ fn main() -> tantivy::Result<()> { println!("title: {}", doc.get_first(title).unwrap().text().unwrap()); println!("snippet: {}", snippet.to_html()); } - + Ok(()) } From 30f4f85d488696e1b2835d15080d0f2f7a5913b0 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 14 Sep 2018 09:11:07 +0900 Subject: [PATCH 48/62] Closes #414. (#417) Updating documentation for load_searchers. --- src/core/index.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/core/index.rs b/src/core/index.rs index da1744961..a7bef841d 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -273,12 +273,16 @@ impl Index { self.num_searchers.store(num_searchers, Ordering::Release); } - /// Creates a new generation of searchers after - - /// a change of the set of searchable indexes. + /// Update searchers so that they reflect the state of the last + /// `.commit()`. /// - /// This needs to be called when a new segment has been - /// published or after a merge. + /// If indexing happens in the same process as searching, + /// you most likely want to call `.load_searchers()` right after each + /// successful call to `.commit()`. + /// + /// If indexing and searching happen in different processes, the way to + /// get the freshest `index` at all time, is to watch `meta.json` and + /// call `load_searchers` whenever a changes happen. pub fn load_searchers(&self) -> Result<()> { let _meta_lock = LockType::MetaLock.acquire_lock(self.directory())?; let searchable_segments = self.searchable_segments()?; From 8600b8ea25657809c7a1ca824f39d55edf13f001 Mon Sep 17 00:00:00 2001 From: pentlander Date: Thu, 13 Sep 2018 17:22:17 -0700 Subject: [PATCH 49/62] Top collector (#413) * Make TopCollector generic Make TopCollector take a generic type instead of only being tied to score. This will allow for sharing code between a TopCollector that sorts results by Score and a TopCollector that sorts documents by a fast field. This commit makes no functional changes to TopCollector. * Add TopFieldCollector and TopScoreCollector Create two new collectors that use the refactored TopCollector. TopFieldCollector has the same functionality that TopCollector originally had. TopFieldCollector allows for sorting results by a given fast field. Closes tantivy-search/tantivy#388 * Make TopCollector private Make TopCollector package private and export TopFieldCollector as TopCollector to maintain backwards compatibility. Mark TopCollector as deprecated to encourage use of the non-aliased TopFieldCollector. Remove Collector implementation for TopCollector since it is not longer used. --- src/collector/mod.rs | 9 +- src/collector/multi_collector.rs | 4 +- src/collector/top_collector.rs | 162 ++++++----------- src/collector/top_field_collector.rs | 263 +++++++++++++++++++++++++++ src/collector/top_score_collector.rs | 187 +++++++++++++++++++ src/query/fuzzy_query.rs | 2 +- src/query/regex_query.rs | 4 +- src/query/term_query/mod.rs | 6 +- 8 files changed, 522 insertions(+), 115 deletions(-) create mode 100644 src/collector/top_field_collector.rs create mode 100644 src/collector/top_score_collector.rs diff --git a/src/collector/mod.rs b/src/collector/mod.rs index d29eb1c6f..99af0f286 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -15,7 +15,14 @@ mod multi_collector; pub use self::multi_collector::MultiCollector; mod top_collector; -pub use self::top_collector::TopCollector; + +mod top_score_collector; +pub use self::top_score_collector::TopScoreCollector; +#[deprecated] +pub use self::top_score_collector::TopScoreCollector as TopCollector; + +mod top_field_collector; +pub use self::top_field_collector::TopFieldCollector; mod facet_collector; pub use self::facet_collector::FacetCollector; diff --git a/src/collector/multi_collector.rs b/src/collector/multi_collector.rs index 568a843e8..14ff80788 100644 --- a/src/collector/multi_collector.rs +++ b/src/collector/multi_collector.rs @@ -100,11 +100,11 @@ impl<'a> Collector for MultiCollector<'a> { mod tests { use super::*; - use collector::{Collector, CountCollector, TopCollector}; + use collector::{Collector, CountCollector, TopScoreCollector}; #[test] fn test_multi_collector() { - let mut top_collector = TopCollector::with_limit(2); + let mut top_collector = TopScoreCollector::with_limit(2); let mut count_collector = CountCollector::default(); { let mut collectors = diff --git a/src/collector/top_collector.rs b/src/collector/top_collector.rs index 8d2829b73..64d2eee7f 100644 --- a/src/collector/top_collector.rs +++ b/src/collector/top_collector.rs @@ -1,115 +1,61 @@ -use super::Collector; -use std::cmp::Ordering; -use std::collections::BinaryHeap; use DocAddress; use DocId; -use Result; -use Score; use SegmentLocalId; -use SegmentReader; +use std::cmp::Ordering; +use std::collections::BinaryHeap; -// Rust heap is a max-heap and we need a min heap. +/// Contains a feature (field, score, etc.) of a document along with the document address. +/// +/// It has a custom implementation of `PartialOrd` that reverses the order. This is because the +/// default Rust heap is a max heap, whereas a min heap is needed. #[derive(Clone, Copy)] -struct GlobalScoredDoc { - score: Score, +pub struct ComparableDoc { + feature: T, doc_address: DocAddress, } -impl PartialOrd for GlobalScoredDoc { - fn partial_cmp(&self, other: &GlobalScoredDoc) -> Option { +impl PartialOrd for ComparableDoc { + fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl Ord for GlobalScoredDoc { +impl Ord for ComparableDoc { #[inline] - fn cmp(&self, other: &GlobalScoredDoc) -> Ordering { + fn cmp(&self, other: &Self) -> Ordering { other - .score - .partial_cmp(&self.score) + .feature + .partial_cmp(&self.feature) .unwrap_or_else(|| other.doc_address.cmp(&self.doc_address)) } } -impl PartialEq for GlobalScoredDoc { - fn eq(&self, other: &GlobalScoredDoc) -> bool { +impl PartialEq for ComparableDoc { + fn eq(&self, other: &Self) -> bool { self.cmp(other) == Ordering::Equal } } -impl Eq for GlobalScoredDoc {} +impl Eq for ComparableDoc {} /// The Top Collector keeps track of the K documents -/// with the best scores. +/// sorted by type `T`. /// /// The implementation is based on a `BinaryHeap`. /// The theorical complexity for collecting the top `K` out of `n` documents /// is `O(n log K)`. -/// -/// ```rust -/// #[macro_use] -/// extern crate tantivy; -/// use tantivy::schema::{SchemaBuilder, TEXT}; -/// use tantivy::{Index, Result, DocId, Score}; -/// use tantivy::collector::TopCollector; -/// use tantivy::query::QueryParser; -/// -/// # fn main() { example().unwrap(); } -/// fn example() -> Result<()> { -/// let mut schema_builder = SchemaBuilder::new(); -/// let title = schema_builder.add_text_field("title", TEXT); -/// let schema = schema_builder.build(); -/// let index = Index::create_in_ram(schema); -/// { -/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?; -/// index_writer.add_document(doc!( -/// title => "The Name of the Wind", -/// )); -/// index_writer.add_document(doc!( -/// title => "The Diary of Muadib", -/// )); -/// index_writer.add_document(doc!( -/// title => "A Dairy Cow", -/// )); -/// index_writer.add_document(doc!( -/// title => "The Diary of a Young Girl", -/// )); -/// index_writer.commit().unwrap(); -/// } -/// -/// index.load_searchers()?; -/// let searcher = index.searcher(); -/// -/// { -/// let mut top_collector = TopCollector::with_limit(2); -/// let query_parser = QueryParser::for_index(&index, vec![title]); -/// let query = query_parser.parse_query("diary")?; -/// searcher.search(&*query, &mut top_collector).unwrap(); -/// -/// let score_docs: Vec<(Score, DocId)> = top_collector -/// .score_docs() -/// .into_iter() -/// .map(|(score, doc_address)| (score, doc_address.doc())) -/// .collect(); -/// -/// assert_eq!(score_docs, vec![(0.7261542, 1), (0.6099695, 3)]); -/// } -/// -/// Ok(()) -/// } -/// ``` -pub struct TopCollector { +pub struct TopCollector { limit: usize, - heap: BinaryHeap, + heap: BinaryHeap>, segment_id: u32, } -impl TopCollector { +impl TopCollector { /// Creates a top collector, with a number of documents equal to "limit". /// /// # Panics /// The method panics if limit is 0 - pub fn with_limit(limit: usize) -> TopCollector { + pub fn with_limit(limit: usize) -> TopCollector { if limit < 1 { panic!("Limit must be strictly greater than 0."); } @@ -125,22 +71,27 @@ impl TopCollector { /// Calling this method triggers the sort. /// The result of the sort is not cached. pub fn docs(&self) -> Vec { - self.score_docs() + self.top_docs() .into_iter() - .map(|score_doc| score_doc.1) + .map(|(_feature, doc)| doc) .collect() } - /// Returns K best ScoredDocument sorted in decreasing order. + /// Returns K best FeatureDocuments sorted in decreasing order. /// /// Calling this method triggers the sort. /// The result of the sort is not cached. - pub fn score_docs(&self) -> Vec<(Score, DocAddress)> { - let mut scored_docs: Vec = self.heap.iter().cloned().collect(); - scored_docs.sort(); - scored_docs + pub fn top_docs(&self) -> Vec<(T, DocAddress)> { + let mut feature_docs: Vec> = self.heap.iter().cloned().collect(); + feature_docs.sort(); + feature_docs .into_iter() - .map(|GlobalScoredDoc { score, doc_address }| (score, doc_address)) + .map( + |ComparableDoc { + feature, + doc_address, + }| (feature, doc_address), + ) .collect() } @@ -150,48 +101,47 @@ impl TopCollector { pub fn at_capacity(&self) -> bool { self.heap.len() >= self.limit } -} -impl Collector for TopCollector { - fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<()> { + /// Sets the segment local ID for the collector + pub fn set_segment_id(&mut self, segment_id: SegmentLocalId) { self.segment_id = segment_id; - Ok(()) } - fn collect(&mut self, doc: DocId, score: Score) { + /// Collects a document scored by the given feature + /// + /// It collects documents until it has reached the max capacity. Once it reaches capacity, it + /// will compare the lowest scoring item with the given one and keep whichever is greater. + pub fn collect(&mut self, doc: DocId, feature: T) { if self.at_capacity() { // It's ok to unwrap as long as a limit of 0 is forbidden. - let limit_doc: GlobalScoredDoc = *self.heap + let limit_doc: ComparableDoc = self + .heap .peek() - .expect("Top collector with size 0 is forbidden"); - if limit_doc.score < score { - let mut mut_head = self.heap + .expect("Top collector with size 0 is forbidden") + .clone(); + if limit_doc.feature < feature { + let mut mut_head = self + .heap .peek_mut() .expect("Top collector with size 0 is forbidden"); - mut_head.score = score; + mut_head.feature = feature; mut_head.doc_address = DocAddress(self.segment_id, doc); } } else { - let wrapped_doc = GlobalScoredDoc { - score, + let wrapped_doc = ComparableDoc { + feature, doc_address: DocAddress(self.segment_id, doc), }; self.heap.push(wrapped_doc); } } - - fn requires_scoring(&self) -> bool { - true - } } #[cfg(test)] mod tests { - - use super::*; - use collector::Collector; use DocId; use Score; + use super::*; #[test] fn test_top_collector_not_at_capacity() { @@ -201,7 +151,7 @@ mod tests { top_collector.collect(5, 0.3); assert!(!top_collector.at_capacity()); let score_docs: Vec<(Score, DocId)> = top_collector - .score_docs() + .top_docs() .into_iter() .map(|(score, doc_address)| (score, doc_address.doc())) .collect(); @@ -219,7 +169,7 @@ mod tests { assert!(top_collector.at_capacity()); { let score_docs: Vec<(Score, DocId)> = top_collector - .score_docs() + .top_docs() .into_iter() .map(|(score, doc_address)| (score, doc_address.doc())) .collect(); @@ -238,7 +188,7 @@ mod tests { #[test] #[should_panic] fn test_top_0() { - TopCollector::with_limit(0); + let _collector: TopCollector = TopCollector::with_limit(0); } } diff --git a/src/collector/top_field_collector.rs b/src/collector/top_field_collector.rs new file mode 100644 index 000000000..ec2361b3f --- /dev/null +++ b/src/collector/top_field_collector.rs @@ -0,0 +1,263 @@ +use collector::top_collector::TopCollector; +use DocAddress; +use DocId; +use fastfield::FastFieldReader; +use fastfield::FastValue; +use Result; +use Score; +use SegmentReader; +use super::Collector; +use schema::Field; + +/// The Top Field Collector keeps track of the K documents +/// sorted by a fast field in the index +/// +/// The implementation is based on a `BinaryHeap`. +/// The theorical complexity for collecting the top `K` out of `n` documents +/// is `O(n log K)`. +/// +/// ```rust +/// #[macro_use] +/// extern crate tantivy; +/// use tantivy::schema::{SchemaBuilder, TEXT, FAST}; +/// use tantivy::{Index, Result, DocId}; +/// use tantivy::collector::TopFieldCollector; +/// use tantivy::query::QueryParser; +/// +/// # fn main() { example().unwrap(); } +/// fn example() -> Result<()> { +/// let mut schema_builder = SchemaBuilder::new(); +/// let title = schema_builder.add_text_field("title", TEXT); +/// let rating = schema_builder.add_u64_field("rating", FAST); +/// let schema = schema_builder.build(); +/// let index = Index::create_in_ram(schema); +/// { +/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?; +/// index_writer.add_document(doc!( +/// title => "The Name of the Wind", +/// rating => 92u64, +/// )); +/// index_writer.add_document(doc!( +/// title => "The Diary of Muadib", +/// rating => 97u64, +/// )); +/// index_writer.add_document(doc!( +/// title => "A Dairy Cow", +/// rating => 63u64, +/// )); +/// index_writer.add_document(doc!( +/// title => "The Diary of a Young Girl", +/// rating => 80u64, +/// )); +/// index_writer.commit().unwrap(); +/// } +/// +/// index.load_searchers()?; +/// let searcher = index.searcher(); +/// +/// { +/// let mut top_collector = TopFieldCollector::with_limit(rating, 2); +/// let query_parser = QueryParser::for_index(&index, vec![title]); +/// let query = query_parser.parse_query("diary")?; +/// searcher.search(&*query, &mut top_collector).unwrap(); +/// +/// let score_docs: Vec<(u64, DocId)> = top_collector +/// .top_docs() +/// .into_iter() +/// .map(|(field, doc_address)| (field, doc_address.doc())) +/// .collect(); +/// +/// assert_eq!(score_docs, vec![(97u64, 1), (80, 3)]); +/// } +/// +/// Ok(()) +/// } +/// ``` +pub struct TopFieldCollector { + field: Field, + collector: TopCollector, + fast_field: Option>, +} + +impl TopFieldCollector { + /// Creates a top field collector, with a number of documents equal to "limit". + /// + /// The given field name must be a fast field, otherwise the collector have an error while + /// collecting results. + /// + /// # Panics + /// The method panics if limit is 0 + pub fn with_limit(field: Field, limit: usize) -> Self { + TopFieldCollector { + field, + collector: TopCollector::with_limit(limit), + fast_field: None, + } + } + + /// Returns K best documents sorted the given field name in decreasing order. + /// + /// Calling this method triggers the sort. + /// The result of the sort is not cached. + pub fn docs(&self) -> Vec { + self.collector.docs() + } + + /// Returns K best FieldDocuments sorted in decreasing order. + /// + /// Calling this method triggers the sort. + /// The result of the sort is not cached. + pub fn top_docs(&self) -> Vec<(T, DocAddress)> { + self.collector.top_docs() + } + + /// Return true iff at least K documents have gone through + /// the collector. + #[inline] + pub fn at_capacity(&self) -> bool { + self.collector.at_capacity() + } +} + +impl Collector for TopFieldCollector { + fn set_segment(&mut self, segment_id: u32, segment: &SegmentReader) -> Result<()> { + self.collector.set_segment_id(segment_id); + self.fast_field = Some(segment.fast_field_reader(self.field)?); + Ok(()) + } + + fn collect(&mut self, doc: DocId, _score: Score) { + let field_value = self + .fast_field + .as_ref() + .expect("collect() was called before set_segment. This should never happen.") + .get(doc); + self.collector.collect(doc, field_value); + } + + fn requires_scoring(&self) -> bool { + false + } +} + +#[cfg(test)] +mod tests { + use Index; + use IndexWriter; + use TantivyError; + use query::Query; + use query::QueryParser; + use schema::{FAST, SchemaBuilder, TEXT}; + use schema::Field; + use schema::IntOptions; + use schema::Schema; + use super::*; + + const TITLE: &str = "title"; + const SIZE: &str = "size"; + + #[test] + fn test_top_collector_not_at_capacity() { + let mut schema_builder = SchemaBuilder::new(); + let title = schema_builder.add_text_field(TITLE, TEXT); + let size = schema_builder.add_u64_field(SIZE, FAST); + let schema = schema_builder.build(); + let (index, query) = index("beer", title, schema, |index_writer| { + index_writer.add_document(doc!( + title => "bottle of beer", + size => 12u64, + )); + index_writer.add_document(doc!( + title => "growler of beer", + size => 64u64, + )); + index_writer.add_document(doc!( + title => "pint of beer", + size => 16u64, + )); + }); + let searcher = index.searcher(); + + let mut top_collector = TopFieldCollector::with_limit(size, 4); + searcher.search(&*query, &mut top_collector).unwrap(); + assert!(!top_collector.at_capacity()); + + let score_docs: Vec<(u64, DocId)> = top_collector + .top_docs() + .into_iter() + .map(|(field, doc_address)| (field, doc_address.doc())) + .collect(); + assert_eq!(score_docs, vec![(64, 1), (16, 2), (12, 0)]); + } + + #[test] + #[should_panic] + fn test_field_does_not_exist() { + let mut schema_builder = SchemaBuilder::new(); + let title = schema_builder.add_text_field(TITLE, TEXT); + let size = schema_builder.add_u64_field(SIZE, FAST); + let schema = schema_builder.build(); + let (index, _) = index("beer", title, schema, |index_writer| { + index_writer.add_document(doc!( + title => "bottle of beer", + size => 12u64, + )); + }); + let searcher = index.searcher(); + let segment = searcher.segment_reader(0); + let mut top_collector: TopFieldCollector = TopFieldCollector::with_limit(Field(2), 4); + let _ = top_collector.set_segment(0, segment); + } + + #[test] + fn test_field_not_fast_field() { + let mut schema_builder = SchemaBuilder::new(); + let title = schema_builder.add_text_field(TITLE, TEXT); + let size = schema_builder.add_u64_field(SIZE, IntOptions::default()); + let schema = schema_builder.build(); + let (index, _) = index("beer", title, schema, |index_writer| { + index_writer.add_document(doc!( + title => "bottle of beer", + size => 12u64, + )); + }); + let searcher = index.searcher(); + let segment = searcher.segment_reader(0); + let mut top_collector: TopFieldCollector = TopFieldCollector::with_limit(size, 4); + assert_matches!( + top_collector.set_segment(0, segment), + Err(TantivyError::FastFieldError(_)) + ); + } + + #[test] + #[should_panic] + fn test_collect_before_set_segment() { + let mut top_collector: TopFieldCollector = TopFieldCollector::with_limit(Field(0), 4); + top_collector.collect(0, 0f32); + } + + #[test] + #[should_panic] + fn test_top_0() { + let _: TopFieldCollector = TopFieldCollector::with_limit(Field(0), 0); + } + + fn index( + query: &str, + query_field: Field, + schema: Schema, + mut doc_adder: impl FnMut(&mut IndexWriter) -> (), + ) -> (Index, Box) { + let index = Index::create_in_ram(schema); + + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + doc_adder(&mut index_writer); + index_writer.commit().unwrap(); + index.load_searchers().unwrap(); + + let query_parser = QueryParser::for_index(&index, vec![query_field]); + let query = query_parser.parse_query(query).unwrap(); + (index, query) + } +} diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs new file mode 100644 index 000000000..4a8ace86b --- /dev/null +++ b/src/collector/top_score_collector.rs @@ -0,0 +1,187 @@ +use collector::top_collector::TopCollector; +use DocAddress; +use DocId; +use Result; +use Score; +use SegmentLocalId; +use SegmentReader; +use super::Collector; + +/// The Top Score Collector keeps track of the K documents +/// sorted by their score. +/// +/// The implementation is based on a `BinaryHeap`. +/// The theorical complexity for collecting the top `K` out of `n` documents +/// is `O(n log K)`. +/// +/// ```rust +/// #[macro_use] +/// extern crate tantivy; +/// use tantivy::schema::{SchemaBuilder, TEXT}; +/// use tantivy::{Index, Result, DocId, Score}; +/// use tantivy::collector::TopScoreCollector; +/// use tantivy::query::QueryParser; +/// +/// # fn main() { example().unwrap(); } +/// fn example() -> Result<()> { +/// let mut schema_builder = SchemaBuilder::new(); +/// let title = schema_builder.add_text_field("title", TEXT); +/// let schema = schema_builder.build(); +/// let index = Index::create_in_ram(schema); +/// { +/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?; +/// index_writer.add_document(doc!( +/// title => "The Name of the Wind", +/// )); +/// index_writer.add_document(doc!( +/// title => "The Diary of Muadib", +/// )); +/// index_writer.add_document(doc!( +/// title => "A Dairy Cow", +/// )); +/// index_writer.add_document(doc!( +/// title => "The Diary of a Young Girl", +/// )); +/// index_writer.commit().unwrap(); +/// } +/// +/// index.load_searchers()?; +/// let searcher = index.searcher(); +/// +/// { +/// let mut top_collector = TopScoreCollector::with_limit(2); +/// let query_parser = QueryParser::for_index(&index, vec![title]); +/// let query = query_parser.parse_query("diary")?; +/// searcher.search(&*query, &mut top_collector).unwrap(); +/// +/// let score_docs: Vec<(Score, DocId)> = top_collector +/// .top_docs() +/// .into_iter() +/// .map(|(score, doc_address)| (score, doc_address.doc())) +/// .collect(); +/// +/// assert_eq!(score_docs, vec![(0.7261542, 1), (0.6099695, 3)]); +/// } +/// +/// Ok(()) +/// } +/// ``` +pub struct TopScoreCollector { + collector: TopCollector, +} + +impl TopScoreCollector { + /// Creates a top score collector, with a number of documents equal to "limit". + /// + /// # Panics + /// The method panics if limit is 0 + pub fn with_limit(limit: usize) -> TopScoreCollector { + TopScoreCollector { + collector: TopCollector::with_limit(limit), + } + } + + /// Returns K best scored documents sorted in decreasing order. + /// + /// Calling this method triggers the sort. + /// The result of the sort is not cached. + pub fn docs(&self) -> Vec { + self.collector.docs() + } + + /// Returns K best ScoredDocuments sorted in decreasing order. + /// + /// Calling this method triggers the sort. + /// The result of the sort is not cached. + pub fn top_docs(&self) -> Vec<(Score, DocAddress)> { + self.collector.top_docs() + } + + /// Returns K best ScoredDocuments sorted in decreasing order. + /// + /// Calling this method triggers the sort. + /// The result of the sort is not cached. + #[deprecated] + pub fn score_docs(&self) -> Vec<(Score, DocAddress)> { + self.collector.top_docs() + } + + /// Return true iff at least K documents have gone through + /// the collector. + #[inline] + pub fn at_capacity(&self) -> bool { + self.collector.at_capacity() + } +} + +impl Collector for TopScoreCollector { + fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<()> { + self.collector.set_segment_id(segment_id); + Ok(()) + } + + fn collect(&mut self, doc: DocId, score: Score) { + self.collector.collect(doc, score); + } + + fn requires_scoring(&self) -> bool { + true + } +} + +#[cfg(test)] +mod tests { + use collector::Collector; + use DocId; + use Score; + use super::*; + + #[test] + fn test_top_collector_not_at_capacity() { + let mut top_collector = TopScoreCollector::with_limit(4); + top_collector.collect(1, 0.8); + top_collector.collect(3, 0.2); + top_collector.collect(5, 0.3); + assert!(!top_collector.at_capacity()); + let score_docs: Vec<(Score, DocId)> = top_collector + .top_docs() + .into_iter() + .map(|(score, doc_address)| (score, doc_address.doc())) + .collect(); + assert_eq!(score_docs, vec![(0.8, 1), (0.3, 5), (0.2, 3)]); + } + + #[test] + fn test_top_collector_at_capacity() { + let mut top_collector = TopScoreCollector::with_limit(4); + top_collector.collect(1, 0.8); + top_collector.collect(3, 0.2); + top_collector.collect(5, 0.3); + top_collector.collect(7, 0.9); + top_collector.collect(9, -0.2); + assert!(top_collector.at_capacity()); + { + let score_docs: Vec<(Score, DocId)> = top_collector + .top_docs() + .into_iter() + .map(|(score, doc_address)| (score, doc_address.doc())) + .collect(); + assert_eq!(score_docs, vec![(0.9, 7), (0.8, 1), (0.3, 5), (0.2, 3)]); + } + { + let docs: Vec = top_collector + .docs() + .into_iter() + .map(|doc_address| doc_address.doc()) + .collect(); + assert_eq!(docs, vec![7, 1, 5, 3]); + } + } + + #[test] + #[should_panic] + fn test_top_0() { + TopScoreCollector::with_limit(0); + } + +} diff --git a/src/query/fuzzy_query.rs b/src/query/fuzzy_query.rs index 6e0a16a67..7c3c6ad08 100644 --- a/src/query/fuzzy_query.rs +++ b/src/query/fuzzy_query.rs @@ -153,7 +153,7 @@ mod test { let fuzzy_query = FuzzyTermQuery::new(term, 1, true); searcher.search(&fuzzy_query, &mut collector).unwrap(); - let scored_docs = collector.score_docs(); + let scored_docs = collector.top_docs(); assert_eq!(scored_docs.len(), 1, "Expected only 1 document"); let (score, _) = scored_docs[0]; assert_nearly_equals(1f32, score); diff --git a/src/query/regex_query.rs b/src/query/regex_query.rs index 9b02fc7cf..64d21395f 100644 --- a/src/query/regex_query.rs +++ b/src/query/regex_query.rs @@ -123,7 +123,7 @@ mod test { let mut collector = TopCollector::with_limit(2); let regex_query = RegexQuery::new("jap[ao]n".to_string(), country_field); searcher.search(®ex_query, &mut collector).unwrap(); - let scored_docs = collector.score_docs(); + let scored_docs = collector.top_docs(); assert_eq!(scored_docs.len(), 1, "Expected only 1 document"); let (score, _) = scored_docs[0]; assert_nearly_equals(1f32, score); @@ -132,7 +132,7 @@ mod test { let mut collector = TopCollector::with_limit(2); let regex_query = RegexQuery::new("jap[A-Z]n".to_string(), country_field); searcher.search(®ex_query, &mut collector).unwrap(); - let scored_docs = collector.score_docs(); + let scored_docs = collector.top_docs(); assert_eq!(scored_docs.len(), 0, "Expected ZERO document"); } } diff --git a/src/query/term_query/mod.rs b/src/query/term_query/mod.rs index e8a865e02..bf5171016 100644 --- a/src/query/term_query/mod.rs +++ b/src/query/term_query/mod.rs @@ -72,7 +72,7 @@ mod tests { let term = Term::from_field_text(left_field, "left2"); let term_query = TermQuery::new(term, IndexRecordOption::WithFreqs); searcher.search(&term_query, &mut collector).unwrap(); - let scored_docs = collector.score_docs(); + let scored_docs = collector.top_docs(); assert_eq!(scored_docs.len(), 1); let (score, _) = scored_docs[0]; assert_nearly_equals(0.77802235, score); @@ -82,7 +82,7 @@ mod tests { let term = Term::from_field_text(left_field, "left1"); let term_query = TermQuery::new(term, IndexRecordOption::WithFreqs); searcher.search(&term_query, &mut collector).unwrap(); - let scored_docs = collector.score_docs(); + let scored_docs = collector.top_docs(); assert_eq!(scored_docs.len(), 2); let (score1, _) = scored_docs[0]; assert_nearly_equals(0.27101856, score1); @@ -94,7 +94,7 @@ mod tests { let query = query_parser.parse_query("left:left2 left:left1").unwrap(); let mut collector = TopCollector::with_limit(2); searcher.search(&*query, &mut collector).unwrap(); - let scored_docs = collector.score_docs(); + let scored_docs = collector.top_docs(); assert_eq!(scored_docs.len(), 2); let (score1, _) = scored_docs[0]; assert_nearly_equals(0.9153879, score1); From 21a99407261c75c6cc5666f143ef0682c5ac57f6 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 14 Sep 2018 09:31:11 +0900 Subject: [PATCH 50/62] Update Changelog with #388 (#418) --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 718840223..1ad8de098 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ Tantivy 0.7 - Tantivy error now rely on the failure crate (@drusellers) - Added support for `AND`, `OR`, `NOT` syntax in addition to the `+`,`-` syntax - Added a snippet generator with highlight (@vigneshsarma, @fulmicoton) +- Added a `TopFieldCollector` (@pentlander) Tantivy 0.6.1 ========================= From 0ba1cf93f7049b881a5b6d19dc953a28322bc250 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 14 Sep 2018 09:54:26 +0900 Subject: [PATCH 51/62] Remove Searcher dereference (#419) --- examples/snippet.rs | 2 +- src/query/boolean_query/mod.rs | 10 +++++----- src/query/range_query.rs | 6 +++--- src/snippet/mod.rs | 4 ++-- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/snippet.rs b/examples/snippet.rs index 5e1fa27d1..814edd62d 100644 --- a/examples/snippet.rs +++ b/examples/snippet.rs @@ -57,7 +57,7 @@ fn main() -> tantivy::Result<()> { let mut top_collector = TopCollector::with_limit(10); searcher.search(&*query, &mut top_collector)?; - let snippet_generator = SnippetGenerator::new(&*searcher, &*query, body)?; + let snippet_generator = SnippetGenerator::new(&searcher, &*query, body)?; let doc_addresses = top_collector.docs(); for doc_address in doc_addresses { diff --git a/src/query/boolean_query/mod.rs b/src/query/boolean_query/mod.rs index 5d72406a0..4276720ee 100644 --- a/src/query/boolean_query/mod.rs +++ b/src/query/boolean_query/mod.rs @@ -69,7 +69,7 @@ mod tests { let query_parser = QueryParser::for_index(&index, vec![text_field]); let query = query_parser.parse_query("+a").unwrap(); let searcher = index.searcher(); - let weight = query.weight(&*searcher, true).unwrap(); + let weight = query.weight(&searcher, true).unwrap(); let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap(); assert!(Downcast::::is_type(&*scorer)); } @@ -81,13 +81,13 @@ mod tests { let searcher = index.searcher(); { let query = query_parser.parse_query("+a +b +c").unwrap(); - let weight = query.weight(&*searcher, true).unwrap(); + let weight = query.weight(&searcher, true).unwrap(); let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap(); assert!(Downcast::>::is_type(&*scorer)); } { let query = query_parser.parse_query("+a +(b c)").unwrap(); - let weight = query.weight(&*searcher, true).unwrap(); + let weight = query.weight(&searcher, true).unwrap(); let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap(); assert!(Downcast::>>::is_type(&*scorer)); } @@ -100,7 +100,7 @@ mod tests { let searcher = index.searcher(); { let query = query_parser.parse_query("+a b").unwrap(); - let weight = query.weight(&*searcher, true).unwrap(); + let weight = query.weight(&searcher, true).unwrap(); let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap(); assert!(Downcast::< RequiredOptionalScorer, Box, SumWithCoordsCombiner>, @@ -108,7 +108,7 @@ mod tests { } { let query = query_parser.parse_query("+a b").unwrap(); - let weight = query.weight(&*searcher, false).unwrap(); + let weight = query.weight(&searcher, false).unwrap(); let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap(); println!("{:?}", scorer.type_name()); assert!(Downcast::::is_type(&*scorer)); diff --git a/src/query/range_query.rs b/src/query/range_query.rs index 06d98db66..6ec9c587f 100644 --- a/src/query/range_query.rs +++ b/src/query/range_query.rs @@ -68,7 +68,7 @@ fn map_bound TTo>( /// let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960..1970); /// /// let mut count_collector = CountCollector::default(); -/// docs_in_the_sixties.search(&*searcher, &mut count_collector)?; +/// docs_in_the_sixties.search(&searcher, &mut count_collector)?; /// /// let num_60s_books = count_collector.count(); /// @@ -333,7 +333,7 @@ mod tests { // ... or `1960..=1969` if inclusive range is enabled. let mut count_collector = CountCollector::default(); - docs_in_the_sixties.search(&*searcher, &mut count_collector)?; + docs_in_the_sixties.search(&searcher, &mut count_collector)?; assert_eq!(count_collector.count(), 2285); Ok(()) } @@ -371,7 +371,7 @@ mod tests { let count_multiples = |range_query: RangeQuery| { let mut count_collector = CountCollector::default(); range_query - .search(&*searcher, &mut count_collector) + .search(&searcher, &mut count_collector) .unwrap(); count_collector.count() }; diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index a3d2c48e3..8f4ec43b1 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -227,7 +227,7 @@ fn select_best_fragment_combination<'a>( /// let query = query_parser.parse_query("haleurs flamands").unwrap(); /// # index.load_searchers()?; /// # let searcher = index.searcher(); -/// let mut snippet_generator = SnippetGenerator::new(&*searcher, &*query, text_field)?; +/// let mut snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field)?; /// snippet_generator.set_max_num_chars(100); /// let snippet = snippet_generator.snippet_from_doc(&doc); /// let snippet_html: String = snippet.to_html(); @@ -464,7 +464,7 @@ Survey in 2016, 2017, and 2018."#; let searcher = index.searcher(); let query_parser = QueryParser::for_index(&index, vec![text_field]); let query = query_parser.parse_query("rust design").unwrap(); - let mut snippet_generator = SnippetGenerator::new(&*searcher, &*query, text_field).unwrap(); + let mut snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap(); { let snippet = snippet_generator.snippet(TEST_TEXT); assert_eq!(snippet.to_html(), "imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?],\nbut its designers intend it to provide better memory safety"); From 37e4280c0a62943f70b3cfbf83c72fd10e494973 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 15 Sep 2018 07:44:22 +0900 Subject: [PATCH 52/62] Cargo Format (#420) --- examples/basic_search.rs | 3 - examples/custom_tokenizer.rs | 2 - examples/deleting_updating_documents.rs | 9 +- examples/faceted_search.rs | 66 +++++----- examples/iterating_docs_and_positions.rs | 20 +-- examples/snippet.rs | 2 +- examples/stop_words.rs | 110 ++++++++-------- src/collector/facet_collector.rs | 25 ++-- src/collector/top_collector.rs | 6 +- src/collector/top_field_collector.rs | 18 +-- src/collector/top_score_collector.rs | 4 +- src/common/composite_file.rs | 3 +- src/common/vint.rs | 16 +-- src/core/index.rs | 72 +++++------ src/core/inverted_index_reader.rs | 10 +- src/core/pool.rs | 6 +- src/core/searcher.rs | 17 ++- src/core/segment_reader.rs | 19 ++- src/directory/directory.rs | 12 +- src/directory/managed_directory.rs | 15 ++- src/directory/mmap_directory.rs | 6 +- src/directory/ram_directory.rs | 10 +- src/directory/read_only_source.rs | 1 - src/error.rs | 16 ++- src/fastfield/delete.rs | 3 +- src/fastfield/facet_reader.rs | 3 +- src/fastfield/multivalued/writer.rs | 4 +- src/functional_test.rs | 2 +- src/indexer/delete_queue.rs | 12 +- src/indexer/directory_lock.rs | 51 +++----- src/indexer/index_writer.rs | 13 +- src/indexer/merge_policy.rs | 12 +- src/indexer/merger.rs | 3 +- src/indexer/segment_register.rs | 3 +- src/lib.rs | 5 +- src/positions/mod.rs | 28 ++-- src/positions/reader.rs | 61 +++++---- src/positions/serializer.rs | 11 +- src/postings/compression/mod.rs | 27 ++-- src/postings/compression/vint.rs | 12 +- src/postings/mod.rs | 17 ++- src/postings/postings_writer.rs | 3 +- src/postings/recorder.rs | 3 +- src/postings/segment_postings.rs | 156 +++++++++++++---------- src/postings/serializer.rs | 48 ++++--- src/postings/skip.rs | 27 ++-- src/query/boolean_query/boolean_query.rs | 6 +- src/query/empty_query.rs | 10 +- src/query/mod.rs | 4 +- src/query/occur.rs | 2 +- src/query/phrase_query/phrase_query.rs | 2 +- src/query/phrase_query/phrase_scorer.rs | 3 +- src/query/phrase_query/phrase_weight.rs | 1 - src/query/query.rs | 2 +- src/query/query_parser/query_grammar.rs | 42 +++--- src/query/query_parser/query_parser.rs | 121 +++++++++--------- src/query/query_parser/user_input_ast.rs | 24 ++-- src/query/range_query.rs | 5 +- src/query/scorer.rs | 1 - src/query/term_query/term_query.rs | 2 +- src/schema/schema.rs | 5 +- src/snippet/mod.rs | 82 ++++++------ src/store/mod.rs | 8 +- src/store/skiplist/skiplist_builder.rs | 3 +- src/termdict/term_info_store.rs | 1 - src/termdict/termdict.rs | 3 +- src/tokenizer/lower_caser.rs | 29 +++-- src/tokenizer/mod.rs | 2 +- src/tokenizer/raw_tokenizer.rs | 2 +- src/tokenizer/token_stream_chain.rs | 13 +- src/tokenizer/tokenizer.rs | 2 +- 71 files changed, 697 insertions(+), 650 deletions(-) diff --git a/examples/basic_search.rs b/examples/basic_search.rs index 35867b2f0..1aba7bf3f 100644 --- a/examples/basic_search.rs +++ b/examples/basic_search.rs @@ -10,7 +10,6 @@ // - search for the best document matchings "sea whale" // - retrieve the best document original content. - extern crate tempdir; // --- @@ -235,9 +234,7 @@ fn main() -> tantivy::Result<()> { println!("{}", schema.to_json(&retrieved_doc)); } - Ok(()) } - use tempdir::TempDir; diff --git a/examples/custom_tokenizer.rs b/examples/custom_tokenizer.rs index e44b82c57..7c5299e00 100644 --- a/examples/custom_tokenizer.rs +++ b/examples/custom_tokenizer.rs @@ -3,7 +3,6 @@ // In this example, we'll see how to define a tokenizer pipeline // by aligning a bunch of `TokenFilter`. - #[macro_use] extern crate tantivy; use tantivy::collector::TopCollector; @@ -12,7 +11,6 @@ use tantivy::schema::*; use tantivy::tokenizer::NgramTokenizer; use tantivy::Index; - fn main() -> tantivy::Result<()> { // # Defining the schema // diff --git a/examples/deleting_updating_documents.rs b/examples/deleting_updating_documents.rs index 9ddb38a59..de0603392 100644 --- a/examples/deleting_updating_documents.rs +++ b/examples/deleting_updating_documents.rs @@ -11,10 +11,9 @@ #[macro_use] extern crate tantivy; use tantivy::collector::TopCollector; +use tantivy::query::TermQuery; use tantivy::schema::*; use tantivy::Index; -use tantivy::query::TermQuery; - // A simple helper function to fetch a single document // given its id from our index. @@ -31,7 +30,7 @@ fn extract_doc_given_isbn(index: &Index, isbn_term: &Term) -> tantivy::Result tantivy::Result tantivy::Result<()> { - // # Defining the schema // // Check out the *basic_search* example if this makes @@ -126,7 +124,6 @@ fn main() -> tantivy::Result<()> { isbn => "978-9176370711", )); - // You are guaranteed that your clients will only observe your index in // the state it was in after a commit. // In this example, your search engine will at no point be missing the *Frankenstein* document. @@ -143,4 +140,4 @@ fn main() -> tantivy::Result<()> { ); Ok(()) -} \ No newline at end of file +} diff --git a/examples/faceted_search.rs b/examples/faceted_search.rs index 76d167778..24fd536e8 100644 --- a/examples/faceted_search.rs +++ b/examples/faceted_search.rs @@ -22,60 +22,60 @@ use tantivy::schema::*; use tantivy::Index; fn main() -> tantivy::Result<()> { - // Let's create a temporary directory for the - // sake of this example - let index_path = TempDir::new("tantivy_facet_example_dir")?; - let mut schema_builder = SchemaBuilder::default(); + // Let's create a temporary directory for the + // sake of this example + let index_path = TempDir::new("tantivy_facet_example_dir")?; + let mut schema_builder = SchemaBuilder::default(); - schema_builder.add_text_field("name", TEXT | STORED); + schema_builder.add_text_field("name", TEXT | STORED); - // this is our faceted field - schema_builder.add_facet_field("tags"); + // this is our faceted field + schema_builder.add_facet_field("tags"); - let schema = schema_builder.build(); + let schema = schema_builder.build(); - let index = Index::create_in_dir(&index_path, schema.clone())?; + let index = Index::create_in_dir(&index_path, schema.clone())?; - let mut index_writer = index.writer(50_000_000)?; + let mut index_writer = index.writer(50_000_000)?; - let name = schema.get_field("name").unwrap(); - let tags = schema.get_field("tags").unwrap(); + let name = schema.get_field("name").unwrap(); + let tags = schema.get_field("tags").unwrap(); - // For convenience, tantivy also comes with a macro to - // reduce the boilerplate above. - index_writer.add_document(doc!( + // For convenience, tantivy also comes with a macro to + // reduce the boilerplate above. + index_writer.add_document(doc!( name => "the ditch", tags => Facet::from("/pools/north") )); - index_writer.add_document(doc!( + index_writer.add_document(doc!( name => "little stacey", tags => Facet::from("/pools/south") )); - index_writer.commit()?; + index_writer.commit()?; - index.load_searchers()?; + index.load_searchers()?; - let searcher = index.searcher(); + let searcher = index.searcher(); - let mut facet_collector = FacetCollector::for_field(tags); - facet_collector.add_facet("/pools"); + let mut facet_collector = FacetCollector::for_field(tags); + facet_collector.add_facet("/pools"); - searcher.search(&AllQuery, &mut facet_collector).unwrap(); + searcher.search(&AllQuery, &mut facet_collector).unwrap(); - let counts = facet_collector.harvest(); - // This lists all of the facet counts - let facets: Vec<(&Facet, u64)> = counts.get("/pools").collect(); - assert_eq!( - facets, - vec![ - (&Facet::from("/pools/north"), 1), - (&Facet::from("/pools/south"), 1) - ] - ); + let counts = facet_collector.harvest(); + // This lists all of the facet counts + let facets: Vec<(&Facet, u64)> = counts.get("/pools").collect(); + assert_eq!( + facets, + vec![ + (&Facet::from("/pools/north"), 1), + (&Facet::from("/pools/south"), 1), + ] + ); - Ok(()) + Ok(()) } use tempdir::TempDir; diff --git a/examples/iterating_docs_and_positions.rs b/examples/iterating_docs_and_positions.rs index 9d3937617..0434f58c8 100644 --- a/examples/iterating_docs_and_positions.rs +++ b/examples/iterating_docs_and_positions.rs @@ -7,18 +7,15 @@ // the list of documents containing a term, getting // its term frequency, and accessing its positions. - // --- // Importing tantivy... #[macro_use] extern crate tantivy; use tantivy::schema::*; use tantivy::Index; -use tantivy::{DocSet, DocId, Postings}; +use tantivy::{DocId, DocSet, Postings}; fn main() -> tantivy::Result<()> { - - // We first create a schema for the sake of the // example. Check the `basic_search` example for more information. let mut schema_builder = SchemaBuilder::default(); @@ -47,7 +44,6 @@ fn main() -> tantivy::Result<()> { // there is actually only one segment here, but let's iterate through the list // anyway) for segment_reader in searcher.segment_readers() { - // A segment contains different data structure. // Inverted index stands for the combination of // - the term dictionary @@ -58,19 +54,18 @@ fn main() -> tantivy::Result<()> { // Let's go through all docs containing the term `title:the` and access their position let term_the = Term::from_field_text(title, "the"); - // This segment posting object is like a cursor over the documents matching the term. // The `IndexRecordOption` arguments tells tantivy we will be interested in both term frequencies // and positions. // // If you don't need all this information, you may get better performance by decompressing less // information. - if let Some(mut segment_postings) = inverted_index.read_postings(&term_the, IndexRecordOption::WithFreqsAndPositions) { - + if let Some(mut segment_postings) = + inverted_index.read_postings(&term_the, IndexRecordOption::WithFreqsAndPositions) + { // this buffer will be used to request for positions let mut positions: Vec = Vec::with_capacity(100); while segment_postings.advance() { - // the number of time the term appears in the document. let doc_id: DocId = segment_postings.doc(); //< do not try to access this before calling advance once. @@ -98,7 +93,6 @@ fn main() -> tantivy::Result<()> { } } - // A `Term` is a text token associated with a field. // Let's go through all docs containing the term `title:the` and access their position let term_the = Term::from_field_text(title, "the"); @@ -111,7 +105,6 @@ fn main() -> tantivy::Result<()> { // Also, for some VERY specific high performance use case like an OLAP analysis of logs, // you can get better performance by accessing directly the blocks of doc ids. for segment_reader in searcher.segment_readers() { - // A segment contains different data structure. // Inverted index stands for the combination of // - the term dictionary @@ -124,7 +117,9 @@ fn main() -> tantivy::Result<()> { // // If you don't need all this information, you may get better performance by decompressing less // information. - if let Some(mut block_segment_postings) = inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic) { + if let Some(mut block_segment_postings) = + inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic) + { while block_segment_postings.advance() { // Once again these docs MAY contains deleted documents as well. let docs = block_segment_postings.docs(); @@ -136,4 +131,3 @@ fn main() -> tantivy::Result<()> { Ok(()) } - diff --git a/examples/snippet.rs b/examples/snippet.rs index 814edd62d..a39ba2016 100644 --- a/examples/snippet.rs +++ b/examples/snippet.rs @@ -66,6 +66,6 @@ fn main() -> tantivy::Result<()> { println!("title: {}", doc.get_first(title).unwrap().text().unwrap()); println!("snippet: {}", snippet.to_html()); } - + Ok(()) } diff --git a/examples/stop_words.rs b/examples/stop_words.rs index b131d876c..8945f8614 100644 --- a/examples/stop_words.rs +++ b/examples/stop_words.rs @@ -22,59 +22,59 @@ use tantivy::tokenizer::*; use tantivy::Index; fn main() -> tantivy::Result<()> { - // this example assumes you understand the content in `basic_search` - let mut schema_builder = SchemaBuilder::default(); + // this example assumes you understand the content in `basic_search` + let mut schema_builder = SchemaBuilder::default(); - // This configures your custom options for how tantivy will - // store and process your content in the index; The key - // to note is that we are setting the tokenizer to `stoppy` - // which will be defined and registered below. - let text_field_indexing = TextFieldIndexing::default() - .set_tokenizer("stoppy") - .set_index_option(IndexRecordOption::WithFreqsAndPositions); - let text_options = TextOptions::default() - .set_indexing_options(text_field_indexing) - .set_stored(); + // This configures your custom options for how tantivy will + // store and process your content in the index; The key + // to note is that we are setting the tokenizer to `stoppy` + // which will be defined and registered below. + let text_field_indexing = TextFieldIndexing::default() + .set_tokenizer("stoppy") + .set_index_option(IndexRecordOption::WithFreqsAndPositions); + let text_options = TextOptions::default() + .set_indexing_options(text_field_indexing) + .set_stored(); - // Our first field is title. - schema_builder.add_text_field("title", text_options); + // Our first field is title. + schema_builder.add_text_field("title", text_options); - // Our second field is body. - let text_field_indexing = TextFieldIndexing::default() - .set_tokenizer("stoppy") - .set_index_option(IndexRecordOption::WithFreqsAndPositions); - let text_options = TextOptions::default() - .set_indexing_options(text_field_indexing) - .set_stored(); - schema_builder.add_text_field("body", text_options); + // Our second field is body. + let text_field_indexing = TextFieldIndexing::default() + .set_tokenizer("stoppy") + .set_index_option(IndexRecordOption::WithFreqsAndPositions); + let text_options = TextOptions::default() + .set_indexing_options(text_field_indexing) + .set_stored(); + schema_builder.add_text_field("body", text_options); - let schema = schema_builder.build(); + let schema = schema_builder.build(); - let index = Index::create_in_ram(schema.clone()); + let index = Index::create_in_ram(schema.clone()); - // This tokenizer lowers all of the text (to help with stop word matching) - // then removes all instances of `the` and `and` from the corpus - let tokenizer = SimpleTokenizer - .filter(LowerCaser) - .filter(StopWordFilter::remove(vec![ - "the".to_string(), - "and".to_string(), - ])); + // This tokenizer lowers all of the text (to help with stop word matching) + // then removes all instances of `the` and `and` from the corpus + let tokenizer = SimpleTokenizer + .filter(LowerCaser) + .filter(StopWordFilter::remove(vec![ + "the".to_string(), + "and".to_string(), + ])); - index.tokenizers().register("stoppy", tokenizer); + index.tokenizers().register("stoppy", tokenizer); - let mut index_writer = index.writer(50_000_000)?; + let mut index_writer = index.writer(50_000_000)?; - let title = schema.get_field("title").unwrap(); - let body = schema.get_field("body").unwrap(); + let title = schema.get_field("title").unwrap(); + let body = schema.get_field("body").unwrap(); - index_writer.add_document(doc!( + index_writer.add_document(doc!( title => "The Old Man and the Sea", body => "He was an old man who fished alone in a skiff in the Gulf Stream and \ he had gone eighty-four days now without taking a fish." )); - index_writer.add_document(doc!( + index_writer.add_document(doc!( title => "Of Mice and Men", body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ bank and runs deep and green. The water is warm too, for it has slipped twinkling \ @@ -86,7 +86,7 @@ fn main() -> tantivy::Result<()> { limbs and branches that arch over the pool" )); - index_writer.add_document(doc!( + index_writer.add_document(doc!( title => "Frankenstein", body => "You will rejoice to hear that no disaster has accompanied the commencement of an \ enterprise which you have regarded with such evil forebodings. I arrived here \ @@ -94,28 +94,28 @@ fn main() -> tantivy::Result<()> { increasing confidence in the success of my undertaking." )); - index_writer.commit()?; + index_writer.commit()?; - index.load_searchers()?; + index.load_searchers()?; - let searcher = index.searcher(); + let searcher = index.searcher(); - let query_parser = QueryParser::for_index(&index, vec![title, body]); + let query_parser = QueryParser::for_index(&index, vec![title, body]); - // stop words are applied on the query as well. - // The following will be equivalent to `title:frankenstein` - let query = query_parser.parse_query("title:\"the Frankenstein\"")?; + // stop words are applied on the query as well. + // The following will be equivalent to `title:frankenstein` + let query = query_parser.parse_query("title:\"the Frankenstein\"")?; - let mut top_collector = TopCollector::with_limit(10); + let mut top_collector = TopCollector::with_limit(10); - searcher.search(&*query, &mut top_collector)?; + searcher.search(&*query, &mut top_collector)?; - let doc_addresses = top_collector.docs(); + let doc_addresses = top_collector.docs(); - for doc_address in doc_addresses { - let retrieved_doc = searcher.doc(&doc_address)?; - println!("{}", schema.to_json(&retrieved_doc)); - } + for doc_address in doc_addresses { + let retrieved_doc = searcher.doc(&doc_address)?; + println!("{}", schema.to_json(&retrieved_doc)); + } - Ok(()) -} \ No newline at end of file + Ok(()) +} diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs index 6c0bb647d..8e1c95876 100644 --- a/src/collector/facet_collector.rs +++ b/src/collector/facet_collector.rs @@ -342,16 +342,19 @@ impl FacetCollector { pub fn harvest(mut self) -> FacetCounts { self.finalize_segment(); - let collapsed_facet_ords: Vec<&[u64]> = self.segment_counters + let collapsed_facet_ords: Vec<&[u64]> = self + .segment_counters .iter() .map(|segment_counter| &segment_counter.facet_ords[..]) .collect(); - let collapsed_facet_counts: Vec<&[u64]> = self.segment_counters + let collapsed_facet_counts: Vec<&[u64]> = self + .segment_counters .iter() .map(|segment_counter| &segment_counter.facet_counts[..]) .collect(); - let facet_streams = self.segment_counters + let facet_streams = self + .segment_counters .iter() .map(|seg_counts| seg_counts.facet_reader.facet_dict().range().into_stream()) .collect::>(); @@ -402,7 +405,8 @@ impl Collector for FacetCollector { fn collect(&mut self, doc: DocId, _: Score) { let facet_reader: &mut FacetReader = unsafe { - &mut *self.ff_reader + &mut *self + .ff_reader .as_ref() .expect("collect() was called before set_segment. This should never happen.") .get() @@ -476,9 +480,8 @@ impl FacetCounts { heap.push(Hit { count, facet }); } - let mut lowest_count: u64 = heap.peek().map(|hit| hit.count) - .unwrap_or(u64::MIN); //< the `unwrap_or` case may be triggered but the value - // is never used in that case. + let mut lowest_count: u64 = heap.peek().map(|hit| hit.count).unwrap_or(u64::MIN); //< the `unwrap_or` case may be triggered but the value + // is never used in that case. for (facet, count) in it { if count > lowest_count { @@ -619,7 +622,13 @@ mod tests { let doc = doc!(facet_field => facet); iter::repeat(doc).take(count) }) - .map(|mut doc| { doc.add_facet(facet_field, &format!("/facet/{}", thread_rng().sample(&uniform) )); doc}) + .map(|mut doc| { + doc.add_facet( + facet_field, + &format!("/facet/{}", thread_rng().sample(&uniform)), + ); + doc + }) .collect(); thread_rng().shuffle(&mut docs[..]); diff --git a/src/collector/top_collector.rs b/src/collector/top_collector.rs index 64d2eee7f..6cb61e8b2 100644 --- a/src/collector/top_collector.rs +++ b/src/collector/top_collector.rs @@ -1,8 +1,8 @@ +use std::cmp::Ordering; +use std::collections::BinaryHeap; use DocAddress; use DocId; use SegmentLocalId; -use std::cmp::Ordering; -use std::collections::BinaryHeap; /// Contains a feature (field, score, etc.) of a document along with the document address. /// @@ -139,9 +139,9 @@ impl TopCollector { #[cfg(test)] mod tests { + use super::*; use DocId; use Score; - use super::*; #[test] fn test_top_collector_not_at_capacity() { diff --git a/src/collector/top_field_collector.rs b/src/collector/top_field_collector.rs index ec2361b3f..3fb95d21a 100644 --- a/src/collector/top_field_collector.rs +++ b/src/collector/top_field_collector.rs @@ -1,13 +1,13 @@ +use super::Collector; use collector::top_collector::TopCollector; -use DocAddress; -use DocId; use fastfield::FastFieldReader; use fastfield::FastValue; +use schema::Field; +use DocAddress; +use DocId; use Result; use Score; use SegmentReader; -use super::Collector; -use schema::Field; /// The Top Field Collector keeps track of the K documents /// sorted by a fast field in the index @@ -142,16 +142,16 @@ impl Collector for TopFieldCollector { #[cfg(test)] mod tests { - use Index; - use IndexWriter; - use TantivyError; + use super::*; use query::Query; use query::QueryParser; - use schema::{FAST, SchemaBuilder, TEXT}; use schema::Field; use schema::IntOptions; use schema::Schema; - use super::*; + use schema::{SchemaBuilder, FAST, TEXT}; + use Index; + use IndexWriter; + use TantivyError; const TITLE: &str = "title"; const SIZE: &str = "size"; diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs index 4a8ace86b..68bf114f6 100644 --- a/src/collector/top_score_collector.rs +++ b/src/collector/top_score_collector.rs @@ -1,3 +1,4 @@ +use super::Collector; use collector::top_collector::TopCollector; use DocAddress; use DocId; @@ -5,7 +6,6 @@ use Result; use Score; use SegmentLocalId; use SegmentReader; -use super::Collector; /// The Top Score Collector keeps track of the K documents /// sorted by their score. @@ -131,10 +131,10 @@ impl Collector for TopScoreCollector { #[cfg(test)] mod tests { + use super::*; use collector::Collector; use DocId; use Score; - use super::*; #[test] fn test_top_collector_not_at_capacity() { diff --git a/src/common/composite_file.rs b/src/common/composite_file.rs index 2f3f71a47..e7d657b65 100644 --- a/src/common/composite_file.rs +++ b/src/common/composite_file.rs @@ -72,7 +72,8 @@ impl CompositeWrite { let footer_offset = self.write.written_bytes(); VInt(self.offsets.len() as u64).serialize(&mut self.write)?; - let mut offset_fields: Vec<_> = self.offsets + let mut offset_fields: Vec<_> = self + .offsets .iter() .map(|(file_addr, offset)| (*offset, *file_addr)) .collect(); diff --git a/src/common/vint.rs b/src/common/vint.rs index 308aff1ca..7b782a946 100644 --- a/src/common/vint.rs +++ b/src/common/vint.rs @@ -10,8 +10,6 @@ pub struct VInt(pub u64); const STOP_BIT: u8 = 128; impl VInt { - - pub fn val(&self) -> u64 { self.0 } @@ -20,14 +18,13 @@ impl VInt { VInt::deserialize(reader).map(|vint| vint.0) } - pub fn serialize_into_vec(&self, output: &mut Vec){ + pub fn serialize_into_vec(&self, output: &mut Vec) { let mut buffer = [0u8; 10]; let num_bytes = self.serialize_into(&mut buffer); output.extend(&buffer[0..num_bytes]); } fn serialize_into(&self, buffer: &mut [u8; 10]) -> usize { - let mut remaining = self.0; for (i, b) in buffer.iter_mut().enumerate() { let next_byte: u8 = (remaining % 128u64) as u8; @@ -74,7 +71,6 @@ impl BinarySerializable for VInt { } } - #[cfg(test)] mod tests { @@ -89,10 +85,10 @@ mod tests { } assert!(num_bytes > 0); if num_bytes < 10 { - assert!(1u64 << (7*num_bytes) > val); + assert!(1u64 << (7 * num_bytes) > val); } if num_bytes > 1 { - assert!(1u64 << (7*(num_bytes-1)) <= val); + assert!(1u64 << (7 * (num_bytes - 1)) <= val); } let serdeser_val = VInt::deserialize(&mut &v[..]).unwrap(); assert_eq!(val, serdeser_val.0); @@ -105,11 +101,11 @@ mod tests { aux_test_vint(5); aux_test_vint(u64::max_value()); for i in 1..9 { - let power_of_128 = 1u64 << (7*i); + let power_of_128 = 1u64 << (7 * i); aux_test_vint(power_of_128 - 1u64); - aux_test_vint(power_of_128 ); + aux_test_vint(power_of_128); aux_test_vint(power_of_128 + 1u64); } aux_test_vint(10); } -} \ No newline at end of file +} diff --git a/src/core/index.rs b/src/core/index.rs index a7bef841d..3eafb90cc 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -1,36 +1,36 @@ -use core::SegmentId; -use error::TantivyError; -use schema::Schema; -use serde_json; -use std::borrow::BorrowMut; -use std::fmt; -use std::sync::atomic::{AtomicUsize, Ordering}; -use std::sync::Arc; -use Result; -use indexer::LockType; use super::pool::LeasedItem; use super::pool::Pool; use super::segment::create_segment; use super::segment::Segment; use core::searcher::Searcher; use core::IndexMeta; +use core::SegmentId; use core::SegmentMeta; use core::SegmentReader; use core::META_FILEPATH; +use directory::ManagedDirectory; #[cfg(feature = "mmap")] use directory::MmapDirectory; use directory::{Directory, RAMDirectory}; -use directory::{ManagedDirectory}; +use error::TantivyError; use indexer::index_writer::open_index_writer; use indexer::index_writer::HEAP_SIZE_MIN; use indexer::segment_updater::save_new_metas; +use indexer::LockType; use num_cpus; +use schema::Field; +use schema::FieldType; +use schema::Schema; +use serde_json; +use std::borrow::BorrowMut; +use std::fmt; use std::path::Path; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; +use tokenizer::BoxedTokenizer; use tokenizer::TokenizerManager; use IndexWriter; -use schema::FieldType; -use schema::Field; -use tokenizer::BoxedTokenizer; +use Result; fn load_metas(directory: &Directory) -> Result { let meta_data = directory.atomic_read(&META_FILEPATH)?; @@ -115,31 +115,24 @@ impl Index { &self.tokenizers } - /// Helper to access the tokenizer associated to a specific field. pub fn tokenizer_for_field(&self, field: Field) -> Result> { let field_entry = self.schema.get_field_entry(field); let field_type = field_entry.field_type(); let tokenizer_manager: &TokenizerManager = self.tokenizers(); - let tokenizer_name_opt: Option> = - match field_type { - FieldType::Str(text_options) => { - text_options - .get_indexing_options() - .map(|text_indexing_options| text_indexing_options.tokenizer().to_string()) - .and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name)) - }, - _ => { - None - } - }; + let tokenizer_name_opt: Option> = match field_type { + FieldType::Str(text_options) => text_options + .get_indexing_options() + .map(|text_indexing_options| text_indexing_options.tokenizer().to_string()) + .and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name)), + _ => None, + }; match tokenizer_name_opt { - Some(tokenizer) => { - Ok(tokenizer) - } - None => { - Err(TantivyError:: SchemaError(format!("{:?} is not a text field.", field_entry.name()))) - } + Some(tokenizer) => Ok(tokenizer), + None => Err(TantivyError::SchemaError(format!( + "{:?} is not a text field.", + field_entry.name() + ))), } } @@ -186,7 +179,6 @@ impl Index { num_threads: usize, overall_heap_size_in_bytes: usize, ) -> Result { - let directory_lock = LockType::IndexWriterLock.acquire_lock(&self.directory)?; let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads; open_index_writer( @@ -225,7 +217,8 @@ impl Index { /// Returns the list of segments that are searchable pub fn searchable_segments(&self) -> Result> { - Ok(self.searchable_segment_metas()? + Ok(self + .searchable_segment_metas()? .into_iter() .map(|segment_meta| self.segment(segment_meta)) .collect()) @@ -260,7 +253,8 @@ impl Index { /// Returns the list of segment ids that are searchable. pub fn searchable_segment_ids(&self) -> Result> { - Ok(self.searchable_segment_metas()? + Ok(self + .searchable_segment_metas()? .iter() .map(|segment_meta| segment_meta.id()) .collect()) @@ -332,11 +326,10 @@ impl Clone for Index { } } - #[cfg(test)] mod tests { + use schema::{SchemaBuilder, INT_INDEXED, TEXT}; use Index; - use schema::{SchemaBuilder, TEXT, INT_INDEXED}; #[test] fn test_indexer_for_field() { @@ -352,5 +345,4 @@ mod tests { ); } - -} \ No newline at end of file +} diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs index b919e09b0..bb71be1ae 100644 --- a/src/core/inverted_index_reader.rs +++ b/src/core/inverted_index_reader.rs @@ -1,13 +1,13 @@ use common::BinarySerializable; use directory::ReadOnlySource; +use owned_read::OwnedRead; +use positions::PositionReader; use postings::TermInfo; use postings::{BlockSegmentPostings, SegmentPostings}; use schema::FieldType; use schema::IndexRecordOption; use schema::Term; use termdict::TermDictionary; -use owned_read::OwnedRead; -use positions::PositionReader; /// The inverted index reader is in charge of accessing /// the inverted index associated to a specific field. @@ -100,7 +100,6 @@ impl InvertedIndexReader { block_postings.reset(term_info.doc_freq, postings_reader); } - /// Returns a block postings given a `Term`. /// This method is for an advanced usage only. /// @@ -111,7 +110,7 @@ impl InvertedIndexReader { option: IndexRecordOption, ) -> Option { self.get_term_info(term) - .map(move|term_info| self.read_block_postings_from_terminfo(&term_info, option)) + .map(move |term_info| self.read_block_postings_from_terminfo(&term_info, option)) } /// Returns a block postings given a `term_info`. @@ -147,7 +146,8 @@ impl InvertedIndexReader { if option.has_positions() { let position_reader = self.positions_source.clone(); let skip_reader = self.positions_idx_source.clone(); - let position_reader = PositionReader::new(position_reader, skip_reader, term_info.positions_idx); + let position_reader = + PositionReader::new(position_reader, skip_reader, term_info.positions_idx); Some(position_reader) } else { None diff --git a/src/core/pool.rs b/src/core/pool.rs index 609848317..d8564e46d 100644 --- a/src/core/pool.rs +++ b/src/core/pool.rs @@ -87,7 +87,8 @@ impl Deref for LeasedItem { type Target = T; fn deref(&self) -> &T { - &self.gen_item + &self + .gen_item .as_ref() .expect("Unwrapping a leased item should never fail") .item // unwrap is safe here @@ -96,7 +97,8 @@ impl Deref for LeasedItem { impl DerefMut for LeasedItem { fn deref_mut(&mut self) -> &mut T { - &mut self.gen_item + &mut self + .gen_item .as_mut() .expect("Unwrapping a mut leased item should never fail") .item // unwrap is safe here diff --git a/src/core/searcher.rs b/src/core/searcher.rs index f17df042f..cbe549062 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -9,8 +9,8 @@ use std::fmt; use std::sync::Arc; use termdict::TermMerger; use DocAddress; -use Result; use Index; +use Result; /// Holds a list of `SegmentReader`s ready for search. /// @@ -25,7 +25,11 @@ pub struct Searcher { impl Searcher { /// Creates a new `Searcher` - pub(crate) fn new(schema: Schema, index: Index, segment_readers: Vec) -> Searcher { + pub(crate) fn new( + schema: Schema, + index: Index, + segment_readers: Vec, + ) -> Searcher { Searcher { schema, index, @@ -87,7 +91,8 @@ impl Searcher { /// Return the field searcher associated to a `Field`. pub fn field(&self, field: Field) -> FieldSearcher { - let inv_index_readers = self.segment_readers + let inv_index_readers = self + .segment_readers .iter() .map(|segment_reader| segment_reader.inverted_index(field)) .collect::>(); @@ -107,7 +112,8 @@ impl FieldSearcher { /// Returns a Stream over all of the sorted unique terms of /// for the given field. pub fn terms(&self) -> TermMerger { - let term_streamers: Vec<_> = self.inv_index_readers + let term_streamers: Vec<_> = self + .inv_index_readers .iter() .map(|inverted_index| inverted_index.terms().stream()) .collect(); @@ -117,7 +123,8 @@ impl FieldSearcher { impl fmt::Debug for Searcher { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let segment_ids = self.segment_readers + let segment_ids = self + .segment_readers .iter() .map(|segment_reader| segment_reader.segment_id()) .collect::>(); diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 56a3a7b9e..dff6cca48 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -157,11 +157,13 @@ impl SegmentReader { &FieldType::Bytes => {} _ => return Err(FastFieldNotAvailableError::new(field_entry)), } - let idx_reader = self.fast_fields_composite + let idx_reader = self + .fast_fields_composite .open_read_with_idx(field, 0) .ok_or_else(|| FastFieldNotAvailableError::new(field_entry)) .map(FastFieldReader::open)?; - let values = self.fast_fields_composite + let values = self + .fast_fields_composite .open_read_with_idx(field, 1) .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?; Ok(BytesFastFieldReader::open(idx_reader, values)) @@ -285,7 +287,8 @@ impl SegmentReader { /// term dictionary associated to a specific field, /// and opening the posting list associated to any term. pub fn inverted_index(&self, field: Field) -> Arc { - if let Some(inv_idx_reader) = self.inv_idx_reader_cache + if let Some(inv_idx_reader) = self + .inv_idx_reader_cache .read() .expect("Lock poisoned. This should never happen") .get(&field) @@ -314,15 +317,18 @@ impl SegmentReader { let postings_source = postings_source_opt.unwrap(); - let termdict_source = self.termdict_composite + let termdict_source = self + .termdict_composite .open_read(field) .expect("Failed to open field term dictionary in composite file. Is the field indexed"); - let positions_source = self.positions_composite + let positions_source = self + .positions_composite .open_read(field) .expect("Index corrupted. Failed to open field positions in composite file."); - let positions_idx_source = self.positions_idx_composite + let positions_idx_source = self + .positions_idx_composite .open_read(field) .expect("Index corrupted. Failed to open field positions in composite file."); @@ -435,7 +441,6 @@ mod test { use schema::{SchemaBuilder, Term, STORED, TEXT}; use DocId; - #[test] fn test_alive_docs_iterator() { let mut schema_builder = SchemaBuilder::new(); diff --git a/src/directory/directory.rs b/src/directory/directory.rs index 596cdc492..0f99be74b 100644 --- a/src/directory/directory.rs +++ b/src/directory/directory.rs @@ -77,15 +77,15 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static { /// DirectoryClone pub trait DirectoryClone { - /// Clones the directory and boxes the clone - fn box_clone(&self) -> Box; + /// Clones the directory and boxes the clone + fn box_clone(&self) -> Box; } impl DirectoryClone for T where - T: 'static + Directory + Clone, + T: 'static + Directory + Clone, { - fn box_clone(&self) -> Box { - Box::new(self.clone()) - } + fn box_clone(&self) -> Box { + Box::new(self.clone()) + } } diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index e5510d113..7a9a8bd15 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -2,6 +2,7 @@ use core::MANAGED_FILEPATH; use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError}; use directory::{ReadOnlySource, WritePtr}; use error::TantivyError; +use indexer::LockType; use serde_json; use std::collections::HashSet; use std::io; @@ -12,9 +13,6 @@ use std::sync::RwLockWriteGuard; use std::sync::{Arc, RwLock}; use Directory; use Result; -use indexer::LockType; - - /// Returns true iff the file is "managed". /// Non-managed file are not subject to garbage collection. @@ -108,7 +106,8 @@ impl ManagedDirectory { // // releasing the lock as .delete() will use it too. { - let meta_informations_rlock = self.meta_informations + let meta_informations_rlock = self + .meta_informations .read() .expect("Managed directory rlock poisoned in garbage collect."); @@ -157,7 +156,8 @@ impl ManagedDirectory { if !deleted_files.is_empty() { // update the list of managed files by removing // the file that were removed. - let mut meta_informations_wlock = self.meta_informations + let mut meta_informations_wlock = self + .meta_informations .write() .expect("Managed directory wlock poisoned (2)."); { @@ -186,9 +186,10 @@ impl ManagedDirectory { fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> { // Files starting by "." (e.g. lock files) are not managed. if !is_managed(filepath) { - return Ok(()); + return Ok(()); } - let mut meta_wlock = self.meta_informations + let mut meta_wlock = self + .meta_informations .write() .expect("Managed file lock poisoned"); let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned()); diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs index 05fa18793..619e0fd19 100644 --- a/src/directory/mmap_directory.rs +++ b/src/directory/mmap_directory.rs @@ -32,7 +32,8 @@ fn open_mmap(full_path: &Path) -> result::Result, OpenReadE } })?; - let meta_data = file.metadata() + let meta_data = file + .metadata() .map_err(|e| IOError::with_path(full_path.to_owned(), e))?; if meta_data.len() == 0 { // if the file size is 0, it will not be possible @@ -309,7 +310,8 @@ impl Directory for MmapDirectory { // when the last reference is gone. mmap_cache.cache.remove(&full_path); match fs::remove_file(&full_path) { - Ok(_) => self.sync_directory() + Ok(_) => self + .sync_directory() .map_err(|e| IOError::with_path(path.to_owned(), e).into()), Err(e) => { if e.kind() == io::ErrorKind::NotFound { diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index 1b40970b4..2f1733e0f 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -170,7 +170,8 @@ impl Directory for RAMDirectory { let path_buf = PathBuf::from(path); let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone()); - let exists = self.fs + let exists = self + .fs .write(path_buf.clone(), &Vec::new()) .map_err(|err| IOError::with_path(path.to_owned(), err))?; // force the creation of the file to mimic the MMap directory. @@ -195,9 +196,10 @@ impl Directory for RAMDirectory { } fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> { - fail_point!("RAMDirectory::atomic_write", |msg| { - Err(io::Error::new(io::ErrorKind::Other, msg.unwrap_or("Undefined".to_string()))) - }); + fail_point!("RAMDirectory::atomic_write", |msg| Err(io::Error::new( + io::ErrorKind::Other, + msg.unwrap_or("Undefined".to_string()) + ))); let path_buf = PathBuf::from(path); let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone()); self.fs.write(path_buf, &Vec::new())?; diff --git a/src/directory/read_only_source.rs b/src/directory/read_only_source.rs index d2e9358d4..6ed2049e5 100644 --- a/src/directory/read_only_source.rs +++ b/src/directory/read_only_source.rs @@ -5,7 +5,6 @@ use fst::raw::MmapReadOnly; use stable_deref_trait::{CloneStableDeref, StableDeref}; use std::ops::Deref; - /// Read object that represents files in tantivy. /// /// These read objects are only in charge to deliver diff --git a/src/error.rs b/src/error.rs index ddde26789..8509d28cd 100644 --- a/src/error.rs +++ b/src/error.rs @@ -4,9 +4,9 @@ use std::io; use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError}; use fastfield::FastFieldNotAvailableError; +use indexer::LockType; use query; use schema; -use indexer::LockType; use serde_json; use std::path::PathBuf; use std::sync::PoisonError; @@ -21,7 +21,10 @@ pub enum TantivyError { #[fail(display = "file already exists: '{:?}'", _0)] FileAlreadyExists(PathBuf), /// Failed to acquire file lock - #[fail(display = "Failed to acquire Lockfile: {:?}. Possible causes: another IndexWriter instance or panic during previous lock drop.", _0)] + #[fail( + display = "Failed to acquire Lockfile: {:?}. Possible causes: another IndexWriter instance or panic during previous lock drop.", + _0 + )] LockFailure(LockType), /// IO Error. #[fail(display = "an IO error occurred: '{}'", _0)] @@ -95,14 +98,13 @@ impl From for TantivyError { } } - impl From for TantivyError { fn from(error: OpenWriteError) -> TantivyError { match error { - OpenWriteError::FileAlreadyExists(filepath) => - TantivyError::FileAlreadyExists(filepath), - OpenWriteError::IOError(io_error) => - TantivyError::IOError(io_error), + OpenWriteError::FileAlreadyExists(filepath) => { + TantivyError::FileAlreadyExists(filepath) + } + OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error), }.into() } } diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs index 3f8a0eb5b..15ed658ce 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/delete.rs @@ -41,7 +41,8 @@ pub struct DeleteBitSet { impl DeleteBitSet { /// Opens a delete bitset given its data source. pub fn open(data: ReadOnlySource) -> DeleteBitSet { - let num_deleted: usize = data.as_slice() + let num_deleted: usize = data + .as_slice() .iter() .map(|b| b.count_ones() as usize) .sum(); diff --git a/src/fastfield/facet_reader.rs b/src/fastfield/facet_reader.rs index 182b17989..92a917089 100644 --- a/src/fastfield/facet_reader.rs +++ b/src/fastfield/facet_reader.rs @@ -56,7 +56,8 @@ impl FacetReader { /// Given a term ordinal returns the term associated to it. pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) { - let found_term = self.term_dict + let found_term = self + .term_dict .ord_to_term(facet_ord as u64, output.inner_buffer_mut()); assert!(found_term, "Term ordinal {} no found.", facet_ord); } diff --git a/src/fastfield/multivalued/writer.rs b/src/fastfield/multivalued/writer.rs index 9177ddcd9..e5fd45203 100644 --- a/src/fastfield/multivalued/writer.rs +++ b/src/fastfield/multivalued/writer.rs @@ -132,7 +132,8 @@ impl MultiValueIntFastFieldWriter { ); let mut doc_vals: Vec = Vec::with_capacity(100); - for (start, stop) in self.doc_index + for (start, stop) in self + .doc_index .windows(2) .map(|interval| (interval[0], interval[1])) .chain(Some(last_interval).into_iter()) @@ -148,7 +149,6 @@ impl MultiValueIntFastFieldWriter { value_serializer.add_val(val)?; } } - } None => { let val_min_max = self.vals.iter().cloned().minmax(); diff --git a/src/functional_test.rs b/src/functional_test.rs index af7b1883a..9905f1d6e 100644 --- a/src/functional_test.rs +++ b/src/functional_test.rs @@ -1,8 +1,8 @@ use rand::thread_rng; use std::collections::HashSet; -use rand::Rng; use rand::distributions::Range; +use rand::Rng; use schema::*; use Index; use Searcher; diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index 4c2597fbb..f921b7523 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -52,7 +52,8 @@ impl DeleteQueue { // // Past delete operations are not accessible. pub fn cursor(&self) -> DeleteCursor { - let last_block = self.inner + let last_block = self + .inner .read() .expect("Read lock poisoned when opening delete queue cursor") .last_block @@ -92,7 +93,8 @@ impl DeleteQueue { // be some unflushed operations. // fn flush(&self) -> Option> { - let mut self_wlock = self.inner + let mut self_wlock = self + .inner .write() .expect("Failed to acquire write lock on delete queue writer"); @@ -132,7 +134,8 @@ impl From for NextBlock { impl NextBlock { fn next_block(&self) -> Option> { { - let next_read_lock = self.0 + let next_read_lock = self + .0 .read() .expect("Failed to acquire write lock in delete queue"); if let InnerNextBlock::Closed(ref block) = *next_read_lock { @@ -141,7 +144,8 @@ impl NextBlock { } let next_block; { - let mut next_write_lock = self.0 + let mut next_write_lock = self + .0 .write() .expect("Failed to acquire write lock in delete queue"); match *next_write_lock { diff --git a/src/indexer/directory_lock.rs b/src/indexer/directory_lock.rs index 4dbaa9ed4..9555234bb 100644 --- a/src/indexer/directory_lock.rs +++ b/src/indexer/directory_lock.rs @@ -1,10 +1,10 @@ use directory::error::OpenWriteError; -use Directory; -use TantivyError; +use std::io::Write; use std::path::{Path, PathBuf}; use std::thread; use std::time::Duration; -use std::io::Write; +use Directory; +use TantivyError; #[derive(Debug, Clone, Copy)] pub enum LockType { @@ -29,10 +29,9 @@ pub enum LockType { /// is very simplistic. We retry after `100ms` until we effectively /// acquire the lock. /// This lock should not have much contention in normal usage. - MetaLock + MetaLock, } - /// Retry the logic of acquiring locks is pretty simple. /// We just retry `n` times after a given `duratio`, both /// depending on the type of lock. @@ -49,7 +48,7 @@ impl RetryPolicy { } } - fn wait_and_retry(&mut self,) -> bool { + fn wait_and_retry(&mut self) -> bool { if self.num_retries == 0 { false } else { @@ -58,35 +57,26 @@ impl RetryPolicy { thread::sleep(wait_duration); true } - } } impl LockType { - fn retry_policy(&self) -> RetryPolicy { match *self { - LockType::IndexWriterLock => - RetryPolicy::no_retry(), - LockType::MetaLock => - RetryPolicy { - num_retries: 100, - wait_in_ms: 100, - } + LockType::IndexWriterLock => RetryPolicy::no_retry(), + LockType::MetaLock => RetryPolicy { + num_retries: 100, + wait_in_ms: 100, + }, } } fn try_acquire_lock(&self, directory: &mut Directory) -> Result { let path = self.filename(); - let mut write = directory - .open_write(path) - .map_err(|e| - match e { - OpenWriteError::FileAlreadyExists(_) => - TantivyError::LockFailure(*self), - OpenWriteError::IOError(io_error) => - TantivyError::IOError(io_error), - })?; + let mut write = directory.open_write(path).map_err(|e| match e { + OpenWriteError::FileAlreadyExists(_) => TantivyError::LockFailure(*self), + OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error), + })?; write.flush()?; Ok(DirectoryLock { directory: directory.box_clone(), @@ -94,7 +84,6 @@ impl LockType { }) } - /// Acquire a lock in the given directory. pub fn acquire_lock(&self, directory: &Directory) -> Result { let mut box_directory = directory.box_clone(); @@ -110,25 +99,19 @@ impl LockType { return Err(TantivyError::LockFailure(filepath.to_owned())); } } - Err(_) => { - } + Err(_) => {} } } } fn filename(&self) -> &Path { match *self { - LockType::MetaLock => { - Path::new(".tantivy-meta.lock") - } - LockType::IndexWriterLock => { - Path::new(".tantivy-indexer.lock") - } + LockType::MetaLock => Path::new(".tantivy-meta.lock"), + LockType::IndexWriterLock => Path::new(".tantivy-indexer.lock"), } } } - /// The `DirectoryLock` is an object that represents a file lock. /// See [`LockType`](struct.LockType.html) /// diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 3e11c4ce5..2b791ecec 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -347,7 +347,8 @@ impl IndexWriter { } drop(self.workers_join_handle); - let result = self.segment_updater + let result = self + .segment_updater .wait_merging_thread() .map_err(|_| TantivyError::ErrorInThread("Failed to join merging thread.".into())); @@ -494,7 +495,8 @@ impl IndexWriter { let document_receiver = self.document_receiver.clone(); // take the directory lock to create a new index_writer. - let directory_lock = self._directory_lock + let directory_lock = self + ._directory_lock .take() .expect("The IndexWriter does not have any lock. This is a bug, please report."); @@ -678,7 +680,7 @@ mod tests { let err_msg = err.to_string(); assert!(err_msg.contains("Lockfile")); assert!(err_msg.contains("Possible causes:")) - }, + } _ => panic!("Expected LockfileAlreadyExists error"), } } @@ -864,8 +866,7 @@ mod tests { assert_eq!(initial_table_size(1_000_000_000), 19); } - - #[cfg(not(feature="no_fail"))] + #[cfg(not(feature = "no_fail"))] #[test] fn test_write_commit_fails() { use fail; @@ -874,7 +875,7 @@ mod tests { let index = Index::create_in_ram(schema_builder.build()); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); - for _ in 0..100 { + for _ in 0..100 { index_writer.add_document(doc!(text_field => "a")); } index_writer.commit().unwrap(); diff --git a/src/indexer/merge_policy.rs b/src/indexer/merge_policy.rs index 8df588ca3..407cb94bb 100644 --- a/src/indexer/merge_policy.rs +++ b/src/indexer/merge_policy.rs @@ -21,17 +21,17 @@ pub trait MergePolicy: MergePolicyClone + marker::Send + marker::Sync + Debug { /// MergePolicyClone pub trait MergePolicyClone { - /// Returns a boxed clone of the MergePolicy. - fn box_clone(&self) -> Box; + /// Returns a boxed clone of the MergePolicy. + fn box_clone(&self) -> Box; } impl MergePolicyClone for T where - T: 'static + MergePolicy + Clone, + T: 'static + MergePolicy + Clone, { - fn box_clone(&self) -> Box { - Box::new(self.clone()) - } + fn box_clone(&self) -> Box { + Box::new(self.clone()) + } } /// Never merge segments. diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 5d2e17c51..87158a947 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -440,7 +440,8 @@ impl IndexMerger { ) -> Result> { let mut positions_buffer: Vec = Vec::with_capacity(1_000); let mut delta_computer = DeltaComputer::new(); - let field_readers = self.readers + let field_readers = self + .readers .iter() .map(|reader| reader.inverted_index(indexed_field)) .collect::>(); diff --git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index c455d3091..c0c883e15 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -51,7 +51,8 @@ impl SegmentRegister { } pub fn segment_metas(&self) -> Vec { - let mut segment_ids: Vec = self.segment_states + let mut segment_ids: Vec = self + .segment_states .values() .map(|segment_entry| segment_entry.meta().clone()) .collect(); diff --git a/src/lib.rs b/src/lib.rs index 8f4bc726e..9607062e1 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -143,6 +143,7 @@ extern crate fst; extern crate fst_regex; extern crate futures; extern crate futures_cpupool; +extern crate htmlescape; extern crate itertools; extern crate levenshtein_automata; extern crate num_cpus; @@ -154,7 +155,6 @@ extern crate stable_deref_trait; extern crate tempdir; extern crate tempfile; extern crate uuid; -extern crate htmlescape; #[cfg(test)] #[macro_use] @@ -183,7 +183,7 @@ mod macros; pub use error::TantivyError; -#[deprecated(since="0.7.0", note="please use `tantivy::TantivyError` instead")] +#[deprecated(since = "0.7.0", note = "please use `tantivy::TantivyError` instead")] pub use error::TantivyError as Error; extern crate census; @@ -951,4 +951,3 @@ mod tests { } } } - diff --git a/src/positions/mod.rs b/src/positions/mod.rs index f867358f5..ab0375355 100644 --- a/src/positions/mod.rs +++ b/src/positions/mod.rs @@ -1,4 +1,3 @@ - /// Positions are stored in three parts and over two files. // /// The `SegmentComponent::POSITIONS` file contains all of the bitpacked positions delta, @@ -24,13 +23,12 @@ /// The long skip structure makes it possible to skip rapidly to the a checkpoint close to this /// value, and then skip normally. /// - mod reader; mod serializer; pub use self::reader::PositionReader; pub use self::serializer::PositionSerializer; -use bitpacking::{BitPacker4x, BitPacker}; +use bitpacking::{BitPacker, BitPacker4x}; const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN; const LONG_SKIP_IN_BLOCKS: usize = 1_024; @@ -43,10 +41,10 @@ lazy_static! { #[cfg(test)] pub mod tests { - use std::iter; - use super::{PositionSerializer, PositionReader}; + use super::{PositionReader, PositionSerializer}; use directory::ReadOnlySource; use positions::COMPRESSION_BLOCK_SIZE; + use std::iter; fn create_stream_buffer(vals: &[u32]) -> (ReadOnlySource, ReadOnlySource) { let mut skip_buffer = vec![]; @@ -59,7 +57,10 @@ pub mod tests { } serializer.close().unwrap(); } - (ReadOnlySource::from(stream_buffer), ReadOnlySource::from(skip_buffer)) + ( + ReadOnlySource::from(stream_buffer), + ReadOnlySource::from(skip_buffer), + ) } #[test] @@ -103,7 +104,7 @@ pub mod tests { assert_eq!(skip.len(), 12); assert_eq!(stream.len(), 1168); - let mut position_reader = PositionReader::new(stream,skip, 0u64); + let mut position_reader = PositionReader::new(stream, skip, 0u64); let mut buf = [0u32; 7]; let mut c = 0; for _ in 0..100 { @@ -125,7 +126,7 @@ pub mod tests { let (stream, skip) = create_stream_buffer(&v[..]); assert_eq!(skip.len(), 15_749); assert_eq!(stream.len(), 1_000_000); - let mut position_reader = PositionReader::new(stream,skip, 128 * 1024); + let mut position_reader = PositionReader::new(stream, skip, 128 * 1024); let mut buf = [0u32; 1]; position_reader.read(&mut buf); assert_eq!(buf[0], CONST_VAL); @@ -137,12 +138,17 @@ pub mod tests { let (stream, skip) = create_stream_buffer(&v[..]); assert_eq!(skip.len(), 15_749); assert_eq!(stream.len(), 4_987_872); - for &offset in &[10, 128 * 1024, 128 * 1024 - 1, 128 * 1024 + 7, 128 * 10 * 1024 + 10] { - let mut position_reader = PositionReader::new(stream.clone(),skip.clone(), offset); + for &offset in &[ + 10, + 128 * 1024, + 128 * 1024 - 1, + 128 * 1024 + 7, + 128 * 10 * 1024 + 10, + ] { + let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), offset); let mut buf = [0u32; 1]; position_reader.read(&mut buf); assert_eq!(buf[0], offset as u32); } } } - diff --git a/src/positions/reader.rs b/src/positions/reader.rs index 8b5dd70bb..dec653830 100644 --- a/src/positions/reader.rs +++ b/src/positions/reader.rs @@ -1,12 +1,12 @@ -use bitpacking::{BitPacker4x, BitPacker}; -use owned_read::OwnedRead; -use common::{BinarySerializable, FixedSize}; -use postings::compression::compressed_block_size; -use directory::ReadOnlySource; -use positions::COMPRESSION_BLOCK_SIZE; -use positions::LONG_SKIP_IN_BLOCKS; -use positions::LONG_SKIP_INTERVAL; use super::BIT_PACKER; +use bitpacking::{BitPacker, BitPacker4x}; +use common::{BinarySerializable, FixedSize}; +use directory::ReadOnlySource; +use owned_read::OwnedRead; +use positions::COMPRESSION_BLOCK_SIZE; +use positions::LONG_SKIP_INTERVAL; +use positions::LONG_SKIP_IN_BLOCKS; +use postings::compression::compressed_block_size; pub struct PositionReader { skip_read: OwnedRead, @@ -18,7 +18,6 @@ pub struct PositionReader { // of the block of the next int to read. } - // `ahead` represents the offset of the block currently loaded // compared to the cursor of the actual stream. // @@ -32,7 +31,8 @@ fn read_impl( buffer: &mut [u32; 128], mut inner_offset: usize, num_bits: &[u8], - output: &mut [u32]) -> usize { + output: &mut [u32], +) -> usize { let mut output_start = 0; let mut output_len = output.len(); let mut ahead = 0; @@ -47,8 +47,7 @@ fn read_impl( output_start += available_len; inner_offset = 0; let num_bits = num_bits[ahead]; - BitPacker4x::new() - .decompress(position, &mut buffer[..], num_bits); + BitPacker4x::new().decompress(position, &mut buffer[..], num_bits); let block_len = compressed_block_size(num_bits); position = &position[block_len..]; ahead += 1; @@ -56,11 +55,12 @@ fn read_impl( } } - impl PositionReader { - pub fn new(position_source: ReadOnlySource, - skip_source: ReadOnlySource, - offset: u64) -> PositionReader { + pub fn new( + position_source: ReadOnlySource, + skip_source: ReadOnlySource, + offset: u64, + ) -> PositionReader { let skip_len = skip_source.len(); let (body, footer) = skip_source.split(skip_len - u32::SIZE_IN_BYTES); let num_long_skips = u32::deserialize(&mut footer.as_slice()).expect("Index corrupted"); @@ -70,7 +70,8 @@ impl PositionReader { let small_skip = (offset - (long_skip_id as u64) * (LONG_SKIP_INTERVAL as u64)) as usize; let offset_num_bytes: u64 = { if long_skip_id > 0 { - let mut long_skip_blocks: &[u8] = &long_skips.as_slice()[(long_skip_id - 1) * 8..][..8]; + let mut long_skip_blocks: &[u8] = + &long_skips.as_slice()[(long_skip_id - 1) * 8..][..8]; u64::deserialize(&mut long_skip_blocks).expect("Index corrupted") * 16 } else { 0 @@ -79,13 +80,13 @@ impl PositionReader { let mut position_read = OwnedRead::new(position_source); position_read.advance(offset_num_bytes as usize); let mut skip_read = OwnedRead::new(skip_body); - skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS); + skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS); let mut position_reader = PositionReader { skip_read, position_read, inner_offset: 0, buffer: Box::new([0u32; 128]), - ahead: None + ahead: None, }; position_reader.skip(small_skip); position_reader @@ -108,7 +109,8 @@ impl PositionReader { self.buffer.as_mut(), self.inner_offset, &skip_data[1..], - output)); + output, + )); } /// Skip the next `skip_len` integer. @@ -118,23 +120,20 @@ impl PositionReader { /// /// May panic if the end of the stream is reached. pub fn skip(&mut self, skip_len: usize) { - let skip_len_plus_inner_offset = skip_len + self.inner_offset; let num_blocks_to_advance = skip_len_plus_inner_offset / COMPRESSION_BLOCK_SIZE; self.inner_offset = skip_len_plus_inner_offset % COMPRESSION_BLOCK_SIZE; - self.ahead = self.ahead - .and_then(|num_blocks| { - if num_blocks >= num_blocks_to_advance { - Some(num_blocks_to_advance - num_blocks_to_advance) - } else { - None - } - }); + self.ahead = self.ahead.and_then(|num_blocks| { + if num_blocks >= num_blocks_to_advance { + Some(num_blocks_to_advance - num_blocks_to_advance) + } else { + None + } + }); - let skip_len = self.skip_read - .as_ref()[..num_blocks_to_advance] + let skip_len = self.skip_read.as_ref()[..num_blocks_to_advance] .iter() .cloned() .map(|num_bit| num_bit as usize) diff --git a/src/positions/serializer.rs b/src/positions/serializer.rs index 598c26363..2a3dc09c1 100644 --- a/src/positions/serializer.rs +++ b/src/positions/serializer.rs @@ -1,8 +1,8 @@ -use std::io; -use bitpacking::BitPacker; -use positions::{COMPRESSION_BLOCK_SIZE, LONG_SKIP_INTERVAL}; -use common::BinarySerializable; use super::BIT_PACKER; +use bitpacking::BitPacker; +use common::BinarySerializable; +use positions::{COMPRESSION_BLOCK_SIZE, LONG_SKIP_INTERVAL}; +use std::io; pub struct PositionSerializer { write_stream: W, @@ -23,7 +23,7 @@ impl PositionSerializer { buffer: vec![0u8; 128 * 4], num_ints: 0u64, long_skips: Vec::new(), - cumulated_num_bits: 0u64 + cumulated_num_bits: 0u64, } } @@ -31,7 +31,6 @@ impl PositionSerializer { self.num_ints } - fn remaining_block_len(&self) -> usize { COMPRESSION_BLOCK_SIZE - self.block.len() } diff --git a/src/postings/compression/mod.rs b/src/postings/compression/mod.rs index 6b05010c6..810cf28e7 100644 --- a/src/postings/compression/mod.rs +++ b/src/postings/compression/mod.rs @@ -28,14 +28,16 @@ impl BlockEncoder { pub fn compress_block_sorted(&mut self, block: &[u32], offset: u32) -> (u8, &[u8]) { let num_bits = self.bitpacker.num_bits_sorted(offset, block); - let written_size = self.bitpacker - .compress_sorted(offset, block, &mut self.output[..], num_bits); + let written_size = + self.bitpacker + .compress_sorted(offset, block, &mut self.output[..], num_bits); (num_bits, &self.output[..written_size]) } pub fn compress_block_unsorted(&mut self, block: &[u32]) -> (u8, &[u8]) { let num_bits = self.bitpacker.num_bits(block); - let written_size = self.bitpacker + let written_size = self + .bitpacker .compress(block, &mut self.output[..], num_bits); (num_bits, &self.output[..written_size]) } @@ -62,19 +64,21 @@ impl BlockDecoder { } } - pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32, num_bits: u8) -> usize { + pub fn uncompress_block_sorted( + &mut self, + compressed_data: &[u8], + offset: u32, + num_bits: u8, + ) -> usize { self.output_len = COMPRESSION_BLOCK_SIZE; - self.bitpacker.decompress_sorted( - offset, - &compressed_data, - &mut self.output, - num_bits, - ) + self.bitpacker + .decompress_sorted(offset, &compressed_data, &mut self.output, num_bits) } pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize { self.output_len = COMPRESSION_BLOCK_SIZE; - self.bitpacker.decompress(&compressed_data, &mut self.output, num_bits) + self.bitpacker + .decompress(&compressed_data, &mut self.output, num_bits) } #[inline] @@ -88,7 +92,6 @@ impl BlockDecoder { } } - pub trait VIntEncoder { /// Compresses an array of `u32` integers, /// using [delta-encoding](https://en.wikipedia.org/wiki/Delta_ encoding) diff --git a/src/postings/compression/vint.rs b/src/postings/compression/vint.rs index 515510c54..88a0df5a5 100644 --- a/src/postings/compression/vint.rs +++ b/src/postings/compression/vint.rs @@ -1,9 +1,5 @@ #[inline(always)] -pub fn compress_sorted<'a>( - input: &[u32], - output: &'a mut [u8], - mut offset: u32, -) -> &'a [u8] { +pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32) -> &'a [u8] { let mut byte_written = 0; for &v in input { let mut to_encode: u32 = v - offset; @@ -46,11 +42,7 @@ pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a } #[inline(always)] -pub fn uncompress_sorted<'a>( - compressed_data: &'a [u8], - output: &mut [u32], - offset: u32, -) -> usize { +pub fn uncompress_sorted<'a>(compressed_data: &'a [u8], output: &mut [u32], offset: u32) -> usize { let mut read_byte = 0; let mut result = offset; let num_els = output.len(); diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 85852ed22..89b4a3c62 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -2,6 +2,7 @@ Postings module (also called inverted index) */ +pub(crate) mod compression; /// Postings module /// /// Postings, also called inverted lists, is the key datastructure @@ -11,18 +12,17 @@ mod postings_writer; mod recorder; mod segment_postings; mod serializer; -pub(crate) mod compression; +mod skip; mod stacker; mod term_info; -mod skip; pub(crate) use self::postings_writer::MultiFieldPostingsWriter; pub use self::serializer::{FieldSerializer, InvertedIndexSerializer}; +use self::compression::COMPRESSION_BLOCK_SIZE; pub use self::postings::Postings; -pub use self::term_info::TermInfo; pub(crate) use self::skip::SkipReader; -use self::compression::{COMPRESSION_BLOCK_SIZE}; +pub use self::term_info::TermInfo; pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings}; @@ -71,8 +71,7 @@ pub mod tests { let mut segment = index.new_segment(); let mut posting_serializer = InvertedIndexSerializer::open(&mut segment).unwrap(); { - let mut field_serializer = posting_serializer - .new_field(text_field, 120 * 4).unwrap(); + let mut field_serializer = posting_serializer.new_field(text_field, 120 * 4).unwrap(); field_serializer.new_term("abc".as_bytes()).unwrap(); for doc_id in 0u32..120u32 { let delta_positions = vec![1, 2, 3, 2]; @@ -512,13 +511,13 @@ pub mod tests { let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); for _ in 0..posting_list_size { let mut doc = Document::default(); - if rng.gen_bool(1f64/ 15f64) { + if rng.gen_bool(1f64 / 15f64) { doc.add_text(text_field, "a"); } - if rng.gen_bool(1f64/ 10f64) { + if rng.gen_bool(1f64 / 10f64) { doc.add_text(text_field, "b"); } - if rng.gen_bool(1f64/ 5f64) { + if rng.gen_bool(1f64 / 5f64) { doc.add_text(text_field, "c"); } doc.add_text(text_field, "d"); diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index d90b322d9..bb3ca10a4 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -94,7 +94,8 @@ impl MultiFieldPostingsWriter { &self, serializer: &mut InvertedIndexSerializer, ) -> Result>> { - let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self.term_index + let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self + .term_index .iter() .map(|(term_bytes, addr, bucket_id)| (term_bytes, addr, bucket_id as UnorderedTermId)) .collect(); diff --git a/src/postings/recorder.rs b/src/postings/recorder.rs index e787ba5e9..c355a78ba 100644 --- a/src/postings/recorder.rs +++ b/src/postings/recorder.rs @@ -107,7 +107,8 @@ impl Recorder for TermFrequencyRecorder { fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> { // the last document has not been closed... // its term freq is self.current_tf. - let mut doc_iter = self.stack + let mut doc_iter = self + .stack .iter(heap) .chain(Some(self.current_tf).into_iter()); diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 74fcbc199..8986c036b 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -1,20 +1,20 @@ -use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE}; -use DocId; use common::BitSet; use common::HasLen; -use postings::compression::compressed_block_size; +use common::{BinarySerializable, VInt}; use docset::{DocSet, SkipResult}; use fst::Streamer; +use owned_read::OwnedRead; +use positions::PositionReader; +use postings::compression::compressed_block_size; +use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE}; use postings::serializer::PostingsSerializer; use postings::FreqReadingOption; use postings::Postings; -use owned_read::OwnedRead; -use common::{VInt, BinarySerializable}; -use postings::USE_SKIP_INFO_LIMIT; use postings::SkipReader; +use postings::USE_SKIP_INFO_LIMIT; use schema::IndexRecordOption; -use positions::PositionReader; use std::cmp::Ordering; +use DocId; const EMPTY_ARR: [u8; 0] = []; @@ -98,7 +98,7 @@ impl SegmentPostings { docs.len() as u32, OwnedRead::new(buffer), IndexRecordOption::Basic, - IndexRecordOption::Basic + IndexRecordOption::Basic, ); SegmentPostings::from_block_postings(block_segment_postings, None) } @@ -151,7 +151,11 @@ fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) { /// The target is assumed smaller or equal to the last element. fn search_within_block(block_docs: &[u32], target: u32) -> usize { let (start, end) = exponential_search(target, block_docs); - start.wrapping_add(block_docs[start..end].binary_search(&target).unwrap_or_else(|e| e)) + start.wrapping_add( + block_docs[start..end] + .binary_search(&target) + .unwrap_or_else(|e| e), + ) } impl DocSet for SegmentPostings { @@ -179,21 +183,20 @@ impl DocSet for SegmentPostings { // check if we need to go to the next block let need_positions = self.position_computer.is_some(); let mut sum_freqs_skipped: u32 = 0; - if !self.block_cursor - .docs() - .last() - .map(|doc| *doc >= target) - .unwrap_or(false) // there should always be at least a document in the block - // since advance returned. + if !self + .block_cursor + .docs() + .last() + .map(|doc| *doc >= target) + .unwrap_or(false) + // there should always be at least a document in the block + // since advance returned. { // we are not in the right block. // // First compute all of the freqs skipped from the current block. if need_positions { - sum_freqs_skipped = self.block_cursor - .freqs()[self.cur..] - .iter() - .sum(); + sum_freqs_skipped = self.block_cursor.freqs()[self.cur..].iter().sum(); match self.block_cursor.skip_to(target) { BlockSegmentPostingsSkipResult::Success(block_skip_freqs) => { sum_freqs_skipped += block_skip_freqs; @@ -215,9 +218,13 @@ impl DocSet for SegmentPostings { let block_docs = self.block_cursor.docs(); debug_assert!(target >= self.doc()); - let new_cur = self.cur.wrapping_add(search_within_block(&block_docs[self.cur..], target)); + let new_cur = self + .cur + .wrapping_add(search_within_block(&block_docs[self.cur..], target)); if need_positions { - sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur].iter().sum::(); + sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur] + .iter() + .sum::(); self.position_computer .as_mut() .unwrap() @@ -330,7 +337,10 @@ pub struct BlockSegmentPostings { skip_reader: SkipReader, } -fn split_into_skips_and_postings(doc_freq: u32, mut data: OwnedRead) -> (Option, OwnedRead) { +fn split_into_skips_and_postings( + doc_freq: u32, + mut data: OwnedRead, +) -> (Option, OwnedRead) { if doc_freq >= USE_SKIP_INFO_LIMIT { let skip_len = VInt::deserialize(&mut data).expect("Data corrupted").0 as usize; let mut postings_data = data.clone(); @@ -345,7 +355,7 @@ fn split_into_skips_and_postings(doc_freq: u32, mut data: OwnedRead) -> (Option< #[derive(Debug, Eq, PartialEq)] pub enum BlockSegmentPostingsSkipResult { Terminated, - Success(u32) //< number of term freqs to skip + Success(u32), //< number of term freqs to skip } impl BlockSegmentPostings { @@ -353,7 +363,7 @@ impl BlockSegmentPostings { doc_freq: u32, data: OwnedRead, record_option: IndexRecordOption, - requested_option: IndexRecordOption + requested_option: IndexRecordOption, ) -> BlockSegmentPostings { let freq_reading_option = match (record_option, requested_option) { (IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq, @@ -362,11 +372,10 @@ impl BlockSegmentPostings { }; let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, data); - let skip_reader = - match skip_data_opt { - Some(skip_data) => SkipReader::new(skip_data, record_option), - None => SkipReader::new(OwnedRead::new(&EMPTY_ARR[..]), record_option) - }; + let skip_reader = match skip_data_opt { + Some(skip_data) => SkipReader::new(skip_data, record_option), + None => SkipReader::new(OwnedRead::new(&EMPTY_ARR[..]), record_option), + }; let doc_freq = doc_freq as usize; let num_vint_docs = doc_freq % COMPRESSION_BLOCK_SIZE; BlockSegmentPostings { @@ -450,7 +459,6 @@ impl BlockSegmentPostings { self.doc_decoder.output_len } - /// position on a block that may contains `doc_id`. /// Always advance the current block. /// @@ -461,9 +469,7 @@ impl BlockSegmentPostings { /// Returns false iff all of the document remaining are smaller than /// `doc_id`. In that case, all of these document are consumed. /// - pub fn skip_to(&mut self, - target_doc: DocId) -> BlockSegmentPostingsSkipResult { - + pub fn skip_to(&mut self, target_doc: DocId) -> BlockSegmentPostingsSkipResult { let mut skip_freqs = 0u32; while self.skip_reader.advance() { if self.skip_reader.doc() >= target_doc { @@ -472,11 +478,11 @@ impl BlockSegmentPostings { // // We found our block! let num_bits = self.skip_reader.doc_num_bits(); - let num_consumed_bytes = self.doc_decoder - .uncompress_block_sorted( - self.remaining_data.as_ref(), - self.doc_offset, - num_bits); + let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted( + self.remaining_data.as_ref(), + self.doc_offset, + num_bits, + ); self.remaining_data.advance(num_consumed_bytes); let tf_num_bits = self.skip_reader.tf_num_bits(); match self.freq_reading_option { @@ -486,9 +492,9 @@ impl BlockSegmentPostings { self.remaining_data.advance(num_bytes_to_skip); } FreqReadingOption::ReadFreq => { - let num_consumed_bytes = self.freq_decoder - .uncompress_block_unsorted(self.remaining_data.as_ref(), - tf_num_bits); + let num_consumed_bytes = self + .freq_decoder + .uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits); self.remaining_data.advance(num_consumed_bytes); } } @@ -518,7 +524,8 @@ impl BlockSegmentPostings { } } self.num_vint_docs = 0; - return self.docs() + return self + .docs() .last() .map(|last_doc| { if *last_doc >= target_doc { @@ -538,11 +545,11 @@ impl BlockSegmentPostings { pub fn advance(&mut self) -> bool { if self.skip_reader.advance() { let num_bits = self.skip_reader.doc_num_bits(); - let num_consumed_bytes = self.doc_decoder - .uncompress_block_sorted( - self.remaining_data.as_ref(), - self.doc_offset, - num_bits); + let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted( + self.remaining_data.as_ref(), + self.doc_offset, + num_bits, + ); self.remaining_data.advance(num_consumed_bytes); let tf_num_bits = self.skip_reader.tf_num_bits(); match self.freq_reading_option { @@ -552,9 +559,9 @@ impl BlockSegmentPostings { self.remaining_data.advance(num_bytes_to_skip); } FreqReadingOption::ReadFreq => { - let num_consumed_bytes = self.freq_decoder - .uncompress_block_unsorted(self.remaining_data.as_ref(), - tf_num_bits); + let num_consumed_bytes = self + .freq_decoder + .uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits); self.remaining_data.advance(num_consumed_bytes); } } @@ -594,7 +601,6 @@ impl BlockSegmentPostings { doc_offset: 0, doc_freq: 0, - remaining_data: OwnedRead::new(vec![]), skip_reader: SkipReader::new(OwnedRead::new(vec![]), IndexRecordOption::Basic), } @@ -616,7 +622,9 @@ impl<'b> Streamer<'b> for BlockSegmentPostings { #[cfg(test)] mod tests { + use super::search_within_block; use super::BlockSegmentPostings; + use super::BlockSegmentPostingsSkipResult; use super::SegmentPostings; use common::HasLen; use core::Index; @@ -626,9 +634,7 @@ mod tests { use schema::SchemaBuilder; use schema::Term; use schema::INT_INDEXED; - use super::BlockSegmentPostingsSkipResult; use DocId; - use super::search_within_block; #[test] fn test_empty_segment_postings() { @@ -645,7 +651,6 @@ mod tests { assert_eq!(postings.doc_freq(), 0); } - fn search_within_block_trivial_but_slow(block: &[u32], target: u32) -> usize { block .iter() @@ -653,11 +658,15 @@ mod tests { .enumerate() .filter(|&(_, ref val)| *val >= target) .next() - .unwrap().0 + .unwrap() + .0 } fn util_test_search_within_block(block: &[u32], target: u32) { - assert_eq!(search_within_block(block, target), search_within_block_trivial_but_slow(block, target)); + assert_eq!( + search_within_block(block, target), + search_within_block_trivial_but_slow(block, target) + ); } fn util_test_search_within_block_all(block: &[u32]) { @@ -677,7 +686,7 @@ mod tests { #[test] fn test_search_within_block() { for len in 1u32..128u32 { - let v: Vec = (0..len).map(|i| i*2).collect(); + let v: Vec = (0..len).map(|i| i * 2).collect(); util_test_search_within_block_all(&v[..]); } } @@ -726,14 +735,22 @@ mod tests { fn test_block_segment_postings_skip() { for i in 0..4 { let mut block_postings = build_block_postings(vec![3]); - assert_eq!(block_postings.skip_to(i), BlockSegmentPostingsSkipResult::Success(0u32)); - assert_eq!(block_postings.skip_to(i), BlockSegmentPostingsSkipResult::Terminated); + assert_eq!( + block_postings.skip_to(i), + BlockSegmentPostingsSkipResult::Success(0u32) + ); + assert_eq!( + block_postings.skip_to(i), + BlockSegmentPostingsSkipResult::Terminated + ); } let mut block_postings = build_block_postings(vec![3]); - assert_eq!(block_postings.skip_to(4u32), BlockSegmentPostingsSkipResult::Terminated); + assert_eq!( + block_postings.skip_to(4u32), + BlockSegmentPostingsSkipResult::Terminated + ); } - #[test] fn test_block_segment_postings_skip2() { let mut docs = vec![0]; @@ -741,14 +758,23 @@ mod tests { docs.push((i * i / 100) + i); } let mut block_postings = build_block_postings(docs.clone()); - for i in vec![0, 424, 10000] { - assert_eq!(block_postings.skip_to(i), BlockSegmentPostingsSkipResult::Success(0u32)); + for i in vec![0, 424, 10000] { + assert_eq!( + block_postings.skip_to(i), + BlockSegmentPostingsSkipResult::Success(0u32) + ); let docs = block_postings.docs(); assert!(docs[0] <= i); assert!(docs.last().cloned().unwrap_or(0u32) >= i); } - assert_eq!(block_postings.skip_to(100_000), BlockSegmentPostingsSkipResult::Terminated); - assert_eq!(block_postings.skip_to(101_000), BlockSegmentPostingsSkipResult::Terminated); + assert_eq!( + block_postings.skip_to(100_000), + BlockSegmentPostingsSkipResult::Terminated + ); + assert_eq!( + block_postings.skip_to(101_000), + BlockSegmentPostingsSkipResult::Terminated + ); } #[test] diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 521d467d2..f7661933a 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -1,18 +1,18 @@ use super::TermInfo; -use common::{VInt, BinarySerializable}; +use common::{BinarySerializable, VInt}; use common::{CompositeWrite, CountingWriter}; -use postings::compression::{VIntEncoder, BlockEncoder, COMPRESSION_BLOCK_SIZE}; use core::Segment; use directory::WritePtr; +use positions::PositionSerializer; +use postings::compression::{BlockEncoder, VIntEncoder, COMPRESSION_BLOCK_SIZE}; +use postings::skip::SkipSerializer; +use postings::USE_SKIP_INFO_LIMIT; use schema::Schema; use schema::{Field, FieldEntry, FieldType}; use std::io::{self, Write}; use termdict::{TermDictionaryBuilder, TermOrdinal}; use DocId; use Result; -use postings::USE_SKIP_INFO_LIMIT; -use postings::skip::SkipSerializer; -use positions::PositionSerializer; /// `PostingsSerializer` is in charge of serializing /// postings on disk, in the @@ -104,7 +104,7 @@ impl InvertedIndexSerializer { term_dictionary_write, postings_write, positions_write, - positionsidx_write + positionsidx_write, ) } @@ -135,7 +135,7 @@ impl<'a> FieldSerializer<'a> { term_dictionary_write: &'a mut CountingWriter, postings_write: &'a mut CountingWriter, positions_write: &'a mut CountingWriter, - positionsidx_write: &'a mut CountingWriter + positionsidx_write: &'a mut CountingWriter, ) -> io::Result> { let (term_freq_enabled, position_enabled): (bool, bool) = match field_type { FieldType::Str(ref text_options) => { @@ -153,7 +153,8 @@ impl<'a> FieldSerializer<'a> { }; let term_dictionary_builder = TermDictionaryBuilder::new(term_dictionary_write, field_type)?; - let postings_serializer = PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled); + let postings_serializer = + PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled); let positions_serializer_opt = if position_enabled { Some(PositionSerializer::new(positions_write, positionsidx_write)) } else { @@ -171,14 +172,15 @@ impl<'a> FieldSerializer<'a> { } fn current_term_info(&self) -> TermInfo { - let positions_idx = self.positions_serializer_opt + let positions_idx = self + .positions_serializer_opt .as_ref() .map(|positions_serializer| positions_serializer.positions_idx()) .unwrap_or(0u64); TermInfo { doc_freq: 0, postings_offset: self.postings_serializer.addr(), - positions_idx + positions_idx, } } @@ -253,7 +255,7 @@ impl<'a> FieldSerializer<'a> { struct Block { doc_ids: [DocId; COMPRESSION_BLOCK_SIZE], term_freqs: [u32; COMPRESSION_BLOCK_SIZE], - len: usize + len: usize, } impl Block { @@ -261,7 +263,7 @@ impl Block { Block { doc_ids: [0u32; COMPRESSION_BLOCK_SIZE], term_freqs: [0u32; COMPRESSION_BLOCK_SIZE], - len: 0 + len: 0, } } @@ -312,9 +314,12 @@ pub struct PostingsSerializer { termfreq_sum_enabled: bool, } - impl PostingsSerializer { - pub fn new(write: W, termfreq_enabled: bool, termfreq_sum_enabled: bool) -> PostingsSerializer { + pub fn new( + write: W, + termfreq_enabled: bool, + termfreq_sum_enabled: bool, + ) -> PostingsSerializer { PostingsSerializer { output_write: CountingWriter::wrap(write), @@ -337,14 +342,16 @@ impl PostingsSerializer { .block_encoder .compress_block_sorted(&self.block.doc_ids(), self.last_doc_id_encoded); self.last_doc_id_encoded = self.block.last_doc(); - self.skip_write.write_doc(self.last_doc_id_encoded, num_bits); + self.skip_write + .write_doc(self.last_doc_id_encoded, num_bits); // last el block 0, offset block 1, self.postings_write.extend(block_encoded); } if self.termfreq_enabled { // encode the term_freqs - let (num_bits, block_encoded): (u8, &[u8]) = - self.block_encoder.compress_block_unsorted(&self.block.term_freqs()); + let (num_bits, block_encoded): (u8, &[u8]) = self + .block_encoder + .compress_block_unsorted(&self.block.term_freqs()); self.postings_write.extend(block_encoded); self.skip_write.write_term_freq(num_bits); if self.termfreq_sum_enabled { @@ -375,13 +382,15 @@ impl PostingsSerializer { // In that case, the remaining part is encoded // using variable int encoding. { - let block_encoded = self.block_encoder + let block_encoded = self + .block_encoder .compress_vint_sorted(&self.block.doc_ids(), self.last_doc_id_encoded); self.postings_write.write_all(block_encoded)?; } // ... Idem for term frequencies if self.termfreq_enabled { - let block_encoded = self.block_encoder + let block_encoded = self + .block_encoder .compress_vint_unsorted(self.block.term_freqs()); self.postings_write.write_all(block_encoded)?; } @@ -392,7 +401,6 @@ impl PostingsSerializer { VInt(skip_data.len() as u64).serialize(&mut self.output_write)?; self.output_write.write_all(skip_data)?; self.output_write.write_all(&self.postings_write[..])?; - } else { self.output_write.write_all(&self.postings_write[..])?; } diff --git a/src/postings/skip.rs b/src/postings/skip.rs index e2d59e2c6..ab2dcb6c2 100644 --- a/src/postings/skip.rs +++ b/src/postings/skip.rs @@ -1,8 +1,8 @@ -use DocId; use common::BinarySerializable; use owned_read::OwnedRead; use postings::compression::COMPRESSION_BLOCK_SIZE; use schema::IndexRecordOption; +use DocId; pub struct SkipSerializer { buffer: Vec, @@ -18,8 +18,11 @@ impl SkipSerializer { } pub fn write_doc(&mut self, last_doc: DocId, doc_num_bits: u8) { - assert!(last_doc > self.prev_doc, "write_doc(...) called with non-increasing doc ids. \ - Did you forget to call clear maybe?"); + assert!( + last_doc > self.prev_doc, + "write_doc(...) called with non-increasing doc ids. \ + Did you forget to call clear maybe?" + ); let delta_doc = last_doc - self.prev_doc; self.prev_doc = last_doc; delta_doc.serialize(&mut self.buffer).unwrap(); @@ -30,9 +33,10 @@ impl SkipSerializer { self.buffer.push(tf_num_bits); } - pub fn write_total_term_freq(&mut self, tf_sum: u32) { - tf_sum.serialize(&mut self.buffer).expect("Should never fail"); + tf_sum + .serialize(&mut self.buffer) + .expect("Should never fail"); } pub fn data(&self) -> &[u8] { @@ -103,33 +107,32 @@ impl SkipReader { } else { let doc_delta = u32::deserialize(&mut self.owned_read).expect("Skip data corrupted"); self.doc += doc_delta as DocId; - self.doc_num_bits = self.owned_read.get(0); + self.doc_num_bits = self.owned_read.get(0); match self.skip_info { IndexRecordOption::Basic => { self.owned_read.advance(1); } - IndexRecordOption::WithFreqs=> { + IndexRecordOption::WithFreqs => { self.tf_num_bits = self.owned_read.get(1); self.owned_read.advance(2); } IndexRecordOption::WithFreqsAndPositions => { self.tf_num_bits = self.owned_read.get(1); self.owned_read.advance(2); - self.tf_sum = u32::deserialize(&mut self.owned_read) - .expect("Failed reading tf_sum"); + self.tf_sum = + u32::deserialize(&mut self.owned_read).expect("Failed reading tf_sum"); } } true } - } } #[cfg(test)] mod tests { - use super::{SkipReader, SkipSerializer}; use super::IndexRecordOption; + use super::{SkipReader, SkipSerializer}; use owned_read::OwnedRead; #[test] @@ -171,4 +174,4 @@ mod tests { assert_eq!(skip_reader.doc_num_bits(), 5u8); assert!(!skip_reader.advance()); } -} \ No newline at end of file +} diff --git a/src/query/boolean_query/boolean_query.rs b/src/query/boolean_query/boolean_query.rs index b92a203eb..0a01d9977 100644 --- a/src/query/boolean_query/boolean_query.rs +++ b/src/query/boolean_query/boolean_query.rs @@ -5,8 +5,8 @@ use query::TermQuery; use query::Weight; use schema::IndexRecordOption; use schema::Term; -use Result; use std::collections::BTreeSet; +use Result; use Searcher; /// The boolean query combines a set of queries @@ -41,9 +41,9 @@ impl From)>> for BooleanQuery { } impl Query for BooleanQuery { - fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result> { - let sub_weights = self.subqueries + let sub_weights = self + .subqueries .iter() .map(|&(ref occur, ref subquery)| { Ok((*occur, subquery.weight(searcher, scoring_enabled)?)) diff --git a/src/query/empty_query.rs b/src/query/empty_query.rs index 06c15c3f3..6e64dca57 100644 --- a/src/query/empty_query.rs +++ b/src/query/empty_query.rs @@ -1,11 +1,11 @@ use super::Scorer; -use DocSet; -use Score; -use DocId; use query::Query; -use Result; -use Searcher; use query::Weight; +use DocId; +use DocSet; +use Result; +use Score; +use Searcher; use SegmentReader; /// `EmptyQuery` is a dummy `Query` in which no document matches. diff --git a/src/query/mod.rs b/src/query/mod.rs index 73a77174b..78b9cd56b 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -3,11 +3,11 @@ Query */ mod all_query; -mod empty_query; mod automaton_weight; mod bitset; mod bm25; mod boolean_query; +mod empty_query; mod exclude; mod fuzzy_query; mod intersection; @@ -34,10 +34,10 @@ pub use self::union::Union; pub use self::vec_docset::VecDocSet; pub use self::all_query::{AllQuery, AllScorer, AllWeight}; -pub use self::empty_query::{EmptyQuery, EmptyWeight, EmptyScorer}; pub use self::automaton_weight::AutomatonWeight; pub use self::bitset::BitSetDocSet; pub use self::boolean_query::BooleanQuery; +pub use self::empty_query::{EmptyQuery, EmptyScorer, EmptyWeight}; pub use self::exclude::Exclude; pub use self::fuzzy_query::FuzzyTermQuery; pub use self::intersection::intersect_scorers; diff --git a/src/query/occur.rs b/src/query/occur.rs index 1a9396de0..e91e3a2b6 100644 --- a/src/query/occur.rs +++ b/src/query/occur.rs @@ -46,4 +46,4 @@ pub fn compose_occur(left: Occur, right: Occur) -> Occur { } } } -} \ No newline at end of file +} diff --git a/src/query/phrase_query/phrase_query.rs b/src/query/phrase_query/phrase_query.rs index d103461c1..959b17b0e 100644 --- a/src/query/phrase_query/phrase_query.rs +++ b/src/query/phrase_query/phrase_query.rs @@ -5,8 +5,8 @@ use query::bm25::BM25Weight; use query::Query; use query::Weight; use schema::{Field, Term}; -use Result; use std::collections::BTreeSet; +use Result; /// `PhraseQuery` matches a specific sequence of words. /// diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index 6e04291c6..85f075d3a 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -124,7 +124,8 @@ impl PhraseScorer { fieldnorm_reader: FieldNormReader, score_needed: bool, ) -> PhraseScorer { - let max_offset = term_postings.iter() + let max_offset = term_postings + .iter() .map(|&(offset, _)| offset) .max() .unwrap_or(0); diff --git a/src/query/phrase_query/phrase_weight.rs b/src/query/phrase_query/phrase_weight.rs index 69ab4e184..de8eeb0d2 100644 --- a/src/query/phrase_query/phrase_weight.rs +++ b/src/query/phrase_query/phrase_weight.rs @@ -30,7 +30,6 @@ impl PhraseWeight { } impl Weight for PhraseWeight { - fn scorer(&self, reader: &SegmentReader) -> Result> { let similarity_weight = self.similarity_weight.clone(); let field = self.phrase_terms[0].1.field(); diff --git a/src/query/query.rs b/src/query/query.rs index 6abbf35e0..ca7de8ca6 100644 --- a/src/query/query.rs +++ b/src/query/query.rs @@ -2,10 +2,10 @@ use super::Weight; use collector::Collector; use core::searcher::Searcher; use downcast; +use std::collections::BTreeSet; use std::fmt; use Result; use SegmentLocalId; -use std::collections::BTreeSet; use Term; /// The `Query` trait defines a set of documents and a scoring method diff --git a/src/query/query_parser/query_grammar.rs b/src/query/query_parser/query_grammar.rs index 557e38e24..6df6f9144 100644 --- a/src/query/query_parser/query_grammar.rs +++ b/src/query/query_parser/query_grammar.rs @@ -1,8 +1,8 @@ use super::user_input_ast::*; use combine::char::*; -use combine::*; -use combine::stream::StreamErrorFor; use combine::error::StreamError; +use combine::stream::StreamErrorFor; +use combine::*; use query::occur::Occur; use query::query_parser::user_input_ast::UserInputBound; @@ -123,7 +123,8 @@ parser! { } enum BinaryOperand { - Or, And + Or, + And, } parser! { @@ -138,19 +139,16 @@ parser! { } } - enum Element { SingleEl(UserInputAST), - NormalDisjunctive(Vec>) + NormalDisjunctive(Vec>), } impl Element { pub fn into_dnf(self) -> Vec> { match self { - Element::NormalDisjunctive(conjunctions) => - conjunctions, - Element::SingleEl(el) => - vec!(vec!(el)), + Element::NormalDisjunctive(conjunctions) => conjunctions, + Element::SingleEl(el) => vec![vec![el]], } } } @@ -227,10 +225,12 @@ mod test { assert!(parse_to_ast().parse(query).is_err()); } - #[test] fn test_parse_query_to_ast_not_op() { - assert_eq!(format!("{:?}", parse_to_ast().parse("NOT")), "Err(UnexpectedParse)"); + assert_eq!( + format!("{:?}", parse_to_ast().parse("NOT")), + "Err(UnexpectedParse)" + ); test_parse_query_to_ast_helper("NOTa", "\"NOTa\""); test_parse_query_to_ast_helper("NOT a", "-(\"a\")"); } @@ -241,10 +241,22 @@ mod test { test_parse_query_to_ast_helper("a OR b", "(?(\"a\") ?(\"b\"))"); test_parse_query_to_ast_helper("a OR b AND c", "(?(\"a\") ?((+(\"b\") +(\"c\"))))"); test_parse_query_to_ast_helper("a AND b AND c", "(+(\"a\") +(\"b\") +(\"c\"))"); - assert_eq!(format!("{:?}", parse_to_ast().parse("a OR b aaa")), "Err(UnexpectedParse)"); - assert_eq!(format!("{:?}", parse_to_ast().parse("a AND b aaa")), "Err(UnexpectedParse)"); - assert_eq!(format!("{:?}", parse_to_ast().parse("aaa a OR b ")), "Err(UnexpectedParse)"); - assert_eq!(format!("{:?}", parse_to_ast().parse("aaa ccc a OR b ")), "Err(UnexpectedParse)"); + assert_eq!( + format!("{:?}", parse_to_ast().parse("a OR b aaa")), + "Err(UnexpectedParse)" + ); + assert_eq!( + format!("{:?}", parse_to_ast().parse("a AND b aaa")), + "Err(UnexpectedParse)" + ); + assert_eq!( + format!("{:?}", parse_to_ast().parse("aaa a OR b ")), + "Err(UnexpectedParse)" + ); + assert_eq!( + format!("{:?}", parse_to_ast().parse("aaa ccc a OR b ")), + "Err(UnexpectedParse)" + ); } #[test] diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 93deb48c1..a8aec4f56 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -1,11 +1,14 @@ use super::logical_ast::*; use super::query_grammar::parse_to_ast; use super::user_input_ast::*; +use combine::Parser; use core::Index; +use query::occur::compose_occur; +use query::query_parser::logical_ast::LogicalAST; use query::AllQuery; use query::BooleanQuery; +use query::EmptyQuery; use query::Occur; -use query::occur::compose_occur; use query::PhraseQuery; use query::Query; use query::RangeQuery; @@ -18,10 +21,6 @@ use std::num::ParseIntError; use std::ops::Bound; use std::str::FromStr; use tokenizer::TokenizerManager; -use combine::Parser; -use query::EmptyQuery; -use query::query_parser::logical_ast::LogicalAST; - /// Possible error that may happen when parsing a query. #[derive(Debug, PartialEq, Eq)] @@ -59,23 +58,24 @@ impl From for QueryParserError { } } - /// Recursively remove empty clause from the AST /// /// Returns `None` iff the `logical_ast` ended up being empty. fn trim_ast(logical_ast: LogicalAST) -> Option { match logical_ast { LogicalAST::Clause(children) => { - let trimmed_children = children.into_iter() - .flat_map(|(occur, child)| - trim_ast(child).map(|trimmed_child| (occur, trimmed_child)) ) + let trimmed_children = children + .into_iter() + .flat_map(|(occur, child)| { + trim_ast(child).map(|trimmed_child| (occur, trimmed_child)) + }) .collect::>(); if trimmed_children.is_empty() { None } else { Some(LogicalAST::Clause(trimmed_children)) } - }, + } _ => Some(logical_ast), } } @@ -188,8 +188,9 @@ impl QueryParser { /// Parse the user query into an AST. fn parse_query_to_logical_ast(&self, query: &str) -> Result { - let (user_input_ast, _remaining) = - parse_to_ast().parse(query).map_err(|_| QueryParserError::SyntaxError)?; + let (user_input_ast, _remaining) = parse_to_ast() + .parse(query) + .map_err(|_| QueryParserError::SyntaxError)?; self.compute_logical_ast(user_input_ast) } @@ -291,12 +292,9 @@ impl QueryParser { ) -> Result, QueryParserError> { let terms = self.compute_terms_for_string(field, phrase)?; match &terms[..] { - [] => - Ok(None), - [(_, term)] => - Ok(Some(LogicalLiteral::Term(term.clone()))), - _ => - Ok(Some(LogicalLiteral::Phrase(terms.clone()))), + [] => Ok(None), + [(_, term)] => Ok(Some(LogicalLiteral::Term(term.clone()))), + _ => Ok(Some(LogicalLiteral::Phrase(terms.clone()))), } } @@ -308,7 +306,11 @@ impl QueryParser { } } - fn resolve_bound(&self, field: Field, bound: &UserInputBound) -> Result, QueryParserError> { + fn resolve_bound( + &self, + field: Field, + bound: &UserInputBound, + ) -> Result, QueryParserError> { if bound.term_str() == "*" { return Ok(Bound::Unbounded); } @@ -355,18 +357,21 @@ impl QueryParser { Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries))) } UserInputAST::Unary(left_occur, subquery) => { - let (right_occur, logical_sub_queries) = self.compute_logical_ast_with_occur(*subquery)?; + let (right_occur, logical_sub_queries) = + self.compute_logical_ast_with_occur(*subquery)?; Ok((compose_occur(left_occur, right_occur), logical_sub_queries)) } - UserInputAST::Leaf(leaf) => { + UserInputAST::Leaf(leaf) => { let result_ast = self.compute_logical_ast_from_leaf(*leaf)?; Ok((Occur::Should, result_ast)) } } } - - fn compute_logical_ast_from_leaf(&self, leaf: UserInputLeaf) -> Result { + fn compute_logical_ast_from_leaf( + &self, + leaf: UserInputLeaf, + ) -> Result { match leaf { UserInputLeaf::Literal(literal) => { let term_phrases: Vec<(Field, String)> = match literal.field_name { @@ -391,21 +396,19 @@ impl QueryParser { asts.push(LogicalAST::Leaf(Box::new(ast))); } } - let result_ast: LogicalAST = - if asts.len() == 1 { - asts.into_iter().next().unwrap() - } else { - LogicalAST::Clause( - asts.into_iter() - .map(|ast| (Occur::Should, ast)) - .collect()) - }; + let result_ast: LogicalAST = if asts.len() == 1 { + asts.into_iter().next().unwrap() + } else { + LogicalAST::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect()) + }; Ok(result_ast) } - UserInputLeaf::All => { - Ok(LogicalAST::Leaf(Box::new(LogicalLiteral::All))) - } - UserInputLeaf::Range { field, lower, upper } => { + UserInputLeaf::All => Ok(LogicalAST::Leaf(Box::new(LogicalLiteral::All))), + UserInputLeaf::Range { + field, + lower, + upper, + } => { let fields = self.resolved_fields(&field)?; let mut clauses = fields .iter() @@ -433,14 +436,15 @@ impl QueryParser { Ok(result_ast) } } - } } fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box { match logical_literal { LogicalLiteral::Term(term) => Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs)), - LogicalLiteral::Phrase(term_with_offsets) => Box::new(PhraseQuery::new_with_offset(term_with_offsets)), + LogicalLiteral::Phrase(term_with_offsets) => { + Box::new(PhraseQuery::new_with_offset(term_with_offsets)) + } LogicalLiteral::Range { field, value_type, @@ -458,11 +462,16 @@ fn convert_to_query(logical_ast: LogicalAST) -> Box { .into_iter() .map(|(occur, subquery)| (occur, convert_to_query(subquery))) .collect::>(); - assert!(!occur_subqueries.is_empty(), "Should not be empty after trimming"); + assert!( + !occur_subqueries.is_empty(), + "Should not be empty after trimming" + ); Box::new(BooleanQuery::from(occur_subqueries)) - }, - Some(LogicalAST::Leaf(trimmed_logical_literal)) => convert_literal_to_query(*trimmed_logical_literal), - None => Box::new(EmptyQuery) + } + Some(LogicalAST::Leaf(trimmed_logical_literal)) => { + convert_literal_to_query(*trimmed_logical_literal) + } + None => Box::new(EmptyQuery), } } @@ -475,7 +484,7 @@ mod test { use schema::Field; use schema::{IndexRecordOption, TextFieldIndexing, TextOptions}; use schema::{SchemaBuilder, Term, INT_INDEXED, STORED, STRING, TEXT}; - use tokenizer::{Tokenizer, SimpleTokenizer, LowerCaser, StopWordFilter, TokenizerManager}; + use tokenizer::{LowerCaser, SimpleTokenizer, StopWordFilter, Tokenizer, TokenizerManager}; use Index; fn make_query_parser() -> QueryParser { @@ -498,9 +507,11 @@ mod test { let schema = schema_builder.build(); let default_fields = vec![title, text]; let tokenizer_manager = TokenizerManager::default(); - tokenizer_manager.register("en_with_stop_words", SimpleTokenizer - .filter(LowerCaser) - .filter(StopWordFilter::remove(vec!["the".to_string()])) + tokenizer_manager.register( + "en_with_stop_words", + SimpleTokenizer + .filter(LowerCaser) + .filter(StopWordFilter::remove(vec!["the".to_string()])), ); QueryParser::new(schema, default_fields, tokenizer_manager) } @@ -571,16 +582,8 @@ mod test { #[test] pub fn test_parse_query_empty() { - test_parse_query_to_logical_ast_helper( - "", - "", - false, - ); - test_parse_query_to_logical_ast_helper( - " ", - "", - false, - ); + test_parse_query_to_logical_ast_helper("", "", false); + test_parse_query_to_logical_ast_helper(" ", "", false); let query_parser = make_query_parser(); let query_result = query_parser.parse_query(""); let query = query_result.unwrap(); @@ -693,11 +696,7 @@ mod test { "(Excluded(Term([0, 0, 0, 0, 116, 105, 116, 105])) TO Unbounded)", false, ); - test_parse_query_to_logical_ast_helper( - "*", - "*", - false, - ); + test_parse_query_to_logical_ast_helper("*", "*", false); } #[test] diff --git a/src/query/query_parser/user_input_ast.rs b/src/query/query_parser/user_input_ast.rs index 37adb94be..40cdd2424 100644 --- a/src/query/query_parser/user_input_ast.rs +++ b/src/query/query_parser/user_input_ast.rs @@ -16,9 +16,7 @@ pub enum UserInputLeaf { impl Debug for UserInputLeaf { fn fmt(&self, formatter: &mut Formatter) -> Result<(), fmt::Error> { match self { - UserInputLeaf::Literal(literal) => { - literal.fmt(formatter) - } + UserInputLeaf::Literal(literal) => literal.fmt(formatter), UserInputLeaf::Range { ref field, ref lower, @@ -82,13 +80,12 @@ impl UserInputBound { pub enum UserInputAST { Clause(Vec), Unary(Occur, Box), -// Not(Box), -// Should(Box), -// Must(Box), + // Not(Box), + // Should(Box), + // Must(Box), Leaf(Box), } - impl UserInputAST { pub fn unary(self, occur: Occur) -> UserInputAST { UserInputAST::Unary(occur, Box::new(self)) @@ -100,12 +97,10 @@ impl UserInputAST { if asts.len() == 1 { asts.into_iter().next().unwrap() //< safe } else { - UserInputAST::Clause(asts - .into_iter() - .map(|ast: UserInputAST| - ast.unary(occur) - ) - .collect::>() + UserInputAST::Clause( + asts.into_iter() + .map(|ast: UserInputAST| ast.unary(occur)) + .collect::>(), ) } } @@ -117,11 +112,8 @@ impl UserInputAST { pub fn or(asts: Vec) -> UserInputAST { UserInputAST::compose(Occur::Should, asts) } - } - - /* impl UserInputAST { diff --git a/src/query/range_query.rs b/src/query/range_query.rs index 6ec9c587f..fd739652c 100644 --- a/src/query/range_query.rs +++ b/src/query/range_query.rs @@ -274,7 +274,6 @@ impl RangeWeight { } impl Weight for RangeWeight { - fn scorer(&self, reader: &SegmentReader) -> Result> { let max_doc = reader.max_doc(); let mut doc_bitset = BitSet::with_max_value(max_doc); @@ -370,9 +369,7 @@ mod tests { let searcher = index.searcher(); let count_multiples = |range_query: RangeQuery| { let mut count_collector = CountCollector::default(); - range_query - .search(&searcher, &mut count_collector) - .unwrap(); + range_query.search(&searcher, &mut count_collector).unwrap(); count_collector.count() }; diff --git a/src/query/scorer.rs b/src/query/scorer.rs index 186e75a22..2c2f0cd62 100644 --- a/src/query/scorer.rs +++ b/src/query/scorer.rs @@ -50,7 +50,6 @@ impl Scorer for Box { } } - /// Wraps a `DocSet` and simply returns a constant `Scorer`. /// The `ConstScorer` is useful if you have a `DocSet` where /// you needed a scorer. diff --git a/src/query/term_query/term_query.rs b/src/query/term_query/term_query.rs index d6cd72288..267ca9ba7 100644 --- a/src/query/term_query/term_query.rs +++ b/src/query/term_query/term_query.rs @@ -3,10 +3,10 @@ use query::bm25::BM25Weight; use query::Query; use query::Weight; use schema::IndexRecordOption; +use std::collections::BTreeSet; use Result; use Searcher; use Term; -use std::collections::BTreeSet; /// A Term query matches all of the documents /// containing a specific term. diff --git a/src/schema/schema.rs b/src/schema/schema.rs index d000ab9e2..0855200f4 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -444,7 +444,10 @@ mod tests { ) .unwrap(); assert_eq!(doc.get_first(title_field).unwrap().text(), Some("my title")); - assert_eq!(doc.get_first(author_field).unwrap().text(), Some("fulmicoton")); + assert_eq!( + doc.get_first(author_field).unwrap().text(), + Some("fulmicoton") + ); assert_eq!(doc.get_first(count_field).unwrap().u64_value(), 4); assert_eq!(doc.get_first(popularity_field).unwrap().i64_value(), 10); } diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 8f4ec43b1..3957417e6 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -1,14 +1,14 @@ use htmlescape::encode_minimal; -use std::collections::BTreeMap; -use tokenizer::{Token, TokenStream}; -use Result; use query::Query; -use Searcher; use schema::Field; +use std::cmp::Ordering; +use std::collections::BTreeMap; use std::collections::BTreeSet; use tokenizer::BoxedTokenizer; +use tokenizer::{Token, TokenStream}; use Document; -use std::cmp::Ordering; +use Result; +use Searcher; const DEFAULT_MAX_NUM_CHARS: usize = 150; @@ -75,11 +75,10 @@ const HIGHLIGHTEN_PREFIX: &str = ""; const HIGHLIGHTEN_POSTFIX: &str = ""; impl Snippet { - pub fn empty() -> Snippet { Snippet { fragments: String::new(), - highlighted: Vec::new() + highlighted: Vec::new(), } } @@ -157,16 +156,17 @@ fn select_best_fragment_combination<'a>( fragments: Vec, text: &'a str, ) -> Snippet { - let best_fragment_opt = fragments - .iter() - .max_by(|left, right| { - let cmp_score = left.score.partial_cmp(&right.score).unwrap_or(Ordering::Equal); - if cmp_score == Ordering::Equal { - (right.start_offset, right.stop_offset).cmp(&(left.start_offset, left.stop_offset)) - } else { - cmp_score - } - }); + let best_fragment_opt = fragments.iter().max_by(|left, right| { + let cmp_score = left + .score + .partial_cmp(&right.score) + .unwrap_or(Ordering::Equal); + if cmp_score == Ordering::Equal { + (right.start_offset, right.stop_offset).cmp(&(left.start_offset, left.stop_offset)) + } else { + cmp_score + } + }); if let Some(fragment) = best_fragment_opt { let fragment_text = &text[fragment.start_offset..fragment.stop_offset]; let highlighted = fragment @@ -177,7 +177,8 @@ fn select_best_fragment_combination<'a>( item.start - fragment.start_offset, item.stop - fragment.start_offset, ) - }).collect(); + }) + .collect(); Snippet { fragments: fragment_text.to_string(), highlighted: highlighted, @@ -239,17 +240,16 @@ pub struct SnippetGenerator { terms_text: BTreeMap, tokenizer: Box, field: Field, - max_num_chars: usize + max_num_chars: usize, } impl SnippetGenerator { /// Creates a new snippet generator - pub fn new(searcher: &Searcher, - query: &Query, - field: Field) -> Result { + pub fn new(searcher: &Searcher, query: &Query, field: Field) -> Result { let mut terms = BTreeSet::new(); query.query_terms(&mut terms); - let terms_text: BTreeMap = terms.into_iter() + let terms_text: BTreeMap = terms + .into_iter() .filter(|term| term.field() == field) .map(|term| (term.text().to_string(), 1f32)) .collect(); @@ -258,7 +258,7 @@ impl SnippetGenerator { terms_text, tokenizer, field, - max_num_chars: DEFAULT_MAX_NUM_CHARS + max_num_chars: DEFAULT_MAX_NUM_CHARS, }) } @@ -272,7 +272,8 @@ impl SnippetGenerator { /// This method extract the text associated to the `SnippetGenerator`'s field /// and computes a snippet. pub fn snippet_from_doc(&self, doc: &Document) -> Snippet { - let text: String = doc.get_all(self.field) + let text: String = doc + .get_all(self.field) .into_iter() .flat_map(|val| val.text()) .collect::>() @@ -282,10 +283,12 @@ impl SnippetGenerator { /// Generates a snippet for the given text. pub fn snippet(&self, text: &str) -> Snippet { - let fragment_candidates = search_fragments(&*self.tokenizer, - &text, - &self.terms_text, - self.max_num_chars); + let fragment_candidates = search_fragments( + &*self.tokenizer, + &text, + &self.terms_text, + self.max_num_chars, + ); select_best_fragment_combination(fragment_candidates, &text) } } @@ -293,16 +296,16 @@ impl SnippetGenerator { #[cfg(test)] mod tests { use super::{search_fragments, select_best_fragment_combination}; + use query::QueryParser; + use schema::{IndexRecordOption, SchemaBuilder, TextFieldIndexing, TextOptions}; use std::collections::BTreeMap; use std::iter::Iterator; use tokenizer::{box_tokenizer, SimpleTokenizer}; use Index; - use schema::{SchemaBuilder, IndexRecordOption, TextOptions, TextFieldIndexing}; use SnippetGenerator; - use query::QueryParser; - - const TEST_TEXT: &'static str = r#"Rust is a systems programming language sponsored by Mozilla which + const TEST_TEXT: &'static str = + r#"Rust is a systems programming language sponsored by Mozilla which describes it as a "safe, concurrent, practical language", supporting functional and imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?], but its designers intend it to provide better memory safety while still maintaining @@ -431,7 +434,7 @@ Survey in 2016, 2017, and 2018."#; let text = "a b c d"; - let terms = BTreeMap::new(); + let terms = BTreeMap::new(); let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3); assert_eq!(fragments.len(), 0); @@ -442,12 +445,12 @@ Survey in 2016, 2017, and 2018."#; #[test] fn test_snippet_generator() { - let mut schema_builder = SchemaBuilder::default (); - let text_options = TextOptions::default() - .set_indexing_options(TextFieldIndexing::default() + let mut schema_builder = SchemaBuilder::default(); + let text_options = TextOptions::default().set_indexing_options( + TextFieldIndexing::default() .set_tokenizer("en_stem") - .set_index_option(IndexRecordOption::Basic) - ); + .set_index_option(IndexRecordOption::Basic), + ); let text_field = schema_builder.add_text_field("text", text_options); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -474,6 +477,5 @@ Survey in 2016, 2017, and 2018."#; let snippet = snippet_generator.snippet(TEST_TEXT); assert_eq!(snippet.to_html(), "Rust is syntactically similar to C++[according to whom?],\nbut its designers intend it to"); } - } } diff --git a/src/store/mod.rs b/src/store/mod.rs index 7bce9085d..57930e8d8 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -109,7 +109,13 @@ pub mod tests { let store = StoreReader::from_source(store_source); for i in 0..1_000 { assert_eq!( - *store.get(i).unwrap().get_first(field_title).unwrap().text().unwrap(), + *store + .get(i) + .unwrap() + .get_first(field_title) + .unwrap() + .text() + .unwrap(), format!("Doc {}", i) ); } diff --git a/src/store/skiplist/skiplist_builder.rs b/src/store/skiplist/skiplist_builder.rs index 14ccd6dda..61f04bf34 100644 --- a/src/store/skiplist/skiplist_builder.rs +++ b/src/store/skiplist/skiplist_builder.rs @@ -72,7 +72,8 @@ impl SkipListBuilder { let mut skip_pointer = self.data_layer.insert(key, dest)?; loop { skip_pointer = match skip_pointer { - Some((skip_doc_id, skip_offset)) => self.get_skip_layer(layer_id) + Some((skip_doc_id, skip_offset)) => self + .get_skip_layer(layer_id) .insert(skip_doc_id, &skip_offset)?, None => { return Ok(()); diff --git a/src/termdict/term_info_store.rs b/src/termdict/term_info_store.rs index 70ded6090..98947feb5 100644 --- a/src/termdict/term_info_store.rs +++ b/src/termdict/term_info_store.rs @@ -59,7 +59,6 @@ impl TermInfoBlockMeta { } fn deserialize_term_info(&self, data: &[u8], inner_offset: usize) -> TermInfo { - let num_bits = self.num_bits() as usize; let mut cursor = num_bits * inner_offset; diff --git a/src/termdict/termdict.rs b/src/termdict/termdict.rs index f633211ef..03738e694 100644 --- a/src/termdict/termdict.rs +++ b/src/termdict/termdict.rs @@ -164,7 +164,8 @@ impl TermDictionary { let fst = self.fst_index.as_fst(); let mut node = fst.root(); while ord != 0 || !node.is_final() { - if let Some(transition) = node.transitions() + if let Some(transition) = node + .transitions() .take_while(|transition| transition.out.value() <= ord) .last() { diff --git a/src/tokenizer/lower_caser.rs b/src/tokenizer/lower_caser.rs index 578678a4a..38fa782fc 100644 --- a/src/tokenizer/lower_caser.rs +++ b/src/tokenizer/lower_caser.rs @@ -31,7 +31,6 @@ fn to_lowercase_unicode(text: &mut String, output: &mut String) { } } - impl TokenStream for LowerCaserTokenStream where TailTokenStream: TokenStream, @@ -50,7 +49,7 @@ where // fast track for ascii. self.token_mut().text.make_ascii_lowercase(); } else { - to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer); + to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer); mem::swap(&mut self.tail.token_mut().text, &mut self.buffer); } @@ -68,41 +67,43 @@ where fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream { LowerCaserTokenStream { tail, - buffer: String::with_capacity(100) + buffer: String::with_capacity(100), } } } #[cfg(test)] mod tests { - use tokenizer::Tokenizer; use tokenizer::LowerCaser; - use tokenizer::TokenStream; use tokenizer::SimpleTokenizer; + use tokenizer::TokenStream; + use tokenizer::Tokenizer; #[test] fn test_to_lower_case() { - assert_eq!(lowercase_helper("Русский текст"), - vec!["русский".to_string(), "текст".to_string()]); + assert_eq!( + lowercase_helper("Русский текст"), + vec!["русский".to_string(), "текст".to_string()] + ); } fn lowercase_helper(text: &str) -> Vec { let mut tokens = vec![]; - let mut token_stream = SimpleTokenizer - .filter(LowerCaser) - .token_stream(text); + let mut token_stream = SimpleTokenizer.filter(LowerCaser).token_stream(text); while token_stream.advance() { let token_text = token_stream.token().text.clone(); tokens.push(token_text); } tokens - } - + } #[test] fn test_lowercaser() { assert_eq!(lowercase_helper("Tree"), vec!["tree".to_string()]); - assert_eq!(lowercase_helper("Русский"), vec!["русский".to_string()]); + assert_eq!( + lowercase_helper("Русский"), + vec!["русский".to_string()] + ); } -} \ No newline at end of file +} diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 9d94441ab..dd8eb18dd 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -151,8 +151,8 @@ pub use self::simple_tokenizer::SimpleTokenizer; pub use self::stemmer::Stemmer; pub use self::stop_word_filter::StopWordFilter; pub(crate) use self::token_stream_chain::TokenStreamChain; -pub use self::tokenizer::BoxedTokenizer; pub(crate) use self::tokenizer::box_tokenizer; +pub use self::tokenizer::BoxedTokenizer; pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer}; pub use self::tokenizer_manager::TokenizerManager; diff --git a/src/tokenizer/raw_tokenizer.rs b/src/tokenizer/raw_tokenizer.rs index 12a5af82c..338109b88 100644 --- a/src/tokenizer/raw_tokenizer.rs +++ b/src/tokenizer/raw_tokenizer.rs @@ -18,7 +18,7 @@ impl<'a> Tokenizer<'a> for RawTokenizer { offset_to: text.len(), position: 0, text: text.to_string(), - position_length: 1 + position_length: 1, }; RawTokenStream { token, diff --git a/src/tokenizer/token_stream_chain.rs b/src/tokenizer/token_stream_chain.rs index 01d631e2b..224d7746c 100644 --- a/src/tokenizer/token_stream_chain.rs +++ b/src/tokenizer/token_stream_chain.rs @@ -71,13 +71,16 @@ where #[cfg(test)] mod tests { - use super::POSITION_GAP; + use super::super::{SimpleTokenizer, TokenStream, Tokenizer}; use super::TokenStreamChain; - use super::super::{Tokenizer, TokenStream, SimpleTokenizer}; + use super::POSITION_GAP; #[test] fn test_chain_first_emits_no_tokens() { - let token_streams = vec![SimpleTokenizer.token_stream(""), SimpleTokenizer.token_stream("hello world")]; + let token_streams = vec![ + SimpleTokenizer.token_stream(""), + SimpleTokenizer.token_stream("hello world"), + ]; let mut token_chain = TokenStreamChain::new(vec![0, 0], token_streams); assert!(token_chain.advance()); @@ -91,8 +94,8 @@ mod tests { assert_eq!(token_chain.token().offset_from, 6); assert_eq!(token_chain.token().offset_to, 11); assert_eq!(token_chain.token().position, POSITION_GAP); - + assert!(!token_chain.advance()); } -} \ No newline at end of file +} diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs index fcdf8f21b..d73f84e93 100644 --- a/src/tokenizer/tokenizer.rs +++ b/src/tokenizer/tokenizer.rs @@ -276,7 +276,7 @@ mod test { offset_from: 2, offset_to: 3, text: "abc".to_string(), - position_length: 1 + position_length: 1, }; let t2 = t1.clone(); From 06e7bd18e7a34027697b3c705ac755e3984aa1cf Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 15 Sep 2018 14:56:14 +0900 Subject: [PATCH 53/62] Clippy (#421) * Cargo Format * Clippy * bugfix * still clippy stuff * clippy step 2 --- src/common/bitset.rs | 6 +++--- src/core/index_meta.rs | 2 +- src/core/inverted_index_reader.rs | 2 +- src/core/segment_meta.rs | 2 +- src/core/segment_reader.rs | 2 +- src/directory/managed_directory.rs | 4 ++-- src/error.rs | 22 +++++++++++----------- src/fieldnorm/code.rs | 3 ++- src/indexer/delete_queue.rs | 2 +- src/indexer/directory_lock.rs | 10 +++++----- src/indexer/index_writer.rs | 3 +-- src/indexer/segment_entry.rs | 4 ++-- src/indexer/segment_updater.rs | 2 +- src/lib.rs | 15 ++++++++------- src/positions/reader.rs | 2 +- src/positions/serializer.rs | 4 ++-- src/postings/compression/vint.rs | 16 +++++++--------- src/postings/mod.rs | 2 +- src/postings/postings_writer.rs | 21 +++++++++------------ src/postings/segment_postings.rs | 10 ++++------ src/postings/serializer.rs | 6 +++--- src/postings/stacker/memory_arena.rs | 12 ++++++------ src/postings/stacker/murmurhash2.rs | 1 + src/postings/stacker/term_hashmap.rs | 2 +- src/query/all_query.rs | 4 ++-- src/query/bm25.rs | 6 +++--- src/query/boolean_query/boolean_query.rs | 2 +- src/query/boolean_query/boolean_weight.rs | 2 +- src/query/fuzzy_query.rs | 2 +- src/query/intersection.rs | 11 ++++++----- src/query/occur.rs | 4 ++-- src/query/query_parser/query_grammar.rs | 14 ++++++++------ src/query/query_parser/query_parser.rs | 2 ++ src/query/query_parser/user_input_ast.rs | 2 +- src/query/regex_query.rs | 2 +- src/query/union.rs | 1 + src/schema/document.rs | 2 +- src/schema/facet.rs | 2 +- src/schema/index_record_option.rs | 16 ++++++++-------- src/snippet/mod.rs | 4 ++-- src/store/reader.rs | 2 +- src/termdict/merger.rs | 4 ++-- src/termdict/mod.rs | 16 ++++++++-------- src/termdict/term_info_store.rs | 7 ++++--- src/termdict/termdict.rs | 10 +++++----- 45 files changed, 136 insertions(+), 134 deletions(-) diff --git a/src/common/bitset.rs b/src/common/bitset.rs index 326e7cee8..750d835f8 100644 --- a/src/common/bitset.rs +++ b/src/common/bitset.rs @@ -34,17 +34,17 @@ impl TinySet { } /// Returns the complement of the set in `[0, 64[`. - fn complement(&self) -> TinySet { + fn complement(self) -> TinySet { TinySet(!self.0) } /// Returns true iff the `TinySet` contains the element `el`. - pub fn contains(&self, el: u32) -> bool { + pub fn contains(self, el: u32) -> bool { !self.intersect(TinySet::singleton(el)).is_empty() } /// Returns the intersection of `self` and `other` - pub fn intersect(&self, other: TinySet) -> TinySet { + pub fn intersect(self, other: TinySet) -> TinySet { TinySet(self.0 & other.0) } diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs index b5ed52427..ecef75d02 100644 --- a/src/core/index_meta.rs +++ b/src/core/index_meta.rs @@ -58,7 +58,7 @@ mod tests { }; let index_metas = IndexMeta { segments: Vec::new(), - schema: schema, + schema, opstamp: 0u64, payload: None, }; diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs index bb71be1ae..9ae0dbf43 100644 --- a/src/core/inverted_index_reader.rs +++ b/src/core/inverted_index_reader.rs @@ -59,7 +59,7 @@ impl InvertedIndexReader { .get_index_record_option() .unwrap_or(IndexRecordOption::Basic); InvertedIndexReader { - termdict: TermDictionary::empty(field_type), + termdict: TermDictionary::empty(&field_type), postings_source: ReadOnlySource::empty(), positions_source: ReadOnlySource::empty(), positions_idx_source: ReadOnlySource::empty(), diff --git a/src/core/segment_meta.rs b/src/core/segment_meta.rs index f1e707bad..9478663ea 100644 --- a/src/core/segment_meta.rs +++ b/src/core/segment_meta.rs @@ -50,7 +50,7 @@ impl<'a> serde::Deserialize<'a> for SegmentMeta { { let inner = InnerSegmentMeta::deserialize(deserializer)?; let tracked = INVENTORY.track(inner); - Ok(SegmentMeta { tracked: tracked }) + Ok(SegmentMeta { tracked }) } } diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index dff6cca48..03cfdf08d 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -400,7 +400,7 @@ pub struct SegmentReaderAliveDocsIterator<'a> { impl<'a> SegmentReaderAliveDocsIterator<'a> { pub fn new(reader: &'a SegmentReader) -> SegmentReaderAliveDocsIterator<'a> { SegmentReaderAliveDocsIterator { - reader: reader, + reader, max_doc: reader.max_doc(), current: 0, } diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index 7a9a8bd15..34259c184 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -21,7 +21,7 @@ use Result; /// are not managed. fn is_managed(path: &Path) -> bool { path.to_str() - .map(|p_str| !p_str.starts_with(".")) + .map(|p_str| !p_str.starts_with('.')) .unwrap_or(true) } @@ -52,7 +52,7 @@ fn save_managed_paths( wlock: &RwLockWriteGuard, ) -> io::Result<()> { let mut w = serde_json::to_vec(&wlock.managed_paths)?; - write!(&mut w, "\n")?; + writeln!(&mut w)?; directory.atomic_write(&MANAGED_FILEPATH, &w[..])?; Ok(()) } diff --git a/src/error.rs b/src/error.rs index 8509d28cd..d7f0d1d1a 100644 --- a/src/error.rs +++ b/src/error.rs @@ -53,31 +53,31 @@ pub enum TantivyError { impl From for TantivyError { fn from(fastfield_error: FastFieldNotAvailableError) -> TantivyError { - TantivyError::FastFieldError(fastfield_error).into() + TantivyError::FastFieldError(fastfield_error) } } impl From for TantivyError { fn from(io_error: IOError) -> TantivyError { - TantivyError::IOError(io_error).into() + TantivyError::IOError(io_error) } } impl From for TantivyError { fn from(io_error: io::Error) -> TantivyError { - TantivyError::IOError(io_error.into()).into() + TantivyError::IOError(io_error.into()) } } impl From for TantivyError { fn from(parsing_error: query::QueryParserError) -> TantivyError { - TantivyError::InvalidArgument(format!("Query is invalid. {:?}", parsing_error)).into() + TantivyError::InvalidArgument(format!("Query is invalid. {:?}", parsing_error)) } } impl From> for TantivyError { fn from(_: PoisonError) -> TantivyError { - TantivyError::Poisoned.into() + TantivyError::Poisoned } } @@ -85,16 +85,16 @@ impl From for TantivyError { fn from(error: OpenReadError) -> TantivyError { match error { OpenReadError::FileDoesNotExist(filepath) => { - TantivyError::PathDoesNotExist(filepath).into() + TantivyError::PathDoesNotExist(filepath) } - OpenReadError::IOError(io_error) => TantivyError::IOError(io_error).into(), + OpenReadError::IOError(io_error) => TantivyError::IOError(io_error), } } } impl From for TantivyError { fn from(error: schema::DocParsingError) -> TantivyError { - TantivyError::InvalidArgument(format!("Failed to parse document {:?}", error)).into() + TantivyError::InvalidArgument(format!("Failed to parse document {:?}", error)) } } @@ -113,11 +113,11 @@ impl From for TantivyError { fn from(error: OpenDirectoryError) -> TantivyError { match error { OpenDirectoryError::DoesNotExist(directory_path) => { - TantivyError::PathDoesNotExist(directory_path).into() + TantivyError::PathDoesNotExist(directory_path) } OpenDirectoryError::NotADirectory(directory_path) => TantivyError::InvalidArgument( format!("{:?} is not a directory", directory_path), - ).into(), + ), } } } @@ -125,6 +125,6 @@ impl From for TantivyError { impl From for TantivyError { fn from(error: serde_json::Error) -> TantivyError { let io_err = io::Error::from(error); - TantivyError::IOError(io_err.into()).into() + TantivyError::IOError(io_err.into()) } } diff --git a/src/fieldnorm/code.rs b/src/fieldnorm/code.rs index 71079bd02..230c0e743 100644 --- a/src/fieldnorm/code.rs +++ b/src/fieldnorm/code.rs @@ -10,11 +10,12 @@ pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 { .unwrap_or_else(|idx| idx - 1) as u8 } +#[cfg_attr(feature = "cargo-clippy", allow(clippy::unreadable_literal))] pub const FIELD_NORMS_TABLE: [u32; 256] = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 44, 46, 48, 50, 52, 54, 56, 60, 64, 68, 72, 76, 80, 84, 88, 96, 104, 112, 120, 128, 136, 144, 152, 168, 184, 200, 216, 232, - 248, 264, 280, 312, 344, 376, 408, 440, 472, 504, 536, 600, 664, 728, 792, 856, 920, 984, 1048, + 248, 264, 280, 312, 344, 376, 408, 440, 472, 504, 536, 600, 664, 728, 792, 856, 920, 984, 1_048, 1176, 1304, 1432, 1560, 1688, 1816, 1944, 2072, 2328, 2584, 2840, 3096, 3352, 3608, 3864, 4120, 4632, 5144, 5656, 6168, 6680, 7192, 7704, 8216, 9240, 10264, 11288, 12312, 13336, 14360, 15384, 16408, 18456, 20504, 22552, 24600, 26648, 28696, 30744, 32792, 36888, 40984, 45080, 49176, diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index f921b7523..18eff3387 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -186,7 +186,7 @@ impl DeleteCursor { /// `opstamp >= target_opstamp`. pub fn skip_to(&mut self, target_opstamp: u64) { // TODO Can be optimize as we work with block. - #[cfg_attr(feature = "cargo-clippy", allow(while_let_loop))] + #[cfg_attr(feature = "cargo-clippy", allow(clippy::while_let_loop))] loop { if let Some(operation) = self.get() { if operation.opstamp >= target_opstamp { diff --git a/src/indexer/directory_lock.rs b/src/indexer/directory_lock.rs index 9555234bb..172165bc2 100644 --- a/src/indexer/directory_lock.rs +++ b/src/indexer/directory_lock.rs @@ -61,8 +61,8 @@ impl RetryPolicy { } impl LockType { - fn retry_policy(&self) -> RetryPolicy { - match *self { + fn retry_policy(self) -> RetryPolicy { + match self { LockType::IndexWriterLock => RetryPolicy::no_retry(), LockType::MetaLock => RetryPolicy { num_retries: 100, @@ -71,10 +71,10 @@ impl LockType { } } - fn try_acquire_lock(&self, directory: &mut Directory) -> Result { + fn try_acquire_lock(self, directory: &mut Directory) -> Result { let path = self.filename(); let mut write = directory.open_write(path).map_err(|e| match e { - OpenWriteError::FileAlreadyExists(_) => TantivyError::LockFailure(*self), + OpenWriteError::FileAlreadyExists(_) => TantivyError::LockFailure(self), OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error), })?; write.flush()?; @@ -85,7 +85,7 @@ impl LockType { } /// Acquire a lock in the given directory. - pub fn acquire_lock(&self, directory: &Directory) -> Result { + pub fn acquire_lock(self, directory: &Directory) -> Result { let mut box_directory = directory.box_clone(); let mut retry_policy = self.retry_policy(); loop { diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 2b791ecec..32017e8f5 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -54,7 +54,6 @@ type DocumentReceiver = channel::Receiver; fn initial_table_size(per_thread_memory_budget: usize) -> usize { let table_size_limit: usize = per_thread_memory_budget / 3; (1..) - .into_iter() .take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit) .last() .expect(&format!( @@ -177,7 +176,7 @@ pub fn compute_deleted_bitset( ) -> Result { let mut might_have_changed = false; - #[cfg_attr(feature = "cargo-clippy", allow(while_let_loop))] + #[cfg_attr(feature = "cargo-clippy", allow(clippy::while_let_loop))] loop { if let Some(delete_op) = delete_cursor.get() { if delete_op.opstamp > target_opstamp { diff --git a/src/indexer/segment_entry.rs b/src/indexer/segment_entry.rs index c35406ad1..7e23940d5 100644 --- a/src/indexer/segment_entry.rs +++ b/src/indexer/segment_entry.rs @@ -11,8 +11,8 @@ pub enum SegmentState { } impl SegmentState { - pub fn letter_code(&self) -> char { - match *self { + pub fn letter_code(self) -> char { + match self { SegmentState::InMerge => 'M', SegmentState::Ready => 'R', } diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 732270ea1..2f1aab70c 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -72,7 +72,7 @@ pub fn save_metas( payload, }; let mut buffer = serde_json::to_vec_pretty(&metas)?; - write!(&mut buffer, "\n")?; + writeln!(&mut buffer)?; directory.atomic_write(&META_FILEPATH, &buffer[..])?; debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas)); Ok(()) diff --git a/src/lib.rs b/src/lib.rs index 9607062e1..15a2bd567 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,11 +1,13 @@ #![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")] -#![cfg_attr(feature = "cargo-clippy", allow(module_inception))] -#![cfg_attr(feature = "cargo-clippy", allow(inline_always))] +#![cfg_attr(feature = "cargo-clippy", allow(clippy::module_inception))] +#![cfg_attr(feature = "cargo-clippy", allow(clippy::inline_always))] +#![cfg_attr(feature = "cargo-clippy", feature(tool_lints))] #![cfg_attr(all(feature = "unstable", test), feature(test))] +#![cfg_attr(feature = "cargo-clippy", allow(clippy::new_without_default))] +#![cfg_attr(feature = "cargo-clippy", allow(clippy::decimal_literal_representation))] + #![doc(test(attr(allow(unused_variables), deny(warnings))))] #![allow(unknown_lints)] -#![allow(new_without_default)] -#![allow(decimal_literal_representation)] #![warn(missing_docs)] #![recursion_limit = "80"] @@ -133,7 +135,6 @@ extern crate bit_set; extern crate bitpacking; extern crate byteorder; -#[macro_use] extern crate combine; extern crate crossbeam; @@ -266,12 +267,12 @@ impl DocAddress { /// The segment ordinal is an id identifying the segment /// hosting the document. It is only meaningful, in the context /// of a searcher. - pub fn segment_ord(&self) -> SegmentLocalId { + pub fn segment_ord(self) -> SegmentLocalId { self.0 } /// Return the segment local `DocId` - pub fn doc(&self) -> DocId { + pub fn doc(self) -> DocId { self.1 } } diff --git a/src/positions/reader.rs b/src/positions/reader.rs index dec653830..9a0157725 100644 --- a/src/positions/reader.rs +++ b/src/positions/reader.rs @@ -127,7 +127,7 @@ impl PositionReader { self.ahead = self.ahead.and_then(|num_blocks| { if num_blocks >= num_blocks_to_advance { - Some(num_blocks_to_advance - num_blocks_to_advance) + Some(num_blocks - num_blocks_to_advance) } else { None } diff --git a/src/positions/serializer.rs b/src/positions/serializer.rs index 2a3dc09c1..68c6885cd 100644 --- a/src/positions/serializer.rs +++ b/src/positions/serializer.rs @@ -51,8 +51,8 @@ impl PositionSerializer { fn flush_block(&mut self) -> io::Result<()> { let num_bits = BIT_PACKER.num_bits(&self.block[..]); - self.cumulated_num_bits += num_bits as u64; - self.write_skiplist.write(&[num_bits])?; + self.cumulated_num_bits += u64::from(num_bits); + self.write_skiplist.write_all(&[num_bits])?; let written_len = BIT_PACKER.compress(&self.block[..], &mut self.buffer, num_bits); self.write_stream.write_all(&self.buffer[..written_len])?; self.block.clear(); diff --git a/src/postings/compression/vint.rs b/src/postings/compression/vint.rs index 88a0df5a5..87a672e64 100644 --- a/src/postings/compression/vint.rs +++ b/src/postings/compression/vint.rs @@ -45,40 +45,38 @@ pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a pub fn uncompress_sorted<'a>(compressed_data: &'a [u8], output: &mut [u32], offset: u32) -> usize { let mut read_byte = 0; let mut result = offset; - let num_els = output.len(); - for i in 0..num_els { + for output_mut in output.iter_mut() { let mut shift = 0u32; loop { let cur_byte = compressed_data[read_byte]; read_byte += 1; - result += ((cur_byte % 128u8) as u32) << shift; + result += u32::from(cur_byte % 128u8) << shift; if cur_byte & 128u8 != 0u8 { break; } shift += 7; } - output[i] = result; + *output_mut = result; } read_byte } #[inline(always)] -pub(crate) fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> usize { +pub(crate) fn uncompress_unsorted(compressed_data: &[u8], output_arr: &mut [u32]) -> usize { let mut read_byte = 0; - let num_els = output.len(); - for i in 0..num_els { + for output_mut in output_arr.iter_mut() { let mut result = 0u32; let mut shift = 0u32; loop { let cur_byte = compressed_data[read_byte]; read_byte += 1; - result += ((cur_byte % 128u8) as u32) << shift; + result += u32::from(cur_byte % 128u8) << shift; if cur_byte & 128u8 != 0u8 { break; } shift += 7; } - output[i] = result; + *output_mut = result; } read_byte } diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 89b4a3c62..ec0940ff0 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -34,7 +34,7 @@ pub(crate) const USE_SKIP_INFO_LIMIT: u32 = COMPRESSION_BLOCK_SIZE as u32; pub(crate) type UnorderedTermId = u64; -#[allow(enum_variant_names)] +#[cfg_attr(feature = "cargo-clippy", allow(clippy::enum_variant_names))] #[derive(Debug, PartialEq, Clone, Copy, Eq)] pub(crate) enum FreqReadingOption { NoFreq, diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index bb3ca10a4..b3f879611 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -15,7 +15,7 @@ use tokenizer::TokenStream; use DocId; use Result; -fn posting_from_field_entry<'a>(field_entry: &FieldEntry) -> Box { +fn posting_from_field_entry(field_entry: &FieldEntry) -> Box { match *field_entry.field_type() { FieldType::Str(ref text_options) => text_options .get_indexing_options() @@ -128,8 +128,8 @@ impl MultiFieldPostingsWriter { let field_entry = self.schema.get_field_entry(field); - match field_entry.field_type() { - &FieldType::Str(_) | &FieldType::HierarchicalFacet => { + match *field_entry.field_type() { + FieldType::Str(_) | FieldType::HierarchicalFacet => { // populating the (unordered term ord) -> (ordered term ord) mapping // for the field. let mut unordered_term_ids = term_offsets[start..stop] @@ -143,8 +143,8 @@ impl MultiFieldPostingsWriter { .collect(); unordered_term_mappings.insert(field, mapping); } - &FieldType::U64(_) | &FieldType::I64(_) => {} - &FieldType::Bytes => {} + FieldType::U64(_) | FieldType::I64(_) => {} + FieldType::Bytes => {} } let postings_writer = &self.per_field_postings_writers[field.0 as usize]; @@ -203,14 +203,11 @@ pub trait PostingsWriter { heap: &mut MemoryArena, ) -> u32 { let mut term = Term::for_field(field); - let num_tokens = { - let mut sink = |token: &Token| { - term.set_text(token.text.as_str()); - self.subscribe(term_index, doc_id, token.position as u32, &term, heap); - }; - token_stream.process(&mut sink) + let mut sink = |token: &Token| { + term.set_text(token.text.as_str()); + self.subscribe(term_index, doc_id, token.position as u32, &term, heap); }; - num_tokens + token_stream.process(&mut sink) } fn total_num_tokens(&self) -> u64; diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 8986c036b..38f12fe09 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -205,11 +205,9 @@ impl DocSet for SegmentPostings { return SkipResult::End; } } - } else { + } else if self.block_cursor.skip_to(target) == BlockSegmentPostingsSkipResult::Terminated { // no positions needed. no need to sum freqs. - if self.block_cursor.skip_to(target) == BlockSegmentPostingsSkipResult::Terminated { - return SkipResult::End; - } + return SkipResult::End; } self.cur = 0; } @@ -236,9 +234,9 @@ impl DocSet for SegmentPostings { let doc = block_docs[new_cur]; debug_assert!(doc >= target); if doc == target { - return SkipResult::Reached; + SkipResult::Reached } else { - return SkipResult::OverStep; + SkipResult::OverStep } } diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index f7661933a..f578f2caf 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -100,7 +100,7 @@ impl InvertedIndexSerializer { let positionsidx_write = self.positionsidx_write.for_field(field); let field_type: FieldType = (*field_entry.field_type()).clone(); FieldSerializer::new( - field_type, + &field_type, term_dictionary_write, postings_write, positions_write, @@ -131,7 +131,7 @@ pub struct FieldSerializer<'a> { impl<'a> FieldSerializer<'a> { fn new( - field_type: FieldType, + field_type: &FieldType, term_dictionary_write: &'a mut CountingWriter, postings_write: &'a mut CountingWriter, positions_write: &'a mut CountingWriter, @@ -152,7 +152,7 @@ impl<'a> FieldSerializer<'a> { _ => (false, false), }; let term_dictionary_builder = - TermDictionaryBuilder::new(term_dictionary_write, field_type)?; + TermDictionaryBuilder::new(term_dictionary_write, &field_type)?; let postings_serializer = PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled); let positions_serializer_opt = if position_enabled { diff --git a/src/postings/stacker/memory_arena.rs b/src/postings/stacker/memory_arena.rs index b420fdb22..3b0f875d4 100644 --- a/src/postings/stacker/memory_arena.rs +++ b/src/postings/stacker/memory_arena.rs @@ -47,7 +47,7 @@ impl Addr { } /// Returns the `Addr` object for `addr + offset` - pub fn offset(&self, offset: u32) -> Addr { + pub fn offset(self, offset: u32) -> Addr { Addr(self.0.wrapping_add(offset)) } @@ -55,16 +55,16 @@ impl Addr { Addr((page_id << NUM_BITS_PAGE_ADDR | local_addr) as u32) } - fn page_id(&self) -> usize { + fn page_id(self) -> usize { (self.0 as usize) >> NUM_BITS_PAGE_ADDR } - fn page_local_addr(&self) -> usize { + fn page_local_addr(self) -> usize { (self.0 as usize) & (PAGE_SIZE - 1) } /// Returns true if and only if the `Addr` is null. - pub fn is_null(&self) -> bool { + pub fn is_null(self) -> bool { self.0 == u32::max_value() } } @@ -233,12 +233,12 @@ impl Page { #[inline(always)] pub(crate) unsafe fn get_ptr(&self, addr: usize) -> *const u8 { - self.data.as_ptr().offset(addr as isize) + self.data.as_ptr().add(addr) } #[inline(always)] pub(crate) unsafe fn get_mut_ptr(&mut self, addr: usize) -> *mut u8 { - self.data.as_mut_ptr().offset(addr as isize) + self.data.as_mut_ptr().add(addr) } } diff --git a/src/postings/stacker/murmurhash2.rs b/src/postings/stacker/murmurhash2.rs index 729819be8..68e22e6c3 100644 --- a/src/postings/stacker/murmurhash2.rs +++ b/src/postings/stacker/murmurhash2.rs @@ -4,6 +4,7 @@ const M: u32 = 0x5bd1_e995; #[inline(always)] pub fn murmurhash2(key: &[u8]) -> u32 { + #[cfg_attr(feature="cargo-clippy", allow(clippy::cast_ptr_alignment))] let mut key_ptr: *const u32 = key.as_ptr() as *const u32; let len = key.len() as u32; let mut h: u32 = SEED ^ len; diff --git a/src/postings/stacker/term_hashmap.rs b/src/postings/stacker/term_hashmap.rs index 6e3625d5d..47ee3d5c7 100644 --- a/src/postings/stacker/term_hashmap.rs +++ b/src/postings/stacker/term_hashmap.rs @@ -61,7 +61,7 @@ impl Default for KeyValue { } impl KeyValue { - fn is_empty(&self) -> bool { + fn is_empty(self) -> bool { self.key_value_addr.is_null() } } diff --git a/src/query/all_query.rs b/src/query/all_query.rs index bfc1fddbf..4f5490ab1 100644 --- a/src/query/all_query.rs +++ b/src/query/all_query.rs @@ -59,10 +59,10 @@ impl DocSet for AllScorer { } } if self.doc < self.max_doc { - return true; + true } else { self.state = State::Finished; - return false; + false } } diff --git a/src/query/bm25.rs b/src/query/bm25.rs index 1fc6087ed..4a3a25590 100644 --- a/src/query/bm25.rs +++ b/src/query/bm25.rs @@ -17,9 +17,9 @@ fn cached_tf_component(fieldnorm: u32, average_fieldnorm: f32) -> f32 { fn compute_tf_cache(average_fieldnorm: f32) -> [f32; 256] { let mut cache = [0f32; 256]; - for fieldnorm_id in 0..256 { + for (fieldnorm_id, cache_mut) in cache.iter_mut().enumerate() { let fieldnorm = FieldNormReader::id_to_fieldnorm(fieldnorm_id as u8); - cache[fieldnorm_id] = cached_tf_component(fieldnorm, average_fieldnorm); + *cache_mut = cached_tf_component(fieldnorm, average_fieldnorm); } cache } @@ -54,7 +54,7 @@ impl BM25Weight { for segment_reader in searcher.segment_readers() { let inverted_index = segment_reader.inverted_index(field); total_num_tokens += inverted_index.total_num_tokens(); - total_num_docs += segment_reader.max_doc() as u64; + total_num_docs += u64::from(segment_reader.max_doc()); } let average_fieldnorm = total_num_tokens as f32 / total_num_docs as f32; diff --git a/src/query/boolean_query/boolean_query.rs b/src/query/boolean_query/boolean_query.rs index 0a01d9977..353c89806 100644 --- a/src/query/boolean_query/boolean_query.rs +++ b/src/query/boolean_query/boolean_query.rs @@ -28,7 +28,7 @@ impl Clone for BooleanQuery { fn clone(&self) -> Self { self.subqueries .iter() - .map(|(x, y)| (x.clone(), y.box_clone())) + .map(|(occur, subquery)| (*occur, subquery.box_clone())) .collect::>() .into() } diff --git a/src/query/boolean_query/boolean_weight.rs b/src/query/boolean_query/boolean_weight.rs index 575bc2991..edd8fecae 100644 --- a/src/query/boolean_query/boolean_weight.rs +++ b/src/query/boolean_query/boolean_weight.rs @@ -39,7 +39,7 @@ where } let scorer: Box = Box::new(Union::<_, TScoreCombiner>::from(scorers)); - return scorer; + scorer } pub struct BooleanWeight { diff --git a/src/query/fuzzy_query.rs b/src/query/fuzzy_query.rs index 7c3c6ad08..5253fa80c 100644 --- a/src/query/fuzzy_query.rs +++ b/src/query/fuzzy_query.rs @@ -10,7 +10,7 @@ lazy_static! { let mut lev_builder_cache = HashMap::new(); // TODO make population lazy on a `(distance, val)` basis for distance in 0..3 { - for &transposition in [false, true].iter() { + for &transposition in &[false, true] { let lev_automaton_builder = LevenshteinAutomatonBuilder::new(distance, transposition); lev_builder_cache.insert((distance, transposition), lev_automaton_builder); } diff --git a/src/query/intersection.rs b/src/query/intersection.rs index 02c40e169..e38d32ec7 100644 --- a/src/query/intersection.rs +++ b/src/query/intersection.rs @@ -26,10 +26,11 @@ pub fn intersect_scorers(mut scorers: Vec>) -> Box { (Some(single_docset), None) => single_docset, (Some(left), Some(right)) => { { - if [&left, &right].into_iter().all(|scorer| { + let all_term_scorers = [&left, &right].into_iter().all(|scorer| { let scorer_ref: &Scorer = (*scorer).borrow(); Downcast::::is_type(scorer_ref) - }) { + }); + if all_term_scorers { let left = *Downcast::::downcast(left).unwrap(); let right = *Downcast::::downcast(right).unwrap(); return Box::new(Intersection { @@ -40,12 +41,12 @@ pub fn intersect_scorers(mut scorers: Vec>) -> Box { }); } } - return Box::new(Intersection { + Box::new(Intersection { left, right, others: scorers, num_docsets, - }); + }) } _ => { unreachable!(); @@ -99,7 +100,7 @@ impl Intersection } impl DocSet for Intersection { - #[allow(never_loop)] + #[cfg_attr(feature = "cargo-clippy", allow(clippy::never_loop))] fn advance(&mut self) -> bool { let (left, right) = (&mut self.left, &mut self.right); diff --git a/src/query/occur.rs b/src/query/occur.rs index e91e3a2b6..96ff9018c 100644 --- a/src/query/occur.rs +++ b/src/query/occur.rs @@ -18,8 +18,8 @@ impl Occur { /// - `Should` => '?', /// - `Must` => '+' /// - `Not` => '-' - pub fn to_char(&self) -> char { - match *self { + pub fn to_char(self) -> char { + match self { Occur::Should => '?', Occur::Must => '+', Occur::MustNot => '-', diff --git a/src/query/query_parser/query_grammar.rs b/src/query/query_parser/query_grammar.rs index 6df6f9144..3a3ac4256 100644 --- a/src/query/query_parser/query_grammar.rs +++ b/src/query/query_parser/query_grammar.rs @@ -1,3 +1,6 @@ +#![cfg_attr(feature = "cargo-clippy", allow(clippy::unneeded_field_pattern))] +#![cfg_attr(feature = "cargo-clippy", allow(clippy::toplevel_ref_arg))] + use super::user_input_ast::*; use combine::char::*; use combine::error::StreamError; @@ -110,12 +113,11 @@ parser! { .or((char('('), parse_to_ast(), char(')')).map(|(_, expr, _)| expr)) .or(char('*').map(|_| UserInputAST::from(UserInputLeaf::All) )) .or(try( - (string("NOT"), spaces1(), leaf()).map(|(_, _, expr)| expr.unary(Occur::MustNot))) + (string("NOT"), spaces1(), leaf()).map(|(_, _, expr)| expr.unary(Occur::MustNot)) + ) ) - .or( - try( - range() - .map(|leaf| UserInputAST::from(leaf)) + .or(try( + range().map(UserInputAST::from) ) ) .or(literal().map(|leaf| UserInputAST::Leaf(Box::new(leaf)))) @@ -189,7 +191,7 @@ parser! { } else { let conjunctions = fnd .into_iter() - .map(|conjunction| UserInputAST::and(conjunction)) + .map(UserInputAST::and) .collect(); UserInputAST::or(conjunctions) } diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index a8aec4f56..fcfa19345 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -1,3 +1,5 @@ +#![cfg_attr(feature = "cargo-clippy", allow(clippy::unneeded_field_pattern))] + use super::logical_ast::*; use super::query_grammar::parse_to_ast; use super::user_input_ast::*; diff --git a/src/query/query_parser/user_input_ast.rs b/src/query/query_parser/user_input_ast.rs index 40cdd2424..52ab4d293 100644 --- a/src/query/query_parser/user_input_ast.rs +++ b/src/query/query_parser/user_input_ast.rs @@ -22,7 +22,7 @@ impl Debug for UserInputLeaf { ref lower, ref upper, } => { - if let &Some(ref field) = field { + if let Some(ref field) = field { write!(formatter, "{}:", field)?; } lower.display_lower(formatter)?; diff --git a/src/query/regex_query.rs b/src/query/regex_query.rs index 64d21395f..dcdd9bdff 100644 --- a/src/query/regex_query.rs +++ b/src/query/regex_query.rs @@ -82,7 +82,7 @@ impl RegexQuery { let automaton = Regex::new(&self.regex_pattern) .map_err(|_| TantivyError::InvalidArgument(self.regex_pattern.clone()))?; - Ok(AutomatonWeight::new(self.field.clone(), automaton)) + Ok(AutomatonWeight::new(self.field, automaton)) } } diff --git a/src/query/union.rs b/src/query/union.rs index 58d5de242..b4a7441a3 100644 --- a/src/query/union.rs +++ b/src/query/union.rs @@ -215,6 +215,7 @@ where // The target is outside of the buffered horizon. // advance all docsets to a doc >= to the target. + #[cfg_attr(feature = "cargo-clippy", allow(clippy::clippy::collapsible_if))] unordered_drain_filter(&mut self.docsets, |docset| { if docset.doc() < target { if docset.skip_next(target) == SkipResult::End { diff --git a/src/schema/document.rs b/src/schema/document.rs index c6a508a94..7254c9660 100644 --- a/src/schema/document.rs +++ b/src/schema/document.rs @@ -113,7 +113,7 @@ impl Document { .into_iter() .group_by(|field_value| field_value.field()) .into_iter() - .map(|(key, group)| (key, group.into_iter().collect())) + .map(|(key, group)| (key, group.collect())) .collect::)>>() } diff --git a/src/schema/facet.rs b/src/schema/facet.rs index 429766c85..6a34e8d42 100644 --- a/src/schema/facet.rs +++ b/src/schema/facet.rs @@ -97,7 +97,7 @@ impl Facet { } /// Returns `true` iff other is a subfacet of `self`. - #[allow(collapsible_if)] + #[cfg_attr(feature = "cargo-clippy", allow(clippy::collapsible_if))] pub fn is_prefix_of(&self, other: &Facet) -> bool { let self_bytes: &[u8] = self.encoded_bytes(); let other_bytes: &[u8] = other.encoded_bytes(); diff --git a/src/schema/index_record_option.rs b/src/schema/index_record_option.rs index bd2deaaac..4a595a8f0 100644 --- a/src/schema/index_record_option.rs +++ b/src/schema/index_record_option.rs @@ -30,16 +30,16 @@ pub enum IndexRecordOption { impl IndexRecordOption { /// Returns true iff the term frequency will be encoded. - pub fn is_termfreq_enabled(&self) -> bool { - match *self { + pub fn is_termfreq_enabled(self) -> bool { + match self { IndexRecordOption::WithFreqsAndPositions | IndexRecordOption::WithFreqs => true, _ => false, } } /// Returns true iff the term positions within the document are stored as well. - pub fn is_position_enabled(&self) -> bool { - match *self { + pub fn is_position_enabled(self) -> bool { + match self { IndexRecordOption::WithFreqsAndPositions => true, _ => false, } @@ -47,8 +47,8 @@ impl IndexRecordOption { /// Returns true iff this option includes encoding /// term frequencies. - pub fn has_freq(&self) -> bool { - match *self { + pub fn has_freq(self) -> bool { + match self { IndexRecordOption::Basic => false, IndexRecordOption::WithFreqs | IndexRecordOption::WithFreqsAndPositions => true, } @@ -56,8 +56,8 @@ impl IndexRecordOption { /// Returns true iff this option include encoding /// term positions. - pub fn has_positions(&self) -> bool { - match *self { + pub fn has_positions(self) -> bool { + match self { IndexRecordOption::Basic | IndexRecordOption::WithFreqs => false, IndexRecordOption::WithFreqsAndPositions => true, } diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 3957417e6..4f33dc39e 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -42,7 +42,7 @@ impl FragmentCandidate { fn new(start_offset: usize) -> FragmentCandidate { FragmentCandidate { score: 0.0, - start_offset: start_offset, + start_offset, stop_offset: start_offset, num_chars: 0, highlighted: vec![], @@ -181,7 +181,7 @@ fn select_best_fragment_combination<'a>( .collect(); Snippet { fragments: fragment_text.to_string(), - highlighted: highlighted, + highlighted, } } else { // when there no fragments to chose from, diff --git a/src/store/reader.rs b/src/store/reader.rs index 5f02825e3..bdf02f00c 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -89,7 +89,7 @@ impl StoreReader { } } -#[allow(needless_pass_by_value)] +#[cfg_attr(feature = "cargo-clippy", allow(clippy::needless_pass_by_value))] fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId) { let data_len = data.len(); let footer_offset = data_len - size_of::() - size_of::(); diff --git a/src/termdict/merger.rs b/src/termdict/merger.rs index ef0959cf7..407a49e90 100644 --- a/src/termdict/merger.rs +++ b/src/termdict/merger.rs @@ -81,7 +81,7 @@ impl<'a> TermMerger<'a> { /// Advance the term iterator to the next term. /// Returns true if there is indeed another term /// False if there is none. - #[allow(while_let_loop)] + #[cfg_attr(feature = "cargo-clippy", allow(clippy::while_let_loop))] pub fn advance(&mut self) -> bool { self.advance_segments(); if let Some(head) = self.heap.pop() { @@ -123,7 +123,7 @@ impl<'a> TermMerger<'a> { } /// Iterates through terms - #[allow(should_implement_trait)] + #[cfg_attr(feature = "cargo-clippy", allow(clippy::should_implement_trait))] pub fn next(&mut self) -> Option> { if self.advance() { Some(Term::wrap(self.current_streamers[0].streamer.key())) diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index d403314fa..5ae259425 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -66,7 +66,7 @@ mod tests { let write = directory.open_write(&path).unwrap(); let field_type = FieldType::Str(TEXT); let mut term_dictionary_builder = - TermDictionaryBuilder::new(write, field_type).unwrap(); + TermDictionaryBuilder::new(write, &field_type).unwrap(); for term in COUNTRIES.iter() { term_dictionary_builder .insert(term.as_bytes(), &make_term_info(0u64)) @@ -92,7 +92,7 @@ mod tests { let write = directory.open_write(&path).unwrap(); let field_type = FieldType::Str(TEXT); let mut term_dictionary_builder = - TermDictionaryBuilder::new(write, field_type).unwrap(); + TermDictionaryBuilder::new(write, &field_type).unwrap(); term_dictionary_builder .insert("abc".as_bytes(), &make_term_info(34u64)) .unwrap(); @@ -180,7 +180,7 @@ mod tests { let field_type = FieldType::Str(TEXT); let buffer: Vec = { let mut term_dictionary_builder = - TermDictionaryBuilder::new(vec![], field_type).unwrap(); + TermDictionaryBuilder::new(vec![], &field_type).unwrap(); for &(ref id, ref i) in &ids { term_dictionary_builder .insert(id.as_bytes(), &make_term_info(*i as u64)) @@ -210,7 +210,7 @@ mod tests { let field_type = FieldType::Str(TEXT); let buffer: Vec = { let mut term_dictionary_builder = - TermDictionaryBuilder::new(vec![], field_type).unwrap(); + TermDictionaryBuilder::new(vec![], &field_type).unwrap(); // term requires more than 16bits term_dictionary_builder .insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1)) @@ -245,7 +245,7 @@ mod tests { let field_type = FieldType::Str(TEXT); let buffer: Vec = { let mut term_dictionary_builder = - TermDictionaryBuilder::new(vec![], field_type).unwrap(); + TermDictionaryBuilder::new(vec![], &field_type).unwrap(); for &(ref id, ref i) in &ids { term_dictionary_builder .insert(id.as_bytes(), &make_term_info(*i as u64)) @@ -314,7 +314,7 @@ mod tests { let field_type = FieldType::Str(TEXT); let buffer: Vec = { let mut term_dictionary_builder = - TermDictionaryBuilder::new(vec![], field_type).unwrap(); + TermDictionaryBuilder::new(vec![], &field_type).unwrap(); term_dictionary_builder .insert(&[], &make_term_info(1 as u64)) .unwrap(); @@ -338,7 +338,7 @@ mod tests { let field_type = FieldType::Str(TEXT); let buffer: Vec = { let mut term_dictionary_builder = - TermDictionaryBuilder::new(vec![], field_type).unwrap(); + TermDictionaryBuilder::new(vec![], &field_type).unwrap(); for i in 0u8..10u8 { let number_arr = [i; 1]; term_dictionary_builder @@ -408,7 +408,7 @@ mod tests { let write = directory.open_write(&path).unwrap(); let field_type = FieldType::Str(TEXT); let mut term_dictionary_builder = - TermDictionaryBuilder::new(write, field_type).unwrap(); + TermDictionaryBuilder::new(write, &field_type).unwrap(); for term in COUNTRIES.iter() { term_dictionary_builder .insert(term.as_bytes(), &make_term_info(0u64)) diff --git a/src/termdict/term_info_store.rs b/src/termdict/term_info_store.rs index 98947feb5..8b6a0159b 100644 --- a/src/termdict/term_info_store.rs +++ b/src/termdict/term_info_store.rs @@ -69,7 +69,6 @@ impl TermInfoBlockMeta { cursor += self.postings_offset_nbits as usize; let positions_idx = extract_bits(data, cursor, self.positions_idx_nbits); - self.positions_idx_nbits as usize; TermInfo { doc_freq, @@ -91,8 +90,10 @@ fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 { let bit_shift = (addr_bits % 8) as u64; assert!(data.len() >= addr_byte + 7); let val_unshifted_unmasked: u64 = unsafe { - // ok thanks to the 7 byte padding on `.close` - let addr = data.as_ptr().offset(addr_byte as isize) as *const u64; + // ok because the pointer is only accessed using `ptr::read_unaligned` + #[cfg_attr(feature="cargo-clippy", allow(clippy::cast_ptr_alignment))] + let addr = data.as_ptr().add(addr_byte) as *const u64; + // ok thanks to the 7 byte padding ptr::read_unaligned(addr) }; let val_shifted_unmasked = val_unshifted_unmasked >> bit_shift; diff --git a/src/termdict/termdict.rs b/src/termdict/termdict.rs index 03738e694..99bea0b09 100644 --- a/src/termdict/termdict.rs +++ b/src/termdict/termdict.rs @@ -29,7 +29,7 @@ where W: Write, { /// Creates a new `TermDictionaryBuilder` - pub fn new(w: W, _field_type: FieldType) -> io::Result { + pub fn new(w: W, _field_type: &FieldType) -> io::Result { let fst_builder = fst::MapBuilder::new(w).map_err(convert_fst_error)?; Ok(TermDictionaryBuilder { fst_builder, @@ -129,9 +129,9 @@ impl TermDictionary { } /// Creates an empty term dictionary which contains no terms. - pub fn empty(field_type: FieldType) -> Self { + pub fn empty(field_type: &FieldType) -> Self { let term_dictionary_data: Vec = - TermDictionaryBuilder::new(Vec::::new(), field_type) + TermDictionaryBuilder::new(Vec::::new(), &field_type) .expect("Creating a TermDictionaryBuilder in a Vec should never fail") .finish() .expect("Writing in a Vec should never fail"); @@ -193,12 +193,12 @@ impl TermDictionary { /// Returns a range builder, to stream all of the terms /// within an interval. - pub fn range<'a>(&'a self) -> TermStreamerBuilder<'a> { + pub fn range(&self) -> TermStreamerBuilder { TermStreamerBuilder::new(self, self.fst_index.range()) } /// A stream of all the sorted terms. [See also `.stream_field()`](#method.stream_field) - pub fn stream<'a>(&'a self) -> TermStreamer<'a> { + pub fn stream(&self) -> TermStreamer { self.range().into_stream() } From 10f6c07c53001d617766b7be2e714c798727c3ba Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 15 Sep 2018 20:20:22 +0900 Subject: [PATCH 54/62] Clippy (#422) * Cargo Format * Clippy --- examples/basic_search.rs | 2 +- examples/custom_tokenizer.rs | 2 +- examples/deleting_updating_documents.rs | 2 +- examples/snippet.rs | 2 +- examples/stop_words.rs | 2 +- src/collector/facet_collector.rs | 18 ++++------ src/collector/top_collector.rs | 3 +- src/common/bitpacker.rs | 2 ++ src/common/bitset.rs | 4 +-- src/core/inverted_index_reader.rs | 6 +++- src/core/searcher.rs | 11 +++--- src/core/segment_reader.rs | 12 +++---- src/directory/ram_directory.rs | 6 ++-- src/error.rs | 12 +++---- src/fastfield/bytes/writer.rs | 2 +- src/fastfield/reader.rs | 4 +-- src/fieldnorm/code.rs | 34 +++++++++--------- src/indexer/delete_queue.rs | 20 ++++++----- src/indexer/index_writer.rs | 29 +++++++-------- src/indexer/merger.rs | 45 +++++++++++------------- src/indexer/segment_updater.rs | 3 +- src/indexer/segment_writer.rs | 34 +++++++++--------- src/lib.rs | 16 ++++----- src/positions/reader.rs | 3 +- src/postings/postings_writer.rs | 6 ++-- src/postings/segment_postings.rs | 7 ++-- src/postings/stacker/murmurhash2.rs | 2 +- src/query/bm25.rs | 3 +- src/query/boolean_query/boolean_query.rs | 6 ++-- src/query/phrase_query/phrase_scorer.rs | 3 +- src/query/query_parser/query_parser.rs | 29 ++++++++------- src/query/range_query.rs | 12 +++---- src/query/union.rs | 8 +++-- src/schema/facet.rs | 10 ++---- src/schema/schema.rs | 3 +- src/snippet/mod.rs | 22 +++++------- src/store/reader.rs | 5 ++- src/store/writer.rs | 3 +- src/termdict/merger.rs | 8 +++-- src/termdict/mod.rs | 16 ++++----- src/termdict/streamer.rs | 4 +++ src/termdict/term_info_store.rs | 2 +- src/termdict/termdict.rs | 7 ++-- src/tokenizer/stemmer.rs | 2 ++ 44 files changed, 208 insertions(+), 224 deletions(-) diff --git a/examples/basic_search.rs b/examples/basic_search.rs index 1aba7bf3f..00576be51 100644 --- a/examples/basic_search.rs +++ b/examples/basic_search.rs @@ -230,7 +230,7 @@ fn main() -> tantivy::Result<()> { // a title. for doc_address in doc_addresses { - let retrieved_doc = searcher.doc(&doc_address)?; + let retrieved_doc = searcher.doc(doc_address)?; println!("{}", schema.to_json(&retrieved_doc)); } diff --git a/examples/custom_tokenizer.rs b/examples/custom_tokenizer.rs index 7c5299e00..08236c0e5 100644 --- a/examples/custom_tokenizer.rs +++ b/examples/custom_tokenizer.rs @@ -109,7 +109,7 @@ fn main() -> tantivy::Result<()> { let doc_addresses = top_collector.docs(); for doc_address in doc_addresses { - let retrieved_doc = searcher.doc(&doc_address)?; + let retrieved_doc = searcher.doc(doc_address)?; println!("{}", schema.to_json(&retrieved_doc)); } diff --git a/examples/deleting_updating_documents.rs b/examples/deleting_updating_documents.rs index de0603392..afae85685 100644 --- a/examples/deleting_updating_documents.rs +++ b/examples/deleting_updating_documents.rs @@ -31,7 +31,7 @@ fn extract_doc_given_isbn(index: &Index, isbn_term: &Term) -> tantivy::Result tantivy::Result<()> { let doc_addresses = top_collector.docs(); for doc_address in doc_addresses { - let doc = searcher.doc(&doc_address)?; + let doc = searcher.doc(doc_address)?; let snippet = snippet_generator.snippet_from_doc(&doc); println!("title: {}", doc.get_first(title).unwrap().text().unwrap()); println!("snippet: {}", snippet.to_html()); diff --git a/examples/stop_words.rs b/examples/stop_words.rs index 8945f8614..80e78ece2 100644 --- a/examples/stop_words.rs +++ b/examples/stop_words.rs @@ -113,7 +113,7 @@ fn main() -> tantivy::Result<()> { let doc_addresses = top_collector.docs(); for doc_address in doc_addresses { - let retrieved_doc = searcher.doc(&doc_address)?; + let retrieved_doc = searcher.doc(doc_address)?; println!("{}", schema.to_json(&retrieved_doc)); } diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs index 8e1c95876..a092a8dae 100644 --- a/src/collector/facet_collector.rs +++ b/src/collector/facet_collector.rs @@ -377,10 +377,8 @@ impl FacetCollector { } else { collapsed_facet_counts[seg_ord][collapsed_term_id] } - }) - .unwrap_or(0) - }) - .sum(); + }).unwrap_or(0) + }).sum(); if count > 0u64 { let bytes: Vec = facet_merger.key().to_owned(); // may create an corrupted facet if the term dicitonary is corrupted @@ -529,8 +527,7 @@ mod tests { n /= 4; let leaf = n % 5; Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf)) - }) - .collect(); + }).collect(); for i in 0..num_facets * 10 { let mut doc = Document::new(); doc.add_facet(facet_field, facets[i % num_facets].clone()); @@ -557,7 +554,8 @@ mod tests { ("/top1/mid1", 50), ("/top1/mid2", 50), ("/top1/mid3", 50), - ].iter() + ] + .iter() .map(|&(facet_str, count)| (String::from(facet_str), count)) .collect::>() ); @@ -621,15 +619,13 @@ mod tests { let facet = Facet::from(&format!("/facet/{}", c)); let doc = doc!(facet_field => facet); iter::repeat(doc).take(count) - }) - .map(|mut doc| { + }).map(|mut doc| { doc.add_facet( facet_field, &format!("/facet/{}", thread_rng().sample(&uniform)), ); doc - }) - .collect(); + }).collect(); thread_rng().shuffle(&mut docs[..]); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); diff --git a/src/collector/top_collector.rs b/src/collector/top_collector.rs index 6cb61e8b2..265a6981a 100644 --- a/src/collector/top_collector.rs +++ b/src/collector/top_collector.rs @@ -91,8 +91,7 @@ impl TopCollector { feature, doc_address, }| (feature, doc_address), - ) - .collect() + ).collect() } /// Return true iff at least K documents have gone through diff --git a/src/common/bitpacker.rs b/src/common/bitpacker.rs index 4945796b0..593e36fb8 100644 --- a/src/common/bitpacker.rs +++ b/src/common/bitpacker.rs @@ -102,6 +102,7 @@ where addr + 8 <= data.len(), "The fast field field should have been padded with 7 bytes." ); + #[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))] let val_unshifted_unmasked: u64 = u64::from_le(unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) }); let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64; @@ -125,6 +126,7 @@ where for output_val in output.iter_mut() { let addr = addr_in_bits >> 3; let bit_shift = addr_in_bits & 7; + #[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))] let val_unshifted_unmasked: u64 = unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) }; let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64; diff --git a/src/common/bitset.rs b/src/common/bitset.rs index 750d835f8..a125f4cbc 100644 --- a/src/common/bitset.rs +++ b/src/common/bitset.rs @@ -77,7 +77,7 @@ impl TinySet { /// Returns true iff the `TinySet` is empty. #[inline(always)] - pub fn is_empty(&self) -> bool { + pub fn is_empty(self) -> bool { self.0 == 0u64 } @@ -114,7 +114,7 @@ impl TinySet { self.0 = 0u64; } - pub fn len(&self) -> u32 { + pub fn len(self) -> u32 { self.0.count_ones() } } diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs index 9ae0dbf43..ba9d77c70 100644 --- a/src/core/inverted_index_reader.rs +++ b/src/core/inverted_index_reader.rs @@ -32,6 +32,10 @@ pub struct InvertedIndexReader { } impl InvertedIndexReader { + #[cfg_attr( + feature = "cargo-clippy", + allow(clippy::needless_pass_by_value) + )] // for symetry pub(crate) fn new( termdict: TermDictionary, postings_source: ReadOnlySource, @@ -54,7 +58,7 @@ impl InvertedIndexReader { /// Creates an empty `InvertedIndexReader` object, which /// contains no terms at all. - pub fn empty(field_type: FieldType) -> InvertedIndexReader { + pub fn empty(field_type: &FieldType) -> InvertedIndexReader { let record_option = field_type .get_index_record_option() .unwrap_or(IndexRecordOption::Basic); diff --git a/src/core/searcher.rs b/src/core/searcher.rs index cbe549062..64e5263ee 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -46,8 +46,8 @@ impl Searcher { /// /// The searcher uses the segment ordinal to route the /// the request to the right `Segment`. - pub fn doc(&self, doc_address: &DocAddress) -> Result { - let DocAddress(segment_local_id, doc_id) = *doc_address; + pub fn doc(&self, doc_address: DocAddress) -> Result { + let DocAddress(segment_local_id, doc_id) = doc_address; let segment_reader = &self.segment_readers[segment_local_id as usize]; segment_reader.doc(doc_id) } @@ -61,7 +61,7 @@ impl Searcher { pub fn num_docs(&self) -> u64 { self.segment_readers .iter() - .map(|segment_reader| segment_reader.num_docs() as u64) + .map(|segment_reader| u64::from(segment_reader.num_docs())) .sum::() } @@ -70,8 +70,9 @@ impl Searcher { pub fn doc_freq(&self, term: &Term) -> u64 { self.segment_readers .iter() - .map(|segment_reader| segment_reader.inverted_index(term.field()).doc_freq(term) as u64) - .sum::() + .map(|segment_reader| { + u64::from(segment_reader.inverted_index(term.field()).doc_freq(term)) + }).sum::() } /// Return the list of segment readers diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 03cfdf08d..7cf395c9f 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -153,8 +153,8 @@ impl SegmentReader { /// Accessor to the `BytesFastFieldReader` associated to a given `Field`. pub fn bytes_fast_field_reader(&self, field: Field) -> fastfield::Result { let field_entry = self.schema.get_field_entry(field); - match field_entry.field_type() { - &FieldType::Bytes => {} + match *field_entry.field_type() { + FieldType::Bytes => {} _ => return Err(FastFieldNotAvailableError::new(field_entry)), } let idx_reader = self @@ -177,7 +177,7 @@ impl SegmentReader { "The field {:?} is not a \ hierarchical facet.", field_entry - )).into()); + ))); } let term_ords_reader = self.multi_fast_field_reader(field)?; let termdict_source = self.termdict_composite.open_read(field).ok_or_else(|| { @@ -188,7 +188,7 @@ impl SegmentReader { field_entry.name() )) })?; - let termdict = TermDictionary::from_source(termdict_source); + let termdict = TermDictionary::from_source(&termdict_source); let facet_reader = FacetReader::new(term_ords_reader, termdict); Ok(facet_reader) } @@ -312,7 +312,7 @@ impl SegmentReader { // As a result, no data is associated to the inverted index. // // Returns an empty inverted index. - return Arc::new(InvertedIndexReader::empty(field_type.clone())); + return Arc::new(InvertedIndexReader::empty(field_type)); } let postings_source = postings_source_opt.unwrap(); @@ -333,7 +333,7 @@ impl SegmentReader { .expect("Index corrupted. Failed to open field positions in composite file."); let inv_idx_reader = Arc::new(InvertedIndexReader::new( - TermDictionary::from_source(termdict_source), + TermDictionary::from_source(&termdict_source), postings_source, positions_source, positions_idx_source, diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index 2f1733e0f..ad79319e7 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -100,8 +100,7 @@ impl InnerDirectory { ); let io_err = make_io_err(msg); OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err)) - }) - .and_then(|readable_map| { + }).and_then(|readable_map| { readable_map .get(path) .ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path))) @@ -121,8 +120,7 @@ impl InnerDirectory { ); let io_err = make_io_err(msg); DeleteError::IOError(IOError::with_path(path.to_owned(), io_err)) - }) - .and_then(|mut writable_map| match writable_map.remove(path) { + }).and_then(|mut writable_map| match writable_map.remove(path) { Some(_) => Ok(()), None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))), }) diff --git a/src/error.rs b/src/error.rs index d7f0d1d1a..a84befbc8 100644 --- a/src/error.rs +++ b/src/error.rs @@ -84,9 +84,7 @@ impl From> for TantivyError { impl From for TantivyError { fn from(error: OpenReadError) -> TantivyError { match error { - OpenReadError::FileDoesNotExist(filepath) => { - TantivyError::PathDoesNotExist(filepath) - } + OpenReadError::FileDoesNotExist(filepath) => TantivyError::PathDoesNotExist(filepath), OpenReadError::IOError(io_error) => TantivyError::IOError(io_error), } } @@ -105,7 +103,7 @@ impl From for TantivyError { TantivyError::FileAlreadyExists(filepath) } OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error), - }.into() + } } } @@ -115,9 +113,9 @@ impl From for TantivyError { OpenDirectoryError::DoesNotExist(directory_path) => { TantivyError::PathDoesNotExist(directory_path) } - OpenDirectoryError::NotADirectory(directory_path) => TantivyError::InvalidArgument( - format!("{:?} is not a directory", directory_path), - ), + OpenDirectoryError::NotADirectory(directory_path) => { + TantivyError::InvalidArgument(format!("{:?} is not a directory", directory_path)) + } } } } diff --git a/src/fastfield/bytes/writer.rs b/src/fastfield/bytes/writer.rs index 568a5421f..472e8d682 100644 --- a/src/fastfield/bytes/writer.rs +++ b/src/fastfield/bytes/writer.rs @@ -51,7 +51,7 @@ impl BytesFastFieldWriter { self.next_doc(); for field_value in doc.field_values() { if field_value.field() == self.field { - if let &Value::Bytes(ref bytes) = field_value.value() { + if let Value::Bytes(ref bytes) = *field_value.value() { self.vals.extend_from_slice(bytes); } else { panic!( diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index f4b90ac8b..6df8e3775 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -11,7 +11,6 @@ use schema::SchemaBuilder; use schema::FAST; use std::collections::HashMap; use std::marker::PhantomData; -use std::mem; use std::path::Path; use DocId; @@ -80,7 +79,8 @@ impl FastFieldReader { // TODO change start to `u64`. // For multifastfield, start is an index in a second fastfield, not a `DocId` pub fn get_range(&self, start: u32, output: &mut [Item]) { - let output_u64: &mut [u64] = unsafe { mem::transmute(output) }; // ok: Item is either `u64` or `i64` + // ok: Item is either `u64` or `i64` + let output_u64: &mut [u64] = unsafe { &mut *(output as *mut [Item] as *mut [u64]) }; self.bit_unpacker.get_range(start, output_u64); for out in output_u64.iter_mut() { *out = Item::from_u64(*out + self.min_value_u64).as_u64(); diff --git a/src/fieldnorm/code.rs b/src/fieldnorm/code.rs index 230c0e743..3a62d18c2 100644 --- a/src/fieldnorm/code.rs +++ b/src/fieldnorm/code.rs @@ -15,23 +15,23 @@ pub const FIELD_NORMS_TABLE: [u32; 256] = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 44, 46, 48, 50, 52, 54, 56, 60, 64, 68, 72, 76, 80, 84, 88, 96, 104, 112, 120, 128, 136, 144, 152, 168, 184, 200, 216, 232, - 248, 264, 280, 312, 344, 376, 408, 440, 472, 504, 536, 600, 664, 728, 792, 856, 920, 984, 1_048, - 1176, 1304, 1432, 1560, 1688, 1816, 1944, 2072, 2328, 2584, 2840, 3096, 3352, 3608, 3864, 4120, - 4632, 5144, 5656, 6168, 6680, 7192, 7704, 8216, 9240, 10264, 11288, 12312, 13336, 14360, 15384, - 16408, 18456, 20504, 22552, 24600, 26648, 28696, 30744, 32792, 36888, 40984, 45080, 49176, - 53272, 57368, 61464, 65560, 73752, 81944, 90136, 98328, 106520, 114712, 122904, 131096, 147480, - 163864, 180248, 196632, 213016, 229400, 245784, 262168, 294936, 327704, 360472, 393240, 426008, - 458776, 491544, 524312, 589848, 655384, 720920, 786456, 851992, 917528, 983064, 1048600, - 1179672, 1310744, 1441816, 1572888, 1703960, 1835032, 1966104, 2097176, 2359320, 2621464, - 2883608, 3145752, 3407896, 3670040, 3932184, 4194328, 4718616, 5242904, 5767192, 6291480, - 6815768, 7340056, 7864344, 8388632, 9437208, 10485784, 11534360, 12582936, 13631512, 14680088, - 15728664, 16777240, 18874392, 20971544, 23068696, 25165848, 27263000, 29360152, 31457304, - 33554456, 37748760, 41943064, 46137368, 50331672, 54525976, 58720280, 62914584, 67108888, - 75497496, 83886104, 92274712, 100663320, 109051928, 117440536, 125829144, 134217752, 150994968, - 167772184, 184549400, 201326616, 218103832, 234881048, 251658264, 268435480, 301989912, - 335544344, 369098776, 402653208, 436207640, 469762072, 503316504, 536870936, 603979800, - 671088664, 738197528, 805306392, 872415256, 939524120, 1006632984, 1073741848, 1207959576, - 1342177304, 1476395032, 1610612760, 1744830488, 1879048216, 2013265944, + 248, 264, 280, 312, 344, 376, 408, 440, 472, 504, 536, 600, 664, 728, 792, 856, 920, 984, + 1_048, 1176, 1304, 1432, 1560, 1688, 1816, 1944, 2072, 2328, 2584, 2840, 3096, 3352, 3608, + 3864, 4120, 4632, 5144, 5656, 6168, 6680, 7192, 7704, 8216, 9240, 10264, 11288, 12312, 13336, + 14360, 15384, 16408, 18456, 20504, 22552, 24600, 26648, 28696, 30744, 32792, 36888, 40984, + 45080, 49176, 53272, 57368, 61464, 65560, 73752, 81944, 90136, 98328, 106520, 114712, 122904, + 131096, 147480, 163864, 180248, 196632, 213016, 229400, 245784, 262168, 294936, 327704, 360472, + 393240, 426008, 458776, 491544, 524312, 589848, 655384, 720920, 786456, 851992, 917528, 983064, + 1048600, 1179672, 1310744, 1441816, 1572888, 1703960, 1835032, 1966104, 2097176, 2359320, + 2621464, 2883608, 3145752, 3407896, 3670040, 3932184, 4194328, 4718616, 5242904, 5767192, + 6291480, 6815768, 7340056, 7864344, 8388632, 9437208, 10485784, 11534360, 12582936, 13631512, + 14680088, 15728664, 16777240, 18874392, 20971544, 23068696, 25165848, 27263000, 29360152, + 31457304, 33554456, 37748760, 41943064, 46137368, 50331672, 54525976, 58720280, 62914584, + 67108888, 75497496, 83886104, 92274712, 100663320, 109051928, 117440536, 125829144, 134217752, + 150994968, 167772184, 184549400, 201326616, 218103832, 234881048, 251658264, 268435480, + 301989912, 335544344, 369098776, 402653208, 436207640, 469762072, 503316504, 536870936, + 603979800, 671088664, 738197528, 805306392, 872415256, 939524120, 1006632984, 1073741848, + 1207959576, 1342177304, 1476395032, 1610612760, 1744830488, 1879048216, 2013265944, ]; #[cfg(test)] diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index 18eff3387..842b7a2f3 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -186,19 +186,21 @@ impl DeleteCursor { /// `opstamp >= target_opstamp`. pub fn skip_to(&mut self, target_opstamp: u64) { // TODO Can be optimize as we work with block. - #[cfg_attr(feature = "cargo-clippy", allow(clippy::while_let_loop))] - loop { - if let Some(operation) = self.get() { - if operation.opstamp >= target_opstamp { - break; - } - } else { - break; - } + while self.is_behind_opstamp(target_opstamp) { self.advance(); } } + #[cfg_attr( + feature = "cargo-clippy", + allow(clippy::wrong_self_convention) + )] + fn is_behind_opstamp(&mut self, target_opstamp: u64) -> bool { + self.get() + .map(|operation| operation.opstamp < target_opstamp) + .unwrap_or(false) + } + /// If the current block has been entirely /// consumed, try to load the next one. /// diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 32017e8f5..66de84c16 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -56,11 +56,12 @@ fn initial_table_size(per_thread_memory_budget: usize) -> usize { (1..) .take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit) .last() - .expect(&format!( - "Per thread memory is too small: {}", - per_thread_memory_budget - )) - .min(19) // we cap it at 512K + .unwrap_or_else(|| { + panic!( + "Per thread memory is too small: {}", + per_thread_memory_budget + ) + }).min(19) // we cap it at 512K } /// `IndexWriter` is the user entry-point to add document to an index. @@ -300,9 +301,7 @@ fn index_documents( let last_docstamp: u64 = *(doc_opstamps.last().unwrap()); - let segment_entry: SegmentEntry; - - if delete_cursor.get().is_some() { + let segment_entry: SegmentEntry = if delete_cursor.get().is_some() { let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps); let segment_reader = SegmentReader::open(segment)?; let mut deleted_bitset = BitSet::with_capacity(num_docs as usize); @@ -313,18 +312,18 @@ fn index_documents( &doc_to_opstamps, last_docstamp, )?; - segment_entry = SegmentEntry::new(segment_meta, delete_cursor, { + SegmentEntry::new(segment_meta, delete_cursor, { if may_have_deletes { Some(deleted_bitset) } else { None } - }); + }) } else { // if there are no delete operation in the queue, no need // to even open the segment. - segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None); - } + SegmentEntry::new(segment_meta, delete_cursor, None) + }; Ok(segment_updater.add_segment(generation, segment_entry)) } @@ -391,11 +390,9 @@ impl IndexWriter { .name(format!( "indexing thread {} for gen {}", self.worker_id, generation - )) - .spawn(move || { + )).spawn(move || { loop { - let mut document_iterator = - document_receiver_clone.clone().into_iter().peekable(); + let mut document_iterator = document_receiver_clone.clone().peekable(); // the peeking here is to avoid // creating a new segment's files diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 87158a947..a42ea6d44 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -40,15 +40,13 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 { total_tokens += reader.inverted_index(field).total_num_tokens(); } } - total_tokens - + count - .iter() - .cloned() - .enumerate() - .map(|(fieldnorm_ord, count)| { - count as u64 * FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8) as u64 - }) - .sum::() + total_tokens + count + .iter() + .cloned() + .enumerate() + .map(|(fieldnorm_ord, count)| { + count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8)) + }).sum::() } pub struct IndexMerger { @@ -111,7 +109,7 @@ impl TermOrdinalMapping { .iter() .flat_map(|term_ordinals| term_ordinals.iter().cloned().max()) .max() - .unwrap_or(TermOrdinal::default()) + .unwrap_or_else(TermOrdinal::default) } } @@ -190,7 +188,7 @@ impl IndexMerger { `term_ordinal_mapping`."); self.write_hierarchical_facet_field( field, - term_ordinal_mapping, + &term_ordinal_mapping, fast_field_serializer, )?; } @@ -314,7 +312,7 @@ impl IndexMerger { fn write_hierarchical_facet_field( &self, field: Field, - term_ordinal_mappings: TermOrdinalMapping, + term_ordinal_mappings: &TermOrdinalMapping, fast_field_serializer: &mut FastFieldSerializer, ) -> Result<()> { // Multifastfield consists in 2 fastfields. @@ -393,8 +391,8 @@ impl IndexMerger { // We can now initialize our serializer, and push it the different values { - let mut serialize_vals = - fast_field_serializer.new_u64_fast_field_with_idx(field, min_value, max_value, 1)?; + let mut serialize_vals = fast_field_serializer + .new_u64_fast_field_with_idx(field, min_value, max_value, 1)?; for reader in &self.readers { let ff_reader: MultiValueIntFastFieldReader = reader.multi_fast_field_reader(field)?; @@ -525,8 +523,7 @@ impl IndexMerger { } } None - }) - .collect(); + }).collect(); // At this point, `segment_postings` contains the posting list // of all of the segments containing the given term. @@ -667,8 +664,7 @@ mod tests { TextFieldIndexing::default() .set_tokenizer("default") .set_index_option(IndexRecordOption::WithFreqs), - ) - .set_stored(); + ).set_stored(); let text_field = schema_builder.add_text_field("text", text_fieldtype); let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue); let score_field = schema_builder.add_u64_field("score", score_fieldtype); @@ -770,23 +766,23 @@ mod tests { ); } { - let doc = searcher.doc(&DocAddress(0, 0)).unwrap(); + let doc = searcher.doc(DocAddress(0, 0)).unwrap(); assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b")); } { - let doc = searcher.doc(&DocAddress(0, 1)).unwrap(); + let doc = searcher.doc(DocAddress(0, 1)).unwrap(); assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c")); } { - let doc = searcher.doc(&DocAddress(0, 2)).unwrap(); + let doc = searcher.doc(DocAddress(0, 2)).unwrap(); assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c d")); } { - let doc = searcher.doc(&DocAddress(0, 3)).unwrap(); + let doc = searcher.doc(DocAddress(0, 3)).unwrap(); assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b")); } { - let doc = searcher.doc(&DocAddress(0, 4)).unwrap(); + let doc = searcher.doc(DocAddress(0, 4)).unwrap(); assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c g")); } { @@ -822,8 +818,7 @@ mod tests { let text_fieldtype = schema::TextOptions::default() .set_indexing_options( TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs), - ) - .set_stored(); + ).set_stored(); let text_field = schema_builder.add_text_field("text", text_fieldtype); let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue); let score_field = schema_builder.add_u64_field("score", score_fieldtype); diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 2f1aab70c..1b2cd7c85 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -336,8 +336,7 @@ impl SegmentUpdater { .unwrap() .remove(&merging_thread_id); Ok(()) - }) - .expect("Failed to spawn a thread."); + }).expect("Failed to spawn a thread."); self.0 .merging_threads .write() diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 9627d60ad..ce4b1eb68 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -49,20 +49,20 @@ impl SegmentWriter { ) -> Result { let segment_serializer = SegmentSerializer::for_segment(&mut segment)?; let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits); - let tokenizers = schema - .fields() - .iter() - .map(|field_entry| field_entry.field_type()) - .map(|field_type| match *field_type { - FieldType::Str(ref text_options) => text_options.get_indexing_options().and_then( - |text_index_option| { - let tokenizer_name = &text_index_option.tokenizer(); - segment.index().tokenizers().get(tokenizer_name) - }, - ), - _ => None, - }) - .collect(); + let tokenizers = + schema + .fields() + .iter() + .map(|field_entry| field_entry.field_type()) + .map(|field_type| match *field_type { + FieldType::Str(ref text_options) => text_options + .get_indexing_options() + .and_then(|text_index_option| { + let tokenizer_name = &text_index_option.tokenizer(); + segment.index().tokenizers().get(tokenizer_name) + }), + _ => None, + }).collect(); Ok(SegmentWriter { max_doc: 0, multifield_postings, @@ -117,8 +117,7 @@ impl SegmentWriter { _ => { panic!("Expected hierarchical facet"); } - }) - .collect(); + }).collect(); let mut term = Term::for_field(field); // we set the Term for facet_bytes in facets { let mut unordered_term_id_opt = None; @@ -146,8 +145,7 @@ impl SegmentWriter { .flat_map(|field_value| match *field_value.value() { Value::Str(ref text) => Some(text.as_str()), _ => None, - }) - .collect(); + }).collect(); if texts.is_empty() { 0 } else { diff --git a/src/lib.rs b/src/lib.rs index 15a2bd567..8e717e82f 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,13 +1,8 @@ #![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")] -#![cfg_attr(feature = "cargo-clippy", allow(clippy::module_inception))] -#![cfg_attr(feature = "cargo-clippy", allow(clippy::inline_always))] -#![cfg_attr(feature = "cargo-clippy", feature(tool_lints))] #![cfg_attr(all(feature = "unstable", test), feature(test))] -#![cfg_attr(feature = "cargo-clippy", allow(clippy::new_without_default))] -#![cfg_attr(feature = "cargo-clippy", allow(clippy::decimal_literal_representation))] - +#![cfg_attr(feature = "cargo-clippy", feature(tool_lints))] +#![cfg_attr(feature = "cargo-clippy", allow(clippy::module_inception))] #![doc(test(attr(allow(unused_variables), deny(warnings))))] -#![allow(unknown_lints)] #![warn(missing_docs)] #![recursion_limit = "80"] @@ -98,7 +93,7 @@ //! // most relevant doc ids... //! let doc_addresses = top_collector.docs(); //! for doc_address in doc_addresses { -//! let retrieved_doc = searcher.doc(&doc_address)?; +//! let retrieved_doc = searcher.doc(doc_address)?; //! println!("{}", schema.to_json(&retrieved_doc)); //! } //! @@ -184,7 +179,10 @@ mod macros; pub use error::TantivyError; -#[deprecated(since = "0.7.0", note = "please use `tantivy::TantivyError` instead")] +#[deprecated( + since = "0.7.0", + note = "please use `tantivy::TantivyError` instead" +)] pub use error::TantivyError as Error; extern crate census; diff --git a/src/positions/reader.rs b/src/positions/reader.rs index 9a0157725..470abaaa2 100644 --- a/src/positions/reader.rs +++ b/src/positions/reader.rs @@ -137,7 +137,8 @@ impl PositionReader { .iter() .cloned() .map(|num_bit| num_bit as usize) - .sum::() * (COMPRESSION_BLOCK_SIZE / 8); + .sum::() + * (COMPRESSION_BLOCK_SIZE / 8); self.skip_read.advance(num_blocks_to_advance); self.position_read.advance(skip_len); diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index b3f879611..dd0f691ae 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -29,8 +29,7 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box { IndexRecordOption::WithFreqsAndPositions => { SpecializedPostingsWriter::::new_boxed() } - }) - .unwrap_or_else(|| SpecializedPostingsWriter::::new_boxed()), + }).unwrap_or_else(|| SpecializedPostingsWriter::::new_boxed()), FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => { SpecializedPostingsWriter::::new_boxed() } @@ -139,8 +138,7 @@ impl MultiFieldPostingsWriter { .enumerate() .map(|(term_ord, unord_term_id)| { (unord_term_id as UnorderedTermId, term_ord as TermOrdinal) - }) - .collect(); + }).collect(); unordered_term_mappings.insert(field, mapping); } FieldType::U64(_) | FieldType::I64(_) => {} diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 38f12fe09..776844f2a 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -205,7 +205,9 @@ impl DocSet for SegmentPostings { return SkipResult::End; } } - } else if self.block_cursor.skip_to(target) == BlockSegmentPostingsSkipResult::Terminated { + } else if self.block_cursor.skip_to(target) + == BlockSegmentPostingsSkipResult::Terminated + { // no positions needed. no need to sum freqs. return SkipResult::End; } @@ -531,8 +533,7 @@ impl BlockSegmentPostings { } else { BlockSegmentPostingsSkipResult::Terminated } - }) - .unwrap_or(BlockSegmentPostingsSkipResult::Terminated); + }).unwrap_or(BlockSegmentPostingsSkipResult::Terminated); } BlockSegmentPostingsSkipResult::Terminated } diff --git a/src/postings/stacker/murmurhash2.rs b/src/postings/stacker/murmurhash2.rs index 68e22e6c3..9626dcb53 100644 --- a/src/postings/stacker/murmurhash2.rs +++ b/src/postings/stacker/murmurhash2.rs @@ -4,7 +4,7 @@ const M: u32 = 0x5bd1_e995; #[inline(always)] pub fn murmurhash2(key: &[u8]) -> u32 { - #[cfg_attr(feature="cargo-clippy", allow(clippy::cast_ptr_alignment))] + #[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))] let mut key_ptr: *const u32 = key.as_ptr() as *const u32; let len = key.len() as u32; let mut h: u32 = SEED ^ len; diff --git a/src/query/bm25.rs b/src/query/bm25.rs index 4a3a25590..eb2546725 100644 --- a/src/query/bm25.rs +++ b/src/query/bm25.rs @@ -63,8 +63,7 @@ impl BM25Weight { .map(|term| { let term_doc_freq = searcher.doc_freq(term); idf(term_doc_freq, total_num_docs) - }) - .sum::(); + }).sum::(); BM25Weight::new(idf, average_fieldnorm) } diff --git a/src/query/boolean_query/boolean_query.rs b/src/query/boolean_query/boolean_query.rs index 353c89806..b530c6b0a 100644 --- a/src/query/boolean_query/boolean_query.rs +++ b/src/query/boolean_query/boolean_query.rs @@ -47,8 +47,7 @@ impl Query for BooleanQuery { .iter() .map(|&(ref occur, ref subquery)| { Ok((*occur, subquery.weight(searcher, scoring_enabled)?)) - }) - .collect::>()?; + }).collect::>()?; Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled))) } @@ -69,8 +68,7 @@ impl BooleanQuery { let term_query: Box = Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs)); (Occur::Should, term_query) - }) - .collect(); + }).collect(); BooleanQuery::from(occur_term_queries) } diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index 85f075d3a..9b896a46a 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -134,8 +134,7 @@ impl PhraseScorer { .into_iter() .map(|(offset, postings)| { PostingsWithOffset::new(postings, (max_offset - offset) as u32) - }) - .collect::>(); + }).collect::>(); PhraseScorer { intersection_docset: Intersection::new(postings_with_offsets), num_docsets, diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index fcfa19345..6f35971a5 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -1,5 +1,3 @@ -#![cfg_attr(feature = "cargo-clippy", allow(clippy::unneeded_field_pattern))] - use super::logical_ast::*; use super::query_grammar::parse_to_ast; use super::user_input_ast::*; @@ -70,8 +68,7 @@ fn trim_ast(logical_ast: LogicalAST) -> Option { .into_iter() .flat_map(|(occur, child)| { trim_ast(child).map(|trimmed_child| (occur, trimmed_child)) - }) - .collect::>(); + }).collect::>(); if trimmed_children.is_empty() { None } else { @@ -237,14 +234,15 @@ impl QueryParser { } FieldType::Str(ref str_options) => { if let Some(option) = str_options.get_indexing_options() { - let mut tokenizer = self.tokenizer_manager.get(option.tokenizer()).ok_or_else( - || { - QueryParserError::UnknownTokenizer( - field_entry.name().to_string(), - option.tokenizer().to_string(), - ) - }, - )?; + let mut tokenizer = + self.tokenizer_manager + .get(option.tokenizer()) + .ok_or_else(|| { + QueryParserError::UnknownTokenizer( + field_entry.name().to_string(), + option.tokenizer().to_string(), + ) + })?; let mut terms: Vec<(usize, Term)> = Vec::new(); let mut token_stream = tokenizer.token_stream(phrase); token_stream.process(&mut |token| { @@ -423,8 +421,7 @@ impl QueryParser { lower: self.resolve_bound(field, &lower)?, upper: self.resolve_bound(field, &upper)?, }))) - }) - .collect::, QueryParserError>>()?; + }).collect::, QueryParserError>>()?; let result_ast = if clauses.len() == 1 { clauses.pop().unwrap() } else { @@ -452,7 +449,9 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box { value_type, lower, upper, - } => Box::new(RangeQuery::new_term_bounds(field, value_type, lower, upper)), + } => Box::new(RangeQuery::new_term_bounds( + field, value_type, &lower, &upper, + )), LogicalLiteral::All => Box::new(AllQuery), } } diff --git a/src/query/range_query.rs b/src/query/range_query.rs index fd739652c..43da4bd8c 100644 --- a/src/query/range_query.rs +++ b/src/query/range_query.rs @@ -96,8 +96,8 @@ impl RangeQuery { pub fn new_term_bounds( field: Field, value_type: Type, - left_bound: Bound, - right_bound: Bound, + left_bound: &Bound, + right_bound: &Bound, ) -> RangeQuery { let verify_and_unwrap_term = |val: &Term| { assert_eq!(field, val.field()); @@ -184,11 +184,7 @@ impl RangeQuery { /// /// If the field is not of the type `Str`, tantivy /// will panic when the `Weight` object is created. - pub fn new_str_bounds<'b>( - field: Field, - left: Bound<&'b str>, - right: Bound<&'b str>, - ) -> RangeQuery { + pub fn new_str_bounds(field: Field, left: Bound<&str>, right: Bound<&str>) -> RangeQuery { let make_term_val = |val: &&str| val.as_bytes().to_vec(); RangeQuery { field, @@ -202,7 +198,7 @@ impl RangeQuery { /// /// If the field is not of the type `Str`, tantivy /// will panic when the `Weight` object is created. - pub fn new_str<'b>(field: Field, range: Range<&'b str>) -> RangeQuery { + pub fn new_str(field: Field, range: Range<&str>) -> RangeQuery { RangeQuery::new_str_bounds( field, Bound::Included(range.start), diff --git a/src/query/union.rs b/src/query/union.rs index b4a7441a3..5bbe902a0 100644 --- a/src/query/union.rs +++ b/src/query/union.rs @@ -55,8 +55,7 @@ where None } }, - ) - .collect(); + ).collect(); Union { docsets: non_empty_docsets, bitsets: Box::new([TinySet::empty(); HORIZON_NUM_TINYBITSETS]), @@ -215,7 +214,10 @@ where // The target is outside of the buffered horizon. // advance all docsets to a doc >= to the target. - #[cfg_attr(feature = "cargo-clippy", allow(clippy::clippy::collapsible_if))] + #[cfg_attr( + feature = "cargo-clippy", + allow(clippy::clippy::collapsible_if) + )] unordered_drain_filter(&mut self.docsets, |docset| { if docset.doc() < target { if docset.skip_next(target) == SkipResult::End { diff --git a/src/schema/facet.rs b/src/schema/facet.rs index 6a34e8d42..bb685c277 100644 --- a/src/schema/facet.rs +++ b/src/schema/facet.rs @@ -97,16 +97,12 @@ impl Facet { } /// Returns `true` iff other is a subfacet of `self`. - #[cfg_attr(feature = "cargo-clippy", allow(clippy::collapsible_if))] pub fn is_prefix_of(&self, other: &Facet) -> bool { let self_bytes: &[u8] = self.encoded_bytes(); let other_bytes: &[u8] = other.encoded_bytes(); - if self_bytes.len() < other_bytes.len() { - if other_bytes.starts_with(self_bytes) { - return other_bytes[self_bytes.len()] == 0u8; - } - } - false + self_bytes.len() < other_bytes.len() + && other_bytes.starts_with(self_bytes) + && other_bytes[self_bytes.len()] == 0u8 } } diff --git a/src/schema/schema.rs b/src/schema/schema.rs index 0855200f4..85d8d14f3 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -441,8 +441,7 @@ mod tests { "count": 4, "popularity": 10 }"#, - ) - .unwrap(); + ).unwrap(); assert_eq!(doc.get_first(title_field).unwrap().text(), Some("my title")); assert_eq!( doc.get_first(author_field).unwrap().text(), diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 4f33dc39e..ba653a411 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -152,10 +152,7 @@ fn search_fragments<'a>( /// /// Takes a vector of `FragmentCandidate`s and the text. /// Figures out the best fragment from it and creates a snippet. -fn select_best_fragment_combination<'a>( - fragments: Vec, - text: &'a str, -) -> Snippet { +fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str) -> Snippet { let best_fragment_opt = fragments.iter().max_by(|left, right| { let cmp_score = left .score @@ -177,8 +174,7 @@ fn select_best_fragment_combination<'a>( item.start - fragment.start_offset, item.stop - fragment.start_offset, ) - }) - .collect(); + }).collect(); Snippet { fragments: fragment_text.to_string(), highlighted, @@ -289,7 +285,7 @@ impl SnippetGenerator { &self.terms_text, self.max_num_chars, ); - select_best_fragment_combination(fragment_candidates, &text) + select_best_fragment_combination(&fragment_candidates[..], &text) } } @@ -332,7 +328,7 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(first.score, 1.9); assert_eq!(first.stop_offset, 89); } - let snippet = select_best_fragment_combination(fragments, &TEST_TEXT); + let snippet = select_best_fragment_combination(&fragments[..], &TEST_TEXT); assert_eq!(snippet.fragments, "Rust is a systems programming language sponsored by Mozilla which\ndescribes it as a \"safe".to_owned()); assert_eq!(snippet.to_html(), "Rust is a systems programming language sponsored by Mozilla which\ndescribes it as a "safe".to_owned()) } @@ -356,7 +352,7 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(first.stop_offset, 7); } - let snippet = select_best_fragment_combination(fragments, &text); + let snippet = select_best_fragment_combination(&fragments[..], &text); assert_eq!(snippet.fragments, "c d"); assert_eq!(snippet.to_html(), "c d"); } @@ -380,7 +376,7 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(first.start_offset, 8); } - let snippet = select_best_fragment_combination(fragments, &text); + let snippet = select_best_fragment_combination(&fragments[..], &text); assert_eq!(snippet.fragments, "e f"); assert_eq!(snippet.to_html(), "e f"); } @@ -405,7 +401,7 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(first.start_offset, 0); } - let snippet = select_best_fragment_combination(fragments, &text); + let snippet = select_best_fragment_combination(&fragments[..], &text); assert_eq!(snippet.fragments, "e f g"); assert_eq!(snippet.to_html(), "e f g"); } @@ -423,7 +419,7 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(fragments.len(), 0); - let snippet = select_best_fragment_combination(fragments, &text); + let snippet = select_best_fragment_combination(&fragments[..], &text); assert_eq!(snippet.fragments, ""); assert_eq!(snippet.to_html(), ""); } @@ -438,7 +434,7 @@ Survey in 2016, 2017, and 2018."#; let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3); assert_eq!(fragments.len(), 0); - let snippet = select_best_fragment_combination(fragments, &text); + let snippet = select_best_fragment_combination(&fragments[..], &text); assert_eq!(snippet.fragments, ""); assert_eq!(snippet.to_html(), ""); } diff --git a/src/store/reader.rs b/src/store/reader.rs index bdf02f00c..428b013f0 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -89,7 +89,10 @@ impl StoreReader { } } -#[cfg_attr(feature = "cargo-clippy", allow(clippy::needless_pass_by_value))] +#[cfg_attr( + feature = "cargo-clippy", + allow(clippy::needless_pass_by_value) +)] fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId) { let data_len = data.len(); let footer_offset = data_len - size_of::() - size_of::(); diff --git a/src/store/writer.rs b/src/store/writer.rs index f1446ab8b..3fbdee074 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -51,7 +51,8 @@ impl StoreWriter { stored_document.serialize(&mut self.intermediary_buffer)?; let doc_num_bytes = self.intermediary_buffer.len(); VInt(doc_num_bytes as u64).serialize(&mut self.current_block)?; - self.current_block.write_all(&self.intermediary_buffer[..])?; + self.current_block + .write_all(&self.intermediary_buffer[..])?; self.doc += 1; if self.current_block.len() > BLOCK_SIZE { self.write_and_compress_block()?; diff --git a/src/termdict/merger.rs b/src/termdict/merger.rs index 407a49e90..1d3844067 100644 --- a/src/termdict/merger.rs +++ b/src/termdict/merger.rs @@ -53,8 +53,7 @@ impl<'a> TermMerger<'a> { .map(|(ord, streamer)| HeapItem { streamer, segment_ord: ord, - }) - .collect(), + }).collect(), } } @@ -123,7 +122,10 @@ impl<'a> TermMerger<'a> { } /// Iterates through terms - #[cfg_attr(feature = "cargo-clippy", allow(clippy::should_implement_trait))] + #[cfg_attr( + feature = "cargo-clippy", + allow(clippy::should_implement_trait) + )] pub fn next(&mut self) -> Option> { if self.advance() { Some(Term::wrap(self.current_streamers[0].streamer.key())) diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index 5ae259425..54102a9f4 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -75,7 +75,7 @@ mod tests { term_dictionary_builder.finish().unwrap(); } let source = directory.open_read(&path).unwrap(); - let term_dict: TermDictionary = TermDictionary::from_source(source); + let term_dict: TermDictionary = TermDictionary::from_source(&source); for (term_ord, term) in COUNTRIES.iter().enumerate() { assert_eq!(term_dict.term_ord(term).unwrap(), term_ord as u64); let mut bytes = vec![]; @@ -102,7 +102,7 @@ mod tests { term_dictionary_builder.finish().unwrap(); } let source = directory.open_read(&path).unwrap(); - let term_dict: TermDictionary = TermDictionary::from_source(source); + let term_dict: TermDictionary = TermDictionary::from_source(&source); assert_eq!(term_dict.get("abc").unwrap().doc_freq, 34u32); assert_eq!(term_dict.get("abcd").unwrap().doc_freq, 346u32); let mut stream = term_dict.stream(); @@ -189,7 +189,7 @@ mod tests { term_dictionary_builder.finish().unwrap() }; let source = ReadOnlySource::from(buffer); - let term_dictionary: TermDictionary = TermDictionary::from_source(source); + let term_dictionary: TermDictionary = TermDictionary::from_source(&source); { let mut streamer = term_dictionary.stream(); let mut i = 0; @@ -224,7 +224,7 @@ mod tests { term_dictionary_builder.finish().unwrap() }; let source = ReadOnlySource::from(buffer); - let term_dictionary: TermDictionary = TermDictionary::from_source(source); + let term_dictionary: TermDictionary = TermDictionary::from_source(&source); let mut kv_stream = term_dictionary.stream(); assert!(kv_stream.advance()); assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxy".as_bytes()); @@ -256,7 +256,7 @@ mod tests { let source = ReadOnlySource::from(buffer); - let term_dictionary: TermDictionary = TermDictionary::from_source(source); + let term_dictionary: TermDictionary = TermDictionary::from_source(&source); { for i in (0..20).chain(6000..8_000) { let &(ref target_key, _) = &ids[i]; @@ -324,7 +324,7 @@ mod tests { term_dictionary_builder.finish().unwrap() }; let source = ReadOnlySource::from(buffer); - let term_dictionary: TermDictionary = TermDictionary::from_source(source); + let term_dictionary: TermDictionary = TermDictionary::from_source(&source); let mut stream = term_dictionary.stream(); assert!(stream.advance()); assert!(stream.key().is_empty()); @@ -348,7 +348,7 @@ mod tests { term_dictionary_builder.finish().unwrap() }; let source = ReadOnlySource::from(buffer); - let term_dictionary: TermDictionary = TermDictionary::from_source(source); + let term_dictionary: TermDictionary = TermDictionary::from_source(&source); let value_list = |mut streamer: TermStreamer| { let mut res: Vec = vec![]; @@ -417,7 +417,7 @@ mod tests { term_dictionary_builder.finish().unwrap(); } let source = directory.open_read(&path).unwrap(); - let term_dict: TermDictionary = TermDictionary::from_source(source); + let term_dict: TermDictionary = TermDictionary::from_source(&source); // We can now build an entire dfa. let lev_automaton_builder = LevenshteinAutomatonBuilder::new(2, true); diff --git a/src/termdict/streamer.rs b/src/termdict/streamer.rs index 48eb56c7d..98277f2ef 100644 --- a/src/termdict/streamer.rs +++ b/src/termdict/streamer.rs @@ -132,6 +132,10 @@ where } /// Return the next `(key, value)` pair. + #[cfg_attr( + feature = "cargo-clippy", + allow(clippy::should_implement_trait) + )] pub fn next(&mut self) -> Option<(&[u8], &TermInfo)> { if self.advance() { Some((self.key(), self.value())) diff --git a/src/termdict/term_info_store.rs b/src/termdict/term_info_store.rs index 8b6a0159b..130b5d62f 100644 --- a/src/termdict/term_info_store.rs +++ b/src/termdict/term_info_store.rs @@ -91,7 +91,7 @@ fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 { assert!(data.len() >= addr_byte + 7); let val_unshifted_unmasked: u64 = unsafe { // ok because the pointer is only accessed using `ptr::read_unaligned` - #[cfg_attr(feature="cargo-clippy", allow(clippy::cast_ptr_alignment))] + #[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))] let addr = data.as_ptr().add(addr_byte) as *const u64; // ok thanks to the 7 byte padding ptr::read_unaligned(addr) diff --git a/src/termdict/termdict.rs b/src/termdict/termdict.rs index 99bea0b09..0f8a28231 100644 --- a/src/termdict/termdict.rs +++ b/src/termdict/termdict.rs @@ -77,7 +77,8 @@ where let mut file = self.fst_builder.into_inner().map_err(convert_fst_error)?; { let mut counting_writer = CountingWriter::wrap(&mut file); - self.term_info_store_writer.serialize(&mut counting_writer)?; + self.term_info_store_writer + .serialize(&mut counting_writer)?; let footer_size = counting_writer.written_bytes(); (footer_size as u64).serialize(&mut counting_writer)?; counting_writer.flush()?; @@ -112,7 +113,7 @@ pub struct TermDictionary { impl TermDictionary { /// Opens a `TermDictionary` given a data source. - pub fn from_source(source: ReadOnlySource) -> Self { + pub fn from_source(source: &ReadOnlySource) -> Self { let total_len = source.len(); let length_offset = total_len - 8; let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..]; @@ -136,7 +137,7 @@ impl TermDictionary { .finish() .expect("Writing in a Vec should never fail"); let source = ReadOnlySource::from(term_dictionary_data); - Self::from_source(source) + Self::from_source(&source) } /// Returns the number of terms in the dictionary. diff --git a/src/tokenizer/stemmer.rs b/src/tokenizer/stemmer.rs index 4c91bfb93..064662889 100644 --- a/src/tokenizer/stemmer.rs +++ b/src/tokenizer/stemmer.rs @@ -1,3 +1,5 @@ +#![cfg_attr(feature = "cargo-clippy", allow(clippy::new_without_default))] + use super::{Token, TokenFilter, TokenStream}; use rust_stemmers::{self, Algorithm}; use std::sync::Arc; From 5449ec3c110dd485d340f98dadef1c6323b5e697 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 16 Sep 2018 10:21:02 +0900 Subject: [PATCH 55/62] Snippet term score (#423) --- Cargo.toml | 1 + src/lib.rs | 6 +++ src/query/mod.rs | 50 ++++++++++++++++++++ src/snippet/mod.rs | 112 +++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 161 insertions(+), 8 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 94b1d1bd7..7fe162b16 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -55,6 +55,7 @@ winapi = "0.2" [dev-dependencies] rand = "0.5" +maplit = "1" [profile.release] opt-level = 3 diff --git a/src/lib.rs b/src/lib.rs index 8e717e82f..62802dc91 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -152,6 +152,8 @@ extern crate tempdir; extern crate tempfile; extern crate uuid; + + #[cfg(test)] #[macro_use] extern crate matches; @@ -162,6 +164,10 @@ extern crate winapi; #[cfg(test)] extern crate rand; +#[cfg(test)] +#[macro_use] +extern crate maplit; + #[cfg(all(test, feature = "unstable"))] extern crate test; diff --git a/src/query/mod.rs b/src/query/mod.rs index 78b9cd56b..b7136c232 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -53,3 +53,53 @@ pub use self::scorer::ConstScorer; pub use self::scorer::Scorer; pub use self::term_query::TermQuery; pub use self::weight::Weight; + +#[cfg(test)] +mod tests { + use Index; + use schema::{SchemaBuilder, TEXT}; + use query::QueryParser; + use Term; + use std::collections::BTreeSet; + + #[test] + fn test_query_terms() { + let mut schema_builder = SchemaBuilder::default(); + let text_field = schema_builder.add_text_field("text", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let query_parser = QueryParser::for_index(&index, vec![text_field]); + let term_a = Term::from_field_text(text_field, "a"); + let term_b = Term::from_field_text(text_field, "b"); + { + let mut terms_set: BTreeSet = BTreeSet::new(); + query_parser.parse_query("a").unwrap().query_terms(&mut terms_set); + let terms: Vec<&Term> = terms_set.iter().collect(); + assert_eq!(vec![&term_a], terms); + } + { + let mut terms_set: BTreeSet = BTreeSet::new(); + query_parser.parse_query("a b").unwrap().query_terms(&mut terms_set); + let terms: Vec<&Term> = terms_set.iter().collect(); + assert_eq!(vec![&term_a, &term_b], terms); + } + { + let mut terms_set: BTreeSet = BTreeSet::new(); + query_parser.parse_query("\"a b\"").unwrap().query_terms(&mut terms_set); + let terms: Vec<&Term> = terms_set.iter().collect(); + assert_eq!(vec![&term_a, &term_b], terms); + } + { + let mut terms_set: BTreeSet = BTreeSet::new(); + query_parser.parse_query("a a a a a").unwrap().query_terms(&mut terms_set); + let terms: Vec<&Term> = terms_set.iter().collect(); + assert_eq!(vec![&term_a], terms); + } + { + let mut terms_set: BTreeSet = BTreeSet::new(); + query_parser.parse_query("a -b").unwrap().query_terms(&mut terms_set); + let terms: Vec<&Term> = terms_set.iter().collect(); + assert_eq!(vec![&term_a, &term_b], terms); + } + } +} \ No newline at end of file diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index ba653a411..2c6d3d012 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -247,7 +247,15 @@ impl SnippetGenerator { let terms_text: BTreeMap = terms .into_iter() .filter(|term| term.field() == field) - .map(|term| (term.text().to_string(), 1f32)) + .flat_map(|term| { + let doc_freq = searcher.doc_freq(&term); + let score = 1f32 / (1f32 + doc_freq as f32); + if doc_freq > 0 { + Some((term.text().to_string(), score)) + } else { + None + } + }) .collect(); let tokenizer = searcher.index().tokenizer_for_field(field)?; Ok(SnippetGenerator { @@ -263,6 +271,11 @@ impl SnippetGenerator { self.max_num_chars = max_num_chars; } + #[cfg(test)] + pub fn terms_text(&self) -> &BTreeMap { + &self.terms_text + } + /// Generates a snippet for the given `Document`. /// /// This method extract the text associated to the `SnippetGenerator`'s field @@ -293,7 +306,7 @@ impl SnippetGenerator { mod tests { use super::{search_fragments, select_best_fragment_combination}; use query::QueryParser; - use schema::{IndexRecordOption, SchemaBuilder, TextFieldIndexing, TextOptions}; + use schema::{IndexRecordOption, SchemaBuilder, TextFieldIndexing, TextOptions, TEXT}; use std::collections::BTreeMap; use std::iter::Iterator; use tokenizer::{box_tokenizer, SimpleTokenizer}; @@ -315,24 +328,67 @@ to the project are from community members.[15] Rust won first place for "most loved programming language" in the Stack Overflow Developer Survey in 2016, 2017, and 2018."#; + + #[test] fn test_snippet() { let boxed_tokenizer = box_tokenizer(SimpleTokenizer); - let mut terms = BTreeMap::new(); - terms.insert(String::from("rust"), 1.0); - terms.insert(String::from("language"), 0.9); + let terms = btreemap! { + String::from("rust") => 1.0, + String::from("language") => 0.9 + }; let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 100); assert_eq!(fragments.len(), 7); { - let first = fragments.iter().nth(0).unwrap(); + let first = &fragments[0]; assert_eq!(first.score, 1.9); assert_eq!(first.stop_offset, 89); } let snippet = select_best_fragment_combination(&fragments[..], &TEST_TEXT); - assert_eq!(snippet.fragments, "Rust is a systems programming language sponsored by Mozilla which\ndescribes it as a \"safe".to_owned()); - assert_eq!(snippet.to_html(), "Rust is a systems programming language sponsored by Mozilla which\ndescribes it as a "safe".to_owned()) + assert_eq!(snippet.fragments, "Rust is a systems programming language sponsored by \ + Mozilla which\ndescribes it as a \"safe"); + assert_eq!(snippet.to_html(), "Rust is a systems programming language \ + sponsored by Mozilla which\ndescribes it as a "safe") } + + #[test] + fn test_snippet_scored_fragment() { + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); + { + let terms = btreemap! { + String::from("rust") =>1.0f32, + String::from("language") => 0.9f32 + }; + let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 20); + { + let first = &fragments[0]; + assert_eq!(first.score, 1.0); + assert_eq!(first.stop_offset, 17); + } + let snippet = select_best_fragment_combination(&fragments[..], &TEST_TEXT); + assert_eq!(snippet.to_html(), "Rust is a systems") + } + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); + { + let terms = btreemap! { + String::from("rust") =>0.9f32, + String::from("language") => 1.0f32 + }; + let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 20); + //assert_eq!(fragments.len(), 7); + { + let first = &fragments[0]; + assert_eq!(first.score, 0.9); + assert_eq!(first.stop_offset, 17); + } + let snippet = select_best_fragment_combination(&fragments[..], &TEST_TEXT); + assert_eq!(snippet.to_html(), "programming language") + } + + } + + #[test] fn test_snippet_in_second_fragment() { let boxed_tokenizer = box_tokenizer(SimpleTokenizer); @@ -439,6 +495,46 @@ Survey in 2016, 2017, and 2018."#; assert_eq!(snippet.to_html(), ""); } + + #[test] + fn test_snippet_generator_term_score() { + let mut schema_builder = SchemaBuilder::default(); + let text_field = schema_builder.add_text_field("text", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + { + // writing the segment + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + index_writer.add_document(doc!(text_field => "a")); + index_writer.add_document(doc!(text_field => "a")); + index_writer.add_document(doc!(text_field => "a b")); + index_writer.commit().unwrap(); + index.load_searchers().unwrap(); + } + let searcher = index.searcher(); + let query_parser = QueryParser::for_index(&index, vec![text_field]); + { + let query = query_parser.parse_query("e").unwrap(); + let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap(); + assert!(snippet_generator.terms_text().is_empty()); + } + { + let query = query_parser.parse_query("a").unwrap(); + let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap(); + assert_eq!(&btreemap!("a".to_string() => 0.25f32), snippet_generator.terms_text()); + } + { + let query = query_parser.parse_query("a b").unwrap(); + let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap(); + assert_eq!(&btreemap!("a".to_string() => 0.25f32, "b".to_string() => 0.5), snippet_generator.terms_text()); + } + { + let query = query_parser.parse_query("a b c").unwrap(); + let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap(); + assert_eq!(&btreemap!("a".to_string() => 0.25f32, "b".to_string() => 0.5), snippet_generator.terms_text()); + } + } + #[test] fn test_snippet_generator() { let mut schema_builder = SchemaBuilder::default(); From 0df2a221da67455e9e92bd91c092a042d40f624c Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 16 Sep 2018 13:24:14 +0900 Subject: [PATCH 56/62] Bump version pre-release --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 7fe162b16..165c9c85a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tantivy" -version = "0.7.0-dev" +version = "0.7.0" authors = ["Paul Masurel "] license = "MIT" categories = ["database-implementations", "data-structures"] From 8da28fb6cf0bc7868c46cc67224c2dd98cd2279a Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 16 Sep 2018 13:26:54 +0900 Subject: [PATCH 57/62] Added iml filewq --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index b6f5cc5b8..afaed5719 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +tantivy.iml *.swp target target/debug From 6ff60b8ed8ffce65e60b054d0887c2f312064f5b Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 17 Sep 2018 06:20:44 +0900 Subject: [PATCH 58/62] Fixing README (#426) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0ce522a7c..b6535578b 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ Tantivy is, in fact, strongly inspired by Lucene's design. # Non-features -- Distributed search and will not be in the scope of tantivy. +- Distributed search is out of the scope of tantivy. # Supported OS and compiler From f32b4a2ebe4fa761221db8447c4a1cfc9deeb2d6 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 17 Sep 2018 06:41:40 +0900 Subject: [PATCH 59/62] Removing release build from ci, disabling lto (#425) --- Cargo.toml | 1 - ci/script.sh | 1 - 2 files changed, 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 165c9c85a..bb84e41b8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -60,7 +60,6 @@ maplit = "1" [profile.release] opt-level = 3 debug = false -lto = true debug-assertions = false [profile.test] diff --git a/ci/script.sh b/ci/script.sh index 0939344b0..9f3cf889d 100644 --- a/ci/script.sh +++ b/ci/script.sh @@ -11,7 +11,6 @@ main() { else echo "Build" cross build --target $TARGET - cross build --target $TARGET --release if [ ! -z $DISABLE_TESTS ]; then return fi From e0cdd3114d05c927a349eb07a5daaf17360661ae Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 17 Sep 2018 08:52:29 +0900 Subject: [PATCH 60/62] Fixing README (#427) Closes #424. --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b6535578b..401951035 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,9 @@ Tantivy is, in fact, strongly inspired by Lucene's design. # Non-features -- Distributed search is out of the scope of tantivy. +- Distributed search is out of the scope of tantivy. That being said, tantivy is meant as a +library upon which one could build a distributed search. Serializable/mergeable collector state for instance, +are within the scope of tantivy. # Supported OS and compiler From 69d5e4b9b170e93a050ae2f508a4f448340bffe1 Mon Sep 17 00:00:00 2001 From: Konstantin Gribov Date: Fri, 12 Oct 2018 02:46:07 +0300 Subject: [PATCH 61/62] Added proper references for Apache Lucene & Solr (#432) Also, added links to websites for Lucene, Solr & ElasticSearch --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 401951035..1bf169af1 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ **Tantivy** is a **full text search engine library** written in rust. -It is closer to Lucene than to Elastic Search and Solr in the sense it is not +It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elastic Search](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not an off-the-shelf search engine server, but rather a crate that can be used to build such a search engine. From 0098e3d4285967555c605de955a397fba06a6c6c Mon Sep 17 00:00:00 2001 From: Jason Wolfe Date: Mon, 15 Oct 2018 09:04:36 +0900 Subject: [PATCH 62/62] Compute space usage of a Searcher / SegmentReader / CompositeFile (#282) * Compute space usage of a Searcher / SegmentReader / CompositeFile * Fix typo * Add serde Serialize/Deserialize for all the SpaceUsage structs * Fix indexing * Public methods for consuming space usage information * #281: Add a space usage method that takes a SegmentComponent to support code that is unaware of particular segment components, and to make it more likely to update methods when a new component type is added. * Add support for space usage computation of positions skip index file (#281) * Add some tests for space usage computation (#281) --- src/common/composite_file.rs | 12 + src/core/searcher.rs | 10 + src/core/segment_reader.rs | 16 ++ src/fastfield/delete.rs | 6 + src/lib.rs | 1 + src/space_usage/mod.rs | 484 +++++++++++++++++++++++++++++++++++ src/store/reader.rs | 6 + 7 files changed, 535 insertions(+) create mode 100644 src/space_usage/mod.rs diff --git a/src/common/composite_file.rs b/src/common/composite_file.rs index e7d657b65..0cdfdff87 100644 --- a/src/common/composite_file.rs +++ b/src/common/composite_file.rs @@ -4,6 +4,8 @@ use common::VInt; use directory::ReadOnlySource; use directory::WritePtr; use schema::Field; +use space_usage::PerFieldSpaceUsage; +use space_usage::FieldUsage; use std::collections::HashMap; use std::io::Write; use std::io::{self, Read}; @@ -166,6 +168,16 @@ impl CompositeFile { .get(&FileAddr { field, idx }) .map(|&(from, to)| self.data.slice(from, to)) } + + pub fn space_usage(&self) -> PerFieldSpaceUsage { + let mut fields = HashMap::new(); + for (&field_addr, &(start, end)) in self.offsets_index.iter() { + fields.entry(field_addr.field) + .or_insert_with(|| FieldUsage::empty(field_addr.field)) + .add_field_idx(field_addr.idx, end - start); + } + PerFieldSpaceUsage::new(fields) + } } #[cfg(test)] diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 64e5263ee..826bf4501 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -5,6 +5,7 @@ use query::Query; use schema::Document; use schema::Schema; use schema::{Field, Term}; +use space_usage::SearcherSpaceUsage; use std::fmt; use std::sync::Arc; use termdict::TermMerger; @@ -99,6 +100,15 @@ impl Searcher { .collect::>(); FieldSearcher::new(inv_index_readers) } + + /// Summarize total space usage of this searcher. + pub fn space_usage(&self) -> SearcherSpaceUsage { + let mut space_usage = SearcherSpaceUsage::new(); + for segment_reader in self.segment_readers.iter() { + space_usage.add_segment(segment_reader.space_usage()); + } + space_usage + } } pub struct FieldSearcher { diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 7cf395c9f..54b465e77 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -16,6 +16,7 @@ use schema::Document; use schema::Field; use schema::FieldType; use schema::Schema; +use space_usage::SegmentSpaceUsage; use std::collections::HashMap; use std::fmt; use std::sync::Arc; @@ -381,6 +382,21 @@ impl SegmentReader { pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator { SegmentReaderAliveDocsIterator::new(&self) } + + /// Summarize total space usage of this segment. + pub fn space_usage(&self) -> SegmentSpaceUsage { + SegmentSpaceUsage::new( + self.num_docs(), + self.termdict_composite.space_usage(), + self.postings_composite.space_usage(), + self.positions_composite.space_usage(), + self.positions_idx_composite.space_usage(), + self.fast_fields_composite.space_usage(), + self.fieldnorms_composite.space_usage(), + self.store_reader.space_usage(), + self.delete_bitset_opt.as_ref().map(|x| x.space_usage()).unwrap_or(0), + ) + } } impl fmt::Debug for SegmentReader { diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs index 15ed658ce..76ff7e43b 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/delete.rs @@ -2,6 +2,7 @@ use bit_set::BitSet; use common::HasLen; use directory::ReadOnlySource; use directory::WritePtr; +use space_usage::ByteCount; use std::io; use std::io::Write; use DocId; @@ -63,6 +64,11 @@ impl DeleteBitSet { b & (1u8 << shift) != 0 } } + + /// Summarize total space usage of this bitset. + pub fn space_usage(&self) -> ByteCount { + self.data.len() + } } impl HasLen for DeleteBitSet { diff --git a/src/lib.rs b/src/lib.rs index 62802dc91..7aa8572ff 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -213,6 +213,7 @@ pub(crate) mod positions; pub mod postings; pub mod query; pub mod schema; +pub mod space_usage; pub mod store; pub mod termdict; diff --git a/src/space_usage/mod.rs b/src/space_usage/mod.rs new file mode 100644 index 000000000..9ffd8b849 --- /dev/null +++ b/src/space_usage/mod.rs @@ -0,0 +1,484 @@ +/*! +Representations for the space usage of various parts of a Tantivy index. + +This can be used programmatically, and will also be exposed in a human readable fashion in +tantivy-cli. + +One important caveat for all of this functionality is that none of it currently takes storage-level +details into consideration. For example, if your file system block size is 4096 bytes, we can +under-count actual resultant space usage by up to 4095 bytes per file. +*/ + +use schema::Field; +use std::collections::HashMap; +use SegmentComponent; + +/// Indicates space usage in bytes +pub type ByteCount = usize; + +/// Enum containing any of the possible space usage results for segment components. +pub enum ComponentSpaceUsage { + /// Data is stored per field in a uniform way + PerField(PerFieldSpaceUsage), + /// Data is stored in separate pieces in the store + Store(StoreSpaceUsage), + /// Some sort of raw byte count + Basic(ByteCount), +} + +/// Represents combined space usage of an entire searcher and its component segments. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct SearcherSpaceUsage { + segments: Vec, + total: ByteCount, +} + +impl SearcherSpaceUsage { + pub(crate) fn new() -> SearcherSpaceUsage { + SearcherSpaceUsage { + segments: Vec::new(), + total: 0, + } + } + + /// Add a segment, to `self`. + /// Performs no deduplication or other intelligence. + pub(crate) fn add_segment(&mut self, segment: SegmentSpaceUsage) { + self.total += segment.total(); + self.segments.push(segment); + } + + /// Per segment space usage + pub fn segments(&self) -> &[SegmentSpaceUsage] { + &self.segments[..] + } + + /// Returns total byte usage of this searcher, including all large subcomponents. + /// Does not account for smaller things like `meta.json`. + pub fn total(&self) -> ByteCount { + self.total + } +} + +/// Represents combined space usage for all of the large components comprising a segment. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct SegmentSpaceUsage { + num_docs: u32, + + termdict: PerFieldSpaceUsage, + postings: PerFieldSpaceUsage, + positions: PerFieldSpaceUsage, + positions_idx: PerFieldSpaceUsage, + fast_fields: PerFieldSpaceUsage, + fieldnorms: PerFieldSpaceUsage, + + store: StoreSpaceUsage, + + deletes: ByteCount, + + total: ByteCount, +} + +impl SegmentSpaceUsage { + pub(crate) fn new( + num_docs: u32, + termdict: PerFieldSpaceUsage, + postings: PerFieldSpaceUsage, + positions: PerFieldSpaceUsage, + positions_idx: PerFieldSpaceUsage, + fast_fields: PerFieldSpaceUsage, + fieldnorms: PerFieldSpaceUsage, + store: StoreSpaceUsage, + deletes: ByteCount, + ) -> SegmentSpaceUsage { + let total = termdict.total() + + postings.total() + + positions.total() + + fast_fields.total() + + fieldnorms.total() + + store.total() + + deletes; + SegmentSpaceUsage { + num_docs, + termdict, + postings, + positions, + positions_idx, + fast_fields, + fieldnorms, + store, + deletes, + total, + } + } + + /// Space usage for the given component + /// + /// Clones the underlying data. + /// Use the components directly if this is somehow in performance critical code. + pub fn component(&self, component: SegmentComponent) -> ComponentSpaceUsage { + use SegmentComponent::*; + use self::ComponentSpaceUsage::*; + match component { + POSTINGS => PerField(self.postings().clone()), + POSITIONS => PerField(self.positions().clone()), + POSITIONSSKIP => PerField(self.positions_skip_idx().clone()), + FASTFIELDS => PerField(self.fast_fields().clone()), + FIELDNORMS => PerField(self.fieldnorms().clone()), + TERMS => PerField(self.termdict().clone()), + STORE => Store(self.store().clone()), + DELETE => Basic(self.deletes()), + } + } + + /// Num docs in segment + pub fn num_docs(&self) -> u32 { + self.num_docs + } + + /// Space usage for term dictionary + pub fn termdict(&self) -> &PerFieldSpaceUsage { + &self.termdict + } + + /// Space usage for postings list + pub fn postings(&self) -> &PerFieldSpaceUsage { + &self.postings + } + + /// Space usage for positions + pub fn positions(&self) -> &PerFieldSpaceUsage { + &self.positions + } + + /// Space usage for positions skip idx + pub fn positions_skip_idx(&self) -> &PerFieldSpaceUsage { + &self.positions_idx + } + + /// Space usage for fast fields + pub fn fast_fields(&self) -> &PerFieldSpaceUsage { + &self.fast_fields + } + + /// Space usage for field norms + pub fn fieldnorms(&self) -> &PerFieldSpaceUsage { + &self.fieldnorms + } + + /// Space usage for stored documents + pub fn store(&self) -> &StoreSpaceUsage { + &self.store + } + + /// Space usage for document deletions + pub fn deletes(&self) -> ByteCount { + self.deletes + } + + /// Total space usage in bytes for this segment. + pub fn total(&self) -> ByteCount { + self.total + } +} + +/// Represents space usage for the Store for this segment. +/// +/// This is composed of two parts. +/// `data` represents the compressed data itself. +/// `offsets` represents a lookup to find the start of a block +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct StoreSpaceUsage { + data: ByteCount, + offsets: ByteCount, +} + +impl StoreSpaceUsage { + pub(crate) fn new(data: ByteCount, offsets: ByteCount) -> StoreSpaceUsage { + StoreSpaceUsage { data, offsets } + } + + /// Space usage for the data part of the store + pub fn data_usage(&self) -> ByteCount { + self.data + } + + /// Space usage for the offsets part of the store (doc ID -> offset) + pub fn offsets_usage(&self) -> ByteCount { + self.offsets + } + + /// Total space usage in bytes for this Store + pub fn total(&self) -> ByteCount { + self.data + self.offsets + } +} + +/// Represents space usage for all of the (field, index) pairs that appear in a CompositeFile. +/// +/// A field can appear with a single index (typically 0) or with multiple indexes. +/// Multiple indexes are used to handle variable length things, where +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct PerFieldSpaceUsage { + fields: HashMap, + total: ByteCount +} + +impl PerFieldSpaceUsage { + pub(crate) fn new(fields: HashMap) -> PerFieldSpaceUsage { + let total = fields.values().map(|x| x.total()).sum(); + PerFieldSpaceUsage { fields, total } + } + + /// Per field space usage + pub fn fields(&self) -> impl Iterator { + self.fields.iter() + } + + /// Bytes used by the represented file + pub fn total(&self) -> ByteCount { + self.total + } +} + +/// Represents space usage of a given field, breaking it down into the (field, index) pairs that +/// comprise it. +/// +/// See documentation for PerFieldSpaceUsage for slightly more information. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct FieldUsage { + field: Field, + num_bytes: ByteCount, + /// A field can be composed of more than one piece. + /// These pieces are indexed by arbitrary numbers starting at zero. + /// `self.num_bytes` includes all of `self.sub_num_bytes`. + sub_num_bytes: Vec>, +} + +impl FieldUsage { + pub(crate) fn empty(field: Field) -> FieldUsage { + FieldUsage { + field, + num_bytes: 0, + sub_num_bytes: Vec::new(), + } + } + + pub(crate) fn add_field_idx(&mut self, idx: usize, size: ByteCount) { + if self.sub_num_bytes.len() < idx + 1{ + self.sub_num_bytes.resize(idx + 1, None); + } + assert!(self.sub_num_bytes[idx].is_none()); + self.sub_num_bytes[idx] = Some(size); + self.num_bytes += size + } + + /// Field + pub fn field(&self) -> Field { + self.field + } + + /// Space usage for each index + pub fn sub_num_bytes(&self) -> &[Option] { + &self.sub_num_bytes[..] + } + + /// Total bytes used for this field in this context + pub fn total(&self) -> ByteCount { + self.num_bytes + } +} + +#[cfg(test)] +mod test { + use core::Index; + use schema::SchemaBuilder; + use schema::{FAST, INT_INDEXED, TEXT}; + use schema::Field; + use space_usage::ByteCount; + use space_usage::PerFieldSpaceUsage; + use schema::STORED; + use Term; + + #[test] + fn test_empty() { + let schema = SchemaBuilder::new().build(); + let index = Index::create_in_ram(schema.clone()); + + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let searcher_space_usage = searcher.space_usage(); + assert_eq!(0, searcher_space_usage.total()); + } + + fn expect_single_field(field_space: &PerFieldSpaceUsage, field: &Field, min_size: ByteCount, max_size: ByteCount) { + assert!(field_space.total() >= min_size); + assert!(field_space.total() <= max_size); + assert_eq!( + vec![(field, field_space.total())], + field_space.fields().map(|(x,y)| (x, y.total())).collect::>() + ); + } + + #[test] + fn test_fast_indexed() { + let mut schema_builder = SchemaBuilder::new(); + let name = schema_builder.add_u64_field("name", FAST | INT_INDEXED); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema.clone()); + + { + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + index_writer.add_document(doc!(name => 1u64)); + index_writer.add_document(doc!(name => 2u64)); + index_writer.add_document(doc!(name => 10u64)); + index_writer.add_document(doc!(name => 20u64)); + index_writer.commit().unwrap(); + } + + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let searcher_space_usage = searcher.space_usage(); + assert!(searcher_space_usage.total() > 0); + assert_eq!(1, searcher_space_usage.segments().len()); + + let segment = &searcher_space_usage.segments()[0]; + assert!(segment.total() > 0); + + assert_eq!(4, segment.num_docs()); + + expect_single_field(segment.termdict(), &name, 1, 512); + expect_single_field(segment.postings(), &name, 1, 512); + assert_eq!(0, segment.positions().total()); + assert_eq!(0, segment.positions_skip_idx().total()); + expect_single_field(segment.fast_fields(), &name, 1, 512); + expect_single_field(segment.fieldnorms(), &name, 1, 512); + // TODO: understand why the following fails +// assert_eq!(0, segment.store().total()); + assert_eq!(0, segment.deletes()); + } + + #[test] + fn test_text() { + let mut schema_builder = SchemaBuilder::new(); + let name = schema_builder.add_text_field("name", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema.clone()); + + { + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + index_writer.add_document(doc!(name => "hi")); + index_writer.add_document(doc!(name => "this is a test")); + index_writer.add_document(doc!(name => "some more documents with some word overlap with the other test")); + index_writer.add_document(doc!(name => "hello hi goodbye")); + index_writer.commit().unwrap(); + } + + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let searcher_space_usage = searcher.space_usage(); + assert!(searcher_space_usage.total() > 0); + assert_eq!(1, searcher_space_usage.segments().len()); + + let segment = &searcher_space_usage.segments()[0]; + assert!(segment.total() > 0); + + assert_eq!(4, segment.num_docs()); + + expect_single_field(segment.termdict(), &name, 1, 512); + expect_single_field(segment.postings(), &name, 1, 512); + expect_single_field(segment.positions(), &name, 1, 512); + expect_single_field(segment.positions_skip_idx(), &name, 1, 512); + assert_eq!(0, segment.fast_fields().total()); + expect_single_field(segment.fieldnorms(), &name, 1, 512); + // TODO: understand why the following fails +// assert_eq!(0, segment.store().total()); + assert_eq!(0, segment.deletes()); + } + + #[test] + fn test_store() { + let mut schema_builder = SchemaBuilder::new(); + let name = schema_builder.add_text_field("name", STORED); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema.clone()); + + { + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + index_writer.add_document(doc!(name => "hi")); + index_writer.add_document(doc!(name => "this is a test")); + index_writer.add_document(doc!(name => "some more documents with some word overlap with the other test")); + index_writer.add_document(doc!(name => "hello hi goodbye")); + index_writer.commit().unwrap(); + } + + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let searcher_space_usage = searcher.space_usage(); + assert!(searcher_space_usage.total() > 0); + assert_eq!(1, searcher_space_usage.segments().len()); + + let segment = &searcher_space_usage.segments()[0]; + assert!(segment.total() > 0); + + assert_eq!(4, segment.num_docs()); + + assert_eq!(0, segment.termdict().total()); + assert_eq!(0, segment.postings().total()); + assert_eq!(0, segment.positions().total()); + assert_eq!(0, segment.positions_skip_idx().total()); + assert_eq!(0, segment.fast_fields().total()); + assert_eq!(0, segment.fieldnorms().total()); + assert!(segment.store().total() > 0); + assert!(segment.store().total() < 512); + assert_eq!(0, segment.deletes()); + } + + #[test] + fn test_deletes() { + let mut schema_builder = SchemaBuilder::new(); + let name = schema_builder.add_u64_field("name", INT_INDEXED); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema.clone()); + + { + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + index_writer.add_document(doc!(name => 1u64)); + index_writer.add_document(doc!(name => 2u64)); + index_writer.add_document(doc!(name => 3u64)); + index_writer.add_document(doc!(name => 4u64)); + index_writer.commit().unwrap(); + } + + { + let mut index_writer2 = index.writer(50_000_000).unwrap(); + index_writer2.delete_term(Term::from_field_u64(name, 2u64)); + index_writer2.delete_term(Term::from_field_u64(name, 3u64)); + + // ok, now we should have a deleted doc + index_writer2.commit().unwrap(); + } + + index.load_searchers().unwrap(); + + let searcher = index.searcher(); + let searcher_space_usage = searcher.space_usage(); + assert!(searcher_space_usage.total() > 0); + assert_eq!(1, searcher_space_usage.segments().len()); + + let segment = &searcher_space_usage.segments()[0]; + assert!(segment.total() > 0); + + assert_eq!(2, segment.num_docs()); + + expect_single_field(segment.termdict(), &name, 1, 512); + expect_single_field(segment.postings(), &name, 1, 512); + assert_eq!(0, segment.positions().total()); + assert_eq!(0, segment.positions_skip_idx().total()); + assert_eq!(0, segment.fast_fields().total()); + expect_single_field(segment.fieldnorms(), &name, 1, 512); + // TODO: understand why the following fails +// assert_eq!(0, segment.store().total()); + assert!(segment.deletes() > 0); + } +} \ No newline at end of file diff --git a/src/store/reader.rs b/src/store/reader.rs index 428b013f0..e94705bb3 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -6,6 +6,7 @@ use common::BinarySerializable; use common::VInt; use directory::ReadOnlySource; use schema::Document; +use space_usage::StoreSpaceUsage; use std::cell::RefCell; use std::io; use std::mem::size_of; @@ -87,6 +88,11 @@ impl StoreReader { cursor = &cursor[..doc_length]; Ok(Document::deserialize(&mut cursor)?) } + + /// Summarize total space usage of this store reader. + pub fn space_usage(&self) -> StoreSpaceUsage { + StoreSpaceUsage::new(self.data.len(), self.offset_index_source.len()) + } } #[cfg_attr(