Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2025-12-23 02:29:57 +00:00)
fix clippy lints from 1.80-1.81 (#2488)
* fix some clippy lints
* fix clippy::doc_lazy_continuation
* fix some lints for 1.82
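Most of the hunks below are mechanical fixes for `clippy::doc_lazy_continuation`, which fires when a bullet in a doc comment wraps onto the next line without the indentation rustdoc expects for a continuation; the fix is to fold the continuation onto one line or indent it. A minimal before/after sketch (the struct names are made up for illustration):

```rust
/// Lazy continuation: the second line is parsed as a continuation of the
/// bullet without being indented, which the lint flags.
///
/// - a slice with the ordinal of the segments containing
/// the terms.
pub struct BeforeFix;

/// Fixed by folding the bullet onto a single line (indenting the second
/// line by two extra spaces would also satisfy the lint).
///
/// - a slice with the ordinal of the segments containing the terms.
pub struct AfterFix;
```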
@@ -368,9 +368,9 @@ mod test {
 for start_idx in 0u32..32u32 {
 output.resize(len, 0);
 bitunpacker.get_batch_u32s(start_idx, &buffer, &mut output);
-for i in 0..len {
+for (i, output_byte) in output.iter().enumerate() {
 let expected = (start_idx + i as u32) & mask;
-assert_eq!(output[i], expected);
+assert_eq!(*output_byte, expected);
 }
 }
 }
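The hunk above is the usual answer to clippy's complaint about index-based loops: iterate the slice directly and let `enumerate` supply the index. A standalone sketch of the same rewrite, outside of tantivy's test code:

```rust
fn xor_fold(values: &[u32]) -> u32 {
    // Indexed form clippy dislikes:
    //   for i in 0..values.len() { acc ^= values[i].wrapping_add(i as u32); }
    // Iterator form it suggests instead:
    let mut acc = 0;
    for (i, value) in values.iter().enumerate() {
        acc ^= value.wrapping_add(i as u32);
    }
    acc
}
```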
@@ -110,8 +110,8 @@ fn test_null_index(data: &[bool]) {
 .map(|(pos, _val)| pos as u32)
 .collect();
 let mut select_iter = null_index.select_cursor();
-for i in 0..orig_idx_with_value.len() {
-assert_eq!(select_iter.select(i as u32), orig_idx_with_value[i]);
+for (i, expected) in orig_idx_with_value.iter().enumerate() {
+assert_eq!(select_iter.select(i as u32), *expected);
 }

 let step_size = (orig_idx_with_value.len() / 100).max(1);
@@ -125,9 +125,8 @@ impl Line {
 /// Returns a line that attemps to approximate a function
 /// f: i in 0..[ys.num_vals()) -> ys[i].
 ///
-/// - The approximation is always lower than the actual value.
-/// Or more rigorously, formally `f(i).wrapping_sub(ys[i])` is small
-/// for any i in [0..ys.len()).
+/// - The approximation is always lower than the actual value. Or more rigorously, formally
+/// `f(i).wrapping_sub(ys[i])` is small for any i in [0..ys.len()).
 /// - It computes without panicking for any value of it.
 ///
 /// This function is only invariable by translation if all of the
@@ -64,10 +64,9 @@ impl From<ColumnType> for ColumnTypeCategory {
 /// resulting columnar. When a required column is a numerical column type, one of two things can
 /// happen:
 /// - If the required column type is compatible with all of the input columnar, the resulsting
-/// merged
-/// columnar will simply coerce the input column and use the required column type.
-/// - If the required column type is incompatible with one of the input columnar, the merged
-/// will fail with an InvalidData error.
+/// merged columnar will simply coerce the input column and use the required column type.
+/// - If the required column type is incompatible with one of the input columnar, the merged will
+/// fail with an InvalidData error.
 ///
 /// `merge_row_order` makes it possible to remove or reorder row in the resulting
 /// `Columnar` table.
@@ -35,8 +35,7 @@ impl<'a> Ord for HeapItem<'a> {
 ///
 /// The item yield is actually a pair with
 /// - the term
-/// - a slice with the ordinal of the segments containing
-/// the terms.
+/// - a slice with the ordinal of the segments containing the terms.
 pub struct TermMerger<'a> {
 heap: BinaryHeap<HeapItem<'a>>,
 current_streamers: Vec<HeapItem<'a>>,
@@ -109,6 +109,9 @@ where F: nom::Parser<I, (O, ErrorList), Infallible> {
 move |input: I| match f.parse(input) {
 Ok((input, (output, _err))) => Ok((input, output)),
 Err(Err::Incomplete(needed)) => Err(Err::Incomplete(needed)),
+// old versions don't understand this is uninhabited and need the empty match to help,
+// newer versions warn because this arm is unreachable (which it is indeed).
+#[allow(unreachable_patterns)]
 Err(Err::Error(val)) | Err(Err::Failure(val)) => match val {},
 }
 }
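The comment added above documents a pattern worth calling out: when the error type is uninhabited (`Infallible`), the error arm can never run, so the empty `match val {}` convinces older compilers the case is covered while `#[allow(unreachable_patterns)]` silences newer ones. A small sketch of the same idea without nom:

```rust
use std::convert::Infallible;

fn unwrap_infallible<T>(res: Result<T, Infallible>) -> T {
    match res {
        Ok(value) => value,
        // `Infallible` has no values, so this arm can never execute; the empty
        // match proves it to the compiler, and the allow quiets the
        // "unreachable pattern" warning on newer toolchains.
        #[allow(unreachable_patterns)]
        Err(never) => match never {},
    }
}
```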
@@ -102,10 +102,8 @@ fn retry_policy(is_blocking: bool) -> RetryPolicy {
 ///
 /// There are currently two implementations of `Directory`
 ///
-/// - The [`MMapDirectory`][crate::directory::MmapDirectory], this
-/// should be your default choice.
-/// - The [`RamDirectory`][crate::directory::RamDirectory], which
-/// should be used mostly for tests.
+/// - The [`MMapDirectory`][crate::directory::MmapDirectory], this should be your default choice.
+/// - The [`RamDirectory`][crate::directory::RamDirectory], which should be used mostly for tests.
 pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
 /// Opens a file and returns a boxed `FileHandle`.
 ///
@@ -25,10 +25,9 @@ impl FacetReader {
 /// Creates a new `FacetReader`.
 ///
 /// A facet reader just wraps :
-/// - a `MultiValuedFastFieldReader` that makes it possible to
-/// access the list of facet ords for a given document.
-/// - a `TermDictionary` that helps associating a facet to
-/// an ordinal and vice versa.
+/// - a `MultiValuedFastFieldReader` that makes it possible to access the list of facet ords for
+/// a given document.
+/// - a `TermDictionary` that helps associating a facet to an ordinal and vice versa.
 pub fn new(facet_column: StrColumn) -> FacetReader {
 FacetReader { facet_column }
 }
@@ -11,8 +11,8 @@ use crate::TantivyError;
 /// progress. Dropping the `FutureResult` does not cancel the task being executed
 /// either.
 ///
-/// - In a sync context, you can call `FutureResult::wait()`. The function
-/// does not rely on `block_on`.
+/// - In a sync context, you can call `FutureResult::wait()`. The function does not rely on
+/// `block_on`.
 /// - In an async context, you can call simply use `FutureResult` as a future.
 pub struct FutureResult<T> {
 inner: Inner<T>,
@@ -49,10 +49,8 @@ fn load_metas(
 /// Save the index meta file.
 /// This operation is atomic :
 /// Either
-/// - it fails, in which case an error is returned,
-/// and the `meta.json` remains untouched,
-/// - it succeeds, and `meta.json` is written
-/// and flushed.
+/// - it fails, in which case an error is returned, and the `meta.json` remains untouched,
+/// - it succeeds, and `meta.json` is written and flushed.
 ///
 /// This method is not part of tantivy's public API
 fn save_new_metas(
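The doc comment being reflowed here describes an all-or-nothing property: either `meta.json` is fully written and flushed, or it is left untouched. The classic way to get that behaviour is to write a temporary file, flush it, then rename it over the destination; the sketch below only illustrates the property and is not tantivy's `save_new_metas` implementation.

```rust
use std::fs;
use std::io::Write;
use std::path::Path;

// Illustration only: write to a temp file, flush, then atomically rename.
fn save_atomically(dir: &Path, name: &str, bytes: &[u8]) -> std::io::Result<()> {
    let tmp_path = dir.join(format!("{name}.tmp"));
    let mut tmp = fs::File::create(&tmp_path)?;
    tmp.write_all(bytes)?;
    tmp.sync_all()?; // make sure the data hit the disk before the rename
    fs::rename(&tmp_path, dir.join(name))?; // atomic on POSIX filesystems
    Ok(())
}
```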
@@ -529,12 +527,12 @@ impl Index {
 /// `IndexWriter` on the system is accessing the index directory,
 /// it is safe to manually delete the lockfile.
 ///
-/// - `num_threads` defines the number of indexing workers that
-/// should work at the same time.
+/// - `num_threads` defines the number of indexing workers that should work at the same time.
 ///
-/// - `overall_memory_budget_in_bytes` sets the amount of memory
-/// allocated for all indexing thread.
-/// Each thread will receive a budget of `overall_memory_budget_in_bytes / num_threads`.
+/// - `overall_memory_budget_in_bytes` sets the amount of memory allocated for all indexing
+/// thread.
+///
+/// Each thread will receive a budget of `overall_memory_budget_in_bytes / num_threads`.
 ///
 /// # Errors
 /// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IoError`.
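The reflowed bullet spells out the budgeting rule: each indexing thread gets `overall_memory_budget_in_bytes / num_threads`. A usage sketch of that split, assuming the current `writer_with_num_threads` signature (exact signatures vary between tantivy versions):

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::{Index, IndexWriter};

fn open_writer() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    // 4 workers sharing a 400 MB budget: each thread ends up with
    // 400_000_000 / 4 = 100 MB, per the doc comment above.
    let _writer: IndexWriter = index.writer_with_num_threads(4, 400_000_000)?;
    Ok(())
}
```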
@@ -179,8 +179,7 @@ impl DeleteCursor {
 /// Skips operations and position it so that
 /// - either all of the delete operation currently in the queue are consume and the next get
 /// will return `None`.
-/// - the next get will return the first operation with an
-/// `opstamp >= target_opstamp`.
+/// - the next get will return the first operation with an `opstamp >= target_opstamp`.
 pub fn skip_to(&mut self, target_opstamp: Opstamp) {
 // TODO Can be optimize as we work with block.
 while self.is_behind_opstamp(target_opstamp) {
@@ -29,8 +29,8 @@ impl MergeOperationInventory {

 /// A `MergeOperation` has two roles.
 /// It carries all of the information required to describe a merge:
-/// - `target_opstamp` is the opstamp up to which we want to consume the
-/// delete queue and reflect their deletes.
+/// - `target_opstamp` is the opstamp up to which we want to consume the delete queue and reflect
+/// their deletes.
 /// - `segment_ids` is the list of segment to be merged.
 ///
 /// The second role is to ensure keep track of the fact that these
@@ -10,12 +10,9 @@ use crate::indexer::delete_queue::DeleteCursor;
 ///
 /// In addition to segment `meta`,
 /// it contains a few transient states
-/// - `alive_bitset` is a bitset describing
-/// documents that were alive during the commit
-/// itself.
-/// - `delete_cursor` is the position in the delete queue.
-/// Deletes happening before the cursor are reflected either
-/// in the .del file or in the `alive_bitset`.
+/// - `alive_bitset` is a bitset describing documents that were alive during the commit itself.
+/// - `delete_cursor` is the position in the delete queue. Deletes happening before the cursor are
+/// reflected either in the .del file or in the `alive_bitset`.
 #[derive(Clone)]
 pub struct SegmentEntry {
 meta: SegmentMeta,
@@ -30,10 +30,8 @@ const NUM_MERGE_THREADS: usize = 4;
 /// Save the index meta file.
 /// This operation is atomic:
 /// Either
-/// - it fails, in which case an error is returned,
-/// and the `meta.json` remains untouched,
-/// - it success, and `meta.json` is written
-/// and flushed.
+/// - it fails, in which case an error is returned, and the `meta.json` remains untouched,
+/// - it success, and `meta.json` is written and flushed.
 ///
 /// This method is not part of tantivy's public API
 pub(crate) fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()> {
@@ -125,8 +125,8 @@
 //!
 //! - **Searching**: [Searcher] searches the segments with anything that implements
 //! [Query](query::Query) and merges the results. The list of [supported
-//! queries](query::Query#implementors). Custom Queries are supported by implementing the
-//! [Query](query::Query) trait.
+//! queries](query::Query#implementors). Custom Queries are supported by implementing the
+//! [Query](query::Query) trait.
 //!
 //! - **[Directory](directory)**: Abstraction over the storage where the index data is stored.
 //!
@@ -18,7 +18,7 @@ use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
 /// # Assumption
 ///
 /// - The block is sorted. Some elements may appear several times. This is the case at the
-/// end of the last block for instance.
+/// end of the last block for instance.
 /// - The target is assumed smaller or equal to the last element of the block.
 pub fn branchless_binary_search(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize {
 let mut start = 0;
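For context, `branchless_binary_search` relies on exactly the two assumptions in this doc comment: the block is sorted, and the target is never greater than the last element, so the probe never falls off the end. The sketch below is a plain halving search written so the compiler can lower the inner `if` to a conditional move; it is an illustration of the contract, not tantivy's implementation, and the block size 128 is only assumed here.

```rust
// Returns the first index whose element is >= target, assuming `block` is
// sorted and `target <= block[127]`.
fn first_at_or_above(block: &[u32; 128], target: u32) -> usize {
    let mut start = 0usize;
    let mut len = block.len();
    while len > 1 {
        let half = len / 2;
        // No early exit: each round shrinks the window by `half`, and the
        // comparison below typically compiles to a branchless select.
        if block[start + half - 1] < target {
            start += half;
        }
        len -= half;
    }
    start
}
```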
@@ -5,15 +5,10 @@ use crate::schema::{IndexRecordOption, Term};
 /// The boolean query returns a set of documents
 /// that matches the Boolean combination of constituent subqueries.
 ///
-/// The documents matched by the boolean query are
-/// those which
-/// * match all of the sub queries associated with the
-/// `Must` occurrence
-/// * match none of the sub queries associated with the
-/// `MustNot` occurrence.
-/// * match at least one of the sub queries associated
-/// with the `Must` or `Should` occurrence.
-///
+/// The documents matched by the boolean query are those which
+/// - match all of the sub queries associated with the `Must` occurrence
+/// - match none of the sub queries associated with the `MustNot` occurrence.
+/// - match at least one of the sub queries associated with the `Must` or `Should` occurrence.
 ///
 /// You can combine other query types and their `Occur`ances into one `BooleanQuery`
 ///
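Since the reflowed list above is the whole semantic contract of `BooleanQuery` (Must, MustNot, Should), a small usage sketch may help; the field name and terms are made up, and the constructor shapes are those of recent tantivy versions:

```rust
use tantivy::query::{BooleanQuery, Occur, Query, TermQuery};
use tantivy::schema::{IndexRecordOption, Schema, Term};

// Documents must contain "winter" and must not contain "summer".
fn winter_not_summer(schema: &Schema) -> BooleanQuery {
    let body = schema.get_field("body").unwrap();
    let must: Box<dyn Query> = Box::new(TermQuery::new(
        Term::from_field_text(body, "winter"),
        IndexRecordOption::Basic,
    ));
    let must_not: Box<dyn Query> = Box::new(TermQuery::new(
        Term::from_field_text(body, "summer"),
        IndexRecordOption::Basic,
    ));
    BooleanQuery::new(vec![(Occur::Must, must), (Occur::MustNot, must_not)])
}
```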
@@ -6,6 +6,9 @@ use crate::query::phrase_query::{intersection_count, PhraseScorer};
 use crate::query::Scorer;
 use crate::{DocId, Score};

+// MultiPrefix is the larger variant, and also the one we expect most often. PhraseScorer is > 1kB
+// though, it would be interesting to slim it down if possible.
+#[allow(clippy::large_enum_variant)]
 enum PhraseKind<TPostings: Postings> {
 SinglePrefix {
 position_offset: u32,
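The allow added above is the usual trade-off around `clippy::large_enum_variant`: every value of an enum is as large as its biggest variant, so clippy suggests boxing it, but here the big variant is the common one and the indirection is not worth it. A generic sketch of the lint and the two options:

```rust
// clippy::large_enum_variant fires here: every Message is ~1 kB because of
// the Payload variant, even when it is just a Ping.
#[allow(dead_code)]
enum Message {
    Ping,
    Payload([u8; 1024]),
}

// Option 1: box the large variant so the enum itself stays small.
// Option 2 (what the hunk above does): keep it inline and allow the lint,
// accepting the size because the large variant is the expected case.
#[allow(dead_code)]
enum BoxedMessage {
    Ping,
    Payload(Box<[u8; 1024]>),
}
```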
@@ -219,8 +219,8 @@ fn intersection_exists_with_slop(
 /// In contrast to the regular algorithm this solves some issues:
 /// - Keep track of the slop so far. Slop is a budget that is spent on the distance between terms.
 /// - When encountering a match between two positions, which position is the best match is unclear
-/// and depends on intersections afterwards, therefore this algorithm keeps left and right as
-/// matches, but only counts one.
+/// and depends on intersections afterwards, therefore this algorithm keeps left and right as
+/// matches, but only counts one.
 ///
 /// This algorithm may return an incorrect count in some cases (e.g. left, right expansion and is
 /// then matches both on the following term.)
@@ -115,10 +115,10 @@ impl<'a> EnableScoring<'a> {
 ///
 /// So to sum it up :
 /// - a `Query` is a recipe to define a set of documents as well the way to score them.
-/// - a [`Weight`] is this recipe tied to a specific [`Searcher`]. It may for instance
-/// hold statistics about the different term of the query. It is created by the query.
-/// - a [`Scorer`] is a cursor over the set of matching documents, for a specific
-/// [`SegmentReader`]. It is created by the [`Weight`].
+/// - a [`Weight`] is this recipe tied to a specific [`Searcher`]. It may for instance hold
+/// statistics about the different term of the query. It is created by the query.
+/// - a [`Scorer`] is a cursor over the set of matching documents, for a specific [`SegmentReader`].
+/// It is created by the [`Weight`].
 ///
 /// When implementing a new type of `Query`, it is normal to implement a
 /// dedicated `Query`, [`Weight`] and [`Scorer`].
@@ -49,10 +49,10 @@ pub(crate) struct RangeDocSet<T> {
 ///
 /// There are two patterns.
 /// - We do a full scan. => We can load large chunks. We don't know in advance if seek call
-/// will come, so we start with small chunks
+/// will come, so we start with small chunks
 /// - We load docs, interspersed with seek calls. When there are big jumps in the seek, we
-/// should load small chunks. When the seeks are small, we can employ the same strategy as on a
-/// full scan.
+/// should load small chunks. When the seeks are small, we can employ the same strategy as on
+/// a full scan.
 fetch_horizon: u32,
 /// Current batch of loaded docs.
 loaded_docs: VecCursor,
@@ -169,7 +169,7 @@ impl Facet {

 /// Extract path from the `Facet`.
 pub fn to_path(&self) -> Vec<&str> {
-self.encoded_str().split(|c| c == FACET_SEP_CHAR).collect()
+self.encoded_str().split(FACET_SEP_CHAR).collect()
 }

 /// This function is the inverse of Facet::from(&str).
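The `to_path` change is another small clippy suggestion: `str::split` takes any `Pattern`, so a bare `char` does the same work as a closure comparing against it. A standalone illustration (the separator below is a stand-in, not tantivy's actual `FACET_SEP_CHAR` value):

```rust
const SEP: char = ':';

fn split_both_ways(encoded: &str) {
    // Closure pattern, flagged by clippy as redundant:
    let with_closure: Vec<&str> = encoded.split(|c| c == SEP).collect();
    // Equivalent char pattern, which the hunk switches to:
    let with_char: Vec<&str> = encoded.split(SEP).collect();
    assert_eq!(with_closure, with_char);
}
```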
@@ -12,8 +12,7 @@ use crate::schema::{
 ///
 /// It consists of
 /// - a field name
-/// - a field type, itself wrapping up options describing
-/// how the field should be indexed.
+/// - a field type, itself wrapping up options describing how the field should be indexed.
 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
 pub struct FieldEntry {
 name: String,
@@ -639,12 +639,11 @@ mod tests {
 /// <field> + <type byte> + <value len>
 ///
 /// - <field> is a big endian encoded u32 field id
-/// - <type_byte>'s most significant bit expresses whether the term is a json term or not
-/// The remaining 7 bits are used to encode the type of the value.
-/// If this is a JSON term, the type is the type of the leaf of the json.
-///
+/// - <type_byte>'s most significant bit expresses whether the term is a json term or not The
+/// remaining 7 bits are used to encode the type of the value. If this is a JSON term, the
+/// type is the type of the leaf of the json.
 /// - <value> is, if this is not the json term, a binary representation specific to the type.
-/// If it is a JSON Term, then it is prepended with the path that leads to this leaf value.
+/// If it is a JSON Term, then it is prepended with the path that leads to this leaf value.
 const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8;

 #[test]
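The constant in the hunk just encodes the layout the doc comment describes: a 4-byte big-endian field id, one type byte, then an 8-byte fast value, i.e. 4 + 1 + 8 = 13 bytes. A sketch of assembling that layout (the helper name and the exact value encoding are illustrative, not tantivy's internals):

```rust
fn fast_value_term_bytes(field_id: u32, type_byte: u8, value: u64) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(4 + 1 + 8);
    bytes.extend_from_slice(&field_id.to_be_bytes()); // <field>, big endian u32
    bytes.push(type_byte);                            // <type byte>
    bytes.extend_from_slice(&value.to_be_bytes());    // <value>, 8 bytes here
    debug_assert_eq!(bytes.len(), 13);                // FAST_VALUE_TERM_LEN
    bytes
}
```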
@@ -189,8 +189,8 @@ impl TokenizerName {
 ///
 /// It defines
 /// - The amount of information that should be stored about the presence of a term in a document.
-/// Essentially, should we store the term frequency and/or the positions (See
-/// [`IndexRecordOption`]).
+/// Essentially, should we store the term frequency and/or the positions (See
+/// [`IndexRecordOption`]).
 /// - The name of the `Tokenizer` that should be used to process the field.
 /// - Flag indicating, if fieldnorms should be stored (See [fieldnorm](crate::fieldnorm)). Defaults
 /// to `true`.
@@ -25,8 +25,8 @@
 //! Most users should not access the `StoreReader` directly
 //! and should rely on either
 //!
-//! - at the segment level, the
-//! [`SegmentReader`'s `doc` method](../struct.SegmentReader.html#method.doc)
+//! - at the segment level, the [`SegmentReader`'s `doc`
+//! method](../struct.SegmentReader.html#method.doc)
 //! - at the index level, the [`Searcher::doc()`](crate::Searcher::doc) method

 mod compressors;
@@ -11,8 +11,7 @@ use crate::termdict::{TermOrdinal, TermStreamer};
 ///
 /// The item yielded is actually a pair with
 /// - the term
-/// - a slice with the ordinal of the segments containing
-/// the term.
+/// - a slice with the ordinal of the segments containing the term.
 pub struct TermMerger<'a> {
 dictionaries: Vec<&'a TermDictionary>,
 union: Union<'a>,
@@ -34,8 +34,7 @@ impl<'a> Ord for HeapItem<'a> {
 ///
 /// The item yield is actually a pair with
 /// - the term
-/// - a slice with the ordinal of the segments containing
-/// the terms.
+/// - a slice with the ordinal of the segments containing the terms.
 pub struct TermMerger<'a> {
 heap: BinaryHeap<HeapItem<'a>>,
 current_streamers: Vec<HeapItem<'a>>,
@@ -12,14 +12,12 @@ use crate::tokenizer::{
 ///
 /// By default, it is populated with the following managers.
 ///
-/// * `raw` : does not process nor tokenize the text.
-/// * `default` : Chops the text on according to whitespace and
-/// punctuation, removes tokens that are too long, and lowercases
-/// tokens
-/// * `en_stem` : Like `default`, but also applies stemming on the
-/// resulting tokens. Stemming can improve the recall of your
-/// search engine.
-/// * `whitespace` : Splits the text on whitespaces.
+/// - `raw` : does not process nor tokenize the text.
+/// - `default` : Chops the text on according to whitespace and punctuation, removes tokens that are
+/// too long, and lowercases tokens.
+/// - `en_stem` : Like `default`, but also applies stemming on the resulting tokens. Stemming can
+/// improve the recall of your search engine.
+/// - `whitespace` : Splits the text on whitespaces.
 #[derive(Clone)]
 pub struct TokenizerManager {
 tokenizers: Arc<RwLock<HashMap<String, TextAnalyzer>>>,
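As a quick usage sketch of the registry described above (assuming the current `TokenizerManager` API, where tokenizers are looked up by name):

```rust
use tantivy::tokenizer::TokenizerManager;

fn lookup_tokenizers() {
    let manager = TokenizerManager::default();
    // The built-in entries listed above are registered under these names.
    assert!(manager.get("en_stem").is_some());
    assert!(manager.get("whitespace").is_some());
    // Unknown names simply return None.
    assert!(manager.get("no_such_tokenizer").is_none());
}
```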
@@ -12,13 +12,13 @@
 //! # Limitations
 //!
 //! - Your object shall not implement `Drop`.
-//! - `Addr` to the `Arena` are 32-bits. The maximum capacity of the arena
-//! is 4GB. *(Tantivy's indexer uses one arena per indexing thread.)*
-//! - The arena only works for objects much smaller than `1MB`.
-//! Allocating more than `1MB` at a time will result in a panic,
-//! and allocating a lot of large object (> 500KB) will result in a fragmentation.
-//! - Your objects are store in an unaligned fashion. For this reason,
-//! the API does not let you access them as references.
+//! - `Addr` to the `Arena` are 32-bits. The maximum capacity of the arena is 4GB. *(Tantivy's
+//! indexer uses one arena per indexing thread.)*
+//! - The arena only works for objects much smaller than `1MB`. Allocating more than `1MB` at a
+//! time will result in a panic, and allocating a lot of large object (> 500KB) will result in a
+//! fragmentation.
+//! - Your objects are store in an unaligned fashion. For this reason, the API does not let you
+//! access them as references.
 //!
 //! Instead, you store and access your data via `.write(...)` and `.read(...)`, which under the hood
 //! stores your object using `ptr::write_unaligned` and `ptr::read_unaligned`.
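The last bullet and the paragraph after it are the crux of the arena doc: objects live unaligned inside a byte buffer, so the API only hands out copies obtained through `ptr::read_unaligned`, never references. A minimal sketch of that mechanism over a plain byte buffer (not the arena's actual API):

```rust
use std::mem::size_of;
use std::ptr;

fn write_u64_at(buf: &mut [u8], offset: usize, value: u64) {
    assert!(offset + size_of::<u64>() <= buf.len());
    // Copies the bytes of `value` into the buffer with no alignment requirement.
    unsafe { ptr::write_unaligned(buf.as_mut_ptr().add(offset) as *mut u64, value) };
}

fn read_u64_at(buf: &[u8], offset: usize) -> u64 {
    assert!(offset + size_of::<u64>() <= buf.len());
    // Returns a copy; handing out `&u64` here would be UB if the address is unaligned.
    unsafe { ptr::read_unaligned(buf.as_ptr().add(offset) as *const u64) }
}
```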