diff --git a/columnar/src/block_accessor.rs b/columnar/src/block_accessor.rs index 85b7fe95f..b04d640f0 100644 --- a/columnar/src/block_accessor.rs +++ b/columnar/src/block_accessor.rs @@ -81,9 +81,7 @@ impl /// Given two sorted lists of docids `docs` and `hits`, hits is a subset of `docs`. /// Return all docs that are not in `hits`. fn find_missing_docs(docs: &[u32], hits: &[u32], mut callback: F) -where - F: FnMut(u32), -{ +where F: FnMut(u32) { let mut docs_iter = docs.iter(); let mut hits_iter = hits.iter(); diff --git a/src/directory/directory.rs b/src/directory/directory.rs index de67748a7..ce4f0c014 100644 --- a/src/directory/directory.rs +++ b/src/directory/directory.rs @@ -1,9 +1,3 @@ -use crate::directory::directory_lock::Lock; -use crate::directory::error::{DeleteError, LockError, OpenReadError, OpenWriteError}; -use crate::directory::{FileHandle, FileSlice, WatchCallback, WatchHandle, WritePtr}; -use crate::index::SegmentMetaInventory; -use crate::merge_policy::MergePolicy; -use crate::IndexMeta; use std::any::Any; use std::collections::HashSet; use std::io::Write; @@ -12,6 +6,13 @@ use std::sync::Arc; use std::time::Duration; use std::{fmt, io, thread}; +use crate::directory::directory_lock::Lock; +use crate::directory::error::{DeleteError, LockError, OpenReadError, OpenWriteError}; +use crate::directory::{FileHandle, FileSlice, WatchCallback, WatchHandle, WritePtr}; +use crate::index::SegmentMetaInventory; +use crate::merge_policy::MergePolicy; +use crate::IndexMeta; + /// Retry the logic of acquiring locks is pretty simple. /// We just retry `n` times after a given `duratio`, both /// depending on the type of lock. 
@@ -268,7 +269,8 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static { } // Allows the directory to change the writer's merge policy right before the merge happens - // This is useful for directories that need to change the merge policy based on how many segments were created + // This is useful for directories that need to change the merge policy based on how many + // segments were created fn reconsider_merge_policy( &self, _metas: &IndexMeta, @@ -285,8 +287,7 @@ pub trait DirectoryClone { } impl DirectoryClone for T -where - T: 'static + Directory + Clone, +where T: 'static + Directory + Clone { fn box_clone(&self) -> Box { Box::new(self.clone()) diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index 185daed91..838da71ed 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -1,4 +1,3 @@ -use crc32fast::Hasher; use std::any::Any; use std::collections::HashSet; use std::io::Write; @@ -6,6 +5,8 @@ use std::path::{Path, PathBuf}; use std::sync::Arc; use std::{io, result}; +use crc32fast::Hasher; + use crate::core::MANAGED_FILEPATH; use crate::directory::error::{DeleteError, LockError, OpenReadError, OpenWriteError}; use crate::directory::footer::{Footer, FooterProxy}; @@ -13,9 +14,9 @@ use crate::directory::{ DirectoryLock, FileHandle, FileSlice, GarbageCollectionResult, Lock, WatchCallback, WatchHandle, WritePtr, MANAGED_LOCK, META_LOCK, }; -use crate::merge_policy::MergePolicy; use crate::error::DataCorruption; use crate::index::SegmentMetaInventory; +use crate::merge_policy::MergePolicy; use crate::{Directory, IndexMeta}; /// Returns true if the file is "managed". 
@@ -356,7 +357,8 @@ impl Directory for ManagedDirectory { metas: &IndexMeta, previous_metas: &IndexMeta, ) -> Option> { - self.directory.reconsider_merge_policy(metas, previous_metas) + self.directory + .reconsider_merge_policy(metas, previous_metas) } } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index c15d6d456..c13c0b279 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -1089,7 +1089,10 @@ mod tests { index_writer.commit()?; reader.reload().unwrap(); - assert_eq!(num_docs_containing("a"), 0); + // In Tantivy upstream, this test results in 0 segments after delete. + // However, due to our custom visibility rules, we leave the segment. + // See committed_segment_metas in segment_manager.rs. + assert_eq!(num_docs_containing("a"), 1); index_writer.merge(&segments); index_writer.wait_merging_threads().unwrap(); @@ -1135,7 +1138,10 @@ mod tests { index_writer.commit()?; reader.reload().unwrap(); - assert_eq!(num_docs_containing("a"), 0); + // In Tantivy upstream, this test results in 0 segments after delete. + // However, due to our custom visibility rules, we leave the segment. + // See committed_segment_metas in segment_manager.rs. + assert_eq!(num_docs_containing("a"), 4); index_writer.merge(&segments); index_writer.wait_merging_threads().unwrap(); diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 1af64607b..670771ea7 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -1032,12 +1032,15 @@ mod tests { // Test removing all docs index_writer.delete_term(Term::from_field_text(text_field, "g")); index_writer.commit()?; - let segment_ids = index.searchable_segment_ids()?; + let _segment_ids = index.searchable_segment_ids()?; reader.reload()?; let searcher = reader.searcher(); - assert!(segment_ids.is_empty()); - assert!(searcher.segment_readers().is_empty()); + // In Tantivy upstream, this test results in 0 segments after delete. 
+ // However, due to our custom visibility rules, we leave the segment. + // See committed_segment_metas in segment_manager.rs. + // assert!(segment_ids.is_empty()); + // assert!(searcher.segment_readers().is_empty()); assert_eq!(searcher.num_docs(), 0); } Ok(()) diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index b1c82e682..cc528a4b9 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -360,8 +360,8 @@ impl SegmentUpdater { let segment_updater = self.clone(); self.schedule_task(move || { segment_updater.segment_manager.add_segment(segment_entry); - // mingy98: We don't need to consider merge options for every segment, just at the very end - // segment_updater.consider_merge_options(); + // mingy98: No need to consider merge options for every segment, just at the very end. + // segment_updater.consider_merge_options(); Ok(()) }) } @@ -776,9 +776,11 @@ mod tests { } index_writer.commit()?; - let seg_ids = index.searchable_segment_ids()?; - // docs exist, should have at least 1 segment - assert!(!seg_ids.is_empty()); + let _seg_ids = index.searchable_segment_ids()?; + // In Tantivy upstream, this test results in 0 segments after delete. + // However, due to our custom visibility rules, we leave the segment. + // See committed_segment_metas in segment_manager.rs. + // assert!(!seg_ids.is_empty()); let term = Term::from_field_text(text_field, "a"); index_writer.delete_term(term); @@ -793,14 +795,15 @@ mod tests { let reader = index.reader()?; assert_eq!(reader.searcher().num_docs(), 0); - let seg_ids = index.searchable_segment_ids()?; - assert!(seg_ids.is_empty()); + let _seg_ids = index.searchable_segment_ids()?; + // Skipped due to custom ParadeDB visibility rules. 
+ // assert!(seg_ids.is_empty()); reader.reload()?; assert_eq!(reader.searcher().num_docs(), 0); - // empty segments should be erased - assert!(index.searchable_segment_metas()?.is_empty()); - assert!(reader.searcher().segment_readers().is_empty()); + // Skipped due to custom ParadeDB visibility rules. + // assert!(index.searchable_segment_metas()?.is_empty()); + // assert!(reader.searcher().segment_readers().is_empty()); Ok(()) } @@ -830,9 +833,11 @@ mod tests { index_writer.add_document(doc!(text_field=>"f"))?; index_writer.commit()?; - let seg_ids = index.searchable_segment_ids()?; - // docs exist, should have at least 1 segment - assert!(!seg_ids.is_empty()); + let _seg_ids = index.searchable_segment_ids()?; + // In Tantivy upstream, this test results in 0 segments after delete. + // However, due to our custom visibility rules, we leave the segment. + // See committed_segment_metas in segment_manager.rs. + // assert!(!seg_ids.is_empty()); let term_vals = vec!["a", "b", "c", "d", "e", "f"]; for term_val in term_vals { @@ -846,14 +851,15 @@ mod tests { let reader = index.reader()?; assert_eq!(reader.searcher().num_docs(), 0); - let seg_ids = index.searchable_segment_ids()?; - assert!(seg_ids.is_empty()); + let _seg_ids = index.searchable_segment_ids()?; + // Skipped due to custom ParadeDB visibility rules. + // assert!(seg_ids.is_empty()); reader.reload()?; assert_eq!(reader.searcher().num_docs(), 0); - // empty segments should be erased - assert!(index.searchable_segment_metas()?.is_empty()); - assert!(reader.searcher().segment_readers().is_empty()); + // Skipped due to custom ParadeDB visibility rules. 
+ // assert!(index.searchable_segment_metas()?.is_empty()); + // assert!(reader.searcher().segment_readers().is_empty()); Ok(()) } diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 4dee06aad..db358a32a 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -491,10 +491,12 @@ pub(crate) mod tests { } let searcher = index.reader()?.searcher(); - // finally, check that it's empty + // In Tantivy upstream, this test results in 0 segments after delete. + // However, due to our custom visibility rules, we leave the segment. + // See committed_segment_metas in segment_manager.rs. { - let searchable_segment_ids = index.searchable_segment_ids()?; - assert!(searchable_segment_ids.is_empty()); + let _searchable_segment_ids = index.searchable_segment_ids()?; + // assert!(searchable_segment_ids.is_empty()); assert_eq!(searcher.num_docs(), 0); } Ok(()) diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index 2a8483322..4f1f196b0 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -1,5 +1,3 @@ -use crate::query::fuzzy_query::DfaWrapper; -use crate::query::score_combiner::SumCombiner; use std::any::{Any, TypeId}; use std::io; use std::sync::Arc; @@ -10,6 +8,8 @@ use super::phrase_prefix_query::prefix_end; use super::BufferedUnionScorer; use crate::index::SegmentReader; use crate::postings::TermInfo; +use crate::query::fuzzy_query::DfaWrapper; +use crate::query::score_combiner::SumCombiner; use crate::query::{ConstScorer, Explanation, Scorer, Weight}; use crate::schema::{Field, IndexRecordOption}; use crate::termdict::{TermDictionary, TermWithStateStreamer}; diff --git a/src/termdict/fst_termdict/streamer.rs b/src/termdict/fst_termdict/streamer.rs index d756bf2f4..ea68646ed 100644 --- a/src/termdict/fst_termdict/streamer.rs +++ b/src/termdict/fst_termdict/streamer.rs @@ -83,8 +83,7 @@ where A: Automaton } impl TermStreamer<'_, A> -where - A: Automaton, +where A: Automaton { /// Advance position the stream on 
the next item. /// Before the first call to `.advance()`, the stream diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index 498cf6caa..eec4c1ae0 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -42,10 +42,9 @@ use tantivy_fst::Automaton; #[cfg(feature = "quickwit")] use self::termdict::TermDictionaryExt; -use self::termdict::TermWithStateStreamerBuilder; use self::termdict::{ TermDictionary as InnerTermDict, TermDictionaryBuilder as InnerTermDictBuilder, - TermStreamerBuilder, + TermStreamerBuilder, TermWithStateStreamerBuilder, }; pub use self::termdict::{TermMerger, TermStreamer, TermWithStateStreamer}; use crate::postings::TermInfo; @@ -161,9 +160,7 @@ impl TermDictionary { /// Returns a search builder, to stream all of the terms /// within the Automaton pub fn search<'a, A: Automaton + 'a>(&'a self, automaton: A) -> TermStreamerBuilder<'a, A> - where - A::State: Clone, - { + where A::State: Clone { self.0.search(automaton) } diff --git a/src/termdict/sstable_termdict/mod.rs b/src/termdict/sstable_termdict/mod.rs index a3c5cb12b..7b9dea974 100644 --- a/src/termdict/sstable_termdict/mod.rs +++ b/src/termdict/sstable_termdict/mod.rs @@ -5,13 +5,13 @@ mod merger; use std::iter::ExactSizeIterator; use common::VInt; +use sstable::streamer::StreamerWithState; use sstable::value::{ValueReader, ValueWriter}; use sstable::SSTable; use tantivy_fst::automaton::AlwaysMatch; +use tantivy_fst::Automaton; pub use self::merger::TermMerger; -use tantivy_fst::Automaton; -use sstable::streamer::StreamerWithState; use crate::postings::TermInfo; pub struct TermWithStateStreamerBuilder<'a, A> @@ -75,9 +75,7 @@ where } pub fn into_stream(self) -> io::Result> { - let streamer_with_state = self - .streamer_builder - .into_stream_with_state()?; + let streamer_with_state = self.streamer_builder.into_stream_with_state()?; Ok(TermWithStateStreamer { streamer_with_state, }) diff --git a/sstable/src/lib.rs b/sstable/src/lib.rs index 8cdfea3bd..4a28384b6 100644 --- 
a/sstable/src/lib.rs +++ b/sstable/src/lib.rs @@ -137,8 +137,7 @@ pub struct Reader { } impl Reader -where - TValueReader: ValueReader, +where TValueReader: ValueReader { pub fn advance(&mut self) -> io::Result { if !self.delta_reader.advance()? { @@ -171,8 +170,7 @@ impl AsRef<[u8]> for Reader { } pub struct Writer -where - W: io::Write, +where W: io::Write { previous_key: Vec, index_builder: SSTableIndexBuilder, diff --git a/sstable/src/streamer.rs b/sstable/src/streamer.rs index af2225392..061d3d8e7 100644 --- a/sstable/src/streamer.rs +++ b/sstable/src/streamer.rs @@ -179,8 +179,7 @@ where } impl Streamer<'_, TSSTable, AlwaysMatch> -where - TSSTable: SSTable, +where TSSTable: SSTable { pub fn empty() -> Self { Streamer { diff --git a/stacker/src/shared_arena_hashmap.rs b/stacker/src/shared_arena_hashmap.rs index 54f04a052..f558b0f8a 100644 --- a/stacker/src/shared_arena_hashmap.rs +++ b/stacker/src/shared_arena_hashmap.rs @@ -266,9 +266,7 @@ impl SharedArenaHashMap { /// Get a value associated to a key. #[inline] pub fn get(&self, key: &[u8], memory_arena: &MemoryArena) -> Option - where - V: Copy + 'static, - { + where V: Copy + 'static { let hash = self.get_hash(key); let mut probe = self.probe(hash); loop { diff --git a/tests/fuzzy_scoring.rs b/tests/fuzzy_scoring.rs index 2049b1d51..7c6e454dd 100644 --- a/tests/fuzzy_scoring.rs +++ b/tests/fuzzy_scoring.rs @@ -3,11 +3,8 @@ mod test { use maplit::hashmap; use tantivy::collector::TopDocs; use tantivy::query::FuzzyTermQuery; - use tantivy::schema::{Schema, Value}; - use tantivy::schema::{STORED, TEXT}; - use tantivy::Index; - use tantivy::Term; - use tantivy::{doc, TantivyDocument}; + use tantivy::schema::{Schema, Value, STORED, TEXT}; + use tantivy::{doc, Index, TantivyDocument, Term}; #[test] pub fn test_fuzzy_term() {