Update field.rs

Update delete_queue.rs
Refactoring around Field
2026-02-23 00:00:37 +00:00 · 2019-10-25 09:05:56 +09:00 · 2019-10-25 09:02:57 +09:00 · 2019-10-24 10:03:59 +09:00
9 changed files with 12 additions and 156 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,7 +9,6 @@ Tantivy 0.11.0
 - API change around `Box<BoxableTokenizer>`. See detail in #629
 - Avoid rebuilding Regex automaton whenever a regex query is reused. #639 (@brainlock)
 - Add footer with some metadata to index files. #605 (@fdb-hiroshima)
- TopDocs collector: ensure stable sorting on equal score. #671 (@brainlock)
 
 ## How to update?

--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,7 +13,7 @@ keywords = ["search", "information", "retrieval"]
 edition = "2018"

 [dependencies]
-base64 = "0.11.0"
+base64 = "0.10.0"
 byteorder = "1.0"
 crc32fast = "1.2.0"
 once_cell = "1.0"
@@ -34,7 +34,7 @@ itertools = "0.8"
 levenshtein_automata = {version="0.1", features=["fst_automaton"]}
 notify = {version="4", optional=true}
 bit-set = "0.5"
-uuid = { version = "0.8", features = ["v4", "serde"] }
+uuid = { version = "0.7.2", features = ["v4", "serde"] }
 crossbeam = "0.7"
 futures = "0.1"
 futures-cpupool = "0.1"
--- a/src/collector/top_collector.rs
+++ b/src/collector/top_collector.rs
@@ -12,9 +12,6 @@ use std::collections::BinaryHeap;
 /// It has a custom implementation of `PartialOrd` that reverses the order. This is because the
 /// default Rust heap is a max heap, whereas a min heap is needed.
 ///
-/// Additionally, it guarantees stable sorting: in case of a tie on the feature, the document
-/// address is used.
-///
 /// WARNING: equality is not what you would expect here.
 /// Two elements are equal if their feature is equal, and regardless of whether `doc`
 /// is equal. This should be perfectly fine for this usage, but let's make sure this
@@ -24,37 +21,29 @@ struct ComparableDoc<T, D> {
    doc: D,
 }

-impl<T: PartialOrd, D: PartialOrd> PartialOrd for ComparableDoc<T, D> {
+impl<T: PartialOrd, D> PartialOrd for ComparableDoc<T, D> {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
 }

-impl<T: PartialOrd, D: PartialOrd> Ord for ComparableDoc<T, D> {
+impl<T: PartialOrd, D> Ord for ComparableDoc<T, D> {
    #[inline]
    fn cmp(&self, other: &Self) -> Ordering {
-        // Reversed to make BinaryHeap work as a min-heap
-        let by_feature = other
+        other
            .feature
            .partial_cmp(&self.feature)
-            .unwrap_or(Ordering::Equal);
-
-        let lazy_by_doc_address = || self.doc.partial_cmp(&other.doc).unwrap_or(Ordering::Equal);
-
-        // In case of a tie on the feature, we sort by ascending
-        // `DocAddress` in order to ensure a stable sorting of the
-        // documents.
-        by_feature.then_with(lazy_by_doc_address)
+            .unwrap_or_else(|| Ordering::Equal)
    }
 }

-impl<T: PartialOrd, D: PartialOrd> PartialEq for ComparableDoc<T, D> {
+impl<T: PartialOrd, D> PartialEq for ComparableDoc<T, D> {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == Ordering::Equal
    }
 }

-impl<T: PartialOrd, D: PartialOrd> Eq for ComparableDoc<T, D> {}
+impl<T: PartialOrd, D> Eq for ComparableDoc<T, D> {}

 pub(crate) struct TopCollector<T> {
    limit: usize,
@@ -225,94 +214,4 @@ mod tests {
            ]
        );
    }
-
-    #[test]
-    fn test_top_segment_collector_stable_ordering_for_equal_feature() {
-        // given that the documents are collected in ascending doc id order,
-        // when harvesting we have to guarantee stable sorting in case of a tie
-        // on the score
-        let doc_ids_collection = [4, 5, 6];
-        let score = 3.14;
-
-        let mut top_collector_limit_2 = TopSegmentCollector::new(0, 2);
-        for id in &doc_ids_collection {
-            top_collector_limit_2.collect(*id, score);
-        }
-
-        let mut top_collector_limit_3 = TopSegmentCollector::new(0, 3);
-        for id in &doc_ids_collection {
-            top_collector_limit_3.collect(*id, score);
-        }
-
-        assert_eq!(
-            top_collector_limit_2.harvest(),
-            top_collector_limit_3.harvest()[..2].to_vec(),
-        );
-    }
-}
-
-#[cfg(all(test, feature = "unstable"))]
-mod bench {
-    use super::TopSegmentCollector;
-    use test::Bencher;
-
-    #[bench]
-    fn bench_top_segment_collector_collect_not_at_capacity(b: &mut Bencher) {
-        let mut top_collector = TopSegmentCollector::new(0, 400);
-
-        b.iter(|| {
-            for i in 0..100 {
-                top_collector.collect(i, 0.8);
-            }
-        });
-    }
-
-    #[bench]
-    fn bench_top_segment_collector_collect_at_capacity(b: &mut Bencher) {
-        let mut top_collector = TopSegmentCollector::new(0, 100);
-
-        for i in 0..100 {
-            top_collector.collect(i, 0.8);
-        }
-
-        b.iter(|| {
-            for i in 0..100 {
-                top_collector.collect(i, 0.8);
-            }
-        });
-    }
-
-    #[bench]
-    fn bench_top_segment_collector_collect_and_harvest_many_ties(b: &mut Bencher) {
-        b.iter(|| {
-            let mut top_collector = TopSegmentCollector::new(0, 100);
-
-            for i in 0..100 {
-                top_collector.collect(i, 0.8);
-            }
-
-            // it would be nice to be able to do the setup N times but still
-            // measure only harvest(). We can't since harvest() consumes
-            // the top_collector.
-            top_collector.harvest()
-        });
-    }
-
-    #[bench]
-    fn bench_top_segment_collector_collect_and_harvest_no_tie(b: &mut Bencher) {
-        b.iter(|| {
-            let mut top_collector = TopSegmentCollector::new(0, 100);
-            let mut score = 1.0;
-
-            for i in 0..100 {
-                score += 1.0;
-                top_collector.collect(i, score);
-            }
-
-            // it would be nice to be able to do the setup N times but still
-            // measure only harvest(). We can't since harvest() consumes
-            // the top_collector.
-            top_collector.harvest()
-        });
-    }
 }
--- a/src/collector/top_score_collector.rs
+++ b/src/collector/top_score_collector.rs
@@ -15,16 +15,13 @@ use crate::SegmentLocalId;
 use crate::SegmentReader;
 use std::fmt;

-/// The `TopDocs` collector keeps track of the top `K` documents
+/// The Top Score Collector keeps track of the K documents
 /// sorted by their score.
 ///
 /// The implementation is based on a `BinaryHeap`.
 /// The theorical complexity for collecting the top `K` out of `n` documents
 /// is `O(n log K)`.
 ///
-/// This collector guarantees a stable sorting in case of a tie on the
-/// document score. As such, it is suitable to implement pagination.
-///
 /// ```rust
 /// use tantivy::collector::TopDocs;
 /// use tantivy::query::QueryParser;
@@ -431,13 +428,12 @@ impl SegmentCollector for TopScoreSegmentCollector {
 mod tests {
    use super::TopDocs;
    use crate::collector::Collector;
-    use crate::query::{AllQuery, Query, QueryParser};
+    use crate::query::{Query, QueryParser};
    use crate::schema::{Field, Schema, FAST, STORED, TEXT};
    use crate::DocAddress;
    use crate::Index;
    use crate::IndexWriter;
    use crate::Score;
-    use itertools::Itertools;

    fn make_index() -> Index {
        let mut schema_builder = Schema::builder();
@@ -498,29 +494,6 @@ mod tests {
        );
    }

-    #[test]
-    fn test_top_collector_stable_sorting() {
-        let index = make_index();
-
-        // using AllQuery to get a constant score
-        let searcher = index.reader().unwrap().searcher();
-
-        let page_1 = searcher.search(&AllQuery, &TopDocs::with_limit(2)).unwrap();
-
-        let page_2 = searcher.search(&AllQuery, &TopDocs::with_limit(3)).unwrap();
-
-        // precondition for the test to be meaningful: we did get documents
-        // with the same score
-        assert!(page_1.iter().map(|result| result.0).all_equal());
-        assert!(page_2.iter().map(|result| result.0).all_equal());
-
-        // sanity check since we're relying on make_index()
-        assert_eq!(page_1.len(), 2);
-        assert_eq!(page_2.len(), 3);
-
-        assert_eq!(page_1, &page_2[..page_1.len()]);
-    }
-
    #[test]
    #[should_panic]
    fn test_top_0() {
--- a/src/core/segment_id.rs
+++ b/src/core/segment_id.rs
@@ -76,7 +76,7 @@ impl SegmentId {
 }

 /// Error type used when parsing a `SegmentId` from a string fails.
-pub struct SegmentIdParseError(uuid::Error);
+pub struct SegmentIdParseError(uuid::parser::ParseError);

 impl Error for SegmentIdParseError {}

--- a/src/indexer/index_writer.rs
+++ b/src/indexer/index_writer.rs
@@ -732,7 +732,7 @@ impl IndexWriter {
                }
                UserOperation::Add(document) => {
                    let add_operation = AddOperation { opstamp, document };
-                    adds.push(add_operpation);
+                    adds.push(add_operation);
                }
            }
        }
--- a/src/indexer/merge_operation.rs
+++ b/src/indexer/merge_operation.rs
@@ -7,10 +7,6 @@ use std::collections::HashSet;
 pub struct MergeOperationInventory(Inventory<InnerMergeOperation>);

 impl MergeOperationInventory {
-    pub fn num_merge_operations(&self) -> usize {
-        self.0.list().len()
-    }
-
    pub fn segment_in_merge(&self) -> HashSet<SegmentId> {
        let mut segment_in_merge = HashSet::default();
        for merge_op in self.0.list() {
--- a/src/indexer/merge_policy.rs
+++ b/src/indexer/merge_policy.rs
@@ -12,10 +12,6 @@ pub struct MergeCandidate(pub Vec<SegmentId>);
 /// Every time a the list of segments changes, the segment updater
 /// asks the merge policy if some segments should be merged.
 pub trait MergePolicy: marker::Send + marker::Sync + Debug {
-    fn maximum_num_threads(&self) -> Option<usize> {
-        None
-    }
-
    /// Given the list of segment metas, returns the list of merge candidates.
    ///
    /// This call happens on the segment updater thread, and will block
--- a/src/indexer/segment_updater.rs
+++ b/src/indexer/segment_updater.rs
@@ -39,7 +39,6 @@ use std::sync::Arc;
 use std::sync::RwLock;
 use std::thread;
 use std::thread::JoinHandle;
-use std::time::Duration;

 /// Save the index meta file.
 /// This operation is atomic :
@@ -201,12 +200,6 @@ impl SegmentUpdater {
    }

    pub fn add_segment(&self, segment_entry: SegmentEntry) -> bool {
-        let max_num_threads_opt = self.0.merge_policy.read().unwrap().maximum_num_threads();
-        if let Some(max_num_threads) = max_num_threads_opt {
-            while self.0.merge_operations.num_merge_operations() >= max_num_threads_opt {
-                std::thread::sleep(Duration::from_secs(1u64));
-            }
-        }
        self.run_async(|segment_updater| {
            segment_updater.0.segment_manager.add_segment(segment_entry);
            segment_updater.consider_merge_options();
Author	SHA1	Message	Date
Paul Masurel	45d219b6e5	Update field.rs	2019-10-25 09:05:56 +09:00
Paul Masurel	20aaaad04f	Update delete_queue.rs	2019-10-25 09:02:57 +09:00
Paul Masurel	ba0b89da36	Refactoring around Field Removing the contract about the order of the field, and the field id allocation.	2019-10-24 10:03:59 +09:00