Merge pull request #2816 from evance-br/fix-closing-paren-elastic-range

uncomment commented code when testing
2026-01-29 12:20:36 +00:00 · 2026-01-27 17:00:08 +01:00 · 2026-01-27 13:19:41 +00:00 · 2026-01-27 13:08:38 +00:00 · 2026-01-27 13:01:14 +00:00 · 2026-01-27 09:22:11 +01:00
12 changed files with 86 additions and 68 deletions
--- a/query-grammar/src/query_grammar.rs
+++ b/query-grammar/src/query_grammar.rs
@@ -560,7 +560,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
            (
                (
                    value((), tag(">=")),
-                    map(word_infallible("", false), |(bound, err)| {
+                    map(word_infallible(")", false), |(bound, err)| {
                        (
                            (
                                bound
@@ -574,7 +574,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
                ),
                (
                    value((), tag("<=")),
-                    map(word_infallible("", false), |(bound, err)| {
+                    map(word_infallible(")", false), |(bound, err)| {
                        (
                            (
                                UserInputBound::Unbounded,
@@ -588,7 +588,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
                ),
                (
                    value((), tag(">")),
-                    map(word_infallible("", false), |(bound, err)| {
+                    map(word_infallible(")", false), |(bound, err)| {
                        (
                            (
                                bound
@@ -602,7 +602,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
                ),
                (
                    value((), tag("<")),
-                    map(word_infallible("", false), |(bound, err)| {
+                    map(word_infallible(")", false), |(bound, err)| {
                        (
                            (
                                UserInputBound::Unbounded,
@@ -1323,6 +1323,14 @@ mod test {
        test_parse_query_to_ast_helper("<a", "{\"*\" TO \"a\"}");
        test_parse_query_to_ast_helper("<=a", "{\"*\" TO \"a\"]");
        test_parse_query_to_ast_helper("<=bsd", "{\"*\" TO \"bsd\"]");
+
+        test_parse_query_to_ast_helper("(<=42)", "{\"*\" TO \"42\"]");
+        test_parse_query_to_ast_helper("(<=42 )", "{\"*\" TO \"42\"]");
+        test_parse_query_to_ast_helper("(age:>5)", "\"age\":{\"5\" TO \"*\"}");
+        test_parse_query_to_ast_helper(
+            "(title:bar AND age:>12)",
+            "(+\"title\":bar +\"age\":{\"12\" TO \"*\"})",
+        );
    }

    #[test]
--- a/src/directory/mmap_directory/mod.rs
+++ b/src/directory/mmap_directory/mod.rs
@@ -676,7 +676,7 @@ mod tests {
            let num_segments = reader.searcher().segment_readers().len();
            assert!(num_segments <= 4);
            let num_components_except_deletes_and_tempstore =
-                crate::index::SegmentComponent::iterator().len() - 2;
+                crate::index::SegmentComponent::iterator().len() - 1;
            let max_num_mmapped = num_components_except_deletes_and_tempstore * num_segments;
            assert_eventually(|| {
                let num_mmapped = mmap_directory.get_cache_info().mmapped.len();
--- a/src/docset.rs
+++ b/src/docset.rs
@@ -65,8 +65,8 @@ pub trait DocSet: Send {
    ///   `seek_danger(..)` until it returns `Found`, and get back to a valid state.
    ///
    /// `seek_lower_bound` can be any `DocId` (in the docset or not) as long as it is in
-    /// `(target .. seek_result]` where `seek_result` is the first document in the docset greater
-    /// than to `target`.
+    /// `(target .. seek_result] U {TERMINATED}` where `seek_result` is the first document in the
+    /// docset greater than to `target`.
    ///
    /// `seek_danger` may return `SeekLowerBound(TERMINATED)`.
    ///
@@ -98,7 +98,7 @@ pub trait DocSet: Send {
        if doc == target {
            SeekDangerResult::Found
        } else {
-            SeekDangerResult::SeekLowerBound(self.doc())
+            SeekDangerResult::SeekLowerBound(doc)
        }
    }

--- a/src/index/index_meta.rs
+++ b/src/index/index_meta.rs
@@ -1,8 +1,6 @@
 use std::collections::HashSet;
 use std::fmt;
 use std::path::PathBuf;
-use std::sync::atomic::AtomicBool;
-use std::sync::Arc;

 use serde::{Deserialize, Serialize};

@@ -37,7 +35,6 @@ impl SegmentMetaInventory {
        let inner = InnerSegmentMeta {
            segment_id,
            max_doc,
-            include_temp_doc_store: Arc::new(AtomicBool::new(true)),
            deletes: None,
        };
        SegmentMeta::from(self.inventory.track(inner))
@@ -85,15 +82,6 @@ impl SegmentMeta {
        self.tracked.segment_id
    }

-    /// Removes the Component::TempStore from the alive list and
-    /// therefore marks the temp docstore file to be deleted by
-    /// the garbage collection.
-    pub fn untrack_temp_docstore(&self) {
-        self.tracked
-            .include_temp_doc_store
-            .store(false, std::sync::atomic::Ordering::Relaxed);
-    }
-
    /// Returns the number of deleted documents.
    pub fn num_deleted_docs(&self) -> u32 {
        self.tracked
@@ -111,20 +99,9 @@ impl SegmentMeta {
    /// is by removing all files that have been created by tantivy
    /// and are not used by any segment anymore.
    pub fn list_files(&self) -> HashSet<PathBuf> {
-        if self
-            .tracked
-            .include_temp_doc_store
-            .load(std::sync::atomic::Ordering::Relaxed)
-        {
-            SegmentComponent::iterator()
-                .map(|component| self.relative_path(*component))
-                .collect::<HashSet<PathBuf>>()
-        } else {
-            SegmentComponent::iterator()
-                .filter(|comp| *comp != &SegmentComponent::TempStore)
-                .map(|component| self.relative_path(*component))
-                .collect::<HashSet<PathBuf>>()
-        }
+        SegmentComponent::iterator()
+            .map(|component| self.relative_path(*component))
+            .collect::<HashSet<PathBuf>>()
    }

    /// Returns the relative path of a component of our segment.
@@ -138,7 +115,6 @@ impl SegmentMeta {
            SegmentComponent::Positions => ".pos".to_string(),
            SegmentComponent::Terms => ".term".to_string(),
            SegmentComponent::Store => ".store".to_string(),
-            SegmentComponent::TempStore => ".store.temp".to_string(),
            SegmentComponent::FastFields => ".fast".to_string(),
            SegmentComponent::FieldNorms => ".fieldnorm".to_string(),
            SegmentComponent::Delete => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
@@ -183,7 +159,6 @@ impl SegmentMeta {
            segment_id: inner_meta.segment_id,
            max_doc,
            deletes: None,
-            include_temp_doc_store: Arc::new(AtomicBool::new(true)),
        });
        SegmentMeta { tracked }
    }
@@ -202,7 +177,6 @@ impl SegmentMeta {
        let tracked = self.tracked.map(move |inner_meta| InnerSegmentMeta {
            segment_id: inner_meta.segment_id,
            max_doc: inner_meta.max_doc,
-            include_temp_doc_store: Arc::new(AtomicBool::new(true)),
            deletes: Some(delete_meta),
        });
        SegmentMeta { tracked }
@@ -214,14 +188,6 @@ struct InnerSegmentMeta {
    segment_id: SegmentId,
    max_doc: u32,
    pub deletes: Option<DeleteMeta>,
-    /// If you want to avoid the SegmentComponent::TempStore file to be covered by
-    /// garbage collection and deleted, set this to true. This is used during merge.
-    #[serde(skip)]
-    #[serde(default = "default_temp_store")]
-    pub(crate) include_temp_doc_store: Arc<AtomicBool>,
-}
-fn default_temp_store() -> Arc<AtomicBool> {
-    Arc::new(AtomicBool::new(false))
 }

 impl InnerSegmentMeta {
--- a/src/index/segment_component.rs
+++ b/src/index/segment_component.rs
@@ -23,8 +23,6 @@ pub enum SegmentComponent {
    /// Accessing a document from the store is relatively slow, as it
    /// requires to decompress the entire block it belongs to.
    Store,
-    /// Temporary storage of the documents, before streamed to `Store`.
-    TempStore,
    /// Bitset describing which document of the segment is alive.
    /// (It was representing deleted docs but changed to represent alive docs from v0.17)
    Delete,
@@ -33,14 +31,13 @@ pub enum SegmentComponent {
 impl SegmentComponent {
    /// Iterates through the components.
    pub fn iterator() -> slice::Iter<'static, SegmentComponent> {
-        static SEGMENT_COMPONENTS: [SegmentComponent; 8] = [
+        static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [
            SegmentComponent::Postings,
            SegmentComponent::Positions,
            SegmentComponent::FastFields,
            SegmentComponent::FieldNorms,
            SegmentComponent::Terms,
            SegmentComponent::Store,
-            SegmentComponent::TempStore,
            SegmentComponent::Delete,
        ];
        SEGMENT_COMPONENTS.iter()
--- a/src/indexer/index_writer.rs
+++ b/src/indexer/index_writer.rs
@@ -218,7 +218,7 @@ fn index_documents<D: Document>(
    let alive_bitset_opt = apply_deletes(&segment_with_max_doc, &mut delete_cursor, &doc_opstamps)?;

    let meta = segment_with_max_doc.meta().clone();
-    meta.untrack_temp_docstore();
+
    // update segment_updater inventory to remove tempstore
    let segment_entry = SegmentEntry::new(meta, delete_cursor, alive_bitset_opt);
    segment_updater.schedule_add_segment(segment_entry).wait()?;
--- a/src/postings/block_segment_postings.rs
+++ b/src/postings/block_segment_postings.rs
@@ -303,10 +303,10 @@ impl BlockSegmentPostings {
    }

    pub(crate) fn load_block(&mut self) {
-        let offset = self.skip_reader.byte_offset();
        if self.block_is_loaded() {
            return;
        }
+        let offset = self.skip_reader.byte_offset();
        match self.skip_reader.block_info() {
            BlockInfo::BitPacked {
                doc_num_bits,
--- a/src/postings/segment_postings.rs
+++ b/src/postings/segment_postings.rs
@@ -168,12 +168,20 @@ impl DocSet for SegmentPostings {
        self.doc()
    }

+    #[inline]
    fn seek(&mut self, target: DocId) -> DocId {
        debug_assert!(self.doc() <= target);
        if self.doc() >= target {
            return self.doc();
        }

+        // As an optimization, if the block is already loaded, we can
+        // cheaply check the next doc.
+        self.cur = (self.cur + 1).min(COMPRESSION_BLOCK_SIZE - 1);
+        if self.doc() >= target {
+            return self.doc();
+        }
+
        // Delegate block-local search to BlockSegmentPostings::seek, which returns
        // the in-block index of the first doc >= target.
        self.cur = self.block_cursor.seek(target);
--- a/src/query/intersection.rs
+++ b/src/query/intersection.rs
@@ -84,6 +84,14 @@ impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
        docsets.sort_by_key(|docset| docset.cost());
        go_to_first_doc(&mut docsets);
        let left = docsets.remove(0);
+        debug_assert!({
+            let doc = left.doc();
+            if doc == TERMINATED {
+                true
+            } else {
+                docsets.iter().all(|docset| docset.doc() == doc)
+            }
+        });
        let right = docsets.remove(0);
        Intersection {
            left,
@@ -112,30 +120,24 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
        // Invariant:
        // - candidate is always <= to the next document in the intersection.
        // - candidate strictly increases at every occurence of the loop.
-        let mut candidate = 0;
+        let mut candidate = left.doc() + 1;

        // Termination: candidate strictly increases.
        'outer: while candidate < TERMINATED {
            // As we enter the loop, we should always have candidate < next_doc.

-            // This step always increases candidate.
-            //
-            // TODO: Think about which value would make sense here
-            // It depends on the DocSet implementation, when a seek would outweigh an advance.
-            candidate = if candidate > left.doc().wrapping_add(100) {
-                left.seek(candidate)
-            } else {
-                left.advance()
-            };
+            candidate = left.seek(candidate);

            // Left is positionned on `candidate`.
            debug_assert_eq!(left.doc(), candidate);

            if let SeekDangerResult::SeekLowerBound(seek_lower_bound) = right.seek_danger(candidate)
            {
-                // The max is technically useless but it makes the invariant
-                // easier to proofread.
-                debug_assert!(seek_lower_bound >= candidate);
+                debug_assert!(
+                    seek_lower_bound == TERMINATED || seek_lower_bound > candidate,
+                    "seek_lower_bound {seek_lower_bound} must be greater than candidate \
+                     {candidate}"
+                );
                candidate = seek_lower_bound;
                continue;
            }
@@ -148,7 +150,11 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
                    other.seek_danger(candidate)
                {
                    // One of the scorer does not match, let's restart at the top of the loop.
-                    debug_assert!(seek_lower_bound >= candidate);
+                    debug_assert!(
+                        seek_lower_bound == TERMINATED || seek_lower_bound > candidate,
+                        "seek_lower_bound {seek_lower_bound} must be greater than candidate \
+                         {candidate}"
+                    );
                    candidate = seek_lower_bound;
                    continue 'outer;
                }
@@ -238,9 +244,12 @@ mod tests {
    use proptest::prelude::*;

    use super::Intersection;
+    use crate::collector::Count;
    use crate::docset::{DocSet, TERMINATED};
    use crate::postings::tests::test_skip_against_unoptimized;
-    use crate::query::VecDocSet;
+    use crate::query::{QueryParser, VecDocSet};
+    use crate::schema::{Schema, TEXT};
+    use crate::Index;

    #[test]
    fn test_intersection() {
@@ -411,4 +420,29 @@ mod tests {
            assert_eq!(intersection.doc(), TERMINATED);
        }
    }
+
+    #[test]
+    fn test_bug_2811_intersection_candidate_should_increase() {
+        let mut schema_builder = Schema::builder();
+        let text_field = schema_builder.add_text_field("text", TEXT);
+        let schema = schema_builder.build();
+
+        let index = Index::create_in_ram(schema);
+        let mut writer = index.writer_for_tests().unwrap();
+        writer
+            .add_document(doc!(text_field=>"hello happy tax"))
+            .unwrap();
+        writer.add_document(doc!(text_field=>"hello")).unwrap();
+        writer.add_document(doc!(text_field=>"hello")).unwrap();
+        writer.add_document(doc!(text_field=>"happy tax")).unwrap();
+
+        writer.commit().unwrap();
+        let query_parser = QueryParser::for_index(&index, Vec::new());
+        let query = query_parser
+            .parse_query(r#"+text:hello +text:"happy tax""#)
+            .unwrap();
+        let searcher = index.reader().unwrap().searcher();
+        let c = searcher.search(&*query, &Count).unwrap();
+        assert_eq!(c, 1);
+    }
 }
--- a/src/query/phrase_query/phrase_scorer.rs
+++ b/src/query/phrase_query/phrase_scorer.rs
@@ -531,7 +531,12 @@ impl<TPostings: Postings> DocSet for PhraseScorer<TPostings> {
    }

    fn seek_danger(&mut self, target: DocId) -> SeekDangerResult {
-        debug_assert!(target >= self.doc());
+        debug_assert!(
+            target >= self.doc(),
+            "target ({}) should be greater than or equal to doc ({})",
+            target,
+            self.doc()
+        );
        let seek_res = self.intersection_docset.seek_danger(target);
        if seek_res != SeekDangerResult::Found {
            return seek_res;
--- a/src/query/term_query/term_scorer.rs
+++ b/src/query/term_query/term_scorer.rs
@@ -105,6 +105,7 @@ impl DocSet for TermScorer {

    #[inline]
    fn seek(&mut self, target: DocId) -> DocId {
+        debug_assert!(target >= self.doc());
        self.postings.seek(target)
    }

--- a/src/space_usage/mod.rs
+++ b/src/space_usage/mod.rs
@@ -124,7 +124,6 @@ impl SegmentSpaceUsage {
            FieldNorms => PerField(self.fieldnorms().clone()),
            Terms => PerField(self.termdict().clone()),
            SegmentComponent::Store => ComponentSpaceUsage::Store(self.store().clone()),
-            SegmentComponent::TempStore => ComponentSpaceUsage::Store(self.store().clone()),
            Delete => Basic(self.deletes()),
        }
    }
Author	SHA1	Message	Date
trinity-1686a	9b619998bd	Merge pull request #2816 from evance-br/fix-closing-paren-elastic-range	2026-01-27 17:00:08 +01:00
Evance Soumaoro	765c448945	uncomment commented code when testing	2026-01-27 13:19:41 +00:00
Evance Soumaoro	943594ebaa	uncomment commented code when testing	2026-01-27 13:08:38 +00:00
Evance Soumaoro	df17daae0d	fix closing parenthesis error on elastic range queries for lenient parser	2026-01-27 13:01:14 +00:00
Paul Masurel	0ae94baef5	Remove temp file (#2815 ) Co-authored-by: Paul Masurel <paul.masurel@datadoghq.com>	2026-01-27 09:22:11 +01:00
Paul Masurel	3f448ecf79	Bugfix on intersection. (#2812 ) The intersection algorithm made it possible for .seek(..) with values lower than the current doc id, breaking the DocSet contract. The fix removes the optimization that caused left.seek(..) to be replaced by a simpler left.advance(..). Simply doing so lead to a performance regression. I therefore integrated that idea within SegmentPostings.seek. We now attempt to check the next doc systematically on seek, PROVIDED the block is already loaded. Closes #2811 Co-authored-by: Paul Masurel <paul.masurel@datadoghq.com>	2026-01-27 09:21:09 +01:00