Fix off by one in optional index

Fixes #2293. Fixes an off-by-one error in the metadata resize of the optional index when loading the index. Merge variables with the same meaning but different names.
Forward regex parser errors to enable understanding their reason. (#2288)
2026-02-14 03:50:35 +00:00 · 2024-01-09 15:09:49 +08:00 · 2023-12-22 11:01:10 +01:00 · 2023-12-21 11:05:34 +01:00
8 changed files with 81 additions and 10 deletions
--- a/columnar/src/column/mod.rs
+++ b/columnar/src/column/mod.rs
@@ -110,6 +110,9 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
    }

    /// Get the docids of values which are in the provided value range.
+    ///
+    /// # Panics
+    /// Panics if a value in `selected_docid_range` is larger than the number of documents.
    #[inline]
    pub fn get_docids_for_value_range(
        &self,
--- a/columnar/src/column_index/mod.rs
+++ b/columnar/src/column_index/mod.rs
@@ -126,6 +126,8 @@ impl ColumnIndex {
        }
    }

+    /// # Panics
+    /// Panics if a value in the `doc_id` range is larger than the number of documents.
    pub fn docid_range_to_rowids(&self, doc_id: Range<DocId>) -> Range<RowId> {
        match self {
            ColumnIndex::Empty { .. } => 0..0,
--- a/columnar/src/column_index/optional_index/mod.rs
+++ b/columnar/src/column_index/optional_index/mod.rs
@@ -21,8 +21,6 @@ const DENSE_BLOCK_THRESHOLD: u32 =

 const ELEMENTS_PER_BLOCK: u32 = u16::MAX as u32 + 1;

-const BLOCK_SIZE: RowId = 1 << 16;
-
 #[derive(Copy, Clone, Debug)]
 struct BlockMeta {
    non_null_rows_before_block: u32,
@@ -109,8 +107,8 @@ struct RowAddr {
 #[inline(always)]
 fn row_addr_from_row_id(row_id: RowId) -> RowAddr {
    RowAddr {
-        block_id: (row_id / BLOCK_SIZE) as u16,
-        in_block_row_id: (row_id % BLOCK_SIZE) as u16,
+        block_id: (row_id / ELEMENTS_PER_BLOCK) as u16,
+        in_block_row_id: (row_id % ELEMENTS_PER_BLOCK) as u16,
    }
 }

@@ -490,8 +488,9 @@ fn deserialize_optional_index_block_metadatas(
        start_byte_offset += block_variant.num_bytes_in_block();
        non_null_rows_before_block += num_non_null_rows;
    }
+    let last_block = row_addr_from_row_id(num_rows).block_id;
    block_metas.resize(
-        ((num_rows + BLOCK_SIZE - 1) / BLOCK_SIZE) as usize,
+        last_block as usize + 1, // +1 since last block is an index
        BlockMeta {
            non_null_rows_before_block,
            start_byte_offset,
--- a/columnar/src/column_index/optional_index/tests.rs
+++ b/columnar/src/column_index/optional_index/tests.rs
@@ -3,6 +3,29 @@ use proptest::strategy::Strategy;
 use proptest::{prop_oneof, proptest};

 use super::*;
+use crate::{ColumnarReader, ColumnarWriter, DynamicColumnHandle};
+
+#[test]
+fn test_optional_index_bug_2293() {
+    test_optional_index_with_num_docs(ELEMENTS_PER_BLOCK - 1);
+    test_optional_index_with_num_docs(ELEMENTS_PER_BLOCK);
+    test_optional_index_with_num_docs(ELEMENTS_PER_BLOCK + 1);
+}
+fn test_optional_index_with_num_docs(num_docs: u32) {
+    let mut dataframe_writer = ColumnarWriter::default();
+    dataframe_writer.record_numerical(100, "score", 80i64);
+    let mut buffer: Vec<u8> = Vec::new();
+    dataframe_writer
+        .serialize(num_docs, None, &mut buffer)
+        .unwrap();
+    let columnar = ColumnarReader::open(buffer).unwrap();
+    assert_eq!(columnar.num_columns(), 1);
+    let cols: Vec<DynamicColumnHandle> = columnar.read_columns("score").unwrap();
+    assert_eq!(cols.len(), 1);
+
+    let col = cols[0].open().unwrap();
+    col.column_index().docid_range_to_rowids(0..num_docs);
+}

 #[test]
 fn test_dense_block_threshold() {
@@ -35,7 +58,7 @@ proptest! {

 #[test]
 fn test_with_random_sets_simple() {
-    let vals = 10..BLOCK_SIZE * 2;
+    let vals = 10..ELEMENTS_PER_BLOCK * 2;
    let mut out: Vec<u8> = Vec::new();
    serialize_optional_index(&vals, 100, &mut out).unwrap();
    let null_index = open_optional_index(OwnedBytes::new(out)).unwrap();
@@ -171,7 +194,7 @@ fn test_optional_index_rank() {
    test_optional_index_rank_aux(&[0u32, 1u32]);
    let mut block = Vec::new();
    block.push(3u32);
-    block.extend((0..BLOCK_SIZE).map(|i| i + BLOCK_SIZE + 1));
+    block.extend((0..ELEMENTS_PER_BLOCK).map(|i| i + ELEMENTS_PER_BLOCK + 1));
    test_optional_index_rank_aux(&block);
 }

@@ -185,8 +208,8 @@ fn test_optional_index_iter_empty_one() {
 fn test_optional_index_iter_dense_block() {
    let mut block = Vec::new();
    block.push(3u32);
-    block.extend((0..BLOCK_SIZE).map(|i| i + BLOCK_SIZE + 1));
-    test_optional_index_iter_aux(&block, 3 * BLOCK_SIZE);
+    block.extend((0..ELEMENTS_PER_BLOCK).map(|i| i + ELEMENTS_PER_BLOCK + 1));
+    test_optional_index_iter_aux(&block, 3 * ELEMENTS_PER_BLOCK);
 }

 #[test]
--- a/src/indexer/index_writer.rs
+++ b/src/indexer/index_writer.rs
@@ -1651,6 +1651,7 @@ mod tests {
        force_end_merge: bool,
    ) -> crate::Result<Index> {
        let mut schema_builder = schema::Schema::builder();
+        let json_field = schema_builder.add_json_field("json", FAST | TEXT | STORED);
        let ip_field = schema_builder.add_ip_addr_field("ip", FAST | INDEXED | STORED);
        let ips_field = schema_builder
            .add_ip_addr_field("ips", IpAddrOptions::default().set_fast().set_indexed());
@@ -1729,7 +1730,9 @@ mod tests {
                            id_field=>id,
                        ))?;
                    } else {
+                        let json = json!({"date1": format!("2022-{id}-01T00:00:01Z"), "date2": format!("{id}-05-01T00:00:01Z"), "id": id, "ip": ip.to_string()});
                        index_writer.add_document(doc!(id_field=>id,
+                                json_field=>json,
                                bytes_field => id.to_le_bytes().as_slice(),
                                id_opt_field => id,
                                ip_field => ip,
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -605,6 +605,10 @@ impl IndexMerger {
                            segment_postings.positions(&mut positions_buffer);
                            segment_postings.term_freq()
                        } else {
+                            // The positions_buffer may contain positions from the previous term.
+                            // Existence of positions depends on the value type in JSON fields.
+                            // https://github.com/quickwit-oss/tantivy/issues/2283
+                            positions_buffer.clear();
                            0u32
                        };

--- a/src/indexer/segment_writer.rs
+++ b/src/indexer/segment_writer.rs
@@ -879,6 +879,31 @@ mod tests {
        assert_eq!(searcher.search(&phrase_query, &Count).unwrap(), 0);
    }

+    #[test]
+    fn test_json_term_with_numeric_merge_panic_regression_bug_2283() {
+        // https://github.com/quickwit-oss/tantivy/issues/2283
+        let mut schema_builder = Schema::builder();
+        let json = schema_builder.add_json_field("json", TEXT);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        let mut writer = index.writer_for_tests().unwrap();
+        let doc = json!({"field": "a"});
+        writer.add_document(doc!(json=>doc)).unwrap();
+        writer.commit().unwrap();
+        let doc = json!({"field": "a", "id": 1});
+        writer.add_document(doc!(json=>doc.clone())).unwrap();
+        writer.commit().unwrap();
+
+        // Force Merge
+        writer.wait_merging_threads().unwrap();
+        let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
+        let segment_ids = index
+            .searchable_segment_ids()
+            .expect("Searchable segments failed.");
+        index_writer.merge(&segment_ids).wait().unwrap();
+        assert!(index_writer.wait_merging_threads().is_ok());
+    }
+
    #[test]
    fn test_bug_regression_1629_position_when_array_with_a_field_value_that_does_not_contain_any_token(
    ) {
--- a/src/query/regex_query.rs
+++ b/src/query/regex_query.rs
@@ -63,7 +63,7 @@ impl RegexQuery {
    /// Creates a new RegexQuery from a given pattern
    pub fn from_pattern(regex_pattern: &str, field: Field) -> crate::Result<Self> {
        let regex = Regex::new(regex_pattern)
-            .map_err(|_| TantivyError::InvalidArgument(regex_pattern.to_string()))?;
+            .map_err(|err| TantivyError::InvalidArgument(format!("RegexQueryError: {err}")))?;
        Ok(RegexQuery::from_regex(regex, field))
    }

@@ -176,4 +176,16 @@ mod test {
        verify_regex_query(matching_one, matching_zero, reader);
        Ok(())
    }
+
+    #[test]
+    pub fn test_pattern_error() {
+        let (_reader, field) = build_test_index().unwrap();
+
+        match RegexQuery::from_pattern(r"(foo", field) {
+            Err(crate::TantivyError::InvalidArgument(msg)) => {
+                assert!(msg.contains("error: unclosed group"))
+            }
+            res => panic!("unexpected result: {:?}", res),
+        }
+    }
 }