Compare commits

..

1 Commits

Author SHA1 Message Date
Pascal Seitz
e2dae2f433 fix out of order bug 2024-06-25 08:35:58 +08:00
10 changed files with 56 additions and 67 deletions

View File

@@ -8,7 +8,6 @@ use std::net::Ipv6Addr;
use column_operation::ColumnOperation;
pub(crate) use column_writers::CompatibleNumericalTypes;
use common::json_path_writer::JSON_END_OF_PATH;
use common::CountingWriter;
pub(crate) use serializer::ColumnarSerializer;
use stacker::{Addr, ArenaHashMap, MemoryArena};
@@ -284,17 +283,12 @@ impl ColumnarWriter {
.iter()
.map(|(column_name, addr)| (column_name, ColumnType::DateTime, addr)),
);
// TODO: replace JSON_END_OF_PATH with b'0' in columns
columns.sort_unstable_by_key(|(column_name, col_type, _)| (*column_name, *col_type));
let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries);
let mut symbol_byte_buffer: Vec<u8> = Vec::new();
for (column_name, column_type, addr) in columns {
if column_name.contains(&JSON_END_OF_PATH) {
// Tantivy uses the null byte b'\0' (JSON_END_OF_PATH) as a separator for
// nested fields in JSON. Column names containing a b'\0' are therefore
// skipped here by the columnar (and likewise by the inverted index).
continue;
}
match column_type {
ColumnType::Bool => {
let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr);

View File

@@ -93,3 +93,18 @@ impl<'a, W: io::Write> io::Write for ColumnSerializer<'a, W> {
self.columnar_serializer.wrt.write_all(buf)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_prepare_key_bytes() {
let mut buffer: Vec<u8> = b"somegarbage".to_vec();
prepare_key(b"root\0child", ColumnType::Str, &mut buffer);
assert_eq!(buffer.len(), 12);
assert_eq!(&buffer[..10], b"root0child");
assert_eq!(buffer[10], 0u8);
assert_eq!(buffer[11], ColumnType::Str.to_code());
}
}

View File

@@ -1,4 +1,4 @@
use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
use common::json_path_writer::JSON_PATH_SEGMENT_SEP;
use common::{replace_in_place, JsonPathWriter};
use rustc_hash::FxHashMap;
@@ -83,9 +83,6 @@ fn index_json_object<'a, V: Value<'a>>(
positions_per_path: &mut IndexingPositionsPerPath,
) {
for (json_path_segment, json_value_visitor) in json_visitor {
if json_path_segment.as_bytes().contains(&JSON_END_OF_PATH) {
continue;
}
json_path_writer.push(json_path_segment);
index_json_value(
doc,

View File

@@ -815,9 +815,8 @@ mod tests {
use crate::indexer::NoMergePolicy;
use crate::query::{QueryParser, TermQuery};
use crate::schema::{
self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, JsonObjectOptions,
NumericOptions, Schema, TextFieldIndexing, TextOptions, Value, FAST, INDEXED, STORED,
STRING, TEXT,
self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions,
TextFieldIndexing, TextOptions, Value, FAST, INDEXED, STORED, STRING, TEXT,
};
use crate::store::DOCSTORE_CACHE_CAPACITY;
use crate::{
@@ -2379,11 +2378,11 @@ mod tests {
#[test]
fn test_bug_1617_2() {
test_operation_strategy(
assert!(test_operation_strategy(
&[
IndexingOp::AddDoc {
id: 13,
value: Default::default(),
value: Default::default()
},
IndexingOp::DeleteDoc { id: 13 },
IndexingOp::Commit,
@@ -2391,9 +2390,9 @@ mod tests {
IndexingOp::Commit,
IndexingOp::Merge,
],
true,
true
)
.unwrap();
.is_ok());
}
#[test]
@@ -2493,9 +2492,9 @@ mod tests {
}
#[test]
fn test_bug_2442_reserved_character_fast_field() -> crate::Result<()> {
fn test_bug_2442() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();
let json_field = schema_builder.add_json_field("json", FAST | TEXT);
let json_field = schema_builder.add_json_field("json", TEXT | FAST);
let schema = schema_builder.build();
let index = Index::builder().schema(schema).create_in_ram()?;
@@ -2516,21 +2515,4 @@ mod tests {
Ok(())
}
#[test]
fn test_bug_2442_reserved_character_columnar() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let options = JsonObjectOptions::from(FAST).set_expand_dots_enabled();
let field = schema_builder.add_json_field("json", options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(field=>json!({"\u{0000}": "A"})))
.unwrap();
index_writer
.add_document(doc!(field=>json!({format!("\u{0000}\u{0000}"): "A"})))
.unwrap();
index_writer.commit().unwrap();
Ok(())
}
}

View File

@@ -145,27 +145,15 @@ mod tests_mmap {
}
}
#[test]
fn test_json_field_null_byte_is_ignored() {
let mut schema_builder = Schema::builder();
let options = JsonObjectOptions::from(TEXT | FAST).set_expand_dots_enabled();
let field = schema_builder.add_json_field("json", options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(field=>json!({"key": "test1", "invalidkey\u{0000}": "test2"})))
.unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let inv_indexer = segment_reader.inverted_index(field).unwrap();
let term_dict = inv_indexer.terms();
assert_eq!(term_dict.num_terms(), 1);
let mut term_bytes = Vec::new();
term_dict.ord_to_term(0, &mut term_bytes).unwrap();
assert_eq!(term_bytes, b"key\0stest1");
fn test_json_field_null_byte() {
// Test when field name contains a zero byte, which has special meaning in tantivy.
// As a workaround, we convert the zero byte to the ASCII character '0'.
// https://github.com/quickwit-oss/tantivy/issues/2340
// https://github.com/quickwit-oss/tantivy/issues/2193
let field_name_in = "\u{0000}";
let field_name_out = "0";
test_json_field_name(field_name_in, field_name_out);
}
#[test]
fn test_json_field_1byte() {
// Test when the field name contains a 0x01 byte (JSON_PATH_SEGMENT_SEP), which has special meaning in tantivy.

View File

@@ -1,3 +1,5 @@
use common::json_path_writer::JSON_END_OF_PATH;
use common::replace_in_place;
use fnv::FnvHashMap;
/// `Field` is represented by an unsigned 32-bit integer type.
@@ -38,7 +40,13 @@ impl PathToUnorderedId {
#[cold]
fn insert_new_path(&mut self, path: &str) -> u32 {
let next_id = self.map.len() as u32;
let new_path = path.to_string();
let mut new_path = path.to_string();
// The unsafe below is safe as long as JSON_END_OF_PATH and its b'0'
// replacement are valid single-byte UTF-8 (i.e. ASCII) values.
// By UTF-8 design, a single-byte (ASCII) value cannot be part of the
// encoding of another codepoint, so the in-place replacement keeps the
// string valid UTF-8.
unsafe { replace_in_place(JSON_END_OF_PATH, b'0', new_path.as_bytes_mut()) };
self.map.insert(new_path, next_id);
next_id
}

View File

@@ -59,7 +59,7 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(
&self,
ordered_term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
ordered_id_to_path: &[&str],
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
@@ -69,7 +69,7 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
term_buffer.clear_with_field_and_type(Type::Json, Field::from_field_id(0));
let mut prev_term_id = u32::MAX;
let mut term_path_len = 0; // this will be set in the first iteration
for (_field, path_id, term, addr) in ordered_term_addrs {
for (_field, path_id, term, addr) in term_addrs {
if prev_term_id != path_id.path_id() {
term_buffer.truncate_value_bytes(0);
term_buffer.append_path(ordered_id_to_path[path_id.path_id() as usize].as_bytes());

View File

@@ -15,7 +15,6 @@ pub trait Postings: DocSet + 'static {
fn term_freq(&self) -> u32;
/// Returns the positions offsetted with a given value.
/// It is not necessary to clear the `output` before calling this method.
/// The output vector will be resized to the `term_freq`.
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>);

View File

@@ -97,7 +97,6 @@ pub struct PhrasePrefixScorer<TPostings: Postings> {
suffixes: Vec<TPostings>,
suffix_offset: u32,
phrase_count: u32,
suffix_position_buffer: Vec<u32>,
}
impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
@@ -141,7 +140,6 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
suffixes,
suffix_offset: (max_offset - suffix_pos) as u32,
phrase_count: 0,
suffix_position_buffer: Vec::with_capacity(100),
};
if phrase_prefix_scorer.doc() != TERMINATED && !phrase_prefix_scorer.matches_prefix() {
phrase_prefix_scorer.advance();
@@ -155,6 +153,7 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
fn matches_prefix(&mut self) -> bool {
let mut count = 0;
let mut positions = Vec::new();
let current_doc = self.doc();
let pos_matching = self.phrase_scorer.get_intersection();
for suffix in &mut self.suffixes {
@@ -163,8 +162,8 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
}
let doc = suffix.seek(current_doc);
if doc == current_doc {
suffix.positions_with_offset(self.suffix_offset, &mut self.suffix_position_buffer);
count += intersection_count(pos_matching, &self.suffix_position_buffer);
suffix.positions_with_offset(self.suffix_offset, &mut positions);
count += intersection_count(pos_matching, &positions);
}
}
self.phrase_count = count as u32;

View File

@@ -249,8 +249,15 @@ impl Term {
#[inline]
pub fn append_path(&mut self, bytes: &[u8]) -> &mut [u8] {
let len_before = self.0.len();
assert!(!bytes.contains(&JSON_END_OF_PATH));
self.0.extend_from_slice(bytes);
if bytes.contains(&JSON_END_OF_PATH) {
self.0.extend(
bytes
.iter()
.map(|&b| if b == JSON_END_OF_PATH { b'0' } else { b }),
);
} else {
self.0.extend_from_slice(bytes);
}
&mut self.0[len_before..]
}
}