mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-04 00:02:55 +00:00
fix null byte handling in JSON paths (#2345)
* fix null byte handling in JSON paths closes https://github.com/quickwit-oss/tantivy/issues/2193 closes https://github.com/quickwit-oss/tantivy/issues/2340 * avoid repeated term truncation * fix test * Apply suggestions from code review Co-authored-by: Paul Masurel <paul@quickwit.io> * add comment --------- Co-authored-by: Paul Masurel <paul@quickwit.io>
This commit is contained in:
@@ -18,7 +18,12 @@ pub struct ColumnarSerializer<W: io::Write> {
|
||||
/// code.
|
||||
fn prepare_key(key: &[u8], column_type: ColumnType, buffer: &mut Vec<u8>) {
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(key);
|
||||
// Convert 0 bytes to '0' string, as 0 bytes are reserved for the end of the path.
|
||||
if key.contains(&0u8) {
|
||||
buffer.extend(key.iter().map(|&b| if b == 0 { b'0' } else { b }));
|
||||
} else {
|
||||
buffer.extend_from_slice(key);
|
||||
}
|
||||
buffer.push(0u8);
|
||||
buffer.push(column_type.to_code());
|
||||
}
|
||||
@@ -102,7 +107,7 @@ mod tests {
|
||||
let mut buffer: Vec<u8> = b"somegarbage".to_vec();
|
||||
prepare_key(b"root\0child", ColumnType::Str, &mut buffer);
|
||||
assert_eq!(buffer.len(), 12);
|
||||
assert_eq!(&buffer[..10], b"root\0child");
|
||||
assert_eq!(&buffer[..10], b"root0child");
|
||||
assert_eq!(buffer[10], 0u8);
|
||||
assert_eq!(buffer[11], ColumnType::Str.to_code());
|
||||
}
|
||||
|
||||
@@ -144,6 +144,115 @@ mod tests_mmap {
|
||||
assert_eq!(num_docs, 256);
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn test_json_field_null_byte() {
|
||||
// Test when field name contains a zero byte, which has special meaning in tantivy.
|
||||
// As a workaround, we convert the zero byte to the ASCII character '0'.
|
||||
// https://github.com/quickwit-oss/tantivy/issues/2340
|
||||
// https://github.com/quickwit-oss/tantivy/issues/2193
|
||||
let field_name_in = "\u{0000}";
|
||||
let field_name_out = "0";
|
||||
test_json_field_name(field_name_in, field_name_out);
|
||||
}
|
||||
#[test]
|
||||
fn test_json_field_1byte() {
|
||||
// Test when field name contains a 1 byte, which has special meaning in tantivy.
|
||||
let field_name_in = "\u{0001}";
|
||||
let field_name_out = "\u{0001}";
|
||||
test_json_field_name(field_name_in, field_name_out);
|
||||
|
||||
// Test when field name contains a 1 byte, which has special meaning in tantivy.
|
||||
let field_name_in = "\u{0001}";
|
||||
let field_name_out = ".";
|
||||
test_json_field_name(field_name_in, field_name_out);
|
||||
}
|
||||
fn test_json_field_name(field_name_in: &str, field_name_out: &str) {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
let options = JsonObjectOptions::from(TEXT | FAST).set_expand_dots_enabled();
|
||||
let field = schema_builder.add_json_field("json", options);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
index_writer
|
||||
.add_document(doc!(field=>json!({format!("{field_name_in}"): "test1"})))
|
||||
.unwrap();
|
||||
index_writer
|
||||
.add_document(doc!(field=>json!({format!("a{field_name_in}"): "test2"})))
|
||||
.unwrap();
|
||||
index_writer
|
||||
.add_document(doc!(field=>json!({format!("a{field_name_in}a"): "test3"})))
|
||||
.unwrap();
|
||||
index_writer
|
||||
.add_document(
|
||||
doc!(field=>json!({format!("a{field_name_in}a{field_name_in}"): "test4"})),
|
||||
)
|
||||
.unwrap();
|
||||
index_writer
|
||||
.add_document(
|
||||
doc!(field=>json!({format!("a{field_name_in}.ab{field_name_in}"): "test5"})),
|
||||
)
|
||||
.unwrap();
|
||||
index_writer
|
||||
.add_document(
|
||||
doc!(field=>json!({format!("a{field_name_in}"): json!({format!("a{field_name_in}"): "test6"}) })),
|
||||
)
|
||||
.unwrap();
|
||||
index_writer
|
||||
.add_document(doc!(field=>json!({format!("{field_name_in}a" ): "test7"})))
|
||||
.unwrap();
|
||||
|
||||
index_writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let parse_query = QueryParser::for_index(&index, Vec::new());
|
||||
let test_query = |field_name: &str| {
|
||||
let query = parse_query.parse_query(field_name).unwrap();
|
||||
let num_docs = searcher.search(&query, &Count).unwrap();
|
||||
assert_eq!(num_docs, 1);
|
||||
};
|
||||
test_query(format!("json.{field_name_out}:test1").as_str());
|
||||
test_query(format!("json.a{field_name_out}:test2").as_str());
|
||||
test_query(format!("json.a{field_name_out}a:test3").as_str());
|
||||
test_query(format!("json.a{field_name_out}a{field_name_out}:test4").as_str());
|
||||
test_query(format!("json.a{field_name_out}.ab{field_name_out}:test5").as_str());
|
||||
test_query(format!("json.a{field_name_out}.a{field_name_out}:test6").as_str());
|
||||
test_query(format!("json.{field_name_out}a:test7").as_str());
|
||||
|
||||
let test_agg = |field_name: &str, expected: &str| {
|
||||
let agg_req_str = json!(
|
||||
{
|
||||
"termagg": {
|
||||
"terms": {
|
||||
"field": field_name,
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
let agg_req: Aggregations = serde_json::from_value(agg_req_str).unwrap();
|
||||
let collector = AggregationCollector::from_aggs(agg_req, Default::default());
|
||||
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
|
||||
let res = serde_json::to_value(agg_res).unwrap();
|
||||
assert_eq!(res["termagg"]["buckets"][0]["doc_count"], 1);
|
||||
assert_eq!(res["termagg"]["buckets"][0]["key"], expected);
|
||||
};
|
||||
|
||||
test_agg(format!("json.{field_name_out}").as_str(), "test1");
|
||||
test_agg(format!("json.a{field_name_out}").as_str(), "test2");
|
||||
test_agg(format!("json.a{field_name_out}a").as_str(), "test3");
|
||||
test_agg(
|
||||
format!("json.a{field_name_out}a{field_name_out}").as_str(),
|
||||
"test4",
|
||||
);
|
||||
test_agg(
|
||||
format!("json.a{field_name_out}.ab{field_name_out}").as_str(),
|
||||
"test5",
|
||||
);
|
||||
test_agg(
|
||||
format!("json.a{field_name_out}.a{field_name_out}").as_str(),
|
||||
"test6",
|
||||
);
|
||||
test_agg(format!("json.{field_name_out}a").as_str(), "test7");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_json_field_expand_dots_enabled_dot_escape_not_required() {
|
||||
|
||||
@@ -67,10 +67,18 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
|
||||
) -> io::Result<()> {
|
||||
let mut term_buffer = Term::with_capacity(48);
|
||||
let mut buffer_lender = BufferLender::default();
|
||||
term_buffer.clear_with_field_and_type(Type::Json, Field::from_field_id(0));
|
||||
let mut prev_term_id = u32::MAX;
|
||||
let mut term_path_len = 0; // this will be set in the first iteration
|
||||
for (_field, path_id, term, addr) in term_addrs {
|
||||
term_buffer.clear_with_field_and_type(Type::Json, Field::from_field_id(0));
|
||||
term_buffer.append_bytes(ordered_id_to_path[path_id.path_id() as usize].as_bytes());
|
||||
term_buffer.append_bytes(&[JSON_END_OF_PATH]);
|
||||
if prev_term_id != path_id.path_id() {
|
||||
term_buffer.truncate_value_bytes(0);
|
||||
term_buffer.append_path(ordered_id_to_path[path_id.path_id() as usize].as_bytes());
|
||||
term_buffer.append_bytes(&[JSON_END_OF_PATH]);
|
||||
term_path_len = term_buffer.len_bytes();
|
||||
prev_term_id = path_id.path_id();
|
||||
}
|
||||
term_buffer.truncate_value_bytes(term_path_len);
|
||||
term_buffer.append_bytes(term);
|
||||
if let Some(json_value) = term_buffer.value().as_json_value_bytes() {
|
||||
let typ = json_value.typ();
|
||||
|
||||
@@ -218,6 +218,23 @@ impl Term {
|
||||
&mut self.0[len_before..]
|
||||
}
|
||||
|
||||
/// Appends json path bytes to the Term.
|
||||
/// If the path contains 0 bytes, they are replaced by a "0" string.
|
||||
/// The 0 byte is used to mark the end of the path.
|
||||
///
|
||||
/// This function returns the segment that has just been added.
|
||||
#[inline]
|
||||
pub fn append_path(&mut self, bytes: &[u8]) -> &mut [u8] {
|
||||
let len_before = self.0.len();
|
||||
if bytes.contains(&0u8) {
|
||||
self.0
|
||||
.extend(bytes.iter().map(|&b| if b == 0 { b'0' } else { b }));
|
||||
} else {
|
||||
self.0.extend_from_slice(bytes);
|
||||
}
|
||||
&mut self.0[len_before..]
|
||||
}
|
||||
|
||||
/// Appends a JSON_PATH_SEGMENT_SEP to the term.
|
||||
/// Only used for JSON type.
|
||||
#[inline]
|
||||
|
||||
Reference in New Issue
Block a user