Compare commits


2 Commits

Author SHA1 Message Date
Pascal Seitz
6761237ec7 chore: Release 0.19.2 2023-02-10 12:20:20 +08:00
Pascal Seitz
3da08e92c7 fix: doc store for files larger than 4GB
Fixes an issue in the skip list deserialization, which incorrectly deserialized the byte start offset as a u32.
`get_doc` will fail for any doc that lives in a block whose start offset is larger than u32::MAX (~4GB).
This causes index corruption if a segment with a doc store larger than 4GB is merged.

tantivy version 0.19 is affected.
2023-02-10 12:12:47 +08:00
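The bug described above comes down to a 64-bit byte offset being forced through a 32-bit decode path. As a minimal, self-contained illustration of that class of truncation (not tantivy's actual code), assuming a block that starts roughly 8GB into the doc store file:

fn main() {
    // A doc store block that starts beyond the 4GB mark of the file.
    let true_start_offset: u64 = 8_000_000_000;

    // Forcing the offset through a 32-bit value silently wraps it around:
    // 8_000_000_000 mod 2^32 = 3_705_032_704.
    let truncated = true_start_offset as u32 as u64;

    assert_eq!(u32::MAX as u64, 4_294_967_295); // the ~4GB boundary
    assert_ne!(truncated, true_start_offset);
    println!("expected {true_start_offset}, got {truncated}");
}

A reader that seeks to the wrapped-around offset ends up on the wrong bytes, which matches the `get_doc` failures described in the commit message.
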
2 changed files with 11 additions and 2 deletions

@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.19.1"
+version = "0.19.2"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]

@@ -90,7 +90,7 @@ impl CheckpointBlock {
             return Ok(());
         }
         let mut doc = read_u32_vint(data);
-        let mut start_offset = read_u32_vint(data) as usize;
+        let mut start_offset = VInt::deserialize_u64(data)? as usize;
         for _ in 0..len {
             let num_docs = read_u32_vint(data);
             let block_num_bytes = read_u32_vint(data) as usize;
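The fix above swaps the u32 vint read of the block's start offset for a u64 deserialization, so offsets beyond u32::MAX survive the round trip. The sketch below uses a generic LEB128-style varint purely to show why the decoder's width matters; it is an illustrative assumption, not tantivy's actual `VInt` wire format:

// Generic LEB128-style varint, for illustration only; tantivy's `VInt`
// encoding may differ in detail.
fn encode_varint(mut value: u64, out: &mut Vec<u8>) {
    loop {
        let byte = (value & 0x7f) as u8;
        value >>= 7;
        if value == 0 {
            out.push(byte);
            return;
        }
        out.push(byte | 0x80); // set continuation bit, more bytes follow
    }
}

fn decode_varint_u64(data: &[u8]) -> (u64, usize) {
    let mut result = 0u64;
    for (i, &byte) in data.iter().enumerate() {
        result |= u64::from(byte & 0x7f) << (7 * i);
        if byte & 0x80 == 0 {
            return (result, i + 1); // (value, bytes consumed)
        }
    }
    panic!("truncated varint");
}

fn main() {
    let start_offset: u64 = 8_000_000_000; // > u32::MAX
    let mut buf = Vec::new();
    encode_varint(start_offset, &mut buf);

    // A 64-bit decode preserves the offset...
    let (decoded, _len) = decode_varint_u64(&buf);
    assert_eq!(decoded, start_offset);

    // ...while a 32-bit integer cannot even represent it.
    assert!(u32::try_from(start_offset).is_err());
}
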
@@ -147,6 +147,15 @@ mod tests {
         test_aux_ser_deser(&checkpoints)
     }
 
+    #[test]
+    fn test_block_serialize_large_byte_range() -> io::Result<()> {
+        let checkpoints = vec![Checkpoint {
+            doc_range: 10..12,
+            byte_range: 8_000_000_000..9_000_000_000,
+        }];
+        test_aux_ser_deser(&checkpoints)
+    }
+
     #[test]
     fn test_block_serialize() -> io::Result<()> {
         let offsets: Vec<usize> = (0..11).map(|i| i * i * i).collect();