fix: doc store for files larger 4GB (#1856)

Fixes an issue in the skip list deserialization, which deserialized the byte start offset incorrectly as u32.
`get_doc` will fail for any docs that live in a block with start offset larger than u32::MAX (~4GB).
Causes index corruption, if a segment with a doc store larger 4GB is merged.

tantivy version 0.19 is affected
This commit is contained in:
PSeitz
2023-02-10 21:29:43 +08:00
committed by Paul Masurel
parent 7bf0a14041
commit 7ce8a65619

View File

@@ -90,7 +90,7 @@ impl CheckpointBlock {
return Ok(());
}
let mut doc = read_u32_vint(data);
let mut start_offset = read_u32_vint(data) as usize;
let mut start_offset = VInt::deserialize_u64(data)? as usize;
for _ in 0..len {
let num_docs = read_u32_vint(data);
let block_num_bytes = read_u32_vint(data) as usize;
@@ -147,6 +147,15 @@ mod tests {
test_aux_ser_deser(&checkpoints)
}
#[test]
fn test_block_serialize_large_byte_range() -> io::Result<()> {
let checkpoints = vec![Checkpoint {
doc_range: 10..12,
byte_range: 8_000_000_000..9_000_000_000,
}];
test_aux_ser_deser(&checkpoints)
}
#[test]
fn test_block_serialize() -> io::Result<()> {
let offsets: Vec<usize> = (0..11).map(|i| i * i * i).collect();