Skip to content

Commit 3da08e9

Browse files
committed
fix: doc store for files larger than 4GB
Fixes an issue in the skip list deserialization, which incorrectly deserialized the byte start offset as a u32. `get_doc` will fail for any doc that lives in a block with a start offset larger than u32::MAX (~4GB). This causes index corruption if a segment with a doc store larger than 4GB is merged. tantivy version 0.19 is affected.
1 parent 6c4b8d9 commit 3da08e9

File tree

1 file changed

+10
-1
lines changed

1 file changed

+10
-1
lines changed

src/store/index/block.rs

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ impl CheckpointBlock {
9090
return Ok(());
9191
}
9292
let mut doc = read_u32_vint(data);
93-
let mut start_offset = read_u32_vint(data) as usize;
93+
let mut start_offset = VInt::deserialize_u64(data)? as usize;
9494
for _ in 0..len {
9595
let num_docs = read_u32_vint(data);
9696
let block_num_bytes = read_u32_vint(data) as usize;
@@ -147,6 +147,15 @@ mod tests {
147147
test_aux_ser_deser(&checkpoints)
148148
}
149149

150+
#[test]
151+
fn test_block_serialize_large_byte_range() -> io::Result<()> {
152+
let checkpoints = vec![Checkpoint {
153+
doc_range: 10..12,
154+
byte_range: 8_000_000_000..9_000_000_000,
155+
}];
156+
test_aux_ser_deser(&checkpoints)
157+
}
158+
150159
#[test]
151160
fn test_block_serialize() -> io::Result<()> {
152161
let offsets: Vec<usize> = (0..11).map(|i| i * i * i).collect();

0 commit comments

Comments
 (0)