mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-17 10:22:56 +00:00
pageserver/delta_layer: add vectored reconstruct
Visit the btree index only one and collect all the offsets which need to be read (stored in ascending order). Then issue the reads. Again, read amplification is still an issue for now.
This commit is contained in:
@@ -35,16 +35,19 @@ use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::Timeline;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::LayerAccessKind;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::BTreeMap;
|
||||
use std::fs::File;
|
||||
use std::io::SeekFrom;
|
||||
use std::ops::Range;
|
||||
@@ -59,7 +62,9 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer};
|
||||
use super::{
|
||||
AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
|
||||
};
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -824,6 +829,130 @@ impl DeltaLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
// Look up the keys in the provided keyspace and update
|
||||
// the reconstruct state with whatever is found.
|
||||
//
|
||||
// If the key is cached, go no further than the cached Lsn.
|
||||
//
|
||||
// Currently, the index is visited for each range, but this
|
||||
// can be further optimised to visit the index only once.
|
||||
pub(super) async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
end_lsn: Lsn,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
let file = &self.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
file,
|
||||
);
|
||||
|
||||
let mut offsets: BTreeMap<Key, Vec<(Lsn, u64)>> = BTreeMap::new();
|
||||
|
||||
for range in keyspace.ranges.iter() {
|
||||
let mut ignore_key = None;
|
||||
|
||||
// Scan the page versions backwards, starting from the last key in the range.
|
||||
// to collect all the offsets at which need to be read.
|
||||
let end_key = DeltaKey::from_key_lsn(&range.end, Lsn(end_lsn.0 - 1));
|
||||
tree_reader
|
||||
.visit(
|
||||
&end_key.0,
|
||||
VisitDirection::Backwards,
|
||||
|raw_key, value| {
|
||||
let key = Key::from_slice(&raw_key[..KEY_SIZE]);
|
||||
let entry_lsn = DeltaKey::extract_lsn_from_buf(raw_key);
|
||||
|
||||
if entry_lsn >= end_lsn {
|
||||
return true;
|
||||
}
|
||||
|
||||
if key < range.start {
|
||||
return false;
|
||||
}
|
||||
|
||||
if key >= range.end {
|
||||
return true;
|
||||
}
|
||||
|
||||
if Some(key) == ignore_key {
|
||||
return true;
|
||||
}
|
||||
|
||||
if let Some(cached_lsn) = reconstruct_state.get_cached_lsn(&key) {
|
||||
if entry_lsn <= cached_lsn {
|
||||
return key != range.start;
|
||||
}
|
||||
}
|
||||
|
||||
let blob_ref = BlobRef(value);
|
||||
let lsns_at = offsets.entry(key).or_default();
|
||||
lsns_at.push((entry_lsn, blob_ref.pos()));
|
||||
|
||||
if blob_ref.will_init() {
|
||||
if key == range.start {
|
||||
return false;
|
||||
} else {
|
||||
ignore_key = Some(key);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
true
|
||||
},
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::DeltaLayerBtreeNode)
|
||||
.build(),
|
||||
)
|
||||
.await
|
||||
.map_err(|err| GetVectoredError::Other(anyhow!(err)))?;
|
||||
}
|
||||
|
||||
let ctx = &RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::DeltaLayerValue)
|
||||
.build();
|
||||
|
||||
let cursor = file.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (key, lsns_at) in offsets {
|
||||
for (lsn, block_offset) in lsns_at {
|
||||
let res = cursor.read_blob_into_buf(block_offset, &mut buf, ctx).await;
|
||||
|
||||
if let Err(e) = res {
|
||||
reconstruct_state.on_key_error(
|
||||
key,
|
||||
PageReconstructError::from(anyhow!(e).context(format!(
|
||||
"Failed to read blob from virtual file {}",
|
||||
file.file.path
|
||||
))),
|
||||
);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
let value = Value::des(&buf);
|
||||
if let Err(e) = value {
|
||||
reconstruct_state.on_key_error(
|
||||
key,
|
||||
PageReconstructError::from(anyhow!(e).context(format!(
|
||||
"Failed to deserialize file blob from virtual file {}",
|
||||
file.file.path
|
||||
))),
|
||||
);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
reconstruct_state.update_key(&key, lsn, value.unwrap());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(super) async fn load_keys<'a>(
|
||||
&'a self,
|
||||
ctx: &RequestContext,
|
||||
|
||||
@@ -455,6 +455,9 @@ pub(crate) enum GetVectoredError {
|
||||
|
||||
#[error("Requested at invalid LSN: {0}")]
|
||||
InvalidLsn(Lsn),
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
|
||||
Reference in New Issue
Block a user