pageserver: fixes for layer path changes (#7786)

## Problem

- When a layer with a legacy local path format was evicted and then
re-downloaded, the pageserver panicked because the path that remote
storage downloaded to didn't match the path stored in `Layer` (see the
sketch below).
- While investigating, I also realized that secondary locations would
have a similar issue with evictions.

Closes: #7783 
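
To make the failure mode concrete, here is a purely illustrative sketch (hypothetical, simplified types; `std::path` stands in for the `camino` paths the pageserver uses, and the assertion shown is an illustration, not the real panic site): the layer remembers the legacy path it was discovered at, but the download destination used to be recomputed from the canonical path helper, so after an evict/re-download cycle the file is not where the `Layer` expects it.

```rust
use std::path::{Path, PathBuf};

// Canonical ("default") location for a layer file, derived from its name.
// Hypothetical stand-in for the pageserver's local path helper.
fn default_layer_path(timeline_dir: &Path, layer_name: &str) -> PathBuf {
    timeline_dir.join(format!("{layer_name}-v1-00000001"))
}

struct Layer {
    name: String,
    // Populated from whatever file was actually found on disk at startup;
    // for pre-upgrade layers this is a legacy-format path without the suffix.
    local_path: PathBuf,
}

fn main() {
    let timeline_dir = Path::new("/data/timelines/tl");
    let layer = Layer {
        name: "layer-A".to_string(),
        local_path: timeline_dir.join("layer-A"), // legacy-style path
    };

    // Pre-fix behaviour: after eviction, the downloader recomputed its own
    // destination instead of asking the layer, so the re-downloaded file
    // lands at the default path...
    let downloaded_to = default_layer_path(timeline_dir, &layer.name);

    // ...and a later check against the path stored in `Layer` blows up.
    assert_eq!(
        downloaded_to, layer.local_path,
        "layer file is not at the path the Layer expects"
    ); // panics for legacy-named layers
}
```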

## Summary of changes

- Make the remote timeline client take local paths as an input: it should
not have its own idea of local paths; instead it uses whatever path the
`Layer` already has.
- Make secondary state store an explicit local path, populated by the scan
of local disk at startup. This gives the same behavior as `Layer`:
`local_layer_path` is only a _default_, and the actual layer path can
be anything (e.g. an old-style one). A sketch of this shape follows the list.
- Add tests for both cases.
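
A minimal sketch of the direction of the fix, under the same caveats as above (made-up names, `std::path` in place of `camino::Utf8Path`, not the actual pageserver API): the path helper only supplies a default for new layers, resident state records its path explicitly, and the download routine takes that path as an argument instead of recomputing it.

```rust
use std::path::{Path, PathBuf};

// Default path for a brand new layer; existing layers may live elsewhere
// (e.g. at an old-style path discovered during the startup scan).
fn default_layer_path(timeline_dir: &Path, layer_name: &str) -> PathBuf {
    timeline_dir.join(format!("{layer_name}-v1-00000001"))
}

// Resident layer state: the local path is stored explicitly.
struct OnDiskState {
    layer_name: String,
    local_path: PathBuf,
}

impl OnDiskState {
    // Eviction deletes whatever path we recorded, tolerating a concurrent removal.
    fn remove(&self) -> std::io::Result<()> {
        match std::fs::remove_file(&self.local_path) {
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(()),
            other => other,
        }
    }
}

// The download side no longer has its own idea of where the file belongs.
fn download_layer_file(remote_name: &str, local_path: &Path) -> std::io::Result<u64> {
    // Placeholder for the real download + durable rename into `local_path`.
    std::fs::write(local_path, remote_name)?;
    Ok(remote_name.len() as u64)
}

fn main() -> std::io::Result<()> {
    let timeline_dir = std::env::temp_dir();

    // Legacy layer found on disk: keep its path as-is.
    let legacy = OnDiskState {
        layer_name: "layer-A".to_string(),
        local_path: timeline_dir.join("layer-A"),
    };
    // New layer: only here do we consult the default helper.
    let fresh = OnDiskState {
        layer_name: "layer-B".to_string(),
        local_path: default_layer_path(&timeline_dir, "layer-B"),
    };

    // Evict and re-download: both layers come back to the exact path we recorded.
    for layer in [&legacy, &fresh] {
        download_layer_file(&layer.layer_name, &layer.local_path)?;
        layer.remove()?;
        download_layer_file(&layer.layer_name, &layer.local_path)?;
        assert!(layer.local_path.exists());
    }
    Ok(())
}
```

The diffs below apply this shape to `RemoteTimelineClient::download_layer_file`, the `download_layer_file` helper in the download module, and the secondary `OnDiskState`.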
Author: John Spray
Date: 2024-05-17 13:24:03 +01:00 (committed by GitHub)
Commit: a8e6d259cb (parent c1390bfc3b)
7 changed files with 119 additions and 88 deletions


@@ -535,17 +535,11 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
             }
             EvictionLayer::Secondary(layer) => {
                 let file_size = layer.metadata.file_size();
-                let tenant_manager = tenant_manager.clone();

                 js.spawn(async move {
                     layer
                         .secondary_tenant
-                        .evict_layer(
-                            tenant_manager.get_conf(),
-                            layer.timeline_id,
-                            layer.name,
-                            layer.metadata,
-                        )
+                        .evict_layer(layer.timeline_id, layer.name)
                         .await;
                     Ok(file_size)
                 });


@@ -518,6 +518,7 @@ impl RemoteTimelineClient {
         &self,
         layer_file_name: &LayerName,
         layer_metadata: &LayerFileMetadata,
+        local_path: &Utf8Path,
         cancel: &CancellationToken,
         ctx: &RequestContext,
     ) -> anyhow::Result<u64> {
@@ -536,6 +537,7 @@ impl RemoteTimelineClient {
             self.timeline_id,
             layer_file_name,
             layer_metadata,
+            local_path,
             cancel,
             ctx,
         )


@@ -21,7 +21,6 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
-use crate::tenant::storage_layer::layer::local_layer_path;
 use crate::tenant::storage_layer::LayerName;
 use crate::tenant::Generation;
 use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
@@ -50,19 +49,13 @@ pub async fn download_layer_file<'a>(
     timeline_id: TimelineId,
     layer_file_name: &'a LayerName,
     layer_metadata: &'a LayerFileMetadata,
+    local_path: &Utf8Path,
     cancel: &CancellationToken,
     ctx: &RequestContext,
 ) -> Result<u64, DownloadError> {
     debug_assert_current_span_has_tenant_and_timeline_id();

     let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id);
-    let local_path = local_layer_path(
-        conf,
-        &tenant_shard_id,
-        &timeline_id,
-        layer_file_name,
-        &layer_metadata.generation,
-    );

     let remote_path = remote_layer_path(
         &tenant_shard_id.tenant_id,
@@ -82,7 +75,7 @@ pub async fn download_layer_file<'a>(
     // For more context about durable_rename check this email from postgres mailing list:
     // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
     // If pageserver crashes the temp file will be deleted on startup and re-downloaded.
-    let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
+    let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION);

     let bytes_amount = download_retry(
         || async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await },


@@ -6,11 +6,9 @@ mod scheduler;
 use std::{sync::Arc, time::SystemTime};

 use crate::{
-    config::PageServerConf,
     context::RequestContext,
     disk_usage_eviction_task::DiskUsageEvictionInfo,
     task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
-    virtual_file::MaybeFatalIo,
 };
use self::{
@@ -21,9 +19,8 @@ use self::{
 use super::{
     config::{SecondaryLocationConfig, TenantConfOpt},
     mgr::TenantManager,
-    remote_timeline_client::LayerFileMetadata,
     span::debug_assert_current_span_has_tenant_id,
-    storage_layer::{layer::local_layer_path, LayerName},
+    storage_layer::LayerName,
 };
use pageserver_api::{
@@ -178,13 +175,7 @@ impl SecondaryTenant {
     /// Cancellation safe, but on cancellation the eviction will go through
     #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))]
-    pub(crate) async fn evict_layer(
-        self: &Arc<Self>,
-        conf: &PageServerConf,
-        timeline_id: TimelineId,
-        name: LayerName,
-        metadata: LayerFileMetadata,
-    ) {
+    pub(crate) async fn evict_layer(self: &Arc<Self>, timeline_id: TimelineId, name: LayerName) {
         debug_assert_current_span_has_tenant_id();

         let guard = match self.gate.enter() {
@@ -197,41 +188,11 @@ impl SecondaryTenant {
         let now = SystemTime::now();

-        let local_path = local_layer_path(
-            conf,
-            &self.tenant_shard_id,
-            &timeline_id,
-            &name,
-            &metadata.generation,
-        );

         let this = self.clone();

         // spawn it to be cancellation safe
         tokio::task::spawn_blocking(move || {
             let _guard = guard;

-            // We tolerate ENOENT, because between planning eviction and executing
-            // it, the secondary downloader could have seen an updated heatmap that
-            // resulted in a layer being deleted.
-            // Other local I/O errors are process-fatal: these should never happen.
-            let deleted = std::fs::remove_file(local_path);
-            let not_found = deleted
-                .as_ref()
-                .is_err_and(|x| x.kind() == std::io::ErrorKind::NotFound);
-            let deleted = if not_found {
-                false
-            } else {
-                deleted
-                    .map(|()| true)
-                    .fatal_err("Deleting layer during eviction")
-            };
-
-            if !deleted {
-                // skip updating accounting and putting perhaps later timestamp
-                return;
-            }

             // Update the timeline's state. This does not have to be synchronized with
             // the download process, because:
@@ -250,8 +211,15 @@ impl SecondaryTenant {
             // of the cache.
             let mut detail = this.detail.lock().unwrap();
             if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
-                timeline_detail.on_disk_layers.remove(&name);
-                timeline_detail.evicted_at.insert(name, now);
+                let removed = timeline_detail.on_disk_layers.remove(&name);
+
+                // We might race with removal of the same layer during downloads, if it was removed
+                // from the heatmap. If we see that the OnDiskState is gone, then no need to
+                // do a physical deletion or store in evicted_at.
+                if let Some(removed) = removed {
+                    removed.remove_blocking();
+                    timeline_detail.evicted_at.insert(name, now);
+                }
             }
         })
         .await


@@ -111,6 +111,7 @@ struct SecondaryDownloader {
 pub(super) struct OnDiskState {
     metadata: LayerFileMetadata,
     access_time: SystemTime,
+    local_path: Utf8PathBuf,
 }

 impl OnDiskState {
@@ -121,12 +122,26 @@ impl OnDiskState {
         _ame: LayerName,
         metadata: LayerFileMetadata,
         access_time: SystemTime,
+        local_path: Utf8PathBuf,
     ) -> Self {
         Self {
             metadata,
             access_time,
+            local_path,
         }
     }
+
+    // This is infallible, because all errors are either acceptable (ENOENT), or totally
+    // unexpected (fatal).
+    pub(super) fn remove_blocking(&self) {
+        // We tolerate ENOENT, because between planning eviction and executing
+        // it, the secondary downloader could have seen an updated heatmap that
+        // resulted in a layer being deleted.
+        // Other local I/O errors are process-fatal: these should never happen.
+        std::fs::remove_file(&self.local_path)
+            .or_else(fs_ext::ignore_not_found)
+            .fatal_err("Deleting secondary layer")
+    }
 }
#[derive(Debug, Clone, Default)]
@@ -816,20 +831,12 @@ impl<'a> TenantDownloader<'a> {
             if cfg!(debug_assertions) {
                 // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
                 // are already present on disk are really there.
-                let local_path = local_layer_path(
-                    self.conf,
-                    tenant_shard_id,
-                    &timeline.timeline_id,
-                    &layer.name,
-                    &layer.metadata.generation,
-                );
-                match tokio::fs::metadata(&local_path).await {
+                match tokio::fs::metadata(&on_disk.local_path).await {
                     Ok(meta) => {
                         tracing::debug!(
                             "Layer {} present at {}, size {}",
                             layer.name,
-                            local_path,
+                            on_disk.local_path,
                             meta.len(),
                         );
                     }
@@ -837,7 +844,7 @@ impl<'a> TenantDownloader<'a> {
                         tracing::warn!(
                             "Layer {} not found at {} ({})",
                             layer.name,
-                            local_path,
+                            on_disk.local_path,
                             e
                         );
                         debug_assert!(false);
@@ -926,6 +933,13 @@ impl<'a> TenantDownloader<'a> {
                     v.get_mut().access_time = t.access_time;
                 }
                 Entry::Vacant(e) => {
+                    let local_path = local_layer_path(
+                        self.conf,
+                        tenant_shard_id,
+                        &timeline.timeline_id,
+                        &t.name,
+                        &t.metadata.generation,
+                    );
                     e.insert(OnDiskState::new(
                         self.conf,
                         tenant_shard_id,
@@ -933,6 +947,7 @@
                         t.name,
                         LayerFileMetadata::from(&t.metadata),
                         t.access_time,
+                        local_path,
                     ));
                 }
             }
@@ -955,6 +970,14 @@
             &self.secondary_state.cancel
         );

+        let local_path = local_layer_path(
+            self.conf,
+            tenant_shard_id,
+            timeline_id,
+            &layer.name,
+            &layer.metadata.generation,
+        );
+
         // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
         let downloaded_bytes = match download_layer_file(
             self.conf,
@@ -963,6 +986,7 @@
             *timeline_id,
             &layer.name,
             &LayerFileMetadata::from(&layer.metadata),
+            &local_path,
             &self.secondary_state.cancel,
             ctx,
         )
@@ -1116,6 +1140,7 @@ async fn init_timeline_state(
                 name,
                 LayerFileMetadata::from(&remote_meta.metadata),
                 remote_meta.access_time,
+                file_path,
             ),
         );
     }


@@ -1108,6 +1108,7 @@ impl LayerInner {
             .download_layer_file(
                 &self.desc.layer_name(),
                 &self.metadata(),
+                &self.path,
                 &timeline.cancel,
                 ctx,
             )


@@ -25,6 +25,7 @@ from fixtures.neon_fixtures import (
     S3Scrubber,
     generate_uploads_and_deletions,
 )
+from fixtures.pageserver.common_types import parse_layer_file_name
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
     assert_tenant_state,
@@ -632,39 +633,86 @@ def test_upgrade_generationless_local_file_paths(
     generation numbers: it should accept these layer files, and avoid doing
     a delete/download cycle on them.
     """
-    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
-    tenant_id = env.initial_tenant
-    timeline_id = env.initial_timeline
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+    env.neon_cli.create_tenant(
+        tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}'
+    )

     workload = Workload(env, tenant_id, timeline_id)
     workload.init()
     workload.write_rows(1000)

-    env.pageserver.stop()
+    attached_pageserver = env.get_tenant_pageserver(tenant_id)
+    secondary_pageserver = list([ps for ps in env.pageservers if ps.id != attached_pageserver.id])[
+        0
+    ]
+
+    attached_pageserver.http_client().tenant_heatmap_upload(tenant_id)
+    secondary_pageserver.http_client().tenant_secondary_download(tenant_id)

     # Rename the local paths to legacy format, to simulate what
-    # we would see when upgrading
-    timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
-    files_renamed = 0
-    for filename in os.listdir(timeline_dir):
-        path = os.path.join(timeline_dir, filename)
-        log.info(f"Found file {path}")
-        if path.endswith("-v1-00000001"):
-            new_path = path[:-12]
-            os.rename(path, new_path)
-            log.info(f"Renamed {path} -> {new_path}")
-            files_renamed += 1
+    # we would see when upgrading. Do this on both attached and secondary locations, as we will
+    # test the behavior of both.
+    for pageserver in env.pageservers:
+        pageserver.stop()
+        timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id)
+        files_renamed = 0
+        for filename in os.listdir(timeline_dir):
+            path = os.path.join(timeline_dir, filename)
+            log.info(f"Found file {path}")
+            if path.endswith("-v1-00000001"):
+                new_path = path[:-12]
+                os.rename(path, new_path)
+                log.info(f"Renamed {path} -> {new_path}")
+                files_renamed += 1

-    assert files_renamed > 0
+        assert files_renamed > 0

-    env.pageserver.start()
+        pageserver.start()

     workload.validate()

     # Assert that there were no on-demand downloads
     assert (
-        env.pageserver.http_client().get_metric_value(
+        attached_pageserver.http_client().get_metric_value(
             "pageserver_remote_ondemand_downloaded_layers_total"
         )
         == 0
     )
+
+    # Do a secondary download and ensure there were no layer downloads
+    secondary_pageserver.http_client().tenant_secondary_download(tenant_id)
+    assert (
+        secondary_pageserver.http_client().get_metric_value(
+            "pageserver_secondary_download_layer_total"
+        )
+        == 0
+    )
+
+    # Check that when we evict and promote one of the legacy-named layers, everything works as
+    # expected
+    local_layers = list(
+        (
+            parse_layer_file_name(path.name),
+            os.path.join(attached_pageserver.timeline_dir(tenant_id, timeline_id), path),
+        )
+        for path in attached_pageserver.list_layers(tenant_id, timeline_id)
+    )
+    (victim_layer_name, victim_path) = local_layers[0]
+    assert os.path.exists(victim_path)
+
+    attached_pageserver.http_client().evict_layer(
+        tenant_id, timeline_id, victim_layer_name.to_str()
+    )
+    assert not os.path.exists(victim_path)
+
+    attached_pageserver.http_client().download_layer(
+        tenant_id, timeline_id, victim_layer_name.to_str()
+    )
+
+    # We should download into the same local path we started with
+    assert os.path.exists(victim_path)