pageserver: fix secondary download scheduling (#7396)

## Problem

Some tenants were observed to stop doing downloads after some time

## Summary of changes

- Fix a rogue `<` that was incorrectly scheduling work when `now` was
_before_ the scheduling target, rather than after. This usually resulted
in too-frequent execution, but could also result in never executing, if
the current time has advanced ahead of `next_download` at the time we
call `schedule()`.
- Fix in-memory list of timelines not being amended after timeline
deletion: the resulted in repeated harmless logs about the timeline
being removed, and redundant calls to remove_dir_all for the timeline
path.
- Add a log at startup to make it easier to see a particular tenant
starting in secondary mode (this is for parity with the logging that
exists when spawning an attached tenant). Previously searching on tenant
ID didn't provide a clear signal as to how the tenant was started during
pageserver start.
- Add a test that exercises secondary downloads using the background
scheduling, whereas existing tests were using the API hook to invoke
download directly.
This commit is contained in:
John Spray
2024-04-18 13:16:03 +01:00
committed by GitHub
parent 8d0f701767
commit 637ad4a638
4 changed files with 112 additions and 10 deletions

View File

@@ -1518,7 +1518,8 @@ pub(crate) struct SecondaryModeMetrics {
pub(crate) download_heatmap: IntCounter,
pub(crate) download_layer: IntCounter,
}
pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
SecondaryModeMetrics {
upload_heatmap: register_int_counter!(
"pageserver_secondary_upload_heatmap",
"Number of heatmaps written to remote storage by attached tenants"
@@ -1536,7 +1537,7 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| Seco
.expect("failed to define a metric"),
download_heatmap: register_int_counter!(
"pageserver_secondary_download_heatmap",
"Number of downloads of heatmaps by secondary mode locations"
"Number of downloads of heatmaps by secondary mode locations, including when it hasn't changed"
)
.expect("failed to define a metric"),
download_layer: register_int_counter!(
@@ -1544,6 +1545,7 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| Seco
"Number of downloads of layers by secondary mode locations"
)
.expect("failed to define a metric"),
}
});
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]

View File

@@ -678,12 +678,19 @@ pub async fn init_tenant_mgr(
}
}
}
LocationMode::Secondary(secondary_conf) => TenantSlot::Secondary(SecondaryTenant::new(
tenant_shard_id,
shard_identity,
location_conf.tenant_conf,
&secondary_conf,
)),
LocationMode::Secondary(secondary_conf) => {
info!(
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug(),
"Starting secondary tenant"
);
TenantSlot::Secondary(SecondaryTenant::new(
tenant_shard_id,
shard_identity,
location_conf.tenant_conf,
&secondary_conf,
))
}
};
tenants.insert(tenant_shard_id, slot);

View File

@@ -312,7 +312,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
(detail.last_download, detail.next_download.unwrap())
};
if now < next_download {
if now > next_download {
Some(PendingDownload {
secondary_state: secondary_tenant,
last_download,
@@ -647,6 +647,12 @@ impl<'a> TenantDownloader<'a> {
progress.bytes_downloaded += layer_byte_count;
progress.layers_downloaded += layer_count;
}
for delete_timeline in &delete_timelines {
// We haven't removed from disk yet, but optimistically remove from in-memory state: if removal
// from disk fails that will be a fatal error.
detail.timelines.remove(delete_timeline);
}
}
// Execute accumulated deletions
@@ -710,13 +716,14 @@ impl<'a> TenantDownloader<'a> {
.await
.map_err(UpdateError::from)?;
SECONDARY_MODE.download_heatmap.inc();
if Some(&download.etag) == prev_etag {
Ok(HeatMapDownload::Unmodified)
} else {
let mut heatmap_bytes = Vec::new();
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
SECONDARY_MODE.download_heatmap.inc();
Ok(HeatMapDownload::Modified(HeatMapModified {
etag: download.etag,
last_modified: download.last_modified,

View File

@@ -1,6 +1,7 @@
import json
import os
import random
import time
from pathlib import Path
from typing import Any, Dict, Optional
@@ -582,6 +583,91 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
)
def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
"""
Slow test that runs in realtime, checks that the background scheduling of secondary
downloads happens as expected.
"""
neon_env_builder.num_pageservers = 2
env = neon_env_builder.init_configs()
env.start()
# Create this many tenants, each with two timelines
tenant_count = 4
tenant_timelines = {}
# This mirrors a constant in `downloader.rs`
freshen_interval_secs = 60
for _i in range(0, tenant_count):
tenant_id = TenantId.generate()
timeline_a = TimelineId.generate()
timeline_b = TimelineId.generate()
env.neon_cli.create_tenant(
tenant_id,
timeline_a,
placement_policy='{"Attached":1}',
# Run with a low heatmap period so that we can avoid having to do synthetic API calls
# to trigger the upload promptly.
conf={"heatmap_period": "1s"},
)
env.neon_cli.create_timeline("main2", tenant_id, timeline_b)
tenant_timelines[tenant_id] = [timeline_a, timeline_b]
t_start = time.time()
# Wait long enough that the background downloads should happen; we expect all the inital layers
# of all the initial timelines to show up on the secondary location of each tenant.
time.sleep(freshen_interval_secs * 1.5)
for tenant_id, timelines in tenant_timelines.items():
attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
ps_attached = env.get_pageserver(attached_to_id)
# We only have two: the other one must be secondary
ps_secondary = next(p for p in env.pageservers if p != ps_attached)
for timeline_id in timelines:
log.info(f"Checking for secondary timeline {timeline_id} on node {ps_secondary.id}")
# One or more layers should be present for all timelines
assert list_layers(ps_secondary, tenant_id, timeline_id)
# Delete the second timeline: this should be reflected later on the secondary
env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1])
# Wait long enough for the secondary locations to see the deletion
time.sleep(freshen_interval_secs * 1.5)
for tenant_id, timelines in tenant_timelines.items():
attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
ps_attached = env.get_pageserver(attached_to_id)
# We only have two: the other one must be secondary
ps_secondary = next(p for p in env.pageservers if p != ps_attached)
# This one was not deleted
assert list_layers(ps_secondary, tenant_id, timelines[0])
# This one was deleted
assert not list_layers(ps_secondary, tenant_id, timelines[1])
t_end = time.time()
# Measure how many heatmap downloads we did in total: this checks that we succeeded with
# proper scheduling, and not some bug that just runs downloads in a loop.
total_heatmap_downloads = 0
for ps in env.pageservers:
v = ps.http_client().get_metric_value("pageserver_secondary_download_heatmap_total")
assert v is not None
total_heatmap_downloads += int(v)
download_rate = (total_heatmap_downloads / tenant_count) / (t_end - t_start)
expect_download_rate = 1.0 / freshen_interval_secs
log.info(f"Download rate: {download_rate * 60}/min vs expected {expect_download_rate * 60}/min")
assert download_rate < expect_download_rate * 2
@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
@pytest.mark.parametrize("via_controller", [True, False])
def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controller: bool):