Merge remote-tracking branch 'upstream/jcsp/tenant-snapshot' into jcsp/hack

This commit is contained in:
John Spray
2024-04-22 10:16:41 +01:00
35 changed files with 1828 additions and 221 deletions

View File

@@ -1,6 +1,7 @@
import json
import os
import random
import time
from pathlib import Path
from typing import Any, Dict, Optional
@@ -582,6 +583,91 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
)
def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
"""
Slow test that runs in realtime, checks that the background scheduling of secondary
downloads happens as expected.
"""
neon_env_builder.num_pageservers = 2
env = neon_env_builder.init_configs()
env.start()
# Create this many tenants, each with two timelines
tenant_count = 4
tenant_timelines = {}
# This mirrors a constant in `downloader.rs`
freshen_interval_secs = 60
for _i in range(0, tenant_count):
tenant_id = TenantId.generate()
timeline_a = TimelineId.generate()
timeline_b = TimelineId.generate()
env.neon_cli.create_tenant(
tenant_id,
timeline_a,
placement_policy='{"Attached":1}',
# Run with a low heatmap period so that we can avoid having to do synthetic API calls
# to trigger the upload promptly.
conf={"heatmap_period": "1s"},
)
env.neon_cli.create_timeline("main2", tenant_id, timeline_b)
tenant_timelines[tenant_id] = [timeline_a, timeline_b]
t_start = time.time()
# Wait long enough that the background downloads should happen; we expect all the inital layers
# of all the initial timelines to show up on the secondary location of each tenant.
time.sleep(freshen_interval_secs * 1.5)
for tenant_id, timelines in tenant_timelines.items():
attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
ps_attached = env.get_pageserver(attached_to_id)
# We only have two: the other one must be secondary
ps_secondary = next(p for p in env.pageservers if p != ps_attached)
for timeline_id in timelines:
log.info(f"Checking for secondary timeline {timeline_id} on node {ps_secondary.id}")
# One or more layers should be present for all timelines
assert list_layers(ps_secondary, tenant_id, timeline_id)
# Delete the second timeline: this should be reflected later on the secondary
env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1])
# Wait long enough for the secondary locations to see the deletion
time.sleep(freshen_interval_secs * 1.5)
for tenant_id, timelines in tenant_timelines.items():
attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
ps_attached = env.get_pageserver(attached_to_id)
# We only have two: the other one must be secondary
ps_secondary = next(p for p in env.pageservers if p != ps_attached)
# This one was not deleted
assert list_layers(ps_secondary, tenant_id, timelines[0])
# This one was deleted
assert not list_layers(ps_secondary, tenant_id, timelines[1])
t_end = time.time()
# Measure how many heatmap downloads we did in total: this checks that we succeeded with
# proper scheduling, and not some bug that just runs downloads in a loop.
total_heatmap_downloads = 0
for ps in env.pageservers:
v = ps.http_client().get_metric_value("pageserver_secondary_download_heatmap_total")
assert v is not None
total_heatmap_downloads += int(v)
download_rate = (total_heatmap_downloads / tenant_count) / (t_end - t_start)
expect_download_rate = 1.0 / freshen_interval_secs
log.info(f"Download rate: {download_rate * 60}/min vs expected {expect_download_rate * 60}/min")
assert download_rate < expect_download_rate * 2
@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
@pytest.mark.parametrize("via_controller", [True, False])
def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controller: bool):

View File

@@ -274,7 +274,8 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
but imports the generation number.
"""
neon_env_builder.num_pageservers = 2
# One pageserver to simulate legacy environment, two to be managed by storage controller
neon_env_builder.num_pageservers = 3
# Start services by hand so that we can skip registration on one of the pageservers
env = neon_env_builder.init_configs()
@@ -289,10 +290,10 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
)
origin_ps = env.pageservers[0]
# This is the pageserver managed by the sharding service, where the tenant
# These are the pageservers managed by the sharding service, where the tenant
# will be attached after onboarding
env.pageservers[1].start()
dest_ps = env.pageservers[1]
env.pageservers[2].start()
virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
for sk in env.safekeepers:
@@ -331,6 +332,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
)
virtual_ps_http.tenant_secondary_download(tenant_id)
warm_up_ps = env.storage_controller.tenant_describe(tenant_id)["shards"][0][
"node_secondary"
][0]
# Call into storage controller to onboard the tenant
generation += 1
@@ -345,6 +349,18 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
)
assert len(r["shards"]) == 1
describe = env.storage_controller.tenant_describe(tenant_id)["shards"][0]
dest_ps_id = describe["node_attached"]
dest_ps = env.get_pageserver(dest_ps_id)
if warm_up:
# The storage controller should have attached the tenant to the same placce
# it had a secondary location, otherwise there was no point warming it up
assert dest_ps_id == warm_up_ps
# It should have been given a new secondary location as well
assert len(describe["node_secondary"]) == 1
assert describe["node_secondary"][0] != warm_up_ps
# As if doing a live migration, detach the original pageserver
origin_ps.http_client().tenant_location_conf(
tenant_id,
@@ -416,6 +432,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"]
)
dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id)
# Storage controller auto-sets heatmap period, ignore it for the comparison
del dest_tenant_conf_after.tenant_specific_overrides["heatmap_period"]
assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf
env.storage_controller.consistency_check()