feat(pageserver): integrate lsn lease into synthetic size (#8220)

Part of #7497, closes #8071. (accidentally closed #8208, reopened here)

## Problem

After the changes in #8084, we need synthetic size to also account for
leased LSNs so that users do not get free retention by running a small
ephemeral endpoint for a long time.

## Summary of changes

This PR integrates LSN leases into the synthetic size calculation. We
model each lease as a read-only branch starting at the leased LSN (except
that it does not have a timeline ID).

Other changes:
- Add new unit tests testing whether a lease behaves like a read-only
branch.
- Change the `/size_debug` response to include the lease point in the SVG
visualization.
- Fix `/lsn_lease` HTTP API to do proper parsing for POST.



Signed-off-by: Yuchen Liang <yuchen@neon.tech>
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Co-authored-by: Christian Schwarz <christian@neon.tech>
This commit is contained in:
Yuchen Liang
2024-07-04 11:09:05 -04:00
committed by Vlad Lazar
parent bd2046e1ab
commit 32828cddd6
9 changed files with 256 additions and 27 deletions

View File

@@ -599,6 +599,22 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
res_json = res.json()
return res_json
def timeline_lsn_lease(
    self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn
):
    """
    Request an LSN lease on a timeline via the `lsn_lease` HTTP endpoint.

    The LSN is sent as its string form in a JSON body. The response is handed
    to `self.verbose_error` before its JSON payload is decoded and returned.
    """
    payload = {"lsn": str(lsn)}
    log.info(f"Requesting lsn lease for {lsn=}, {tenant_id=}, {timeline_id=}")

    url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/lsn_lease"
    response = self.post(url, json=payload)
    self.verbose_error(response)
    return response.json()
def timeline_get_timestamp_of_lsn(
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn
):

View File

@@ -10,6 +10,7 @@ from fixtures.neon_fixtures import (
Endpoint,
NeonEnv,
NeonEnvBuilder,
flush_ep_to_pageserver,
wait_for_last_flush_lsn,
wait_for_wal_insert_lsn,
)
@@ -710,3 +711,90 @@ def mask_model_inputs(x):
return newlist
else:
return x
@pytest.mark.parametrize("zero_gc", [True, False])
def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path, zero_gc: bool):
    """
    Check that an LSN lease and a read-only branch yield approximately the
    same synthetic size.

    One tenant gets a never-written child branch, a second tenant (created
    with the same config) gets a lease, and the resulting sizes are compared.
    """
    if zero_gc:
        pitr_interval = "0s"
    else:
        pitr_interval = "3600s"
    tenant_conf = {
        "pitr_interval": pitr_interval,
        "gc_period": "0s",
    }

    env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)

    # Baseline: the initial tenant, using a read-only branch.
    size_with_branch = insert_with_action(
        env,
        env.initial_tenant,
        env.initial_timeline,
        test_output_dir,
        action="branch",
    )

    # Subject: a fresh tenant with the same config, using a lease instead.
    lease_tenant, lease_timeline = env.neon_cli.create_tenant(conf=tenant_conf)
    size_with_lease = insert_with_action(
        env, lease_tenant, lease_timeline, test_output_dir, action="lease"
    )

    assert_size_approx_equal(size_with_lease, size_with_branch)
def insert_with_action(
    env: NeonEnv,
    tenant: TenantId,
    timeline: TimelineId,
    test_output_dir: Path,
    action: str,
) -> int:
    """
    Inserts some data on the timeline, performs an action, and inserts more data on the same timeline.
    Returns the tenant size reported by the pageserver at the end of the insertion.

    As a side effect, the `tenant_size_debug` output is written to
    `size_debug_{action}.html` inside `test_output_dir`.

    Valid actions:
    - "lease": Acquires a lease at the LSN reached after the first insert.
    - "branch": Creates a child branch at that LSN but never writes to it.

    Raises AssertionError for any other action.
    """
    client = env.pageserver.http_client()
    with env.endpoints.create_start("main", tenant_id=tenant) as ep:
        initial_size = client.tenant_size(tenant)
        log.info(f"initial size: {initial_size}")

        with ep.cursor() as cur:
            cur.execute(
                "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
            )
        # The lease / branch point is the LSN reached after the first table is created.
        last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline)

        if action == "lease":
            res = client.timeline_lsn_lease(tenant, timeline, last_flush_lsn)
            log.info(f"result from lsn_lease api: {res}")
        elif action == "branch":
            ro_branch = env.neon_cli.create_branch(
                "ro_branch", tenant_id=tenant, ancestor_start_lsn=last_flush_lsn
            )
            log.info(f"{ro_branch=} created")
        else:
            raise AssertionError("Invalid action type, only `lease` and `branch` are accepted")

        with ep.cursor() as cur:
            cur.execute(
                "CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
            )
            cur.execute(
                "CREATE TABLE t2 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
            )
            cur.execute(
                "CREATE TABLE t3 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
            )

        # Only the waiting matters here; the returned LSN is not needed again.
        wait_for_last_flush_lsn(env, ep, tenant, timeline)

        # Avoid flakiness when calculating logical size.
        flush_ep_to_pageserver(env, ep, tenant, timeline)

        size_after_action_and_insert = client.tenant_size(tenant)
        log.info(f"{size_after_action_and_insert=}")

        # Use a context manager so the debug file is flushed and closed
        # deterministically instead of leaking the handle.
        size_debug = client.tenant_size_debug(tenant)
        with open(test_output_dir / f"size_debug_{action}.html", "w") as size_debug_file:
            size_debug_file.write(size_debug)

        return size_after_action_and_insert