fix(pageserver): handle lsn lease requests for unnormalized lsns (#9137)

Fixes https://github.com/neondatabase/neon/issues/9098.

## Problem

See
https://github.com/neondatabase/neon/issues/9098#issuecomment-2372484969.

### Related

A similar problem happened with branch creation, which was discussed
[here](https://github.com/neondatabase/neon/pull/2143#issuecomment-1199969052)
and fixed by https://github.com/neondatabase/neon/pull/2529.

## Summary of changes

- Normalize the lsn on pageserver side upon lsn lease request, stores
the normalized LSN.

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
This commit is contained in:
Yuchen Liang
2024-09-25 10:57:38 -04:00
committed by GitHub
parent c5972389aa
commit d447f49bc3
2 changed files with 16 additions and 5 deletions

View File

@@ -112,7 +112,7 @@ use pageserver_api::reltag::RelTag;
use pageserver_api::shard::ShardIndex;
use postgres_connection::PgConnectionConfig;
use postgres_ffi::to_pg_timestamp;
use postgres_ffi::{to_pg_timestamp, v14::xlog_utils, WAL_SEGMENT_SIZE};
use utils::{
completion,
generation::Generation,
@@ -1337,6 +1337,10 @@ impl Timeline {
_ctx: &RequestContext,
) -> anyhow::Result<LsnLease> {
let lease = {
// Normalize the requested LSN to be aligned, and move to the first record
// if it points to the beginning of the page (header).
let lsn = xlog_utils::normalize_lsn(lsn, WAL_SEGMENT_SIZE);
let mut gc_info = self.gc_info.write().unwrap();
let valid_until = SystemTime::now() + length;

View File

@@ -122,6 +122,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
Test static endpoint is protected from GC by acquiring and renewing lsn leases.
"""
LSN_LEASE_LENGTH = 8
neon_env_builder.num_pageservers = 2
# GC is manual triggered.
env = neon_env_builder.init_start(
@@ -139,7 +140,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
"image_creation_threshold": "1",
"image_layer_creation_check_threshold": "0",
# Short lease length to fit test.
"lsn_lease_length": "3s",
"lsn_lease_length": f"{LSN_LEASE_LENGTH}s",
},
initial_tenant_shard_count=2,
)
@@ -170,10 +171,14 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
with env.endpoints.create_start("main") as ep_main:
with ep_main.cursor() as cur:
cur.execute("CREATE TABLE t0(v0 int primary key, v1 text)")
lsn = None
lsn = Lsn(0)
for i in range(2):
lsn = generate_updates_on_main(env, ep_main, i)
# Round down to the closest LSN on page boundary (unnormalized).
XLOG_BLCKSZ = 8192
lsn = Lsn((int(lsn) // XLOG_BLCKSZ) * XLOG_BLCKSZ)
with env.endpoints.create_start(
branch_name="main",
endpoint_id="static",
@@ -183,7 +188,8 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
cur.execute("SELECT count(*) FROM t0")
assert cur.fetchone() == (ROW_COUNT,)
time.sleep(3)
# Wait for static compute to renew lease at least once.
time.sleep(LSN_LEASE_LENGTH / 2)
generate_updates_on_main(env, ep_main, i, end=100)
@@ -204,8 +210,9 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
# Do some update so we can increment latest_gc_cutoff
generate_updates_on_main(env, ep_main, i, end=100)
# Wait for the existing lease to expire.
time.sleep(LSN_LEASE_LENGTH)
# Now trigger GC again, layers should be removed.
time.sleep(4)
for shard, ps in tenant_get_shards(env, env.initial_tenant):
client = ps.http_client()
gc_result = client.timeline_gc(shard, env.initial_timeline, 0)