feat(pageserver): add an optional lease to the get_lsn_by_timestamp API (#8104)

Part of #7497, closes #8072.

## Problem

Currently the `get_lsn_by_timestamp` and branch creation pageserver APIs do not provide a pleasant client experience where the looked-up LSN might be GC-ed between the two API calls.

This PR attempts to prevent common races between GC and branch creation by making use of LSN leases provided in #8084. A lease can be optionally granted to a looked-up LSN. With the lease, GC will not touch layers needed to reconstruct all pages at this LSN for the duration of the lease.

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
This commit is contained in:
Yuchen Liang
2024-06-24 16:12:24 -04:00
committed by GitHub
parent 1ea5d8b132
commit 219e78f885
4 changed files with 76 additions and 11 deletions

View File

@@ -236,6 +236,13 @@ paths:
type: string
format: date-time
description: A timestamp to get the LSN
- name: with_lease
in: query
required: false
schema:
type: boolean
description: Whether to grant a lease to the corresponding LSN. Default to false.
responses:
"200":
description: OK
@@ -1029,6 +1036,10 @@ components:
kind:
type: string
enum: [past, present, future, nodata]
valid_until:
type: string
format: date-time
description: The expiration time of the granted lease.
LsnLease:
type: object

View File

@@ -21,6 +21,7 @@ use pageserver_api::models::IngestAuxFilesRequest;
use pageserver_api::models::ListAuxFilesRequest;
use pageserver_api::models::LocationConfig;
use pageserver_api::models::LocationConfigListResponse;
use pageserver_api::models::LsnLease;
use pageserver_api::models::ShardParameters;
use pageserver_api::models::TenantDetails;
use pageserver_api::models::TenantLocationConfigResponse;
@@ -728,6 +729,8 @@ async fn get_lsn_by_timestamp_handler(
.map_err(ApiError::BadRequest)?;
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
let with_lease = parse_query_param(&request, "with_lease")?.unwrap_or(false);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline =
@@ -736,10 +739,15 @@ async fn get_lsn_by_timestamp_handler(
let result = timeline
.find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
.await?;
#[derive(serde::Serialize, Debug)]
struct Result {
lsn: Lsn,
kind: &'static str,
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(flatten)]
lease: Option<LsnLease>,
}
let (lsn, kind) = match result {
LsnForTimestamp::Present(lsn) => (lsn, "present"),
@@ -747,11 +755,28 @@ async fn get_lsn_by_timestamp_handler(
LsnForTimestamp::Past(lsn) => (lsn, "past"),
LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
};
let result = Result { lsn, kind };
let lease = if with_lease {
timeline
.make_lsn_lease(lsn, timeline.get_lsn_lease_length_for_ts(), &ctx)
.inspect_err(|_| {
warn!("fail to grant a lease to {}", lsn);
})
.ok()
} else {
None
};
let result = Result { lsn, kind, lease };
let valid_until = result
.lease
.as_ref()
.map(|l| humantime::format_rfc3339_millis(l.valid_until).to_string());
tracing::info!(
lsn=?result.lsn,
kind=%result.kind,
timestamp=%timestamp_raw,
valid_until=?valid_until,
"lsn_by_timestamp finished"
);
json_response(StatusCode::OK, result)

View File

@@ -619,13 +619,15 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
timestamp: datetime,
with_lease: bool = False,
**kwargs,
):
log.info(
f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}"
f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}, {with_lease=}"
)
with_lease_query = f"{with_lease=}".lower()
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp.isoformat()}Z",
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp.isoformat()}Z&{with_lease_query}",
**kwargs,
)
self.verbose_error(res)

View File

@@ -12,10 +12,24 @@ from fixtures.utils import query_scalar, wait_until
from requests.exceptions import ReadTimeout
#
# Test pageserver get_lsn_by_timestamp API
#
def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
def assert_lsn_lease_granted(result, with_lease: bool):
"""
Asserts an LSN lease is granted when `with_lease` flag is turned on.
Always asserts no LSN lease is granted when `with_lease` flag is off.
"""
if with_lease:
assert result.get("valid_until")
else:
assert result.get("valid_until") is None
@pytest.mark.parametrize("with_lease", [True, False])
def test_lsn_mapping(neon_env_builder: NeonEnvBuilder, with_lease: bool):
"""
Test pageserver get_lsn_by_timestamp API.
:param with_lease: Whether to get a lease associated with returned LSN.
"""
env = neon_env_builder.init_start()
tenant_id, _ = env.neon_cli.create_tenant(
@@ -67,23 +81,33 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
# Check edge cases
# Timestamp is in the future
probe_timestamp = tbl[-1][1] + timedelta(hours=1)
result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp)
result = client.timeline_get_lsn_by_timestamp(
tenant_id, timeline_id, probe_timestamp, with_lease=with_lease
)
assert result["kind"] == "future"
assert_lsn_lease_granted(result, with_lease)
# make sure that we return a well advanced lsn here
assert Lsn(result["lsn"]) > start_lsn
# Timestamp is in the unreachable past
probe_timestamp = tbl[0][1] - timedelta(hours=10)
result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp)
result = client.timeline_get_lsn_by_timestamp(
tenant_id, timeline_id, probe_timestamp, with_lease=with_lease
)
assert result["kind"] == "past"
assert_lsn_lease_granted(result, with_lease)
# make sure that we return the minimum lsn here at the start of the range
assert Lsn(result["lsn"]) < start_lsn
# Probe a bunch of timestamps in the valid range
for i in range(1, len(tbl), 100):
probe_timestamp = tbl[i][1]
result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp)
result = client.timeline_get_lsn_by_timestamp(
tenant_id, timeline_id, probe_timestamp, with_lease=with_lease
)
assert result["kind"] not in ["past", "nodata"]
assert_lsn_lease_granted(result, with_lease)
lsn = result["lsn"]
# Call get_lsn_by_timestamp to get the LSN
# Launch a new read-only node at that LSN, and check that only the rows
@@ -105,8 +129,11 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
# Timestamp is in the unreachable past
probe_timestamp = tbl[0][1] - timedelta(hours=10)
result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id_child, probe_timestamp)
result = client.timeline_get_lsn_by_timestamp(
tenant_id, timeline_id_child, probe_timestamp, with_lease=with_lease
)
assert result["kind"] == "past"
assert_lsn_lease_granted(result, with_lease)
# make sure that we return the minimum lsn here at the start of the range
assert Lsn(result["lsn"]) >= last_flush_lsn