mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-14 08:52:56 +00:00
timeline creation: reflect failures due to ancestor LSN issues in status code (#4600)
Before, it was a `500` and control plane would retry, wereas it actually should have stopped retrying. (Stacked on top of https://github.com/neondatabase/neon/pull/4597 ) fixes https://github.com/neondatabase/neon/issues/4595 part of https://github.com/neondatabase/cloud/issues/5626 --------- Co-authored-by: Shany Pozin <shany@neon.tech>
This commit is contained in:
committed by
GitHub
parent
26828560a8
commit
24eaa3b7ca
@@ -722,6 +722,12 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"406":
|
||||
description: Permanently unsatisfiable request, don't retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"409":
|
||||
description: Timeline already exists, creation skipped
|
||||
content:
|
||||
|
||||
@@ -338,6 +338,11 @@ async fn timeline_create_handler(
|
||||
Err(tenant::CreateTimelineError::AlreadyExists) => {
|
||||
json_response(StatusCode::CONFLICT, ())
|
||||
}
|
||||
Err(tenant::CreateTimelineError::AncestorLsn(err)) => {
|
||||
json_response(StatusCode::NOT_ACCEPTABLE, HttpErrorBody::from_msg(
|
||||
format!("{err:#}")
|
||||
))
|
||||
}
|
||||
Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -506,6 +506,8 @@ pub enum CreateTimelineError {
|
||||
#[error("a timeline with the given ID already exists")]
|
||||
AlreadyExists,
|
||||
#[error(transparent)]
|
||||
AncestorLsn(anyhow::Error),
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
@@ -1432,7 +1434,7 @@ impl Tenant {
|
||||
let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
|
||||
if ancestor_ancestor_lsn > *lsn {
|
||||
// can we safely just branch from the ancestor instead?
|
||||
return Err(CreateTimelineError::Other(anyhow::anyhow!(
|
||||
return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
|
||||
"invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}",
|
||||
lsn,
|
||||
ancestor_timeline_id,
|
||||
@@ -2715,7 +2717,7 @@ impl Tenant {
|
||||
dst_id: TimelineId,
|
||||
start_lsn: Option<Lsn>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
let tl = self
|
||||
.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx)
|
||||
.await?;
|
||||
@@ -2732,7 +2734,7 @@ impl Tenant {
|
||||
dst_id: TimelineId,
|
||||
start_lsn: Option<Lsn>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
self.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx)
|
||||
.await
|
||||
}
|
||||
@@ -2743,7 +2745,7 @@ impl Tenant {
|
||||
dst_id: TimelineId,
|
||||
start_lsn: Option<Lsn>,
|
||||
_ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
let src_id = src_timeline.timeline_id;
|
||||
|
||||
// If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN
|
||||
@@ -2783,16 +2785,17 @@ impl Tenant {
|
||||
.context(format!(
|
||||
"invalid branch start lsn: less than latest GC cutoff {}",
|
||||
*latest_gc_cutoff_lsn,
|
||||
))?;
|
||||
))
|
||||
.map_err(CreateTimelineError::AncestorLsn)?;
|
||||
|
||||
// and then the planned GC cutoff
|
||||
{
|
||||
let gc_info = src_timeline.gc_info.read().unwrap();
|
||||
let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff);
|
||||
if start_lsn < cutoff {
|
||||
bail!(format!(
|
||||
return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
|
||||
"invalid branch start lsn: less than planned GC cutoff {cutoff}"
|
||||
));
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3825,6 +3828,9 @@ mod tests {
|
||||
{
|
||||
Ok(_) => panic!("branching should have failed"),
|
||||
Err(err) => {
|
||||
let CreateTimelineError::AncestorLsn(err) = err else {
|
||||
panic!("wrong error type")
|
||||
};
|
||||
assert!(err.to_string().contains("invalid branch start lsn"));
|
||||
assert!(err
|
||||
.source()
|
||||
@@ -3854,6 +3860,9 @@ mod tests {
|
||||
{
|
||||
Ok(_) => panic!("branching should have failed"),
|
||||
Err(err) => {
|
||||
let CreateTimelineError::AncestorLsn(err) = err else {
|
||||
panic!("wrong error type");
|
||||
};
|
||||
assert!(&err.to_string().contains("invalid branch start lsn"));
|
||||
assert!(&err
|
||||
.source()
|
||||
|
||||
@@ -21,6 +21,18 @@ class PageserverApiException(Exception):
|
||||
self.status_code = status_code
|
||||
|
||||
|
||||
class TimelineCreate406(PageserverApiException):
|
||||
def __init__(self, res: requests.Response):
|
||||
assert res.status_code == 406
|
||||
super().__init__(res.json()["msg"], res.status_code)
|
||||
|
||||
|
||||
class TimelineCreate409(PageserverApiException):
|
||||
def __init__(self, res: requests.Response):
|
||||
assert res.status_code == 409
|
||||
super().__init__("", res.status_code)
|
||||
|
||||
|
||||
@dataclass
|
||||
class InMemoryLayerInfo:
|
||||
kind: str
|
||||
@@ -309,9 +321,12 @@ class PageserverHttpClient(requests.Session):
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", json=body, **kwargs
|
||||
)
|
||||
self.verbose_error(res)
|
||||
if res.status_code == 409:
|
||||
raise Exception(f"could not create timeline: already exists for id {new_timeline_id}")
|
||||
raise TimelineCreate409(res)
|
||||
if res.status_code == 406:
|
||||
raise TimelineCreate406(res)
|
||||
|
||||
self.verbose_error(res)
|
||||
|
||||
res_json = res.json()
|
||||
assert isinstance(res_json, dict)
|
||||
|
||||
@@ -4,7 +4,8 @@ import time
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnv
|
||||
from fixtures.types import Lsn
|
||||
from fixtures.pageserver.http import TimelineCreate406
|
||||
from fixtures.types import Lsn, TimelineId
|
||||
from fixtures.utils import query_scalar
|
||||
|
||||
|
||||
@@ -173,5 +174,12 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
|
||||
# The starting LSN is invalid as the corresponding record is scheduled to be removed by in-queue GC.
|
||||
with pytest.raises(Exception, match="invalid branch start lsn: .*"):
|
||||
env.neon_cli.create_branch("b1", "b0", tenant_id=tenant, ancestor_start_lsn=lsn)
|
||||
# retry the same with the HTTP API, so that we can inspect the status code
|
||||
with pytest.raises(TimelineCreate406):
|
||||
new_timeline_id = TimelineId.generate()
|
||||
log.info(
|
||||
f"Expecting failure for branch behind gc'ing LSN, new_timeline_id={new_timeline_id}"
|
||||
)
|
||||
pageserver_http_client.timeline_create(env.pg_version, tenant, new_timeline_id, b0, lsn)
|
||||
|
||||
thread.join()
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
from fixtures.pageserver.http import TimelineCreate406
|
||||
from fixtures.types import Lsn, TimelineId
|
||||
from fixtures.utils import print_gc_result, query_scalar
|
||||
|
||||
@@ -17,7 +18,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
|
||||
env.pageserver.allowed_errors.append(".*invalid start lsn .* for ancestor timeline.*")
|
||||
|
||||
# Branch at the point where only 100 rows were inserted
|
||||
env.neon_cli.create_branch("test_branch_behind")
|
||||
branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind")
|
||||
endpoint_main = env.endpoints.create_start("test_branch_behind")
|
||||
log.info("postgres is running on 'test_branch_behind' branch")
|
||||
|
||||
@@ -89,6 +90,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
|
||||
assert query_scalar(main_cur, "SELECT count(*) FROM foo") == 400100
|
||||
|
||||
# Check bad lsn's for branching
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
# branch at segment boundary
|
||||
env.neon_cli.create_branch(
|
||||
@@ -97,27 +99,52 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
|
||||
endpoint = env.endpoints.create_start("test_branch_segment_boundary")
|
||||
assert endpoint.safe_psql("SELECT 1")[0][0] == 1
|
||||
|
||||
# branch at pre-initdb lsn
|
||||
# branch at pre-initdb lsn (from main branch)
|
||||
with pytest.raises(Exception, match="invalid branch start lsn: .*"):
|
||||
env.neon_cli.create_branch("test_branch_preinitdb", ancestor_start_lsn=Lsn("0/42"))
|
||||
# retry the same with the HTTP API, so that we can inspect the status code
|
||||
with pytest.raises(TimelineCreate406):
|
||||
new_timeline_id = TimelineId.generate()
|
||||
log.info(f"Expecting failure for branch pre-initdb LSN, new_timeline_id={new_timeline_id}")
|
||||
pageserver_http.timeline_create(
|
||||
env.pg_version, env.initial_tenant, new_timeline_id, env.initial_timeline, Lsn("0/42")
|
||||
)
|
||||
|
||||
# branch at pre-ancestor lsn
|
||||
with pytest.raises(Exception, match="less than timeline ancestor lsn"):
|
||||
env.neon_cli.create_branch(
|
||||
"test_branch_preinitdb", "test_branch_behind", ancestor_start_lsn=Lsn("0/42")
|
||||
)
|
||||
# retry the same with the HTTP API, so that we can inspect the status code
|
||||
with pytest.raises(TimelineCreate406):
|
||||
new_timeline_id = TimelineId.generate()
|
||||
log.info(
|
||||
f"Expecting failure for branch pre-ancestor LSN, new_timeline_id={new_timeline_id}"
|
||||
)
|
||||
pageserver_http.timeline_create(
|
||||
env.pg_version,
|
||||
env.initial_tenant,
|
||||
new_timeline_id,
|
||||
branch_behind_timeline_id,
|
||||
Lsn("0/42"),
|
||||
)
|
||||
|
||||
# check that we cannot create branch based on garbage collected data
|
||||
with env.pageserver.http_client() as pageserver_http:
|
||||
pageserver_http.timeline_checkpoint(env.initial_tenant, timeline)
|
||||
gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0)
|
||||
print_gc_result(gc_result)
|
||||
|
||||
pageserver_http.timeline_checkpoint(env.initial_tenant, timeline)
|
||||
gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0)
|
||||
print_gc_result(gc_result)
|
||||
with pytest.raises(Exception, match="invalid branch start lsn: .*"):
|
||||
# this gced_lsn is pretty random, so if gc is disabled this woudln't fail
|
||||
env.neon_cli.create_branch(
|
||||
"test_branch_create_fail", "test_branch_behind", ancestor_start_lsn=gced_lsn
|
||||
)
|
||||
# retry the same with the HTTP API, so that we can inspect the status code
|
||||
with pytest.raises(TimelineCreate406):
|
||||
new_timeline_id = TimelineId.generate()
|
||||
log.info(f"Expecting failure for branch behind gc'd LSN, new_timeline_id={new_timeline_id}")
|
||||
pageserver_http.timeline_create(
|
||||
env.pg_version, env.initial_tenant, new_timeline_id, branch_behind_timeline_id, gced_lsn
|
||||
)
|
||||
|
||||
# check that after gc everything is still there
|
||||
assert query_scalar(hundred_cur, "SELECT count(*) FROM foo") == 100
|
||||
|
||||
Reference in New Issue
Block a user