From 8dfe3a070cd04dd2310ed07e1f38f4257dd43cd8 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 26 Mar 2024 15:20:05 +0000 Subject: [PATCH] pageserver: return 429 on timeline creation in progress (#7225) ## Problem Currently, we return 409 (Conflict) in two cases: - Temporary: Timeline creation cannot proceed because another timeline with the same ID is being created - Permanent: Timeline creation cannot proceed because another timeline exists with different parameters but the same ID. Callers which time out a request and retry should be able to distinguish these cases. Closes: #7208 ## Summary of changes - Expose `AlreadyCreating` errors as 429 instead of 409 --- pageserver/src/http/openapi_spec.yml | 10 ++++++++-- pageserver/src/http/routes.rs | 11 +++++++---- test_runner/regress/test_tenants.py | 3 +++ 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 0771229845..bb477f89c5 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -1038,7 +1038,7 @@ paths: format: hex responses: "201": - description: TimelineInfo + description: Timeline was created, or already existed with matching parameters content: application/json: schema: @@ -1068,11 +1068,17 @@ paths: schema: $ref: "#/components/schemas/Error" "409": - description: Timeline already exists, creation skipped + description: Timeline already exists, with different parameters. Creation cannot proceed. content: application/json: schema: $ref: "#/components/schemas/ConflictError" + "429": + description: A creation request was sent for the same Timeline Id while a creation was already in progress. Back off and retry. + content: + application/json: + schema: + $ref: "#/components/schemas/Error" "500": description: Generic operation error content: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 26f23fb8c2..3cc92216ed 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -535,10 +535,13 @@ async fn timeline_create_handler( HttpErrorBody::from_msg("Tenant shutting down".to_string()), ) } - Err( - e @ tenant::CreateTimelineError::Conflict - | e @ tenant::CreateTimelineError::AlreadyCreating, - ) => json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string())), + Err(e @ tenant::CreateTimelineError::Conflict) => { + json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string())) + } + Err(e @ tenant::CreateTimelineError::AlreadyCreating) => json_response( + StatusCode::TOO_MANY_REQUESTS, + HttpErrorBody::from_msg(e.to_string()), + ), Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response( StatusCode::NOT_ACCEPTABLE, HttpErrorBody::from_msg(format!("{err:#}")), diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index f8701b65d7..2832304dcc 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -389,6 +389,9 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): if e.status_code == 409: log.info(f"delay_ms={delay_ms} 409") pass + elif e.status_code == 429: + log.info(f"delay_ms={delay_ms} 429") + pass elif e.status_code == 400: if "is less than existing" in e.message: # We send creation requests very close together in time: it is expected that these