Compare commits

...

4 Commits

Author SHA1 Message Date
Alex Chi Z
c939110d0a test(storcon): add test cases for 404 passthrough
Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-07-16 17:19:40 -04:00
Alex Chi Z
7d4eb50d48 404 if no tenant if found
Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-07-16 17:07:49 -04:00
Alex Chi Z
77271bca07 fix errors
Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-07-16 14:05:39 -04:00
Alex Chi Z
cad28d273e fix(storcon): only convert 404 when tenant not found
Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-07-16 14:00:56 -04:00
5 changed files with 141 additions and 12 deletions

View File

@@ -834,6 +834,10 @@ impl TenantManager {
mut spawn_mode: SpawnMode,
ctx: &RequestContext,
) -> Result<Option<Arc<TenantShard>>, UpsertLocationError> {
fail::fail_point!("upsert-location", |_| {
return Ok(None);
});
debug_assert_current_span_has_tenant_id();
info!("configuring tenant location to state {new_location_config:?}");

View File

@@ -785,23 +785,39 @@ async fn handle_tenant_timeline_passthrough(
error_counter.inc(labels);
}
// Transform 404 into 503 if we raced with a migration
if resp.status() == reqwest::StatusCode::NOT_FOUND && !consistent {
// Rather than retry here, send the client a 503 to prompt a retry: this matches
// the pageserver's use of 503, and all clients calling this API should retry on 503.
return Err(ApiError::ResourceUnavailable(
format!("Pageserver {node} returned 404 due to ongoing migration, retry later").into(),
));
}
let resp_staus = resp.status();
// We have a reqest::Response, would like a http::Response
let mut builder = hyper::Response::builder().status(map_reqwest_hyper_status(resp.status())?);
let mut builder = hyper::Response::builder().status(map_reqwest_hyper_status(resp_staus)?);
for (k, v) in resp.headers() {
builder = builder.header(k.as_str(), v.as_bytes());
}
let resp_bytes = resp
.bytes()
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
// Transform 404 into 503 if we raced with a migration
if resp_staus == reqwest::StatusCode::NOT_FOUND && !consistent {
let resp_str = std::str::from_utf8(&resp_bytes)
.map_err(|e| ApiError::InternalServerError(e.into()))?;
// We only handle "tenant not found" errors; other 404s like timeline not found should
// be forwarded as-is.
if resp_str.contains(&format!("tenant {tenant_or_shard_id}")) {
// Rather than retry here, send the client a 503 to prompt a retry: this matches
// the pageserver's use of 503, and all clients calling this API should retry on 503.
return Err(ApiError::ResourceUnavailable(
format!(
"Pageserver {node} returned tenant 404 due to ongoing migration, retry later"
)
.into(),
));
}
}
let response = builder
.body(Body::wrap_stream(resp.bytes_stream()))
.body(Body::from(resp_bytes))
.map_err(|e| ApiError::InternalServerError(e.into()))?;
Ok(response)

View File

@@ -4815,6 +4815,12 @@ impl Service {
}
}
if targets.is_empty() {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant {tenant_id} not found").into(),
));
}
Ok(TenantShardAttachState {
targets,
by_node_id,

View File

@@ -847,7 +847,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
return res_json
def timeline_lsn_lease(
self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn
self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn, **kwargs
):
data = {
"lsn": str(lsn),
@@ -857,6 +857,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
res = self.post(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/lsn_lease",
json=data,
**kwargs,
)
self.verbose_error(res)
res_json = res.json()

View File

@@ -12,7 +12,7 @@ from typing import TYPE_CHECKING
import fixtures.utils
import pytest
from fixtures.auth_tokens import TokenScope
from fixtures.common_types import TenantId, TenantShardId, TimelineId
from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
DEFAULT_AZ_ID,
@@ -47,6 +47,7 @@ from fixtures.utils import (
wait_until,
)
from fixtures.workload import Workload
from requests.adapters import HTTPAdapter
from urllib3 import Retry
from werkzeug.wrappers.response import Response
@@ -4814,3 +4815,104 @@ def test_storage_controller_migrate_with_pageserver_restart(
"shards": [{"node_id": int(secondary.id), "shard_number": 0}],
"preferred_az": DEFAULT_AZ_ID,
}
@run_only_on_default_postgres("PG version is not important for this test")
def test_storage_controller_forward_404(neon_env_builder: NeonEnvBuilder):
"""
Ensures that the storage controller correctly forwards 404s and converts some of them
into 503s before forwarding to the client.
"""
neon_env_builder.num_pageservers = 2
neon_env_builder.num_azs = 2
env = neon_env_builder.init_start()
env.storage_controller.allowed_errors.append(".*Reconcile error.*")
env.storage_controller.allowed_errors.append(".*Timed out.*")
env.storage_controller.tenant_policy_update(env.initial_tenant, {"placement": {"Attached": 1}})
env.storage_controller.reconcile_until_idle()
# 404s on tenants and timelines are forwarded as-is when reconciler is not running.
# Access a non-existing timeline -> 404
with pytest.raises(PageserverApiException) as e:
env.storage_controller.pageserver_api().timeline_detail(
env.initial_tenant, TimelineId.generate()
)
assert e.value.status_code == 404
with pytest.raises(PageserverApiException) as e:
env.storage_controller.pageserver_api().timeline_lsn_lease(
env.initial_tenant, TimelineId.generate(), Lsn(0)
)
assert e.value.status_code == 404
# Access a non-existing tenant when reconciler is not running -> 404
with pytest.raises(PageserverApiException) as e:
env.storage_controller.pageserver_api().timeline_detail(
TenantId.generate(), env.initial_timeline
)
assert e.value.status_code == 404
with pytest.raises(PageserverApiException) as e:
env.storage_controller.pageserver_api().timeline_lsn_lease(
TenantId.generate(), env.initial_timeline, Lsn(0)
)
assert e.value.status_code == 404
# Normal requests should succeed
detail = env.storage_controller.pageserver_api().timeline_detail(
env.initial_tenant, env.initial_timeline
)
last_record_lsn = Lsn(detail["last_record_lsn"])
env.storage_controller.pageserver_api().timeline_lsn_lease(
env.initial_tenant, env.initial_timeline, last_record_lsn
)
# Get into a situation where the intent state is not the same as the observed state.
describe = env.storage_controller.tenant_describe(env.initial_tenant)["shards"][0]
current_primary = describe["node_attached"]
current_secondary = describe["node_secondary"][0]
assert current_primary != current_secondary
# Do a shard migration to force switch the primary; but do not wait for it to complete.
# Disable attach operations on the pageservers. Configure `upsert-location` to error
# so that pageserver won't attach.
for ps in env.pageservers:
ps.http_client().configure_failpoints(("upsert-location", "return"))
# Do the migration in another thread; the request will be dropped as we don't wait.
shard_zero = TenantShardId(env.initial_tenant, 0, 0)
concurrent.futures.ThreadPoolExecutor(max_workers=1).submit(
env.storage_controller.tenant_shard_migrate,
shard_zero,
current_secondary,
StorageControllerMigrationConfig(override_scheduler=True),
)
# Not the best way to do this, we should wait until the migration gets started.
time.sleep(1)
placement = env.storage_controller.get_tenants_placement()[str(shard_zero)]
assert placement["observed"] != placement["intent"]
assert placement["observed"]["attached"] == current_primary
assert placement["intent"]["attached"] == current_secondary
# Now we issue requests that would cause 404 again
retry_strategy = Retry(total=0)
adapter = HTTPAdapter(max_retries=retry_strategy)
no_retry_api = env.storage_controller.pageserver_api()
no_retry_api.mount("http://", adapter)
no_retry_api.mount("https://", adapter)
# As intent state != observed state, tenant not found error should return 503
with pytest.raises(PageserverApiException) as e:
no_retry_api.timeline_detail(env.initial_tenant, TimelineId.generate())
assert e.value.status_code == 503, f"unexpected status code and error: {e.value}"
with pytest.raises(PageserverApiException) as e:
no_retry_api.timeline_lsn_lease(env.initial_tenant, TimelineId.generate(), Lsn(0))
assert e.value.status_code == 503, f"unexpected status code and error: {e.value}"
# Unblock attach operations
for ps in env.pageservers:
ps.http_client().configure_failpoints(("upsert-location", "off"))