Merge branch 'main' into amasterov/random-ops-add-snapshots
# Conflicts:
#   test_runner/random_ops/test_random_ops.py
@@ -66,6 +66,12 @@ class EndpointHttpClient(requests.Session):
res.raise_for_status()
return res.json()

def autoscaling_metrics(self):
res = self.get(f"http://localhost:{self.external_port}/autoscaling_metrics")
res.raise_for_status()
log.debug("raw compute metrics: %s", res.text)
return res.text

def prewarm_lfc_status(self) -> dict[str, str]:
res = self.get(self.prewarm_url)
res.raise_for_status()
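A minimal usage sketch for the new client methods (assumes an `endpoint` fixture from neon_fixtures; the variable names are illustrative, not part of this diff):

    client = endpoint.http_client()       # EndpointHttpClient bound to this compute
    raw = client.autoscaling_metrics()    # raw Prometheus text exposed for the autoscaling agent
    status = client.prewarm_lfc_status()  # dict describing the LFC prewarm state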

@@ -24,6 +24,7 @@ def connection_parameters_to_env(params: dict[str, str]) -> dict[str, str]:

# Some API calls not yet implemented.
# You may want to copy not-yet-implemented methods from the PR https://github.com/neondatabase/neon/pull/11305
@final
class NeonAPI:
def __init__(self, neon_api_key: str, neon_api_base_url: str):
self.__neon_api_key = neon_api_key
@@ -171,7 +172,7 @@ class NeonAPI:
protected: bool | None = None,
archived: bool | None = None,
init_source: str | None = None,
add_endpoint=True,
add_endpoint: bool = True,
) -> dict[str, Any]:
data: dict[str, Any] = {}
if add_endpoint:

@@ -400,6 +400,7 @@ class NeonLocalCli(AbstractNeonCli):
timeout_in_seconds: int | None = None,
instance_id: int | None = None,
base_port: int | None = None,
handle_ps_local_disk_loss: bool | None = None,
):
cmd = ["storage_controller", "start"]
if timeout_in_seconds is not None:
@@ -408,6 +409,10 @@ class NeonLocalCli(AbstractNeonCli):
cmd.append(f"--instance-id={instance_id}")
if base_port is not None:
cmd.append(f"--base-port={base_port}")
if handle_ps_local_disk_loss is not None:
cmd.append(
f"--handle-ps-local-disk-loss={'true' if handle_ps_local_disk_loss else 'false'}"
)
return self.raw_cli(cmd)

def storage_controller_stop(self, immediate: bool, instance_id: int | None = None):
@@ -503,6 +508,7 @@ class NeonLocalCli(AbstractNeonCli):
pageserver_id: int | None = None,
allow_multiple=False,
update_catalog: bool = False,
privileged_role_name: str | None = None,
) -> subprocess.CompletedProcess[str]:
args = [
"endpoint",
@@ -534,6 +540,8 @@ class NeonLocalCli(AbstractNeonCli):
args.extend(["--allow-multiple"])
if update_catalog:
args.extend(["--update-catalog"])
if privileged_role_name is not None:
args.extend(["--privileged-role-name", privileged_role_name])

res = self.raw_cli(args)
res.check_returncode()

@@ -728,7 +728,7 @@ class NeonEnvBuilder:
# NB: neon_local rewrites postgresql.conf on each start based on neon_local config. No need to patch it.
# However, in this new NeonEnv, the pageservers and safekeepers listen on different ports, and the storage
# controller will currently reject re-attach requests from them because the NodeMetadata isn't identical.
# So, from_repo_dir patches up the the storcon database.
# So, from_repo_dir patches up the storcon database.
patch_script_path = self.repo_dir / "storage_controller_db.startup.sql"
assert not patch_script_path.exists()
patch_script = ""
@@ -1938,9 +1938,12 @@ class NeonStorageController(MetricsGetter, LogUtils):
timeout_in_seconds: int | None = None,
instance_id: int | None = None,
base_port: int | None = None,
handle_ps_local_disk_loss: bool | None = None,
) -> Self:
assert not self.running
self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
self.env.neon_cli.storage_controller_start(
timeout_in_seconds, instance_id, base_port, handle_ps_local_disk_loss
)
self.running = True
return self

@@ -2119,11 +2122,14 @@ class NeonStorageController(MetricsGetter, LogUtils):
headers=self.headers(TokenScope.ADMIN),
)

def node_delete(self, node_id):
def node_delete(self, node_id, force: bool = False):
log.info(f"node_delete({node_id})")
query = f"{self.api}/control/v1/node/{node_id}/delete"
if force:
query += "?force=true"
self.request(
"PUT",
f"{self.api}/control/v1/node/{node_id}/delete",
query,
headers=self.headers(TokenScope.ADMIN),
)
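For context, a sketch of how a test might drive the two deletion paths after this change (hypothetical call sites, mirroring the test changes further below):

    env.storage_controller.node_delete(node_id, force=True)  # PUT .../control/v1/node/{id}/delete?force=true
    env.storage_controller.node_delete(node_id)              # same endpoint, graceful (no force flag)
    env.storage_controller.node_delete_old(node_id)          # legacy deletion API still used by older tests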

@@ -2835,10 +2841,13 @@ class NeonProxiedStorageController(NeonStorageController):
timeout_in_seconds: int | None = None,
instance_id: int | None = None,
base_port: int | None = None,
handle_ps_local_disk_loss: bool | None = None,
) -> Self:
assert instance_id is not None and base_port is not None

self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
self.env.neon_cli.storage_controller_start(
timeout_in_seconds, instance_id, base_port, handle_ps_local_disk_loss
)
self.instances[instance_id] = {"running": True}

self.running = True
@@ -4118,6 +4127,294 @@ class NeonAuthBroker:
self._popen.kill()


class NeonLocalProxy(LogUtils):
"""
An object managing a local_proxy instance for rest broker testing.
The local_proxy serves as a direct connection to VanillaPostgres.
"""

def __init__(
self,
neon_binpath: Path,
test_output_dir: Path,
http_port: int,
metrics_port: int,
vanilla_pg: VanillaPostgres,
config_path: Path | None = None,
):
self.neon_binpath = neon_binpath
self.test_output_dir = test_output_dir
self.http_port = http_port
self.metrics_port = metrics_port
self.vanilla_pg = vanilla_pg
self.config_path = config_path or (test_output_dir / "local_proxy.json")
self.host = "127.0.0.1"
self.running = False
self.logfile = test_output_dir / "local_proxy.log"
self._popen: subprocess.Popen[bytes] | None = None
super().__init__(logfile=self.logfile)

def start(self) -> Self:
assert self._popen is None
assert not self.running

# Ensure vanilla_pg is running
if not self.vanilla_pg.is_running():
self.vanilla_pg.start()

args = [
str(self.neon_binpath / "local_proxy"),
"--http",
f"{self.host}:{self.http_port}",
"--metrics",
f"{self.host}:{self.metrics_port}",
"--postgres",
f"127.0.0.1:{self.vanilla_pg.default_options['port']}",
"--config-path",
str(self.config_path),
"--disable-pg-session-jwt",
]

logfile = open(self.logfile, "w")
self._popen = subprocess.Popen(args, stdout=logfile, stderr=logfile)
self.running = True
self._wait_until_ready()
return self

def stop(self) -> Self:
if self._popen is not None and self.running:
self._popen.terminate()
try:
self._popen.wait(timeout=5)
except subprocess.TimeoutExpired:
log.warning("failed to gracefully terminate local_proxy; killing")
self._popen.kill()
self.running = False
return self

def get_binary_version(self) -> str:
"""Get the version string of the local_proxy binary"""
try:
result = subprocess.run(
[str(self.neon_binpath / "local_proxy"), "--version"],
capture_output=True,
text=True,
timeout=10,
)
return result.stdout.strip()
except (subprocess.TimeoutExpired, subprocess.CalledProcessError):
return ""

@backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10)
def _wait_until_ready(self):
assert self._popen and self._popen.poll() is None, (
"Local proxy exited unexpectedly. Check test log."
)
requests.get(f"http://{self.host}:{self.http_port}/metrics")

def get_metrics(self) -> str:
response = requests.get(f"http://{self.host}:{self.metrics_port}/metrics")
return response.text

def assert_no_errors(self):
# Define allowed error patterns for local_proxy
allowed_errors = [
# Add patterns as needed
]
not_allowed = [
"error",
"panic",
"failed",
]

for na in not_allowed:
if na not in allowed_errors:
assert not self.log_contains(na), f"Found disallowed error pattern: {na}"

def __enter__(self) -> Self:
return self

def __exit__(
self,
exc_type: type[BaseException] | None,
exc_value: BaseException | None,
traceback: TracebackType | None,
):
self.stop()
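A minimal sketch of how the class is meant to be driven (the `local_proxy` fixture further below does essentially this; the port numbers here are illustrative):

    with NeonLocalProxy(
        neon_binpath=neon_binpath,
        test_output_dir=test_output_dir,
        http_port=10800,
        metrics_port=10801,
        vanilla_pg=vanilla_pg,
    ) as proxy:
        proxy.start()               # spawns local_proxy and waits for /metrics to answer
        print(proxy.get_metrics())  # Prometheus text from the metrics port
    # __exit__ calls stop(), which terminates (or kills) the child process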


class NeonRestBrokerProxy(LogUtils):
"""
An object managing a proxy instance configured as both auth broker and rest broker.
This is the main proxy binary with --is-auth-broker and --is-rest-broker flags.
"""

def __init__(
self,
neon_binpath: Path,
test_output_dir: Path,
wss_port: int,
http_port: int,
mgmt_port: int,
config_path: Path | None = None,
):
self.neon_binpath = neon_binpath
self.test_output_dir = test_output_dir
self.wss_port = wss_port
self.http_port = http_port
self.mgmt_port = mgmt_port
self.config_path = config_path or (test_output_dir / "rest_broker_proxy.json")
self.host = "127.0.0.1"
self.running = False
self.logfile = test_output_dir / "rest_broker_proxy.log"
self._popen: subprocess.Popen[Any] | None = None

def start(self) -> Self:
if self.running:
return self

# Generate self-signed TLS certificates
cert_path = self.test_output_dir / "server.crt"
key_path = self.test_output_dir / "server.key"

if not cert_path.exists() or not key_path.exists():
import subprocess

log.info("Generating self-signed TLS certificate for rest broker")
subprocess.run(
[
"openssl",
"req",
"-new",
"-x509",
"-days",
"365",
"-nodes",
"-text",
"-out",
str(cert_path),
"-keyout",
str(key_path),
"-subj",
"/CN=*.local.neon.build",
],
check=True,
)

log.info(
f"Starting rest broker proxy on WSS port {self.wss_port}, HTTP port {self.http_port}"
)

cmd = [
str(self.neon_binpath / "proxy"),
"-c",
str(cert_path),
"-k",
str(key_path),
"--is-auth-broker",
"true",
"--is-rest-broker",
"true",
"--wss",
f"{self.host}:{self.wss_port}",
"--http",
f"{self.host}:{self.http_port}",
"--mgmt",
f"{self.host}:{self.mgmt_port}",
"--auth-backend",
"local",
"--config-path",
str(self.config_path),
]

log.info(f"Starting rest broker proxy with command: {' '.join(cmd)}")

with open(self.logfile, "w") as logfile:
self._popen = subprocess.Popen(
cmd,
stdout=logfile,
stderr=subprocess.STDOUT,
cwd=self.test_output_dir,
env={
**os.environ,
"RUST_LOG": "info",
"LOGFMT": "text",
"OTEL_SDK_DISABLED": "true",
},
)

self.running = True
self._wait_until_ready()
return self

def stop(self) -> Self:
if not self.running:
return self

log.info("Stopping rest broker proxy")

if self._popen is not None:
self._popen.terminate()
try:
self._popen.wait(timeout=10)
except subprocess.TimeoutExpired:
log.warning("failed to gracefully terminate rest broker proxy; killing")
self._popen.kill()

self.running = False
return self

def get_binary_version(self) -> str:
cmd = [str(self.neon_binpath / "proxy"), "--version"]
res = subprocess.run(cmd, capture_output=True, text=True, check=True)
return res.stdout.strip()

@backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10)
def _wait_until_ready(self):
# Check if the WSS port is ready using a simple HTTPS request
# REST API is served on the WSS port with HTTPS
requests.get(f"https://{self.host}:{self.wss_port}/", timeout=1, verify=False)
# Any response (even error) means the server is up - we just need to connect

def get_metrics(self) -> str:
# Metrics are still on the HTTP port
response = requests.get(f"http://{self.host}:{self.http_port}/metrics", timeout=5)
response.raise_for_status()
return response.text

def assert_no_errors(self):
# Define allowed error patterns for rest broker proxy
allowed_errors = [
"connection closed before message completed",
"connection reset by peer",
"broken pipe",
"client disconnected",
"Authentication failed",
"connection timed out",
"no connection available",
"Pool dropped",
]

with open(self.logfile) as f:
for line in f:
if "ERROR" in line or "FATAL" in line:
if not any(allowed in line for allowed in allowed_errors):
raise AssertionError(
f"Found error in rest broker proxy log: {line.strip()}"
)

def __enter__(self) -> Self:
return self

def __exit__(
self,
exc_type: type[BaseException] | None,
exc_value: BaseException | None,
traceback: TracebackType | None,
):
self.stop()


@pytest.fixture(scope="function")
def link_proxy(
port_distributor: PortDistributor, neon_binpath: Path, test_output_dir: Path
@@ -4200,6 +4497,81 @@ def static_proxy(
yield proxy


@pytest.fixture(scope="function")
def local_proxy(
vanilla_pg: VanillaPostgres,
port_distributor: PortDistributor,
neon_binpath: Path,
test_output_dir: Path,
) -> Iterator[NeonLocalProxy]:
"""Local proxy that connects directly to vanilla postgres for rest broker testing."""

# Start vanilla_pg without database bootstrapping
vanilla_pg.start()

http_port = port_distributor.get_port()
metrics_port = port_distributor.get_port()

with NeonLocalProxy(
neon_binpath=neon_binpath,
test_output_dir=test_output_dir,
http_port=http_port,
metrics_port=metrics_port,
vanilla_pg=vanilla_pg,
) as proxy:
proxy.start()
yield proxy


@pytest.fixture(scope="function")
def local_proxy_fixed_port(
vanilla_pg: VanillaPostgres,
neon_binpath: Path,
test_output_dir: Path,
) -> Iterator[NeonLocalProxy]:
"""Local proxy that connects directly to vanilla postgres on the hardcoded port 7432."""

# Start vanilla_pg without database bootstrapping
vanilla_pg.start()

# Use the hardcoded port that the rest broker proxy expects
http_port = 7432
metrics_port = 7433  # Use a different port for metrics

with NeonLocalProxy(
neon_binpath=neon_binpath,
test_output_dir=test_output_dir,
http_port=http_port,
metrics_port=metrics_port,
vanilla_pg=vanilla_pg,
) as proxy:
proxy.start()
yield proxy


@pytest.fixture(scope="function")
def rest_broker_proxy(
port_distributor: PortDistributor,
neon_binpath: Path,
test_output_dir: Path,
) -> Iterator[NeonRestBrokerProxy]:
"""Rest broker proxy that handles both auth broker and rest broker functionality."""

wss_port = port_distributor.get_port()
http_port = port_distributor.get_port()
mgmt_port = port_distributor.get_port()

with NeonRestBrokerProxy(
neon_binpath=neon_binpath,
test_output_dir=test_output_dir,
wss_port=wss_port,
http_port=http_port,
mgmt_port=mgmt_port,
) as proxy:
proxy.start()
yield proxy


@pytest.fixture(scope="function")
def neon_authorize_jwk() -> jwk.JWK:
kid = str(uuid.uuid4())
@@ -4324,6 +4696,7 @@ class Endpoint(PgProtocol, LogUtils):
pageserver_id: int | None = None,
allow_multiple: bool = False,
update_catalog: bool = False,
privileged_role_name: str | None = None,
) -> Self:
"""
Create a new Postgres endpoint.
@@ -4351,6 +4724,7 @@ class Endpoint(PgProtocol, LogUtils):
pageserver_id=pageserver_id,
allow_multiple=allow_multiple,
update_catalog=update_catalog,
privileged_role_name=privileged_role_name,
)
path = Path("endpoints") / self.endpoint_id / "pgdata"
self.pgdata_dir = self.env.repo_dir / path
@@ -4800,6 +5174,7 @@ class EndpointFactory:
config_lines: list[str] | None = None,
pageserver_id: int | None = None,
update_catalog: bool = False,
privileged_role_name: str | None = None,
) -> Endpoint:
ep = Endpoint(
self.env,
@@ -4823,6 +5198,7 @@ class EndpointFactory:
config_lines=config_lines,
pageserver_id=pageserver_id,
update_catalog=update_catalog,
privileged_role_name=privileged_role_name,
)

def stop_all(self, fail_on_error=True) -> Self:
@@ -5417,6 +5793,7 @@ SKIP_FILES = frozenset(
"postmaster.pid",
"pg_control",
"pg_dynshmem",
"neon-communicator.socket",
)
)


@@ -152,6 +152,8 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
".*reconciler.*neon_local error.*",
# Tenant rate limits may fire in tests that submit lots of API requests.
".*tenant \\S+ is rate limited.*",
# Reconciliations may get stuck/delayed e.g. in chaos tests.
".*background_reconcile: Shard reconciliation is stuck.*",
]


@@ -847,7 +847,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
return res_json

def timeline_lsn_lease(
self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn
self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn, **kwargs
):
data = {
"lsn": str(lsn),
@@ -857,6 +857,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
res = self.post(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/lsn_lease",
json=data,
**kwargs,
)
self.verbose_error(res)
res_json = res.json()
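A hedged sketch of what the new **kwargs enables (the timeout value and the `ps_http` client variable are illustrative, not part of this diff):

    # Extra keyword arguments are forwarded to self.post(), e.g. a per-call timeout:
    ps_http.timeline_lsn_lease(tenant_id, timeline_id, lsn, timeout=30)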

@@ -741,3 +741,29 @@ def shared_buffers_for_max_cu(max_cu: float) -> str:
sharedBuffersMb = int(max(128, (1023 + maxBackends * 256) / 1024))
sharedBuffers = int(sharedBuffersMb * 1024 / 8)
return str(sharedBuffers)
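For intuition, a worked example of the arithmetic above (maxBackends is derived from max_cu earlier in the function, which this hunk does not show; 100 is an assumed value):

    # maxBackends = 100 (assumed)
    # sharedBuffersMb = int(max(128, (1023 + 100 * 256) / 1024)) = int(max(128, 26.0)) = 128
    # sharedBuffers   = int(128 * 1024 / 8) = 16384   # returned in 8 kB pages, i.e. 128 MB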


def skip_if_proxy_lacks_rest_broker(reason: str = "proxy was built without 'rest_broker' feature"):
# Determine the binary path using the same logic as neon_binpath fixture
def has_rest_broker_feature():
# Find the neon binaries
if env_neon_bin := os.environ.get("NEON_BIN"):
binpath = Path(env_neon_bin)
else:
base_dir = Path(__file__).parents[2]  # Same as BASE_DIR in paths.py
build_type = os.environ.get("BUILD_TYPE", "debug")
binpath = base_dir / "target" / build_type

proxy_bin = binpath / "proxy"
if not proxy_bin.exists():
return False

try:
cmd = [str(proxy_bin), "--help"]
result = subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=10)
help_output = result.stdout
return "--is-rest-broker" in help_output
except (subprocess.CalledProcessError, subprocess.TimeoutExpired, FileNotFoundError):
return False

return pytest.mark.skipif(not has_rest_broker_feature(), reason=reason)
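A typical way to apply the marker, mirroring how test_rest_broker.py below uses it (the test name and fixture list here are illustrative):

    @skip_if_proxy_lacks_rest_broker()
    def test_something_rest(rest_broker_proxy):
        ...  # only runs when `proxy --help` advertises --is-rest-broker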

@@ -73,6 +73,11 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
".*Local notification hook failed.*",
".*Marking shard.*for notification retry.*",
".*Failed to notify compute.*",
# As an optimization, the storage controller kicks the downloads on the secondary
# after the shard split. However, secondaries are created async, so it's possible
# that the intent state was modified, but the actual secondary hasn't been created,
# which results in an error.
".*Error calling secondary download after shard split.*",
]
)


@@ -711,6 +711,9 @@ def test_api_random(
# To not go to the past where pgbench tables do not exist
time.sleep(1)
project.min_time = datetime.now(UTC)
# To not go to the past where pgbench tables do not exist
time.sleep(1)
project.min_time = datetime.now(UTC)
for _ in range(num_operations):
log.info("Starting action #%s", _ + 1)
while not do_action(

@@ -24,10 +24,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
[
".*get_values_reconstruct_data for layer .*",
".*could not find data for key.*",
".*is not active. Current state: Broken.*",
".*will not become active. Current state: Broken.*",
".*failed to load metadata.*",
".*load failed.*load local timeline.*",
".*: layer load failed, assuming permanent failure:.*",
".*failed to get checkpoint bytes.*",
".*failed to get control bytes.*",

test_runner/regress/test_communicator_metrics_exporter.py (new file, 54 lines)
@@ -0,0 +1,54 @@
from __future__ import annotations

import os
from typing import TYPE_CHECKING

import pytest
import requests
import requests_unixsocket  # type: ignore [import-untyped]
from fixtures.metrics import parse_metrics

if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnv

NEON_COMMUNICATOR_SOCKET_NAME = "neon-communicator.socket"


def test_communicator_metrics(neon_simple_env: NeonEnv):
"""
Test the communicator's built-in HTTP prometheus exporter
"""
env = neon_simple_env

endpoint = env.endpoints.create("main")
endpoint.start()

# Change current directory to the data directory, so that we can use
# a short relative path to refer to the socket. (There's a 100 char
# limitation on the path.)
os.chdir(str(endpoint.pgdata_dir))
session = requests_unixsocket.Session()
r = session.get(f"http+unix://{NEON_COMMUNICATOR_SOCKET_NAME}/metrics")
assert r.status_code == 200, f"got response {r.status_code}: {r.text}"

# quick test that the endpoint returned something expected. (We don't validate
# that the metrics returned are sensible.)
m = parse_metrics(r.text)
m.query_one("lfc_hits")
m.query_one("lfc_misses")

# Test panic handling. The /debug/panic endpoint raises a Rust panic. It's
# expected to unwind and drop the HTTP connection without response, but not
# kill the process or the server.
with pytest.raises(
requests.ConnectionError, match="Remote end closed connection without response"
):
r = session.get(f"http+unix://{NEON_COMMUNICATOR_SOCKET_NAME}/debug/panic")
assert r.status_code == 500

# Test that subsequent requests after the panic still work.
r = session.get(f"http+unix://{NEON_COMMUNICATOR_SOCKET_NAME}/metrics")
assert r.status_code == 200, f"got response {r.status_code}: {r.text}"
m = parse_metrics(r.text)
m.query_one("lfc_hits")
m.query_one("lfc_misses")

@@ -687,7 +687,7 @@ def test_sharding_compaction(
for _i in range(0, 10):
# Each of these does some writes then a checkpoint: because we set image_creation_threshold to 1,
# these should result in image layers each time we write some data into a shard, and also shards
# recieving less data hitting their "empty image layer" path (wherre they should skip writing the layer,
# receiving less data hitting their "empty image layer" path (where they should skip writing the layer,
# rather than asserting)
workload.churn_rows(64)


@@ -187,19 +187,21 @@ def test_create_snapshot(
env.pageserver.stop()
env.storage_controller.stop()

# Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it
compatibility_snapshot_dir = (
# Directory `new_compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it
new_compatibility_snapshot_dir = (
top_output_dir / f"compatibility_snapshot_pg{pg_version.v_prefixed}"
)
if compatibility_snapshot_dir.exists():
shutil.rmtree(compatibility_snapshot_dir)
if new_compatibility_snapshot_dir.exists():
shutil.rmtree(new_compatibility_snapshot_dir)

shutil.copytree(
test_output_dir,
compatibility_snapshot_dir,
ignore=shutil.ignore_patterns("pg_dynshmem"),
new_compatibility_snapshot_dir,
ignore=shutil.ignore_patterns("pg_dynshmem", "neon-communicator.socket"),
)

log.info(f"Copied new compatibility snapshot dir to: {new_compatibility_snapshot_dir}")


# check_neon_works does recovery from WAL => the compatibility snapshot's WAL is old => will log this warning
ingest_lag_log_line = ".*ingesting record with timestamp lagging more than wait_lsn_timeout.*"
@@ -218,6 +220,7 @@ def test_backward_compatibility(
"""
Test that the new binaries can read old data
"""
log.info(f"Using snapshot dir at {compatibility_snapshot_dir}")
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo")
env.pageserver.allowed_errors.append(ingest_lag_log_line)
@@ -242,7 +245,6 @@ def test_forward_compatibility(
test_output_dir: Path,
top_output_dir: Path,
pg_version: PgVersion,
compatibility_snapshot_dir: Path,
compute_reconfigure_listener: ComputeReconfigure,
):
"""
@@ -266,8 +268,14 @@ def test_forward_compatibility(
neon_env_builder.neon_binpath = neon_env_builder.compatibility_neon_binpath
neon_env_builder.pg_distrib_dir = neon_env_builder.compatibility_pg_distrib_dir

# Note that we are testing with new data, so we should use `new_compatibility_snapshot_dir`, which is created by test_create_snapshot.
new_compatibility_snapshot_dir = (
top_output_dir / f"compatibility_snapshot_pg{pg_version.v_prefixed}"
)

log.info(f"Using snapshot dir at {new_compatibility_snapshot_dir}")
env = neon_env_builder.from_repo_dir(
compatibility_snapshot_dir / "repo",
new_compatibility_snapshot_dir / "repo",
)
# there may be an arbitrary number of unrelated tests run between create_snapshot and here
env.pageserver.allowed_errors.append(ingest_lag_log_line)
@@ -296,7 +304,7 @@ def test_forward_compatibility(
check_neon_works(
env,
test_output_dir=test_output_dir,
sql_dump_path=compatibility_snapshot_dir / "dump.sql",
sql_dump_path=new_compatibility_snapshot_dir / "dump.sql",
repo_dir=env.repo_dir,
)


@@ -50,11 +50,15 @@ def test_feature_flag(neon_env_builder: NeonEnvBuilder):
)["result"]
)

env.endpoints.create_start("main")  # trigger basebackup
env.pageserver.http_client().force_refresh_feature_flag(env.initial_tenant)

# Check if the properties exist
result = env.pageserver.http_client().evaluate_feature_flag_multivariate(
env.initial_tenant, "test-feature-flag"
)

assert "tenant_remote_size_mb" in result["properties"]
assert "tenant_db_count_max" in result["properties"]
assert "tenant_rel_count_max" in result["properties"]
assert "tenant_id" in result["properties"]

test_runner/regress/test_hcc_handling_ps_data_loss.py (new file, 47 lines)
@@ -0,0 +1,47 @@
import shutil

from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.utils import query_scalar


def test_hcc_handling_ps_data_loss(
neon_env_builder: NeonEnvBuilder,
):
"""
Test that following a pageserver local data loss event, the system can recover automatically (i.e.
rehydrating the restarted pageserver from remote storage) without manual intervention. The
pageserver indicates to the storage controller that it has restarted without any local tenant
data in its "reattach" request and the storage controller uses this information to detect the
data loss condition and reconfigure the pageserver as necessary.
"""
env = neon_env_builder.init_configs()
env.broker.start()
env.storage_controller.start(handle_ps_local_disk_loss=True)
env.pageserver.start()
for sk in env.safekeepers:
sk.start()

# create new nenant
tenant_id, _ = env.create_tenant(shard_count=4)

endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
with endpoint.cursor() as cur:
cur.execute("SELECT pg_logical_emit_message(false, 'neon-test', 'between inserts')")
cur.execute("CREATE DATABASE testdb")

with endpoint.cursor(dbname="testdb") as cur:
cur.execute("CREATE TABLE tbl_one_hundred_rows AS SELECT generate_series(1,100)")
endpoint.stop()

# Kill the pageserver, remove the `tenants/` directory, and restart. This simulates a pageserver
# that restarted with the same ID but has lost all its local disk data.
env.pageserver.stop(immediate=True)
shutil.rmtree(env.pageserver.tenant_dir())
env.pageserver.start()

# Test that the endpoint can start and query the database after the pageserver restarts. This
# indirectly tests that the pageserver was able to rehydrate the tenant data it lost from remote
# storage automatically.
endpoint.start()
with endpoint.cursor(dbname="testdb") as cur:
assert query_scalar(cur, "SELECT count(*) FROM tbl_one_hundred_rows") == 100

@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING

import pytest
from fixtures.log_helper import log
from fixtures.metrics import parse_metrics
from fixtures.utils import USE_LFC, query_scalar

if TYPE_CHECKING:
@@ -75,10 +76,24 @@ WITH (fillfactor='100');
cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 104242")
cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 204242")
# verify working set size after some index access of a few select pages only
blocks = query_scalar(cur, "select approximate_working_set_size(true)")
blocks = query_scalar(cur, "select approximate_working_set_size(false)")
log.info(f"working set size after some index access of a few select pages only {blocks}")
assert blocks < 20

# Also test the metrics from the /autoscaling_metrics endpoint
autoscaling_metrics = endpoint.http_client().autoscaling_metrics()
log.debug(f"Raw metrics: {autoscaling_metrics}")
m = parse_metrics(autoscaling_metrics)

http_estimate = m.query_one(
"lfc_approximate_working_set_size_windows",
{
"duration_seconds": "60",
},
).value
log.info(f"http estimate: {http_estimate}, blocks: {blocks}")
assert http_estimate > 0 and http_estimate < 20


@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
def test_sliding_working_set_approximation(neon_simple_env: NeonEnv):

@@ -103,3 +103,90 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
query = "DROP SUBSCRIPTION sub CASCADE"
log.info(f"Dropping subscription: {query}")
cur.execute(query)


def test_privileged_role_override(neon_simple_env: NeonEnv, pg_version: PgVersion):
"""
Test that we can override the privileged role for an endpoint and when we do it,
everything is correctly bootstrapped inside Postgres and we don't have neon_superuser
role in the database.
"""
PRIVILEGED_ROLE_NAME = "my_superuser"

env = neon_simple_env
env.create_branch("test_privileged_role_override")
ep = env.endpoints.create(
"test_privileged_role_override",
privileged_role_name=PRIVILEGED_ROLE_NAME,
update_catalog=True,
)

ep.start()

ep.wait_for_migrations()

member_roles = [
"pg_read_all_data",
"pg_write_all_data",
"pg_monitor",
"pg_signal_backend",
]

non_member_roles = [
"pg_execute_server_program",
"pg_read_server_files",
"pg_write_server_files",
]

role_attributes = {
"rolsuper": False,
"rolinherit": True,
"rolcreaterole": True,
"rolcreatedb": True,
"rolcanlogin": False,
"rolreplication": True,
"rolconnlimit": -1,
"rolbypassrls": True,
}

if pg_version >= PgVersion.V15:
non_member_roles.append("pg_checkpoint")

if pg_version >= PgVersion.V16:
member_roles.append("pg_create_subscription")
non_member_roles.append("pg_use_reserved_connections")

with ep.cursor() as cur:
cur.execute(f"SELECT rolname FROM pg_roles WHERE rolname = '{PRIVILEGED_ROLE_NAME}'")
assert cur.fetchall()[0][0] == PRIVILEGED_ROLE_NAME

cur.execute("SELECT rolname FROM pg_roles WHERE rolname = 'neon_superuser'")
assert len(cur.fetchall()) == 0

cur.execute("SHOW neon.privileged_role_name")
assert cur.fetchall()[0][0] == PRIVILEGED_ROLE_NAME

# check PRIVILEGED_ROLE_NAME role is created
cur.execute(f"select * from pg_roles where rolname = '{PRIVILEGED_ROLE_NAME}'")
assert cur.fetchone() is not None

# check PRIVILEGED_ROLE_NAME role has the correct member roles
for role in member_roles:
cur.execute(f"SELECT pg_has_role('{PRIVILEGED_ROLE_NAME}', '{role}', 'member')")
assert cur.fetchone() == (True,), (
f"Role {role} should be a member of {PRIVILEGED_ROLE_NAME}"
)

for role in non_member_roles:
cur.execute(f"SELECT pg_has_role('{PRIVILEGED_ROLE_NAME}', '{role}', 'member')")
assert cur.fetchone() == (False,), (
f"Role {role} should not be a member of {PRIVILEGED_ROLE_NAME}"
)

# check PRIVILEGED_ROLE_NAME role has the correct role attributes
for attr, val in role_attributes.items():
cur.execute(f"SELECT {attr} FROM pg_roles WHERE rolname = '{PRIVILEGED_ROLE_NAME}'")
curr_val = cur.fetchone()
assert curr_val == (val,), (
f"Role attribute {attr} should be {val} instead of {curr_val}"
)

@@ -246,9 +246,9 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder):

system_memory = psutil.virtual_memory().total

# The smallest total size limit we can configure is 1/1024th of the system memory (e.g. 128MB on
# a system with 128GB of RAM). We will then write enough data to violate this limit.
max_dirty_data = 128 * 1024 * 1024
# The smallest total size limit we can configure is 1/1024th of the system memory (e.g. 256MB on
# a system with 256GB of RAM). We will then write enough data to violate this limit.
max_dirty_data = 256 * 1024 * 1024
ephemeral_bytes_per_memory_kb = (max_dirty_data * 1024) // system_memory
assert ephemeral_bytes_per_memory_kb > 0

@@ -272,7 +272,7 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder):
timeline_count = 10

# This is about 2MiB of data per timeline
entries_per_timeline = 100_000
entries_per_timeline = 200_000

last_flush_lsns = asyncio.run(workload(env, tenant_conf, timeline_count, entries_per_timeline))
wait_until_pageserver_is_caught_up(env, last_flush_lsns)

@@ -3,6 +3,7 @@
#
from __future__ import annotations

import time
from concurrent.futures import ThreadPoolExecutor
from typing import TYPE_CHECKING, Any, cast

@@ -356,6 +357,81 @@ def test_sql_regress(
post_checks(env, test_output_dir, DBNAME, endpoint)


def test_max_wal_rate(neon_simple_env: NeonEnv):
"""
Test the databricks.max_wal_mb_per_second GUC and how it affects WAL rate
limiting.
"""
env = neon_simple_env

DBNAME = "regression"
superuser_name = "databricks_superuser"

# Connect to postgres and create a database called "regression".
endpoint = env.endpoints.create_start(
"main",
config_lines=[
# we need this option because default max_cluster_size < 0 will disable throttling completely
"neon.max_cluster_size=10GB",
],
)

endpoint.safe_psql_many(
[
f"CREATE ROLE {superuser_name}",
f"CREATE DATABASE {DBNAME}",
"CREATE EXTENSION neon",
]
)

endpoint.safe_psql("CREATE TABLE usertable (YCSB_KEY INT, FIELD0 TEXT);", dbname=DBNAME)

# Write ~1 MB data.
with endpoint.cursor(dbname=DBNAME) as cur:
for _ in range(0, 1000):
cur.execute("INSERT INTO usertable SELECT random(), repeat('a', 1000);")

# No backpressure
tuples = endpoint.safe_psql("SELECT backpressure_throttling_time();")
assert tuples[0][0] == 0, "Backpressure throttling detected"

# 0 MB/s max_wal_rate. WAL proposer can still push some WALs but will be super slow.
endpoint.safe_psql_many(
[
"ALTER SYSTEM SET databricks.max_wal_mb_per_second = 0;",
"SELECT pg_reload_conf();",
]
)

# Write ~10 KB data should hit backpressure.
with endpoint.cursor(dbname=DBNAME) as cur:
cur.execute("SET databricks.max_wal_mb_per_second = 0;")
for _ in range(0, 10):
cur.execute("INSERT INTO usertable SELECT random(), repeat('a', 1000);")

tuples = endpoint.safe_psql("SELECT backpressure_throttling_time();")
assert tuples[0][0] > 0, "No backpressure throttling detected"

# 1 MB/s max_wal_rate.
endpoint.safe_psql_many(
[
"ALTER SYSTEM SET databricks.max_wal_mb_per_second = 1;",
"SELECT pg_reload_conf();",
]
)

# Write 10 MB data.
with endpoint.cursor(dbname=DBNAME) as cur:
start = int(time.time())
for _ in range(0, 10000):
cur.execute("INSERT INTO usertable SELECT random(), repeat('a', 1000);")

end = int(time.time())
assert end - start >= 10, (
"Throttling should cause the previous inserts to take greater than or equal to 10 seconds"
)


@skip_in_debug_build("only run with release build")
@pytest.mark.parametrize("reldir_type", ["v1", "v2"])
def test_tx_abort_with_many_relations(

test_runner/regress/test_rest_broker.py (new file, 137 lines)
@@ -0,0 +1,137 @@
|
||||
import json
|
||||
import signal
|
||||
import time
|
||||
|
||||
import requests
|
||||
from fixtures.utils import skip_if_proxy_lacks_rest_broker
|
||||
from jwcrypto import jwt
|
||||
|
||||
|
||||
@skip_if_proxy_lacks_rest_broker()
|
||||
def test_rest_broker_happy(
|
||||
local_proxy_fixed_port, rest_broker_proxy, vanilla_pg, neon_authorize_jwk, httpserver
|
||||
):
|
||||
"""Test REST API endpoint using local_proxy and rest_broker_proxy."""
|
||||
|
||||
# Use the fixed port local proxy
|
||||
local_proxy = local_proxy_fixed_port
|
||||
|
||||
# Create the required roles for PostgREST authentication
|
||||
vanilla_pg.safe_psql("CREATE ROLE authenticator LOGIN")
|
||||
vanilla_pg.safe_psql("CREATE ROLE authenticated")
|
||||
vanilla_pg.safe_psql("CREATE ROLE anon")
|
||||
vanilla_pg.safe_psql("GRANT authenticated TO authenticator")
|
||||
vanilla_pg.safe_psql("GRANT anon TO authenticator")
|
||||
|
||||
# Create the pgrst schema and configuration function required by the rest broker
|
||||
vanilla_pg.safe_psql("CREATE SCHEMA IF NOT EXISTS pgrst")
|
||||
vanilla_pg.safe_psql("""
|
||||
CREATE OR REPLACE FUNCTION pgrst.pre_config()
|
||||
RETURNS VOID AS $$
|
||||
SELECT
|
||||
set_config('pgrst.db_schemas', 'test', true)
|
||||
, set_config('pgrst.db_aggregates_enabled', 'true', true)
|
||||
, set_config('pgrst.db_anon_role', 'anon', true)
|
||||
, set_config('pgrst.jwt_aud', '', true)
|
||||
, set_config('pgrst.jwt_secret', '', true)
|
||||
, set_config('pgrst.jwt_role_claim_key', '."role"', true)
|
||||
|
||||
$$ LANGUAGE SQL;
|
||||
""")
|
||||
vanilla_pg.safe_psql("GRANT USAGE ON SCHEMA pgrst TO authenticator")
|
||||
vanilla_pg.safe_psql("GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA pgrst TO authenticator")
|
||||
|
||||
# Bootstrap the database with test data
|
||||
vanilla_pg.safe_psql("CREATE SCHEMA IF NOT EXISTS test")
|
||||
vanilla_pg.safe_psql("""
|
||||
CREATE TABLE IF NOT EXISTS test.items (
|
||||
id SERIAL PRIMARY KEY,
|
||||
name TEXT NOT NULL
|
||||
)
|
||||
""")
|
||||
vanilla_pg.safe_psql("INSERT INTO test.items (name) VALUES ('test_item')")
|
||||
|
||||
# Grant access to the test schema for the authenticated role
|
||||
vanilla_pg.safe_psql("GRANT USAGE ON SCHEMA test TO authenticated")
|
||||
vanilla_pg.safe_psql("GRANT SELECT ON ALL TABLES IN SCHEMA test TO authenticated")
|
||||
|
||||
# Set up HTTP server to serve JWKS (like static_auth_broker)
|
||||
# Generate public key from the JWK
|
||||
public_key = neon_authorize_jwk.export_public(as_dict=True)
|
||||
|
||||
# Set up the httpserver to serve the JWKS
|
||||
httpserver.expect_request("/.well-known/jwks.json").respond_with_json({"keys": [public_key]})
|
||||
|
||||
# Create JWKS configuration for the rest broker proxy
|
||||
jwks_config = {
|
||||
"jwks": [
|
||||
{
|
||||
"id": "1",
|
||||
"role_names": ["authenticator", "authenticated", "anon"],
|
||||
"jwks_url": httpserver.url_for("/.well-known/jwks.json"),
|
||||
"provider_name": "foo",
|
||||
"jwt_audience": None,
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
# Write the JWKS config to the config file that rest_broker_proxy expects
|
||||
config_file = rest_broker_proxy.config_path
|
||||
with open(config_file, "w") as f:
|
||||
json.dump(jwks_config, f)
|
||||
|
||||
# Write the same config to the local_proxy config file
|
||||
local_config_file = local_proxy.config_path
|
||||
with open(local_config_file, "w") as f:
|
||||
json.dump(jwks_config, f)
|
||||
|
||||
# Signal both proxies to reload their config
|
||||
if rest_broker_proxy._popen is not None:
|
||||
rest_broker_proxy._popen.send_signal(signal.SIGHUP)
|
||||
if local_proxy._popen is not None:
|
||||
local_proxy._popen.send_signal(signal.SIGHUP)
|
||||
# Wait a bit for config to reload
|
||||
time.sleep(0.5)
|
||||
|
||||
# Generate a proper JWT token using the JWK (similar to test_auth_broker.py)
|
||||
token = jwt.JWT(
|
||||
header={"kid": neon_authorize_jwk.key_id, "alg": "RS256"},
|
||||
claims={
|
||||
"sub": "user",
|
||||
"role": "authenticated", # role that's in role_names
|
||||
"exp": 9999999999, # expires far in the future
|
||||
"iat": 1000000000, # issued at
|
||||
},
|
||||
)
|
||||
token.make_signed_token(neon_authorize_jwk)
|
||||
|
||||
# Debug: Print the JWT claims and config for troubleshooting
|
||||
print(f"JWT claims: {token.claims}")
|
||||
print(f"JWT header: {token.header}")
|
||||
print(f"Config file contains: {jwks_config}")
|
||||
print(f"Public key kid: {public_key.get('kid')}")
|
||||
|
||||
# Test REST API call - following SUBZERO.md pattern
|
||||
# REST API is served on the WSS port with HTTPS and includes database name
|
||||
# ep-purple-glitter-adqior4l-pooler.c-2.us-east-1.aws.neon.tech
|
||||
url = f"https://foo.apirest.c-2.local.neon.build:{rest_broker_proxy.wss_port}/postgres/rest/v1/items"
|
||||
|
||||
response = requests.get(
|
||||
url,
|
||||
headers={
|
||||
"Authorization": f"Bearer {token.serialize()}",
|
||||
},
|
||||
params={"id": "eq.1", "select": "name"},
|
||||
verify=False, # Skip SSL verification for self-signed certs
|
||||
)
|
||||
|
||||
print(f"Response status: {response.status_code}")
|
||||
print(f"Response headers: {response.headers}")
|
||||
print(f"Response body: {response.text}")
|
||||
|
||||
# For now, let's just check that we get some response
|
||||
# We can refine the assertions once we see what the actual response looks like
|
||||
assert response.status_code in [200] # Any response means the proxies are working
|
||||
|
||||
# check the response body
|
||||
assert response.json() == [{"name": "test_item"}]
|
||||
@@ -3,11 +3,22 @@ from __future__ import annotations
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import StorageControllerApiException
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
|
||||
# TODO(diko): pageserver spams with various errors during safekeeper migration.
|
||||
# Fix the code so it handles the migration better.
|
||||
ALLOWED_PAGESERVER_ERRORS = [
|
||||
".*Timeline .* was cancelled and cannot be used anymore.*",
|
||||
".*Timeline .* has been deleted.*",
|
||||
".*Timeline .* was not found in global map.*",
|
||||
".*wal receiver task finished with an error.*",
|
||||
]
|
||||
|
||||
|
||||
def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
@@ -24,16 +35,7 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
|
||||
"timeline_safekeeper_count": 1,
|
||||
}
|
||||
env = neon_env_builder.init_start()
|
||||
# TODO(diko): pageserver spams with various errors during safekeeper migration.
|
||||
# Fix the code so it handles the migration better.
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
".*Timeline .* was cancelled and cannot be used anymore.*",
|
||||
".*Timeline .* has been deleted.*",
|
||||
".*Timeline .* was not found in global map.*",
|
||||
".*wal receiver task finished with an error.*",
|
||||
]
|
||||
)
|
||||
env.pageserver.allowed_errors.extend(ALLOWED_PAGESERVER_ERRORS)
|
||||
|
||||
ep = env.endpoints.create("main", tenant_id=env.initial_tenant)
|
||||
|
||||
@@ -42,15 +44,23 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
|
||||
assert len(mconf["sk_set"]) == 1
|
||||
assert mconf["generation"] == 1
|
||||
|
||||
current_sk = mconf["sk_set"][0]
|
||||
|
||||
ep.start(safekeeper_generation=1, safekeepers=mconf["sk_set"])
|
||||
ep.safe_psql("CREATE EXTENSION neon_test_utils;")
|
||||
ep.safe_psql("CREATE TABLE t(a int)")
|
||||
|
||||
expected_gen = 1
|
||||
|
||||
for active_sk in range(1, 4):
|
||||
env.storage_controller.migrate_safekeepers(
|
||||
env.initial_tenant, env.initial_timeline, [active_sk]
|
||||
)
|
||||
|
||||
if active_sk != current_sk:
|
||||
expected_gen += 2
|
||||
current_sk = active_sk
|
||||
|
||||
other_sks = [sk for sk in range(1, 4) if sk != active_sk]
|
||||
|
||||
for sk in other_sks:
|
||||
@@ -65,9 +75,6 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)]
|
||||
|
||||
# 1 initial generation + 2 migrations on each loop iteration.
|
||||
expected_gen = 1 + 2 * 3
|
||||
|
||||
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
|
||||
assert mconf["generation"] == expected_gen
|
||||
|
||||
@@ -113,3 +120,79 @@ def test_new_sk_set_validation(neon_env_builder: NeonEnvBuilder):
|
||||
env.storage_controller.safekeeper_scheduling_policy(decom_sk, "Decomissioned")
|
||||
|
||||
expect_fail([sk_set[0], decom_sk], "decomissioned")
|
||||
|
||||
|
||||
def test_safekeeper_migration_common_set_failpoints(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Test that safekeeper migration handles failures well.
|
||||
|
||||
Two main conditions are checked:
|
||||
1. safekeeper migration handler can be retried on different failures.
|
||||
2. writes do not stuck if sk_set and new_sk_set have a quorum in common.
|
||||
"""
|
||||
neon_env_builder.num_safekeepers = 4
|
||||
neon_env_builder.storage_controller_config = {
|
||||
"timelines_onto_safekeepers": True,
|
||||
"timeline_safekeeper_count": 3,
|
||||
}
|
||||
env = neon_env_builder.init_start()
|
||||
env.pageserver.allowed_errors.extend(ALLOWED_PAGESERVER_ERRORS)
|
||||
|
||||
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
|
||||
assert len(mconf["sk_set"]) == 3
|
||||
assert mconf["generation"] == 1
|
||||
|
||||
ep = env.endpoints.create("main", tenant_id=env.initial_tenant)
|
||||
ep.start(safekeeper_generation=1, safekeepers=mconf["sk_set"])
|
||||
ep.safe_psql("CREATE EXTENSION neon_test_utils;")
|
||||
ep.safe_psql("CREATE TABLE t(a int)")
|
||||
|
||||
excluded_sk = mconf["sk_set"][-1]
|
||||
added_sk = [sk.id for sk in env.safekeepers if sk.id not in mconf["sk_set"]][0]
|
||||
new_sk_set = mconf["sk_set"][:-1] + [added_sk]
|
||||
log.info(f"migrating sk set from {mconf['sk_set']} to {new_sk_set}")
|
||||
|
||||
failpoints = [
|
||||
"sk-migration-after-step-3",
|
||||
"sk-migration-after-step-4",
|
||||
"sk-migration-after-step-5",
|
||||
"sk-migration-after-step-7",
|
||||
"sk-migration-after-step-8",
|
||||
"sk-migration-step-9-after-set-membership",
|
||||
"sk-migration-step-9-mid-exclude",
|
||||
"sk-migration-step-9-after-exclude",
|
||||
"sk-migration-after-step-9",
|
||||
]
|
||||
|
||||
for i, fp in enumerate(failpoints):
|
||||
env.storage_controller.configure_failpoints((fp, "return(1)"))
|
||||
|
||||
with pytest.raises(StorageControllerApiException, match=f"failpoint {fp}"):
|
||||
env.storage_controller.migrate_safekeepers(
|
||||
env.initial_tenant, env.initial_timeline, new_sk_set
|
||||
)
|
||||
ep.safe_psql(f"INSERT INTO t VALUES ({i})")
|
||||
|
||||
env.storage_controller.configure_failpoints((fp, "off"))
|
||||
|
||||
# No failpoints, migration should succeed.
|
||||
env.storage_controller.migrate_safekeepers(env.initial_tenant, env.initial_timeline, new_sk_set)
|
||||
|
||||
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
|
||||
assert mconf["new_sk_set"] is None
|
||||
assert mconf["sk_set"] == new_sk_set
|
||||
assert mconf["generation"] == 3
|
||||
|
||||
ep.clear_buffers()
|
||||
assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(len(failpoints))]
|
||||
assert ep.safe_psql("SHOW neon.safekeepers")[0][0].startswith("g#3:")
|
||||
|
||||
# Check that we didn't forget to remove the timeline on the excluded safekeeper.
|
||||
with pytest.raises(requests.exceptions.HTTPError) as exc:
|
||||
env.safekeepers[excluded_sk - 1].http_client().timeline_status(
|
||||
env.initial_tenant, env.initial_timeline
|
||||
)
|
||||
assert exc.value.response.status_code == 404
|
||||
assert (
|
||||
f"timeline {env.initial_tenant}/{env.initial_timeline} deleted" in exc.value.response.text
|
||||
)
|
||||
|
||||
@@ -1810,6 +1810,8 @@ def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder):
|
||||
"config_lines": [
|
||||
# Tip: set to 100MB to make the test fail
|
||||
"max_replication_write_lag=1MB",
|
||||
# Hadron: Need to set max_cluster_size to some value to enable any backpressure at all.
|
||||
"neon.max_cluster_size=1GB",
|
||||
],
|
||||
# We need `neon` extension for calling backpressure functions,
|
||||
# this flag instructs `compute_ctl` to pre-install it.
|
||||
|
||||
@@ -12,7 +12,7 @@ from typing import TYPE_CHECKING
|
||||
import fixtures.utils
|
||||
import pytest
|
||||
from fixtures.auth_tokens import TokenScope
|
||||
from fixtures.common_types import TenantId, TenantShardId, TimelineId
|
||||
from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
DEFAULT_AZ_ID,
|
||||
@@ -47,6 +47,7 @@ from fixtures.utils import (
|
||||
wait_until,
|
||||
)
|
||||
from fixtures.workload import Workload
|
||||
from requests.adapters import HTTPAdapter
|
||||
from urllib3 import Retry
|
||||
from werkzeug.wrappers.response import Response
|
||||
|
||||
@@ -72,6 +73,12 @@ def get_node_shard_counts(env: NeonEnv, tenant_ids):
|
||||
return counts
|
||||
|
||||
|
||||
class DeletionAPIKind(Enum):
|
||||
OLD = "old"
|
||||
FORCE = "force"
|
||||
GRACEFUL = "graceful"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(**fixtures.utils.allpairs_versions())
|
||||
def test_storage_controller_smoke(
|
||||
neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure, combination
|
||||
@@ -990,7 +997,7 @@ def test_storage_controller_compute_hook_retry(
|
||||
|
||||
|
||||
@run_only_on_default_postgres("postgres behavior is not relevant")
|
||||
def test_storage_controller_compute_hook_keep_failing(
|
||||
def test_storage_controller_compute_hook_stuck_reconciles(
|
||||
httpserver: HTTPServer,
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
httpserver_listen_address: ListenAddress,
|
||||
@@ -1040,7 +1047,7 @@ def test_storage_controller_compute_hook_keep_failing(
|
||||
env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG)
|
||||
env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS)
|
||||
env.storage_controller.allowed_errors.append(".*Keeping extra secondaries.*")
|
||||
env.storage_controller.allowed_errors.append(".*Shard reconciliation is keep-failing.*")
|
||||
env.storage_controller.allowed_errors.append(".*Shard reconciliation is stuck.*")
|
||||
env.storage_controller.node_configure(banned_tenant_ps.id, {"availability": "Offline"})
|
||||
|
||||
# Migrate all allowed tenant shards to the first alive pageserver
|
||||
@@ -1055,7 +1062,7 @@ def test_storage_controller_compute_hook_keep_failing(
|
||||
|
||||
# Make some reconcile_all calls to trigger optimizations
|
||||
# RECONCILE_COUNT must be greater than storcon's MAX_CONSECUTIVE_RECONCILIATION_ERRORS
|
||||
RECONCILE_COUNT = 12
|
||||
RECONCILE_COUNT = 20
|
||||
for i in range(RECONCILE_COUNT):
|
||||
try:
|
||||
n = env.storage_controller.reconcile_all()
|
||||
@@ -1068,6 +1075,8 @@ def test_storage_controller_compute_hook_keep_failing(
|
||||
assert banned_descr["shards"][0]["is_pending_compute_notification"] is True
|
||||
time.sleep(2)
|
||||
|
||||
env.storage_controller.assert_log_contains(".*Shard reconciliation is stuck.*")
|
||||
|
||||
# Check that the allowed tenant shards are optimized due to affinity rules
|
||||
locations = alive_pageservers[0].http_client().tenant_list_locations()["tenant_shards"]
|
||||
not_optimized_shard_count = 0
@@ -2572,9 +2581,11 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder):


@pytest.mark.parametrize("while_offline", [True, False])
@pytest.mark.parametrize("deletion_api", [DeletionAPIKind.OLD, DeletionAPIKind.FORCE])
def test_storage_controller_node_deletion(
neon_env_builder: NeonEnvBuilder,
while_offline: bool,
deletion_api: DeletionAPIKind,
):
"""
Test that deleting a node works & properly reschedules everything that was on the node.
@@ -2598,6 +2609,8 @@ def test_storage_controller_node_deletion(
assert env.storage_controller.reconcile_all() == 0

victim = env.pageservers[-1]
if deletion_api == DeletionAPIKind.FORCE and not while_offline:
victim.allowed_errors.append(".*request was dropped before completing.*")

# The procedure a human would follow is:
# 1. Mark pageserver scheduling=pause
@@ -2621,7 +2634,12 @@ def test_storage_controller_node_deletion(
wait_until(assert_shards_migrated)

log.info(f"Deleting pageserver {victim.id}")
env.storage_controller.node_delete_old(victim.id)
if deletion_api == DeletionAPIKind.FORCE:
env.storage_controller.node_delete(victim.id, force=True)
elif deletion_api == DeletionAPIKind.OLD:
env.storage_controller.node_delete_old(victim.id)
else:
raise AssertionError(f"Invalid deletion API: {deletion_api}")

if not while_offline:

@@ -2634,7 +2652,15 @@ def test_storage_controller_node_deletion(
wait_until(assert_victim_evacuated)

# The node should be gone from the list API
assert victim.id not in [n["id"] for n in env.storage_controller.node_list()]
def assert_node_is_gone():
assert victim.id not in [n["id"] for n in env.storage_controller.node_list()]

if deletion_api == DeletionAPIKind.FORCE:
wait_until(assert_node_is_gone)
elif deletion_api == DeletionAPIKind.OLD:
assert_node_is_gone()
else:
raise AssertionError(f"Invalid deletion API: {deletion_api}")

# No tenants should refer to the node in their intent
for tenant_id in tenant_ids:
@@ -2656,7 +2682,11 @@ def test_storage_controller_node_deletion(
env.storage_controller.consistency_check()
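
Note: the deletion_api branching repeated in the tests above can be read as one small dispatch: FORCE goes through the new node_delete(force=True) endpoint, which (judging by the test) returns before the node is actually gone, while OLD is the legacy synchronous node_delete_old. A hypothetical helper, shown only as a sketch (delete_node is not an existing fixture):

def delete_node(storage_controller, node_id: int, deletion_api: DeletionAPIKind) -> None:
    # FORCE appears to be asynchronous (the request may be dropped before
    # completion), OLD is the legacy, synchronous API.
    if deletion_api == DeletionAPIKind.FORCE:
        storage_controller.node_delete(node_id, force=True)
    elif deletion_api == DeletionAPIKind.OLD:
        storage_controller.node_delete_old(node_id)
    else:
        raise AssertionError(f"Invalid deletion API: {deletion_api}")

This is also why the FORCE path polls with wait_until(assert_node_is_gone) while the OLD path asserts immediately.
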


def test_storage_controller_node_delete_cancellation(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize("deletion_api", [DeletionAPIKind.FORCE, DeletionAPIKind.GRACEFUL])
def test_storage_controller_node_delete_cancellation(
neon_env_builder: NeonEnvBuilder,
deletion_api: DeletionAPIKind,
):
neon_env_builder.num_pageservers = 3
neon_env_builder.num_azs = 3
env = neon_env_builder.init_configs()
@@ -2680,12 +2710,16 @@ def test_storage_controller_node_delete_cancellation(neon_env_builder: NeonEnvBu
assert len(nodes) == 3

env.storage_controller.configure_failpoints(("sleepy-delete-loop", "return(10000)"))
env.storage_controller.configure_failpoints(("delete-node-after-reconciles-spawned", "pause"))

ps_id_to_delete = env.pageservers[0].id

env.storage_controller.warm_up_all_secondaries()

assert deletion_api in [DeletionAPIKind.FORCE, DeletionAPIKind.GRACEFUL]
force = deletion_api == DeletionAPIKind.FORCE
env.storage_controller.retryable_node_operation(
lambda ps_id: env.storage_controller.node_delete(ps_id),
lambda ps_id: env.storage_controller.node_delete(ps_id, force),
ps_id_to_delete,
max_attempts=3,
backoff=2,
@@ -2701,6 +2735,8 @@ def test_storage_controller_node_delete_cancellation(neon_env_builder: NeonEnvBu

env.storage_controller.cancel_node_delete(ps_id_to_delete)

env.storage_controller.configure_failpoints(("delete-node-after-reconciles-spawned", "off"))

env.storage_controller.poll_node_status(
ps_id_to_delete,
PageserverAvailability.ACTIVE,
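
Note: the failpoint bracketing above (pause the delete loop, cancel the deletion, then switch the failpoint off) is a recurring pattern in these tests. A hypothetical convenience wrapper around the configure_failpoints call the test already uses, shown only as a sketch:

from contextlib import contextmanager

@contextmanager
def paused_failpoint(storage_controller, name: str):
    # Pause the named failpoint for the duration of the block, then always
    # clear it, even if the body raises.
    storage_controller.configure_failpoints((name, "pause"))
    try:
        yield
    finally:
        storage_controller.configure_failpoints((name, "off"))

Usage would look like: with paused_failpoint(env.storage_controller, "delete-node-after-reconciles-spawned"): issue and cancel the deletion inside the block.
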
@@ -3252,7 +3288,10 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvB
wait_until(reconfigure_node_again)


def test_ps_unavailable_after_delete(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize("deletion_api", [DeletionAPIKind.OLD, DeletionAPIKind.FORCE])
def test_ps_unavailable_after_delete(
neon_env_builder: NeonEnvBuilder, deletion_api: DeletionAPIKind
):
neon_env_builder.num_pageservers = 3

env = neon_env_builder.init_start()
@@ -3265,10 +3304,16 @@ def test_ps_unavailable_after_delete(neon_env_builder: NeonEnvBuilder):
assert_nodes_count(3)

ps = env.pageservers[0]
env.storage_controller.node_delete_old(ps.id)

# After deletion, the node count must be reduced
assert_nodes_count(2)
if deletion_api == DeletionAPIKind.FORCE:
ps.allowed_errors.append(".*request was dropped before completing.*")
env.storage_controller.node_delete(ps.id, force=True)
wait_until(lambda: assert_nodes_count(2))
elif deletion_api == DeletionAPIKind.OLD:
env.storage_controller.node_delete_old(ps.id)
assert_nodes_count(2)
else:
raise AssertionError(f"Invalid deletion API: {deletion_api}")

# Running pageserver CLI init in a separate thread
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
@@ -4814,3 +4859,103 @@ def test_storage_controller_migrate_with_pageserver_restart(
"shards": [{"node_id": int(secondary.id), "shard_number": 0}],
"preferred_az": DEFAULT_AZ_ID,
}


@run_only_on_default_postgres("PG version is not important for this test")
def test_storage_controller_forward_404(neon_env_builder: NeonEnvBuilder):
"""
Ensures that the storage controller correctly forwards 404s and converts some of them
into 503s before forwarding to the client.
"""
neon_env_builder.num_pageservers = 2
neon_env_builder.num_azs = 2

env = neon_env_builder.init_start()
env.storage_controller.allowed_errors.append(".*Reconcile error.*")
env.storage_controller.allowed_errors.append(".*Timed out.*")

env.storage_controller.tenant_policy_update(env.initial_tenant, {"placement": {"Attached": 1}})
env.storage_controller.reconcile_until_idle()

# 404s on tenants and timelines are forwarded as-is when reconciler is not running.

# Access a non-existing timeline -> 404
with pytest.raises(PageserverApiException) as e:
env.storage_controller.pageserver_api().timeline_detail(
env.initial_tenant, TimelineId.generate()
)
assert e.value.status_code == 404
with pytest.raises(PageserverApiException) as e:
env.storage_controller.pageserver_api().timeline_lsn_lease(
env.initial_tenant, TimelineId.generate(), Lsn(0)
)
assert e.value.status_code == 404

# Access a non-existing tenant when reconciler is not running -> 404
with pytest.raises(PageserverApiException) as e:
env.storage_controller.pageserver_api().timeline_detail(
TenantId.generate(), env.initial_timeline
)
assert e.value.status_code == 404
with pytest.raises(PageserverApiException) as e:
env.storage_controller.pageserver_api().timeline_lsn_lease(
TenantId.generate(), env.initial_timeline, Lsn(0)
)
assert e.value.status_code == 404

# Normal requests should succeed
detail = env.storage_controller.pageserver_api().timeline_detail(
env.initial_tenant, env.initial_timeline
)
last_record_lsn = Lsn(detail["last_record_lsn"])
env.storage_controller.pageserver_api().timeline_lsn_lease(
env.initial_tenant, env.initial_timeline, last_record_lsn
)

# Get into a situation where the intent state is not the same as the observed state.
describe = env.storage_controller.tenant_describe(env.initial_tenant)["shards"][0]
current_primary = describe["node_attached"]
current_secondary = describe["node_secondary"][0]
assert current_primary != current_secondary

# Pause the reconciler so that the generation number won't be updated.
env.storage_controller.configure_failpoints(
("reconciler-live-migrate-post-generation-inc", "pause")
)

# Do the migration in another thread; the request will be dropped as we don't wait.
shard_zero = TenantShardId(env.initial_tenant, 0, 0)
concurrent.futures.ThreadPoolExecutor(max_workers=1).submit(
env.storage_controller.tenant_shard_migrate,
shard_zero,
current_secondary,
StorageControllerMigrationConfig(override_scheduler=True),
)
# Not the best way to do this; ideally we would wait until the migration has started.
time.sleep(1)
placement = env.storage_controller.get_tenants_placement()[str(shard_zero)]
assert placement["observed"] != placement["intent"]
assert placement["observed"]["attached"] == current_primary
assert placement["intent"]["attached"] == current_secondary

# Now we issue requests that would cause a 404 again
retry_strategy = Retry(total=0)
adapter = HTTPAdapter(max_retries=retry_strategy)

no_retry_api = env.storage_controller.pageserver_api()
no_retry_api.mount("http://", adapter)
no_retry_api.mount("https://", adapter)

# As intent state != observed state, a tenant-not-found error should return 503,
# so that the client can retry once we've successfully migrated.
with pytest.raises(PageserverApiException) as e:
no_retry_api.timeline_detail(env.initial_tenant, TimelineId.generate())
assert e.value.status_code == 503, f"unexpected status code and error: {e.value}"
with pytest.raises(PageserverApiException) as e:
no_retry_api.timeline_lsn_lease(env.initial_tenant, TimelineId.generate(), Lsn(0))
assert e.value.status_code == 503, f"unexpected status code and error: {e.value}"

# Unblock reconcile operations
env.storage_controller.configure_failpoints(
("reconciler-live-migrate-post-generation-inc", "off")
)
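
Note: the 404-to-503 conversion exists so that a caller can retry while the shard is mid-migration instead of treating the miss as permanent. A minimal sketch of the client-side loop this enables, using plain requests calls (not a fixture from this commit):

import time
import requests

def get_retrying_on_503(url: str, attempts: int = 10, backoff_s: float = 1.0) -> requests.Response:
    # 503 means "the controller knows the shard is moving, try again shortly";
    # any other status (including 404) is returned to the caller as final.
    resp = requests.get(url)
    for _ in range(attempts - 1):
        if resp.status_code != 503:
            break
        time.sleep(backoff_s)
        resp = requests.get(url)
    return resp
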

@@ -76,7 +76,6 @@ def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 3

env = neon_env_builder.init_start()
"""Tests tenants with and without wal acceptors"""
tenant_1, _ = env.create_tenant()
tenant_2, _ = env.create_tenant()


@@ -2788,7 +2788,8 @@ def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder):

# Wait for the error message to appear in the compute log
def error_logged():
return endpoint.log_contains("WAL storage utilization exceeds configured limit") is not None
if endpoint.log_contains("WAL storage utilization exceeds configured limit") is None:
raise Exception("Expected error message not found in compute log yet")

wait_until(error_logged)
log.info("Found expected error message in compute log, resuming.")
@@ -2822,3 +2823,87 @@ def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder):
cur.execute("select count(*) from t")
# 2000 rows from first insert + 1000 from last insert
assert cur.fetchone() == (3000,)


def test_global_disk_usage_limit(neon_env_builder: NeonEnvBuilder):
"""
Similar to `test_timeline_disk_usage_limit`, but tests that the global disk usage circuit breaker
also works as expected. The test scenario:
1. Create a timeline and endpoint.
2. Mock high disk usage via a failpoint.
3. Write data to the timeline so that disk usage exceeds the limit.
4. Verify that the writes hang and the expected error message appears in the compute log.
5. Mock low disk usage via the failpoint.
6. Verify that the hanging writes unblock and we can continue to write as normal.
"""
neon_env_builder.num_safekeepers = 1
remote_storage_kind = s3_storage()
neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind)

env = neon_env_builder.init_start()

env.create_branch("test_global_disk_usage_limit")
endpoint = env.endpoints.create_start("test_global_disk_usage_limit")

with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
cur.execute("create table t2(key int, value text)")

for sk in env.safekeepers:
sk.stop().start(
extra_opts=["--global-disk-check-interval=1s", "--max-global-disk-usage-ratio=0.8"]
)

# Set the failpoint to have the disk usage check return u64::MAX, which definitely exceeds the practical
# limits in the test environment.
for sk in env.safekeepers:
sk.http_client().configure_failpoints(
[("sk-global-disk-usage", "return(18446744073709551615)")]
)

# Wait until the global disk usage limit watcher trips the circuit breaker.
def error_logged_in_sk():
for sk in env.safekeepers:
if sk.log_contains("Global disk usage exceeded limit") is None:
raise Exception("Expected error message not found in safekeeper log yet")

wait_until(error_logged_in_sk)

def run_hanging_insert_global():
with closing(endpoint.connect()) as bg_conn:
with bg_conn.cursor() as bg_cur:
# This should generate more than 1KiB of WAL
bg_cur.execute("insert into t2 select generate_series(1,2000), 'payload'")

bg_thread_global = threading.Thread(target=run_hanging_insert_global)
bg_thread_global.start()

def error_logged_in_compute():
if endpoint.log_contains("Global disk usage exceeded limit") is None:
raise Exception("Expected error message not found in compute log yet")

wait_until(error_logged_in_compute)
log.info("Found the expected error message in compute log, resuming.")

time.sleep(2)
assert bg_thread_global.is_alive(), "Global hanging insert unblocked prematurely!"

# Make the disk usage check always return 0 through the failpoint to simulate the disk pressure easing.
# The SKs should resume accepting WAL writes without restarting.
for sk in env.safekeepers:
sk.http_client().configure_failpoints([("sk-global-disk-usage", "return(0)")])

bg_thread_global.join(timeout=120)
assert not bg_thread_global.is_alive(), "Hanging global insert did not complete after restart"
log.info("Global hanging insert unblocked.")

# Verify that we can continue to write as normal and we don't have obvious data corruption
# following the recovery.
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
cur.execute("insert into t2 select generate_series(2001,3000), 'payload'")

with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
cur.execute("select count(*) from t2")
assert cur.fetchone() == (3000,)
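
Note: the failpoint value used earlier in this test, return(18446744073709551615), is u64::MAX; a quick check of the arithmetic:

# 2**64 - 1 is the largest value a u64 can hold, so the mocked disk usage
# always exceeds whatever limit the safekeepers compute in the test environment.
assert 2**64 - 1 == 18446744073709551615
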

test_runner/sql_regress/expected/neon-spgist.out (new file, 9 lines)
@@ -0,0 +1,9 @@
-- Test unlogged build of SPGIST index (no "Page evicted with zero LSN" error)
create table spgist_point_tbl(id int4, p point);
create index spgist_point_idx on spgist_point_tbl using spgist(p) with (fillfactor = 25);
insert into spgist_point_tbl (id, p) select g, point(g*10, g*10) from generate_series(1, 10000) g;
insert into spgist_point_tbl (id, p) select g, point(g*10, g*10) from generate_series(1, 10000) g;
insert into spgist_point_tbl (id, p) select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g;
vacuum spgist_point_tbl;
insert into spgist_point_tbl (id, p) select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g;
checkpoint;

@@ -9,5 +9,6 @@ test: neon-rel-truncate
test: neon-clog
test: neon-test-utils
test: neon-vacuum-full
test: neon-event-triggers
test: neon-subxacts
test: neon-spgist
test: neon-event-triggers

test_runner/sql_regress/sql/neon-spgist.sql (new file, 10 lines)
@@ -0,0 +1,10 @@
-- Test unlogged build of SPGIST index (no "Page evicted with zero LSN" error)
create table spgist_point_tbl(id int4, p point);
create index spgist_point_idx on spgist_point_tbl using spgist(p) with (fillfactor = 25);
insert into spgist_point_tbl (id, p) select g, point(g*10, g*10) from generate_series(1, 10000) g;
insert into spgist_point_tbl (id, p) select g, point(g*10, g*10) from generate_series(1, 10000) g;
insert into spgist_point_tbl (id, p) select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g;

vacuum spgist_point_tbl;
insert into spgist_point_tbl (id, p) select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g;
checkpoint;