mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-30 03:20:36 +00:00
Merge branch 'main' into communicator-rewrite
This commit is contained in:
@@ -503,6 +503,7 @@ class NeonLocalCli(AbstractNeonCli):
|
||||
pageserver_id: int | None = None,
|
||||
allow_multiple=False,
|
||||
update_catalog: bool = False,
|
||||
privileged_role_name: str | None = None,
|
||||
) -> subprocess.CompletedProcess[str]:
|
||||
args = [
|
||||
"endpoint",
|
||||
@@ -534,6 +535,8 @@ class NeonLocalCli(AbstractNeonCli):
|
||||
args.extend(["--allow-multiple"])
|
||||
if update_catalog:
|
||||
args.extend(["--update-catalog"])
|
||||
if privileged_role_name is not None:
|
||||
args.extend(["--privileged-role-name", privileged_role_name])
|
||||
|
||||
res = self.raw_cli(args)
|
||||
res.check_returncode()
|
||||
|
||||
@@ -728,7 +728,7 @@ class NeonEnvBuilder:
|
||||
# NB: neon_local rewrites postgresql.conf on each start based on neon_local config. No need to patch it.
|
||||
# However, in this new NeonEnv, the pageservers and safekeepers listen on different ports, and the storage
|
||||
# controller will currently reject re-attach requests from them because the NodeMetadata isn't identical.
|
||||
# So, from_repo_dir patches up the the storcon database.
|
||||
# So, from_repo_dir patches up the storcon database.
|
||||
patch_script_path = self.repo_dir / "storage_controller_db.startup.sql"
|
||||
assert not patch_script_path.exists()
|
||||
patch_script = ""
|
||||
@@ -4324,6 +4324,7 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
pageserver_id: int | None = None,
|
||||
allow_multiple: bool = False,
|
||||
update_catalog: bool = False,
|
||||
privileged_role_name: str | None = None,
|
||||
) -> Self:
|
||||
"""
|
||||
Create a new Postgres endpoint.
|
||||
@@ -4351,6 +4352,7 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
pageserver_id=pageserver_id,
|
||||
allow_multiple=allow_multiple,
|
||||
update_catalog=update_catalog,
|
||||
privileged_role_name=privileged_role_name,
|
||||
)
|
||||
path = Path("endpoints") / self.endpoint_id / "pgdata"
|
||||
self.pgdata_dir = self.env.repo_dir / path
|
||||
@@ -4811,6 +4813,7 @@ class EndpointFactory:
|
||||
config_lines: list[str] | None = None,
|
||||
pageserver_id: int | None = None,
|
||||
update_catalog: bool = False,
|
||||
privileged_role_name: str | None = None,
|
||||
) -> Endpoint:
|
||||
ep = Endpoint(
|
||||
self.env,
|
||||
@@ -4834,6 +4837,7 @@ class EndpointFactory:
|
||||
config_lines=config_lines,
|
||||
pageserver_id=pageserver_id,
|
||||
update_catalog=update_catalog,
|
||||
privileged_role_name=privileged_role_name,
|
||||
)
|
||||
|
||||
def stop_all(self, fail_on_error=True) -> Self:
|
||||
|
||||
@@ -60,7 +60,7 @@ def test_compare_prewarmed_pgbench_perf(neon_compare: NeonCompare):
|
||||
|
||||
|
||||
@pytest.mark.remote_cluster
|
||||
@pytest.mark.timeout(30 * 60)
|
||||
@pytest.mark.timeout(2 * 60 * 60)
|
||||
def test_compare_prewarmed_pgbench_perf_benchmark(
|
||||
pg_bin: PgBin,
|
||||
neon_api: NeonAPI,
|
||||
@@ -91,8 +91,9 @@ def benchmark_impl(
|
||||
test_duration_min = 5
|
||||
pgbench_duration = f"-T{test_duration_min * 60}"
|
||||
# prewarm API is not publicly exposed. In order to test performance of a
|
||||
# fully prewarmed endpoint, wait after it restarts
|
||||
prewarmed_sleep_secs = 30
|
||||
# fully prewarmed endpoint, wait after it restarts.
|
||||
# The number here is empirical, based on manual runs on staging
|
||||
prewarmed_sleep_secs = 180
|
||||
|
||||
branch_id = project["branch"]["id"]
|
||||
project_id = project["project"]["id"]
|
||||
|
||||
@@ -73,6 +73,11 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
|
||||
".*Local notification hook failed.*",
|
||||
".*Marking shard.*for notification retry.*",
|
||||
".*Failed to notify compute.*",
|
||||
# As an optimization, the storage controller kicks the downloads on the secondary
|
||||
# after the shard split. However, secondaries are created async, so it's possible
|
||||
# that the intent state was modified, but the actual secondary hasn't been created,
|
||||
# which results in an error.
|
||||
".*Error calling secondary download after shard split.*",
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
@@ -24,10 +24,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
|
||||
[
|
||||
".*get_values_reconstruct_data for layer .*",
|
||||
".*could not find data for key.*",
|
||||
".*is not active. Current state: Broken.*",
|
||||
".*will not become active. Current state: Broken.*",
|
||||
".*failed to load metadata.*",
|
||||
".*load failed.*load local timeline.*",
|
||||
".*: layer load failed, assuming permanent failure:.*",
|
||||
".*failed to get checkpoint bytes.*",
|
||||
".*failed to get control bytes.*",
|
||||
|
||||
@@ -687,7 +687,7 @@ def test_sharding_compaction(
|
||||
for _i in range(0, 10):
|
||||
# Each of these does some writes then a checkpoint: because we set image_creation_threshold to 1,
|
||||
# these should result in image layers each time we write some data into a shard, and also shards
|
||||
# recieving less data hitting their "empty image layer" path (wherre they should skip writing the layer,
|
||||
# receiving less data hitting their "empty image layer" path (where they should skip writing the layer,
|
||||
# rather than asserting)
|
||||
workload.churn_rows(64)
|
||||
|
||||
|
||||
@@ -217,7 +217,7 @@ if SQL_EXPORTER is None:
|
||||
self, logs_dir: Path, config_file: Path, collector_file: Path, port: int
|
||||
) -> None:
|
||||
# NOTE: Keep the version the same as in
|
||||
# compute/compute-node.Dockerfile and build-tools.Dockerfile.
|
||||
# compute/compute-node.Dockerfile and build-tools/Dockerfile.
|
||||
#
|
||||
# The "host" network mode allows sql_exporter to talk to the
|
||||
# endpoint which is running on the host.
|
||||
|
||||
@@ -103,3 +103,90 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
|
||||
query = "DROP SUBSCRIPTION sub CASCADE"
|
||||
log.info(f"Dropping subscription: {query}")
|
||||
cur.execute(query)
|
||||
|
||||
|
||||
def test_privileged_role_override(neon_simple_env: NeonEnv, pg_version: PgVersion):
|
||||
"""
|
||||
Test that we can override the privileged role for an endpoint and when we do it,
|
||||
everything is correctly bootstrapped inside Postgres and we don't have neon_superuser
|
||||
role in the database.
|
||||
"""
|
||||
PRIVILEGED_ROLE_NAME = "my_superuser"
|
||||
|
||||
env = neon_simple_env
|
||||
env.create_branch("test_privileged_role_override")
|
||||
ep = env.endpoints.create(
|
||||
"test_privileged_role_override",
|
||||
privileged_role_name=PRIVILEGED_ROLE_NAME,
|
||||
update_catalog=True,
|
||||
)
|
||||
|
||||
ep.start()
|
||||
|
||||
ep.wait_for_migrations()
|
||||
|
||||
member_roles = [
|
||||
"pg_read_all_data",
|
||||
"pg_write_all_data",
|
||||
"pg_monitor",
|
||||
"pg_signal_backend",
|
||||
]
|
||||
|
||||
non_member_roles = [
|
||||
"pg_execute_server_program",
|
||||
"pg_read_server_files",
|
||||
"pg_write_server_files",
|
||||
]
|
||||
|
||||
role_attributes = {
|
||||
"rolsuper": False,
|
||||
"rolinherit": True,
|
||||
"rolcreaterole": True,
|
||||
"rolcreatedb": True,
|
||||
"rolcanlogin": False,
|
||||
"rolreplication": True,
|
||||
"rolconnlimit": -1,
|
||||
"rolbypassrls": True,
|
||||
}
|
||||
|
||||
if pg_version >= PgVersion.V15:
|
||||
non_member_roles.append("pg_checkpoint")
|
||||
|
||||
if pg_version >= PgVersion.V16:
|
||||
member_roles.append("pg_create_subscription")
|
||||
non_member_roles.append("pg_use_reserved_connections")
|
||||
|
||||
with ep.cursor() as cur:
|
||||
cur.execute(f"SELECT rolname FROM pg_roles WHERE rolname = '{PRIVILEGED_ROLE_NAME}'")
|
||||
assert cur.fetchall()[0][0] == PRIVILEGED_ROLE_NAME
|
||||
|
||||
cur.execute("SELECT rolname FROM pg_roles WHERE rolname = 'neon_superuser'")
|
||||
assert len(cur.fetchall()) == 0
|
||||
|
||||
cur.execute("SHOW neon.privileged_role_name")
|
||||
assert cur.fetchall()[0][0] == PRIVILEGED_ROLE_NAME
|
||||
|
||||
# check PRIVILEGED_ROLE_NAME role is created
|
||||
cur.execute(f"select * from pg_roles where rolname = '{PRIVILEGED_ROLE_NAME}'")
|
||||
assert cur.fetchone() is not None
|
||||
|
||||
# check PRIVILEGED_ROLE_NAME role has the correct member roles
|
||||
for role in member_roles:
|
||||
cur.execute(f"SELECT pg_has_role('{PRIVILEGED_ROLE_NAME}', '{role}', 'member')")
|
||||
assert cur.fetchone() == (True,), (
|
||||
f"Role {role} should be a member of {PRIVILEGED_ROLE_NAME}"
|
||||
)
|
||||
|
||||
for role in non_member_roles:
|
||||
cur.execute(f"SELECT pg_has_role('{PRIVILEGED_ROLE_NAME}', '{role}', 'member')")
|
||||
assert cur.fetchone() == (False,), (
|
||||
f"Role {role} should not be a member of {PRIVILEGED_ROLE_NAME}"
|
||||
)
|
||||
|
||||
# check PRIVILEGED_ROLE_NAME role has the correct role attributes
|
||||
for attr, val in role_attributes.items():
|
||||
cur.execute(f"SELECT {attr} FROM pg_roles WHERE rolname = '{PRIVILEGED_ROLE_NAME}'")
|
||||
curr_val = cur.fetchone()
|
||||
assert curr_val == (val,), (
|
||||
f"Role attribute {attr} should be {val} instead of {curr_val}"
|
||||
)
|
||||
|
||||
@@ -76,7 +76,6 @@ def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
"""Tests tenants with and without wal acceptors"""
|
||||
tenant_1, _ = env.create_tenant()
|
||||
tenant_2, _ = env.create_tenant()
|
||||
|
||||
|
||||
@@ -2788,7 +2788,8 @@ def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
# Wait for the error message to appear in the compute log
|
||||
def error_logged():
|
||||
return endpoint.log_contains("WAL storage utilization exceeds configured limit") is not None
|
||||
if endpoint.log_contains("WAL storage utilization exceeds configured limit") is None:
|
||||
raise Exception("Expected error message not found in compute log yet")
|
||||
|
||||
wait_until(error_logged)
|
||||
log.info("Found expected error message in compute log, resuming.")
|
||||
@@ -2822,3 +2823,87 @@ def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder):
|
||||
cur.execute("select count(*) from t")
|
||||
# 2000 rows from first insert + 1000 from last insert
|
||||
assert cur.fetchone() == (3000,)
|
||||
|
||||
|
||||
def test_global_disk_usage_limit(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Similar to `test_timeline_disk_usage_limit`, but test that the global disk usage circuit breaker
|
||||
also works as expected. The test scenario:
|
||||
1. Create a timeline and endpoint.
|
||||
2. Mock high disk usage via failpoint
|
||||
3. Write data to the timeline so that disk usage exceeds the limit.
|
||||
4. Verify that the writes hang and the expected error message appears in the compute log.
|
||||
5. Mock low disk usage via failpoint
|
||||
6. Verify that the hanging writes unblock and we can continue to write as normal.
|
||||
"""
|
||||
neon_env_builder.num_safekeepers = 1
|
||||
remote_storage_kind = s3_storage()
|
||||
neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
env.create_branch("test_global_disk_usage_limit")
|
||||
endpoint = env.endpoints.create_start("test_global_disk_usage_limit")
|
||||
|
||||
with closing(endpoint.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("create table t2(key int, value text)")
|
||||
|
||||
for sk in env.safekeepers:
|
||||
sk.stop().start(
|
||||
extra_opts=["--global-disk-check-interval=1s", "--max-global-disk-usage-ratio=0.8"]
|
||||
)
|
||||
|
||||
# Set the failpoint to have the disk usage check return u64::MAX, which definitely exceeds the practical
|
||||
# limits in the test environment.
|
||||
for sk in env.safekeepers:
|
||||
sk.http_client().configure_failpoints(
|
||||
[("sk-global-disk-usage", "return(18446744073709551615)")]
|
||||
)
|
||||
|
||||
# Wait until the global disk usage limit watcher trips the circuit breaker.
|
||||
def error_logged_in_sk():
|
||||
for sk in env.safekeepers:
|
||||
if sk.log_contains("Global disk usage exceeded limit") is None:
|
||||
raise Exception("Expected error message not found in safekeeper log yet")
|
||||
|
||||
wait_until(error_logged_in_sk)
|
||||
|
||||
def run_hanging_insert_global():
|
||||
with closing(endpoint.connect()) as bg_conn:
|
||||
with bg_conn.cursor() as bg_cur:
|
||||
# This should generate more than 1KiB of WAL
|
||||
bg_cur.execute("insert into t2 select generate_series(1,2000), 'payload'")
|
||||
|
||||
bg_thread_global = threading.Thread(target=run_hanging_insert_global)
|
||||
bg_thread_global.start()
|
||||
|
||||
def error_logged_in_compute():
|
||||
if endpoint.log_contains("Global disk usage exceeded limit") is None:
|
||||
raise Exception("Expected error message not found in compute log yet")
|
||||
|
||||
wait_until(error_logged_in_compute)
|
||||
log.info("Found the expected error message in compute log, resuming.")
|
||||
|
||||
time.sleep(2)
|
||||
assert bg_thread_global.is_alive(), "Global hanging insert unblocked prematurely!"
|
||||
|
||||
# Make the disk usage check always return 0 through the failpoint to simulate the disk pressure easing.
|
||||
# The SKs should resume accepting WAL writes without restarting.
|
||||
for sk in env.safekeepers:
|
||||
sk.http_client().configure_failpoints([("sk-global-disk-usage", "return(0)")])
|
||||
|
||||
bg_thread_global.join(timeout=120)
|
||||
assert not bg_thread_global.is_alive(), "Hanging global insert did not complete after restart"
|
||||
log.info("Global hanging insert unblocked.")
|
||||
|
||||
# Verify that we can continue to write as normal and we don't have obvious data corruption
|
||||
# following the recovery.
|
||||
with closing(endpoint.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("insert into t2 select generate_series(2001,3000), 'payload'")
|
||||
|
||||
with closing(endpoint.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("select count(*) from t2")
|
||||
assert cur.fetchone() == (3000,)
|
||||
|
||||
@@ -3,6 +3,7 @@ from __future__ import annotations
|
||||
import sys
|
||||
import tarfile
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
@@ -198,3 +199,115 @@ def test_wal_restore_http(neon_env_builder: NeonEnvBuilder, broken_tenant: bool)
|
||||
# the table is back now!
|
||||
restored = env.endpoints.create_start("main")
|
||||
assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)]
|
||||
|
||||
|
||||
# BEGIN_HADRON
|
||||
# TODO: re-enable once CM python is integreated.
|
||||
# def clear_directory(directory):
|
||||
# for item in os.listdir(directory):
|
||||
# item_path = os.path.join(directory, item)
|
||||
# if os.path.isdir(item_path):
|
||||
# log.info(f"removing SK directory: {item_path}")
|
||||
# shutil.rmtree(item_path)
|
||||
# else:
|
||||
# log.info(f"removing SK file: {item_path}")
|
||||
# os.remove(item_path)
|
||||
|
||||
|
||||
# def test_sk_pull_timelines(
|
||||
# neon_env_builder: NeonEnvBuilder,
|
||||
# ):
|
||||
# DBNAME = "regression"
|
||||
# superuser_name = "databricks_superuser"
|
||||
# neon_env_builder.num_safekeepers = 3
|
||||
# neon_env_builder.num_pageservers = 4
|
||||
# neon_env_builder.safekeeper_extra_opts = ["--enable-pull-timeline-on-startup"]
|
||||
# neon_env_builder.enable_safekeeper_remote_storage(s3_storage())
|
||||
|
||||
# env = neon_env_builder.init_start(initial_tenant_shard_count=4)
|
||||
|
||||
# env.compute_manager.start(base_port=env.compute_manager_port)
|
||||
|
||||
# test_creator = "test_creator"
|
||||
# test_metastore_id = uuid4()
|
||||
# test_account_id = uuid4()
|
||||
# test_workspace_id = 1
|
||||
# test_workspace_url = "http://test_workspace_url"
|
||||
# test_metadata_version = 1
|
||||
# test_metadata = {
|
||||
# "state": "INSTANCE_PROVISIONING",
|
||||
# "admin_rolename": "admin",
|
||||
# "admin_password_scram": "abc123456",
|
||||
# }
|
||||
|
||||
# test_instance_name_1 = "test_instance_1"
|
||||
# test_instance_read_write_compute_pool_1 = {
|
||||
# "instance_name": test_instance_name_1,
|
||||
# "compute_pool_name": "compute_pool_1",
|
||||
# "creator": test_creator,
|
||||
# "capacity": 2.0,
|
||||
# "node_count": 1,
|
||||
# "metadata_version": 0,
|
||||
# "metadata": {
|
||||
# "state": "INSTANCE_PROVISIONING",
|
||||
# },
|
||||
# }
|
||||
|
||||
# test_instance_1_readable_secondaries_enabled = False
|
||||
|
||||
# # Test creation
|
||||
# create_instance_with_retries(
|
||||
# env,
|
||||
# test_instance_name_1,
|
||||
# test_creator,
|
||||
# test_metastore_id,
|
||||
# test_account_id,
|
||||
# test_workspace_id,
|
||||
# test_workspace_url,
|
||||
# test_instance_read_write_compute_pool_1,
|
||||
# test_metadata_version,
|
||||
# test_metadata,
|
||||
# test_instance_1_readable_secondaries_enabled,
|
||||
# )
|
||||
# instance = env.compute_manager.get_instance_by_name(test_instance_name_1, test_workspace_id)
|
||||
# log.info(f"haoyu Instance created: {instance}")
|
||||
# assert instance["instance_name"] == test_instance_name_1
|
||||
# test_instance_id = instance["instance_id"]
|
||||
# instance_detail = env.compute_manager.describe_instance(test_instance_id)
|
||||
# log.info(f"haoyu Instance detail: {instance_detail}")
|
||||
|
||||
# env.initial_tenant = instance_detail[0]["tenant_id"]
|
||||
# env.initial_timeline = instance_detail[0]["timeline_id"]
|
||||
|
||||
# # Connect to postgres and create a database called "regression".
|
||||
# endpoint = env.endpoints.create_start("main")
|
||||
# endpoint.safe_psql(f"CREATE ROLE {superuser_name}")
|
||||
# endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")
|
||||
|
||||
# endpoint.safe_psql("CREATE TABLE usertable ( YCSB_KEY INT, FIELD0 TEXT);")
|
||||
# # Write some data. ~20 MB.
|
||||
# num_rows = 0
|
||||
# for _i in range(0, 20000):
|
||||
# endpoint.safe_psql(
|
||||
# "INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False
|
||||
# )
|
||||
# num_rows += 1
|
||||
|
||||
# log.info(f"SKs {env.storage_controller.hcc_sk_node_list()}")
|
||||
|
||||
# env.safekeepers[0].stop(immediate=True)
|
||||
# clear_directory(env.safekeepers[0].data_dir)
|
||||
# env.safekeepers[0].start()
|
||||
|
||||
# # PG can still write data. ~20 MB.
|
||||
# for _i in range(0, 20000):
|
||||
# endpoint.safe_psql(
|
||||
# "INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False
|
||||
# )
|
||||
# num_rows += 1
|
||||
|
||||
# tuples = endpoint.safe_psql("SELECT COUNT(*) FROM usertable;")
|
||||
# assert tuples[0][0] == num_rows
|
||||
# endpoint.stop_and_destroy()
|
||||
|
||||
# END_HADRON
|
||||
|
||||
Reference in New Issue
Block a user