Merge branch 'main' into devin/1745492468-add-dev-flag-pr11517

This commit is contained in:
John Spray
2025-06-05 13:07:03 +01:00
committed by GitHub
92 changed files with 2134 additions and 938 deletions

View File

@@ -357,31 +357,6 @@ class PgProtocol:
return TimelineId(cast("str", self.safe_psql("show neon.timeline_id")[0][0]))
class PageserverWalReceiverProtocol(StrEnum):
VANILLA = "vanilla"
INTERPRETED = "interpreted"
@staticmethod
def to_config_key_value(proto) -> tuple[str, dict[str, Any]]:
if proto == PageserverWalReceiverProtocol.VANILLA:
return (
"wal_receiver_protocol",
{
"type": "vanilla",
},
)
elif proto == PageserverWalReceiverProtocol.INTERPRETED:
return (
"wal_receiver_protocol",
{
"type": "interpreted",
"args": {"format": "protobuf", "compression": {"zstd": {"level": 1}}},
},
)
else:
raise ValueError(f"Unknown protocol type: {proto}")
@dataclass
class PageserverTracingConfig:
sampling_ratio: tuple[int, int]
@@ -423,6 +398,7 @@ class PageserverImportConfig:
"import_job_concurrency": self.import_job_concurrency,
"import_job_soft_size_limit": self.import_job_soft_size_limit,
"import_job_checkpoint_threshold": self.import_job_checkpoint_threshold,
"import_job_max_byte_range_size": 4 * 1024 * 1024, # Pageserver default
}
return ("timeline_import_config", value)
@@ -474,7 +450,6 @@ class NeonEnvBuilder:
safekeeper_extra_opts: list[str] | None = None,
storage_controller_port_override: int | None = None,
pageserver_virtual_file_io_mode: str | None = None,
pageserver_wal_receiver_protocol: PageserverWalReceiverProtocol | None = None,
pageserver_get_vectored_concurrent_io: str | None = None,
pageserver_tracing_config: PageserverTracingConfig | None = None,
pageserver_import_config: PageserverImportConfig | None = None,
@@ -551,11 +526,6 @@ class NeonEnvBuilder:
self.pageserver_virtual_file_io_mode = pageserver_virtual_file_io_mode
if pageserver_wal_receiver_protocol is not None:
self.pageserver_wal_receiver_protocol = pageserver_wal_receiver_protocol
else:
self.pageserver_wal_receiver_protocol = PageserverWalReceiverProtocol.INTERPRETED
assert test_name.startswith("test_"), (
"Unexpectedly instantiated from outside a test function"
)
@@ -1201,7 +1171,6 @@ class NeonEnv:
self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine
self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode
self.pageserver_wal_receiver_protocol = config.pageserver_wal_receiver_protocol
self.pageserver_get_vectored_concurrent_io = config.pageserver_get_vectored_concurrent_io
self.pageserver_tracing_config = config.pageserver_tracing_config
if config.pageserver_import_config is None:
@@ -1333,13 +1302,6 @@ class NeonEnv:
for key, value in override.items():
ps_cfg[key] = value
if self.pageserver_wal_receiver_protocol is not None:
key, value = PageserverWalReceiverProtocol.to_config_key_value(
self.pageserver_wal_receiver_protocol
)
if key not in ps_cfg:
ps_cfg[key] = value
if self.pageserver_tracing_config is not None:
key, value = self.pageserver_tracing_config.to_config_key_value()
@@ -4710,7 +4672,7 @@ class EndpointFactory:
origin: Endpoint,
endpoint_id: str | None = None,
config_lines: list[str] | None = None,
):
) -> Endpoint:
branch_name = origin.branch_name
assert origin in self.endpoints
assert branch_name is not None
@@ -4729,7 +4691,7 @@ class EndpointFactory:
origin: Endpoint,
endpoint_id: str | None = None,
config_lines: list[str] | None = None,
):
) -> Endpoint:
branch_name = origin.branch_name
assert origin in self.endpoints
assert branch_name is not None

View File

@@ -15,19 +15,10 @@ from fixtures.neon_fixtures import (
@pytest.mark.timeout(1200)
@pytest.mark.parametrize("shard_count", [1, 8, 32])
@pytest.mark.parametrize(
"wal_receiver_protocol",
[
"vanilla",
"interpreted-bincode-compressed",
"interpreted-protobuf-compressed",
],
)
def test_sharded_ingest(
neon_env_builder: NeonEnvBuilder,
zenbenchmark: NeonBenchmarker,
shard_count: int,
wal_receiver_protocol: str,
):
"""
Benchmarks sharded ingestion throughput, by ingesting a large amount of WAL into a Safekeeper
@@ -39,36 +30,6 @@ def test_sharded_ingest(
neon_env_builder.num_pageservers = shard_count
env = neon_env_builder.init_configs()
for ps in env.pageservers:
if wal_receiver_protocol == "vanilla":
ps.patch_config_toml_nonrecursive(
{
"wal_receiver_protocol": {
"type": "vanilla",
}
}
)
elif wal_receiver_protocol == "interpreted-bincode-compressed":
ps.patch_config_toml_nonrecursive(
{
"wal_receiver_protocol": {
"type": "interpreted",
"args": {"format": "bincode", "compression": {"zstd": {"level": 1}}},
}
}
)
elif wal_receiver_protocol == "interpreted-protobuf-compressed":
ps.patch_config_toml_nonrecursive(
{
"wal_receiver_protocol": {
"type": "interpreted",
"args": {"format": "protobuf", "compression": {"zstd": {"level": 1}}},
}
}
)
else:
raise AssertionError("Test must use explicit wal receiver protocol config")
env.start()
# Create a sharded tenant and timeline, and migrate it to the respective pageservers. Ensure

View File

@@ -182,10 +182,6 @@ def test_fully_custom_config(positive_env: NeonEnv):
"lsn_lease_length": "1m",
"lsn_lease_length_for_ts": "5s",
"timeline_offloading": False,
"wal_receiver_protocol_override": {
"type": "interpreted",
"args": {"format": "bincode", "compression": {"zstd": {"level": 1}}},
},
"rel_size_v2_enabled": True,
"relsize_snapshot_cache_capacity": 10000,
"gc_compaction_enabled": True,

View File

@@ -10,7 +10,6 @@ import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
PageserverWalReceiverProtocol,
generate_uploads_and_deletions,
)
from fixtures.pageserver.http import PageserverApiException
@@ -68,14 +67,9 @@ PREEMPT_GC_COMPACTION_TENANT_CONF = {
@skip_in_debug_build("only run with release build")
@pytest.mark.parametrize(
"wal_receiver_protocol",
[PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED],
)
@pytest.mark.timeout(900)
def test_pageserver_compaction_smoke(
neon_env_builder: NeonEnvBuilder,
wal_receiver_protocol: PageserverWalReceiverProtocol,
):
"""
This is a smoke test that compaction kicks in. The workload repeatedly churns
@@ -85,8 +79,6 @@ def test_pageserver_compaction_smoke(
observed bounds.
"""
neon_env_builder.pageserver_wal_receiver_protocol = wal_receiver_protocol
# Effectively disable the page cache to rely only on image layers
# to shorten reads.
neon_env_builder.pageserver_config_override = """

View File

@@ -466,8 +466,13 @@ def test_perf_counters(neon_simple_env: NeonEnv):
#
# 1.5 is the minimum version to contain these views.
cur.execute("CREATE EXTENSION neon VERSION '1.5'")
cur.execute("set neon.monitor_query_exec_time = on")
cur.execute("SELECT * FROM neon_perf_counters")
cur.execute("SELECT * FROM neon_backend_perf_counters")
cur.execute(
"select value from neon_backend_perf_counters where metric='query_time_seconds_count' and pid=pg_backend_pid()"
)
assert cur.fetchall()[0][0] == 2
def collect_metric(

View File

@@ -1,9 +1,13 @@
from __future__ import annotations
from typing import TYPE_CHECKING
import pytest
from fixtures.log_helper import log
from fixtures.neon_cli import WalCraft
from fixtures.neon_fixtures import NeonEnvBuilder, PageserverWalReceiverProtocol
if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnvBuilder
# Restart nodes with WAL end having specially crafted shape, like last record
# crossing segment boundary, to test decoding issues.
@@ -19,17 +23,10 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PageserverWalReceiverProtocol
"wal_record_crossing_segment_followed_by_small_one",
],
)
@pytest.mark.parametrize(
"wal_receiver_protocol",
[PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED],
)
def test_crafted_wal_end(
neon_env_builder: NeonEnvBuilder,
wal_type: str,
wal_receiver_protocol: PageserverWalReceiverProtocol,
):
neon_env_builder.pageserver_wal_receiver_protocol = wal_receiver_protocol
env = neon_env_builder.init_start()
env.create_branch("test_crafted_wal_end")
env.pageserver.allowed_errors.extend(

View File

@@ -159,7 +159,8 @@ def test_remote_extensions(
# Setup a mock nginx S3 gateway which will return our test extension.
(host, port) = httpserver_listen_address
extensions_endpoint = f"http://{host}:{port}/pg-ext-s3-gateway"
remote_ext_base_url = f"http://{host}:{port}/pg-ext-s3-gateway"
log.info(f"remote extensions base URL: {remote_ext_base_url}")
extension.build(pg_config, test_output_dir)
tarball = extension.package(test_output_dir)
@@ -221,7 +222,7 @@ def test_remote_extensions(
endpoint.create_remote_extension_spec(spec)
endpoint.start(remote_ext_base_url=extensions_endpoint)
endpoint.start(remote_ext_base_url=remote_ext_base_url)
with endpoint.connect() as conn:
with conn.cursor() as cur:
@@ -249,7 +250,7 @@ def test_remote_extensions(
# Remove the extension files to force a redownload of the extension.
extension.remove(test_output_dir, pg_version)
endpoint.start(remote_ext_base_url=extensions_endpoint)
endpoint.start(remote_ext_base_url=remote_ext_base_url)
# Test that ALTER EXTENSION UPDATE statements also fetch remote extensions.
with endpoint.connect() as conn:

View File

@@ -74,8 +74,9 @@ def test_hot_standby(neon_simple_env: NeonEnv):
for query in queries:
with s_con.cursor() as secondary_cursor:
secondary_cursor.execute(query)
response = secondary_cursor.fetchone()
assert response is not None
res = secondary_cursor.fetchone()
assert res is not None
response = res
assert response == responses[query]
# Check for corrupted WAL messages which might otherwise go unnoticed if
@@ -164,7 +165,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
s_cur.execute("SELECT COUNT(*) FROM test")
res = s_cur.fetchone()
assert res[0] == 10000
assert res == (10000,)
# Clear the cache in the standby, so that when we
# re-execute the query, it will make GetPage
@@ -195,7 +196,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
s_cur.execute("SELECT COUNT(*) FROM test")
log_replica_lag(primary, secondary)
res = s_cur.fetchone()
assert res[0] == 10000
assert res == (10000,)
def run_pgbench(connstr: str, pg_bin: PgBin):

View File

@@ -0,0 +1,133 @@
"""
File with secondary->primary promotion testing.
This far, only contains a test that we don't break and that the data is persisted.
"""
import psycopg2
from fixtures.log_helper import log
from fixtures.neon_fixtures import Endpoint, NeonEnv, wait_replica_caughtup
from fixtures.pg_version import PgVersion
from pytest import raises
def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
"""
Test that a replica safely promotes, and can commit data updates which
show up when the primary boots up after the promoted secondary endpoint
shut down.
"""
# Initialize the primary, a test table, and a helper function to create lots
# of subtransactions.
env: NeonEnv = neon_simple_env
primary: Endpoint = env.endpoints.create_start(branch_name="main", endpoint_id="primary")
secondary: Endpoint = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary")
with primary.connect() as primary_conn:
primary_cur = primary_conn.cursor()
primary_cur.execute(
"create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)"
)
primary_cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)")
primary_cur.execute(
"""
SELECT pg_current_wal_insert_lsn(),
pg_current_wal_lsn(),
pg_current_wal_flush_lsn()
"""
)
log.info(f"Primary: Current LSN after workload is {primary_cur.fetchone()}")
primary_cur.execute("show neon.safekeepers")
safekeepers = primary_cur.fetchall()[0][0]
wait_replica_caughtup(primary, secondary)
with secondary.connect() as secondary_conn:
secondary_cur = secondary_conn.cursor()
secondary_cur.execute("select count(*) from t")
assert secondary_cur.fetchone() == (100,)
with raises(psycopg2.Error):
secondary_cur.execute("INSERT INTO t (payload) SELECT generate_series(101, 200)")
secondary_conn.commit()
secondary_conn.rollback()
secondary_cur.execute("select count(*) from t")
assert secondary_cur.fetchone() == (100,)
primary.stop_and_destroy(mode="immediate")
# Reconnect to the secondary to make sure we get a read-write connection
promo_conn = secondary.connect()
promo_cur = promo_conn.cursor()
promo_cur.execute(f"alter system set neon.safekeepers='{safekeepers}'")
promo_cur.execute("select pg_reload_conf()")
promo_cur.execute("SELECT * FROM pg_promote()")
assert promo_cur.fetchone() == (True,)
promo_cur.execute(
"""
SELECT pg_current_wal_insert_lsn(),
pg_current_wal_lsn(),
pg_current_wal_flush_lsn()
"""
)
log.info(f"Secondary: LSN after promotion is {promo_cur.fetchone()}")
# Reconnect to the secondary to make sure we get a read-write connection
with secondary.connect() as new_primary_conn:
new_primary_cur = new_primary_conn.cursor()
new_primary_cur.execute("select count(*) from t")
assert new_primary_cur.fetchone() == (100,)
new_primary_cur.execute(
"INSERT INTO t (payload) SELECT generate_series(101, 200) RETURNING payload"
)
assert new_primary_cur.fetchall() == [(it,) for it in range(101, 201)]
new_primary_cur = new_primary_conn.cursor()
new_primary_cur.execute("select payload from t")
assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)]
new_primary_cur.execute("select count(*) from t")
assert new_primary_cur.fetchone() == (200,)
new_primary_cur.execute(
"""
SELECT pg_current_wal_insert_lsn(),
pg_current_wal_lsn(),
pg_current_wal_flush_lsn()
"""
)
log.info(f"Secondary: LSN after workload is {new_primary_cur.fetchone()}")
with secondary.connect() as second_viewpoint_conn:
new_primary_cur = second_viewpoint_conn.cursor()
new_primary_cur.execute("select payload from t")
assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)]
# wait_for_last_flush_lsn(env, secondary, env.initial_tenant, env.initial_timeline)
secondary.stop_and_destroy()
primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary")
with primary.connect() as new_primary:
new_primary_cur = new_primary.cursor()
new_primary_cur.execute(
"""
SELECT pg_current_wal_insert_lsn(),
pg_current_wal_lsn(),
pg_current_wal_flush_lsn()
"""
)
log.info(f"New primary: Boot LSN is {new_primary_cur.fetchone()}")
new_primary_cur.execute("select count(*) from t")
assert new_primary_cur.fetchone() == (200,)
new_primary_cur.execute("INSERT INTO t (payload) SELECT generate_series(201, 300)")
new_primary_cur.execute("select count(*) from t")
assert new_primary_cur.fetchone() == (300,)
primary.stop(mode="immediate")

View File

@@ -1,9 +1,7 @@
from __future__ import annotations
import pytest
from fixtures.neon_fixtures import (
NeonEnvBuilder,
PageserverWalReceiverProtocol,
check_restored_datadir_content,
)
@@ -14,13 +12,7 @@ from fixtures.neon_fixtures import (
# maintained in the pageserver, so subtransactions are not very exciting for
# Neon. They are included in the commit record though and updated in the
# CLOG.
@pytest.mark.parametrize(
"wal_receiver_protocol",
[PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED],
)
def test_subxacts(neon_env_builder: NeonEnvBuilder, test_output_dir, wal_receiver_protocol):
neon_env_builder.pageserver_wal_receiver_protocol = wal_receiver_protocol
def test_subxacts(neon_env_builder: NeonEnvBuilder, test_output_dir):
env = neon_env_builder.init_start()
endpoint = env.endpoints.create_start("main")

View File

@@ -348,7 +348,6 @@ def test_tenant_config_patch(neon_env_builder: NeonEnvBuilder, ps_managed_by: st
def assert_tenant_conf_semantically_equal(lhs, rhs):
"""
Storcon returns None for fields that are not set while the pageserver does not.
Compare two tenant's config overrides semantically, by dropping the None values.
"""
lhs = {k: v for k, v in lhs.items() if v is not None}
@@ -375,10 +374,7 @@ def test_tenant_config_patch(neon_env_builder: NeonEnvBuilder, ps_managed_by: st
patch: dict[str, Any | None] = {
"gc_period": "3h",
"wal_receiver_protocol_override": {
"type": "interpreted",
"args": {"format": "bincode", "compression": {"zstd": {"level": 1}}},
},
"gc_compaction_ratio_percent": 10,
}
api.patch_tenant_config(env.initial_tenant, patch)
tenant_conf_after_patch = api.tenant_config(env.initial_tenant).tenant_specific_overrides
@@ -391,7 +387,7 @@ def test_tenant_config_patch(neon_env_builder: NeonEnvBuilder, ps_managed_by: st
assert_tenant_conf_semantically_equal(tenant_conf_after_patch, crnt_tenant_conf | patch)
crnt_tenant_conf = tenant_conf_after_patch
patch = {"gc_period": "5h", "wal_receiver_protocol_override": None}
patch = {"gc_period": "5h", "gc_compaction_ratio_percent": None}
api.patch_tenant_config(env.initial_tenant, patch)
tenant_conf_after_patch = api.tenant_config(env.initial_tenant).tenant_specific_overrides
if ps_managed_by == "storcon":

View File

@@ -14,7 +14,6 @@ from fixtures.neon_fixtures import (
Endpoint,
NeonEnv,
NeonEnvBuilder,
PageserverWalReceiverProtocol,
Safekeeper,
)
from fixtures.remote_storage import RemoteStorageKind
@@ -751,15 +750,8 @@ async def run_segment_init_failure(env: NeonEnv):
# Test (injected) failure during WAL segment init.
# https://github.com/neondatabase/neon/issues/6401
# https://github.com/neondatabase/neon/issues/6402
@pytest.mark.parametrize(
"wal_receiver_protocol",
[PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED],
)
def test_segment_init_failure(
neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: PageserverWalReceiverProtocol
):
def test_segment_init_failure(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 1
neon_env_builder.pageserver_wal_receiver_protocol = wal_receiver_protocol
env = neon_env_builder.init_start()
asyncio.run(run_segment_init_failure(env))