mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-08 22:12:56 +00:00
Merge commit '296c9190b' into problame/standby-horizon-leases
@@ -5421,6 +5421,7 @@ SKIP_FILES = frozenset(
    (
        "pg_internal.init",
        "pg.log",
        "neon.signal",
        "zenith.signal",
        "pg_hba.conf",
        "postgresql.conf",
@@ -115,8 +115,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
    ".*Local data loss suspected.*",
    # Too many frozen layers error is normal during intensive benchmarks
    ".*too many frozen layers.*",
    # Transient errors when resolving tenant shards by page service
    ".*Fail to resolve tenant shard in attempt.*",
    ".*Failed to resolve tenant shard after.*",
    # Expected warnings when pageserver has not refreshed GC info yet
    ".*pitr LSN/interval not found, skipping force image creation LSN calculation.*",
    ".*No broker updates received for a while.*",
@@ -7,6 +7,7 @@ import time
from enum import StrEnum

import pytest
from fixtures.common_types import TenantShardId
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    NeonEnvBuilder,
@@ -960,6 +961,67 @@ def get_layer_map(env, tenant_shard_id, timeline_id, ps_id):
    return image_layer_count, delta_layer_count


def test_image_layer_creation_time_threshold(neon_env_builder: NeonEnvBuilder):
    """
    Tests that image layers can be created when the time threshold is reached on non-0 shards.
    """
    tenant_conf = {
        "compaction_threshold": "100",
        "image_creation_threshold": "100",
        "image_layer_creation_check_threshold": "1",
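        # The high compaction/image-creation thresholds above keep the usual
        # distance-based triggers from firing, so image layers should only be
        # produced via the time-based image_layer_force_creation_period path below.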
        # disable distance based image layer creation check
        "checkpoint_distance": 10 * 1024 * 1024 * 1024,
        "checkpoint_timeout": "100ms",
        "image_layer_force_creation_period": "1s",
        "pitr_interval": "10s",
        "gc_period": "1s",
        "compaction_period": "1s",
        "lsn_lease_length": "1s",
    }

    # consider every tenant large to run the image layer generation check more eagerly
    neon_env_builder.pageserver_config_override = (
        "image_layer_generation_large_timeline_threshold=0"
    )

    neon_env_builder.num_pageservers = 1
    neon_env_builder.num_safekeepers = 1
    env = neon_env_builder.init_start(
        initial_tenant_conf=tenant_conf,
        initial_tenant_shard_count=2,
        initial_tenant_shard_stripe_size=1,
    )

    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline
    endpoint = env.endpoints.create_start("main")
    endpoint.safe_psql("CREATE TABLE foo (id INTEGER, val text)")

    for v in range(10):
        endpoint.safe_psql(f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))")

    tenant_shard_id = TenantShardId(tenant_id, 1, 2)
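    # shard 1 of 2, i.e. a non-zero shard (the case this test exercises)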

    # Generate some rows.
    for v in range(20):
        endpoint.safe_psql(f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))")

    # restart page server so that logical size on non-0 shards is missing
    env.pageserver.restart()

    (old_images, old_deltas) = get_layer_map(env, tenant_shard_id, timeline_id, 0)
    log.info(f"old images: {old_images}, old deltas: {old_deltas}")

    def check_image_creation():
        (new_images, new_deltas) = get_layer_map(env, tenant_shard_id, timeline_id, 0)
        log.info(f"images: {new_images}, deltas: {new_deltas}")
        assert new_images > old_images

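    # wait_until retries check_image_creation until the assertion passes, i.e. until at
    # least one new image layer has appeared on the non-zero shard, or the wait times out.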
    wait_until(check_image_creation)

    endpoint.stop_and_destroy()


def test_image_layer_force_creation_period(neon_env_builder: NeonEnvBuilder):
    """
    Tests that the pageserver can force creation of new image layers if image_layer_force_creation_period is enabled
@@ -2,6 +2,9 @@ from __future__ import annotations

from typing import TYPE_CHECKING

import pytest
from fixtures.neon_fixtures import StorageControllerApiException

if TYPE_CHECKING:
    from fixtures.neon_fixtures import NeonEnvBuilder
@@ -75,3 +78,38 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
    ep.start(safekeeper_generation=1, safekeepers=[3])

    assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)]


def test_new_sk_set_validation(neon_env_builder: NeonEnvBuilder):
    """
    Test that safekeeper_migrate validates the new_sk_set before starting the migration.
    """
    neon_env_builder.num_safekeepers = 3
    neon_env_builder.storage_controller_config = {
        "timelines_onto_safekeepers": True,
        "timeline_safekeeper_count": 2,
    }
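    # With 3 safekeepers but only 2 per timeline, one safekeeper stays outside the initial
    # sk_set and can be marked decommissioned later in the test.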
    env = neon_env_builder.init_start()

    def expect_fail(sk_set: list[int], match: str):
        with pytest.raises(StorageControllerApiException, match=match):
            env.storage_controller.migrate_safekeepers(
                env.initial_tenant, env.initial_timeline, sk_set
            )
        # Check that we failed before committing to the database.
        mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
        assert mconf["generation"] == 1
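        # generation 1 is the initial membership configuration; a migration that had
        # actually started would have advanced it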

    expect_fail([], "safekeeper set is empty")
    expect_fail([1], "must have at least 2 safekeepers")
    expect_fail([1, 1], "duplicate safekeeper")
    expect_fail([1, 100500], "does not exist")

    mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
    sk_set = mconf["sk_set"]
    assert len(sk_set) == 2

    decom_sk = [sk.id for sk in env.safekeepers if sk.id not in sk_set][0]
    env.storage_controller.safekeeper_scheduling_policy(decom_sk, "Decomissioned")

    expect_fail([sk_set[0], decom_sk], "decomissioned")

@@ -1673,6 +1673,91 @@ def test_shard_resolve_during_split_abort(neon_env_builder: NeonEnvBuilder):
# END_HADRON


# HADRON
@pytest.mark.skip(reason="The backpressure change has not been merged yet.")
def test_back_pressure_per_shard(neon_env_builder: NeonEnvBuilder):
    """
    Tests that backpressure knobs are enforced on a per-shard basis instead of at the tenant level.
    """
    init_shard_count = 4
    neon_env_builder.num_pageservers = init_shard_count
    stripe_size = 1

    env = neon_env_builder.init_start(
        initial_tenant_shard_count=init_shard_count,
        initial_tenant_shard_stripe_size=stripe_size,
        initial_tenant_conf={
            # disable auto-flush of shards and set max_replication_flush_lag as 15MB.
            # The backpressure parameters must be enforced at the shard level to avoid stalling PG.
            "checkpoint_distance": 1 * 1024 * 1024 * 1024,
            "checkpoint_timeout": "1h",
        },
    )

    endpoint = env.endpoints.create(
        "main",
        config_lines=[
            "max_replication_write_lag = 0",
            "max_replication_apply_lag = 0",
            "max_replication_flush_lag = 15MB",
            "neon.max_cluster_size = 10GB",
        ],
    )
    endpoint.respec(skip_pg_catalog_updates=False)  # Needed for databricks_system to get created.
    endpoint.start()

    # generate 20MB of data
    endpoint.safe_psql(
        "CREATE TABLE usertable AS SELECT s AS KEY, repeat('a', 1000) as VALUE from generate_series(1, 20000) s;"
    )
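    # ~20 MB of writes spread over 4 shards (stripe_size = 1) is roughly 5 MB per shard,
    # well under the 15 MB max_replication_flush_lag, so per-shard enforcement should not
    # throttle, whereas tenant-level enforcement (20 MB > 15 MB) would.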
    res = endpoint.safe_psql(
        "SELECT neon.backpressure_throttling_time() as throttling_time", dbname="databricks_system"
    )[0]
    assert res[0] == 0, f"throttling_time should be 0, but got {res[0]}"

    endpoint.stop()


# HADRON
def test_shard_split_page_server_timeout(neon_env_builder: NeonEnvBuilder):
    """
    Tests that a shard split correctly handles pageserver timeouts and aborts the split
    """
    init_shard_count = 2
    neon_env_builder.num_pageservers = 1
    stripe_size = 1

    if neon_env_builder.storage_controller_config is None:
        neon_env_builder.storage_controller_config = {"shard_split_request_timeout": "5s"}
    else:
        neon_env_builder.storage_controller_config["shard_split_request_timeout"] = "5s"

    env = neon_env_builder.init_start(
        initial_tenant_shard_count=init_shard_count,
        initial_tenant_shard_stripe_size=stripe_size,
    )

    env.storage_controller.allowed_errors.extend(
        [
            ".*Enqueuing background abort.*",
            ".*failpoint.*",
            ".*Failed to abort.*",
            ".*Exclusive lock by ShardSplit was held.*",
        ]
    )
    env.pageserver.allowed_errors.extend([".*request was dropped before completing.*"])

    endpoint1 = env.endpoints.create_start(branch_name="main")

    env.pageserver.http_client().configure_failpoints(("shard-split-post-finish-pause", "pause"))
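    # The paused failpoint delays the pageserver's response to the split request past the
    # storage controller's 5s shard_split_request_timeout, so the request fails and the
    # controller aborts the split in the background.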

    with pytest.raises(StorageControllerApiException):
        env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=4)

    env.pageserver.http_client().configure_failpoints(("shard-split-post-finish-pause", "off"))
    endpoint1.stop_and_destroy()


def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder):
    """
    Check a scenario where one of the shards is much slower than others.
@@ -209,9 +209,9 @@ def test_ancestor_detach_branched_from(
    client.timeline_delete(env.initial_tenant, env.initial_timeline)
    wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline)

    # because we do the fullbackup from ancestor at the branch_lsn, the zenith.signal is always different
    # as there is always "PREV_LSN: invalid" for "before"
    skip_files = {"zenith.signal"}
    # because we do the fullbackup from ancestor at the branch_lsn, the neon.signal and/or zenith.signal is always
    # different as there is always "PREV_LSN: invalid" for "before"
    skip_files = {"zenith.signal", "neon.signal"}

    assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, skip_files)
@@ -767,7 +767,7 @@ def test_compaction_induced_by_detaches_in_history(
        env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_after
    )

    # we don't need to skip any files, because zenith.signal will be identical
    # we don't need to skip any files, because neon.signal will be identical
    assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set())