Mirror of https://github.com/neondatabase/neon.git
Merge branch 'main' into amasterov/random-ops-add

# Conflicts:
#	test_runner/random_ops/test_random_ops.py
@@ -2,11 +2,12 @@ from __future__ import annotations

 import urllib.parse
 from enum import StrEnum
-from typing import TYPE_CHECKING, final
+from typing import TYPE_CHECKING, Any, final

 import requests
 from requests.adapters import HTTPAdapter
 from requests.auth import AuthBase
+from requests.exceptions import ReadTimeout
 from typing_extensions import override

 from fixtures.log_helper import log
@@ -102,6 +103,18 @@ class EndpointHttpClient(requests.Session):

         wait_until(offloaded)

+    def promote(self, safekeepers_lsn: dict[str, Any], disconnect: bool = False):
+        url = f"http://localhost:{self.external_port}/promote"
+        if disconnect:
+            try:  # send first request to start promote and disconnect
+                self.post(url, data=safekeepers_lsn, timeout=0.001)
+            except ReadTimeout:
+                pass  # wait on second request which returns on promotion finish
+        res = self.post(url, data=safekeepers_lsn)
+        res.raise_for_status()
+        json: dict[str, str] = res.json()
+        return json
+
     def database_schema(self, database: str):
         res = self.get(
             f"http://localhost:{self.external_port}/database_schema?database={urllib.parse.quote(database, safe='')}",
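The `disconnect` branch implements a two-request handshake: the first POST deliberately times out client-side after about a millisecond while the server keeps promoting, and a second POST blocks until promotion completes. A minimal usage sketch (assumptions: `secondary` is an `Endpoint` fixture, and `safekeepers`/`lsn` were captured from the old primary, as in the promotion tests later in this commit):

```python
# Sketch only: secondary, safekeepers, and lsn come from test setup, as in
# test_replica_promote_handler_disconnects later in this commit.
client = secondary.http_client()
safekeepers_lsn = {"safekeepers": safekeepers, "wal_flush_lsn": lsn}

# disconnect=True cuts the first request off client-side; the follow-up
# request inside promote() waits for the already-running promotion.
result = client.promote(safekeepers_lsn, disconnect=True)
assert result["status"] == "completed"
```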
@@ -159,6 +159,9 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] = (
 )

 PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
+    # BEGIN_HADRON
+    "pageserver_active_storage_operations_count",
+    # END_HADRON
     "pageserver_current_logical_size",
     "pageserver_resident_physical_size",
     "pageserver_io_operations_bytes_total",
@@ -111,6 +111,14 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
     ".*stalling layer flushes for compaction backpressure.*",
     ".*layer roll waiting for flush due to compaction backpressure.*",
     ".*BatchSpanProcessor.*",
+    # Can happen in tests that purposely wipe pageserver "local disk" data.
+    ".*Local data loss suspected.*",
+    # Too many frozen layers error is normal during intensive benchmarks
+    ".*too many frozen layers.*",
+    # Transient errors when resolving tenant shards by page service
+    ".*Fail to resolve tenant shard in attempt.*",
+    # Expected warnings when pageserver has not refreshed GC info yet
+    ".*pitr LSN/interval not found, skipping force image creation LSN calculation.*",
     ".*No broker updates received for a while.*",
     *(
         [
@@ -1247,3 +1247,10 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         )
         self.verbose_error(res)
         return res.json()
+
+    def force_refresh_feature_flag(self, tenant_id: TenantId | TenantShardId):
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/force_refresh_feature_flag",
+        )
+        self.verbose_error(res)
+        return res.json()
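A short sketch of how the new endpoint is meant to be driven (mirroring `test_feature_flag` later in this commit; `env` is the usual `NeonEnv` fixture):

```python
# Sketch following test_feature_flag below: refresh first so per-tenant
# properties are (re)computed, then evaluate the flag and inspect them.
http = env.pageserver.http_client()
http.force_refresh_feature_flag(env.initial_tenant)

result = http.evaluate_feature_flag_multivariate(env.initial_tenant, "test-feature-flag")
assert "tenant_id" in result["properties"]
assert "tenant_remote_size_mb" in result["properties"]
```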
@@ -71,7 +71,13 @@ def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_man
     n_clients: int,
 ):
     setup_and_run_pagebench_benchmark(
-        neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, n_clients
+        neon_env_builder,
+        zenbenchmark,
+        pg_bin,
+        n_tenants,
+        pgbench_scale,
+        duration,
+        n_clients,
     )

@@ -86,7 +92,8 @@ def setup_and_run_pagebench_benchmark(
 ):
     def record(metric, **kwargs):
         zenbenchmark.record(
-            metric_name=f"pageserver_max_throughput_getpage_at_latest_lsn.{metric}", **kwargs
+            metric_name=f"pageserver_max_throughput_getpage_at_latest_lsn.{metric}",
+            **kwargs,
         )

     params: dict[str, tuple[Any, dict[str, Any]]] = {}
@@ -104,7 +111,7 @@ def setup_and_run_pagebench_benchmark(
     # configure cache sizes like in prod
     page_cache_size = 16384
     max_file_descriptors = 500000
-    neon_env_builder.pageserver_config_override = f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; disk_usage_based_eviction={{max_usage_pct=99, min_avail_bytes=0, period = '999y'}}"
+    neon_env_builder.pageserver_config_override = f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; disk_usage_based_eviction={{enabled = false}}"

     tracing_config = PageserverTracingConfig(
         sampling_ratio=(0, 1000),
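The old override only neutered disk-usage-based eviction indirectly (a 999-year period); the new one disables it explicitly. The doubled braces in the f-string escape to the literal braces of the inline config table, which a quick check confirms (values as above):

```python
# Worked example: the {{ }} escapes in the f-string render as literal braces.
page_cache_size = 16384
max_file_descriptors = 500000
override = f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; disk_usage_based_eviction={{enabled = false}}"
assert override.endswith("disk_usage_based_eviction={enabled = false}")
```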
@@ -120,7 +127,10 @@ def setup_and_run_pagebench_benchmark(
                 page_cache_size * 8192,
                 {"unit": "byte"},
             ),
-            "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}),
+            "pageserver_config_override.max_file_descriptors": (
+                max_file_descriptors,
+                {"unit": ""},
+            ),
             "pageserver_config_override.sampling_ratio": (ratio, {"unit": ""}),
         }
     )
@@ -49,3 +49,12 @@ def test_feature_flag(neon_env_builder: NeonEnvBuilder):
             env.initial_tenant, "test-feature-flag"
         )["result"]
     )
+
+    env.pageserver.http_client().force_refresh_feature_flag(env.initial_tenant)
+
+    # Check if the properties exist
+    result = env.pageserver.http_client().evaluate_feature_flag_multivariate(
+        env.initial_tenant, "test-feature-flag"
+    )
+    assert "tenant_remote_size_mb" in result["properties"]
+    assert "tenant_id" in result["properties"]
@@ -1,29 +1,51 @@
 """
-File with secondary->primary promotion testing.
-
-This far, only contains a test that we don't break and that the data is persisted.
+Secondary -> primary promotion testing
 """

+from enum import StrEnum
 from typing import cast

 import psycopg2
+import pytest
 from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import Endpoint, NeonEnv, wait_replica_caughtup
-from fixtures.pg_version import PgVersion
+from fixtures.utils import USE_LFC
+from psycopg2.extensions import cursor as Cursor
 from pytest import raises


 def stop_and_check_lsn(ep: Endpoint, expected_lsn: Lsn | None):
     ep.stop(mode="immediate-terminate")
     lsn = ep.terminate_flush_lsn
     if expected_lsn is not None:
         assert (lsn is not None) == (expected_lsn is not None), f"{lsn=}, {expected_lsn=}"
     if lsn is not None:
         assert lsn >= expected_lsn, f"{expected_lsn=} < {lsn=}"
+    else:
+        assert lsn == expected_lsn, f"{expected_lsn=} != {lsn=}"


-def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
+def get_lsn_triple(cur: Cursor) -> tuple[str, str, str]:
+    cur.execute(
+        """
+        SELECT pg_current_wal_insert_lsn(),
+               pg_current_wal_lsn(),
+               pg_current_wal_flush_lsn()
+        """
+    )
+    return cast("tuple[str, str, str]", cur.fetchone())
+
+
+class PromoteMethod(StrEnum):
+    COMPUTE_CTL = "compute-ctl"
+    POSTGRES = "postgres"
+
+
+METHOD_OPTIONS = [e for e in PromoteMethod]
+METHOD_IDS = [e.value for e in PromoteMethod]
+
+
+@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
+@pytest.mark.parametrize("method", METHOD_OPTIONS, ids=METHOD_IDS)
+def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod):
     """
     Test that a replica safely promotes, and can commit data updates which
     show up when the primary boots up after the promoted secondary endpoint
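The enum doubles as the parametrization source: `METHOD_OPTIONS` supplies the values and `METHOD_IDS` the human-readable test IDs. A self-contained sketch of the same pattern (`enum.StrEnum` needs Python 3.11+; the test body here is illustrative):

```python
from enum import StrEnum

import pytest


class PromoteMethod(StrEnum):
    COMPUTE_CTL = "compute-ctl"
    POSTGRES = "postgres"


METHOD_OPTIONS = [e for e in PromoteMethod]  # parametrized values
METHOD_IDS = [e.value for e in PromoteMethod]  # IDs: "compute-ctl", "postgres"


@pytest.mark.parametrize("method", METHOD_OPTIONS, ids=METHOD_IDS)
def test_demo(method: PromoteMethod) -> None:
    # Collected as test_demo[compute-ctl] and test_demo[postgres].
    assert method in METHOD_OPTIONS
```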
@@ -38,29 +60,26 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):

     with primary.connect() as primary_conn:
         primary_cur = primary_conn.cursor()
         primary_cur.execute("create extension neon")
         primary_cur.execute(
             "create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)"
         )
         primary_cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)")
-        primary_cur.execute(
-            """
-            SELECT pg_current_wal_insert_lsn(),
-                   pg_current_wal_lsn(),
-                   pg_current_wal_flush_lsn()
-            """
-        )
-        lsn_triple = cast("tuple[str, str, str]", primary_cur.fetchone())

+        lsn_triple = get_lsn_triple(primary_cur)
         log.info(f"Primary: Current LSN after workload is {lsn_triple}")
         expected_primary_lsn: Lsn = Lsn(lsn_triple[2])
         primary_cur.execute("show neon.safekeepers")
         safekeepers = primary_cur.fetchall()[0][0]

-    wait_replica_caughtup(primary, secondary)
+    if method == PromoteMethod.COMPUTE_CTL:
+        primary.http_client().offload_lfc()
+    else:
+        wait_replica_caughtup(primary, secondary)

     with secondary.connect() as secondary_conn:
         secondary_cur = secondary_conn.cursor()
         secondary_cur.execute("select count(*) from t")

         assert secondary_cur.fetchone() == (100,)

         with raises(psycopg2.Error):
@@ -71,28 +90,30 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
         secondary_cur.execute("select count(*) from t")
         assert secondary_cur.fetchone() == (100,)

+    primary_endpoint_id = primary.endpoint_id
     stop_and_check_lsn(primary, expected_primary_lsn)

     # Reconnect to the secondary to make sure we get a read-write connection
     promo_conn = secondary.connect()
     promo_cur = promo_conn.cursor()
-    promo_cur.execute(f"alter system set neon.safekeepers='{safekeepers}'")
-    promo_cur.execute("select pg_reload_conf()")
+    if method == PromoteMethod.COMPUTE_CTL:
+        client = secondary.http_client()
+        client.prewarm_lfc(primary_endpoint_id)
+        # control plane knows safekeepers, simulate it by querying primary
+        assert (lsn := primary.terminate_flush_lsn)
+        safekeepers_lsn = {"safekeepers": safekeepers, "wal_flush_lsn": lsn}
+        assert client.promote(safekeepers_lsn)["status"] == "completed"
+    else:
+        promo_cur.execute(f"alter system set neon.safekeepers='{safekeepers}'")
+        promo_cur.execute("select pg_reload_conf()")
+        promo_cur.execute("SELECT * FROM pg_promote()")
+        assert promo_cur.fetchone() == (True,)

-    promo_cur.execute("SELECT * FROM pg_promote()")
-    assert promo_cur.fetchone() == (True,)
-    promo_cur.execute(
-        """
-        SELECT pg_current_wal_insert_lsn(),
-               pg_current_wal_lsn(),
-               pg_current_wal_flush_lsn()
-        """
-    )
-    log.info(f"Secondary: LSN after promotion is {promo_cur.fetchone()}")
+    lsn_triple = get_lsn_triple(promo_cur)
+    log.info(f"Secondary: LSN after promotion is {lsn_triple}")

     # Reconnect to the secondary to make sure we get a read-write connection
-    with secondary.connect() as new_primary_conn:
-        new_primary_cur = new_primary_conn.cursor()
+    with secondary.connect() as conn, conn.cursor() as new_primary_cur:
         new_primary_cur.execute("select count(*) from t")
         assert new_primary_cur.fetchone() == (100,)
@@ -101,43 +122,34 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
         )
         assert new_primary_cur.fetchall() == [(it,) for it in range(101, 201)]

-        new_primary_cur = new_primary_conn.cursor()
+        new_primary_cur = conn.cursor()
         new_primary_cur.execute("select payload from t")
         assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)]

         new_primary_cur.execute("select count(*) from t")
         assert new_primary_cur.fetchone() == (200,)
-        new_primary_cur.execute(
-            """
-            SELECT pg_current_wal_insert_lsn(),
-                   pg_current_wal_lsn(),
-                   pg_current_wal_flush_lsn()
-            """
-        )
-        log.info(f"Secondary: LSN after workload is {new_primary_cur.fetchone()}")

-    with secondary.connect() as second_viewpoint_conn:
-        new_primary_cur = second_viewpoint_conn.cursor()
+        lsn_triple = get_lsn_triple(new_primary_cur)
+        log.info(f"Secondary: LSN after workload is {lsn_triple}")
+        expected_promoted_lsn = Lsn(lsn_triple[2])
+
+    with secondary.connect() as conn, conn.cursor() as new_primary_cur:
         new_primary_cur.execute("select payload from t")
         assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)]

-    # wait_for_last_flush_lsn(env, secondary, env.initial_tenant, env.initial_timeline)
-
-    # secondaries don't sync safekeepers on finish so LSN will be None
-    stop_and_check_lsn(secondary, None)
+    if method == PromoteMethod.COMPUTE_CTL:
+        # compute_ctl's /promote switches replica type to Primary so it syncs
+        # safekeepers on finish
+        stop_and_check_lsn(secondary, expected_promoted_lsn)
+    else:
+        # on testing postgres, we don't update replica type, secondaries don't
+        # sync so lsn should be None
+        stop_and_check_lsn(secondary, None)

     primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary2")

-    with primary.connect() as new_primary:
-        new_primary_cur = new_primary.cursor()
-        new_primary_cur.execute(
-            """
-            SELECT pg_current_wal_insert_lsn(),
-                   pg_current_wal_lsn(),
-                   pg_current_wal_flush_lsn()
-            """
-        )
-        lsn_triple = cast("tuple[str, str, str]", new_primary_cur.fetchone())
+    with primary.connect() as new_primary, new_primary.cursor() as new_primary_cur:
+        lsn_triple = get_lsn_triple(new_primary_cur)
         expected_primary_lsn = Lsn(lsn_triple[2])
         log.info(f"New primary: Boot LSN is {lsn_triple}")
@@ -146,5 +158,39 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
         new_primary_cur.execute("INSERT INTO t (payload) SELECT generate_series(201, 300)")
         new_primary_cur.execute("select count(*) from t")
         assert new_primary_cur.fetchone() == (300,)

     stop_and_check_lsn(primary, expected_primary_lsn)
+
+
+@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
+def test_replica_promote_handler_disconnects(neon_simple_env: NeonEnv):
+    """
+    Test that if a handler disconnects from /promote route of compute_ctl, promotion still happens
+    once, and no error is thrown
+    """
+    env: NeonEnv = neon_simple_env
+    primary: Endpoint = env.endpoints.create_start(branch_name="main", endpoint_id="primary")
+    secondary: Endpoint = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary")
+
+    with primary.connect() as conn, conn.cursor() as cur:
+        cur.execute("create extension neon")
+        cur.execute("create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)")
+        cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)")
+        cur.execute("show neon.safekeepers")
+        safekeepers = cur.fetchall()[0][0]
+
+    primary.http_client().offload_lfc()
+    primary_endpoint_id = primary.endpoint_id
+    primary.stop(mode="immediate-terminate")
+    assert (lsn := primary.terminate_flush_lsn)
+
+    client = secondary.http_client()
+    client.prewarm_lfc(primary_endpoint_id)
+    safekeepers_lsn = {"safekeepers": safekeepers, "wal_flush_lsn": lsn}
+    assert client.promote(safekeepers_lsn, disconnect=True)["status"] == "completed"
+
+    with secondary.connect() as conn, conn.cursor() as cur:
+        cur.execute("select count(*) from t")
+        assert cur.fetchone() == (100,)
+        cur.execute("INSERT INTO t (payload) SELECT generate_series(101, 200) RETURNING payload")
+        cur.execute("select count(*) from t")
+        assert cur.fetchone() == (200,)
@@ -1,8 +1,11 @@
 from __future__ import annotations

 import os
+import random
+import threading
 import time
 from collections import defaultdict
+from threading import Event
 from typing import TYPE_CHECKING, Any

 import pytest
@@ -1505,6 +1508,171 @@ def test_sharding_split_failures(
     env.storage_controller.consistency_check()


+@pytest.mark.skip(reason="The backpressure change has not been merged yet.")
+def test_back_pressure_during_split(neon_env_builder: NeonEnvBuilder):
+    """
+    Test that backpressure can ignore new shards during tenant split so that if we abort the split,
+    PG can continue without being blocked.
+    """
+    DBNAME = "regression"
+
+    init_shard_count = 4
+    neon_env_builder.num_pageservers = init_shard_count
+    stripe_size = 32
+
+    env = neon_env_builder.init_start(
+        initial_tenant_shard_count=init_shard_count, initial_tenant_shard_stripe_size=stripe_size
+    )
+
+    env.storage_controller.allowed_errors.extend(
+        [
+            # All split failures log a warning when they enqueue the abort operation
+            ".*Enqueuing background abort.*",
+            # Tolerate any error logs that mention a failpoint
+            ".*failpoint.*",
+        ]
+    )
+
+    endpoint = env.endpoints.create(
+        "main",
+        config_lines=[
+            "max_replication_write_lag = 1MB",
+            "databricks.max_wal_mb_per_second = 1",
+            "neon.max_cluster_size = 10GB",
+        ],
+    )
+    endpoint.respec(skip_pg_catalog_updates=False)  # Needed for databricks_system to get created.
+    endpoint.start()
+
+    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")
+
+    endpoint.safe_psql("CREATE TABLE usertable ( YCSB_KEY INT, FIELD0 TEXT);")
+    write_done = Event()
+
+    def write_data(write_done):
+        while not write_done.is_set():
+            endpoint.safe_psql(
+                "INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False
+            )
+        log.info("write_data thread exiting")
+
+    writer_thread = threading.Thread(target=write_data, args=(write_done,))
+    writer_thread.start()
+
+    env.storage_controller.configure_failpoints(("shard-split-pre-complete", "return(1)"))
+    # split the tenant
+    with pytest.raises(StorageControllerApiException):
+        env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=16)
+
+    write_done.set()
+    writer_thread.join()
+
+    # writing more data to page servers after split is aborted
+    for _i in range(5000):
+        endpoint.safe_psql(
+            "INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False
+        )
+
+    # wait until write lag becomes 0
+    def check_write_lag_is_zero():
+        res = endpoint.safe_psql(
+            """
+            SELECT
+                pg_wal_lsn_diff(pg_current_wal_flush_lsn(), received_lsn) as received_lsn_lag
+            FROM neon.backpressure_lsns();
+            """,
+            dbname="databricks_system",
+            log_query=False,
+        )
+        log.info(f"received_lsn_lag = {res[0][0]}")
+        assert res[0][0] == 0
+
+    wait_until(check_write_lag_is_zero)
+    endpoint.stop_and_destroy()

+
+# BEGIN_HADRON
+def test_shard_resolve_during_split_abort(neon_env_builder: NeonEnvBuilder):
+    """
+    Tests that page service is able to resolve the correct shard during tenant split without causing query errors
+    """
+    DBNAME = "regression"
+    WORKER_THREADS = 16
+    ROW_COUNT = 10000
+
+    init_shard_count = 4
+    neon_env_builder.num_pageservers = 1
+    stripe_size = 16
+
+    env = neon_env_builder.init_start(
+        initial_tenant_shard_count=init_shard_count, initial_tenant_shard_stripe_size=stripe_size
+    )
+
+    env.storage_controller.allowed_errors.extend(
+        [
+            # All split failures log a warning when they enqueue the abort operation
+            ".*Enqueuing background abort.*",
+            # Tolerate any error logs that mention a failpoint
+            ".*failpoint.*",
+        ]
+    )
+
+    endpoint = env.endpoints.create("main")
+    endpoint.respec(skip_pg_catalog_updates=False)  # Needed for databricks_system to get created.
+    endpoint.start()
+
+    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")
+
+    # generate 10MB of data
+    endpoint.safe_psql(
+        f"CREATE TABLE usertable AS SELECT s AS KEY, repeat('a', 1000) as VALUE from generate_series(1, {ROW_COUNT}) s;"
+    )
+    read_done = Event()
+
+    def read_data(read_done):
+        i = 0
+        while not read_done.is_set() or i < 10:
+            endpoint.safe_psql(
+                f"SELECT * FROM usertable where KEY = {random.randint(1, ROW_COUNT)}",
+                log_query=False,
+            )
+            i += 1
+        log.info(f"read_data thread exiting. Executed {i} queries.")
+
+    reader_threads = []
+    for _i in range(WORKER_THREADS):
+        reader_thread = threading.Thread(target=read_data, args=(read_done,))
+        reader_thread.start()
+        reader_threads.append(reader_thread)
+
+    env.storage_controller.configure_failpoints(("shard-split-pre-complete", "return(1)"))
+    # split the tenant
+    with pytest.raises(StorageControllerApiException):
+        env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=16)
+
+    # wait until abort is done
+    def check_tenant_status():
+        active_count = 0
+        for i in range(init_shard_count):
+            status = env.pageserver.http_client().tenant_status(
+                TenantShardId(env.initial_tenant, i, init_shard_count)
+            )
+            if status["state"]["slug"] == "Active":
+                active_count += 1
+        assert active_count == 4
+
+    wait_until(check_tenant_status)
+
+    read_done.set()
+    for thread in reader_threads:
+        thread.join()
+
+    endpoint.stop()
+
+
+# END_HADRON


 def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder):
     """
     Check a scenario when one of the shards is much slower than others.
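Both new tests lean on `wait_until` to poll an asserting callback until it passes. A hedged sketch of that retry pattern (the real helper lives in the test fixtures; the timeout and interval below are illustrative, not its actual defaults):

```python
import time


def wait_until_sketch(check, timeout_s: float = 20.0, interval_s: float = 0.5):
    """Call `check` until it stops raising AssertionError or time runs out."""
    deadline = time.monotonic() + timeout_s
    while True:
        try:
            return check()
        except AssertionError:
            if time.monotonic() >= deadline:
                raise
            time.sleep(interval_s)
```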
@@ -332,7 +332,7 @@ def test_multiple_subscription_branching(neon_simple_env: NeonEnv):

     last_insert_lsn = query_scalar(cursor, "select pg_current_wal_insert_lsn();")

-    def start_publisher_workload(table_num: int, duration: int):
+    def start_publisher_workload(i: int, duration: int):
         start = time.time()
         with endpoint.cursor(dbname="publisher_db") as cur:
             while time.time() - start < duration: