Merge branch 'main' into devin/1745492468-add-dev-flag-pr11517

This commit is contained in:
John Spray
2025-06-26 07:32:08 -07:00
committed by GitHub
425 changed files with 12213 additions and 5436 deletions

View File

@@ -24,7 +24,7 @@ The value to place in the `aud` claim.
@final
class ComputeClaimsScope(StrEnum):
ADMIN = "admin"
ADMIN = "compute_ctl:admin"
@final
@@ -69,15 +69,17 @@ class EndpointHttpClient(requests.Session):
json: dict[str, str] = res.json()
return json
def prewarm_lfc(self):
self.post(f"http://localhost:{self.external_port}/lfc/prewarm").raise_for_status()
def prewarm_lfc(self, from_endpoint_id: str | None = None):
url: str = f"http://localhost:{self.external_port}/lfc/prewarm"
params = {"from_endpoint": from_endpoint_id} if from_endpoint_id else dict()
self.post(url, params=params).raise_for_status()
def prewarmed():
json = self.prewarm_lfc_status()
status, err = json["status"], json.get("error")
assert status == "completed", f"{status}, error {err}"
wait_until(prewarmed)
wait_until(prewarmed, timeout=60)
def offload_lfc(self):
url = f"http://localhost:{self.external_port}/lfc/offload"

View File

@@ -129,6 +129,18 @@ class NeonAPI:
return cast("dict[str, Any]", resp.json())
def get_project_limits(self, project_id: str) -> dict[str, Any]:
    """Fetch the limits document of a project.

    Sends ``GET /projects/{project_id}/limits`` with JSON ``Accept`` and
    ``Content-Type`` headers through the client's private request helper and
    returns the decoded JSON body.
    """
    json_headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
    }
    limits_resp = self.__request(
        "GET",
        f"/projects/{project_id}/limits",
        headers=json_headers,
    )
    payload = limits_resp.json()
    return cast("dict[str, Any]", payload)
def delete_project(
self,
project_id: str,

View File

@@ -497,6 +497,7 @@ class NeonLocalCli(AbstractNeonCli):
tenant_id: TenantId,
pg_version: PgVersion,
endpoint_id: str | None = None,
grpc: bool | None = None,
hot_standby: bool = False,
lsn: Lsn | None = None,
pageserver_id: int | None = None,
@@ -521,6 +522,8 @@ class NeonLocalCli(AbstractNeonCli):
args.extend(["--external-http-port", str(external_http_port)])
if internal_http_port is not None:
args.extend(["--internal-http-port", str(internal_http_port)])
if grpc:
args.append("--grpc")
if endpoint_id is not None:
args.append(endpoint_id)
if hot_standby:
@@ -564,6 +567,7 @@ class NeonLocalCli(AbstractNeonCli):
basebackup_request_tries: int | None = None,
timeout: str | None = None,
env: dict[str, str] | None = None,
dev: bool = False,
) -> subprocess.CompletedProcess[str]:
args = [
"endpoint",
@@ -589,6 +593,8 @@ class NeonLocalCli(AbstractNeonCli):
args.extend(["--create-test-user"])
if timeout is not None:
args.extend(["--start-timeout", str(timeout)])
if dev:
args.extend(["--dev"])
res = self.raw_cli(args, extra_env_vars)
res.check_returncode()
@@ -617,7 +623,7 @@ class NeonLocalCli(AbstractNeonCli):
destroy=False,
check_return_code=True,
mode: str | None = None,
) -> subprocess.CompletedProcess[str]:
) -> tuple[Lsn | None, subprocess.CompletedProcess[str]]:
args = [
"endpoint",
"stop",
@@ -629,7 +635,11 @@ class NeonLocalCli(AbstractNeonCli):
if endpoint_id is not None:
args.append(endpoint_id)
return self.raw_cli(args, check_return_code=check_return_code)
proc = self.raw_cli(args, check_return_code=check_return_code)
log.debug(f"endpoint stop stdout: {proc.stdout}")
lsn_str = proc.stdout.split()[-1]
lsn: Lsn | None = None if lsn_str == "null" else Lsn(lsn_str)
return lsn, proc
def mappings_map_branch(
self, name: str, tenant_id: TenantId, timeline_id: TimelineId

View File

@@ -453,6 +453,7 @@ class NeonEnvBuilder:
pageserver_get_vectored_concurrent_io: str | None = None,
pageserver_tracing_config: PageserverTracingConfig | None = None,
pageserver_import_config: PageserverImportConfig | None = None,
storcon_kick_secondary_downloads: bool | None = None,
):
self.repo_dir = repo_dir
self.rust_log_override = rust_log_override
@@ -489,7 +490,9 @@ class NeonEnvBuilder:
self.config_init_force: str | None = None
self.top_output_dir = top_output_dir
self.control_plane_hooks_api: str | None = None
self.storage_controller_config: dict[Any, Any] | None = None
self.storage_controller_config: dict[Any, Any] | None = {
"timelines_onto_safekeepers": True,
}
# Flag to enable https listener in pageserver, generate local ssl certs,
# and force storage controller to use https for pageserver api.
@@ -512,6 +515,8 @@ class NeonEnvBuilder:
self.pageserver_tracing_config = pageserver_tracing_config
self.pageserver_import_config = pageserver_import_config
self.storcon_kick_secondary_downloads = storcon_kick_secondary_downloads
self.pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None = (
pageserver_default_tenant_config_compaction_algorithm
)
@@ -1219,6 +1224,14 @@ class NeonEnv:
else:
cfg["storage_controller"] = {"use_local_compute_notifications": False}
if config.storcon_kick_secondary_downloads is not None:
# Configure whether storage controller should actively kick off secondary downloads
if "storage_controller" not in cfg:
cfg["storage_controller"] = {}
cfg["storage_controller"]["kick_secondary_downloads"] = (
config.storcon_kick_secondary_downloads
)
# Create config for pageserver
http_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
@@ -1228,6 +1241,7 @@ class NeonEnv:
):
pageserver_port = PageserverPort(
pg=self.port_distributor.get_port(),
grpc=self.port_distributor.get_port(),
http=self.port_distributor.get_port(),
https=self.port_distributor.get_port() if config.use_https_pageserver_api else None,
)
@@ -1243,13 +1257,14 @@ class NeonEnv:
ps_cfg: dict[str, Any] = {
"id": ps_id,
"listen_pg_addr": f"localhost:{pageserver_port.pg}",
"listen_grpc_addr": f"localhost:{pageserver_port.grpc}",
"listen_http_addr": f"localhost:{pageserver_port.http}",
"listen_https_addr": f"localhost:{pageserver_port.https}"
if config.use_https_pageserver_api
else None,
"pg_auth_type": pg_auth_type,
"http_auth_type": http_auth_type,
"grpc_auth_type": grpc_auth_type,
"http_auth_type": http_auth_type,
"availability_zone": availability_zone,
# Disable pageserver disk syncs in tests: when running tests concurrently, this avoids
# the pageserver taking a long time to start up due to syncfs flushing other tests' data
@@ -1762,6 +1777,7 @@ def neon_env_builder(
@dataclass
class PageserverPort:
pg: int
grpc: int
http: int
https: int | None = None
@@ -2054,6 +2070,14 @@ class NeonStorageController(MetricsGetter, LogUtils):
headers=self.headers(TokenScope.ADMIN),
)
def tombstone_delete(self, node_id):
    """Send a DELETE to the debug tombstone endpoint for ``node_id``.

    The call is logged first and authenticated with admin-scope headers.
    The response is not returned to the caller.
    """
    log.info(f"tombstone_delete({node_id})")
    tombstone_url = f"{self.api}/debug/v1/tombstone/{node_id}"
    admin_headers = self.headers(TokenScope.ADMIN)
    self.request("DELETE", tombstone_url, headers=admin_headers)
def node_drain(self, node_id):
log.info(f"node_drain({node_id})")
self.request(
@@ -2110,6 +2134,14 @@ class NeonStorageController(MetricsGetter, LogUtils):
)
return response.json()
def tombstone_list(self):
    """Return the decoded JSON body of a GET on the debug tombstone endpoint.

    The request is authenticated with admin-scope headers.
    """
    tombstone_url = f"{self.api}/debug/v1/tombstone"
    admin_headers = self.headers(TokenScope.ADMIN)
    listing = self.request("GET", tombstone_url, headers=admin_headers)
    return listing.json()
def tenant_shard_dump(self):
"""
Debug listing API: dumps the internal map of tenant shards
@@ -2207,6 +2239,17 @@ class NeonStorageController(MetricsGetter, LogUtils):
shards: list[dict[str, Any]] = body["shards"]
return shards
def timeline_locate(self, tenant_id: TenantId, timeline_id: TimelineId):
    """
    Query the debug "locate" endpoint for one timeline of a tenant.

    :return: dict {"generation": int, "sk_set": [int], "new_sk_set": [int]}
    """
    locate_url = f"{self.api}/debug/v1/tenant/{tenant_id}/timeline/{timeline_id}/locate"
    admin_headers = self.headers(TokenScope.ADMIN)
    located = self.request("GET", locate_url, headers=admin_headers)
    return located.json()
def tenant_describe(self, tenant_id: TenantId):
"""
:return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int, preferred_az_id: str}
@@ -2333,6 +2376,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
delay_max = max_interval
while n > 0:
n = self.reconcile_all()
if n == 0:
break
elif time.time() - start_at > timeout_secs:
@@ -4030,6 +4074,16 @@ def static_proxy(
"CREATE TABLE neon_control_plane.endpoints (endpoint_id VARCHAR(255) PRIMARY KEY, allowed_ips VARCHAR(255))"
)
vanilla_pg.stop()
vanilla_pg.edit_hba(
[
"local all all trust",
"host all all 127.0.0.1/32 scram-sha-256",
"host all all ::1/128 scram-sha-256",
]
)
vanilla_pg.start()
proxy_port = port_distributor.get_port()
mgmt_port = port_distributor.get_port()
http_port = port_distributor.get_port()
@@ -4155,6 +4209,8 @@ class Endpoint(PgProtocol, LogUtils):
self._running = threading.Semaphore(0)
self.__jwt: str | None = None
self.terminate_flush_lsn: Lsn | None = None
def http_client(self, retries: Retry | None = None) -> EndpointHttpClient:
assert self.__jwt is not None
return EndpointHttpClient(
@@ -4167,6 +4223,7 @@ class Endpoint(PgProtocol, LogUtils):
self,
branch_name: str,
endpoint_id: str | None = None,
grpc: bool | None = None,
hot_standby: bool = False,
lsn: Lsn | None = None,
config_lines: list[str] | None = None,
@@ -4191,6 +4248,7 @@ class Endpoint(PgProtocol, LogUtils):
endpoint_id=self.endpoint_id,
tenant_id=self.tenant_id,
lsn=lsn,
grpc=grpc,
hot_standby=hot_standby,
pg_port=self.pg_port,
external_http_port=self.external_http_port,
@@ -4457,9 +4515,10 @@ class Endpoint(PgProtocol, LogUtils):
running = self._running.acquire(blocking=False)
if running:
assert self.endpoint_id is not None
self.env.neon_cli.endpoint_stop(
lsn, _ = self.env.neon_cli.endpoint_stop(
self.endpoint_id, check_return_code=self.check_stop_result, mode=mode
)
self.terminate_flush_lsn = lsn
if sks_wait_walreceiver_gone is not None:
for sk in sks_wait_walreceiver_gone[0]:
@@ -4477,9 +4536,10 @@ class Endpoint(PgProtocol, LogUtils):
running = self._running.acquire(blocking=False)
if running:
assert self.endpoint_id is not None
self.env.neon_cli.endpoint_stop(
lsn, _ = self.env.neon_cli.endpoint_stop(
self.endpoint_id, True, check_return_code=self.check_stop_result, mode=mode
)
self.terminate_flush_lsn = lsn
self.endpoint_id = None
return self
@@ -4488,6 +4548,7 @@ class Endpoint(PgProtocol, LogUtils):
self,
branch_name: str,
endpoint_id: str | None = None,
grpc: bool | None = None,
hot_standby: bool = False,
lsn: Lsn | None = None,
config_lines: list[str] | None = None,
@@ -4505,6 +4566,7 @@ class Endpoint(PgProtocol, LogUtils):
branch_name=branch_name,
endpoint_id=endpoint_id,
config_lines=config_lines,
grpc=grpc,
hot_standby=hot_standby,
lsn=lsn,
pageserver_id=pageserver_id,
@@ -4592,6 +4654,7 @@ class EndpointFactory:
endpoint_id: str | None = None,
tenant_id: TenantId | None = None,
lsn: Lsn | None = None,
grpc: bool | None = None,
hot_standby: bool = False,
config_lines: list[str] | None = None,
remote_ext_base_url: str | None = None,
@@ -4611,6 +4674,7 @@ class EndpointFactory:
return ep.create_start(
branch_name=branch_name,
endpoint_id=endpoint_id,
grpc=grpc,
hot_standby=hot_standby,
config_lines=config_lines,
lsn=lsn,
@@ -4625,6 +4689,7 @@ class EndpointFactory:
endpoint_id: str | None = None,
tenant_id: TenantId | None = None,
lsn: Lsn | None = None,
grpc: bool | None = None,
hot_standby: bool = False,
config_lines: list[str] | None = None,
pageserver_id: int | None = None,
@@ -4647,6 +4712,7 @@ class EndpointFactory:
branch_name=branch_name,
endpoint_id=endpoint_id,
lsn=lsn,
grpc=grpc,
hot_standby=hot_standby,
config_lines=config_lines,
pageserver_id=pageserver_id,
@@ -4671,6 +4737,7 @@ class EndpointFactory:
self,
origin: Endpoint,
endpoint_id: str | None = None,
grpc: bool | None = None,
config_lines: list[str] | None = None,
) -> Endpoint:
branch_name = origin.branch_name
@@ -4682,6 +4749,7 @@ class EndpointFactory:
endpoint_id=endpoint_id,
tenant_id=origin.tenant_id,
lsn=None,
grpc=grpc,
hot_standby=True,
config_lines=config_lines,
)
@@ -4690,6 +4758,7 @@ class EndpointFactory:
self,
origin: Endpoint,
endpoint_id: str | None = None,
grpc: bool | None = None,
config_lines: list[str] | None = None,
) -> Endpoint:
branch_name = origin.branch_name
@@ -4701,6 +4770,7 @@ class EndpointFactory:
endpoint_id=endpoint_id,
tenant_id=origin.tenant_id,
lsn=None,
grpc=grpc,
hot_standby=True,
config_lines=config_lines,
)
@@ -4852,6 +4922,9 @@ class Safekeeper(LogUtils):
log.info(f"finished pulling timeline from {src_ids} to {self.id}")
return res
def safekeeper_id(self) -> SafekeeperId:
    """Build a ``SafekeeperId`` from ``self.id``, the host ``"localhost"`` and
    ``self.port.pg_tenant_only``, passed positionally in that order."""
    sk_id = self.id
    tenant_only_port = self.port.pg_tenant_only
    return SafekeeperId(sk_id, "localhost", tenant_only_port)
@property
def data_dir(self) -> Path:
return self.env.repo_dir / "safekeepers" / f"sk{self.id}"

View File

@@ -1219,3 +1219,31 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
)
self.verbose_error(res)
return res.json()
def force_override_feature_flag(self, flag: str, value: str | None = None):
    """Set or clear an override for feature flag ``flag``.

    With a ``value`` the override is PUT, carrying ``value`` as a query
    parameter. With ``value=None`` a DELETE is sent to the same URL instead.
    Either response is handed to ``verbose_error``.
    """
    flag_url = f"http://localhost:{self.port}/v1/feature_flag/{flag}"
    if value is not None:
        res = self.put(flag_url, params={"value": value})
    else:
        res = self.delete(flag_url)
    self.verbose_error(res)
def evaluate_feature_flag_boolean(self, tenant_id: TenantId, flag: str) -> Any:
    """Evaluate ``flag`` for ``tenant_id`` with ``as=boolean``.

    The response goes through ``verbose_error`` and its decoded JSON body is
    returned.
    """
    eval_url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/feature_flag/{flag}"
    query = {"as": "boolean"}
    res = self.get(eval_url, params=query)
    self.verbose_error(res)
    return res.json()
def evaluate_feature_flag_multivariate(self, tenant_id: TenantId, flag: str) -> Any:
    """Evaluate ``flag`` for ``tenant_id`` with ``as=multivariate``.

    The response goes through ``verbose_error`` and its decoded JSON body is
    returned.
    """
    eval_url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/feature_flag/{flag}"
    query = {"as": "multivariate"}
    res = self.get(eval_url, params=query)
    self.verbose_error(res)
    return res.json()

View File

@@ -0,0 +1,22 @@
-- add 100000 rows or approximately 11 MB to the action_blocks table
-- takes about 1 second
-- Row ids are taken directly from generate_series, i.e. the integers 1..100000.
-- uuid_generate_v4() is provided by the uuid-ossp extension.
INSERT INTO workflows.action_blocks (
id,
uuid,
created_at,
status,
function_signature,
reference_id,
blocking,
run_synchronously
)
SELECT
id,
uuid_generate_v4(), -- Fresh random UUID per row
now() - (random() * interval '100 days'), -- Random date within the last 100 days
'CONDITIONS_NOT_MET', -- Same status for every row
'function_signature_' || id, -- Create a unique function signature using id
CASE WHEN random() > 0.5 THEN 'reference_' || id ELSE NULL END, -- 50% chance of being NULL
true, -- blocking is always true
CASE WHEN random() > 0.5 THEN true ELSE false END -- Random boolean value
FROM generate_series(1, 100000) AS id;

View File

@@ -0,0 +1,11 @@
-- add 100000 rows or approximately 10 MB to the action_kwargs table
-- takes about 5 minutes
-- Each row sets exactly one of value_id / state_value_id, depending on id parity.
INSERT INTO workflows.action_kwargs (created_at, key, uuid, value_id, state_value_id, action_block_id)
SELECT
now(), -- Current transaction timestamp for `created_at`
'key_' || gs.id, -- Generating a unique key based on the id
uuid_generate_v4(), -- Generating a new UUID for each row
CASE WHEN gs.id % 2 = 0 THEN gs.id ELSE NULL END, -- Setting value_id for even ids
CASE WHEN gs.id % 2 <> 0 THEN gs.id ELSE NULL END, -- Setting state_value_id for odd ids
1 -- Setting action_block_id as 1 for simplicity
FROM generate_series(1, 100000) AS gs(id);

View File

@@ -0,0 +1,56 @@
-- add 100000 rows or approx. 30 MB to the device_fingerprint_event table
-- takes about 4 minutes
INSERT INTO authentication.device_fingerprint_event (
uuid,
created_at,
identity_uuid,
fingerprint_request_id,
fingerprint_id,
confidence_score,
ip_address,
url,
client_referrer,
last_seen_at,
raw_fingerprint_response,
session_uuid,
fingerprint_response,
browser_version,
browser_name,
device,
operating_system,
operating_system_version,
user_agent,
ip_address_location_city,
ip_address_location_region,
ip_address_location_country_code,
ip_address_location_latitude,
ip_address_location_longitude,
is_incognito
)
SELECT
gen_random_uuid(), -- Generates a random UUID for primary key
now() - (random() * interval '10 days'), -- Random timestamp within the last 10 days
gen_random_uuid(), -- Random UUID for identity
md5(gs::text), -- Simulates unique fingerprint request ID using `md5` hash of series number
md5((gs + 10000)::text), -- Simulates unique fingerprint ID
round(CAST(random() AS numeric), 2), -- Generates a random score between 0 and 1, cast `random()` to numeric
'192.168.' || (random() * 255)::int || '.' || (random() * 255)::int, -- Random IP address
'https://example.com/' || (gs % 1000), -- URL whose suffix cycles through 0..999 (deterministic)
CASE WHEN random() < 0.5 THEN NULL ELSE 'https://referrer.com/' || (gs % 100)::text END, -- Random referrer, 50% chance of being NULL
now() - (random() * interval '5 days'), -- Last seen timestamp within the last 5 days
NULL, -- Keeping raw_fingerprint_response NULL for simplicity
CASE WHEN random() < 0.3 THEN gen_random_uuid() ELSE NULL END, -- Session UUID for ~30% of rows, NULL for the rest
NULL, -- Keeping fingerprint_response NULL for simplicity
CASE WHEN random() < 0.5 THEN '93.0' ELSE '92.0' END, -- Random browser version
CASE WHEN random() < 0.5 THEN 'Firefox' ELSE 'Chrome' END, -- Random browser name
CASE WHEN random() < 0.5 THEN 'Desktop' ELSE 'Mobile' END, -- Random device type
'Windows', -- Static value for operating system
'10.0', -- Static value for operating system version
'Mozilla/5.0', -- Static value for user agent
'City ' || (gs % 1000)::text, -- City name cycling through 1000 values (deterministic)
'Region ' || (gs % 100)::text, -- Region name cycling through 100 values (deterministic)
'US', -- Static country code
random() * 180 - 90, -- Random latitude between -90 and 90
random() * 360 - 180, -- Random longitude between -180 and 180
random() < 0.1 -- 10% chance of being incognito
FROM generate_series(1, 100000) AS gs;

View File

@@ -0,0 +1,10 @@
-- add 100000 rows or approximately 11 MB to the edges table
-- takes about 1 minute
--
-- A float-to-integer cast in PostgreSQL rounds to the nearest integer, so
-- (random() * N)::int + 1 would span 1..N+1 and under-sample both ends.
-- floor() keeps every generated id inside the documented 1..N range with
-- a uniform distribution.
INSERT INTO workflows.edges (created_at, workflow_id, uuid, from_vertex_id, to_vertex_id)
SELECT
    now() - (random() * interval '365 days'), -- Random `created_at` timestamp in the last year
    floor(random() * 100)::int + 1, -- Random `workflow_id` between 1 and 100
    uuid_generate_v4(), -- Generate a new UUID for each row
    floor(random() * 100000)::bigint + 1, -- Random `from_vertex_id` between 1 and 100,000
    floor(random() * 100000)::bigint + 1 -- Random `to_vertex_id` between 1 and 100,000
FROM generate_series(1, 100000) AS gs; -- One output row per series value; gs itself is unused

View File

@@ -0,0 +1,21 @@
-- add 100000 rows or approximately 10 MB to the hotel_rate_mapping table
-- takes about 1 second
-- uuid_generate_v4() is provided by the uuid-ossp extension.
-- The column aliases in the SELECT list are documentation only: INSERT maps
-- the values to the target columns by position.
INSERT INTO booking_inventory.hotel_rate_mapping (
uuid,
created_at,
updated_at,
hotel_rate_id,
remote_id,
source
)
SELECT
uuid_generate_v4(), -- Unique UUID for each row
now(), -- Created at timestamp
now(), -- Updated at timestamp
'rate_' || gs AS hotel_rate_id, -- Unique hotel_rate_id
'remote_' || gs AS remote_id, -- Unique remote_id
CASE WHEN gs % 3 = 0 THEN 'source_1'
WHEN gs % 3 = 1 THEN 'source_2'
ELSE 'source_3'
END AS source -- Distributing sources among three options
FROM generate_series(1, 100000) AS gs;

View File

@@ -0,0 +1,31 @@
-- add 100000 rows or approximately 20 MB to the ocr_pipeline_results_version table
-- takes about 1 second
-- created_at, updated_at and completed_at are drawn independently, so their
-- relative order within a row is arbitrary.
INSERT INTO ocr.ocr_pipeline_results_version (
id, transaction_id, operation_type, created_at, updated_at, s3_filename, completed_at, result,
end_transaction_id, pipeline_type, is_async, callback, callback_kwargs, input, error, file_type, s3_bucket_name, pipeline_kwargs
)
SELECT
gs.aid, -- id
gs.aid, -- transaction_id (same as id for simplicity)
(gs.aid % 5)::smallint + 1, -- operation_type (cyclic values from 1 to 5)
now() - interval '1 day' * (random() * 30), -- created_at (random timestamp within the last 30 days)
now() - interval '1 day' * (random() * 30), -- updated_at (random timestamp within the last 30 days)
's3_file_' || gs.aid || '.txt', -- s3_filename (synthetic filename)
now() - interval '1 day' * (random() * 30), -- completed_at (random timestamp within the last 30 days)
'{}'::jsonb, -- result (empty JSON object)
NULL, -- end_transaction_id (NULL)
CASE (gs.aid % 3) -- pipeline_type (cyclic text values)
WHEN 0 THEN 'OCR'
WHEN 1 THEN 'PDF'
ELSE 'Image'
END,
gs.aid % 2 = 0, -- is_async (alternating between true and false)
'http://callback/' || gs.aid, -- callback (synthetic URL)
'{}'::jsonb, -- callback_kwargs (empty JSON object)
'Input text ' || gs.aid, -- input (synthetic input text)
NULL, -- error (NULL)
'pdf', -- file_type (always 'pdf')
'bucket_' || gs.aid % 10, -- s3_bucket_name ('bucket_0' .. 'bucket_9'; % binds tighter than ||)
'{}'::jsonb -- pipeline_kwargs (empty JSON object)
FROM
generate_series(1, 100000) AS gs(aid);

View File

@@ -0,0 +1,18 @@
-- add 100000 rows or approx. 20 MB to the priceline_raw_response table
-- takes about 20 seconds
-- created_at and updated_at are drawn independently, so updated_at may
-- precede created_at within a row.
INSERT INTO booking_inventory.priceline_raw_response (
uuid, created_at, updated_at, url, base_url, path, method, params, request, response
)
SELECT
gen_random_uuid(), -- Generate random UUIDs
now() - (random() * interval '30 days'), -- Random creation time within the past 30 days
now() - (random() * interval '30 days'), -- Random update time within the past 30 days
'https://example.com/resource/' || gs, -- Construct a unique URL for each row
'https://example.com', -- Base URL for all rows
'/resource/' || gs, -- Path for each row
CASE WHEN gs % 2 = 0 THEN 'GET' ELSE 'POST' END, -- Alternate between GET and POST methods
'id=' || gs, -- Simple parameter pattern for each row
'{}'::jsonb, -- Empty JSON object for request
jsonb_build_object('status', 'success', 'data', gs) -- JSON response carrying the series number as 'data'
FROM
generate_series(1, 100000) AS gs;

View File

@@ -0,0 +1,26 @@
-- add 100000 rows or approx. 1 MB to the relabeled_transactions table
-- takes about 1 second
-- Everything except category_confidence is derived deterministically from the
-- series value gs.aid (1..100000).
INSERT INTO heron.relabeled_transactions (
id,
created_at,
universal_transaction_id,
raw_result,
category,
category_confidence,
merchant,
batch_id
)
SELECT
gs.aid AS id,
now() - (gs.aid % 1000) * interval '1 second' AS created_at, -- 0..999 seconds before now()
'txn_' || gs.aid AS universal_transaction_id, -- unique per row
'{}'::jsonb AS raw_result, -- empty JSON object
CASE WHEN gs.aid % 5 = 0 THEN 'grocery'
WHEN gs.aid % 5 = 1 THEN 'electronics'
WHEN gs.aid % 5 = 2 THEN 'clothing'
WHEN gs.aid % 5 = 3 THEN 'utilities'
ELSE NULL END AS category, -- four categories; NULL when aid % 5 = 4
ROUND(RANDOM()::numeric, 2) AS category_confidence, -- random value rounded to two decimals
CASE WHEN gs.aid % 2 = 0 THEN 'Merchant_' || gs.aid % 20 ELSE NULL END AS merchant, -- even ids only; odd ids get NULL
gs.aid % 100 + 1 AS batch_id -- cycles through 1..100
FROM generate_series(1, 100000) AS gs(aid);

View File

@@ -0,0 +1,9 @@
-- add 100000 rows or approx. 10 MB to the state_values table
-- takes about 14 seconds
INSERT INTO workflows.state_values (key, workflow_id, state_type, value_id)
SELECT
'key_' || gs::text, -- Key: Generate as 'key_1', 'key_2', etc.
(gs - 1) / 1000 + 1, -- workflow_id: integer division, 1000 consecutive rows per workflow (ids 1..100)
'STATIC', -- state_type: Use constant 'STATIC' as defined in schema
gs::bigint -- value_id: Use the same as the series value
FROM generate_series(1, 100000) AS gs; -- Generate 100,000 rows

View File

@@ -0,0 +1,30 @@
-- add 100000 rows or approx. 24 MB to the values table
-- takes about 126 seconds
--
-- Exactly one of int_value / string_value / bool_value / numeric_value /
-- jsonb_value is populated per row, chosen by `selector` (1..5).
--
-- The selector is computed in the same sub-select that scans generate_series
-- so that random() is evaluated once per row. Written as a separate,
-- uncorrelated one-row sub-select in FROM, random() is evaluated a single
-- time and all 100000 rows end up with the same selector (and therefore the
-- same value type).
INSERT INTO workflows.values (
    id,
    type,
    int_value,
    string_value,
    child_type,
    bool_value,
    uuid,
    numeric_value,
    workflow_id,
    jsonb_value,
    parent_value_id
)
SELECT
    gs AS id,
    'TYPE_A' AS type,
    CASE WHEN selector = 1 THEN gs ELSE NULL END AS int_value,
    CASE WHEN selector = 2 THEN 'string_value_' || gs::text ELSE NULL END AS string_value,
    'CHILD_TYPE_A' AS child_type, -- Always non-null
    CASE WHEN selector = 3 THEN (gs % 2 = 0) ELSE NULL END AS bool_value,
    uuid_generate_v4() AS uuid, -- Always non-null
    CASE WHEN selector = 4 THEN gs * 1.0 ELSE NULL END AS numeric_value,
    (array[1, 2, 3, 4, 5])[gs % 5 + 1] AS workflow_id, -- Use only existing workflow IDs
    CASE WHEN selector = 5 THEN ('{"key":' || gs::text || '}')::jsonb ELSE NULL END AS jsonb_value,
    (gs % 100) + 1 AS parent_value_id -- Always non-null
FROM (
    SELECT
        gs,
        floor(random() * 5 + 1)::int AS selector -- Uniform in 1..5, drawn per row
    FROM generate_series(1, 100000) AS gs
) AS src;

View File

@@ -0,0 +1,26 @@
-- add 100000 rows or approx. 18 MB to the vertices table
-- takes about 90 seconds
-- Each row gets exactly one of condition_block_id (even gs) or
-- action_block_id (odd gs).
INSERT INTO workflows.vertices(
uuid,
created_at,
condition_block_id,
operator,
has_been_visited,
reference_id,
workflow_id,
meta_data,
-- id,
action_block_id
)
SELECT
uuid_generate_v4() AS uuid,
now() AS created_at,
CASE WHEN (gs % 2 = 0) THEN gs % 10 ELSE NULL END AS condition_block_id, -- Every alternate row has a condition_block_id
'operator_' || (gs % 10) AS operator, -- Cyclical operator values (e.g., operator_0, operator_1)
false AS has_been_visited,
'ref_' || gs AS reference_id, -- Unique reference_id for each row
(gs % 1000) + 1 AS workflow_id, -- workflow_id cycling through 1..1000 (deterministic, not random)
'{}'::jsonb AS meta_data, -- Empty JSON metadata
-- gs AS id, -- default from sequence to get unique ID
CASE WHEN (gs % 2 = 1) THEN gs ELSE NULL END AS action_block_id -- Complementary to condition_block_id
FROM generate_series(1, 100000) AS gs;

View File

@@ -0,0 +1,9 @@
-- update approximately 2000 rows or 200 kb in the accounting_coding_body_tracking_category_selection table
-- takes about 1 second
UPDATE accounting.accounting_coding_body_tracking_category_selection
SET created_at = now()
WHERE ctid in ( -- match the sampled rows by physical row location
SELECT ctid
FROM accounting.accounting_coding_body_tracking_category_selection
TABLESAMPLE SYSTEM (0.0005) -- argument is a percentage of the table; whole pages are sampled, so the row count varies per run
);

View File

@@ -0,0 +1,9 @@
-- update approximately 9000 rows or 1 MB in the action_blocks table
-- takes about 1 second
UPDATE workflows.action_blocks
SET run_synchronously = NOT run_synchronously -- flip the flag so the row really changes (NULL stays NULL)
WHERE ctid in ( -- match the sampled rows by physical row location
SELECT ctid
FROM workflows.action_blocks
TABLESAMPLE SYSTEM (0.001) -- argument is a percentage of the table; whole pages are sampled, so the row count varies per run
);

View File

@@ -0,0 +1,9 @@
-- update approximately 5000 rows or 1 MB in the action_kwargs table
-- takes about 1 second
UPDATE workflows.action_kwargs
SET created_at = now()
WHERE ctid in ( -- match the sampled rows by physical row location
SELECT ctid
FROM workflows.action_kwargs
TABLESAMPLE SYSTEM (0.0002) -- argument is a percentage of the table; whole pages are sampled, so the row count varies per run
);

View File

@@ -0,0 +1,10 @@
-- update approximately 3000 rows or 500 KB in the denormalized_approval_workflow table
-- takes about 1 second
UPDATE approvals_v2.denormalized_approval_workflow
SET created_at = now()
WHERE ctid in ( -- match the sampled rows by physical row location
SELECT ctid
FROM approvals_v2.denormalized_approval_workflow
TABLESAMPLE SYSTEM (0.0005) -- argument is a percentage of the table; whole pages are sampled, so the row count varies per run
);

View File

@@ -0,0 +1,9 @@
-- update approximately 2000 rows or 1 MB in the device_fingerprint_event table
-- takes about 5 seconds
UPDATE authentication.device_fingerprint_event
SET is_incognito = NOT is_incognito -- flip the flag so the row really changes (NULL stays NULL)
WHERE ctid in ( -- match the sampled rows by physical row location
SELECT ctid
FROM authentication.device_fingerprint_event
TABLESAMPLE SYSTEM (0.001) -- argument is a percentage of the table; whole pages are sampled, so the row count varies per run
);

View File

@@ -0,0 +1,9 @@
-- update approximately 4000 rows or 600 kb in the edges table
-- takes about 1 second
UPDATE workflows.edges
SET created_at = now()
WHERE ctid in ( -- match the sampled rows by physical row location
SELECT ctid
FROM workflows.edges
TABLESAMPLE SYSTEM (0.0005) -- argument is a percentage of the table; whole pages are sampled, so the row count varies per run
);

View File

@@ -0,0 +1,9 @@
-- update approximately 10000 rows or 200 KB in the heron_transaction_enriched_log table
-- takes about 1 minute
UPDATE heron.heron_transaction_enriched_log
SET created_at = now()
WHERE ctid in ( -- match the sampled rows by physical row location
SELECT ctid
FROM heron.heron_transaction_enriched_log
TABLESAMPLE SYSTEM (0.0005) -- argument is a percentage of the table; whole pages are sampled, so the row count varies per run
);

View File

@@ -0,0 +1,9 @@
-- update approximately 4000 rows or 1 MB in the heron_transaction_enrichment_requests table
-- takes about 2 minutes
UPDATE heron.heron_transaction_enrichment_requests
SET created_at = now()
WHERE ctid in ( -- match the sampled rows by physical row location
SELECT ctid
FROM heron.heron_transaction_enrichment_requests
TABLESAMPLE SYSTEM (0.0002) -- argument is a percentage of the table; whole pages are sampled, so the row count varies per run
);

View File

@@ -0,0 +1,9 @@
-- update approximately 6000 rows or 600 kb in the hotel_rate_mapping table
-- takes about 1 second
UPDATE booking_inventory.hotel_rate_mapping
SET created_at = now()
WHERE ctid in ( -- match the sampled rows by physical row location
SELECT ctid
FROM booking_inventory.hotel_rate_mapping
TABLESAMPLE SYSTEM (0.0005) -- argument is a percentage of the table; whole pages are sampled, so the row count varies per run
);

View File

@@ -0,0 +1,9 @@
-- update approximately 2000 rows or 1 MB in the incoming_webhooks table
-- takes about 5 seconds
UPDATE webhook.incoming_webhooks
SET is_body_encrypted = NOT is_body_encrypted -- flip the flag so the row really changes (NULL stays NULL)
WHERE ctid in ( -- match the sampled rows by physical row location
SELECT ctid
FROM webhook.incoming_webhooks
TABLESAMPLE SYSTEM (0.0002) -- argument is a percentage of the table; whole pages are sampled, so the row count varies per run
);

View File

@@ -0,0 +1,9 @@
-- update approximately 1000 rows or 200 kb in the manual_transaction table
-- takes about 2 seconds
UPDATE banking.manual_transaction
SET created_at = now()
WHERE ctid in ( -- match the sampled rows by physical row location
SELECT ctid
FROM banking.manual_transaction
TABLESAMPLE SYSTEM (0.0005) -- argument is a percentage of the table; whole pages are sampled, so the row count varies per run
);

View File

@@ -0,0 +1,9 @@
-- update approximately 1000 rows or 100 kb in the ml_receipt_matching_log table
-- takes about 1 second
UPDATE receipt.ml_receipt_matching_log
SET is_shadow_mode = NOT is_shadow_mode -- flip the flag so the row really changes (NULL stays NULL)
WHERE ctid in ( -- match the sampled rows by physical row location
SELECT ctid
FROM receipt.ml_receipt_matching_log
TABLESAMPLE SYSTEM (0.0005) -- argument is a percentage of the table; whole pages are sampled, so the row count varies per run
);

View File

@@ -0,0 +1,9 @@
-- update approximately 2000 rows or 400 kb in the ocr_pipeline_results_version table
-- takes about 1 second
UPDATE ocr.ocr_pipeline_results_version
SET is_async = NOT is_async -- flip the flag so the row really changes (NULL stays NULL)
WHERE ctid in ( -- match the sampled rows by physical row location
SELECT ctid
FROM ocr.ocr_pipeline_results_version
TABLESAMPLE SYSTEM (0.0005) -- argument is a percentage of the table; whole pages are sampled, so the row count varies per run
);

View File

@@ -0,0 +1,9 @@
-- update approximately 3000 rows or 1 MB in the ocr_pipeline_step_results table
-- takes about 11 seconds
UPDATE ocr.ocr_pipeline_step_results
SET created_at = now()
WHERE ctid in ( -- match the sampled rows by physical row location
SELECT ctid
FROM ocr.ocr_pipeline_step_results
TABLESAMPLE SYSTEM (0.0005) -- argument is a percentage of the table; whole pages are sampled, so the row count varies per run
);

View File

@@ -0,0 +1,9 @@
-- update approximately 5000 rows or 1 MB in the ocr_pipeline_step_results_version table
-- takes about 40 seconds
UPDATE ocr.ocr_pipeline_step_results_version
SET created_at = now()
WHERE ctid in ( -- match the sampled rows by physical row location
SELECT ctid
FROM ocr.ocr_pipeline_step_results_version
TABLESAMPLE SYSTEM (0.0005) -- argument is a percentage of the table; whole pages are sampled, so the row count varies per run
);

View File

@@ -0,0 +1,9 @@
-- update approximately 5000 rows or 1 MB in the priceline_raw_response table
-- takes about 1 second
UPDATE booking_inventory.priceline_raw_response
SET created_at = now()
WHERE ctid in ( -- match the sampled rows by physical row location
SELECT ctid
FROM booking_inventory.priceline_raw_response
TABLESAMPLE SYSTEM (0.0005) -- argument is a percentage of the table; whole pages are sampled, so the row count varies per run
);

View File

@@ -0,0 +1,9 @@
-- update approximately 5000 rows or 1 MB in the quickbooks_transactions table
-- takes about 30 seconds
UPDATE accounting.quickbooks_transactions
SET created_at = now()
WHERE ctid in ( -- match the sampled rows by physical row location
SELECT ctid
FROM accounting.quickbooks_transactions
TABLESAMPLE SYSTEM (0.0005) -- argument is a percentage of the table; whole pages are sampled, so the row count varies per run
);

View File

@@ -0,0 +1,15 @@
-- update approximately 6000 rows or 600 kb in the raw_finicity_transaction table
-- takes about 1 second
-- Writes the current timestamp under the top-level key "updated" of raw_data.
-- jsonb_set returns NULL on NULL input, so rows whose raw_data is NULL stay NULL.
UPDATE banking.raw_finicity_transaction
SET raw_data =
jsonb_set(
raw_data,
'{updated}', -- path: top-level key "updated"
to_jsonb(now()),
true -- create the key when it does not exist yet
)
WHERE ctid IN ( -- match the sampled rows by physical row location
SELECT ctid
FROM banking.raw_finicity_transaction
TABLESAMPLE SYSTEM (0.0005) -- argument is a percentage of the table; whole pages are sampled, so the row count varies per run
);

View File

@@ -0,0 +1,9 @@
-- update approximately 8000 rows or 1 MB in the relabeled_transactions table
-- takes about 1 second
UPDATE heron.relabeled_transactions
SET created_at = now()
WHERE ctid in ( -- match the sampled rows by physical row location
SELECT ctid
FROM heron.relabeled_transactions
TABLESAMPLE SYSTEM (0.0005) -- argument is a percentage of the table; whole pages are sampled, so the row count varies per run
);

View File

@@ -0,0 +1,9 @@
-- update approximately 8000 rows or 1 MB in the state_values table
-- takes about 2 minutes
-- NOTE(review): the companion insert script describes state_type as the
-- schema-defined constant 'STATIC'; confirm that free-form timestamp text is
-- an acceptable value for this column.
UPDATE workflows.state_values
SET state_type = now()::text
WHERE ctid in ( -- match the sampled rows by physical row location
SELECT ctid
FROM workflows.state_values
TABLESAMPLE SYSTEM (0.0002) -- argument is a percentage of the table; whole pages are sampled, so the row count varies per run
);

View File

@@ -0,0 +1,9 @@
-- update approximately 4000 rows or 1 MB in the stripe_authorization_event_log table
-- takes about 5 minutes
UPDATE stripe.stripe_authorization_event_log
SET approved = NOT approved -- flip the flag so the row really changes (NULL stays NULL)
WHERE ctid in ( -- match the sampled rows by physical row location
SELECT ctid
FROM stripe.stripe_authorization_event_log
TABLESAMPLE SYSTEM (0.0002) -- argument is a percentage of the table; whole pages are sampled, so the row count varies per run
);

View File

@@ -0,0 +1,9 @@
-- update approximately 2000 rows or 301 MB in the transaction table
-- NOTE(review): 301 MB for ~2000 rows is far larger than in the sibling update scripts; confirm the figure
-- takes about 90 seconds
UPDATE transaction.transaction
SET is_last = NOT is_last -- flip the flag so the row really changes (NULL stays NULL)
WHERE ctid in ( -- match the sampled rows by physical row location
SELECT ctid
FROM transaction.transaction
TABLESAMPLE SYSTEM (0.0002) -- argument is a percentage of the table; whole pages are sampled, so the row count varies per run
);

View File

@@ -0,0 +1,9 @@
-- update approximately 2500 rows or 1 MB in the values table
-- takes about 3 minutes
UPDATE workflows.values
SET bool_value = NOT bool_value
WHERE ctid in ( -- match the sampled rows by physical row location
SELECT ctid
FROM workflows.values
TABLESAMPLE SYSTEM (0.0002) -- argument is a percentage of the table; whole pages are sampled, so the row count varies per run
) AND bool_value IS NOT NULL; -- skip rows where NOT NULL would leave the value unchanged

View File

@@ -0,0 +1,9 @@
-- update approximately 10000 rows or 2 MB in the vertices table
-- takes about 1 minute
UPDATE workflows.vertices
SET has_been_visited = NOT has_been_visited -- flip the flag so the row really changes (NULL stays NULL)
WHERE ctid in ( -- match the sampled rows by physical row location
SELECT ctid
FROM workflows.vertices
TABLESAMPLE SYSTEM (0.0002) -- argument is a percentage of the table; whole pages are sampled, so the row count varies per run
);

View File

@@ -146,8 +146,6 @@ def run_benchmark(env: NeonEnv, pg_bin: PgBin, record, duration_secs: int):
ps_http.base_url,
"--page-service-connstring",
env.pageserver.connstr(password=None),
"--gzip-probability",
"1",
"--runtime",
f"{duration_secs}s",
# don't specify the targets explicitly, let pagebench auto-discover them

View File

@@ -31,7 +31,9 @@ def get_custom_scripts(
return rv
def run_test_pgbench(env: PgCompare, custom_scripts: str, duration: int):
def run_test_pgbench(
env: PgCompare, custom_scripts: str, duration: int, clients: int = 500, jobs: int = 100
):
password = env.pg.default_options.get("password", None)
options = env.pg.default_options.get("options", "")
# drop password from the connection string by passing password=None and set password separately
@@ -46,8 +48,8 @@ def run_test_pgbench(env: PgCompare, custom_scripts: str, duration: int):
"-n", # no explicit vacuum before the test - we want to rely on auto-vacuum
"-M",
"prepared",
"--client=500",
"--jobs=100",
f"--client={clients}",
f"--jobs={jobs}",
f"-T{duration}",
"-P60", # progress every minute
"--progress-timestamp",
@@ -164,6 +166,12 @@ def test_perf_oltp_large_tenant_pgbench(
run_test_pgbench(remote_compare, custom_scripts, duration)
@pytest.mark.parametrize("duration", get_durations_matrix())
@pytest.mark.remote_cluster
def test_perf_oltp_large_tenant_growth(remote_compare: PgCompare, duration: int):
    """
    Run the pgbench workload with all default custom scripts against the
    remote cluster, using 35 clients and 35 jobs for `duration` seconds.
    """
    # pgbench receives the scripts as one space-separated string.
    scripts = " ".join(get_custom_scripts())
    run_test_pgbench(remote_compare, scripts, duration, clients=35, jobs=35)
@pytest.mark.remote_cluster
def test_perf_oltp_large_tenant_maintenance(remote_compare: PgCompare):
# run analyze, vacuum, re-index after the test and measure and report its duration

View File

@@ -45,6 +45,8 @@ class NeonEndpoint:
if self.branch.connect_env:
self.connect_env = self.branch.connect_env.copy()
self.connect_env["PGHOST"] = self.host
if self.type == "read_only":
self.project.read_only_endpoints_total += 1
def delete(self):
self.project.delete_endpoint(self.id)
@@ -228,8 +230,13 @@ class NeonProject:
self.benchmarks: dict[str, subprocess.Popen[Any]] = {}
self.restore_num: int = 0
self.restart_pgbench_on_console_errors: bool = False
self.limits: dict[str, Any] = self.get_limits()["limits"]
self.read_only_endpoints_total: int = 0
def delete(self):
def get_limits(self) -> dict[str, Any]:
return self.neon_api.get_project_limits(self.id)
def delete(self) -> None:
self.neon_api.delete_project(self.id)
def create_branch(self, parent_id: str | None = None) -> NeonBranch | None:
@@ -282,6 +289,7 @@ class NeonProject:
self.neon_api.delete_endpoint(self.id, endpoint_id)
self.endpoints[endpoint_id].branch.endpoints.pop(endpoint_id)
self.endpoints.pop(endpoint_id)
self.read_only_endpoints_total -= 1
self.wait()
def start_benchmark(self, target: str, clients: int = 10) -> subprocess.Popen[Any]:
@@ -369,49 +377,64 @@ def setup_class(
print(f"::warning::Retried on 524 error {neon_api.retries524} times")
if neon_api.retries4xx > 0:
print(f"::warning::Retried on 4xx error {neon_api.retries4xx} times")
log.info("Removing the project")
log.info("Removing the project %s", project.id)
project.delete()
def do_action(project: NeonProject, action: str) -> None:
def do_action(project: NeonProject, action: str) -> bool:
"""
Runs the action
"""
log.info("Action: %s", action)
if action == "new_branch":
log.info("Trying to create a new branch")
if 0 <= project.limits["max_branches"] <= len(project.branches):
log.info(
"Maximum branch limit exceeded (%s of %s)",
len(project.branches),
project.limits["max_branches"],
)
return False
parent = project.branches[
random.choice(list(set(project.branches.keys()) - project.reset_branches))
]
log.info("Parent: %s", parent)
child = parent.create_child_branch()
if child is None:
return
return False
log.info("Created branch %s", child)
child.start_benchmark()
elif action == "delete_branch":
if project.leaf_branches:
target = random.choice(list(project.leaf_branches.values()))
target: NeonBranch = random.choice(list(project.leaf_branches.values()))
log.info("Trying to delete branch %s", target)
target.delete()
else:
log.info("Leaf branches not found, skipping")
return False
elif action == "new_ro_endpoint":
if 0 <= project.limits["max_read_only_endpoints"] <= project.read_only_endpoints_total:
log.info(
"Maximum read only endpoint limit exceeded (%s of %s)",
project.read_only_endpoints_total,
project.limits["max_read_only_endpoints"],
)
return False
ep = random.choice(
[br for br in project.branches.values() if br.id not in project.reset_branches]
).create_ro_endpoint()
log.info("Created the RO endpoint with id %s branch: %s", ep.id, ep.branch.id)
ep.start_benchmark()
elif action == "delete_ro_endpoint":
if project.read_only_endpoints_total == 0:
log.info("no read_only endpoints present, skipping")
return False
ro_endpoints: list[NeonEndpoint] = [
endpoint for endpoint in project.endpoints.values() if endpoint.type == "read_only"
]
if ro_endpoints:
target_ep: NeonEndpoint = random.choice(ro_endpoints)
target_ep.delete()
log.info("endpoint %s deleted", target_ep.id)
else:
log.info("no read_only endpoints present, skipping")
target_ep: NeonEndpoint = random.choice(ro_endpoints)
target_ep.delete()
log.info("endpoint %s deleted", target_ep.id)
elif action == "restore_random_time":
if project.leaf_branches:
br: NeonBranch = random.choice(list(project.leaf_branches.values()))
@@ -419,8 +442,10 @@ def do_action(project: NeonProject, action: str) -> None:
br.restore_random_time()
else:
log.info("No leaf branches found")
return False
else:
raise ValueError(f"The action {action} is unknown")
return True
@pytest.mark.timeout(7200)
@@ -457,8 +482,9 @@ def test_api_random(
pg_bin.run(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=project.main_branch.connect_env)
for _ in range(num_operations):
log.info("Starting action #%s", _ + 1)
do_action(
while not do_action(
project, random.choices([a[0] for a in ACTIONS], weights=[w[1] for w in ACTIONS])[0]
)
):
log.info("Retrying...")
project.check_all_benchmarks()
assert True

View File

@@ -184,7 +184,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
"timeline_offloading": False,
"rel_size_v2_enabled": True,
"relsize_snapshot_cache_capacity": 10000,
"gc_compaction_enabled": True,
"gc_compaction_enabled": False,
"gc_compaction_verification": False,
"gc_compaction_initial_threshold_kb": 1024000,
"gc_compaction_ratio_percent": 200,

View File

@@ -26,6 +26,10 @@ def test_basebackup_cache(neon_env_builder: NeonEnvBuilder):
ps = env.pageserver
ps_http = ps.http_client()
storcon_managed_timelines = (env.storage_controller_config or {}).get(
"timelines_onto_safekeepers", False
)
# 1. Check that we always hit the cache after compute restart.
for i in range(3):
ep.start()
@@ -33,15 +37,26 @@ def test_basebackup_cache(neon_env_builder: NeonEnvBuilder):
def check_metrics(i=i):
metrics = ps_http.get_metrics()
# Never miss.
# The first time compute_ctl sends `get_basebackup` with lsn=None, we do not cache such requests.
# All other requests should be a hit
assert (
metrics.query_one(
"pageserver_basebackup_cache_read_total", {"result": "miss"}
).value
== 0
)
if storcon_managed_timelines:
# We do not cache the initial basebackup yet,
# so the first compute startup should be a miss.
assert (
metrics.query_one(
"pageserver_basebackup_cache_read_total", {"result": "miss"}
).value
== 1
)
else:
# If the timeline is not initialized on safekeepers,
# the compute_ctl sends `get_basebackup` with lsn=None for the first startup.
# We do not use cache for such requests, so it's neither a hit nor a miss.
assert (
metrics.query_one(
"pageserver_basebackup_cache_read_total", {"result": "miss"}
).value
== 0
)
# All but the first requests are hits.
assert (
metrics.query_one("pageserver_basebackup_cache_read_total", {"result": "hit"}).value
@@ -54,6 +69,11 @@ def test_basebackup_cache(neon_env_builder: NeonEnvBuilder):
).value
== i + 1
)
# There should be only one basebackup file in the cache.
assert metrics.query_one("pageserver_basebackup_cache_entries_total").value == 1
# The size of one basebackup for new DB is ~20KB.
size_bytes = metrics.query_one("pageserver_basebackup_cache_size_bytes").value
assert 10 * 1024 <= size_bytes <= 100 * 1024
wait_until(check_metrics)

View File

@@ -11,6 +11,7 @@ from fixtures.common_types import Lsn, TimelineId
from fixtures.log_helper import log
from fixtures.pageserver.http import PageserverApiException
from fixtures.pageserver.utils import wait_until_tenant_active
from fixtures.safekeeper.http import MembershipConfiguration, TimelineCreateRequest
from fixtures.utils import query_scalar
from performance.test_perf_pgbench import get_scales_matrix
from requests import RequestException
@@ -164,6 +165,19 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE
ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
env.pageserver.tenant_create(env.initial_tenant)
sk = env.safekeepers[0]
assert sk
sk.http_client().timeline_create(
TimelineCreateRequest(
env.initial_tenant,
env.initial_timeline,
MembershipConfiguration(generation=1, members=[sk.safekeeper_id()], new_members=None),
int(env.pg_version) * 10000,
Lsn(0),
None,
)
)
initial_branch = "initial_branch"
def start_creating_timeline():

View File

@@ -18,6 +18,8 @@ from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
PgBin,
Safekeeper,
StorageControllerApiException,
flush_ep_to_pageserver,
)
from fixtures.pageserver.http import PageserverApiException
@@ -26,6 +28,7 @@ from fixtures.pageserver.utils import (
)
from fixtures.pg_version import PgVersion
from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage
from fixtures.safekeeper.http import MembershipConfiguration
from fixtures.workload import Workload
if TYPE_CHECKING:
@@ -125,6 +128,12 @@ check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif(
reason="CHECK_ONDISK_DATA_COMPATIBILITY env is not set",
)
skip_old_debug_versions = pytest.mark.skipif(
os.getenv("BUILD_TYPE", "debug") == "debug"
and os.getenv("DEFAULT_PG_VERSION") in [PgVersion.V14, PgVersion.V15, PgVersion.V16],
reason="compatibility snaphots not available for old versions of debug builds",
)
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(before="test_forward_compatibility")
@@ -195,6 +204,7 @@ ingest_lag_log_line = ".*ingesting record with timestamp lagging more than wait_
@check_ondisk_data_compatibility_if_enabled
@skip_old_debug_versions
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(after="test_create_snapshot")
def test_backward_compatibility(
@@ -222,6 +232,7 @@ def test_backward_compatibility(
@check_ondisk_data_compatibility_if_enabled
@skip_old_debug_versions
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(after="test_create_snapshot")
def test_forward_compatibility(
@@ -291,7 +302,20 @@ def test_forward_compatibility(
def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path):
ep = env.endpoints.create("main")
ep_env = {"LD_LIBRARY_PATH": str(env.pg_distrib_dir / f"v{env.pg_version}/lib")}
ep.start(env=ep_env)
# If the compatibility snapshot was created with --timelines-onto-safekeepers=false,
# we should not pass safekeeper_generation to the endpoint because the compute
# will not be able to start.
# Zero generation is INVALID_GENERATION.
generation = 0
try:
res = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
generation = res["generation"]
except StorageControllerApiException as e:
if e.status_code != 404 or not re.search(r"Timeline .* not found", str(e)):
raise e
ep.start(env=ep_env, safekeeper_generation=generation)
connstr = ep.connstr()
@@ -341,7 +365,7 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r
)
# Timeline exists again: restart the endpoint
ep.start(env=ep_env)
ep.start(env=ep_env, safekeeper_generation=generation)
pg_bin.run_capture(
["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"]
@@ -542,6 +566,24 @@ def test_historic_storage_formats(
# All our artifacts should contain at least one timeline
assert len(timelines) > 0
# Import tenant does not create the timeline on safekeepers,
# because it is a debug handler and the timeline may have already been
# created on some set of safekeepers.
# Create the timeline on safekeepers manually.
# TODO(diko): when we have the script/storcon handler to migrate
# the timeline to storcon, we can replace this code with it.
mconf = MembershipConfiguration(
generation=1,
members=Safekeeper.sks_to_safekeeper_ids([env.safekeepers[0]]),
new_members=None,
)
members_sks = Safekeeper.mconf_sks(env, mconf)
for timeline in timelines:
Safekeeper.create_timeline(
dataset.tenant_id, timeline["timeline_id"], env.pageserver, mconf, members_sks
)
# TODO: ensure that the snapshots we're importing contain a sensible variety of content, at the very
# least they should include a mixture of deltas and image layers. Preferably they should also
# contain some "exotic" stuff like aux files from logical replication.
@@ -573,6 +615,7 @@ def test_historic_storage_formats(
@check_ondisk_data_compatibility_if_enabled
@skip_old_debug_versions
@pytest.mark.xdist_group("compatibility")
@pytest.mark.parametrize(
**fixtures.utils.allpairs_versions(),

View File

@@ -418,7 +418,7 @@ def test_sql_exporter_metrics_e2e(
pg_user = conn_options["user"]
pg_dbname = conn_options["dbname"]
pg_application_name = f"sql_exporter{stem_suffix}"
connstr = f"postgresql://{pg_user}@{pg_host}:{pg_port}/{pg_dbname}?sslmode=disable&application_name={pg_application_name}"
connstr = f"postgresql://{pg_user}@{pg_host}:{pg_port}/{pg_dbname}?sslmode=disable&application_name={pg_application_name}&pgaudit.log=none"
def escape_go_filepath_match_characters(s: str) -> str:
"""

View File

@@ -9,6 +9,8 @@ from fixtures.utils import wait_until
if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnv
from fixtures.log_helper import log
def test_compute_reconfigure(neon_simple_env: NeonEnv):
"""
@@ -85,3 +87,57 @@ def test_compute_reconfigure(neon_simple_env: NeonEnv):
samples = metrics.query_all("compute_ctl_up", {"build_tag": build_tag})
assert len(samples) == 1
assert samples[0].value == 1
def test_compute_safekeeper_connstrings_duplicate(neon_simple_env: NeonEnv):
    """
    Test that we catch duplicate entries in neon.safekeepers.

    Reads the current value of neon.safekeepers from a running endpoint,
    builds a new value that lists it twice, pushes that value through the
    compute spec and then asks the endpoint to reconfigure.
    """
    env = neon_simple_env

    endpoint = env.endpoints.create_start("main")

    # grab the current value of neon.safekeepers
    with endpoint.cursor() as cursor:
        cursor.execute("SHOW neon.safekeepers;")
        row = cursor.fetchone()
        assert row is not None

        log.info(f' initial neon.safekeepers: "{row}"')

    # build a safekeepers list with a duplicate
    sk_list = [row[0], row[0]]

    safekeepers = ",".join(sk_list)
    log.info(f'reconfigure neon.safekeepers: "{safekeepers}"')

    # introduce duplicate entry in neon.safekeepers, on purpose
    endpoint.respec_deep(
        **{
            "spec": {
                "skip_pg_catalog_updates": True,
                "cluster": {
                    "settings": [
                        {
                            "name": "neon.safekeepers",
                            "vartype": "string",
                            "value": safekeepers,
                        }
                    ]
                },
            },
        }
    )

    try:
        endpoint.reconfigure()

        # Check that in logs we see that it was actually reconfigured,
        # not restarted or something else.
        endpoint.log_contains("INFO request{method=POST uri=/configure")

    except Exception as e:
        # we expect a failure here
        # NOTE(review): the test passes whether or not reconfigure raises;
        # the exception is only logged.
        #
        # The message is already fully formatted by the f-string. Applying
        # `% e` on top of it raised TypeError ("not all arguments converted")
        # and masked the original error.
        log.info(f"RAISED: {e}")

View File

@@ -0,0 +1,51 @@
from __future__ import annotations
from typing import TYPE_CHECKING
from fixtures.utils import run_only_on_default_postgres
if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnvBuilder
@run_only_on_default_postgres("Pageserver-only test only needs to run on one version")
def test_feature_flag(neon_env_builder: NeonEnvBuilder):
    """
    Exercise the pageserver feature-flag override endpoint.

    The flag is forced to "true", then to "false", then the override is
    cleared (None). After each step both the boolean and the multivariate
    evaluation results are checked.
    """
    flag = "test-feature-flag"
    env = neon_env_builder.init_start()

    # Each helper asks for a fresh HTTP client, exactly one per request.
    def override(value):
        env.pageserver.http_client().force_override_feature_flag(flag, value)

    def boolean_result():
        response = env.pageserver.http_client().evaluate_feature_flag_boolean(
            env.initial_tenant, flag
        )
        return response["result"]

    def multivariate_result():
        response = env.pageserver.http_client().evaluate_feature_flag_multivariate(
            env.initial_tenant, flag
        )
        return response["result"]

    # Override to "true": both evaluations succeed.
    override("true")
    assert boolean_result()["Ok"]
    assert multivariate_result()["Ok"] == "true"

    # Override to "false": boolean evaluation reports no matching group,
    # multivariate evaluation returns the raw value.
    override("false")
    assert boolean_result()["Err"] == "No condition group is matched"
    assert multivariate_result()["Ok"] == "false"

    # Override removed: both evaluations report an error.
    override(None)
    assert "Err" in boolean_result()
    assert "Err" in multivariate_result()

View File

@@ -87,6 +87,9 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
# Set up pageserver for import
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
neon_env_builder.storage_controller_config = {
"timelines_onto_safekeepers": True,
}
env = neon_env_builder.init_start()
env.pageserver.tenant_create(tenant)

View File

@@ -59,7 +59,7 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, query: LfcQueryMethod):
pg_conn = endpoint.connect()
pg_cur = pg_conn.cursor()
pg_cur.execute("create extension neon version '1.6'")
pg_cur.execute("create extension neon")
pg_cur.execute("create database lfc")
lfc_conn = endpoint.connect(dbname="lfc")
@@ -84,11 +84,8 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, query: LfcQueryMethod):
endpoint.stop()
endpoint.start()
# wait until compute_ctl completes downgrade of extension to default version
time.sleep(1)
pg_conn = endpoint.connect()
pg_cur = pg_conn.cursor()
pg_cur.execute("alter extension neon update to '1.6'")
lfc_conn = endpoint.connect(dbname="lfc")
lfc_cur = lfc_conn.cursor()
@@ -144,7 +141,7 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
pg_conn = endpoint.connect()
pg_cur = pg_conn.cursor()
pg_cur.execute("create extension neon version '1.6'")
pg_cur.execute("create extension neon")
pg_cur.execute("CREATE DATABASE lfc")
lfc_conn = endpoint.connect(dbname="lfc")
@@ -188,7 +185,8 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet
pg_cur.execute("select pg_reload_conf()")
if query is LfcQueryMethod.COMPUTE_CTL:
http_client.prewarm_lfc()
# Same thing as prewarm_lfc(), testing other method
http_client.prewarm_lfc(endpoint.endpoint_id)
else:
pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,))

View File

@@ -29,7 +29,7 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder):
# IMPORTANT:
# If the version has changed, the test should be updated.
# Ensure that the default version is also updated in the neon.control file
assert cur.fetchone() == ("1.5",)
assert cur.fetchone() == ("1.6",)
cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE")
res = cur.fetchall()
log.info(res)
@@ -53,10 +53,10 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder):
# IMPORTANT:
# If the version has changed, the test should be updated.
# Ensure that the default version is also updated in the neon.control file
assert cur.fetchone() == ("1.5",)
assert cur.fetchone() == ("1.6",)
cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE")
all_versions = ["1.5", "1.4", "1.3", "1.2", "1.1", "1.0"]
current_version = "1.5"
all_versions = ["1.6", "1.5", "1.4", "1.3", "1.2", "1.1", "1.0"]
current_version = "1.6"
for idx, begin_version in enumerate(all_versions):
for target_version in all_versions[idx + 1 :]:
if current_version != begin_version:

View File

@@ -64,6 +64,11 @@ def test_normal_work(
"""
neon_env_builder.num_safekeepers = num_safekeepers
if safekeeper_proto_version == 2:
neon_env_builder.storage_controller_config = {
"timelines_onto_safekeepers": False,
}
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()

View File

@@ -16,7 +16,7 @@ if TYPE_CHECKING:
# Test restarting page server, while safekeeper and compute node keep
# running.
def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgBin):
def test_pageserver_restarts_under_workload(neon_simple_env: NeonEnv, pg_bin: PgBin):
env = neon_simple_env
env.create_branch("test_pageserver_restarts")
endpoint = env.endpoints.create_start("test_pageserver_restarts")
@@ -28,7 +28,11 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB
pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", connstr])
pg_bin.run_capture(["pgbench", f"-T{n_restarts}", connstr])
thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True)
thread = threading.Thread(
target=run_pgbench,
args=(endpoint.connstr(options="-cstatement_timeout=360s"),),
daemon=True,
)
thread.start()
for _ in range(n_restarts):

View File

@@ -173,7 +173,11 @@ def test_pg_regress(
(runpath / "testtablespace").mkdir(parents=True)
# Compute all the file locations that pg_regress will need.
build_path = pg_distrib_dir / f"build/{env.pg_version.v_prefixed}/src/test/regress"
#
# XXX: We assume that the `build` directory is a sibling of the
# pg_distrib_dir. That is the default when you check out the
# repository; `build` and `pg_install` are created side by side.
build_path = pg_distrib_dir / f"../build/{env.pg_version.v_prefixed}/src/test/regress"
src_path = base_dir / f"vendor/postgres-{env.pg_version.v_prefixed}/src/test/regress"
bindir = pg_distrib_dir / f"v{env.pg_version}/bin"
schedule = src_path / "parallel_schedule"
@@ -250,7 +254,11 @@ def test_isolation(
(runpath / "testtablespace").mkdir(parents=True)
# Compute all the file locations that pg_isolation_regress will need.
build_path = pg_distrib_dir / f"build/{env.pg_version.v_prefixed}/src/test/isolation"
#
# XXX: We assume that the `build` directory is a sibling of the
# pg_distrib_dir. That is the default when you check out the
# repository; `build` and `pg_install` are created side by side.
build_path = pg_distrib_dir / f"../build/{env.pg_version.v_prefixed}/src/test/isolation"
src_path = base_dir / f"vendor/postgres-{env.pg_version.v_prefixed}/src/test/isolation"
bindir = pg_distrib_dir / f"v{env.pg_version}/bin"
schedule = src_path / "isolation_schedule"
@@ -306,13 +314,7 @@ def test_sql_regress(
)
# Connect to postgres and create a database called "regression".
endpoint = env.endpoints.create_start(
"main",
config_lines=[
# Enable the test mode, so that we don't need to patch the test cases.
"neon.regress_test_mode = true",
],
)
endpoint = env.endpoints.create_start("main")
endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")
# Create some local directories for pg_regress to run in.
@@ -320,8 +322,11 @@ def test_sql_regress(
(runpath / "testtablespace").mkdir(parents=True)
# Compute all the file locations that pg_regress will need.
# This test runs neon specific tests
build_path = pg_distrib_dir / f"build/v{env.pg_version}/src/test/regress"
#
# XXX: We assume that the `build` directory is a sibling of the
# pg_distrib_dir. That is the default when you check out the
# repository; `build` and `pg_install` are created side by side.
build_path = pg_distrib_dir / f"../build/{env.pg_version.v_prefixed}/src/test/regress"
src_path = base_dir / "test_runner/sql_regress"
bindir = pg_distrib_dir / f"v{env.pg_version}/bin"
schedule = src_path / "parallel_schedule"

View File

@@ -19,11 +19,15 @@ TABLE_NAME = "neon_control_plane.endpoints"
async def test_proxy_psql_allowed_ips(static_proxy: NeonProxy, vanilla_pg: VanillaPostgres):
# Shouldn't be able to connect to this project
vanilla_pg.safe_psql(
f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('private-project', '8.8.8.8')"
f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('private-project', '8.8.8.8')",
user="proxy",
password="password",
)
# Should be able to connect to this project
vanilla_pg.safe_psql(
f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('generic-project', '::1,127.0.0.1')"
f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('generic-project', '::1,127.0.0.1')",
user="proxy",
password="password",
)
def check_cannot_connect(**kwargs):
@@ -60,7 +64,9 @@ async def test_proxy_http_allowed_ips(static_proxy: NeonProxy, vanilla_pg: Vanil
# Shouldn't be able to connect to this project
vanilla_pg.safe_psql(
f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('proxy', '8.8.8.8')"
f"INSERT INTO {TABLE_NAME} (endpoint_id, allowed_ips) VALUES ('proxy', '8.8.8.8')",
user="proxy",
password="password",
)
def query(status: int, query: str, *args):
@@ -75,6 +81,8 @@ async def test_proxy_http_allowed_ips(static_proxy: NeonProxy, vanilla_pg: Vanil
query(400, "select 1;") # ip address is not allowed
# Should be able to connect to this project
vanilla_pg.safe_psql(
f"UPDATE {TABLE_NAME} SET allowed_ips = '8.8.8.8,127.0.0.1' WHERE endpoint_id = 'proxy'"
f"UPDATE {TABLE_NAME} SET allowed_ips = '8.8.8.8,127.0.0.1' WHERE endpoint_id = 'proxy'",
user="proxy",
password="password",
)
query(200, "select 1;") # should work now

View File

@@ -4,13 +4,25 @@ File with secondary->primary promotion testing.
This far, only contains a test that we don't break and that the data is persisted.
"""
from typing import cast
import psycopg2
from fixtures.common_types import Lsn
from fixtures.log_helper import log
from fixtures.neon_fixtures import Endpoint, NeonEnv, wait_replica_caughtup
from fixtures.pg_version import PgVersion
from pytest import raises
def stop_and_check_lsn(ep: Endpoint, expected_lsn: Lsn | None):
    """
    Stop the endpoint in "immediate-terminate" mode and check its flush LSN.

    After stopping, `ep.terminate_flush_lsn` is compared with `expected_lsn`:

    - if `expected_lsn` is None, no flush LSN must have been reported;
    - otherwise a flush LSN must be present and must be at least
      `expected_lsn`.

    Raises AssertionError when the check fails.
    """
    ep.stop(mode="immediate-terminate")
    lsn = ep.terminate_flush_lsn

    if expected_lsn is None:
        assert lsn is None, f"{expected_lsn=} != {lsn=}"
        return

    # Fail with a readable message instead of a TypeError from comparing
    # None with an LSN.
    assert lsn is not None, f"{expected_lsn=} but no flush LSN was reported"
    # The message states the relation that actually holds on failure.
    assert lsn >= expected_lsn, f"{lsn=} < {expected_lsn=}"
def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
"""
Test that a replica safely promotes, and can commit data updates which
@@ -37,7 +49,9 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
pg_current_wal_flush_lsn()
"""
)
log.info(f"Primary: Current LSN after workload is {primary_cur.fetchone()}")
lsn_triple = cast("tuple[str, str, str]", primary_cur.fetchone())
log.info(f"Primary: Current LSN after workload is {lsn_triple}")
expected_primary_lsn: Lsn = Lsn(lsn_triple[2])
primary_cur.execute("show neon.safekeepers")
safekeepers = primary_cur.fetchall()[0][0]
@@ -57,7 +71,7 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
secondary_cur.execute("select count(*) from t")
assert secondary_cur.fetchone() == (100,)
primary.stop_and_destroy(mode="immediate")
stop_and_check_lsn(primary, expected_primary_lsn)
# Reconnect to the secondary to make sure we get a read-write connection
promo_conn = secondary.connect()
@@ -109,9 +123,10 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
# wait_for_last_flush_lsn(env, secondary, env.initial_tenant, env.initial_timeline)
secondary.stop_and_destroy()
# secondaries don't sync safekeepers on finish so LSN will be None
stop_and_check_lsn(secondary, None)
primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary")
primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary2")
with primary.connect() as new_primary:
new_primary_cur = new_primary.cursor()
@@ -122,7 +137,9 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
pg_current_wal_flush_lsn()
"""
)
log.info(f"New primary: Boot LSN is {new_primary_cur.fetchone()}")
lsn_triple = cast("tuple[str, str, str]", new_primary_cur.fetchone())
expected_primary_lsn = Lsn(lsn_triple[2])
log.info(f"New primary: Boot LSN is {lsn_triple}")
new_primary_cur.execute("select count(*) from t")
assert new_primary_cur.fetchone() == (200,)
@@ -130,4 +147,4 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
new_primary_cur.execute("select count(*) from t")
assert new_primary_cur.fetchone() == (300,)
primary.stop(mode="immediate")
stop_and_check_lsn(primary, expected_primary_lsn)

View File

@@ -74,7 +74,7 @@ def test_tenant_s3_restore(
last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
last_flush_lsns.append(last_flush_lsn)
ps_http.timeline_checkpoint(tenant_id, timeline_id)
wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn, timeout=60)
log.info(f"{timeline} timeline {timeline_id} {last_flush_lsn=}")
parent = timeline

View File

@@ -30,6 +30,7 @@ def test_safekeeper_delete_timeline(neon_env_builder: NeonEnvBuilder, auth_enabl
env.pageserver.allowed_errors.extend(
[
".*Timeline .* was not found in global map.*",
".*Timeline .* has been deleted.*",
".*Timeline .* was cancelled and cannot be used anymore.*",
]
)
@@ -198,6 +199,7 @@ def test_safekeeper_delete_timeline_under_load(neon_env_builder: NeonEnvBuilder)
env.pageserver.allowed_errors.extend(
[
".*Timeline.*was cancelled.*",
".*Timeline.*has been deleted.*",
".*Timeline.*was not found.*",
]
)

View File

@@ -1337,7 +1337,7 @@ def test_sharding_split_failures(
# Create bystander tenants with various shard counts. They should not be affected by the aborted
# splits. Regression test for https://github.com/neondatabase/cloud/issues/28589.
bystanders = {} # id → shard_count
for bystander_shard_count in [1, 2, 4, 8]:
for bystander_shard_count in [1, 2, 4]:
id, _ = env.create_tenant(shard_count=bystander_shard_count)
bystanders[id] = bystander_shard_count
@@ -1358,6 +1358,8 @@ def test_sharding_split_failures(
".*Reconcile error.*Cancelled.*",
# While parent shard's client is stopped during split, flush loop updating LSNs will emit this warning
".*Failed to schedule metadata upload after updating disk_consistent_lsn.*",
# We didn't identify a secondary to remove.
".*Keeping extra secondaries.*",
]
)
@@ -1388,51 +1390,36 @@ def test_sharding_split_failures(
with pytest.raises(failure.expect_exception()):
env.storage_controller.tenant_shard_split(tenant_id, shard_count=4)
def assert_shard_count(shard_count: int, exclude_ps_id: int | None = None) -> None:
    """
    Assert that the tenant under test is seen with `shard_count` shards.

    Walks every pageserver except `exclude_ps_id` (when given), checks that
    each location belonging to the tenant reports `shard_count`, and then
    checks that there are exactly `shard_count` secondary and `shard_count`
    non-secondary locations overall.
    """
    log.info(f"Iterating over {len(env.pageservers)} pageservers to check shard count")

    inspected = [
        ps for ps in env.pageservers if exclude_ps_id is None or ps.id != exclude_ps_id
    ]

    # Mode of every location that belongs to the tenant under test.
    modes_seen = []
    for ps in inspected:
        for loc in ps.http_client().tenant_list_locations()["tenant_shards"]:
            tenant_shard_id = TenantShardId.parse(loc[0])
            if tenant_shard_id.tenant_id == tenant_id:
                log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
                assert tenant_shard_id.shard_count == shard_count
                modes_seen.append(loc[1]["mode"])
            # anything else is a bystander tenant and is ignored

    secondary_count = modes_seen.count("Secondary")
    attached_count = len(modes_seen) - secondary_count

    assert secondary_count == shard_count
    assert attached_count == shard_count
# We expect that the overall operation will fail, but some split requests
# will have succeeded: the net result should be to return to a clean state, including
# detaching any child shards.
def assert_rolled_back(exclude_ps_id=None) -> None:
secondary_count = 0
attached_count = 0
for ps in env.pageservers:
if exclude_ps_id is not None and ps.id == exclude_ps_id:
continue
locations = ps.http_client().tenant_list_locations()["tenant_shards"]
for loc in locations:
tenant_shard_id = TenantShardId.parse(loc[0])
if tenant_shard_id.tenant_id != tenant_id:
continue # skip bystanders
log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
assert tenant_shard_id.shard_count == initial_shard_count
if loc[1]["mode"] == "Secondary":
secondary_count += 1
else:
attached_count += 1
assert secondary_count == initial_shard_count
assert attached_count == initial_shard_count
assert_shard_count(initial_shard_count, exclude_ps_id)
def assert_split_done(exclude_ps_id: int | None = None) -> None:
secondary_count = 0
attached_count = 0
for ps in env.pageservers:
if exclude_ps_id is not None and ps.id == exclude_ps_id:
continue
locations = ps.http_client().tenant_list_locations()["tenant_shards"]
for loc in locations:
tenant_shard_id = TenantShardId.parse(loc[0])
if tenant_shard_id.tenant_id != tenant_id:
continue # skip bystanders
log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
assert tenant_shard_id.shard_count == split_shard_count
if loc[1]["mode"] == "Secondary":
secondary_count += 1
else:
attached_count += 1
assert attached_count == split_shard_count
assert secondary_count == split_shard_count
assert_shard_count(split_shard_count, exclude_ps_id)
def finish_split():
# Having failed+rolled back, we should be able to split again
@@ -1468,6 +1455,7 @@ def test_sharding_split_failures(
# The split should appear to be rolled back from the point of view of all pageservers
# apart from the one that is offline
env.storage_controller.reconcile_until_idle(timeout_secs=60, max_interval=2)
wait_until(lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id))
finish_split()
@@ -1482,6 +1470,7 @@ def test_sharding_split_failures(
log.info("Clearing failure...")
failure.clear(env)
env.storage_controller.reconcile_until_idle(timeout_secs=60, max_interval=2)
wait_until(assert_rolled_back)
# Having rolled back, the tenant should be working
@@ -1836,3 +1825,90 @@ def test_sharding_gc(
shard_gc_cutoff_lsn = Lsn(shard_index["metadata_bytes"]["latest_gc_cutoff_lsn"])
log.info(f"Shard {shard_number} cutoff LSN: {shard_gc_cutoff_lsn}")
assert shard_gc_cutoff_lsn == shard_0_gc_cutoff_lsn
def test_split_ps_delete_old_shard_after_commit(neon_env_builder: NeonEnvBuilder):
    """
    Check that PageServer only deletes old shards after the split is committed such that it doesn't
    have to download a lot of files during abort.

    The test writes data to a sharded tenant, injects a failure into a shard split, waits for the
    storage controller to report the original shards as settled again, and then verifies that
    the per-pageserver on-demand download counters did not move.
    """
    DBNAME = "regression"

    # Shard count the tenant starts with and must return to after the aborted split.
    init_shard_count = 4
    # Shard count requested by the split that is made to fail.
    split_shard_count = 16
    neon_env_builder.num_pageservers = init_shard_count
    stripe_size = 32

    env = neon_env_builder.init_start(
        initial_tenant_shard_count=init_shard_count, initial_tenant_shard_stripe_size=stripe_size
    )

    env.storage_controller.allowed_errors.extend(
        [
            # All split failures log a warning when they enqueue the abort operation
            ".*Enqueuing background abort.*",
            # Tolerate any error logs that mention a failpoint
            ".*failpoint.*",
        ]
    )

    endpoint = env.endpoints.create("main")
    endpoint.respec(skip_pg_catalog_updates=False)
    endpoint.start()

    # Write some initial data.
    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")
    endpoint.safe_psql("CREATE TABLE usertable ( YCSB_KEY INT, FIELD0 TEXT);")

    for _ in range(1000):
        endpoint.safe_psql(
            "INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False
        )

    # Record how many bytes we've downloaded before the split.
    def collect_downloaded_bytes() -> list[float | None]:
        """Return the on-demand download byte counter of every pageserver, in env order."""
        return [
            page_server.http_client().get_metric_value(
                "pageserver_remote_ondemand_downloaded_bytes_total"
            )
            for page_server in env.pageservers
        ]

    downloaded_bytes_before = collect_downloaded_bytes()

    # Attempt to split the tenant, but fail the split before it completes.
    env.storage_controller.configure_failpoints(("shard-split-pre-complete", "return(1)"))
    with pytest.raises(StorageControllerApiException):
        env.storage_controller.tenant_shard_split(
            env.initial_tenant, shard_count=split_shard_count
        )

    # Wait until split is aborted.
    def check_split_is_aborted():
        """Assert that only the original shards remain, both in storcon and on disk."""
        tenants = env.storage_controller.tenant_list()
        assert len(tenants) == 1
        shards = tenants[0]["shards"]
        assert len(shards) == init_shard_count
        for shard in shards:
            assert not shard["is_splitting"]
            assert not shard["is_reconciling"]

        # Make sure all new shards have been deleted.
        valid_shards = 0
        for ps in env.pageservers:
            for tenant_dir in os.listdir(ps.workdir / "tenants"):
                # Only the parse can raise ValueError; keep the assertion outside the try so
                # that the handler cannot be mistaken for covering it.
                try:
                    tenant_shard_id = TenantShardId.parse(tenant_dir)
                except ValueError:
                    log.info(f"{tenant_dir} is not valid tenant shard id")
                    continue
                valid_shards += 1
                assert tenant_shard_id.shard_count == init_shard_count
        assert valid_shards >= init_shard_count

    wait_until(check_split_is_aborted)

    endpoint.safe_psql("SELECT count(*) from usertable;", log_query=False)

    # Make sure we didn't download anything following the aborted split.
    downloaded_bytes_after = collect_downloaded_bytes()

    assert downloaded_bytes_before == downloaded_bytes_after
    endpoint.stop_and_destroy()

View File

@@ -88,6 +88,12 @@ def test_storage_controller_smoke(
neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api
env = neon_env_builder.init_configs()
# These bubble up from safekeepers
for ps in env.pageservers:
ps.allowed_errors.extend(
[".*Timeline.* has been deleted.*", ".*Timeline.*was cancelled and cannot be used"]
)
# Start services by hand so that we can skip a pageserver (this will start + register later)
env.broker.start()
env.storage_controller.start()
@@ -2956,7 +2962,7 @@ def test_storage_controller_leadership_transfer_during_split(
env.storage_controller.allowed_errors.extend(
[".*Unexpected child shard count.*", ".*Enqueuing background abort.*"]
)
pause_failpoint = "shard-split-pre-complete"
pause_failpoint = "shard-split-pre-complete-pause"
env.storage_controller.configure_failpoints((pause_failpoint, "pause"))
split_fut = executor.submit(
@@ -3003,7 +3009,7 @@ def test_storage_controller_leadership_transfer_during_split(
env.storage_controller.request(
"PUT",
f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints",
json=[{"name": "shard-split-pre-complete", "actions": "off"}],
json=[{"name": pause_failpoint, "actions": "off"}],
headers=env.storage_controller.headers(TokenScope.ADMIN),
)
@@ -3093,6 +3099,58 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvB
wait_until(reconfigure_node_again)
def test_ps_unavailable_after_delete(neon_env_builder: NeonEnvBuilder):
    """
    Check that a deleted pageserver does not reappear in the storage controller's node list
    while its tombstone exists, and that it can register again once the tombstone is deleted.
    """
    neon_env_builder.num_pageservers = 3

    env = neon_env_builder.init_start()

    def assert_nodes_count(n: int):
        """Assert that the storage controller currently lists exactly `n` nodes."""
        nodes = env.storage_controller.node_list()
        assert len(nodes) == n

    # Nodes count must remain the same before deletion
    assert_nodes_count(3)

    ps = env.pageservers[0]
    env.storage_controller.node_delete(ps.id)

    # After deletion, the node count must be reduced
    assert_nodes_count(2)

    # Running pageserver CLI init in a separate thread
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        log.info("Restarting tombstoned pageserver...")
        ps.stop()
        ps_start_fut = executor.submit(ps.start, await_active=False)

        # After deleted pageserver restart, the node count must remain the same
        assert_nodes_count(2)

        tombstones = env.storage_controller.tombstone_list()
        # Separate assertions so that a failure shows which condition was violated.
        assert len(tombstones) == 1
        assert tombstones[0]["id"] == ps.id

        env.storage_controller.tombstone_delete(ps.id)

        tombstones = env.storage_controller.tombstone_list()
        assert len(tombstones) == 0

        # Wait for the pageserver start operation to complete.
        # If it fails with an exception, we try restarting the pageserver since the failure
        # may be due to the storage controller refusing to register the node.
        # However, if we get a TimeoutError that means the pageserver is completely hung,
        # which is an unexpected failure mode that we'll let propagate up.
        #
        # `Future.result` raises `concurrent.futures.TimeoutError`; it is only an alias of the
        # builtin `TimeoutError` since Python 3.11, so name it explicitly.
        #
        # NOTE(review): leaving the `with` block waits for the worker thread to finish, so a
        # start that never returns would still block here after the timeout is raised.
        try:
            ps_start_fut.result(timeout=20)
        except concurrent.futures.TimeoutError:
            raise
        except Exception:
            log.info("Restarting deleted pageserver...")
            ps.restart()

    # Finally, the node can be registered again after tombstone is deleted
    wait_until(lambda: assert_nodes_count(3))
def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder):
"""
The storage controller is meant to handle the case where a timeline CRUD operation races
@@ -3403,7 +3461,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
assert target.get_safekeeper(fake_id) is None
assert len(target.get_safekeepers()) == 0
start_sks = target.get_safekeepers()
sk_0 = env.safekeepers[0]
@@ -3425,7 +3483,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
inserted = target.get_safekeeper(fake_id)
assert inserted is not None
assert target.get_safekeepers() == [inserted]
assert target.get_safekeepers() == start_sks + [inserted]
assert eq_safekeeper_records(body, inserted)
# error out if pk is changed (unexpected)
@@ -3437,7 +3495,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
assert exc.value.status_code == 400
inserted_again = target.get_safekeeper(fake_id)
assert target.get_safekeepers() == [inserted_again]
assert target.get_safekeepers() == start_sks + [inserted_again]
assert inserted_again is not None
assert eq_safekeeper_records(inserted, inserted_again)
@@ -3446,7 +3504,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
body["version"] += 1
target.on_safekeeper_deploy(fake_id, body)
inserted_now = target.get_safekeeper(fake_id)
assert target.get_safekeepers() == [inserted_now]
assert target.get_safekeepers() == start_sks + [inserted_now]
assert inserted_now is not None
assert eq_safekeeper_records(body, inserted_now)
@@ -3455,7 +3513,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
body["https_port"] = 123
target.on_safekeeper_deploy(fake_id, body)
inserted_now = target.get_safekeeper(fake_id)
assert target.get_safekeepers() == [inserted_now]
assert target.get_safekeepers() == start_sks + [inserted_now]
assert inserted_now is not None
assert eq_safekeeper_records(body, inserted_now)
env.storage_controller.consistency_check()
@@ -3464,7 +3522,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
body["https_port"] = None
target.on_safekeeper_deploy(fake_id, body)
inserted_now = target.get_safekeeper(fake_id)
assert target.get_safekeepers() == [inserted_now]
assert target.get_safekeepers() == start_sks + [inserted_now]
assert inserted_now is not None
assert eq_safekeeper_records(body, inserted_now)
env.storage_controller.consistency_check()
@@ -3583,6 +3641,11 @@ def test_timeline_delete_mid_live_migration(neon_env_builder: NeonEnvBuilder, mi
env = neon_env_builder.init_configs()
env.start()
for ps in env.pageservers:
ps.allowed_errors.extend(
[".*Timeline.* has been deleted.*", ".*Timeline.*was cancelled and cannot be used"]
)
tenant_id = TenantId.generate()
timeline_id = TimelineId.generate()
env.storage_controller.tenant_create(tenant_id, placement_policy={"Attached": 1})
@@ -4373,6 +4436,53 @@ def test_storage_controller_graceful_migration(neon_env_builder: NeonEnvBuilder,
assert initial_ps.http_client().tenant_list_locations()["tenant_shards"] == []
def test_attached_0_graceful_migration(neon_env_builder: NeonEnvBuilder):
    """
    Migrate a tenant shard that has no secondary locations (`Attached: 0` placement) to another
    pageserver in the same AZ with `prewarm=True`, and wait until the shard is present only on
    the destination, in `AttachedSingle` mode.
    """
    neon_env_builder.num_pageservers = 4
    neon_env_builder.num_azs = 2

    neon_env_builder.storcon_kick_secondary_downloads = False

    env = neon_env_builder.init_start()
    storcon = env.storage_controller

    # It is default, but we want to ensure that there are no secondary locations requested
    storcon.tenant_policy_update(env.initial_tenant, {"placement": {"Attached": 0}})
    storcon.reconcile_until_idle()

    shard_desc = storcon.tenant_describe(env.initial_tenant)["shards"][0]
    src_ps_id = shard_desc["node_attached"]
    src_ps = env.get_pageserver(src_ps_id)
    src_az = shard_desc["preferred_az_id"]

    # There must be no secondary locations with Attached(0) placement policy
    assert len(shard_desc["node_secondary"]) == 0

    # Migrate tenant shard to the same AZ node
    same_az_candidates = [
        candidate
        for candidate in env.pageservers
        if candidate.id != src_ps_id and candidate.az_id == src_az
    ]
    dst_ps = same_az_candidates[0]

    migration_config = StorageControllerMigrationConfig(prewarm=True)
    storcon.tenant_shard_migrate(
        TenantShardId(env.initial_tenant, 0, 0),
        dst_ps.id,
        config=migration_config,
    )

    def tenant_shard_migrated():
        """Assert the source holds no locations and the destination holds one attached shard."""
        on_source = src_ps.http_client().tenant_list_locations()["tenant_shards"]
        assert len(on_source) == 0
        log.info(f"Tenant shard migrated from {src_ps.id}")

        on_destination = dst_ps.http_client().tenant_list_locations()["tenant_shards"]
        assert len(on_destination) == 1
        assert on_destination[0][1]["mode"] == "AttachedSingle"
        log.info(f"Tenant shard migrated to {dst_ps.id}")

    # After all we expect that tenant shard exists only on dst node.
    # We wait so long because [`DEFAULT_HEATMAP_PERIOD`] and [`DEFAULT_DOWNLOAD_INTERVAL`]
    # are set to 60 seconds by default.
    #
    # TODO: we should consider making these configurable, so the test can run faster.
    wait_until(tenant_shard_migrated, timeout=180, interval=5, status_interval=10)
    log.info("Tenant shard migrated successfully")
@run_only_on_default_postgres("this is like a 'unit test' against storcon db")
def test_storage_controller_migrate_with_pageserver_restart(
neon_env_builder: NeonEnvBuilder, make_httpserver

View File

@@ -341,6 +341,11 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder
env = neon_env_builder.init_configs()
env.start()
for ps in env.pageservers:
ps.allowed_errors.extend(
[".*Timeline.* has been deleted.*", ".*Timeline.*was cancelled and cannot be used"]
)
tenant_id = TenantId.generate()
timeline_id = TimelineId.generate()
env.create_tenant(

View File

@@ -430,6 +430,7 @@ def test_tenant_delete_stale_shards(neon_env_builder: NeonEnvBuilder, pg_bin: Pg
workload.init()
workload.write_rows(256)
workload.validate()
workload.stop()
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,

View File

@@ -12,7 +12,7 @@ from typing import TYPE_CHECKING
import pytest
import requests
from fixtures.common_types import Lsn, TenantId, TimelineId
from fixtures.common_types import Lsn, TenantId, TimelineArchivalState, TimelineId
from fixtures.log_helper import log
from fixtures.metrics import (
PAGESERVER_GLOBAL_METRICS,
@@ -299,6 +299,65 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde
assert post_detach_samples == set()
def test_pageserver_metrics_removed_after_offload(neon_env_builder: NeonEnvBuilder):
    """
    Verify that offloading a timeline leaves no pageserver metric samples labelled with that
    tenant/timeline pair behind.
    """
    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)

    neon_env_builder.num_safekeepers = 3

    env = neon_env_builder.init_start()
    tenant_1, _ = env.create_tenant()

    timeline_1 = env.create_timeline("test_metrics_removed_after_offload_1", tenant_id=tenant_1)
    timeline_2 = env.create_timeline("test_metrics_removed_after_offload_2", tenant_id=tenant_1)

    # One endpoint per timeline; both belong to the same tenant.
    endpoint_1 = env.endpoints.create_start(
        "test_metrics_removed_after_offload_1", tenant_id=tenant_1
    )
    endpoint_2 = env.endpoints.create_start(
        "test_metrics_removed_after_offload_2", tenant_id=tenant_1
    )

    # Write and read back some data on each timeline, then stop its endpoint.
    for endpoint in (endpoint_1, endpoint_2):
        with closing(endpoint.connect()) as conn, conn.cursor() as cur:
            cur.execute("CREATE TABLE t(key int primary key, value text)")
            cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
            cur.execute("SELECT sum(key) FROM t")
            assert cur.fetchone() == (5000050000,)
        endpoint.stop()

    def get_ps_metric_samples_for_timeline(
        tenant_id: TenantId, timeline_id: TimelineId
    ) -> list[Sample]:
        """Collect every pageserver metric sample labelled with the given tenant and timeline."""
        ps_metrics = env.pageserver.http_client().get_metrics()
        label_filter = {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)}
        return [
            sample
            for metric_name in ps_metrics.metrics
            for sample in ps_metrics.query_all(name=metric_name, filter=label_filter)
        ]

    def sample_names(timeline_id: TimelineId) -> set[str]:
        """Return the distinct metric names that currently have samples for the timeline."""
        return {x.name for x in get_ps_metric_samples_for_timeline(tenant_1, timeline_id)}

    for timeline in (timeline_1, timeline_2):
        pre_offload_samples = sample_names(timeline)
        assert len(pre_offload_samples) > 0, f"expected at least one sample for {timeline}"

        # Archive the timeline, then offload it.
        env.pageserver.http_client().timeline_archival_config(
            tenant_1,
            timeline,
            state=TimelineArchivalState.ARCHIVED,
        )
        env.pageserver.http_client().timeline_offload(tenant_1, timeline)

        post_offload_samples = sample_names(timeline)
        assert post_offload_samples == set()
def test_pageserver_with_empty_tenants(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()

View File

@@ -21,7 +21,10 @@ from fixtures.neon_fixtures import (
last_flush_lsn_upload,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException
from fixtures.pageserver.http import (
HistoricLayerInfo,
PageserverApiException,
)
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_timeline_detail_404
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from fixtures.utils import assert_pageserver_backups_equal, skip_in_debug_build, wait_until
@@ -413,6 +416,7 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder, snapshots
"read_only": True,
},
)
sk = env.safekeepers[0]
assert sk
with pytest.raises(requests.exceptions.HTTPError, match="Not Found"):
@@ -504,8 +508,15 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder, snapshots
assert len(lineage.get("original_ancestor", [])) == 0
assert len(lineage.get("reparenting_history", [])) == 0
for name, _, _, rows, starts in expected_result:
with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep:
for branch_name, queried_timeline, _, rows, starts in expected_result:
details = client.timeline_detail(env.initial_tenant, queried_timeline)
log.info(f"reading data from branch {branch_name}")
# specifying the lsn makes the endpoint read-only and not connect to safekeepers
with env.endpoints.create(
branch_name,
lsn=Lsn(details["last_record_lsn"]),
) as ep:
ep.start(safekeeper_generation=1)
assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
assert ep.safe_psql(f"SELECT count(*) FROM audit WHERE starts = {starts}")[0][0] == 1
@@ -1088,6 +1099,9 @@ def test_timeline_detach_ancestor_interrupted_by_deletion(
for ps in env.pageservers:
ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
ps.allowed_errors.extend(
[".*Timeline.* has been deleted.*", ".*Timeline.*was cancelled and cannot be used"]
)
pageservers = dict((int(p.id), p) for p in env.pageservers)
@@ -1209,6 +1223,9 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv
for ps in env.pageservers:
ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
ps.allowed_errors.extend(
[".*Timeline.* has been deleted.*", ".*Timeline.*was cancelled and cannot be used"]
)
pageservers = dict((int(p.id), p) for p in env.pageservers)

View File

@@ -24,6 +24,10 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool
initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"},
initial_tenant_shard_count=2 if sharded else None,
)
for ps in env.pageservers:
ps.allowed_errors.extend(
[".*Timeline.* has been deleted.*", ".*Timeline.*was cancelled and cannot be used"]
)
if sharded:
http = env.storage_controller.pageserver_api()

View File

@@ -229,7 +229,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder):
# Test timeline_list endpoint.
http_cli = env.safekeepers[0].http_client()
assert len(http_cli.timeline_list()) == 3
assert len(http_cli.timeline_list()) == 4
# Check that dead minority doesn't prevent the commits: execute insert n_inserts
@@ -433,6 +433,7 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder):
env.pageserver.allowed_errors.extend(
[
".*Timeline .* was not found in global map.*",
".*Timeline .* has been deleted.*",
".*Timeline .* was cancelled and cannot be used anymore.*",
]
)
@@ -739,8 +740,8 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
env = neon_env_builder.init_start()
tenant_id = env.initial_tenant
timeline_id = env.create_branch("test_timeline_status")
endpoint = env.endpoints.create_start("test_timeline_status")
timeline_id = env.initial_timeline
endpoint = env.endpoints.create_start("main")
wa = env.safekeepers[0]
@@ -1291,6 +1292,12 @@ def test_lagging_sk(neon_env_builder: NeonEnvBuilder):
# it works without compute at all.
def test_peer_recovery(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 3
# timelines should be created the old way
neon_env_builder.storage_controller_config = {
"timelines_onto_safekeepers": False,
}
env = neon_env_builder.init_start()
tenant_id = env.initial_tenant
@@ -1532,6 +1539,11 @@ def test_safekeeper_without_pageserver(
def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder):
# timelines should be created the old way manually until we have migration support
neon_env_builder.storage_controller_config = {
"timelines_onto_safekeepers": False,
}
def execute_payload(endpoint: Endpoint):
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
@@ -1661,6 +1673,15 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder, live_sk_change: bool):
res = env.safekeepers[3].pull_timeline(
[env.safekeepers[0], env.safekeepers[2]], tenant_id, timeline_id
)
sk_id_1 = env.safekeepers[0].safekeeper_id()
sk_id_3 = env.safekeepers[2].safekeeper_id()
sk_id_4 = env.safekeepers[3].safekeeper_id()
new_conf = MembershipConfiguration(
generation=2, members=[sk_id_1, sk_id_3, sk_id_4], new_members=None
)
for i in [0, 2, 3]:
env.safekeepers[i].http_client().membership_switch(tenant_id, timeline_id, new_conf)
log.info("Finished pulling timeline")
log.info(res)
@@ -1705,13 +1726,15 @@ def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 3
neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
env = neon_env_builder.init_start()
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
(src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2])
dst_sk.stop()
[tenant_id, timeline_id] = env.create_tenant()
log.info("use only first 2 safekeepers, 3rd will be seeded")
endpoint = env.endpoints.create("main")
endpoint = env.endpoints.create("main", tenant_id=tenant_id)
endpoint.active_safekeepers = [1, 2]
endpoint.start()
endpoint.safe_psql("create table t(key int, value text)")
@@ -1723,6 +1746,7 @@ def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder):
src_http = src_sk.http_client()
# run pull_timeline which will halt before downloading files
src_http.configure_failpoints(("sk-snapshot-after-list-pausable", "pause"))
dst_sk.start()
pt_handle = PropagatingThread(
target=dst_sk.pull_timeline, args=([src_sk], tenant_id, timeline_id)
)
@@ -1782,23 +1806,27 @@ def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
env = neon_env_builder.init_start()
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
(src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2])
dst_sk.stop()
src_http = src_sk.http_client()
src_http.configure_failpoints(("sk-snapshot-after-list-pausable", "pause"))
timeline_id = env.create_branch("pull_timeline_term_changes")
# run pull_timeline which will halt before downloading files
log.info("use only first 2 safekeepers, 3rd will be seeded")
ep = env.endpoints.create("main")
ep = env.endpoints.create("pull_timeline_term_changes")
ep.active_safekeepers = [1, 2]
ep.start()
ep.safe_psql("create table t(key int, value text)")
ep.safe_psql("insert into t select generate_series(1, 1000), 'pear'")
src_http = src_sk.http_client()
# run pull_timeline which will halt before downloading files
src_http.configure_failpoints(("sk-snapshot-after-list-pausable", "pause"))
pt_handle = PropagatingThread(
target=dst_sk.pull_timeline, args=([src_sk], tenant_id, timeline_id)
)
dst_sk.start()
pt_handle.start()
src_sk.wait_until_paused("sk-snapshot-after-list-pausable")
@@ -1807,7 +1835,7 @@ def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder):
# restart compute to bump term
ep.stop()
ep = env.endpoints.create("main")
ep = env.endpoints.create("pull_timeline_term_changes")
ep.active_safekeepers = [1, 2]
ep.start()
ep.safe_psql("insert into t select generate_series(1, 100), 'pear'")
@@ -1929,12 +1957,18 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder):
@run_only_on_default_postgres("tests only safekeeper API")
def test_membership_api(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 1
# timelines should be created the old way
neon_env_builder.storage_controller_config = {
"timelines_onto_safekeepers": False,
}
env = neon_env_builder.init_start()
# These are expected after timeline deletion on safekeepers.
env.pageserver.allowed_errors.extend(
[
".*Timeline .* was not found in global map.*",
".*Timeline .* has been deleted.*",
".*Timeline .* was cancelled and cannot be used anymore.*",
]
)
@@ -2008,6 +2042,12 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder):
created manually, later storcon will do that.
"""
neon_env_builder.num_safekeepers = 3
# timelines should be created the old way manually
neon_env_builder.storage_controller_config = {
"timelines_onto_safekeepers": False,
}
env = neon_env_builder.init_start()
tenant_id = env.initial_tenant
@@ -2063,7 +2103,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
tenant_id = env.initial_tenant
timeline_id = env.create_branch("test_idle_reconnections")
timeline_id = env.initial_timeline
def collect_stats() -> dict[str, float]:
# we need to collect safekeeper_pg_queries_received_total metric from all safekeepers
@@ -2094,7 +2134,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder):
collect_stats()
endpoint = env.endpoints.create_start("test_idle_reconnections")
endpoint = env.endpoints.create_start("main")
# just write something to the timeline
endpoint.safe_psql("create table t(i int)")
collect_stats()

View File

@@ -590,6 +590,13 @@ async def run_wal_truncation(env: NeonEnv, safekeeper_proto_version: int):
@pytest.mark.parametrize("safekeeper_proto_version", [2, 3])
def test_wal_truncation(neon_env_builder: NeonEnvBuilder, safekeeper_proto_version: int):
neon_env_builder.num_safekeepers = 3
if safekeeper_proto_version == 2:
# On the legacy protocol, we don't support generations, which are part of
# `timelines_onto_safekeepers`
neon_env_builder.storage_controller_config = {
"timelines_onto_safekeepers": False,
}
env = neon_env_builder.init_start()
asyncio.run(run_wal_truncation(env, safekeeper_proto_version))
@@ -713,6 +720,11 @@ async def run_quorum_sanity(env: NeonEnv):
# we don't.
def test_quorum_sanity(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 4
# The test fails basically always on the new mode.
neon_env_builder.storage_controller_config = {
"timelines_onto_safekeepers": False,
}
env = neon_env_builder.init_start()
asyncio.run(run_quorum_sanity(env))

View File

@@ -16,6 +16,13 @@ if TYPE_CHECKING:
# Checks that pageserver's walreceiver state is printed in the logs during WAL wait timeout.
# Ensures that walreceiver does not run without any data inserted and only starts after the insertion.
def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
# we assert below that the walreceiver is not active before data writes.
# with manually created timelines, it is active.
# FIXME: remove this test once we remove timelines_onto_safekeepers
neon_env_builder.storage_controller_config = {
"timelines_onto_safekeepers": False,
}
# Trigger WAL wait timeout faster
neon_env_builder.pageserver_config_override = "wait_lsn_timeout = '1s'"
env = neon_env_builder.init_start()

View File

@@ -0,0 +1,90 @@
create or replace function admin_proc()
returns event_trigger
language plpgsql as
$$
begin
raise notice 'admin event trigger is executed for %', current_user;
end;
$$;
create role neon_superuser;
create role neon_admin login inherit createrole createdb in role neon_superuser;
grant create on schema public to neon_admin;
create database neondb with owner neon_admin;
grant all privileges on database neondb to neon_superuser;
create role neon_user;
grant create on schema public to neon_user;
create event trigger on_ddl1 on ddl_command_end
execute procedure admin_proc();
set role neon_user;
-- check that non-privileged user can not change neon.event_triggers
set neon.event_triggers to false;
ERROR: permission denied to set neon.event_triggers
DETAIL: Only "neon_superuser" is allowed to set the GUC
-- Non-privileged neon user should not be able to create event trigers
create event trigger on_ddl2 on ddl_command_end
execute procedure admin_proc();
ERROR: permission denied to create event trigger "on_ddl2"
HINT: Must be superuser to create an event trigger.
set role neon_admin;
-- neon_superuser should be able to create event trigers
create or replace function neon_proc()
returns event_trigger
language plpgsql as
$$
begin
raise notice 'neon event trigger is executed for %', current_user;
end;
$$;
NOTICE: admin event trigger is executed for neon_admin
create event trigger on_ddl2 on ddl_command_end
execute procedure neon_proc();
\c neondb neon_admin
create or replace function neondb_proc()
returns event_trigger
language plpgsql as
$$
begin
raise notice 'neondb event trigger is executed for %', current_user;
end;
$$;
create or replace function neondb_secdef_proc()
returns event_trigger
language plpgsql
SECURITY DEFINER
as
$$
begin
raise notice 'neondb secdef event trigger is executed for %', current_user;
end;
$$;
-- neon_admin (neon_superuser member) should be able to create event triggers
create event trigger on_ddl3 on ddl_command_end
execute procedure neondb_proc();
create event trigger on_ddl4 on ddl_command_end
execute procedure neondb_secdef_proc();
-- Check that event trigger is fired for neon_admin
create table t1(x integer);
NOTICE: neondb event trigger is executed for neon_admin
NOTICE: neondb secdef event trigger is executed for neon_admin
-- Check that event trigger can be skipped
set neon.event_triggers to false;
create table t2(x integer);
WARNING: Skipping Event Trigger: neon.event_triggers is false
WARNING: Skipping Event Trigger: neon.event_triggers is false
\c regression cloud_admin
-- Check that event triggers are not fired for superuser
create table t3(x integer);
NOTICE: admin event trigger is executed for cloud_admin
WARNING: Skipping Event Trigger
DETAIL: Event Trigger function "neon_proc" is owned by non-superuser role "neon_admin", and current_user "cloud_admin" is superuser
\c neondb cloud_admin
-- Check that user-defined event triggers are not fired for superuser
create table t4(x integer);
WARNING: Skipping Event Trigger
DETAIL: Event Trigger function "neondb_proc" is owned by non-superuser role "neon_admin", and current_user "cloud_admin" is superuser
WARNING: Skipping Event Trigger
DETAIL: Event Trigger function "neondb_secdef_proc" is owned by non-superuser role "neon_admin", and current_user "cloud_admin" is superuser
\c neondb neon_admin
-- Check that neon_admin can drop event triggers
drop event trigger on_ddl3;
drop event trigger on_ddl4;

View File

@@ -9,3 +9,4 @@ test: neon-rel-truncate
test: neon-clog
test: neon-test-utils
test: neon-vacuum-full
test: neon-event-triggers

View File

@@ -0,0 +1,96 @@
create or replace function admin_proc()
returns event_trigger
language plpgsql as
$$
begin
raise notice 'admin event trigger is executed for %', current_user;
end;
$$;
create role neon_superuser;
create role neon_admin login inherit createrole createdb in role neon_superuser;
grant create on schema public to neon_admin;
create database neondb with owner neon_admin;
grant all privileges on database neondb to neon_superuser;
create role neon_user;
grant create on schema public to neon_user;
create event trigger on_ddl1 on ddl_command_end
execute procedure admin_proc();
set role neon_user;
-- check that non-privileged user can not change neon.event_triggers
set neon.event_triggers to false;
-- Non-privileged neon user should not be able to create event trigers
create event trigger on_ddl2 on ddl_command_end
execute procedure admin_proc();
set role neon_admin;
-- neon_superuser should be able to create event trigers
create or replace function neon_proc()
returns event_trigger
language plpgsql as
$$
begin
raise notice 'neon event trigger is executed for %', current_user;
end;
$$;
create event trigger on_ddl2 on ddl_command_end
execute procedure neon_proc();
\c neondb neon_admin
create or replace function neondb_proc()
returns event_trigger
language plpgsql as
$$
begin
raise notice 'neondb event trigger is executed for %', current_user;
end;
$$;
create or replace function neondb_secdef_proc()
returns event_trigger
language plpgsql
SECURITY DEFINER
as
$$
begin
raise notice 'neondb secdef event trigger is executed for %', current_user;
end;
$$;
-- neon_admin (neon_superuser member) should be able to create event triggers
create event trigger on_ddl3 on ddl_command_end
execute procedure neondb_proc();
create event trigger on_ddl4 on ddl_command_end
execute procedure neondb_secdef_proc();
-- Check that event trigger is fired for neon_admin
create table t1(x integer);
-- Check that event trigger can be skipped
set neon.event_triggers to false;
create table t2(x integer);
\c regression cloud_admin
-- Check that event triggers are not fired for superuser
create table t3(x integer);
\c neondb cloud_admin
-- Check that user-defined event triggers are not fired for superuser
create table t4(x integer);
\c neondb neon_admin
-- Check that neon_admin can drop event triggers
drop event trigger on_ddl3;
drop event trigger on_ddl4;