Merge remote-tracking branch 'origin/main' into problame/benchmarking/pr/python-perftest

This commit is contained in:
Christian Schwarz
2024-01-08 14:24:32 +00:00
106 changed files with 4535 additions and 1223 deletions

View File

@@ -1166,8 +1166,8 @@ class AbstractNeonCli(abc.ABC):
If `local_binpath` is true, then we are invoking a test utility
"""
assert type(arguments) == list
assert type(self.COMMAND) == str
assert isinstance(arguments, list)
assert isinstance(self.COMMAND, str)
if local_binpath:
# Test utility
@@ -3108,6 +3108,28 @@ class SafekeeperHttpClient(requests.Session):
assert isinstance(res_json, dict)
return res_json
def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]):
    """
    POST `body` as JSON to the `copy` endpoint of the given timeline on this safekeeper.

    Returns nothing; `raise_for_status` raises if the response carries an error status.
    """
    url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy"
    response = self.post(url, json=body)
    response.raise_for_status()
def timeline_digest(
    self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn
) -> Dict[str, Any]:
    """
    GET the `digest` endpoint of the given timeline on this safekeeper.

    `from_lsn` and `until_lsn` are stringified and sent as query parameters.
    `raise_for_status` raises if the response carries an error status.

    :return: the decoded JSON body, asserted to be a dict.
    """
    url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest"
    lsn_range = {
        "from_lsn": str(from_lsn),
        "until_lsn": str(until_lsn),
    }
    response = self.get(url, params=lsn_range)
    response.raise_for_status()
    digest = response.json()
    assert isinstance(digest, dict)
    return digest
def timeline_create(
self,
tenant_id: TenantId,

View File

@@ -326,6 +326,10 @@ class PageserverHttpClient(requests.Session):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload")
self.verbose_error(res)
def tenant_secondary_download(self, tenant_id: TenantId):
    """
    POST to the `secondary/download` endpoint for `tenant_id` on this pageserver and
    pass the response to `verbose_error`.
    """
    url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download"
    response = self.post(url)
    self.verbose_error(response)
def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]):
assert "tenant_id" not in config.keys()
res = self.put(
@@ -361,9 +365,9 @@ class PageserverHttpClient(requests.Session):
assert isinstance(res, dict)
assert TenantId(res["id"]) == tenant_id
size = res["size"]
assert type(size) == int
assert isinstance(size, int)
inputs = res["inputs"]
assert type(inputs) is dict
assert isinstance(inputs, dict)
return (size, inputs)
def tenant_size_debug(self, tenant_id: TenantId) -> str:

View File

@@ -42,9 +42,10 @@ def test_clickbench_create_pg_stat_statements(remote_compare: RemoteCompare):
# Please do not alter the label for the query, as it is used to identify it.
# Labels for ClickBench queries match the labels in ClickBench reports
# on https://benchmark.clickhouse.com/ (the DB size may differ).
#
# Disable auto formatting for the list of queries so that it's easier to read
# fmt: off
QUERIES: Tuple[LabelledQuery, ...] = (
# Disable `black` formatting for the list of queries so that it's easier to read
# fmt: off
### ClickBench queries:
LabelledQuery("Q0", r"SELECT COUNT(*) FROM hits;"),
LabelledQuery("Q1", r"SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;"),
@@ -96,8 +97,8 @@ QUERIES: Tuple[LabelledQuery, ...] = (
# LabelledQuery("NQ0", r"..."),
# LabelledQuery("NQ1", r"..."),
# ...
# fmt: on
)
# fmt: on
EXPLAIN_STRING: str = "EXPLAIN (ANALYZE, VERBOSE, BUFFERS, COSTS, SETTINGS, FORMAT JSON)"

View File

@@ -32,8 +32,7 @@ def pg_compare(request) -> PgCompare:
else:
assert (
len(x) == 2
), f"request param ({request.param}) should have a format of \
`neon_{{safekeepers_enable_fsync}}`"
), f"request param ({request.param}) should have a format of `neon_{{safekeepers_enable_fsync}}`"
# `NeonCompare` interface
neon_env_builder = request.getfixturevalue("neon_env_builder")

View File

@@ -194,12 +194,13 @@ def test_fully_custom_config(positive_env: NeonEnv):
assert set(our_tenant_config.effective_config.keys()) == set(
fully_custom_config.keys()
), "ensure we cover all config options"
assert {
k: initial_tenant_config.effective_config[k] != our_tenant_config.effective_config[k]
for k in fully_custom_config.keys()
} == {
k: True for k in fully_custom_config.keys()
}, "ensure our custom config has different values than the default config for all config options, so we know we overrode everything"
assert (
{
k: initial_tenant_config.effective_config[k] != our_tenant_config.effective_config[k]
for k in fully_custom_config.keys()
}
== {k: True for k in fully_custom_config.keys()}
), "ensure our custom config has different values than the default config for all config options, so we know we overrode everything"
ps_http.tenant_detach(tenant_id)
env.pageserver.tenant_attach(tenant_id, config=fully_custom_config)

View File

@@ -186,9 +186,7 @@ def test_backward_compatibility(
else:
raise
assert (
not breaking_changes_allowed
), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
@check_ondisk_data_compatibility_if_enabled
@@ -247,9 +245,7 @@ def test_forward_compatibility(
else:
raise
assert (
not breaking_changes_allowed
), "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path):

View File

@@ -2,7 +2,6 @@ import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft
# Restart nodes with WAL end having specially crafted shape, like last record
# crossing segment boundary, to test decoding issues.

View File

@@ -102,9 +102,7 @@ def test_basic_eviction(
), f"Did not expect to find {local_layer} layer after evicting"
empty_layers = list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
assert (
not empty_layers
), f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}"
assert not empty_layers, f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}"
evicted_layer_map_info = client.layer_map_info(tenant_id=tenant_id, timeline_id=timeline_id)
assert (

View File

@@ -38,6 +38,9 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.extend(
[".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
)
ps_http = env.pageserver.http_client()

View File

@@ -145,8 +145,7 @@ def expect_updated_msg_lsn(
last_msg_lsn = Lsn(timeline_details["last_received_msg_lsn"])
assert (
prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn
), f"the last received message's LSN {last_msg_lsn} hasn't been updated \
compared to the previous message's LSN {prev_msg_lsn}"
), f"the last received message's LSN {last_msg_lsn} hasn't been updated compared to the previous message's LSN {prev_msg_lsn}"
return last_msg_lsn

View File

@@ -254,7 +254,9 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
metadata_summary = S3Scrubber(
neon_env_builder.test_output_dir, neon_env_builder
).scan_metadata()
assert metadata_summary["count"] == 1 # Scrubber should have seen our timeline
assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline
assert metadata_summary["timeline_count"] == 1
assert metadata_summary["timeline_shard_count"] == 1
assert not metadata_summary["with_errors"]
assert not metadata_summary["with_warnings"]

View File

@@ -1,9 +1,11 @@
import random
from pathlib import Path
from typing import Any, Dict, Optional
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber
from fixtures.pageserver.utils import assert_prefix_empty, tenant_delete_wait_completed
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from fixtures.types import TenantId, TimelineId
from fixtures.utils import wait_until
@@ -251,6 +253,9 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
flush_ms=5000,
)
# Encourage the new location to download while still in secondary mode
pageserver_b.http_client().tenant_secondary_download(tenant_id)
migrated_generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver_b.id)
log.info(f"Acquired generation {migrated_generation} for destination pageserver")
assert migrated_generation == initial_generation + 1
@@ -258,8 +263,6 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
# Writes and reads still work in AttachedStale.
workload.validate(pageserver_a.id)
# TODO: call into secondary mode API hooks to do an upload/download sync
# Generate some more dirty writes: we expect the origin to ingest WAL
# in AttachedStale
workload.churn_rows(64, pageserver_a.id, upload=False)
@@ -369,3 +372,143 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
log.info(f"Read back heatmap: {heatmap_second}")
assert heatmap_second != heatmap_first
validate_heatmap(heatmap_second)
def list_layers(pageserver, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]:
    """
    Inspect local storage on a pageserver to discover which layer files are present.

    Every direct child of the timeline directory is considered, except entries named
    `metadata` and entries whose name contains `ephemeral` or `temp`.

    :param pageserver: object exposing `timeline_dir(tenant_id, timeline_id)`, which
        returns the timeline's local directory as a `Path`.
    :return: sorted list of relative paths to layers, from the timeline root.
    """
    timeline_path = pageserver.timeline_dir(tenant_id, timeline_id)

    def is_layer(path: Path) -> bool:
        # Name-based exclusion only: the file contents are never inspected.
        name = path.name
        return name != "metadata" and "ephemeral" not in name and "temp" not in name

    # sorted() consumes any iterable and returns a list, so no intermediate list is needed.
    return sorted(
        path.relative_to(timeline_path) for path in timeline_path.glob("*") if is_layer(path)
    )
def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
    """
    Test the overall data flow in secondary mode:
    - Heatmap uploads from the attached location
    - Heatmap & layer downloads from the secondary location
    - Eviction of layers on the attached location results in deletion
      on the secondary location as well.
    """
    import time

    neon_env_builder.num_pageservers = 2
    neon_env_builder.enable_pageserver_remote_storage(
        remote_storage_kind=RemoteStorageKind.MOCK_S3,
    )
    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
    assert env.attachment_service is not None

    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

    ps_attached = env.pageservers[0]
    ps_secondary = env.pageservers[1]

    def sync_secondary():
        # One explicit cycle: the attached location uploads its heatmap, then the
        # secondary location is told to download.
        ps_attached.http_client().tenant_heatmap_upload(tenant_id)
        ps_secondary.http_client().tenant_secondary_download(tenant_id)

    def assert_same_layers():
        # Both pageservers must hold exactly the same set of local layer files.
        attached_layers = list_layers(ps_attached, tenant_id, timeline_id)
        secondary_layers = list_layers(ps_secondary, tenant_id, timeline_id)
        assert attached_layers == secondary_layers

    workload = Workload(env, tenant_id, timeline_id)
    workload.init(ps_attached.id)
    workload.write_rows(256, ps_attached.id)

    # Configure a secondary location
    log.info("Setting up secondary location...")
    secondary_location_conf = {
        "mode": "Secondary",
        "secondary_conf": {"warm": True},
        "tenant_conf": {},
    }
    ps_secondary.tenant_location_configure(tenant_id, secondary_location_conf)
    readback_conf = ps_secondary.read_tenant_location_conf(tenant_id)
    log.info(f"Read back conf: {readback_conf}")

    # Explicit upload/download cycle
    # ==============================
    log.info("Synchronizing after initial write...")
    sync_secondary()
    assert_same_layers()

    # Make changes on attached pageserver, check secondary downloads them
    # ===================================================================
    log.info("Synchronizing after subsequent write...")
    workload.churn_rows(128, ps_attached.id)
    sync_secondary()
    assert_same_layers()

    # FIXME: this sleep is needed to avoid on-demand promotion of the layers we evict, while
    # walreceiver is still doing something.
    time.sleep(5)

    # Do evictions on attached pageserver, check secondary follows along
    # ==================================================================
    log.info("Evicting a layer...")
    layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0]
    ps_attached.http_client().evict_layer(tenant_id, timeline_id, layer_name=layer_to_evict.name)

    log.info("Synchronizing after eviction...")
    sync_secondary()
    assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id)
    assert_same_layers()

    # Scrub the remote storage
    # ========================
    # This confirms that the scrubber isn't upset by the presence of the heatmap
    S3Scrubber(neon_env_builder.test_output_dir, neon_env_builder).scan_metadata()

    # Detach secondary and delete tenant
    # ===================================
    # This confirms that the heatmap gets cleaned up as well as other normal content.
    log.info("Detaching secondary location...")
    detached_location_conf = {
        "mode": "Detached",
        "secondary_conf": None,
        "tenant_conf": {},
    }
    ps_secondary.tenant_location_configure(tenant_id, detached_location_conf)

    log.info("Deleting tenant...")
    tenant_delete_wait_completed(ps_attached.http_client(), tenant_id, 10)

    tenant_prefix = "/".join(("tenants", str(tenant_id)))
    assert_prefix_empty(neon_env_builder, prefix=tenant_prefix)

View File

@@ -391,8 +391,7 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv):
tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
assert (
tenant_id not in tenants_after_detach
), f"Ignored and then detached tenant {tenant_id} \
should not be present in pageserver's memory"
), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory"
# Creates a tenant, and detaches it with an extra parameter that forces ignored tenant detach.
@@ -430,8 +429,7 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
assert (
tenant_id not in tenants_after_detach
), f"Ignored and then detached tenant {tenant_id} \
should not be present in pageserver's memory"
), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory"
def test_detach_while_attaching(
@@ -817,9 +815,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
if found_broken:
break
time.sleep(0.5)
assert (
found_broken
), f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}"
assert found_broken, f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}"
env.pageserver.tenant_load(env.initial_tenant)
@@ -837,6 +833,4 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
break
time.sleep(0.5)
assert (
found_active
), f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}"
assert found_active, f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}"

View File

@@ -161,12 +161,10 @@ def switch_pg_to_new_pageserver(
files_before_detach = os.listdir(timeline_to_detach_local_path)
assert (
"metadata" in files_before_detach
), f"Regular timeline {timeline_to_detach_local_path} should have the metadata file,\
but got: {files_before_detach}"
), f"Regular timeline {timeline_to_detach_local_path} should have the metadata file, but got: {files_before_detach}"
assert (
len(files_before_detach) >= 2
), f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file,\
but got {files_before_detach}"
), f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file, but got {files_before_detach}"
return timeline_to_detach_local_path

View File

@@ -201,8 +201,8 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder):
len(restored_timelines) == 1
), f"Tenant {tenant_id} should have its timeline reattached after its layer is downloaded from the remote storage"
restored_timeline = restored_timelines[0]
assert restored_timeline["timeline_id"] == str(
timeline_id
assert (
restored_timeline["timeline_id"] == str(timeline_id)
), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage"
# Check that we had to retry the downloads
@@ -280,8 +280,8 @@ def test_tenant_redownloads_truncated_file_on_startup(
len(restored_timelines) == 1
), f"Tenant {tenant_id} should have its timeline reattached after its layer is downloaded from the remote storage"
retored_timeline = restored_timelines[0]
assert retored_timeline["timeline_id"] == str(
timeline_id
assert (
retored_timeline["timeline_id"] == str(timeline_id)
), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage"
# Request non-incremental logical size. Calculating it needs the layer file that

View File

@@ -566,7 +566,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder):
f"Pageserver last_record_lsn={pageserver_lsn}; flush_lsn={last_lsn}; lag before replay is {lag / 1024}kb"
)
endpoint.stop_and_destroy()
endpoint.stop()
timeline_delete_wait_completed(ps_http, tenant_id, timeline_id)
# Also delete and manually create timeline on safekeepers -- this tests
@@ -1838,3 +1838,83 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder):
assert final_stats.get("START_REPLICATION", 0) >= 1
# walproposer should connect to each safekeeper at least once
assert final_stats.get("START_WAL_PUSH", 0) >= 3
@pytest.mark.parametrize("insert_rows", [0, 100, 100000, 500000])
def test_timeline_copy(neon_env_builder: NeonEnvBuilder, insert_rows: int):
    """
    Copy a timeline on every safekeeper at several LSNs and compare digests.

    LSNs are recorded right after timeline creation and after each batch of inserts
    (batches are sized so that 10%, 50%, 90% and 100% of `insert_rows` have been written).
    For each recorded LSN the timeline is copied to a fresh timeline id on every
    safekeeper, and the copy's digest over [timeline_start_lsn, lsn] must equal the
    digest of the original timeline taken from the first safekeeper.
    """
    target_percents = [10, 50, 90, 100]

    neon_env_builder.num_safekeepers = 3
    # we need remote storage that supports copy_object S3 API
    neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.MOCK_S3)
    env = neon_env_builder.init_start()

    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

    endpoint = env.endpoints.create_start("main")

    lsns = []

    def remember_lsn():
        # Record the endpoint's current WAL flush LSN as a future copy point.
        flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
        lsns.append(flush_lsn)
        return flush_lsn

    # remember LSN right after timeline creation
    lsn = remember_lsn()
    log.info(f"LSN after timeline creation: {lsn}")

    endpoint.safe_psql("create table t(key int, value text)")

    timeline_status = env.safekeepers[0].http_client().timeline_status(tenant_id, timeline_id)
    timeline_start_lsn = timeline_status.timeline_start_lsn
    log.info(f"Timeline start LSN: {timeline_start_lsn}")

    # Work with cumulative integer targets so that the batch size interpolated into SQL
    # is always a whole number and the batches sum to exactly `insert_rows`.
    inserted_rows = 0
    for percent in target_percents:
        target_rows = insert_rows * percent // 100
        new_rows = target_rows - inserted_rows
        inserted_rows = target_rows

        if new_rows == 0:
            continue

        endpoint.safe_psql(
            f"insert into t select generate_series(1, {new_rows}), repeat('payload!', 10)"
        )

        # remember LSN right after reaching this percentage
        lsn = remember_lsn()
        log.info(f"LSN after inserting {new_rows} rows: {lsn}")

    # TODO: would be also good to test cases where not all segments are uploaded to S3

    for copy_lsn in lsns:
        new_timeline_id = TimelineId.generate()
        log.info(f"Copying branch for LSN {copy_lsn}, to timeline {new_timeline_id}")

        orig_digest = (
            env.safekeepers[0]
            .http_client()
            .timeline_digest(tenant_id, timeline_id, timeline_start_lsn, copy_lsn)
        )
        log.info(f"Original digest: {orig_digest}")

        for sk in env.safekeepers:
            sk.http_client().copy_timeline(
                tenant_id,
                timeline_id,
                {
                    "target_timeline_id": str(new_timeline_id),
                    "until_lsn": str(copy_lsn),
                },
            )

            new_digest = sk.http_client().timeline_digest(
                tenant_id, new_timeline_id, timeline_start_lsn, copy_lsn
            )
            log.info(f"Digest after timeline copy on safekeeper {sk.id}: {new_digest}")

            assert orig_digest == new_digest

    # TODO: test timelines can start after copy