diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py new file mode 100644 index 0000000000..f46c83a257 --- /dev/null +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -0,0 +1,110 @@ +from dataclasses import dataclass +import json +import os +import shutil +import subprocess +from pathlib import Path +import time +from typing import Any, Callable, Dict, List, Tuple + +import pytest +from fixtures.benchmark_fixture import NeonBenchmarker +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgBin, + SnapshotDir, + last_flush_lsn_upload, +) +from fixtures.pageserver.utils import wait_until_tenant_active, wait_until_tenant_state +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind +from fixtures.types import TenantId, TimelineId +import fixtures.pageserver.remote_storage + +@dataclass +class SingleTimeline: + env: NeonEnv + timeline_id: TimelineId + tenants: List[TenantId] + +def single_timeline( + neon_env_builder: NeonEnvBuilder, + snapshot_dir: SnapshotDir, + setup_template: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]], + ncopies: int, +) -> SingleTimeline: + """ + Create (or rehydrate from `snapshot_dir`) an env with `ncopies` copies + of a template tenant with a single timeline. 
+ """ + + save_snapshot = os.getenv("CI", "false") != "true" + + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + + if snapshot_dir.is_initialized(): + save_snapshot = False + env = neon_env_builder.from_repo_dir(snapshot_dir.path) + ps_http = env.pageserver.http_client() + tenants = list( + {TenantId(t.name) for t in (snapshot_dir.path.glob("pageserver_*/tenants/*"))} + ) + template_timeline = env.initial_timeline + + neon_env_builder.start() + else: + env = neon_env_builder.init_start() + + remote_storage = env.pageserver_remote_storage + assert isinstance(remote_storage, LocalFsStorage) + + ps_http = env.pageserver.http_client() + # clean up the useless default tenant + ps_http.tenant_delete(env.initial_tenant) + + template_tenant, template_timeline, template_config = setup_template(env) + + env.pageserver.http_client().tenant_detach(template_tenant) + # duplicate the template tenant `ncopies` times in localfs storage + tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies) + + # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done. + # However, on-demand downloads are quite slow ATM. + # => do the on-demand downloads in Python. 
+ assert ps_http.tenant_list() == [] + # make the attach fail after it created enough on-disk state to retry loading + # the tenant on next startup, but before it can start background loops that would start downloads + ps_http.configure_failpoints(("attach-before-activate", "return")) + env.pageserver.allowed_errors.append( + ".*attach failed, setting tenant state to Broken: attach-before-activate.*" + ) + for tenant in tenants: + env.pageserver.tenant_attach( + tenant, + config=template_config.copy(), + ) + wait_until_tenant_state(ps_http, tenant, "Broken", 3) + env.pageserver.stop() # clears the failpoint as a side-effect + tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants)) + fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir( + env, tenant_timelines + ) + env.pageserver.start() + + for tenant in tenants: + wait_until_tenant_active(ps_http, tenant) + + # ensure all layers are resident for predictable performance + for tenant in tenants: + for timeline in ps_http.tenant_status(tenant)["timelines"]: + info = ps_http.layer_map_info(tenant, timeline) + for layer in info.historic_layers: + assert not layer.remote + + # take snapshot after downloading all layers so tenant dir restoration is fast + if save_snapshot: + shutil.copytree(env.repo_dir, snapshot_dir.path) + snapshot_dir.set_initialized() + + return SingleTimeline(env, template_timeline, tenants) diff --git a/test_runner/fixtures/pageserver/types.py b/test_runner/fixtures/pageserver/types.py index 18c3a3acdb..72fa30a2f2 100644 --- a/test_runner/fixtures/pageserver/types.py +++ b/test_runner/fixtures/pageserver/types.py @@ -31,10 +31,10 @@ class DeltaLayerFileName: key_start: Key key_end: Key - def is_l0(self): + def is_l0(self) -> bool: return self.key_start == KEY_MIN and self.key_end == KEY_MAX - def to_str(self): + def to_str(self) -> str: ret = 
f"{self.key_start.as_int():036X}-{self.key_end.as_int():036X}__{self.lsn_start.as_int():016X}-{self.lsn_end.as_int():016X}" assert self == parse_layer_file_name(ret) return ret diff --git a/test_runner/performance/test_pageserver_pagebench.py b/test_runner/performance/test_pageserver_pagebench.py index 7bb9f58dab..e56e7e31bc 100644 --- a/test_runner/performance/test_pageserver_pagebench.py +++ b/test_runner/performance/test_pageserver_pagebench.py @@ -1,12 +1,8 @@ import json -import os -import shutil -import subprocess from pathlib import Path -import time from typing import List, Tuple - import pytest + from fixtures.benchmark_fixture import NeonBenchmarker from fixtures.log_helper import log from fixtures.neon_fixtures import ( @@ -16,123 +12,60 @@ from fixtures.neon_fixtures import ( SnapshotDir, last_flush_lsn_upload, ) -from fixtures.pageserver.utils import wait_until_tenant_active, wait_until_tenant_state -from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.types import TenantId, TimelineId -import fixtures.pageserver.remote_storage +import fixtures.pageserver.many_tenants @pytest.fixture(scope="function") @pytest.mark.timeout(1000) -def snapshotting_env( +def getpage_throughput_fixture( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_snapshot_dir: SnapshotDir, -) -> Tuple[NeonEnv, TimelineId, List[TenantId]]: - """ - The fixture prepares environment or restores it from a snapshot. 
- - The logic is the following: - - if the snapshot directory exists, the snapshot is restored from it - - if there is no snapshot, the environment is initialized from scratch and stored in a snapshot - - if the fixture is executed on CI (it has CI=true in the environment), the snapshot is not saved - """ - - save_snapshot = os.getenv("CI", "false") != "true" - - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - - # create our template tenant - tenant_config_mgmt_api = { - "gc_period": "0s", - "checkpoint_timeout": "10 years", - "compaction_period": "20 s", - "compaction_threshold": 10, - "compaction_target_size": 134217728, - "checkpoint_distance": 268435456, - "image_creation_threshold": 3, - } - - if test_snapshot_dir.is_initialized(): - save_snapshot = False - env = neon_env_builder.from_repo_dir(test_snapshot_dir.path) - ps_http = env.pageserver.http_client() - tenants = list( - {TenantId(t.name) for t in (test_snapshot_dir.path.glob("pageserver_*/tenants/*"))} - ) - template_timeline = env.initial_timeline - - neon_env_builder.start() - else: - env = neon_env_builder.init_start() - - remote_storage = env.pageserver_remote_storage - assert isinstance(remote_storage, LocalFsStorage) - - ps_http = env.pageserver.http_client() - # clean up the useless default tenant - ps_http.tenant_delete(env.initial_tenant) - - tenant_config_cli = {k: str(v) for k, v in tenant_config_mgmt_api.items()} - - template_tenant, template_timeline = env.neon_cli.create_tenant( - conf=tenant_config_cli, set_default=True - ) - with env.endpoints.create_start("main", tenant_id=template_tenant) as ep: - pg_bin.run_capture(["pgbench", "-i", "-s5", ep.connstr()]) +) -> fixtures.pageserver.many_tenants.SingleTimeline: + def setup_template(env: NeonEnv): + # create our template tenant + config = { + "gc_period": "0s", + "checkpoint_timeout": "10 years", + "compaction_period": "20 s", + "compaction_threshold": 10, + "compaction_target_size": 134217728, + 
"checkpoint_distance": 268435456, + "image_creation_threshold": 3, + } + template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) + env.pageserver.tenant_detach(template_tenant) + env.pageserver.tenant_attach(template_tenant, config) + # with env.endpoints.create_start("main", tenant_id=template_tenant) as ep: + # pg_bin.run_capture(["pgbench", "-i", "-s5", ep.connstr()]) + # last_flush_lsn_upload(env, ep, template_tenant, template_timeline) + ep = env.endpoints.create_start("main", tenant_id=template_tenant) + ep.safe_psql("create table foo(b text)") + for i in range(0, 8): + ep.safe_psql("insert into foo(b) values ('some text')") last_flush_lsn_upload(env, ep, template_tenant, template_timeline) - ps_http.tenant_detach(template_tenant) + ep.stop_and_destroy() + return (template_tenant, template_timeline, config) - # duplicate the template 20 times tenants in localfs storage - tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, 20) - - # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done. - # However, on-demand downloads are quite slow ATM. - # => do the on-demand downloads in Python. 
- assert ps_http.tenant_list() == [] - # make the attach fail after it created enough on-disk state to retry loading - # the tenant next startup, but before it can start background loops that would start download - ps_http.configure_failpoints(("attach-before-activate", "return")) - env.pageserver.allowed_errors.append( - ".*attach failed, setting tenant state to Broken: attach-before-activate.*" - ) - for tenant in tenants: - env.pageserver.tenant_attach( - tenant, - config=tenant_config_mgmt_api.copy(), - ) - wait_until_tenant_state(ps_http, tenant, "Broken", 3) - env.pageserver.stop() # clears the failpoint as a side-effect - tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants)) - fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir( - env, tenant_timelines - ) - env.pageserver.start() - - for tenant in tenants: - wait_until_tenant_active(ps_http, tenant) - - # ensure all layers are resident for predictiable performance - for tenant in tenants: - for timeline in ps_http.tenant_status(tenant)["timelines"]: - info = ps_http.layer_map_info(tenant, timeline) - for layer in info.historic_layers: - assert not layer.remote - - # take snapshot after download all layers so tenant dir restoration is fast - if save_snapshot: - shutil.copytree(env.repo_dir, test_snapshot_dir.path) - test_snapshot_dir.set_initialized() - - return env, template_timeline, tenants + return fixtures.pageserver.many_tenants.single_timeline( + neon_env_builder, + test_snapshot_dir, + setup_template, + 20_000, + ) def test_getpage_throughput( - snapshotting_env: Tuple[NeonEnv, TimelineId, List[TenantId]], + getpage_throughput_fixture: fixtures.pageserver.many_tenants.SingleTimeline, zenbenchmark: NeonBenchmarker, pg_bin: PgBin, ): - env, template_timeline, tenants = snapshotting_env + env, template_timeline, tenants = ( + getpage_throughput_fixture.env, + getpage_throughput_fixture.timeline_id, + getpage_throughput_fixture.tenants, + ) 
ps_http = env.pageserver.http_client() # run the benchmark with one client per timeline, each doing 10k requests to random keys.