reusable abstraction for many tenants fixture

This commit is contained in:
Christian Schwarz
2024-01-05 18:08:59 +00:00
parent dd69927953
commit 392e014a7f
3 changed files with 151 additions and 108 deletions

View File

@@ -0,0 +1,110 @@
from dataclasses import dataclass
import json
import os
import shutil
import subprocess
from pathlib import Path
import time
from typing import Any, Callable, Dict, List, Tuple
import pytest
from fixtures.benchmark_fixture import NeonBenchmarker
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
PgBin,
SnapshotDir,
last_flush_lsn_upload,
)
from fixtures.pageserver.utils import wait_until_tenant_active, wait_until_tenant_state
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from fixtures.types import TenantId, TimelineId
import fixtures.pageserver.remote_storage
@dataclass
class SingleTimeline:
    """
    Result bundle describing an env populated with many tenants that all
    share the same single timeline id.
    """

    # The environment the tenants live in.
    env: NeonEnv
    # The one timeline id that each tenant in `tenants` has.
    timeline_id: TimelineId
    # Ids of all tenants present in `env`.
    tenants: List[TenantId]
def single_timeline(
    neon_env_builder: NeonEnvBuilder,
    snapshot_dir: SnapshotDir,
    setup_template: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]],
    ncopies: int,
) -> SingleTimeline:
    """
    Build an env holding `ncopies` duplicates of a single-timeline template
    tenant, or restore such an env from `snapshot_dir` if it is initialized.

    `setup_template` is only invoked when building from scratch. It receives
    the freshly started env and returns `(tenant_id, timeline_id, config)`;
    a copy of `config` is passed to the attach call of every duplicate.

    When built from scratch, the env is copied into `snapshot_dir` afterwards,
    unless the `CI` environment variable is set to "true".
    """
    # Saving is skipped when CI=true, and (below) when we restored from a snapshot.
    save_snapshot = os.getenv("CI", "false") != "true"

    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)

    if snapshot_dir.is_initialized():
        # Restore path: nothing new to save.
        save_snapshot = False
        env = neon_env_builder.from_repo_dir(snapshot_dir.path)
        http = env.pageserver.http_client()

        # Tenant ids are the directory names under each pageserver's tenants dir;
        # going through a set de-duplicates ids that show up more than once.
        tenant_dirs = snapshot_dir.path.glob("pageserver_*/tenants/*")
        tenant_ids = list({TenantId(entry.name) for entry in tenant_dirs})
        timeline_id = env.initial_timeline

        neon_env_builder.start()
    else:
        env = neon_env_builder.init_start()
        assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
        http = env.pageserver.http_client()

        # The default tenant is of no use here, get rid of it.
        http.tenant_delete(env.initial_tenant)

        tmpl_tenant, timeline_id, tmpl_config = setup_template(env)
        env.pageserver.http_client().tenant_detach(tmpl_tenant)

        # Duplicate the template tenant `ncopies` times in localfs storage.
        tenant_ids = fixtures.pageserver.remote_storage.duplicate_tenant(
            env, tmpl_tenant, ncopies
        )

        # In theory we could just attach all the tenants, force on-demand downloads
        # via mgmt API, and be done. However, on-demand downloads are quite slow ATM.
        # => do the on-demand downloads in Python.
        assert http.tenant_list() == []

        # Make the attach fail after it created enough on-disk state to retry loading
        # the tenant on next startup, but before it can start background loops that
        # would start downloads.
        http.configure_failpoints(("attach-before-activate", "return"))
        env.pageserver.allowed_errors.append(
            ".*attach failed, setting tenant state to Broken: attach-before-activate.*"
        )
        for tenant_id in tenant_ids:
            env.pageserver.tenant_attach(
                tenant_id,
                config=tmpl_config.copy(),
            )
            wait_until_tenant_state(http, tenant_id, "Broken", 3)

        env.pageserver.stop()  # clears the failpoint as a side-effect

        # Every duplicate has the template's timeline id.
        tenant_timelines = [(tenant_id, timeline_id) for tenant_id in tenant_ids]
        fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir(
            env, tenant_timelines
        )

        env.pageserver.start()

    for tenant_id in tenant_ids:
        wait_until_tenant_active(http, tenant_id)

    # Ensure all layers are resident for predictable performance.
    for tenant_id in tenant_ids:
        for timeline in http.tenant_status(tenant_id)["timelines"]:
            layer_map = http.layer_map_info(tenant_id, timeline)
            for layer in layer_map.historic_layers:
                assert not layer.remote

    # Take the snapshot after all layers are local so restoring the tenant dirs is fast.
    if save_snapshot:
        shutil.copytree(env.repo_dir, snapshot_dir.path)
        snapshot_dir.set_initialized()

    return SingleTimeline(env=env, timeline_id=timeline_id, tenants=tenant_ids)

View File

@@ -31,10 +31,10 @@ class DeltaLayerFileName:
key_start: Key
key_end: Key
def is_l0(self):
def is_l0(self) -> bool:
return self.key_start == KEY_MIN and self.key_end == KEY_MAX
def to_str(self):
def to_str(self) -> str:
ret = f"{self.key_start.as_int():036X}-{self.key_end.as_int():036X}__{self.lsn_start.as_int():016X}-{self.lsn_end.as_int():016X}"
assert self == parse_layer_file_name(ret)
return ret

View File

@@ -1,12 +1,8 @@
import json
import os
import shutil
import subprocess
from pathlib import Path
import time
from typing import List, Tuple
import pytest
from fixtures.benchmark_fixture import NeonBenchmarker
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
@@ -16,123 +12,60 @@ from fixtures.neon_fixtures import (
SnapshotDir,
last_flush_lsn_upload,
)
from fixtures.pageserver.utils import wait_until_tenant_active, wait_until_tenant_state
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from fixtures.types import TenantId, TimelineId
import fixtures.pageserver.remote_storage
import fixtures.pageserver.many_tenants
@pytest.fixture(scope="function")
@pytest.mark.timeout(1000)
def snapshotting_env(
def getpage_throughput_fixture(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
test_snapshot_dir: SnapshotDir,
) -> Tuple[NeonEnv, TimelineId, List[TenantId]]:
"""
The fixture prepares environment or restores it from a snapshot.
The logic is the following:
- if the snapshot directory exists, the snapshot is restored from it
- if there is no snapshot, the environment is initialized from scratch and stored in a snapshot
- if the fixture is executed on CI (it has CI=true in the environment), the snapshot is not saved
"""
save_snapshot = os.getenv("CI", "false") != "true"
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
# create our template tenant
tenant_config_mgmt_api = {
"gc_period": "0s",
"checkpoint_timeout": "10 years",
"compaction_period": "20 s",
"compaction_threshold": 10,
"compaction_target_size": 134217728,
"checkpoint_distance": 268435456,
"image_creation_threshold": 3,
}
if test_snapshot_dir.is_initialized():
save_snapshot = False
env = neon_env_builder.from_repo_dir(test_snapshot_dir.path)
ps_http = env.pageserver.http_client()
tenants = list(
{TenantId(t.name) for t in (test_snapshot_dir.path.glob("pageserver_*/tenants/*"))}
)
template_timeline = env.initial_timeline
neon_env_builder.start()
else:
env = neon_env_builder.init_start()
remote_storage = env.pageserver_remote_storage
assert isinstance(remote_storage, LocalFsStorage)
ps_http = env.pageserver.http_client()
# clean up the useless default tenant
ps_http.tenant_delete(env.initial_tenant)
tenant_config_cli = {k: str(v) for k, v in tenant_config_mgmt_api.items()}
template_tenant, template_timeline = env.neon_cli.create_tenant(
conf=tenant_config_cli, set_default=True
)
with env.endpoints.create_start("main", tenant_id=template_tenant) as ep:
pg_bin.run_capture(["pgbench", "-i", "-s5", ep.connstr()])
) -> fixtures.pageserver.many_tenants.SingleTimeline:
def setup_template(env: NeonEnv):
# create our template tenant
config = {
"gc_period": "0s",
"checkpoint_timeout": "10 years",
"compaction_period": "20 s",
"compaction_threshold": 10,
"compaction_target_size": 134217728,
"checkpoint_distance": 268435456,
"image_creation_threshold": 3,
}
template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
env.pageserver.tenant_detach(template_tenant)
env.pageserver.tenant_attach(template_tenant, config)
# with env.endpoints.create_start("main", tenant_id=template_tenant) as ep:
# pg_bin.run_capture(["pgbench", "-i", "-s5", ep.connstr()])
# last_flush_lsn_upload(env, ep, template_tenant, template_timeline)
ep = env.endpoints.create_start("main", tenant_id=template_tenant)
ep.safe_psql("create table foo(b text)")
for i in range(0, 8):
ep.safe_psql("insert into foo(b) values ('some text')")
last_flush_lsn_upload(env, ep, template_tenant, template_timeline)
ps_http.tenant_detach(template_tenant)
ep.stop_and_destroy()
return (template_tenant, template_timeline, config)
# duplicate the template 20 times tenants in localfs storage
tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, 20)
# In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done.
# However, on-demand downloads are quite slow ATM.
# => do the on-demand downloads in Python.
assert ps_http.tenant_list() == []
# make the attach fail after it created enough on-disk state to retry loading
# the tenant next startup, but before it can start background loops that would start download
ps_http.configure_failpoints(("attach-before-activate", "return"))
env.pageserver.allowed_errors.append(
".*attach failed, setting tenant state to Broken: attach-before-activate.*"
)
for tenant in tenants:
env.pageserver.tenant_attach(
tenant,
config=tenant_config_mgmt_api.copy(),
)
wait_until_tenant_state(ps_http, tenant, "Broken", 3)
env.pageserver.stop() # clears the failpoint as a side-effect
tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants))
fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir(
env, tenant_timelines
)
env.pageserver.start()
for tenant in tenants:
wait_until_tenant_active(ps_http, tenant)
# ensure all layers are resident for predictiable performance
for tenant in tenants:
for timeline in ps_http.tenant_status(tenant)["timelines"]:
info = ps_http.layer_map_info(tenant, timeline)
for layer in info.historic_layers:
assert not layer.remote
# take snapshot after download all layers so tenant dir restoration is fast
if save_snapshot:
shutil.copytree(env.repo_dir, test_snapshot_dir.path)
test_snapshot_dir.set_initialized()
return env, template_timeline, tenants
return fixtures.pageserver.many_tenants.single_timeline(
neon_env_builder,
test_snapshot_dir,
setup_template,
20_000,
)
def test_getpage_throughput(
snapshotting_env: Tuple[NeonEnv, TimelineId, List[TenantId]],
getpage_throughput_fixture: fixtures.pageserver.many_tenants.SingleTimeline,
zenbenchmark: NeonBenchmarker,
pg_bin: PgBin,
):
env, template_timeline, tenants = snapshotting_env
env, template_timeline, tenants = (
getpage_throughput_fixture.env,
getpage_throughput_fixture.timeline_id,
getpage_throughput_fixture.tenants,
)
ps_http = env.pageserver.http_client()
# run the benchmark with one client per timeline, each doing 10k requests to random keys.