Add WAL offloading to s3 on safekeepers.

Separate task is launched for each timeline and stopped when timeline doesn't
need offloading. The decision about which safekeeper offloads is made through etcd
leader election; currently there is no precondition for participating — that is a TODO.

neon_local and test infrastructure for remote storage in safekeepers are added,
along with the test itself.

ref #1009

Co-authored-by: Anton Shyrabokau <ahtoxa@Antons-MacBook-Pro.local>
This commit is contained in:
Arseny Sher
2022-04-27 00:24:59 -07:00
parent 1d71949c51
commit 0e1bd57c53
28 changed files with 1146 additions and 429 deletions

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
from dataclasses import field
from enum import Flag, auto
import textwrap
from cached_property import cached_property
import asyncpg
@@ -421,10 +422,51 @@ class MockS3Server:
def secret_key(self) -> str:
    """Return the fixed secret access key accepted by the mock S3 server."""
    return "test"
def access_env_vars(self) -> Dict[Any, Any]:
    """Return the AWS credential environment variables for the mock S3 server."""
    credentials = {
        'AWS_ACCESS_KEY_ID': self.access_key(),
        'AWS_SECRET_ACCESS_KEY': self.secret_key(),
    }
    return credentials
def kill(self):
    """Forcibly stop the mock S3 server by killing its subprocess.

    No graceful shutdown is attempted; the test server holds no state
    worth flushing.
    """
    self.subprocess.kill()
@dataclass
class LocalFsStorage:
    """Remote-storage configuration backed by a local filesystem directory."""
    # Root path used as the storage destination; serialized as `local_path`
    # in the TOML inline table.
    local_path: Path
@dataclass
class S3Storage:
    """Remote-storage configuration targeting an S3 bucket."""
    # Bucket name and region; serialized as `bucket_name` / `bucket_region`.
    bucket_name: str
    bucket_region: str
    # Optional custom endpoint URL — set when pointing at a mock S3 server;
    # omitted from the TOML output when None.
    endpoint: Optional[str]
# Either storage backend; discriminated by isinstance checks when serializing.
RemoteStorage = Union[LocalFsStorage, S3Storage]
def remote_storage_to_toml_inline_table(remote_storage) -> str:
    """Serialize a RemoteStorage configuration as a TOML inline table string.

    Args:
        remote_storage: a LocalFsStorage or S3Storage instance.

    Returns:
        A string of the form ``{key='value', ...}`` suitable for embedding
        in a TOML config file.

    Raises:
        Exception: if remote_storage is neither LocalFsStorage nor S3Storage.
    """
    if isinstance(remote_storage, LocalFsStorage):
        res = f"local_path='{remote_storage.local_path}'"
    elif isinstance(remote_storage, S3Storage):
        res = f"bucket_name='{remote_storage.bucket_name}', bucket_region='{remote_storage.bucket_region}'"
        # endpoint is optional — only emitted when targeting a non-default
        # (e.g. mock) S3 endpoint.
        if remote_storage.endpoint is not None:
            res += f", endpoint='{remote_storage.endpoint}'"
    # BUG FIX: the original had two `else:` clauses on this `if` (a duplicate
    # "invalid remote storage type" branch), which is a syntax error; keep the
    # informative f-string variant only.
    else:
        raise Exception(f'Unknown storage configuration {remote_storage}')
    return f"{{{res}}}"
class RemoteStorageUsers(Flag):
    """Bitmask naming which services consume the remote storage configuration.

    Members may be combined with ``|`` and tested with ``&``. The explicit
    values match what ``auto()`` assigns for a ``Flag`` (powers of two).
    """
    PAGESERVER = 1
    SAFEKEEPER = 2
class ZenithEnvBuilder:
"""
Builder object to create a Zenith runtime environment
@@ -440,6 +482,7 @@ class ZenithEnvBuilder:
broker: Etcd,
mock_s3_server: MockS3Server,
remote_storage: Optional[RemoteStorage] = None,
remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER,
pageserver_config_override: Optional[str] = None,
num_safekeepers: int = 1,
pageserver_auth_enabled: bool = False,
@@ -449,6 +492,7 @@ class ZenithEnvBuilder:
self.rust_log_override = rust_log_override
self.port_distributor = port_distributor
self.remote_storage = remote_storage
self.remote_storage_users = remote_storage_users
self.broker = broker
self.mock_s3_server = mock_s3_server
self.pageserver_config_override = pageserver_config_override
@@ -497,9 +541,9 @@ class ZenithEnvBuilder:
aws_access_key_id=self.mock_s3_server.access_key(),
aws_secret_access_key=self.mock_s3_server.secret_key(),
).create_bucket(Bucket=bucket_name)
self.remote_storage = S3Storage(bucket=bucket_name,
self.remote_storage = S3Storage(bucket_name=bucket_name,
endpoint=mock_endpoint,
region=mock_region)
bucket_region=mock_region)
def __enter__(self):
return self
@@ -557,6 +601,7 @@ class ZenithEnv:
self.safekeepers: List[Safekeeper] = []
self.broker = config.broker
self.remote_storage = config.remote_storage
self.remote_storage_users = config.remote_storage_users
# generate initial tenant ID here instead of letting 'zenith init' generate it,
# so that we don't need to dig it out of the config file afterwards.
@@ -605,8 +650,12 @@ class ZenithEnv:
id = {id}
pg_port = {port.pg}
http_port = {port.http}
sync = false # Disable fsyncs to make the tests go faster
""")
sync = false # Disable fsyncs to make the tests go faster""")
if bool(self.remote_storage_users
& RemoteStorageUsers.SAFEKEEPER) and self.remote_storage is not None:
toml += textwrap.dedent(f"""
remote_storage = "{remote_storage_to_toml_inline_table(self.remote_storage)}"
""")
safekeeper = Safekeeper(env=self, id=id, port=port)
self.safekeepers.append(safekeeper)
@@ -638,7 +687,7 @@ def _shared_simple_env(request: Any,
mock_s3_server: MockS3Server,
default_broker: Etcd) -> Iterator[ZenithEnv]:
"""
Internal fixture backing the `zenith_simple_env` fixture. If TEST_SHARED_FIXTURES
# Internal fixture backing the `zenith_simple_env` fixture. If TEST_SHARED_FIXTURES
is set, this is shared by all tests using `zenith_simple_env`.
"""
@@ -822,20 +871,6 @@ class PageserverPort:
http: int
@dataclass
class LocalFsStorage:
root: Path
@dataclass
class S3Storage:
bucket: str
region: str
endpoint: Optional[str]
RemoteStorage = Union[LocalFsStorage, S3Storage]
CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P<timeline_id>[^']+)'",
re.MULTILINE)
CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P<timeline_id>[^']+)'",
@@ -998,6 +1033,7 @@ class ZenithCli:
append_pageserver_param_overrides(
params_to_update=cmd,
remote_storage=self.env.remote_storage,
remote_storage_users=self.env.remote_storage_users,
pageserver_config_override=self.env.pageserver.config_override)
res = self.raw_cli(cmd)
@@ -1022,14 +1058,10 @@ class ZenithCli:
append_pageserver_param_overrides(
params_to_update=start_args,
remote_storage=self.env.remote_storage,
remote_storage_users=self.env.remote_storage_users,
pageserver_config_override=self.env.pageserver.config_override)
s3_env_vars = None
if self.env.s3_mock_server:
s3_env_vars = {
'AWS_ACCESS_KEY_ID': self.env.s3_mock_server.access_key(),
'AWS_SECRET_ACCESS_KEY': self.env.s3_mock_server.secret_key(),
}
s3_env_vars = self.env.s3_mock_server.access_env_vars() if self.env.s3_mock_server else None
return self.raw_cli(start_args, extra_env_vars=s3_env_vars)
def pageserver_stop(self, immediate=False) -> 'subprocess.CompletedProcess[str]':
@@ -1041,7 +1073,8 @@ class ZenithCli:
return self.raw_cli(cmd)
def safekeeper_start(self, id: int) -> 'subprocess.CompletedProcess[str]':
return self.raw_cli(['safekeeper', 'start', str(id)])
s3_env_vars = self.env.s3_mock_server.access_env_vars() if self.env.s3_mock_server else None
return self.raw_cli(['safekeeper', 'start', str(id)], extra_env_vars=s3_env_vars)
def safekeeper_stop(self,
id: Optional[int] = None,
@@ -1237,22 +1270,13 @@ class ZenithPageserver(PgProtocol):
def append_pageserver_param_overrides(
params_to_update: List[str],
remote_storage: Optional[RemoteStorage],
remote_storage_users: RemoteStorageUsers,
pageserver_config_override: Optional[str] = None,
):
if remote_storage is not None:
if isinstance(remote_storage, LocalFsStorage):
pageserver_storage_override = f"local_path='{remote_storage.root}'"
elif isinstance(remote_storage, S3Storage):
pageserver_storage_override = f"bucket_name='{remote_storage.bucket}',\
bucket_region='{remote_storage.region}'"
if remote_storage.endpoint is not None:
pageserver_storage_override += f",endpoint='{remote_storage.endpoint}'"
else:
raise Exception(f'Unknown storage configuration {remote_storage}')
if bool(remote_storage_users & RemoteStorageUsers.PAGESERVER) and remote_storage is not None:
remote_storage_toml_table = remote_storage_to_toml_inline_table(remote_storage)
params_to_update.append(
f'--pageserver-config-override=remote_storage={{{pageserver_storage_override}}}')
f'--pageserver-config-override=remote_storage={remote_storage_toml_table}')
env_overrides = os.getenv('ZENITH_PAGESERVER_OVERRIDES')
if env_overrides is not None:
@@ -1786,8 +1810,9 @@ class Safekeeper:
class SafekeeperTimelineStatus:
acceptor_epoch: int
flush_lsn: str
remote_consistent_lsn: str
timeline_start_lsn: str
backup_lsn: str
remote_consistent_lsn: str
@dataclass
@@ -1812,8 +1837,9 @@ class SafekeeperHttpClient(requests.Session):
resj = res.json()
return SafekeeperTimelineStatus(acceptor_epoch=resj['acceptor_state']['epoch'],
flush_lsn=resj['flush_lsn'],
remote_consistent_lsn=resj['remote_consistent_lsn'],
timeline_start_lsn=resj['timeline_start_lsn'])
timeline_start_lsn=resj['timeline_start_lsn'],
backup_lsn=resj['backup_lsn'],
remote_consistent_lsn=resj['remote_consistent_lsn'])
def record_safekeeper_info(self, tenant_id: str, timeline_id: str, body):
res = self.post(