Add existing_initdb_timeline_id param to timeline creation (#5912)

This PR adds an `existing_initdb_timeline_id` option to timeline
creation APIs, taking an optional timeline ID.

Follow-up to #5390.

If the `existing_initdb_timeline_id` option is specified via the HTTP
API, the pageserver downloads the existing initdb archive from the given
timeline ID and extracts it, instead of running initdb itself.

---------

Co-authored-by: Christian Schwarz <christian@neon.tech>
This commit is contained in:
Arpad Müller
2023-11-30 22:32:04 +01:00
committed by GitHub
parent 3842773546
commit b71b8ecfc2
17 changed files with 245 additions and 44 deletions

View File

@@ -362,12 +362,16 @@ class PageserverHttpClient(requests.Session):
new_timeline_id: TimelineId,
ancestor_timeline_id: Optional[TimelineId] = None,
ancestor_start_lsn: Optional[Lsn] = None,
existing_initdb_timeline_id: Optional[TimelineId] = None,
**kwargs,
) -> Dict[Any, Any]:
body: Dict[str, Any] = {
"new_timeline_id": str(new_timeline_id),
"ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None,
"ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None,
"existing_initdb_timeline_id": str(existing_initdb_timeline_id)
if existing_initdb_timeline_id
else None,
}
if pg_version != PgVersion.NOT_SET:
body["pg_version"] = int(pg_version)

View File

@@ -1,7 +1,7 @@
import time
from typing import TYPE_CHECKING, Any, Dict, Optional
from typing import TYPE_CHECKING, Any, Dict, List, Optional
from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef
from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef, ObjectTypeDef
from fixtures.log_helper import log
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
@@ -235,10 +235,14 @@ if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnvBuilder
def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
def assert_prefix_empty(
neon_env_builder: "NeonEnvBuilder",
prefix: Optional[str] = None,
allowed_postfix: Optional[str] = None,
):
response = list_prefix(neon_env_builder, prefix)
keys = response["KeyCount"]
objects = response.get("Contents", [])
objects: List[ObjectTypeDef] = response.get("Contents", [])
common_prefixes = response.get("CommonPrefixes", [])
remote_storage = neon_env_builder.pageserver_remote_storage
@@ -261,7 +265,18 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}"
)
assert keys == 0, f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
# Count the leftover objects that are NOT excused by `allowed_postfix`.
# Every entry of `objects` is one item of the ListObjectsV2 "Contents" list, so the
# object key has to be read from the item itself; the top-level response has no "Key".
if allowed_postfix is None:
    filtered_count = len(objects)
else:
    # An object is tolerated only when its key ends with the allowed postfix.
    filtered_count = sum(
        1 for obj in objects if not str(obj.get("Key", "")).endswith(allowed_postfix)
    )
assert (
    filtered_count == 0
), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):

View File

@@ -603,7 +603,12 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
remote_timeline_path = env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id)
assert not list(remote_timeline_path.iterdir())
filtered = [
path
for path in remote_timeline_path.iterdir()
if not (path.name.endswith("initdb.tar.zst"))
]
assert len(filtered) == 0
# timeline deletion should kill ongoing uploads, so, the metric will be gone
assert get_queued_count(file_kind="index", op_kind="upload") is None

View File

@@ -197,6 +197,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
# So by ignoring these instead of waiting for empty upload queue
# we execute more distinct code paths.
'.*stopping left-over name="remote upload".*',
".*Failed to load index_part from remote storage, failed creation?.*",
]
)
@@ -285,6 +286,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
str(tenant_id),
)
),
allowed_postfix="initdb.tar.zst",
)

View File

@@ -290,10 +290,13 @@ def test_pageserver_with_empty_tenants(
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.append(
".*marking .* as locally complete, while it doesnt exist in remote index.*"
env.pageserver.allowed_errors.extend(
[
".*marking .* as locally complete, while it doesnt exist in remote index.*",
".*Failed to load index_part from remote storage, failed creation?.*",
".*load failed.*list timelines directory.*",
]
)
env.pageserver.allowed_errors.append(".*load failed.*list timelines directory.*")
client = env.pageserver.http_client()

View File

@@ -230,6 +230,9 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
env.pageserver.allowed_errors.append(".*Timeline dir entry become invalid.*")
# In one of the branches we poll for tenant to become active. Polls can generate this log message:
env.pageserver.allowed_errors.append(f".*Tenant {env.initial_tenant} is not active*")
env.pageserver.allowed_errors.append(
".*Failed to load index_part from remote storage, failed creation?.*"
)
ps_http.configure_failpoints((failpoint, "return"))
@@ -308,8 +311,9 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
)
timeline_dir = env.pageserver.timeline_dir(env.initial_tenant, timeline_id)
# Check local is empty
assert not timeline_dir.exists()
if failpoint != "timeline-delete-after-index-delete":
# Check local is empty
assert (not timeline_dir.exists()) or len(os.listdir(timeline_dir)) == 0
# Check no delete mark present
assert not (timeline_dir.parent / f"{timeline_id}.___deleted").exists()

View File

@@ -1,6 +1,7 @@
import sys
import tarfile
import tempfile
import time
from pathlib import Path
import pytest
@@ -125,3 +126,43 @@ def test_wal_restore_initdb(
)
log.info(f"original lsn: {original_lsn}, restored lsn: {restored_lsn}")
assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)]
def test_wal_restore_http(
    neon_env_builder: NeonEnvBuilder,
    test_output_dir: Path,
):
    """
    Check that a deleted timeline can be re-created from its existing initdb archive.

    The test fills the initial timeline with data, deletes the timeline, and then
    re-creates it through the pageserver HTTP API with `existing_initdb_timeline_id`
    set to the deleted timeline's ID. Finally it starts a fresh endpoint and checks
    that the table contents are back.

    `test_output_dir` is currently unused; it is kept so the signature stays unchanged.
    """
    env = neon_env_builder.init_start()
    endpoint = env.endpoints.create_start("main")
    endpoint.safe_psql("create table t as select generate_series(1,300000)")
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

    ps_client = env.pageserver.http_client()

    # shut down the endpoint and delete the timeline from the pageserver
    endpoint.stop()

    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
    # Location of the initdb archive inside the (local fs) remote storage.
    initdb_archive = (
        env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id) / "initdb.tar.zst"
    )

    ps_client.timeline_delete(tenant_id, timeline_id)
    # NOTE(review): fixed sleep to give the deletion time to finish; presumably this
    # could poll for completion instead -- confirm which helper is appropriate.
    time.sleep(2)

    # TODO: verify that the timeline is indeed deleted

    # The restoration below relies on the archive still being present in remote
    # storage; fail early with a clear message if it is missing.
    assert (
        initdb_archive.exists()
    ), f"initdb archive {initdb_archive} is missing after timeline deletion"

    # issue the restoration command
    ps_client.timeline_create(
        tenant_id=tenant_id,
        new_timeline_id=timeline_id,
        existing_initdb_timeline_id=timeline_id,
        pg_version=env.pg_version,
    )

    # the table is back now!
    restored = env.endpoints.create_start("main")
    assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)]