Release 2023 01 31 (#3497)

Co-authored-by: Kirill Bulatov <kirill@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>
Co-authored-by: Christian Schwarz <christian@neon.tech>
Co-authored-by: Alexey Kondratov <kondratov.aleksey@gmail.com>
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Co-authored-by: Konstantin Knizhnik <knizhnik@garret.ru>
Co-authored-by: Shany Pozin <shany@neon.tech>
Co-authored-by: Sergey Melnikov <sergey@neon.tech>
Co-authored-by: Dmitry Rodionov <dmitry@neon.tech>
Co-authored-by: Rory de Zoete <33318916+zoete@users.noreply.github.com>
Co-authored-by: Rory de Zoete <rdezoete@Rorys-Mac-Studio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>
Co-authored-by: Lassi Pölönen <lassi.polonen@iki.fi>
This commit is contained in:
Vadim Kharitonov
2023-01-31 14:06:35 +01:00
committed by GitHub
parent 3c6f779698
commit eb36403e71
71 changed files with 5779 additions and 2408 deletions

View File

@@ -46,6 +46,12 @@ PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
"pageserver_remote_physical_size",
)
# Names of pageserver metrics that are global rather than per-tenant: the
# _count / _sum / _bucket series of pageserver_storage_operations_seconds_global
# (presumably the three series of one Prometheus histogram -- confirm against
# the pageserver metric definitions).
PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
"pageserver_storage_operations_seconds_global_count",
"pageserver_storage_operations_seconds_global_sum",
"pageserver_storage_operations_seconds_global_bucket",
)
PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
"pageserver_current_logical_size",
"pageserver_resident_physical_size",
@@ -61,13 +67,13 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
"pageserver_smgr_query_seconds_bucket",
"pageserver_smgr_query_seconds_count",
"pageserver_smgr_query_seconds_sum",
"pageserver_storage_operations_seconds_bucket",
"pageserver_storage_operations_seconds_count",
"pageserver_storage_operations_seconds_sum",
"pageserver_storage_operations_seconds_count_total",
"pageserver_storage_operations_seconds_sum_total",
"pageserver_wait_lsn_seconds_bucket",
"pageserver_wait_lsn_seconds_count",
"pageserver_wait_lsn_seconds_sum",
"pageserver_created_persistent_files_total",
"pageserver_written_persistent_bytes_total",
"pageserver_tenant_states_count",
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
)

View File

@@ -2,7 +2,15 @@ from contextlib import closing
import psycopg2.extras
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.neon_fixtures import (
LocalFsStorage,
NeonEnvBuilder,
RemoteStorageKind,
assert_tenant_status,
wait_for_upload,
)
from fixtures.types import Lsn
from fixtures.utils import wait_until
def test_tenant_config(neon_env_builder: NeonEnvBuilder):
@@ -57,7 +65,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
"compaction_period": 20,
"compaction_threshold": 10,
"gc_horizon": 67108864,
"gc_period": 100,
"gc_period": 60 * 60,
"image_creation_threshold": 3,
"pitr_interval": 604800, # 7 days
}.items()
@@ -158,3 +166,46 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
"pitr_interval": 60,
}.items()
)
def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):
    """Check that a tenant config file can be written again after detach + attach.

    The tenant is created with defaults, uploaded to local-fs remote storage,
    detached (which must remove the on-disk config file), re-attached, and then
    reconfigured twice through the CLI. The test only verifies that the config
    file is readable after each reconfiguration and that its size changes as
    expected.
    """
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=RemoteStorageKind.LOCAL_FS,
        test_name="test_creating_tenant_conf_after_attach",
    )

    env = neon_env_builder.init_start()
    assert isinstance(env.remote_storage, LocalFsStorage)

    # The tenant is created with default settings, i.e. no explicit config is passed.
    tenant_id, timeline_id = env.neon_cli.create_tenant()
    tenant_dir = env.repo_dir / "tenants" / str(tenant_id)
    config_path = tenant_dir / "config"
    assert config_path.exists(), "config file is always initially created"

    http_client = env.pageserver.http_client()

    # Make sure there is something to upload before detaching.
    timeline_info = http_client.timeline_detail(tenant_id, timeline_id)
    last_record_lsn = Lsn(timeline_info["last_record_lsn"])
    assert last_record_lsn.lsn_int != 0, "initdb must have executed"

    wait_for_upload(http_client, tenant_id, timeline_id, last_record_lsn)

    http_client.tenant_detach(tenant_id)
    assert not config_path.exists(), "detach did not remove config file"

    http_client.tenant_attach(tenant_id)

    def tenant_is_active():
        # Polled by wait_until until the attached tenant reports "Active".
        return assert_tenant_status(http_client, tenant_id, "Active")

    wait_until(number_of_iterations=5, interval=1, func=tenant_is_active)

    # Write the config twice; the second value is textually shorter than the first.
    env.neon_cli.config_tenant(tenant_id, {"gc_horizon": "1000000"})
    size_after_first_write = len(config_path.read_text())
    env.neon_cli.config_tenant(tenant_id, {"gc_horizon": "0"})
    size_after_second_write = len(config_path.read_text())

    # Applying the setting is covered by another test case; here we only care
    # that the config file can be created after the attach.
    assert size_after_first_write > size_after_second_write

View File

@@ -6,6 +6,7 @@ from threading import Thread
import asyncpg
import pytest
from fixtures.log_helper import log
from fixtures.metrics import parse_metrics
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
@@ -59,11 +60,11 @@ def test_tenant_reattach(
# create new tenant
tenant_id, timeline_id = env.neon_cli.create_tenant()
pg = env.postgres.create_start("main", tenant_id=tenant_id)
with pg.cursor() as cur:
cur.execute("CREATE TABLE t(key int primary key, value text)")
cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
with env.postgres.create_start("main", tenant_id=tenant_id) as pg:
with pg.cursor() as cur:
cur.execute("CREATE TABLE t(key int primary key, value text)")
cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
# Wait for all the data to be processed by the pageserver and uploaded to remote storage
wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
@@ -78,15 +79,34 @@ def test_tenant_reattach(
".*failed to perform remote task UploadMetadata.*, will retry.*"
)
ps_metrics = parse_metrics(pageserver_http.get_metrics(), "pageserver")
tenant_metric_filter = {
"tenant_id": str(tenant_id),
"timeline_id": str(timeline_id),
}
pageserver_last_record_lsn_before_detach = int(
ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value
)
pageserver_http.tenant_detach(tenant_id)
pageserver_http.tenant_attach(tenant_id)
with pg.cursor() as cur:
assert query_scalar(cur, "SELECT count(*) FROM t") == 100000
time.sleep(1) # for metrics propagation
# Check that we had to retry the downloads
assert env.pageserver.log_contains(".*list prefixes.*failed, will retry.*")
assert env.pageserver.log_contains(".*download.*failed, will retry.*")
ps_metrics = parse_metrics(pageserver_http.get_metrics(), "pageserver")
pageserver_last_record_lsn = int(
ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value
)
assert pageserver_last_record_lsn_before_detach == pageserver_last_record_lsn
with env.postgres.create_start("main", tenant_id=tenant_id) as pg:
with pg.cursor() as cur:
assert query_scalar(cur, "SELECT count(*) FROM t") == 100000
# Check that we had to retry the downloads
assert env.pageserver.log_contains(".*list prefixes.*failed, will retry.*")
assert env.pageserver.log_contains(".*download.*failed, will retry.*")
num_connections = 10
@@ -237,7 +257,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
env.pageserver.allowed_errors.append(".*NotFound\\(Tenant .* not found")
env.pageserver.allowed_errors.append(".*NotFound: Tenant .* not found")
# first check for non existing tenant
tenant_id = TenantId.generate()
@@ -272,8 +292,10 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
bogus_timeline_id = TimelineId.generate()
pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0)
# the error will be printed to the log too
# the error will be printed to the log too
env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*")
# Timelines get stopped during detach; ignore the gc calls that error out after witnessing that
env.pageserver.allowed_errors.append(".*InternalServerError\\(timeline is Stopping.*")
# Detach while running manual GC.
# It should wait for manual GC to finish because it runs in a task associated with the tenant.

View File

@@ -1,5 +1,6 @@
import os
import shutil
import time
from contextlib import closing
from datetime import datetime
from pathlib import Path
@@ -8,6 +9,7 @@ from typing import List
import pytest
from fixtures.log_helper import log
from fixtures.metrics import (
PAGESERVER_GLOBAL_METRICS,
PAGESERVER_PER_TENANT_METRICS,
PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
parse_metrics,
@@ -160,6 +162,14 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
f"process_start_time_seconds (UTC): {datetime.fromtimestamp(metrics.query_one('process_start_time_seconds').value)}"
)
# Test (a subset of) pageserver global metrics
for metric in PAGESERVER_GLOBAL_METRICS:
ps_samples = ps_metrics.query_all(metric, {})
assert len(ps_samples) > 0
for sample in ps_samples:
labels = ",".join([f'{key}="{value}"' for key, value in sample.labels.items()])
log.info(f"{sample.name}{{{labels}}} {sample.value}")
@pytest.mark.parametrize(
"remote_storage_kind",
@@ -259,7 +269,7 @@ def test_pageserver_with_empty_tenants(
files_in_timelines_dir == 0
), f"Tenant {tenant_with_empty_timelines_dir} should have an empty timelines/ directory"
# Trigger timeline reinitialization after pageserver restart
# Trigger timeline re-initialization after pageserver restart
env.postgres.stop_all()
env.pageserver.stop()
@@ -278,7 +288,51 @@ def test_pageserver_with_empty_tenants(
broken_tenant["state"] == "Broken"
), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken"
broken_tenant_status = client.tenant_status(tenant_without_timelines_dir)
assert (
broken_tenant_status["state"] == "Broken"
), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken"
assert env.pageserver.log_contains(".*Setting tenant as Broken state, reason:.*")
# The tenant whose timelines/ directory exists but is empty must come up as
# Active, both in the tenant list and in its individual status.
[loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines_dir)]
# The message must be an f-string, otherwise the tenant id placeholder is
# printed literally on failure.
assert (
    loaded_tenant["state"] == "Active"
), f"Tenant {tenant_with_empty_timelines_dir} with empty timelines dir should be active and ready for timeline creation"
loaded_tenant_status = client.tenant_status(tenant_with_empty_timelines_dir)
# This tenant has an empty timelines dir (it is not the one missing the dir),
# so the failure message says "with empty timelines dir".
assert (
    loaded_tenant_status["state"] == "Active"
), f"Tenant {tenant_with_empty_timelines_dir} with empty timelines dir should be active"
time.sleep(1) # to allow metrics propagation
ps_metrics = parse_metrics(client.get_metrics(), "pageserver")
broken_tenants_metric_filter = {
"tenant_id": str(tenant_without_timelines_dir),
"state": "broken",
}
active_tenants_metric_filter = {
"tenant_id": str(tenant_with_empty_timelines_dir),
"state": "active",
}
tenant_active_count = int(
ps_metrics.query_one(
"pageserver_tenant_states_count", filter=active_tenants_metric_filter
).value
)
assert (
tenant_active_count == 1
), f"Tenant {tenant_with_empty_timelines_dir} should have metric as active"
tenant_broken_count = int(
ps_metrics.query_one(
"pageserver_tenant_states_count", filter=broken_tenants_metric_filter
).value
)
assert (
tenant_broken_count == 1
), f"Tenant {tenant_without_timelines_dir} should have metric as broken"