always remove RemoteTimelineClient's metrics when dropping it

This commit is contained in:
Christian Schwarz
2022-12-13 12:23:47 -05:00
committed by Christian Schwarz
parent 8fcba150db
commit 4132ae9dfe
5 changed files with 144 additions and 30 deletions

View File

@@ -39,6 +39,13 @@ def parse_metrics(text: str, name: str = "") -> Metrics:
return metrics
PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
"pageserver_remote_upload_queue_unfinished_tasks",
"pageserver_remote_operation_seconds_bucket",
"pageserver_remote_operation_seconds_count",
"pageserver_remote_operation_seconds_sum",
)
PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
"pageserver_current_logical_size",
"pageserver_current_physical_size",
@@ -62,4 +69,5 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
"pageserver_wait_lsn_seconds_sum",
"pageserver_created_persistent_files_total",
"pageserver_written_persistent_bytes_total",
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
)

View File

@@ -384,7 +384,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
metrics,
re.MULTILINE,
)
assert matches
if matches is None:
return None
return int(matches[1])
pg = env.postgres.create_start("main", tenant_id=tenant_id)
@@ -436,8 +437,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
assert not timeline_path.exists()
# timeline deletion should kill ongoing uploads
assert get_queued_count(file_kind="index", op_kind="upload") == 0
# timeline deletion should kill ongoing uploads, so, the metric will be gone
assert get_queued_count(file_kind="index", op_kind="upload") is None
# timeline deletion should be unblocking checkpoint ops
checkpoint_thread.join(2.0)

View File

@@ -7,7 +7,11 @@ from typing import List
import pytest
from fixtures.log_helper import log
from fixtures.metrics import PAGESERVER_PER_TENANT_METRICS, parse_metrics
from fixtures.metrics import (
PAGESERVER_PER_TENANT_METRICS,
PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
parse_metrics,
)
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
@@ -157,9 +161,21 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
)
def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize(
"remote_storage_kind",
# exercise both the code paths where remote_storage=None and remote_storage=Some(...)
[RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3],
)
def test_pageserver_metrics_removed_after_detach(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
"""Tests that when a tenant is detached, the tenant specific metrics are not left behind"""
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_pageserver_metrics_removed_after_detach",
)
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.init_start()
@@ -192,7 +208,11 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde
for tenant in [tenant_1, tenant_2]:
pre_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)])
assert pre_detach_samples == set(PAGESERVER_PER_TENANT_METRICS)
expected = set(PAGESERVER_PER_TENANT_METRICS)
if remote_storage_kind == RemoteStorageKind.NOOP:
# if there's no remote storage configured, we don't expose the remote timeline client metrics
expected -= set(PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS)
assert pre_detach_samples == expected
env.pageserver.http_client().tenant_detach(tenant)