From f2abc4c93331c57c433976b633a1085a4bdc0f52 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 26 May 2023 09:54:30 +0200 Subject: [PATCH] independent fix: test_pageserver_metrics_removed_after_detach didn't wait for uploads This resulted in the metrics `pageserver_remote_timeline_client_bytes_started_total` and `pageserver_remote_timeline_client_bytes_finished_total` being unexpectedly absent, tripping the assert quoted below. Not sure why this PR (#4350) exposed this problem though. Are we detaching faster? If so, why? AssertionError: assert {'pageserver_...s_count', ...} == {'pageserver_...s_count', ...} Extra items in the right set: 'pageserver_remote_timeline_client_bytes_started_total' 'pageserver_remote_timeline_client_bytes_finished_total' Full diff: { 'pageserver_created_persistent_files_total', 'pageserver_current_logical_size', 'pageserver_evictions_total', 'pageserver_evictions_with_low_residence_duration_total', 'pageserver_getpage_reconstruct_seconds_bucket', 'pageserver_getpage_reconstruct_seconds_count', 'pageserver_getpage_reconstruct_seconds_sum', 'pageserver_io_operations_bytes_total', 'pageserver_io_operations_seconds_bucket', 'pageserver_io_operations_seconds_count', 'pageserver_io_operations_seconds_sum', 'pageserver_last_record_lsn', 'pageserver_materialized_cache_hits_total', 'pageserver_remote_operation_seconds_bucket', 'pageserver_remote_operation_seconds_count', 'pageserver_remote_operation_seconds_sum', 'pageserver_remote_physical_size', - 'pageserver_remote_timeline_client_bytes_finished_total', - 'pageserver_remote_timeline_client_bytes_started_total', 'pageserver_remote_timeline_client_calls_started_bucket', 'pageserver_remote_timeline_client_calls_started_count', 'pageserver_remote_timeline_client_calls_started_sum', 'pageserver_remote_timeline_client_calls_unfinished', 'pageserver_resident_physical_size', 'pageserver_smgr_query_seconds_bucket', 'pageserver_smgr_query_seconds_count', 'pageserver_smgr_query_seconds_sum', 'pageserver_storage_operations_seconds_count_total', 
'pageserver_storage_operations_seconds_sum_total', 'pageserver_tenant_states_count', 'pageserver_wait_lsn_seconds_bucket', 'pageserver_wait_lsn_seconds_count', 'pageserver_wait_lsn_seconds_sum', 'pageserver_written_persistent_bytes_total', } --- test_runner/regress/test_tenants.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 07da6a8145..ed127924da 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -20,6 +20,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, RemoteStorageKind, available_remote_storages, + last_flush_lsn_upload, ) from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until @@ -250,8 +251,12 @@ def test_pageserver_metrics_removed_after_detach( tenant_1, _ = env.neon_cli.create_tenant() tenant_2, _ = env.neon_cli.create_tenant() - env.neon_cli.create_timeline("test_metrics_removed_after_detach", tenant_id=tenant_1) - env.neon_cli.create_timeline("test_metrics_removed_after_detach", tenant_id=tenant_2) + tenant_1_timeline = env.neon_cli.create_timeline( + "test_metrics_removed_after_detach", tenant_id=tenant_1 + ) + tenant_2_timeline = env.neon_cli.create_timeline( + "test_metrics_removed_after_detach", tenant_id=tenant_2 + ) endpoint_tenant1 = env.endpoints.create_start( "test_metrics_removed_after_detach", tenant_id=tenant_1 @@ -260,13 +265,17 @@ def test_pageserver_metrics_removed_after_detach( "test_metrics_removed_after_detach", tenant_id=tenant_2 ) - for endpoint in [endpoint_tenant1, endpoint_tenant2]: + for endpoint, timeline_id in [ + (endpoint_tenant1, tenant_1_timeline), + (endpoint_tenant2, tenant_2_timeline), + ]: with closing(endpoint.connect()) as conn: with conn.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key, value text)") cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") cur.execute("SELECT sum(key) 
FROM t") assert cur.fetchone() == (5000050000,) + last_flush_lsn_upload(env, endpoint, endpoint.tenant_id, timeline_id) def get_ps_metric_samples_for_tenant(tenant_id: TenantId) -> List[Sample]: ps_metrics = env.pageserver.http_client().get_metrics()