independent fix: test_pageserver_metrics_removed_after_detach didn't wait for uploads

This resulted in the metrics `pageserver_remote_timeline_client_bytes_started_total` and
`pageserver_remote_timeline_client_bytes_finished_total` being unexpectedly absent,
tripping the assert quoted below.

Not sure why this PR (#4350) exposed this problem though.
Are we detaching faster? If so, why?

AssertionError: assert {'pageserver_...s_count', ...} == {'pageserver_...s_count', ...}
  Extra items in the right set:
  'pageserver_remote_timeline_client_bytes_started_total'
  'pageserver_remote_timeline_client_bytes_finished_total'
  Full diff:
    {
     'pageserver_created_persistent_files_total',
     'pageserver_current_logical_size',
     'pageserver_evictions_total',
     'pageserver_evictions_with_low_residence_duration_total',
     'pageserver_getpage_reconstruct_seconds_bucket',
     'pageserver_getpage_reconstruct_seconds_count',
     'pageserver_getpage_reconstruct_seconds_sum',
     'pageserver_io_operations_bytes_total',
     'pageserver_io_operations_seconds_bucket',
     'pageserver_io_operations_seconds_count',
     'pageserver_io_operations_seconds_sum',
     'pageserver_last_record_lsn',
     'pageserver_materialized_cache_hits_total',
     'pageserver_remote_operation_seconds_bucket',
     'pageserver_remote_operation_seconds_count',
     'pageserver_remote_operation_seconds_sum',
     'pageserver_remote_physical_size',
  -  'pageserver_remote_timeline_client_bytes_finished_total',
  -  'pageserver_remote_timeline_client_bytes_started_total',
     'pageserver_remote_timeline_client_calls_started_bucket',
     'pageserver_remote_timeline_client_calls_started_count',
     'pageserver_remote_timeline_client_calls_started_sum',
     'pageserver_remote_timeline_client_calls_unfinished',
     'pageserver_resident_physical_size',
     'pageserver_smgr_query_seconds_bucket',
     'pageserver_smgr_query_seconds_count',
     'pageserver_smgr_query_seconds_sum',
     'pageserver_storage_operations_seconds_count_total',
     'pageserver_storage_operations_seconds_sum_total',
     'pageserver_tenant_states_count',
     'pageserver_wait_lsn_seconds_bucket',
     'pageserver_wait_lsn_seconds_count',
     'pageserver_wait_lsn_seconds_sum',
     'pageserver_written_persistent_bytes_total',
    }
This commit is contained in:
Christian Schwarz
2023-05-26 09:54:30 +02:00
parent 122e23071b
commit f2abc4c933

View File

@@ -20,6 +20,7 @@ from fixtures.neon_fixtures import (
NeonEnvBuilder,
RemoteStorageKind,
available_remote_storages,
last_flush_lsn_upload,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import wait_until
@@ -250,8 +251,12 @@ def test_pageserver_metrics_removed_after_detach(
tenant_1, _ = env.neon_cli.create_tenant()
tenant_2, _ = env.neon_cli.create_tenant()
env.neon_cli.create_timeline("test_metrics_removed_after_detach", tenant_id=tenant_1)
env.neon_cli.create_timeline("test_metrics_removed_after_detach", tenant_id=tenant_2)
tenant_1_timeline = env.neon_cli.create_timeline(
"test_metrics_removed_after_detach", tenant_id=tenant_1
)
tenant_2_timeline = env.neon_cli.create_timeline(
"test_metrics_removed_after_detach", tenant_id=tenant_2
)
endpoint_tenant1 = env.endpoints.create_start(
"test_metrics_removed_after_detach", tenant_id=tenant_1
@@ -260,13 +265,17 @@ def test_pageserver_metrics_removed_after_detach(
"test_metrics_removed_after_detach", tenant_id=tenant_2
)
for endpoint in [endpoint_tenant1, endpoint_tenant2]:
for endpoint, timeline_id in [
(endpoint_tenant1, tenant_1_timeline),
(endpoint_tenant2, tenant_2_timeline),
]:
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
cur.execute("CREATE TABLE t(key int primary key, value text)")
cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
cur.execute("SELECT sum(key) FROM t")
assert cur.fetchone() == (5000050000,)
last_flush_lsn_upload(env, endpoint, endpoint.tenant_id, timeline_id)
def get_ps_metric_samples_for_tenant(tenant_id: TenantId) -> List[Sample]:
ps_metrics = env.pageserver.http_client().get_metrics()