Set the last_record_gauge to the value persisted in the metadata (#3460)

## Describe your changes
Whenever a tenant is detached or the pageserver is restarted, the
`pageserver_last_record_lsn` metric is dropped.
This fix restores the value from the persisted metadata whenever the tenant is
attached again.
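
In effect, the change re-seeds the timeline's gauge from the `disk_consistent_lsn` stored in the persisted metadata when the timeline is loaded. A minimal sketch of the idea using the `prometheus` crate (the `Lsn` and `TimelineMetadata` types here are simplified stand-ins, not the pageserver's real definitions):

```rust
use prometheus::IntGauge;

// Simplified stand-in for the pageserver's Lsn type.
struct Lsn(u64);

// Simplified stand-in for the persisted timeline metadata.
struct TimelineMetadata {
    disk_consistent_lsn: Lsn,
}

fn seed_last_record_gauge(metadata: &TimelineMetadata, gauge: &IntGauge) {
    // A freshly created gauge starts at 0. Re-seeding it from the last
    // LSN persisted to disk means the metric survives a detach/attach
    // cycle or a pageserver restart instead of resetting.
    gauge.set(metadata.disk_consistent_lsn.0 as i64);
}

fn main() {
    let gauge = IntGauge::new(
        "pageserver_last_record_lsn",
        "Last record LSN restored from persisted metadata",
    )
    .unwrap();

    let metadata = TimelineMetadata {
        disk_consistent_lsn: Lsn(0x0169_6070),
    };
    seed_last_record_gauge(&metadata, &gauge);
    assert_eq!(gauge.get(), 0x0169_6070);
}
```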
## Issue ticket number and link
[3571](https://github.com/neondatabase/cloud/issues/3571)
## Checklist before requesting a review
- [X] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? If so, did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires a public announcement, mark it with the
/release-notes label and add several sentences in this section.
Author: Shany Pozin
Date: 2023-01-29 12:40:50 +02:00
Committed by: GitHub
Parent: 4d291d0e90
Commit: 67d418e91c
2 changed files with 34 additions and 10 deletions

@@ -945,6 +945,10 @@ impl Timeline {
            };
            result.repartition_threshold = result.get_checkpoint_distance() / 10;
            result
                .metrics
                .last_record_gauge
                .set(disk_consistent_lsn.0 as i64);
            result
        })
    }
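
For context on the test below: `pageserver_last_record_lsn` is exported per tenant and timeline, which is why the test filters on those two labels. A hedged sketch of how such a labeled gauge family can be registered with the `prometheus` crate (the registration wiring in the actual pageserver may differ):

```rust
use prometheus::{IntGaugeVec, Opts, Registry};

fn main() {
    let registry = Registry::new();

    // One gauge per (tenant_id, timeline_id) pair; the Python test
    // below queries this family with exactly these two labels.
    let last_record_lsn = IntGaugeVec::new(
        Opts::new("pageserver_last_record_lsn", "Last record LSN"),
        &["tenant_id", "timeline_id"],
    )
    .unwrap();
    registry.register(Box::new(last_record_lsn.clone())).unwrap();

    last_record_lsn
        .with_label_values(&["<tenant_id>", "<timeline_id>"])
        .set(0x0169_6070);
}
```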

@@ -6,6 +6,7 @@ from threading import Thread
import asyncpg
import pytest
from fixtures.log_helper import log
from fixtures.metrics import parse_metrics
from fixtures.neon_fixtures import (
    NeonEnv,
    NeonEnvBuilder,
@@ -59,11 +60,11 @@ def test_tenant_reattach(
    # create new tenant
    tenant_id, timeline_id = env.neon_cli.create_tenant()
    pg = env.postgres.create_start("main", tenant_id=tenant_id)
    with pg.cursor() as cur:
        cur.execute("CREATE TABLE t(key int primary key, value text)")
        cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
    with env.postgres.create_start("main", tenant_id=tenant_id) as pg:
        with pg.cursor() as cur:
            cur.execute("CREATE TABLE t(key int primary key, value text)")
            cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
            current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
    # Wait for all data to be processed by the pageserver and uploaded to remote storage
    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
@@ -78,15 +79,34 @@ def test_tenant_reattach(
        ".*failed to perform remote task UploadMetadata.*, will retry.*"
    )
    ps_metrics = parse_metrics(pageserver_http.get_metrics(), "pageserver")
    tenant_metric_filter = {
        "tenant_id": str(tenant_id),
        "timeline_id": str(timeline_id),
    }
    pageserver_last_record_lsn_before_detach = int(
        ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value
    )
    pageserver_http.tenant_detach(tenant_id)
    pageserver_http.tenant_attach(tenant_id)
    with pg.cursor() as cur:
        assert query_scalar(cur, "SELECT count(*) FROM t") == 100000
    time.sleep(1)  # for metrics propagation
    # Check that we had to retry the downloads
    assert env.pageserver.log_contains(".*list prefixes.*failed, will retry.*")
    assert env.pageserver.log_contains(".*download.*failed, will retry.*")
    ps_metrics = parse_metrics(pageserver_http.get_metrics(), "pageserver")
    pageserver_last_record_lsn = int(
        ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value
    )
    assert pageserver_last_record_lsn_before_detach == pageserver_last_record_lsn
    with env.postgres.create_start("main", tenant_id=tenant_id) as pg:
        with pg.cursor() as cur:
            assert query_scalar(cur, "SELECT count(*) FROM t") == 100000
    # Check that we had to retry the downloads
    assert env.pageserver.log_contains(".*list prefixes.*failed, will retry.*")
    assert env.pageserver.log_contains(".*download.*failed, will retry.*")
    num_connections = 10