neon/test_runner/regress/test_old_request_lsn.py
Alexander Bayandin 30a7dd630c ruff: enable TC — flake8-type-checking (#11368)
## Problem

`TYPE_CHECKING` is used inconsistently across Python tests.

## Summary of changes
- Update `ruff`: 0.7.0 -> 0.11.2
- Enable TC (flake8-type-checking):
https://docs.astral.sh/ruff/rules/#flake8-type-checking-tc
- (auto)fix all new issues
2025-03-30 18:58:33 +00:00
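For context, ruff's flake8-type-checking (TC) rules flag imports that are used only in type annotations; the autofix moves them under an `if TYPE_CHECKING:` guard so they are skipped at runtime. A minimal sketch of the before/after shape, using this file's own `NeonEnvBuilder` import as the example:

```python
# Before the autofix: a typing-only import is paid for at runtime.
#
#   from fixtures.neon_fixtures import NeonEnvBuilder
#
# After the autofix: the import only happens while type checking. With
# `from __future__ import annotations`, annotations are not evaluated at
# runtime, so the guarded import is enough for the signature below.
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from fixtures.neon_fixtures import NeonEnvBuilder


def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): ...
```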


from __future__ import annotations

from typing import TYPE_CHECKING

from fixtures.common_types import TimelineId
from fixtures.log_helper import log
from fixtures.utils import print_gc_result, query_scalar

if TYPE_CHECKING:
    from fixtures.neon_fixtures import NeonEnvBuilder

#
# Test where Postgres generates a lot of WAL, and it's garbage collected away, but
# no pages are evicted so that Postgres uses an old LSN in a GetPage request.
# We had a bug where the page server failed to find the page version because it
# thought it was already garbage collected away, because the LSN in the GetPage
# request was very old and the WAL from that time was indeed already removed.
# In reality, the LSN on a GetPage request coming from a primary server is
# just a hint that the page hasn't been modified since that LSN, and the page
# server should return the latest page version regardless of the LSN.
#
def test_old_request_lsn(neon_env_builder: NeonEnvBuilder):
    # Disable PITR, so that GC is actually allowed to remove old page versions
    # during this test.
    env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
    env.create_branch("test_old_request_lsn", ancestor_branch_name="main")
    endpoint = env.endpoints.create_start("test_old_request_lsn")

    pg_conn = endpoint.connect()
    cur = pg_conn.cursor()

    # Get the timeline ID of our branch. We need it for the checkpoint and GC
    # calls to the pageserver HTTP API below.
    timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))

    pageserver_http = env.pageserver.http_client()
    # Create table, and insert some rows. Make it big enough that it doesn't fit in
    # shared_buffers.
    cur.execute("CREATE TABLE foo (id int4 PRIMARY KEY, val int, t text)")
    cur.execute(
        """
        INSERT INTO foo
        SELECT g, 1, 'long string to consume some space' || g
        FROM generate_series(1, 100000) g
        """
    )

    # Verify that the table is larger than shared_buffers, so that the SELECT below
    # will cause GetPage requests.
    cur.execute(
        """
        select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
        from pg_settings where name = 'shared_buffers'
        """
    )
    row = cur.fetchone()
    assert row is not None
    log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
    assert int(row[0]) < int(row[1])
cur.execute("VACUUM foo")
# Make a lot of updates on a single row, generating a lot of WAL. Trigger
# garbage collections so that the page server will remove old page versions.
for _ in range(10):
pageserver_http.timeline_checkpoint(env.initial_tenant, timeline)
gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0)
print_gc_result(gc_result)
for _ in range(100):
cur.execute("UPDATE foo SET val = val + 1 WHERE id = 1;")
# All (or at least most of) the updates should've been on the same page, so
# that we haven't had to evict any dirty pages for a long time. Now run
# a query that sends GetPage@LSN requests with the old LSN.
cur.execute("SELECT COUNT(*), SUM(val) FROM foo")
assert cur.fetchone() == (100000, 101000)