from __future__ import annotations

import re
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import UTC, datetime, timedelta

import pytest
from fixtures.common_types import Lsn
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn
from fixtures.pageserver.http import PageserverApiException
from fixtures.utils import query_scalar, wait_until
from requests.exceptions import ReadTimeout


def assert_lsn_lease_granted(result, with_lease: bool):
    """
    Asserts that an LSN lease is granted when the `with_lease` flag is on.
    Asserts that no LSN lease is granted when the `with_lease` flag is off.
    """
    if with_lease:
        assert result.get("valid_until")
    else:
        assert result.get("valid_until") is None


@pytest.mark.parametrize("with_lease", [True, False])
def test_lsn_mapping(neon_env_builder: NeonEnvBuilder, with_lease: bool):
    """
    Test the pageserver get_lsn_by_timestamp API.

    :param with_lease: Whether to get a lease associated with the returned LSN.
    """
    env = neon_env_builder.init_start()

    tenant_id, _ = env.create_tenant(
        conf={
            # disable default GC and compaction
            "gc_period": "1000 m",
            "compaction_period": "0 s",
            "gc_horizon": f"{1024 ** 2}",
            "checkpoint_distance": f"{1024 ** 2}",
            "compaction_target_size": f"{1024 ** 2}",
        }
    )
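    # Note: the conf above effectively disables GC and compaction (gc_period
    # 1000m, compaction_period 0s), so the history probed by the
    # timestamp->LSN queries below cannot be garbage-collected mid-test.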

    timeline_id = env.create_branch("test_lsn_mapping", tenant_id=tenant_id)
    endpoint_main = env.endpoints.create_start("test_lsn_mapping", tenant_id=tenant_id)
    timeline_id = endpoint_main.safe_psql("show neon.timeline_id")[0][0]

    cur = endpoint_main.connect().cursor()

    # Obtain an LSN before all write operations on this branch
    start_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_lsn()"))

    # Create a table and insert rows, each in a separate transaction.
    # Disable `synchronous_commit` to make this initialization go faster.
    # XXX: on my laptop this test takes 7s, and setting `synchronous_commit=off`
    # doesn't change anything.
    #
    # Each row is tagged with the timestamp at which it was inserted.
    cur.execute("SET synchronous_commit=off")
    cur.execute("CREATE TABLE foo (x integer)")
    tbl = []
    for i in range(1000):
        cur.execute("INSERT INTO foo VALUES(%s)", (i,))
        # Get the timestamp at UTC
        after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=None)
        tbl.append([i, after_timestamp])

    # Execute one more transaction with synchronous_commit enabled, to flush
    # all the previous transactions
    cur.execute("SET synchronous_commit=on")
    cur.execute("INSERT INTO foo VALUES (-1)")

    # Wait until the WAL is received by the pageserver
    last_flush_lsn = wait_for_last_flush_lsn(env, endpoint_main, tenant_id, timeline_id)
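    # last_flush_lsn is also used at the end of the test as the lower bound
    # for the child branch's "past" result.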

    with env.pageserver.http_client() as client:
        # Check edge cases
        # Timestamp is in the future
        probe_timestamp = tbl[-1][1] + timedelta(hours=1)
        result = client.timeline_get_lsn_by_timestamp(
            tenant_id, timeline_id, probe_timestamp, with_lease=with_lease
        )
        assert result["kind"] == "future"
        assert_lsn_lease_granted(result, with_lease)
        # make sure that we return a well advanced lsn here
        assert Lsn(result["lsn"]) > start_lsn

        # Timestamp is in the unreachable past
        probe_timestamp = tbl[0][1] - timedelta(hours=10)
        result = client.timeline_get_lsn_by_timestamp(
            tenant_id, timeline_id, probe_timestamp, with_lease=with_lease
        )
        assert result["kind"] == "past"
        assert_lsn_lease_granted(result, with_lease)

        # make sure that we return the minimum lsn here at the start of the range
        assert Lsn(result["lsn"]) < start_lsn

        # Probe a bunch of timestamps in the valid range
        for i in range(1, len(tbl), 100):
            probe_timestamp = tbl[i][1]
            result = client.timeline_get_lsn_by_timestamp(
                tenant_id, timeline_id, probe_timestamp, with_lease=with_lease
            )
            assert result["kind"] not in ["past", "nodata"]
            assert_lsn_lease_granted(result, with_lease)
            lsn = result["lsn"]
            # Launch a new read-only node at the LSN returned by
            # get_lsn_by_timestamp, and check that only the rows that were
            # supposed to be committed at that point in time are visible.
            endpoint_here = env.endpoints.create_start(
                branch_name="test_lsn_mapping",
                endpoint_id="ep-lsn_mapping_read",
                lsn=lsn,
                tenant_id=tenant_id,
            )
            assert endpoint_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i

            endpoint_here.stop_and_destroy()

        # Do the "past" check again on a new branch, to ensure that we don't
        # return something before the branch cutoff
        timeline_id_child = env.create_branch(
            "test_lsn_mapping_child", ancestor_branch_name="test_lsn_mapping", tenant_id=tenant_id
        )

        # Timestamp is in the unreachable past
        probe_timestamp = tbl[0][1] - timedelta(hours=10)
        result = client.timeline_get_lsn_by_timestamp(
            tenant_id, timeline_id_child, probe_timestamp, with_lease=with_lease
        )
        assert result["kind"] == "past"
        assert_lsn_lease_granted(result, with_lease)
        # make sure that we return the minimum lsn here at the start of the range
        assert Lsn(result["lsn"]) >= last_flush_lsn
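        # The child branch starts at (at least) last_flush_lsn, so a "past"
        # probe on it must clamp to the branch point rather than reaching
        # into the parent's earlier history.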


def test_get_lsn_by_timestamp_cancelled(neon_env_builder: NeonEnvBuilder):
    """
    Test that a cancelled pageserver get_lsn_by_timestamp request is correctly handled.
    Added as an effort to improve error handling and avoid a full anyhow backtrace.
    """
    env = neon_env_builder.init_start()
    env.pageserver.allowed_errors.extend(
        [
            ".*request was dropped before completing.*",
            ".*Cancelled request finished with an error: Cancelled",
        ]
    )

    client = env.pageserver.http_client()
    failpoint = "find-lsn-for-timestamp-pausable"
    client.configure_failpoints((failpoint, "pause"))

    with ThreadPoolExecutor(max_workers=1) as exec:
        # Request get_lsn_by_timestamp, hit the pausable failpoint
        failing = exec.submit(
            client.timeline_get_lsn_by_timestamp,
            env.initial_tenant,
            env.initial_timeline,
            datetime.now(),
            timeout=2,
        )

        _, offset = wait_until(
            lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}")
        )
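
        # With the request paused at the failpoint, the 2-second client
        # timeout expires first, so the client gives up on the request.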
        with pytest.raises(ReadTimeout):
            failing.result()

        client.configure_failpoints((failpoint, "off"))
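        # Unblocking the failpoint lets the (now client-abandoned) request
        # run to completion; the pageserver should then log the cancellation
        # error allow-listed above.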

        _, offset = wait_until(
            lambda: env.pageserver.assert_log_contains(
                "Cancelled request finished with an error: Cancelled$", offset
            ),
        )


# Test pageserver get_timestamp_of_lsn API
def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder):
    key_not_found_error = r".*could not find data for key.*"

    env = neon_env_builder.init_start()

    new_timeline_id = env.create_branch("test_ts_of_lsn_api")
    endpoint_main = env.endpoints.create_start("test_ts_of_lsn_api")

    cur = endpoint_main.connect().cursor()
    # Create a table and insert rows, each in a separate transaction.
    # Keep synchronous commit enabled, as this test is timing-sensitive.
    #
    # Each row records the current insert LSN and the timestamp at which
    # the row was inserted.
    cur.execute("SET synchronous_commit=on")
    cur.execute("CREATE TABLE foo (x integer)")
    tbl = []
    for i in range(1000):
        cur.execute("INSERT INTO foo VALUES(%s)", (i,))
        # Get the timestamp at UTC
        after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=UTC)
        after_lsn = query_scalar(cur, "SELECT pg_current_wal_lsn()")
        tbl.append([i, after_timestamp, after_lsn])
        time.sleep(0.02)
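        # The short sleep keeps consecutive rows' timestamps clearly apart,
        # so the ordering assertions at the end of the test are meaningful.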

    # Execute one more transaction with synchronous_commit enabled, to flush
    # all the previous transactions
    cur.execute("SET synchronous_commit=on")
    cur.execute("INSERT INTO foo VALUES (-1)")

    # Wait until the WAL is received by the pageserver
    last_flush_lsn = wait_for_last_flush_lsn(
        env, endpoint_main, env.initial_tenant, new_timeline_id
    )

    with env.pageserver.http_client() as client:
        # Check edge cases: lsn larger than the last flush lsn
        probe_lsn = Lsn(int(last_flush_lsn) * 20 + 80_000)
        result = client.timeline_get_timestamp_of_lsn(
            env.initial_tenant,
            new_timeline_id,
            probe_lsn,
        )
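        # No assertion on the result here: the request merely has to succeed
        # without raising an error.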

        # lsn of zero
        try:
            probe_lsn = Lsn(0)
            result = client.timeline_get_timestamp_of_lsn(
                env.initial_tenant,
                new_timeline_id,
                probe_lsn,
            )
            # There should always be an error here.
            raise RuntimeError("there should have been an 'Invalid LSN' error")
        except PageserverApiException as error:
            assert error.status_code == 500
            assert str(error) == "Invalid LSN"
            env.pageserver.allowed_errors.append(".*Invalid LSN.*")

        # small lsn before initdb_lsn
        try:
            probe_lsn = Lsn(64)
            result = client.timeline_get_timestamp_of_lsn(
                env.initial_tenant,
                new_timeline_id,
                probe_lsn,
            )
            # There should always be an error here.
            raise RuntimeError("there should have been a 'could not find data for key' error")
        except PageserverApiException as error:
            assert error.status_code == 500
            assert re.match(key_not_found_error, str(error))
            env.pageserver.allowed_errors.append(key_not_found_error)

        # Probe a bunch of LSNs in the valid range
        step_size = 100
        for i in range(step_size, len(tbl), step_size):
            after_timestamp = tbl[i][1]
            after_lsn = tbl[i][2]
            result = client.timeline_get_timestamp_of_lsn(
                env.initial_tenant,
                new_timeline_id,
                after_lsn,
            )
            log.info("result: %s, after_ts: %s", result, after_timestamp)

            timestamp = datetime.fromisoformat(result).replace(tzinfo=UTC)
            assert timestamp < after_timestamp, "timestamp should precede after_timestamp"
            if i > 1:
                before_timestamp = tbl[i - step_size][1]
                assert timestamp >= before_timestamp, (
                    "timestamp should not precede before_timestamp"
                )