neon/test_runner/regress/test_relations.py
Alex Chi Z. 5c57e8a11b feat(pageserver): rework reldirv2 rollout (#12576)
## Problem

LKB-197, #9516 

We want to make sure the reldirv2 migration path is smooth.

The previous plan was to store new relations in the new keyspace and old ones
in the old keyspace until they get dropped. That made the migration hard: we
couldn't validate v2 writes and couldn't roll back. This patch gives us a
smoother migration path:

- The first time we enable reldirv2 for a tenant, we copy everything from
the old keyspace to the new one. This might create a short latency spike for
the create-relation operation, but it's a one-off.
- After that, the v1 and v2 keyspaces are identical, and we read from and
write to both of them. We validate reads every time we list the reldirs.
- If we are in `migrating` mode, we use v1 as the source of truth and log a
warning for failed v2 operations. If we are in `migrated` mode, we use v2
as the source of truth and raise an error when writes fail (a sketch of this
policy follows the list).
- One compatibility test uses a dataset from the time when we enabled
reldirv2 under the original rollout plan, which only has relations
written to the v2 keyspace instead of the v1 keyspace. We had to adjust
it accordingly.
- Add `migrated_at` to index_part to record the LSN at which we performed
the initialization.
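
As a rough illustration of the mode-dependent read/validate policy above: the
sketch below is not the actual pageserver code (which is in Rust); all names
(`RelDirMode`, `read_v1`, `read_v2`, `list_reldirs`) are hypothetical.

```python
# Minimal sketch of the mode-dependent reldir read policy described above.
# All identifiers here are illustrative, not actual pageserver names.
import logging
from enum import Enum
from typing import Callable


class RelDirMode(Enum):
    LEGACY = "legacy"        # v2 not enabled; only the v1 keyspace exists
    MIGRATING = "migrating"  # dual read/write; v1 is the source of truth
    MIGRATED = "migrated"    # dual read/write; v2 is the source of truth


def list_reldirs(
    mode: RelDirMode,
    read_v1: Callable[[], set[str]],
    read_v2: Callable[[], set[str]],
) -> set[str]:
    if mode is RelDirMode.LEGACY:
        return read_v1()
    # Read both keyspaces and validate that they agree.
    v1, v2 = read_v1(), read_v2()
    if v1 != v2:
        if mode is RelDirMode.MIGRATING:
            # v1 stays authoritative; a v2 mismatch is only logged.
            logging.warning("reldirv2 validation failed: v1=%s v2=%s", v1, v2)
            return v1
        # In migrated mode, v2 is authoritative and a mismatch is an error.
        raise RuntimeError(f"reldirv2 validation failed: v1={v1} v2={v2}")
    return v2
```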

TODOs:

- Test whether relv1 can be read below the `migrated_at` LSN (a hedged sketch follows this list).
- Move the initialization process to L0 compaction instead of doing it
on the write path.
- Disable the relcache in the relv2 test case so that all code paths get
fully tested.
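
For the first TODO, one possible shape of the check is sketched below. It
assumes the same fixtures as the test in this file, and a branch-at-LSN helper;
the `env.create_branch(..., ancestor_start_lsn=...)` call and its signature are
assumptions about the test harness, not verified API.

```python
# Hedged sketch of the first TODO: verify that relv1 metadata still serves
# reads below the migrated_at LSN. Assumes the imports used by the test
# below (NeonEnvBuilder, wait_for_last_flush_lsn); the branch-at-LSN helper
# is an assumption about the fixture API.
def test_reldir_v1_readable_below_migrated_at(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start(initial_tenant_conf={"rel_size_v2_enabled": "false"})
    endpoint = env.endpoints.create_start("main")
    endpoint.safe_psql("CREATE TABLE pre_migration (id INTEGER PRIMARY KEY)")
    # Capture an LSN that will end up strictly below migrated_at.
    pre_lsn = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
    # Enable reldirv2 and create a relation to trigger the one-off v1 -> v2 copy.
    env.pageserver.http_client().update_tenant_config(
        env.initial_tenant, {"rel_size_v2_enabled": True}
    )
    endpoint.safe_psql("CREATE TABLE post_migration (id INTEGER PRIMARY KEY)")
    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
    # Read below migrated_at: branch at the pre-migration LSN and expect the
    # v1 keyspace to serve the reldir listing there.
    env.create_branch("below_migrated_at", ancestor_start_lsn=pre_lsn)  # assumed helper
    ro = env.endpoints.create_start("below_migrated_at")
    ro.safe_psql("SELECT * FROM pre_migration")
```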

## Summary of changes

- New behavior of reldirv2 migration flags as described above.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-07-23 16:12:46 +00:00

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from fixtures.neon_fixtures import (
        NeonEnvBuilder,
    )

from fixtures.neon_fixtures import wait_for_last_flush_lsn


def test_pageserver_reldir_v2(
    neon_env_builder: NeonEnvBuilder,
):
    env = neon_env_builder.init_start(
        initial_tenant_conf={
            "rel_size_v2_enabled": "false",
        }
    )
    endpoint = env.endpoints.create_start("main")

    # Create relations in v1
    endpoint.safe_psql("CREATE TABLE foo1 (id INTEGER PRIMARY KEY, val text)")
    endpoint.safe_psql("CREATE TABLE foo2 (id INTEGER PRIMARY KEY, val text)")

    assert (
        env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[
            "rel_size_migration"
        ]
        == "legacy"
    )

    # Ensure the pageserver has ingested the table-creation SQL before the migration. In theory,
    # we could also do a "wait_flush_lsn" here, but it's easier to just restart.
    env.pageserver.restart()

    # Switch to v2
    env.pageserver.http_client().update_tenant_config(
        env.initial_tenant,
        {
            "rel_size_v2_enabled": True,
        },
    )

    # The initialization happens on the write path, so the state is still
    # "legacy" right after flipping the flag.
    assert (
        env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[
            "rel_size_migration"
        ]
        == "legacy"
    )

    # Check if both relations are still accessible
    endpoint.safe_psql("SELECT * FROM foo1")
    endpoint.safe_psql("SELECT * FROM foo2")

    # Restart the endpoint
    endpoint.stop()
    endpoint.start()

    # Check if both relations are still accessible after the restart
    endpoint.safe_psql("SELECT * FROM foo1")
    endpoint.safe_psql("SELECT * FROM foo2")

    # Create relations in v2
    endpoint.safe_psql("CREATE TABLE foo3 (id INTEGER PRIMARY KEY, val text)")
    endpoint.safe_psql("CREATE TABLE foo4 (id INTEGER PRIMARY KEY, val text)")
    # Delete a relation that was created in v1
    endpoint.safe_psql("DROP TABLE foo1")

    # Wait for the pageserver to catch up to the latest flush LSN
    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)

    # Check if the remaining relations are still accessible
    endpoint.safe_psql("SELECT * FROM foo2")
    endpoint.safe_psql("SELECT * FROM foo3")
    endpoint.safe_psql("SELECT * FROM foo4")

    # Restart the endpoint
    endpoint.stop()
    # This will acquire a basebackup, which lists all relations.
    endpoint.start()

    # Check if the remaining relations are still accessible after the restart
    endpoint.safe_psql("DROP TABLE IF EXISTS foo1")
    endpoint.safe_psql("SELECT * FROM foo2")
    endpoint.safe_psql("SELECT * FROM foo3")
    endpoint.safe_psql("SELECT * FROM foo4")
    endpoint.safe_psql("DROP TABLE foo3")

    # Wait for the pageserver to catch up to the latest flush LSN
    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)

    # Restart the endpoint again
    endpoint.stop()
    endpoint.start()

    # Check if the remaining relations are still accessible
    endpoint.safe_psql("DROP TABLE IF EXISTS foo1")
    endpoint.safe_psql("SELECT * FROM foo2")
    endpoint.safe_psql("DROP TABLE IF EXISTS foo3")
    endpoint.safe_psql("SELECT * FROM foo4")

    # Set the config to false to emulate the case where the config is not
    # persisted when the tenant gets detached/attached.
    env.pageserver.http_client().update_tenant_config(
        env.initial_tenant,
        {
            "rel_size_v2_enabled": False,
        },
    )
    endpoint.stop()
    endpoint.start()

    # Check if the remaining relations are still accessible
    endpoint.safe_psql("SELECT * FROM foo2")
    endpoint.safe_psql("SELECT * FROM foo4")

    env.pageserver.restart()

    # The migration state is persisted in index_part, so it survives both the
    # pageserver restart and the config flag being flipped back to false.
    assert (
        env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[
            "rel_size_migration"
        ]
        == "migrating"
    )
    assert (
        env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[
            "rel_size_migrated_at"
        ]
        is not None
    )