## Problem

LKB-197, #9516

We want to make sure the migration path is smooth. The previous plan was to store new relations in the new keyspace and old ones in the old keyspace until the old keyspace gets dropped. This makes the migration path hard, as we can't validate v2 writes and can't roll back. This patch gives us a smoother migration path:

- The first time we enable reldirv2 for a tenant, we copy everything in the old keyspace over to the new one. This might create a short latency spike for the create-relation operation, but it's a one-off cost.
- After that, we have identical v1/v2 keyspaces and read/write both of them. We validate reads every time we list the reldirs (see the sketch after this description).
- If we are in `migrating` mode, use v1 as the source of truth and log a warning for failed v2 operations. If we are in `migrated` mode, use v2 as the source of truth and error out when writes fail.
- One compatibility test uses a dataset from the time when we enabled reldirv2 (under the original rollout plan), which only has relations written to the v2 keyspace instead of the v1 keyspace. We had to adjust it accordingly.
- Add `migrated_at` to index_part to indicate the LSN at which we did the initialization.

TODOs:

- Test that relv1 can be read below the migrated_at LSN.
- Move the initialization process to L0 compaction instead of doing it on the write path.
- Disable the relcache in the relv2 test case so that all code paths get fully tested.

## Summary of changes

- New behavior of the reldirv2 migration flags as described above.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
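To make the dual-keyspace behavior concrete, here is a minimal Python sketch of the validating read path described above, assuming that listing either keyspace yields a comparable set of reldir entries. The names (`MigrationMode`, `list_reldirs`, `list_v1`, `list_v2`) are hypothetical; the real logic lives in the pageserver, not in this test code.

```python
from __future__ import annotations

import logging
from enum import Enum


class MigrationMode(Enum):
    LEGACY = "legacy"        # only the v1 keyspace is used
    MIGRATING = "migrating"  # v1 is the source of truth, v2 is validated
    MIGRATED = "migrated"    # v2 is the source of truth


def list_reldirs(mode: MigrationMode, list_v1, list_v2) -> set[str]:
    """Hypothetical sketch: list reldirs while cross-checking the two keyspaces."""
    if mode is MigrationMode.LEGACY:
        return list_v1()

    v1, v2 = list_v1(), list_v2()
    if v1 != v2:
        if mode is MigrationMode.MIGRATING:
            # v1 wins; a diverged or failed v2 read is only logged.
            logging.warning("reldir v1/v2 mismatch: %s vs %s", sorted(v1), sorted(v2))
            return v1
        # In migrated mode, v2 is authoritative and a mismatch is an error.
        raise RuntimeError(f"reldir v1/v2 mismatch: {sorted(v1)} vs {sorted(v2)}")
    return v2
```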
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from fixtures.neon_fixtures import (
        NeonEnvBuilder,
    )

from fixtures.neon_fixtures import wait_for_last_flush_lsn


def test_pageserver_reldir_v2(
    neon_env_builder: NeonEnvBuilder,
):
    env = neon_env_builder.init_start(
        initial_tenant_conf={
            "rel_size_v2_enabled": "false",
        }
    )

    endpoint = env.endpoints.create_start("main")
    # Create relations in v1
    endpoint.safe_psql("CREATE TABLE foo1 (id INTEGER PRIMARY KEY, val text)")
    endpoint.safe_psql("CREATE TABLE foo2 (id INTEGER PRIMARY KEY, val text)")

    assert (
        env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[
            "rel_size_migration"
        ]
        == "legacy"
    )

    # Ensure the pageserver has ingested the table-creation SQL before the migration. In theory,
    # we could also do a "wait_flush_lsn" here, but it's easier to just restart.
    env.pageserver.restart()

    # Switch to v2
    env.pageserver.http_client().update_tenant_config(
        env.initial_tenant,
        {
            "rel_size_v2_enabled": True,
        },
    )

    assert (
        env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[
            "rel_size_migration"
        ]
        == "legacy"
    )
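    # Note: the status is expected to stay "legacy" at this point. Per the migration plan above,
    # the one-off copy of the v1 keyspace into v2 presumably only runs on the next write after
    # the flag is enabled, so the timeline has not switched to "migrating" yet.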
    # Check if both relations are still accessible
    endpoint.safe_psql("SELECT * FROM foo1")
    endpoint.safe_psql("SELECT * FROM foo2")

    # Restart the endpoint
    endpoint.stop()
    endpoint.start()

    # Check if both relations are still accessible after the restart
    endpoint.safe_psql("SELECT * FROM foo1")
    endpoint.safe_psql("SELECT * FROM foo2")

    # Create relations in v2
    endpoint.safe_psql("CREATE TABLE foo3 (id INTEGER PRIMARY KEY, val text)")
    endpoint.safe_psql("CREATE TABLE foo4 (id INTEGER PRIMARY KEY, val text)")
    # Delete a relation that was created in v1
    endpoint.safe_psql("DROP TABLE foo1")
    # Wait for the pageserver to apply the LSN
    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)

    # Check if the remaining relations are still accessible
    endpoint.safe_psql("SELECT * FROM foo2")
    endpoint.safe_psql("SELECT * FROM foo3")
    endpoint.safe_psql("SELECT * FROM foo4")

    # Restart the endpoint
    endpoint.stop()
    # This will acquire a basebackup, which lists all relations.
    endpoint.start()

    # Check that foo1 stays dropped and the other relations are still accessible after the restart
    endpoint.safe_psql("DROP TABLE IF EXISTS foo1")
    endpoint.safe_psql("SELECT * FROM foo2")
    endpoint.safe_psql("SELECT * FROM foo3")
    endpoint.safe_psql("SELECT * FROM foo4")
    endpoint.safe_psql("DROP TABLE foo3")
    # Wait for the pageserver to apply the LSN
    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)

    # Restart the endpoint again
    endpoint.stop()
    endpoint.start()

    # Check that the dropped relations stay dropped and the others are still accessible
    endpoint.safe_psql("DROP TABLE IF EXISTS foo1")
    endpoint.safe_psql("SELECT * FROM foo2")
    endpoint.safe_psql("DROP TABLE IF EXISTS foo3")
    endpoint.safe_psql("SELECT * FROM foo4")

    # Set the config to false to emulate the case where the config is not persisted when the
    # tenant gets detached/attached.
    env.pageserver.http_client().update_tenant_config(
        env.initial_tenant,
        {
            "rel_size_v2_enabled": False,
        },
    )

    endpoint.stop()
    endpoint.start()

    # Check if the remaining relations are still accessible
    endpoint.safe_psql("SELECT * FROM foo2")
    endpoint.safe_psql("SELECT * FROM foo4")

    env.pageserver.restart()
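    # Even though rel_size_v2_enabled was flipped back to false and the pageserver restarted,
    # the migration state persisted in index_part should survive: the timeline is expected to
    # stay in "migrating" mode and to remember the LSN at which the v1 -> v2 copy happened.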
    assert (
        env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[
            "rel_size_migration"
        ]
        == "migrating"
    )

    assert (
        env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[
            "rel_size_migrated_at"
        ]
        is not None
    )