test(pageserver): use reldirv2 by default in regress tests (#11081)

## Problem

For the pg_regress-style tests, we run both v1 and v2; for all other tests, we default to v2.

Part of https://github.com/neondatabase/neon/issues/9516

## Summary of changes

Use reldir v2 across test cases by default.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Alex Chi Z. authored on 2025-03-05 16:02:44 -05:00, committed by GitHub
parent 1fe23fe8d2 · commit 2de3629b88
5 changed files with 48 additions and 16 deletions


@@ -1964,14 +1964,12 @@ impl DatadirModification<'_> {
.context("deserialize db")?
};
// Add the new relation to the rel directory entry, and write it back
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
return Err(RelationError::AlreadyExists);
}
let v2_enabled = self.maybe_enable_rel_size_v2()?;
if v2_enabled {
if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) {
return Err(RelationError::AlreadyExists);
}
let sparse_rel_dir_key =
rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
// check if the rel_dir_key exists in v2
@@ -2006,6 +2004,10 @@ impl DatadirModification<'_> {
self.pending_directory_entries
.push((DirectoryKind::RelV2, MetricsUpdate::Add(1)));
} else {
// Add the new relation to the rel directory entry, and write it back
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
return Err(RelationError::AlreadyExists);
}
if !dbdir_exists {
self.pending_directory_entries
.push((DirectoryKind::Rel, MetricsUpdate::Set(0)))
@@ -2019,6 +2021,7 @@ impl DatadirModification<'_> {
)),
);
}
// Put size
let size_key = rel_size_to_key(rel);
let buf = nblocks.to_le_bytes();
@@ -2141,7 +2144,7 @@ impl DatadirModification<'_> {
// Remove entry from relation size cache
self.tline.remove_cached_rel_size(&rel_tag);
// Delete size entry, as well as all blocks
// Delete size entry, as well as all blocks; this is currently a no-op because we haven't implemented tombstones in storage.
self.delete(rel_key_range(rel_tag));
}
}
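In the v2 path above, a relation is recorded under its own sparse key, so a create no longer rewrites the whole serialized relation directory; the old directory blob is only consulted for duplicate detection (`contains`), and only the v1 else-branch still inserts into it and writes it back. A rough Python model of the two layouts, illustrative only and not the pageserver's actual key encoding:

```python
# v1: one key per (spcnode, dbnode); its value is the whole serialized set of
# relations, so every create/drop rewrites the full blob.
rel_dir_v1: dict[tuple[int, int], set[tuple[int, int]]] = {}

# v2: one small sparse key per relation; a create or drop touches a single key.
rel_dir_v2: dict[tuple[int, int, int, int], bool] = {}


def put_rel_creation_v1(spc: int, db: int, relnode: int, forknum: int) -> None:
    rels = rel_dir_v1.setdefault((spc, db), set())
    if (relnode, forknum) in rels:
        raise RuntimeError("AlreadyExists")
    rels.add((relnode, forknum))
    # ...and the entire set is re-serialized and written back under one key.


def put_rel_creation_v2(spc: int, db: int, relnode: int, forknum: int) -> None:
    key = (spc, db, relnode, forknum)
    # A duplicate may exist either as a sparse v2 key or inside the old v1 blob
    # (relations created before the tenant switched to v2).
    if rel_dir_v2.get(key) or (relnode, forknum) in rel_dir_v1.get((spc, db), set()):
        raise RuntimeError("AlreadyExists")
    rel_dir_v2[key] = True  # only this one key is written
```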


@@ -2451,6 +2451,7 @@ impl Tenant {
create_guard,
initdb_lsn,
None,
None,
)
.await
}
@@ -2782,6 +2783,7 @@ impl Tenant {
timeline_create_guard,
initdb_lsn,
None,
None,
)
.await
}
@@ -4869,6 +4871,7 @@ impl Tenant {
timeline_create_guard,
start_lsn + 1,
Some(Arc::clone(src_timeline)),
Some(src_timeline.get_rel_size_v2_status()),
)
.await?;
@@ -5142,6 +5145,7 @@ impl Tenant {
timeline_create_guard,
pgdata_lsn,
None,
None,
)
.await?;
@@ -5220,13 +5224,14 @@ impl Tenant {
create_guard: TimelineCreateGuard,
start_lsn: Lsn,
ancestor: Option<Arc<Timeline>>,
rel_size_v2_status: Option<RelSizeMigration>,
) -> anyhow::Result<UninitializedTimeline<'a>> {
let tenant_shard_id = self.tenant_shard_id;
let resources = self.build_timeline_resources(new_timeline_id);
resources
.remote_client
.init_upload_queue_for_empty_remote(new_metadata)?;
.init_upload_queue_for_empty_remote(new_metadata, rel_size_v2_status.clone())?;
let timeline_struct = self
.create_timeline_struct(
@@ -5238,7 +5243,7 @@ impl Tenant {
CreateTimelineCause::Load,
create_guard.idempotency.clone(),
None,
None,
rel_size_v2_status,
)
.context("Failed to create timeline data structure")?;


@@ -437,9 +437,13 @@ impl RemoteTimelineClient {
/// Initialize the upload queue for the case where the remote storage is empty,
/// i.e., it doesn't have an `IndexPart`.
///
/// `rel_size_v2_status` needs to be carried over during branching, and that's why
/// it's passed in here.
pub fn init_upload_queue_for_empty_remote(
&self,
local_metadata: &TimelineMetadata,
rel_size_v2_status: Option<RelSizeMigration>,
) -> anyhow::Result<()> {
// Set the maximum number of inprogress tasks to the remote storage concurrency. There's
// certainly no point in starting more upload tasks than this.
@@ -449,7 +453,9 @@ impl RemoteTimelineClient {
.as_ref()
.map_or(0, |r| r.concurrency_limit());
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
let initialized_queue =
upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
initialized_queue.dirty.rel_size_migration = rel_size_v2_status;
self.update_remote_physical_size_gauge(None);
info!("initialized upload queue as empty");
Ok(())
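Since the status is written into the `dirty` copy of the queue, the very first `index_part.json` uploaded for the new timeline already records it. Roughly, for a timeline branched from a migrated parent (the field name comes from the code above; the surrounding layout and the concrete value are simplified assumptions):

```python
# Simplified, assumed view of the first uploaded index for a branched timeline.
index_part = {
    # ...all other index fields omitted...
    "rel_size_migration": "migrated",  # inherited from the ancestor instead of starting unset
}
```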


@@ -1197,6 +1197,9 @@ class NeonEnv:
config.pageserver_default_tenant_config_compaction_algorithm
)
tenant_config = ps_cfg.setdefault("tenant_config", {})
tenant_config["rel_size_v2_enabled"] = True # Enable relsize_v2 by default in tests
if self.pageserver_remote_storage is not None:
ps_cfg["remote_storage"] = remote_storage_to_toml_dict(
self.pageserver_remote_storage
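With this default in place, every test tenant runs with the v2 relation directory unless a test opts out. A minimal sketch of a per-test override (the test name is made up; the per-tenant config takes precedence over the pageserver-wide default set above):

```python
from fixtures.neon_fixtures import NeonEnvBuilder


def test_something_that_needs_legacy_reldir(neon_env_builder: NeonEnvBuilder):
    # Override the new default for this tenant only: stay on the v1 layout.
    env = neon_env_builder.init_start(
        initial_tenant_conf={"rel_size_v2_enabled": "false"}
    )
    # ...the rest of the test exercises the tenant as usual.
    assert env.initial_tenant is not None
```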


@@ -5,7 +5,7 @@ from __future__ import annotations
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import TYPE_CHECKING, cast
from typing import TYPE_CHECKING, Any, cast
import pytest
from fixtures.log_helper import log
@@ -118,10 +118,20 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End
pageserver.http_client().timeline_gc(shard, env.initial_timeline, None)
def patch_tenant_conf(tenant_conf: dict[str, Any], reldir_type: str) -> dict[str, Any]:
tenant_conf = tenant_conf.copy()
if reldir_type == "v2":
tenant_conf["rel_size_v2_enabled"] = "true"
else:
tenant_conf["rel_size_v2_enabled"] = "false"
return tenant_conf
# Run the main PostgreSQL regression tests, in src/test/regress.
#
@pytest.mark.timeout(3000) # Contains many sub-tests, is slow in debug builds
@pytest.mark.parametrize("shard_count", [None, 4])
@pytest.mark.parametrize("reldir_type", ["v1", "v2"])
def test_pg_regress(
neon_env_builder: NeonEnvBuilder,
test_output_dir: Path,
@@ -130,6 +140,7 @@ def test_pg_regress(
base_dir: Path,
pg_distrib_dir: Path,
shard_count: int | None,
reldir_type: str,
):
DBNAME = "regression"
@@ -142,7 +153,7 @@ def test_pg_regress(
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
env = neon_env_builder.init_start(
initial_tenant_conf=TENANT_CONF,
initial_tenant_conf=patch_tenant_conf(TENANT_CONF, reldir_type),
initial_tenant_shard_count=shard_count,
)
@@ -196,6 +207,7 @@ def test_pg_regress(
#
@pytest.mark.timeout(1500) # Contains many sub-tests, is slow in debug builds
@pytest.mark.parametrize("shard_count", [None, 4])
@pytest.mark.parametrize("reldir_type", ["v1", "v2"])
def test_isolation(
neon_env_builder: NeonEnvBuilder,
test_output_dir: Path,
@@ -204,6 +216,7 @@ def test_isolation(
base_dir: Path,
pg_distrib_dir: Path,
shard_count: int | None,
reldir_type: str,
):
DBNAME = "isolation_regression"
@@ -211,7 +224,8 @@ def test_isolation(
neon_env_builder.num_pageservers = shard_count
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
env = neon_env_builder.init_start(
initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count
initial_tenant_conf=patch_tenant_conf(TENANT_CONF, reldir_type),
initial_tenant_shard_count=shard_count,
)
# Connect to postgres and create a database called "regression".
@@ -267,6 +281,7 @@ def test_isolation(
# Run extra Neon-specific pg_regress-based tests. The tests and their
# schedule file are in the sql_regress/ directory.
@pytest.mark.parametrize("shard_count", [None, 4])
@pytest.mark.parametrize("reldir_type", ["v1", "v2"])
def test_sql_regress(
neon_env_builder: NeonEnvBuilder,
test_output_dir: Path,
@@ -275,6 +290,7 @@ def test_sql_regress(
base_dir: Path,
pg_distrib_dir: Path,
shard_count: int | None,
reldir_type: str,
):
DBNAME = "regression"
@@ -282,7 +298,8 @@ def test_sql_regress(
neon_env_builder.num_pageservers = shard_count
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
env = neon_env_builder.init_start(
initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count
initial_tenant_conf=patch_tenant_conf(TENANT_CONF, reldir_type),
initial_tenant_shard_count=shard_count,
)
# Connect to postgres and create a database called "regression".
@@ -345,9 +362,7 @@ def test_tx_abort_with_many_relations(
"""
env = neon_env_builder.init_start(
initial_tenant_conf={
"rel_size_v2_enabled": "true" if reldir_type == "v2" else "false",
}
initial_tenant_conf=patch_tenant_conf({}, reldir_type),
)
ep = env.endpoints.create_start(
"main",