From 0bd16182f7b2e7abedbb218238d83928f67607bc Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 22 Apr 2024 12:47:24 +0100 Subject: [PATCH] pageserver: fix unlogged relations with sharding (#7454) ## Problem - #7451 INIT_FORKNUM blocks must be stored on shard 0 to enable including them in basebackup. This issue can be missed in simple tests because creating an unlogged table isn't sufficient -- to repro I had to create an _index_ on an unlogged table (then restart the endpoint). Closes: #7451 ## Summary of changes - Add a reproducer for the issue. - Tweak the condition for `key_is_shard0` to include anything that isn't a normal relation block _and_ any normal relation block whose forknum is INIT_FORKNUM. - To enable existing databases to recover from the issue, add a special case that omits relations from the basebackup if their INIT_FORKNUM was stored on the wrong shard. This enables postgres to start and the user to drop the table and recreate it. --- libs/pageserver_api/src/shard.rs | 27 +++++++++++++++++- pageserver/src/basebackup.rs | 17 +++++++++-- test_runner/regress/test_sharding.py | 42 ++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 3 deletions(-) diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index c293ad705b..6a8a5cc8f3 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -5,6 +5,7 @@ use crate::{ models::ShardParameters, }; use hex::FromHex; +use postgres_ffi::relfile_utils::INIT_FORKNUM; use serde::{Deserialize, Serialize}; use utils::id::TenantId; @@ -537,6 +538,24 @@ impl ShardIdentity { } } + /// Special case for issue `<https://github.com/neondatabase/neon/issues/7451>` + /// + /// When we fail to read an INIT_FORKNUM block, this function tells us whether we may ignore the error + /// as a symptom of that issue. 
+ pub fn is_key_buggy_forknum(&self, key: &Key) -> bool { + if !is_rel_block_key(key) || key.field5 != INIT_FORKNUM { + return false; + } + + let mut hash = murmurhash32(key.field4); + hash = hash_combine(hash, murmurhash32(key.field6 / self.stripe_size.0)); + let mapped_shard = ShardNumber((hash % self.count.0 as u32) as u8); + + // The key may be affected by issue #7451: it is an initfork and it would not + // have mapped to shard 0 until we fixed that issue. + mapped_shard != ShardNumber(0) + } + /// Return true if the key should be discarded if found in this shard's /// data store, e.g. during compaction after a split. /// @@ -649,7 +668,13 @@ fn key_is_shard0(key: &Key) -> bool { // relation pages are distributed to shards other than shard zero. Everything else gets // stored on shard 0. This guarantees that shard 0 can independently serve basebackup // requests, and any request other than those for particular blocks in relations. - !is_rel_block_key(key) + // + // The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table + // type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0 + // because they must be included in basebackups. 
+ let is_initfork = key.field5 == INIT_FORKNUM; + + !is_rel_block_key(key) || is_initfork } /// Provide the same result as the function in postgres `hashfn.h` with the same name diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 0479d05f8f..107758f385 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -13,7 +13,7 @@ use anyhow::{anyhow, bail, ensure, Context}; use bytes::{BufMut, Bytes, BytesMut}; use fail::fail_point; -use pageserver_api::key::{key_to_slru_block, Key}; +use pageserver_api::key::{key_to_slru_block, rel_block_to_key, Key}; use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; use std::time::SystemTime; @@ -297,7 +297,20 @@ where if rel.forknum == INIT_FORKNUM { // I doubt we need _init fork itself, but having it at least // serves as a marker relation is unlogged. - self.add_rel(rel, rel).await?; + if let Err(_e) = self.add_rel(rel, rel).await { + if self + .timeline + .get_shard_identity() + .is_key_buggy_forknum(&rel_block_to_key(rel, 0x0)) + { + // Workaround https://github.com/neondatabase/neon/issues/7451 -- if we have an unlogged relation + // whose INIT_FORKNUM is not correctly on shard zero, then omit it in the basebackup. This allows + // postgres to start up. The relation won't work, but it will be possible to DROP TABLE on it and + // recreate. 
+ tracing::warn!("Omitting relation {rel} for issue #7451: drop and recreate this unlogged relation"); + continue; + } + }; self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?; continue; } diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index bfaab9125f..101d2620b0 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1201,3 +1201,45 @@ def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder): max_lsn = max(Lsn(info["last_record_lsn"]) for info in infos) diff = max_lsn - min_lsn assert diff < 2 * 1024 * 1024, f"LSN diff={diff}, expected diff < 2MB due to backpressure" + + +def test_sharding_unlogged_relation(neon_env_builder: NeonEnvBuilder): + """ + Check that an unlogged relation is handled properly on a sharded tenant + + Reproducer for https://github.com/neondatabase/neon/issues/7451 + """ + + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + neon_env_builder.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant(tenant_id, timeline_id, shard_count=8) + + # We will create many tables to ensure it's overwhelmingly likely that at least one + # of them doesn't land on shard 0 + table_names = [f"my_unlogged_{i}" for i in range(0, 16)] + + with env.endpoints.create_start("main", tenant_id=tenant_id) as ep: + for table_name in table_names: + ep.safe_psql(f"CREATE UNLOGGED TABLE {table_name} (id integer, value varchar(64));") + ep.safe_psql(f"INSERT INTO {table_name} VALUES (1, 'foo')") + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [(1, "foo")] + ep.safe_psql(f"CREATE INDEX ON {table_name} USING btree (value);") + + wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) + + with env.endpoints.create_start("main", tenant_id=tenant_id) as ep: + for table_name in table_names: + # Check that table works: we can select and insert + result = ep.safe_psql(f"SELECT * 
from {table_name};") + assert result == [] + ep.safe_psql(f"INSERT INTO {table_name} VALUES (2, 'bar');") + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [(2, "bar")] + + # Ensure that post-endpoint-restart modifications are ingested happily by pageserver + wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)