pageserver: fix unlogged relations with sharding (#7454)

## Problem

- #7451 

INIT_FORKNUM blocks must be stored on shard 0 to enable including them
in basebackup.

This issue can be missed in simple tests because creating an unlogged
table isn't sufficient -- to repro I had to create an _index_ on an
unlogged table (then restart the endpoint).

Closes: #7451 

## Summary of changes

- Add a reproducer for the issue.
- Tweak the condition for `key_is_shard0` to include anything that isn't
a normal relation block _and_ any normal relation block whose forknum is
INIT_FORKNUM.
- To enable existing databases to recover from the issue, add a special
case that omits relations if they were stored on the wrong INITFORK.
This enables postgres to start and the user to drop the table and
recreate it.
This commit is contained in:
John Spray
2024-04-22 12:47:24 +01:00
committed by GitHub
parent 6a5650d40c
commit 0bd16182f7
3 changed files with 83 additions and 3 deletions

View File

@@ -5,6 +5,7 @@ use crate::{
models::ShardParameters,
};
use hex::FromHex;
use postgres_ffi::relfile_utils::INIT_FORKNUM;
use serde::{Deserialize, Serialize};
use utils::id::TenantId;
@@ -537,6 +538,24 @@ impl ShardIdentity {
}
}
/// Special case for issue `<https://github.com/neondatabase/neon/issues/7451>`
///
/// When we fail to read a forknum block, this function tells us whether we may ignore the error
/// as a symptom of that issue.
pub fn is_key_buggy_forknum(&self, key: &Key) -> bool {
if !is_rel_block_key(key) || key.field5 != INIT_FORKNUM {
return false;
}
let mut hash = murmurhash32(key.field4);
hash = hash_combine(hash, murmurhash32(key.field6 / self.stripe_size.0));
let mapped_shard = ShardNumber((hash % self.count.0 as u32) as u8);
// The key may be affected by issue #7454: it is an initfork and it would not
// have mapped to shard 0 until we fixed that issue.
mapped_shard != ShardNumber(0)
}
/// Return true if the key should be discarded if found in this shard's
/// data store, e.g. during compaction after a split.
///
@@ -649,7 +668,13 @@ fn key_is_shard0(key: &Key) -> bool {
// relation pages are distributed to shards other than shard zero. Everything else gets
// stored on shard 0. This guarantees that shard 0 can independently serve basebackup
// requests, and any request other than those for particular blocks in relations.
!is_rel_block_key(key)
//
// The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table
// type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0
// because they must be included in basebackups.
let is_initfork = key.field5 == INIT_FORKNUM;
!is_rel_block_key(key) || is_initfork
}
/// Provide the same result as the function in postgres `hashfn.h` with the same name

View File

@@ -13,7 +13,7 @@
use anyhow::{anyhow, bail, ensure, Context};
use bytes::{BufMut, Bytes, BytesMut};
use fail::fail_point;
use pageserver_api::key::{key_to_slru_block, Key};
use pageserver_api::key::{key_to_slru_block, rel_block_to_key, Key};
use postgres_ffi::pg_constants;
use std::fmt::Write as FmtWrite;
use std::time::SystemTime;
@@ -297,7 +297,20 @@ where
if rel.forknum == INIT_FORKNUM {
// I doubt we need _init fork itself, but having it at least
// serves as a marker relation is unlogged.
self.add_rel(rel, rel).await?;
if let Err(_e) = self.add_rel(rel, rel).await {
if self
.timeline
.get_shard_identity()
.is_key_buggy_forknum(&rel_block_to_key(rel, 0x0))
{
// Workaround https://github.com/neondatabase/neon/issues/7451 -- if we have an unlogged relation
// whose INIT_FORKNUM is not correctly on shard zero, then omit it in the basebackup. This allows
// postgres to start up. The relation won't work, but it will be possible to DROP TABLE on it and
// recreate.
tracing::warn!("Omitting relation {rel} for issue #7451: drop and recreate this unlogged relation");
continue;
}
};
self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?;
continue;
}

View File

@@ -1201,3 +1201,45 @@ def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder):
max_lsn = max(Lsn(info["last_record_lsn"]) for info in infos)
diff = max_lsn - min_lsn
assert diff < 2 * 1024 * 1024, f"LSN diff={diff}, expected diff < 2MB due to backpressure"
def test_sharding_unlogged_relation(neon_env_builder: NeonEnvBuilder):
"""
Check that an unlogged relation is handled properly on a sharded tenant
Reproducer for https://github.com/neondatabase/neon/issues/7451
"""
neon_env_builder.num_pageservers = 2
env = neon_env_builder.init_configs()
neon_env_builder.start()
tenant_id = TenantId.generate()
timeline_id = TimelineId.generate()
env.neon_cli.create_tenant(tenant_id, timeline_id, shard_count=8)
# We will create many tables to ensure it's overwhelmingly likely that at least one
# of them doesn't land on shard 0
table_names = [f"my_unlogged_{i}" for i in range(0, 16)]
with env.endpoints.create_start("main", tenant_id=tenant_id) as ep:
for table_name in table_names:
ep.safe_psql(f"CREATE UNLOGGED TABLE {table_name} (id integer, value varchar(64));")
ep.safe_psql(f"INSERT INTO {table_name} VALUES (1, 'foo')")
result = ep.safe_psql(f"SELECT * from {table_name};")
assert result == [(1, "foo")]
ep.safe_psql(f"CREATE INDEX ON {table_name} USING btree (value);")
wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)
with env.endpoints.create_start("main", tenant_id=tenant_id) as ep:
for table_name in table_names:
# Check that table works: we can select and insert
result = ep.safe_psql(f"SELECT * from {table_name};")
assert result == []
ep.safe_psql(f"INSERT INTO {table_name} VALUES (2, 'bar');")
result = ep.safe_psql(f"SELECT * from {table_name};")
assert result == [(2, "bar")]
# Ensure that post-endpoint-restart modifications are ingested happily by pageserver
wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)