Compare commits

..

4 Commits

Author SHA1 Message Date
Christian Schwarz
434af3771d drop layer map lock before removing droppen popped frozen layer
This is a shot in the dark to fix the pageserver deadlock
described in https://github.com/neondatabase/neon/issues/3712

The ASCII art in the comment added in this commit describes a potential
deadlock that involves page cache and EphemeralFile.
So, I guess it's a useful change by itself.

But, I can't find a scenario with the current code base where
we would actually arrive in the described deadlock.

Let's see whether the deadlock from #3712 still reproduces with this
fixx. I can't repro it locally.

refs https://github.com/neondatabase/neon/issues/3712
2023-03-01 11:16:59 +01:00
sharnoff
1360361f60 Fix missing VM cgconfig.conf (#3718)
It was being added to the wrong stage in the dockerfile. This should fix
it, and resolves an ongoing issue on staging.
2023-02-28 21:11:00 -08:00
Alexander Bayandin
000eb1b069 Bump tempfile from 3.3.0 to 3.4.0 (#3709)
Update `tempfile` crate to get rid of `remove_dir_all` dependency
Ref https://github.com/neondatabase/neon/security/dependabot/15
2023-02-27 12:44:08 +00:00
Heikki Linnakangas
f51b48fa49 Fix UNLOGGED tables.
Instead of trying to create missing files on the way, send init fork contents as
main fork from pageserver during basebackup. Add test for that. Call
put_rel_drop for init forks; previously they weren't removed. Bump
vendor/postgres to revert previous approach on Postgres side.

Co-authored-by: Arseny Sher <sher-ars@yandex.ru>

ref https://github.com/neondatabase/postgres/pull/264
ref https://github.com/neondatabase/postgres/pull/259
ref https://github.com/neondatabase/neon/issues/1222
2023-02-24 23:30:02 +04:00
10 changed files with 121 additions and 33 deletions

18
Cargo.lock generated
View File

@@ -3067,15 +3067,6 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "remove_dir_all"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7"
dependencies = [
"winapi",
]
[[package]]
name = "reqwest"
version = "0.11.14"
@@ -3849,16 +3840,15 @@ dependencies = [
[[package]]
name = "tempfile"
version = "3.3.0"
version = "3.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4"
checksum = "af18f7ae1acd354b992402e9ec5864359d693cd8a79dcbef59f76891701c1e95"
dependencies = [
"cfg-if",
"fastrand",
"libc",
"redox_syscall",
"remove_dir_all",
"winapi",
"rustix",
"windows-sys 0.42.0",
]
[[package]]

View File

@@ -150,7 +150,7 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" }
criterion = "0.4"
rcgen = "0.10"
rstest = "0.16"
tempfile = "3.2"
tempfile = "3.4"
tonic-build = "0.8"
# This is only needed for proxy's tests.

View File

@@ -10,7 +10,6 @@ RUN set -e \
&& rm -f /etc/inittab \
&& touch /etc/inittab
ADD vm-cgconfig.conf /etc/cgconfig.conf
RUN set -e \
&& echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \
&& echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant --auto-restart --cgroup=neon-postgres'" >> /etc/inittab
@@ -26,6 +25,7 @@ RUN apt update && \
RUN adduser vm-informant --disabled-password --no-create-home
USER postgres
ADD vm-cgconfig.conf /etc/cgconfig.conf
COPY --from=informant /etc/inittab /etc/inittab
COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant

View File

@@ -98,6 +98,15 @@ impl RelTag {
name
}
pub fn with_forknum(&self, forknum: u8) -> Self {
RelTag {
forknum,
spcnode: self.spcnode,
dbnode: self.dbnode,
relnode: self.relnode,
}
}
}
///

View File

@@ -13,7 +13,6 @@
use anyhow::{anyhow, bail, ensure, Context};
use bytes::{BufMut, BytesMut};
use fail::fail_point;
use postgres_ffi::relfile_utils::INIT_FORKNUM;
use std::fmt::Write as FmtWrite;
use std::time::SystemTime;
use tokio::io;
@@ -34,6 +33,7 @@ use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA};
use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM};
use postgres_ffi::TransactionId;
use postgres_ffi::XLogFileName;
use postgres_ffi::PG_TLI;
@@ -191,14 +191,31 @@ where
{
self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
// Gather and send relational files in each database if full backup is requested.
for rel in self
// If full backup is requested, include all relation files.
// Otherwise only include init forks of unlogged relations.
let rels = self
.timeline
.list_rels(spcnode, dbnode, self.lsn, self.ctx)
.await?
{
if self.full_backup || rel.forknum == INIT_FORKNUM {
self.add_rel(rel).await?;
.await?;
for &rel in rels.iter() {
// Send init fork as main fork to provide well formed empty
// contents of UNLOGGED relations. Postgres copies it in
// `reinit.c` during recovery.
if rel.forknum == INIT_FORKNUM {
// I doubt we need _init fork itself, but having it at least
// serves as a marker relation is unlogged.
self.add_rel(rel, rel).await?;
self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?;
continue;
}
if self.full_backup {
if rel.forknum == MAIN_FORKNUM && rels.contains(&rel.with_forknum(INIT_FORKNUM))
{
// skip this, will include it when we reach the init fork
continue;
}
self.add_rel(rel, rel).await?;
}
}
}
@@ -221,15 +238,16 @@ where
Ok(())
}
async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
/// Add contents of relfilenode `src`, naming it as `dst`.
async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
let nblocks = self
.timeline
.get_rel_size(tag, self.lsn, false, self.ctx)
.get_rel_size(src, self.lsn, false, self.ctx)
.await?;
// If the relation is empty, create an empty file
if nblocks == 0 {
let file_name = tag.to_segfile_name(0);
let file_name = dst.to_segfile_name(0);
let header = new_tar_header(&file_name, 0)?;
self.ar.append(&header, &mut io::empty()).await?;
return Ok(());
@@ -245,12 +263,12 @@ where
for blknum in startblk..endblk {
let img = self
.timeline
.get_rel_page_at_lsn(tag, blknum, self.lsn, false, self.ctx)
.get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx)
.await?;
segment_data.extend_from_slice(&img[..]);
}
let file_name = tag.to_segfile_name(seg as u32);
let file_name = dst.to_segfile_name(seg as u32);
let header = new_tar_header(&file_name, segment_data.len() as u64)?;
self.ar.append(&header, segment_data.as_slice()).await?;

View File

@@ -2479,7 +2479,7 @@ impl Timeline {
// The new on-disk layers are now in the layer map. We can remove the
// in-memory layer from the map now.
{
{
let mut layers = self.layers.write().unwrap();
let l = layers.frozen_layers.pop_front();
@@ -2489,6 +2489,43 @@ impl Timeline {
assert!(LayerMap::compare_arced_layers(&l.unwrap(), &frozen_layer));
// release lock on 'layers'
drop(layers);
// Only drop the ephemeral file after releasing the layer map lock.
// The reason is that the drop function needs to lock page cache slots.
// If there is another thread that is already holding a slot which we
// need to lock, and that thread is waiting for the layer map lock, we
// would have a deadlock:
//
// wants ┌─────────┐
// us ────────────────►│ cache │
// ▲ │ slot │
// │ │ │ │
// assigned │ └────┼────┘
// │ │
// │ │assigned
// ┌────┼───┐ │
// │ │ │ ▼
// │ layers │◄─────────────── them
// │ │ wants
// └────────┘
//
// How can this happen? I don't know. Basically we need to walk up
// the call graph for all PageCache::try_lock_for_read and PageCache::try_lock_for_write
// and check whether any of them holds onto the guard object that these methods return.
// The block_io::BlockReader trait implementations look like good candidates.
// For example, the BlockReader::read_blk impl for EphemeralFile returns the guard object
// for the cache slot straight to the caller.
// But, there's are many callers of BlockReader::read_blk, and rust-analyzer has no way
// to find just the <EphemeralFile as BlockReader>::read_blk callers.
//
// One obvious place is InMemoryLayer::get_value_reconstruct_data , which uses a BlockReader::block_cursor.
// That cursor object holds onto the most recently read page cache slot until cursor object is dropped.
// But, all relevant uses of InMemoryLayer::get_value_reconstruct_data are in Timeline::get_reconstruct_data,
// which already holds the layer map lock in shared mode. So, it can't cause this deadlock.
//
// Maybe this is a red herring?
drop(l);
}
fail_point!("checkpoint-after-sync");

View File

@@ -37,7 +37,7 @@ use crate::walrecord::*;
use crate::ZERO_PAGE;
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment;
use postgres_ffi::v14::xlog_utils::*;
use postgres_ffi::v14::CheckPoint;
@@ -762,7 +762,7 @@ impl<'a> WalIngest<'a> {
)?;
for xnode in &parsed.xnodes {
for forknum in MAIN_FORKNUM..=VISIBILITYMAP_FORKNUM {
for forknum in MAIN_FORKNUM..=INIT_FORKNUM {
let rel = RelTag {
forknum,
spcnode: xnode.spcnode,

View File

@@ -0,0 +1,34 @@
from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn
#
# Test UNLOGGED tables/relations. Postgres copies init fork contents to main
# fork to reset them during recovery. In Neon, pageserver directly sends init
# fork contents as main fork during basebackup.
#
def test_unlogged(neon_simple_env: NeonEnv):
env = neon_simple_env
env.neon_cli.create_branch("test_unlogged", "empty")
pg = env.postgres.create_start("test_unlogged")
conn = pg.connect()
cur = conn.cursor()
cur.execute("CREATE UNLOGGED TABLE iut (id int);")
# create index to test unlogged index relation as well
cur.execute("CREATE UNIQUE INDEX iut_idx ON iut (id);")
cur.execute("INSERT INTO iut values (42);")
# create another compute to fetch inital empty contents from pageserver
fork_at_current_lsn(env, pg, "test_unlogged_basebackup", "test_unlogged")
pg2 = env.postgres.create_start(
"test_unlogged_basebackup",
)
conn2 = pg2.connect()
cur2 = conn2.cursor()
# after restart table should be empty but valid
cur2.execute("PREPARE iut_plan (int) AS INSERT INTO iut VALUES ($1)")
cur2.execute("EXECUTE iut_plan (43);")
cur2.execute("SELECT * FROM iut")
assert cur2.fetchall() == [(43,)]