mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-07 05:22:56 +00:00
Don't try to launch duplicate WAL redo thread if tenant already exists.
The codepath for tenant_create command first launched the WAL redo
thread, and then called branches::create_repo() which checked if the
tenant's directory already exists. That's problematic, because
launching the WAL redo thread will run initdb if the directory doesn't
already exist. Race condition: If the tenant already exists, it will
have a WAL redo thread already running, and the old and new WAL redo
thread might try to run initdb at the same time, causing all kinds of
weird failures.
The test_pageserver_api test was failing 100% repeatably on my laptop
because of this. I'm not sure why this doesn't occur on the CI:
Jul 31 18:05:48.877 INFO running initdb in "./tenants/5227e4eb90894775ac6b8a8c76f24b2e/wal-redo-datadir", location: pageserver::walredo, pageserver/src/walredo.rs:483
thread 'WAL redo thread' panicked at 'initdb failed: The files belonging to this database system will be owned by user "heikki".
This user must also own the server process.
The database cluster will be initialized with locale "C".
The default database encoding has accordingly been set to "SQL_ASCII".
The default text search configuration will be set to "english".
Data page checksums are disabled.
creating directory ./tenants/0305b1326f3ea33add0929d516da7cb6/wal-redo-datadir ... ok
creating subdirectories ... ok
selecting dynamic shared memory implementation ... posix
selecting default max_connections ... 100
selecting default shared_buffers ... 128MB
selecting default time zone ... Europe/Helsinki
creating configuration files ... ok
running bootstrap script ...
stderr:
2021-07-31 15:05:48.875 GMT [282569] LOG: could not open configuration file "/home/heikki/git-sandbox/zenith/test_output/test_tenant_list/repo/./tenants/0305b1326f3ea33add0929d516da7cb6/wal-redo-datadir/postgresql.conf": No such file or directory
2021-07-31 15:05:48.875 GMT [282569] FATAL: configuration file "/home/heikki/git-sandbox/zenith/test_output/test_tenant_list/repo/./tenants/0305b1326f3ea33add0929d516da7cb6/wal-redo-datadir/postgresql.conf" contains errors
child process exited with exit code 1
initdb: removing data directory "./tenants/0305b1326f3ea33add0929d516da7cb6/wal-redo-datadir"
This commit is contained in:
@@ -1,12 +1,13 @@
|
||||
//! This module acts as a switchboard to access different repositories managed by this
|
||||
//! page server.
|
||||
|
||||
use crate::branches;
|
||||
use crate::object_repository::ObjectRepository;
|
||||
use crate::repository::Repository;
|
||||
use crate::rocksdb_storage::RocksObjectStore;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::{PageServerConf, ZTenantId};
|
||||
use anyhow::{anyhow, Result};
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use log::info;
|
||||
use std::collections::HashMap;
|
||||
@@ -38,6 +39,24 @@ pub fn init(conf: &'static PageServerConf) {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn create_repository_for_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
) -> Result<()> {
|
||||
let mut m = REPOSITORY.lock().unwrap();
|
||||
|
||||
// First check that the tenant doesn't exist already
|
||||
if m.get(&tenantid).is_some() {
|
||||
bail!("tenant {} already exists", tenantid);
|
||||
}
|
||||
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid));
|
||||
let repo = branches::create_repo(conf, tenantid, wal_redo_manager)?;
|
||||
|
||||
m.insert(tenantid, Arc::new(repo));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn insert_repository_for_tenant(tenantid: ZTenantId, repo: Arc<dyn Repository>) {
|
||||
let o = &mut REPOSITORY.lock().unwrap();
|
||||
o.insert(tenantid, repo);
|
||||
|
||||
@@ -17,7 +17,6 @@ use regex::Regex;
|
||||
use std::io::Write;
|
||||
use std::net::TcpListener;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::{io, net::TcpStream};
|
||||
use zenith_utils::postgres_backend::PostgresBackend;
|
||||
@@ -33,7 +32,6 @@ use crate::object_key::ObjectTag;
|
||||
use crate::page_cache;
|
||||
use crate::repository::{BufferTag, Modification, RelTag};
|
||||
use crate::walreceiver;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTenantId;
|
||||
use crate::ZTimelineId;
|
||||
@@ -508,9 +506,8 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
let caps = re.captures(&query_string).ok_or_else(err)?;
|
||||
|
||||
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
|
||||
let wal_redo_manager = Arc::new(PostgresRedoManager::new(self.conf, tenantid));
|
||||
let repo = branches::create_repo(self.conf, tenantid, wal_redo_manager)?;
|
||||
page_cache::insert_repository_for_tenant(tenantid, Arc::new(repo));
|
||||
|
||||
page_cache::create_repository_for_tenant(&self.conf, tenantid)?;
|
||||
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
|
||||
@@ -62,7 +62,7 @@ def test_tenant_list(pageserver: ZenithPageserver, zenith_cli):
|
||||
cur = conn.cursor()
|
||||
|
||||
# check same tenant cannot be created twice
|
||||
with pytest.raises(psycopg2.DatabaseError, match=f'repo for {pageserver.initial_tenant} already exists'):
|
||||
with pytest.raises(psycopg2.DatabaseError, match=f'tenant {pageserver.initial_tenant} already exists'):
|
||||
cur.execute(f'tenant_create {pageserver.initial_tenant}')
|
||||
|
||||
# create one more tenant
|
||||
|
||||
Reference in New Issue
Block a user