Don't try to launch duplicate WAL redo thread if tenant already exists.

The codepath for tenant_create command first launched the WAL redo
thread, and then called branches::create_repo() which checked if the
tenant's directory already exists. That's problematic, because
launching the WAL redo thread will run initdb if the directory doesn't
already exist. Race condition: If the tenant already exists, it will
have a WAL redo thread already running, and the old and new WAL redo
thread might try to run initdb at the same time, causing all kinds of
weird failures.

The test_pageserver_api test was failing 100% repeatably on my laptop
because of this. I'm not sure why this doesn't occur on the CI:

    Jul 31 18:05:48.877 INFO running initdb in "./tenants/5227e4eb90894775ac6b8a8c76f24b2e/wal-redo-datadir", location: pageserver::walredo, pageserver/src/walredo.rs:483
    thread 'WAL redo thread' panicked at 'initdb failed: The files belonging to this database system will be owned by user "heikki".
    This user must also own the server process.

    The database cluster will be initialized with locale "C".
    The default database encoding has accordingly been set to "SQL_ASCII".
    The default text search configuration will be set to "english".

    Data page checksums are disabled.

    creating directory ./tenants/0305b1326f3ea33add0929d516da7cb6/wal-redo-datadir ... ok
    creating subdirectories ... ok
    selecting dynamic shared memory implementation ... posix
    selecting default max_connections ... 100
    selecting default shared_buffers ... 128MB
    selecting default time zone ... Europe/Helsinki
    creating configuration files ... ok
    running bootstrap script ...
    stderr:
    2021-07-31 15:05:48.875 GMT [282569] LOG:  could not open configuration file "/home/heikki/git-sandbox/zenith/test_output/test_tenant_list/repo/./tenants/0305b1326f3ea33add0929d516da7cb6/wal-redo-datadir/postgresql.conf": No such file or directory
    2021-07-31 15:05:48.875 GMT [282569] FATAL:  configuration file "/home/heikki/git-sandbox/zenith/test_output/test_tenant_list/repo/./tenants/0305b1326f3ea33add0929d516da7cb6/wal-redo-datadir/postgresql.conf" contains errors
    child process exited with exit code 1
    initdb: removing data directory "./tenants/0305b1326f3ea33add0929d516da7cb6/wal-redo-datadir"
This commit is contained in:
Heikki Linnakangas
2021-07-31 18:06:54 +03:00
parent bd7d811921
commit acc0f41985
3 changed files with 23 additions and 7 deletions

View File

@@ -1,12 +1,13 @@
//! This module acts as a switchboard to access different repositories managed by this
//! page server.
use crate::branches;
use crate::object_repository::ObjectRepository;
use crate::repository::Repository;
use crate::rocksdb_storage::RocksObjectStore;
use crate::walredo::PostgresRedoManager;
use crate::{PageServerConf, ZTenantId};
use anyhow::{anyhow, Result};
use anyhow::{anyhow, bail, Result};
use lazy_static::lazy_static;
use log::info;
use std::collections::HashMap;
@@ -38,6 +39,24 @@ pub fn init(conf: &'static PageServerConf) {
}
}
pub fn create_repository_for_tenant(
conf: &'static PageServerConf,
tenantid: ZTenantId,
) -> Result<()> {
let mut m = REPOSITORY.lock().unwrap();
// First check that the tenant doesn't exist already
if m.get(&tenantid).is_some() {
bail!("tenant {} already exists", tenantid);
}
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid));
let repo = branches::create_repo(conf, tenantid, wal_redo_manager)?;
m.insert(tenantid, Arc::new(repo));
Ok(())
}
pub fn insert_repository_for_tenant(tenantid: ZTenantId, repo: Arc<dyn Repository>) {
let o = &mut REPOSITORY.lock().unwrap();
o.insert(tenantid, repo);

View File

@@ -17,7 +17,6 @@ use regex::Regex;
use std::io::Write;
use std::net::TcpListener;
use std::str::FromStr;
use std::sync::Arc;
use std::thread;
use std::{io, net::TcpStream};
use zenith_utils::postgres_backend::PostgresBackend;
@@ -33,7 +32,6 @@ use crate::object_key::ObjectTag;
use crate::page_cache;
use crate::repository::{BufferTag, Modification, RelTag};
use crate::walreceiver;
use crate::walredo::PostgresRedoManager;
use crate::PageServerConf;
use crate::ZTenantId;
use crate::ZTimelineId;
@@ -508,9 +506,8 @@ impl postgres_backend::Handler for PageServerHandler {
let caps = re.captures(&query_string).ok_or_else(err)?;
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
let wal_redo_manager = Arc::new(PostgresRedoManager::new(self.conf, tenantid));
let repo = branches::create_repo(self.conf, tenantid, wal_redo_manager)?;
page_cache::insert_repository_for_tenant(tenantid, Arc::new(repo));
page_cache::create_repository_for_tenant(&self.conf, tenantid)?;
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;

View File

@@ -62,7 +62,7 @@ def test_tenant_list(pageserver: ZenithPageserver, zenith_cli):
cur = conn.cursor()
# check same tenant cannot be created twice
with pytest.raises(psycopg2.DatabaseError, match=f'repo for {pageserver.initial_tenant} already exists'):
with pytest.raises(psycopg2.DatabaseError, match=f'tenant {pageserver.initial_tenant} already exists'):
cur.execute(f'tenant_create {pageserver.initial_tenant}')
# create one more tenant