Grab safekeeper.lock on the whole directory instead of per tli.

closes #976
This commit is contained in:
Arseny Sher
2021-12-13 20:07:08 +03:00
parent e0d41ac6a3
commit 8f0cafd508
2 changed files with 18 additions and 25 deletions

View File

@@ -1,10 +1,12 @@
//
// Main entry point for the safekeeper executable
//
use anyhow::Result;
use anyhow::{Context, Result};
use clap::{App, Arg};
use const_format::formatcp;
use daemonize::Daemonize;
use fs2::FileExt;
use std::fs::File;
use std::path::{Path, PathBuf};
use std::thread;
use tracing::*;
@@ -21,6 +23,8 @@ use walkeeper::SafeKeeperConf;
use zenith_utils::shutdown::exit_now;
use zenith_utils::signals;
const LOCK_FILE_NAME: &str = "safekeeper.lock";
fn main() -> Result<()> {
zenith_metrics::set_common_metrics_prefix("safekeeper");
let arg_matches = App::new("Zenith safekeeper")
@@ -123,6 +127,16 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
info!("version: {}", GIT_VERSION);
// Prevent running multiple safekeepers on the same directory
let lock_file_path = conf.workdir.join(LOCK_FILE_NAME);
let lock_file = File::create(&lock_file_path).with_context(|| "failed to open lockfile")?;
lock_file.try_lock_exclusive().with_context(|| {
format!(
"control file {} is locked by some other process",
lock_file_path.display()
)
})?;
let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| {
error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
e

View File

@@ -1,9 +1,8 @@
//! This module contains timeline id -> safekeeper state map with file-backed
//! persistence and support for interaction between sending and receiving wal.
use anyhow::{anyhow, bail, ensure, Context, Result};
use anyhow::{bail, ensure, Context, Result};
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use fs2::FileExt;
use lazy_static::lazy_static;
use postgres_ffi::xlog_utils::{find_end_of_wal, PG_TLI};
use std::cmp::{max, min};
@@ -33,8 +32,6 @@ use std::convert::TryInto;
const CONTROL_FILE_NAME: &str = "safekeeper.control";
// needed to atomically update the state using `rename`
const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial";
// dedicated lockfile to prevent running several safekeepers on the same data
const LOCK_FILE_NAME: &str = "safekeeper.lock";
const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
pub const CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
@@ -120,9 +117,7 @@ impl SharedState {
pos
}
/// Restore SharedState from control file. Locks the control file along the
/// way to prevent running more than one instance of safekeeper on the same
/// data dir.
/// Restore SharedState from control file.
/// If create=false and file doesn't exist, bails out.
fn create_restore(
conf: &SafeKeeperConf,
@@ -326,8 +321,6 @@ impl GlobalTimelines {
#[derive(Debug)]
struct FileStorage {
// file used to prevent concurrent safekeepers running on the same data
lock_file: File,
// save timeline dir to avoid reconstructing it every time
timeline_dir: PathBuf,
conf: SafeKeeperConf,
@@ -365,26 +358,13 @@ impl FileStorage {
let timeline_dir = conf.timeline_dir(&timelineid);
let control_file_path = timeline_dir.join(CONTROL_FILE_NAME);
let lock_file_path = timeline_dir.join(LOCK_FILE_NAME);
info!(
"loading control file {}, create={:?} lock file {:?}",
"loading control file {}, create={:?}",
control_file_path.display(),
create,
lock_file_path.display(),
);
let lock_file = File::create(&lock_file_path).with_context(|| "failed to open lockfile")?;
// Lock file to prevent two or more active safekeepers
lock_file.try_lock_exclusive().map_err(|e| {
anyhow!(
"control file {:?} is locked by some other process: {}",
&control_file_path,
e
)
})?;
let mut control_file = OpenOptions::new()
.read(true)
.write(true)
@@ -432,7 +412,6 @@ impl FileStorage {
Ok((
FileStorage {
lock_file,
timeline_dir,
conf: conf.clone(),
persist_control_file_seconds: PERSIST_CONTROL_FILE_SECONDS