Mirror of https://github.com/neondatabase/neon.git (synced 2026-02-03 10:40:37 +00:00)

Compare commits: walredo-co...temp-flaky (7 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 8a44796905 | |
| | ed521e05e7 | |
| | 6a13500da4 | |
| | 9b8168ebde | |
| | f9bb4dbf08 | |
| | 20ee204c27 | |
| | 3fdd85bcb8 | |
@@ -233,7 +233,7 @@ jobs:
exit 1
fi
if << parameters.run_in_parallel >>; then
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
EXTRA_PARAMS="-n16 $EXTRA_PARAMS"
fi;
# Run the tests.
#
@@ -245,7 +245,7 @@ jobs:
# multiple tests in one file)
# -rA prints summary in the end
# -n4 uses four processes to run tests via pytest-xdist
pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short -s --verbose -rA $TEST_SELECTION $EXTRA_PARAMS
pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short --verbose -rA $TEST_SELECTION $EXTRA_PARAMS
- run:
# CircleCI artifacts are preserved one file at a time, so skipping
# this step isn't a good idea. If you want to extract the
Cargo.lock (generated): 3 changed lines

@@ -1204,7 +1204,6 @@ dependencies = [
"daemonize",
"futures",
"hex",
"hex-literal",
"humantime",
"hyper",
"lazy_static",
@@ -2578,7 +2577,6 @@ version = "0.1.0"
dependencies = [
"lazy_static",
"libc",
"once_cell",
"prometheus",
]

@@ -2608,7 +2606,6 @@ dependencies = [
"slog-scope",
"slog-stdlog",
"slog-term",
"tempfile",
"thiserror",
"tokio",
"webpki",
@@ -25,7 +25,7 @@ Pageserver consists of:
On Ubuntu or Debian this set of packages should be sufficient to build the code:
```text
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
libssl-dev clang pkg-config libpq-dev
libssl-dev clang
```

[Rust] 1.52 or later is also required.
@@ -108,13 +108,6 @@ postgres=# insert into t values(2,2);
INSERT 0 1
```

6. If you want to run tests afterwards (see below), you have to stop pageserver and all postgres instances you have just started:
```sh
> ./target/debug/zenith pg stop migration_check
> ./target/debug/zenith pg stop main
> ./target/debug/zenith stop
```

## Running tests

```sh
@@ -452,7 +452,9 @@ impl PostgresNode {
.output()
.expect("failed to execute whoami");

assert!(output.status.success(), "whoami failed");
if !output.status.success() {
panic!("whoami failed");
}

String::from_utf8(output.stdout).unwrap().trim().to_string()
}
@@ -10,5 +10,5 @@
- [pageserver/README](/pageserver/README) — pageserver overview.
- [postgres_ffi/README](/postgres_ffi/README) — Postgres FFI overview.
- [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview.
- [walkeeper/README](/walkeeper/README) — WAL service overview.
- [walkeeper/README](/walkeeper/README.md) — WAL service overview.
- [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core
@@ -40,6 +40,3 @@ postgres_ffi = { path = "../postgres_ffi" }
zenith_metrics = { path = "../zenith_metrics" }
zenith_utils = { path = "../zenith_utils" }
workspace_hack = { path = "../workspace_hack" }

[dev-dependencies]
hex-literal = "0.3"
@@ -7,9 +7,8 @@ The Page Server has a few different duties:
- Replay WAL that's applicable to the chunks that the Page Server maintains
- Backup to S3

S3 is the main fault-tolerant storage of all data, as there are no Page Server
replicas. We use a separate fault-tolerant WAL service to reduce latency. It
keeps track of WAL records which are not synced to S3 yet.


The Page Server consists of multiple threads that operate on a shared
repository of page versions:
@@ -20,12 +20,8 @@ use daemonize::Daemonize;

use pageserver::{
branches,
defaults::{
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR,
DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS,
},
http, page_service, relish_storage, tenant_mgr, PageServerConf, RelishStorageConfig,
RelishStorageKind, S3Config, LOG_FILE_NAME,
defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR},
http, page_service, tenant_mgr, PageServerConf, RelishStorageConfig, S3Config, LOG_FILE_NAME,
};
use zenith_utils::http::endpoint;

@@ -45,7 +41,6 @@ struct CfgFileParams {
auth_type: Option<String>,
// see https://github.com/alexcrichton/toml-rs/blob/6c162e6562c3e432bf04c82a3d1d789d80761a86/examples/enum_external.rs for enum deserialisation examples
relish_storage: Option<RelishStorage>,
relish_storage_max_concurrent_sync: Option<String>,
}

#[derive(Serialize, Deserialize, Clone)]
@@ -96,7 +91,6 @@ impl CfgFileParams {
auth_validation_public_key_path: get_arg("auth-validation-public-key-path"),
auth_type: get_arg("auth-type"),
relish_storage,
relish_storage_max_concurrent_sync: get_arg("relish-storage-max-concurrent-sync"),
}
}

@@ -116,9 +110,6 @@ impl CfgFileParams {
.or(other.auth_validation_public_key_path),
auth_type: self.auth_type.or(other.auth_type),
relish_storage: self.relish_storage.or(other.relish_storage),
relish_storage_max_concurrent_sync: self
.relish_storage_max_concurrent_sync
.or(other.relish_storage_max_concurrent_sync),
}
}

@@ -187,34 +178,25 @@ impl CfgFileParams {
);
}

let max_concurrent_sync = match self.relish_storage_max_concurrent_sync.as_deref() {
Some(relish_storage_max_concurrent_sync) => {
relish_storage_max_concurrent_sync.parse()?
}
None => DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS,
};
let relish_storage_config = self.relish_storage.as_ref().map(|storage_params| {
let storage = match storage_params.clone() {
RelishStorage::Local { local_path } => {
RelishStorageKind::LocalFs(PathBuf::from(local_path))
}
RelishStorage::AwsS3 {
bucket_name,
bucket_region,
access_key_id,
secret_access_key,
} => RelishStorageKind::AwsS3(S3Config {
bucket_name,
bucket_region,
access_key_id,
secret_access_key,
}),
};
RelishStorageConfig {
max_concurrent_sync,
storage,
}
});
let relish_storage_config =
self.relish_storage
.as_ref()
.map(|storage_params| match storage_params.clone() {
RelishStorage::Local { local_path } => {
RelishStorageConfig::LocalFs(PathBuf::from(local_path))
}
RelishStorage::AwsS3 {
bucket_name,
bucket_region,
access_key_id,
secret_access_key,
} => RelishStorageConfig::AwsS3(S3Config {
bucket_name,
bucket_region,
access_key_id,
secret_access_key,
}),
});

Ok(PageServerConf {
daemonize: false,
@@ -240,7 +222,6 @@ impl CfgFileParams {
}

fn main() -> Result<()> {
zenith_metrics::set_common_metrics_prefix("pageserver");
let arg_matches = App::new("Zenith page server")
.about("Materializes WAL stream to pages and serves them to the postgres")
.arg(
@@ -364,12 +345,6 @@ fn main() -> Result<()> {
.takes_value(true)
.help("Credentials to access the AWS S3 bucket"),
)
.arg(
Arg::with_name("relish-storage-max-concurrent-sync")
.long("relish-storage-max-concurrent-sync")
.takes_value(true)
.help("Maximum allowed concurrent synchronisations with storage"),
)
.get_matches();

let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith"));
@@ -480,20 +455,16 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {

match daemonize.start() {
Ok(_) => info!("Success, daemonized"),
Err(e) => error!("could not daemonize: {:#}", e),
Err(e) => error!("Error, {}", e),
}
}

// keep join handles for spawned threads
// don't spawn threads before daemonizing
let mut join_handles = Vec::new();

if let Some(handle) = relish_storage::run_storage_sync_thread(conf)? {
join_handles.push(handle);
}
// Initialize tenant manager.
tenant_mgr::init(conf);

// keep join handles for spawned threads
let mut join_handles = vec![];

// initialize authentication for incoming connections
let auth = match &conf.auth_type {
AuthType::Trust | AuthType::MD5 => None,
@@ -17,7 +17,6 @@ use std::{
use zenith_utils::zid::{ZTenantId, ZTimelineId};

use log::*;
use zenith_utils::crashsafe_dir;
use zenith_utils::logging;
use zenith_utils::lsn::Lsn;

@@ -119,7 +118,7 @@ pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str
println!("initializing tenantid {}", tenantid);
create_repo(conf, tenantid, dummy_redo_mgr).with_context(|| "failed to create repo")?;
}
crashsafe_dir::create_dir_all(conf.tenants_path())?;
fs::create_dir_all(conf.tenants_path())?;

println!("pageserver init succeeded");
Ok(())
@@ -136,12 +135,12 @@ pub fn create_repo(
}

// top-level dir may exist if we are creating it through CLI
crashsafe_dir::create_dir_all(&repo_dir)
fs::create_dir_all(&repo_dir)
.with_context(|| format!("could not create directory {}", repo_dir.display()))?;

crashsafe_dir::create_dir(conf.timelines_path(&tenantid))?;
crashsafe_dir::create_dir_all(conf.branches_path(&tenantid))?;
crashsafe_dir::create_dir_all(conf.tags_path(&tenantid))?;
fs::create_dir(conf.timelines_path(&tenantid))?;
fs::create_dir_all(conf.branches_path(&tenantid))?;
fs::create_dir_all(conf.tags_path(&tenantid))?;

info!("created directory structure in {}", repo_dir.display());

@@ -151,13 +150,12 @@ pub fn create_repo(
conf,
wal_redo_manager,
tenantid,
false,
));

// Load data into pageserver
// TODO To implement zenith import we need to
// move data loading out of create_repo()
bootstrap_timeline(conf, tenantid, tli, repo.as_ref())?;
bootstrap_timeline(conf, tenantid, tli, &*repo)?;

Ok(repo)
}
@@ -223,11 +221,7 @@ fn bootstrap_timeline(
// Import the contents of the data directory at the initial checkpoint
// LSN, and any WAL after that.
let timeline = repo.create_empty_timeline(tli)?;
restore_local_repo::import_timeline_from_postgres_datadir(
&pgdata_path,
timeline.as_ref(),
lsn,
)?;
restore_local_repo::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?;
timeline.checkpoint()?;

println!(
@@ -22,8 +22,7 @@ use serde::{Deserialize, Serialize};
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::collections::{BTreeSet, HashSet};
use std::convert::TryInto;
use std::fs::{File, OpenOptions};
use std::fs::File;
use std::io::Write;
use std::ops::Bound::Included;
use std::path::{Path, PathBuf};
@@ -34,9 +33,8 @@ use std::{fs, thread};

use crate::layered_repository::inmemory_layer::FreezeLayers;
use crate::relish::*;
use crate::relish_storage::schedule_timeline_upload;
use crate::relish_storage::storage_uploader::QueueBasedRelishUploader;
use crate::repository::{GcResult, Repository, Timeline, WALRecord};
use crate::walreceiver::IS_WAL_RECEIVER;
use crate::walredo::WalRedoManager;
use crate::PageServerConf;
use crate::{ZTenantId, ZTimelineId};
@@ -46,7 +44,6 @@ use zenith_metrics::{
};
use zenith_metrics::{register_histogram_vec, HistogramVec};
use zenith_utils::bin_ser::BeSer;
use zenith_utils::crashsafe_dir;
use zenith_utils::lsn::{AtomicLsn, Lsn, RecordLsn};
use zenith_utils::seqwait::SeqWait;

@@ -75,11 +72,6 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
// Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call.
static TIMEOUT: Duration = Duration::from_secs(60);

// Taken from PG_CONTROL_MAX_SAFE_SIZE
const METADATA_MAX_SAFE_SIZE: usize = 512;
const METADATA_CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
const METADATA_MAX_DATA_SIZE: usize = METADATA_MAX_SAFE_SIZE - METADATA_CHECKSUM_SIZE;

// Metrics collected on operations on the storage repository.
lazy_static! {
static ref STORAGE_TIME: HistogramVec = register_histogram_vec!(
@@ -119,9 +111,7 @@ pub struct LayeredRepository {
timelines: Mutex<HashMap<ZTimelineId, Arc<LayeredTimeline>>>,

walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
/// Makes every repo's timelines back up their files to remote storage,
/// when they get frozen.
upload_relishes: bool,
relish_uploader: Option<Arc<QueueBasedRelishUploader>>,
}

/// Public interface
@@ -136,7 +126,7 @@ impl Repository for LayeredRepository {
let mut timelines = self.timelines.lock().unwrap();

// Create the timeline directory, and write initial metadata to file.
crashsafe_dir::create_dir_all(self.conf.timeline_path(&timelineid, &self.tenantid))?;
std::fs::create_dir_all(self.conf.timeline_path(&timelineid, &self.tenantid))?;

let metadata = TimelineMetadata {
disk_consistent_lsn: Lsn(0),
@@ -144,7 +134,7 @@ impl Repository for LayeredRepository {
ancestor_timeline: None,
ancestor_lsn: Lsn(0),
};
Self::save_metadata(self.conf, timelineid, self.tenantid, &metadata, true)?;
Self::save_metadata(self.conf, timelineid, self.tenantid, &metadata)?;

let timeline = LayeredTimeline::new(
self.conf,
@@ -153,8 +143,8 @@ impl Repository for LayeredRepository {
timelineid,
self.tenantid,
Arc::clone(&self.walredo_mgr),
self.relish_uploader.as_ref().map(Arc::clone),
0,
false,
)?;

let timeline_rc = Arc::new(timeline);
@@ -188,8 +178,8 @@ impl Repository for LayeredRepository {
ancestor_timeline: Some(src),
ancestor_lsn: start_lsn,
};
crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenantid))?;
Self::save_metadata(self.conf, dst, self.tenantid, &metadata, true)?;
std::fs::create_dir_all(self.conf.timeline_path(&dst, &self.tenantid))?;
Self::save_metadata(self.conf, dst, self.tenantid, &metadata)?;

info!("branched timeline {} from {} at {}", dst, src, start_lsn);

@@ -226,7 +216,6 @@ impl LayeredRepository {
Some(timeline) => Ok(timeline.clone()),
None => {
let metadata = Self::load_metadata(self.conf, timelineid, self.tenantid)?;
let disk_consistent_lsn = metadata.disk_consistent_lsn;

// Recurse to look up the ancestor timeline.
//
@@ -246,12 +235,12 @@ impl LayeredRepository {
timelineid,
self.tenantid,
Arc::clone(&self.walredo_mgr),
0, // init with 0 and update after layers are loaded,
self.upload_relishes,
self.relish_uploader.as_ref().map(Arc::clone),
0, // init with 0 and update after layers are loaded
)?;

// List the layers on disk, and load them into the layer map
timeline.load_layer_map(disk_consistent_lsn)?;
timeline.load_layer_map()?;

// needs to be after load_layer_map
timeline.init_current_logical_size()?;
@@ -280,14 +269,15 @@ impl LayeredRepository {
conf: &'static PageServerConf,
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
tenantid: ZTenantId,
upload_relishes: bool,
) -> LayeredRepository {
LayeredRepository {
tenantid,
conf,
timelines: Mutex::new(HashMap::new()),
walredo_mgr,
upload_relishes,
relish_uploader: conf.relish_storage_config.as_ref().map(|config| {
Arc::new(QueueBasedRelishUploader::new(config, &conf.workdir).unwrap())
}),
}
}

@@ -362,36 +352,13 @@ impl LayeredRepository {
timelineid: ZTimelineId,
tenantid: ZTenantId,
data: &TimelineMetadata,
first_save: bool,
) -> Result<PathBuf> {
let timeline_path = conf.timeline_path(&timelineid, &tenantid);
let path = timeline_path.join("metadata");
// use OpenOptions to ensure file presence is consistent with first_save
let mut file = OpenOptions::new()
.write(true)
.create_new(first_save)
.open(&path)?;
let path = conf.timeline_path(&timelineid, &tenantid).join("metadata");
let mut file = File::create(&path)?;

info!("saving metadata {}", path.display());

let mut metadata_bytes = TimelineMetadata::ser(data)?;

assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE);
metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8);

let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]);
metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum));

if file.write(&metadata_bytes)? != metadata_bytes.len() {
bail!("Could not write all the metadata bytes in a single call");
}
file.sync_all()?;

// fsync the parent directory to ensure the directory entry is durable
if first_save {
let timeline_dir = File::open(&timeline_path)?;
timeline_dir.sync_all()?;
}
file.write_all(&TimelineMetadata::ser(data)?)?;

Ok(path)
}
@@ -402,18 +369,9 @@ impl LayeredRepository {
tenantid: ZTenantId,
) -> Result<TimelineMetadata> {
let path = conf.timeline_path(&timelineid, &tenantid).join("metadata");
let metadata_bytes = std::fs::read(&path)?;
ensure!(metadata_bytes.len() == METADATA_MAX_SAFE_SIZE);
let data = std::fs::read(&path)?;

let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE];
let calculated_checksum = crc32c::crc32c(data);

let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] =
metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?;
let expected_checksum = u32::from_le_bytes(*checksum_bytes);
ensure!(calculated_checksum == expected_checksum);

let data = TimelineMetadata::des_prefix(data)?;
let data = TimelineMetadata::des(&data)?;
assert!(data.disk_consistent_lsn.is_aligned());

Ok(data)
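For reference, here is a standalone sketch of the fixed-size, checksummed metadata format that appears in this hunk: the serialized payload is padded to 512 bytes (PG_CONTROL_MAX_SAFE_SIZE) and a little-endian crc32c of the first 508 bytes is stored in the last 4 bytes; on the first save the parent directory is also fsynced so the new directory entry is durable. The function and constant names below are illustrative, not the pageserver's actual API.

```rust
// Minimal sketch of the metadata-file format from the hunk above.
// Assumes the `anyhow` and `crc32c` crates, as used in the diff itself.
use anyhow::{bail, ensure, Result};
use std::fs::{File, OpenOptions};
use std::io::Write;
use std::path::Path;

const MAX_SAFE_SIZE: usize = 512; // taken from PG_CONTROL_MAX_SAFE_SIZE
const CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
const MAX_DATA_SIZE: usize = MAX_SAFE_SIZE - CHECKSUM_SIZE;

fn save_metadata(dir: &Path, payload: &[u8], first_save: bool) -> Result<()> {
    ensure!(payload.len() <= MAX_DATA_SIZE, "metadata payload too large");
    let mut bytes = payload.to_vec();
    bytes.resize(MAX_SAFE_SIZE, 0u8); // pad with zeroes up to the fixed file size
    let checksum = crc32c::crc32c(&bytes[..MAX_DATA_SIZE]);
    bytes[MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum));

    let path = dir.join("metadata");
    // create_new(first_save): fail if the file already exists on the first save,
    // or if it is unexpectedly missing on later saves.
    let mut file = OpenOptions::new()
        .write(true)
        .create_new(first_save)
        .open(&path)?;
    if file.write(&bytes)? != bytes.len() {
        bail!("could not write all metadata bytes in a single call");
    }
    file.sync_all()?;
    if first_save {
        // fsync the parent directory so the new directory entry is durable too
        File::open(dir)?.sync_all()?;
    }
    Ok(())
}

fn load_metadata(dir: &Path) -> Result<Vec<u8>> {
    let bytes = std::fs::read(dir.join("metadata"))?;
    ensure!(bytes.len() == MAX_SAFE_SIZE, "unexpected metadata file size");
    let expected = u32::from_le_bytes(bytes[MAX_DATA_SIZE..].try_into()?);
    let actual = crc32c::crc32c(&bytes[..MAX_DATA_SIZE]);
    ensure!(actual == expected, "metadata checksum mismatch");
    Ok(bytes[..MAX_DATA_SIZE].to_vec())
}
```

Loading recomputes the checksum over the data portion and rejects a file whose stored checksum does not match, which is the behaviour the corrupt_metadata test further down checks for.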
@@ -543,7 +501,7 @@ impl LayeredRepository {
error!("timeline size calculation diverged, incremental doesn't match non incremental. incremental={} non_incremental={}", incremental, non_incremental);
}
}
Err(e) => error!("failed to calculate non incremental timeline size: {:#}", e),
Err(e) => error!("failed to calculate non incremental timeline size: {}", e),
}
}

@@ -589,6 +547,8 @@ pub struct LayeredTimeline {
// WAL redo manager
walredo_mgr: Arc<dyn WalRedoManager + Sync + Send>,

relish_uploader: Option<Arc<QueueBasedRelishUploader>>,

// What page versions do we hold in the repository? If we get a
// request > last_record_lsn, we need to wait until we receive all
// the WAL up to the request. The SeqWait provides functions for
@@ -636,9 +596,6 @@ pub struct LayeredTimeline {
// TODO: it is possible to combine these two fields into single one using custom metric which uses SeqCst
// ordering for its operations, but involves private modules, and macro trickery
current_logical_size_gauge: IntGauge,

/// If `true`, will backup its timeline files to remote storage after freezing.
upload_relishes: bool,
}

/// Public interface functions
@@ -646,11 +603,8 @@ impl Timeline for LayeredTimeline {
/// Wait until WAL has been received up to the given LSN.
fn wait_lsn(&self, lsn: Lsn) -> Result<()> {
// This should never be called from the WAL receiver thread, because that could lead
// to a deadlock.
assert!(
!IS_WAL_RECEIVER.with(|c| c.get()),
"wait_lsn called by WAL receiver thread"
);
// to a deadlock. FIXME: Is there a less hacky way to check that?
assert_ne!(thread::current().name(), Some("WAL receiver thread"));

self.last_record_lsn
.wait_for_timeout(lsn, TIMEOUT)
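Both assertions in this hunk guard against the same thing: wait_lsn() must never run on the WAL receiver thread, because, as the comment notes, that could deadlock. One side checks the thread's name, the other checks a thread-local IS_WAL_RECEIVER flag. Below is a minimal, self-contained sketch of that thread-local flag pattern; the names are illustrative, the real flag lives in the walreceiver module.

```rust
// Sketch of a thread-local "am I the WAL receiver?" flag, as used by the
// IS_WAL_RECEIVER variant of the assertion above.
use std::cell::Cell;
use std::thread;

thread_local! {
    // false everywhere except in the thread that marks itself as the WAL receiver
    static IS_WAL_RECEIVER: Cell<bool> = Cell::new(false);
}

fn wait_lsn_guard() {
    // Reject the call if the current thread has marked itself as the WAL receiver.
    assert!(
        !IS_WAL_RECEIVER.with(|c| c.get()),
        "wait_lsn called by WAL receiver thread"
    );
}

fn main() {
    wait_lsn_guard(); // fine on the main thread

    let handle = thread::spawn(|| {
        IS_WAL_RECEIVER.with(|c| c.set(true));
        // calling wait_lsn_guard() here would panic, which is the point
        IS_WAL_RECEIVER.with(|c| c.get())
    });
    assert!(handle.join().unwrap());
}
```

Unlike the name-based check, the flag does not depend on how the receiver thread happens to be named, which is what the FIXME comment is getting at.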
@@ -1022,8 +976,8 @@ impl LayeredTimeline {
timelineid: ZTimelineId,
tenantid: ZTenantId,
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
relish_uploader: Option<Arc<QueueBasedRelishUploader>>,
current_logical_size: usize,
upload_relishes: bool,
) -> Result<LayeredTimeline> {
let current_logical_size_gauge = LOGICAL_TIMELINE_SIZE
.get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()])
@@ -1035,6 +989,7 @@ impl LayeredTimeline {
layers: Mutex::new(LayerMap::default()),

walredo_mgr,
relish_uploader,

// initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'.
last_record_lsn: SeqWait::new(RecordLsn {
@@ -1047,7 +1002,6 @@ impl LayeredTimeline {
ancestor_lsn: metadata.ancestor_lsn,
current_logical_size: AtomicUsize::new(current_logical_size),
current_logical_size_gauge,
upload_relishes,
};
Ok(timeline)
}
@@ -1055,7 +1009,7 @@ impl LayeredTimeline {
///
/// Scan the timeline directory to populate the layer map
///
fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
fn load_layer_map(&self) -> anyhow::Result<()> {
info!(
"loading layer map for timeline {} into memory",
self.timelineid
@@ -1064,20 +1018,8 @@ impl LayeredTimeline {
let (imgfilenames, mut deltafilenames) =
filename::list_files(self.conf, self.timelineid, self.tenantid)?;

let timeline_path = self.conf.timeline_path(&self.timelineid, &self.tenantid);

// First create ImageLayer structs for each image file.
for filename in imgfilenames.iter() {
if filename.lsn > disk_consistent_lsn {
warn!(
"found future image layer {} on timeline {}",
filename, self.timelineid
);

rename_to_backup(timeline_path.join(filename.to_string()))?;
continue;
}

let layer = ImageLayer::new(self.conf, self.timelineid, self.tenantid, filename);

info!(
@@ -1095,17 +1037,6 @@ impl LayeredTimeline {
deltafilenames.sort();

for filename in deltafilenames.iter() {
ensure!(filename.start_lsn < filename.end_lsn);
if filename.end_lsn > disk_consistent_lsn {
warn!(
"found future delta layer {} on timeline {}",
filename, self.timelineid
);

rename_to_backup(timeline_path.join(filename.to_string()))?;
continue;
}

let predecessor = layers.get(&filename.seg, filename.start_lsn);

let predecessor_str: String = if let Some(prec) = &predecessor {
@@ -1253,12 +1184,13 @@ impl LayeredTimeline {
assert!(lsn.is_aligned());

let last_record_lsn = self.get_last_record_lsn();
assert!(
lsn > last_record_lsn,
"cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})",
lsn,
last_record_lsn,
);
if lsn <= last_record_lsn {
panic!(
"cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})",
lsn,
last_record_lsn
);
}

// Do we have a layer open for writing already?
let layer;
@@ -1378,8 +1310,6 @@ impl LayeredTimeline {
last_record_lsn
);

let timeline_dir = File::open(self.conf.timeline_path(&self.timelineid, &self.tenantid))?;

// Take the in-memory layer with the oldest WAL record. If it's older
// than the threshold, write it out to disk as a new image and delta file.
// Repeat until all remaining in-memory layers are within the threshold.
@@ -1391,8 +1321,6 @@ impl LayeredTimeline {
// a lot of memory and/or aren't receiving much updates anymore.
let mut disk_consistent_lsn = last_record_lsn;

let mut created_historics = false;

while let Some((oldest_layer, oldest_generation)) = layers.peek_oldest_open() {
let oldest_pending_lsn = oldest_layer.get_oldest_pending_lsn();

@@ -1445,9 +1373,10 @@ impl LayeredTimeline {
drop(layers);
let new_historics = frozen.write_to_disk(self)?;
layers = self.layers.lock().unwrap();

if !new_historics.is_empty() {
created_historics = true;
if let Some(relish_uploader) = &self.relish_uploader {
for label_path in new_historics.iter().filter_map(|layer| layer.path()) {
relish_uploader.schedule_upload(self.timelineid, label_path);
}
}

// Finally, replace the frozen in-memory layer with the new on-disk layers
@@ -1479,14 +1408,6 @@ impl LayeredTimeline {
layer.unload()?;
}

drop(layers);

if created_historics {
// We must fsync the timeline dir to ensure the directory entries for
// new layer files are durable
timeline_dir.sync_all()?;
}

// Save the metadata, with updated 'disk_consistent_lsn', to a
// file in the timeline dir. After crash, we will restart WAL
// streaming and processing from that point.
@@ -1511,23 +1432,10 @@ impl LayeredTimeline {
ancestor_timeline: ancestor_timelineid,
ancestor_lsn: self.ancestor_lsn,
};
let _metadata_path = LayeredRepository::save_metadata(
self.conf,
self.timelineid,
self.tenantid,
&metadata,
false,
)?;
if self.upload_relishes {
schedule_timeline_upload(())
// schedule_timeline_upload(LocalTimeline {
// tenant_id: self.tenantid,
// timeline_id: self.timelineid,
// metadata_path,
// image_layers: image_layer_uploads,
// delta_layers: delta_layer_uploads,
// disk_consistent_lsn,
// });
let metadata_path =
LayeredRepository::save_metadata(self.conf, self.timelineid, self.tenantid, &metadata)?;
if let Some(relish_uploader) = &self.relish_uploader {
relish_uploader.schedule_upload(self.timelineid, metadata_path);
}

// Also update the in-memory copy
@@ -1960,23 +1868,3 @@ fn layer_ptr_eq(l1: &dyn Layer, l2: &dyn Layer) -> bool {
// see here for more https://github.com/rust-lang/rust/issues/46139
std::ptr::eq(l1_ptr as *const (), l2_ptr as *const ())
}

/// Add a suffix to a layer file's name: .{num}.old
/// Uses the first available num (starts at 0)
fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> {
let filename = path.file_name().unwrap().to_str().unwrap();
let mut new_path = path.clone();

for i in 0u32.. {
new_path.set_file_name(format!("{}.{}.old", filename, i));
if !new_path.exists() {
std::fs::rename(&path, &new_path)?;
return Ok(());
}
}

Err(anyhow!(
"couldn't find an unused backup number for {:?}",
path
))
}
@@ -42,10 +42,12 @@ use crate::layered_repository::filename::{DeltaFileName, PathOrConf};
use crate::layered_repository::storage_layer::{
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag,
};
use crate::repository::WALRecord;
use crate::waldecoder;
use crate::PageServerConf;
use crate::{ZTenantId, ZTimelineId};
use anyhow::{bail, Result};
use bytes::Bytes;
use log::*;
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;
@@ -107,6 +109,12 @@ impl From<&DeltaLayer> for Summary {
}
}

#[derive(Serialize, Deserialize)]
struct PageVersionMeta {
page_image_range: Option<BlobRange>,
record_range: Option<BlobRange>,
}

///
/// DeltaLayer is the in-memory data structure associated with an
/// on-disk delta file. We keep a DeltaLayer in memory for each
@@ -144,7 +152,7 @@ pub struct DeltaLayerInner {

/// All versions of all pages in the file are kept here.
/// Indexed by block number and LSN.
page_version_metas: BTreeMap<(u32, Lsn), BlobRange>,
page_version_metas: BTreeMap<(u32, Lsn), PageVersionMeta>,

/// `relsizes` tracks the size of the relation at different points in time.
relsizes: BTreeMap<Lsn, u32>,
@@ -221,15 +229,15 @@ impl Layer for DeltaLayer {
let mut iter = inner
.page_version_metas
.range((Included(&minkey), Included(&maxkey)));
while let Some(((_blknum, _entry_lsn), blob_range)) = iter.next_back() {
let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?;

if let Some(img) = pv.page_image {
while let Some(((_blknum, _entry_lsn), entry)) = iter.next_back() {
if let Some(img_range) = &entry.page_image_range {
// Found a page image, return it
let img = Bytes::from(read_blob(&page_version_reader, img_range)?);
reconstruct_data.page_img = Some(img);
need_image = false;
break;
} else if let Some(rec) = pv.record {
} else if let Some(rec_range) = &entry.record_range {
let rec = WALRecord::des(&read_blob(&page_version_reader, rec_range)?)?;
let will_init = rec.will_init;
reconstruct_data.records.push(rec);
if will_init {
@@ -332,16 +340,16 @@ impl Layer for DeltaLayer {
println!("--- page versions ---");
let (_path, book) = self.open_book()?;
let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
for ((blk, lsn), blob_range) in inner.page_version_metas.iter() {
for (k, v) in inner.page_version_metas.iter() {
let mut desc = String::new();

let buf = read_blob(&chapter, blob_range)?;
let pv = PageVersion::des(&buf)?;

if let Some(img) = pv.page_image.as_ref() {
write!(&mut desc, " img {} bytes", img.len())?;
if let Some(page_image_range) = v.page_image_range.as_ref() {
let image = read_blob(&chapter, page_image_range)?;
write!(&mut desc, " img {} bytes", image.len())?;
}
if let Some(rec) = pv.record.as_ref() {
if let Some(record_range) = v.record_range.as_ref() {
let record_bytes = read_blob(&chapter, record_range)?;
let rec = WALRecord::des(&record_bytes)?;
let wal_desc = waldecoder::describe_wal_record(&rec.rec);
write!(
&mut desc,
@@ -351,7 +359,7 @@ impl Layer for DeltaLayer {
wal_desc
)?;
}
println!(" blk {} at {}: {}", blk, lsn, desc);
println!(" blk {} at {}: {}", k.0, k.1, desc);
}

Ok(())
@@ -424,10 +432,28 @@ impl DeltaLayer {
let mut page_version_writer = BlobWriter::new(book, PAGE_VERSIONS_CHAPTER);

for (key, page_version) in page_versions {
let buf = PageVersion::ser(page_version)?;
let blob_range = page_version_writer.write_blob(&buf)?;
let page_image_range = page_version
.page_image
.as_ref()
.map(|page_image| page_version_writer.write_blob(page_image))
.transpose()?;

let old = inner.page_version_metas.insert(*key, blob_range);
let record_range = page_version
.record
.as_ref()
.map(|record| {
let buf = WALRecord::ser(record)?;
page_version_writer.write_blob(&buf)
})
.transpose()?;

let old = inner.page_version_metas.insert(
*key,
PageVersionMeta {
page_image_range,
record_range,
},
);

assert!(old.is_none());
}
@@ -461,8 +487,7 @@ impl DeltaLayer {
let book = chapter.close()?;

// This flushes the underlying 'buf_writer'.
let writer = book.close()?;
writer.get_ref().sync_all()?;
book.close()?;

trace!("saved {}", &path.display());
@@ -290,11 +290,7 @@ pub fn list_files(
deltafiles.push(deltafilename);
} else if let Some(imgfilename) = ImageFileName::from_str(fname) {
imgfiles.push(imgfilename);
} else if fname == "wal"
|| fname == "metadata"
|| fname == "ancestor"
|| fname.ends_with(".old")
{
} else if fname == "wal" || fname == "metadata" || fname == "ancestor" {
// ignore these
} else {
warn!("unrecognized filename in timeline dir: {}", fname);
@@ -337,8 +337,7 @@ impl ImageLayer {
let book = chapter.close()?;

// This flushes the underlying 'buf_writer'.
let writer = book.close()?;
writer.get_ref().sync_all()?;
book.close()?;

trace!("saved {}", &path.display());
@@ -283,7 +283,6 @@ mod tests {
write!(f, "{}", self.val)
}
}
#[rustfmt::skip]
fn assert_search(
tree: &IntervalTree<MockItem>,
key: u32,
@@ -292,20 +291,24 @@ mod tests {
if let Some(v) = tree.search(key) {
let vstr = v.to_string();

assert!(!expected.is_empty(), "search with {} returned {}, expected None", key, v);
assert!(
expected.contains(&vstr.as_str()),
"search with {} returned {}, expected one of: {:?}",
key, v, expected,
);
if expected.is_empty() {
panic!("search with {} returned {}, expected None", key, v);
}

if !expected.contains(&vstr.as_str()) {
panic!(
"search with {} returned {}, expected one of: {:?}",
key, v, expected
);
}
Some(v)
} else {
assert!(
expected.is_empty(),
"search with {} returned None, expected one of {:?}",
key, expected
);
if !expected.is_empty() {
panic!(
"search with {} returned None, expected one of {:?}",
key, expected
);
}
None
}
}
@@ -13,7 +13,7 @@ pub mod http;
pub mod layered_repository;
pub mod page_service;
pub mod relish;
pub mod relish_storage;
mod relish_storage;
pub mod repository;
pub mod restore_local_repo;
pub mod tenant_mgr;
@@ -40,7 +40,6 @@ pub mod defaults {
pub const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(100);

pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
pub const DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS: usize = 100;
}

lazy_static! {
@@ -169,37 +168,18 @@ impl PageServerConf {

/// External relish storage configuration, enough for creating a client for that storage.
#[derive(Debug, Clone)]
pub struct RelishStorageConfig {
/// Limits the number of concurrent sync operations between pageserver and relish storage.
pub max_concurrent_sync: usize,
/// The storage connection configuration.
pub storage: RelishStorageKind,
}

/// A kind of a relish storage to connect to, with its connection configuration.
#[derive(Debug, Clone)]
pub enum RelishStorageKind {
/// Storage based on local file system.
/// Specify a root folder to place all stored relish data into.
pub enum RelishStorageConfig {
/// Root folder to place all stored relish data into.
LocalFs(PathBuf),
/// AWS S3 based storage, storing all relishes into the root
/// of the S3 bucket from the config.
AwsS3(S3Config),
}

/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
#[derive(Clone)]
pub struct S3Config {
/// Name of the bucket to connect to.
pub bucket_name: String,
/// The region where the bucket is located at.
pub bucket_region: String,
/// "Login" to use when connecting to bucket.
/// Can be empty for cases like AWS k8s IAM
/// where we can allow certain pods to connect
/// to the bucket directly without any credentials.
pub access_key_id: Option<String>,
/// "Password" to use when connecting to bucket.
pub secret_access_key: Option<String>,
}
@@ -194,7 +194,7 @@ pub fn thread_main(
let local_auth = auth.clone();
thread::spawn(move || {
if let Err(err) = page_service_conn_main(conf, local_auth, socket, auth_type) {
error!("page server thread exiting with error: {:#}", err);
error!("error: {}", err);
}
});
}
@@ -8,43 +8,14 @@

mod local_fs;
mod rust_s3;
/// A queue-based storage with the background machinery behind it to synchronize
/// local page server layer files with external storage.
mod synced_storage;
/// A queue and the background machinery behind it to upload
/// local page server layer files to external storage.
pub mod storage_uploader;

use std::path::Path;
use std::thread;

use anyhow::Context;

use self::local_fs::LocalFs;
pub use self::synced_storage::schedule_timeline_upload;
use crate::relish_storage::rust_s3::RustS3;
use crate::{PageServerConf, RelishStorageKind};

pub fn run_storage_sync_thread(
config: &'static PageServerConf,
) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
match &config.relish_storage_config {
Some(relish_storage_config) => {
let max_concurrent_sync = relish_storage_config.max_concurrent_sync;
match &relish_storage_config.storage {
RelishStorageKind::LocalFs(root) => synced_storage::run_storage_sync_thread(
config,
LocalFs::new(root.clone())?,
max_concurrent_sync,
),
RelishStorageKind::AwsS3(s3_config) => synced_storage::run_storage_sync_thread(
config,
RustS3::new(s3_config)?,
max_concurrent_sync,
),
}
}
None => Ok(None),
}
}

/// Storage (potentially remote) API to manage its state.
#[async_trait::async_trait]
pub trait RelishStorage: Send + Sync {
pageserver/src/relish_storage/storage_uploader.rs (new file, 116 lines)

@@ -0,0 +1,116 @@
use std::{
collections::VecDeque,
path::{Path, PathBuf},
sync::{Arc, Mutex},
thread,
};

use zenith_utils::zid::ZTimelineId;

use crate::{relish_storage::RelishStorage, RelishStorageConfig};

use super::{local_fs::LocalFs, rust_s3::RustS3};

pub struct QueueBasedRelishUploader {
upload_queue: Arc<Mutex<VecDeque<(ZTimelineId, PathBuf)>>>,
}

impl QueueBasedRelishUploader {
pub fn new(
config: &RelishStorageConfig,
page_server_workdir: &'static Path,
) -> anyhow::Result<Self> {
let upload_queue = Arc::new(Mutex::new(VecDeque::new()));
let _handle = match config {
RelishStorageConfig::LocalFs(root) => {
let relish_storage = LocalFs::new(root.clone())?;
create_upload_thread(
Arc::clone(&upload_queue),
relish_storage,
page_server_workdir,
)?
}
RelishStorageConfig::AwsS3(s3_config) => {
let relish_storage = RustS3::new(s3_config)?;
create_upload_thread(
Arc::clone(&upload_queue),
relish_storage,
page_server_workdir,
)?
}
};

Ok(Self { upload_queue })
}

pub fn schedule_upload(&self, timeline_id: ZTimelineId, relish_path: PathBuf) {
self.upload_queue
.lock()
.unwrap()
.push_back((timeline_id, relish_path))
}
}

fn create_upload_thread<P, S: 'static + RelishStorage<RelishStoragePath = P>>(
upload_queue: Arc<Mutex<VecDeque<(ZTimelineId, PathBuf)>>>,
relish_storage: S,
page_server_workdir: &'static Path,
) -> std::io::Result<thread::JoinHandle<()>> {
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?;
thread::Builder::new()
.name("Queue based relish uploader".to_string())
.spawn(move || loop {
runtime.block_on(async {
upload_loop_step(&upload_queue, &relish_storage, page_server_workdir).await;
})
})
}

async fn upload_loop_step<P, S: 'static + RelishStorage<RelishStoragePath = P>>(
upload_queue: &Mutex<VecDeque<(ZTimelineId, PathBuf)>>,
relish_storage: &S,
page_server_workdir: &Path,
) {
let mut queue_accessor = upload_queue.lock().unwrap();
log::debug!("current upload queue length: {}", queue_accessor.len());
let next_upload = queue_accessor.pop_front();
drop(queue_accessor);

let (relish_timeline_id, relish_local_path) = match next_upload {
Some(data) => data,
None => {
// Don't spin and allow others to use the queue.
// In future, could be improved to be more clever about delays depending on relish upload stats
thread::sleep(std::time::Duration::from_secs(1));
return;
}
};

if let Err(e) = upload_relish(relish_storage, page_server_workdir, &relish_local_path).await {
log::error!(
"Failed to upload relish '{}' for timeline {}, reason: {}",
relish_local_path.display(),
relish_timeline_id,
e
);
upload_queue
.lock()
.unwrap()
.push_back((relish_timeline_id, relish_local_path))
} else {
log::debug!("Relish successfully uploaded");
}
}

async fn upload_relish<P, S: RelishStorage<RelishStoragePath = P>>(
relish_storage: &S,
page_server_workdir: &Path,
relish_local_path: &Path,
) -> anyhow::Result<()> {
let destination = S::derive_destination(page_server_workdir, relish_local_path)?;
relish_storage
.upload_relish(relish_local_path, &destination)
.await
}
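The uploader introduced in this file boils down to a mutex-guarded queue shared with one background worker that pops work, sleeps when the queue is empty, and re-queues items whose upload failed. Here is a condensed, std-only illustration of that pattern, with placeholder types standing in for ZTimelineId and the RelishStorage trait.

```rust
// Simplified sketch of the queue-plus-worker pattern used by QueueBasedRelishUploader.
// All names and types here are placeholders for illustration only.
use std::collections::VecDeque;
use std::path::PathBuf;
use std::sync::{Arc, Mutex};
use std::thread;
use std::time::Duration;

fn spawn_uploader<F>(upload: F) -> Arc<Mutex<VecDeque<PathBuf>>>
where
    F: Fn(&PathBuf) -> Result<(), String> + Send + 'static,
{
    let queue = Arc::new(Mutex::new(VecDeque::new()));
    let worker_queue = Arc::clone(&queue);
    thread::spawn(move || loop {
        // Hold the lock only long enough to pop, so producers never wait on an upload.
        let next = worker_queue.lock().unwrap().pop_front();
        match next {
            None => thread::sleep(Duration::from_secs(1)), // don't spin on an empty queue
            Some(path) => {
                if let Err(e) = upload(&path) {
                    eprintln!("upload of {} failed: {}, re-queueing", path.display(), e);
                    worker_queue.lock().unwrap().push_back(path);
                }
            }
        }
    });
    queue
}

fn main() {
    let queue = spawn_uploader(|path| {
        println!("pretending to upload {}", path.display());
        Ok(())
    });
    queue.lock().unwrap().push_back(PathBuf::from("layer_file_0"));
    thread::sleep(Duration::from_secs(2)); // give the toy worker a moment to drain the queue
}
```

The real implementation additionally tags every queued path with its timeline id and drives each upload through a current-thread tokio runtime, since the RelishStorage trait is async.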
@@ -1,52 +0,0 @@
use std::time::Duration;
use std::{collections::BinaryHeap, sync::Mutex, thread};

use crate::{relish_storage::RelishStorage, PageServerConf};

lazy_static::lazy_static! {
static ref UPLOAD_QUEUE: Mutex<BinaryHeap<SyncTask>> = Mutex::new(BinaryHeap::new());
}

pub fn schedule_timeline_upload(_local_timeline: ()) {
// UPLOAD_QUEUE
// .lock()
// .unwrap()
// .push(SyncTask::Upload(local_timeline))
}

#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
enum SyncTask {}

pub fn run_storage_sync_thread<
P: std::fmt::Debug,
S: 'static + RelishStorage<RelishStoragePath = P>,
>(
config: &'static PageServerConf,
relish_storage: S,
max_concurrent_sync: usize,
) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?;

let handle = thread::Builder::new()
.name("Queue based relish storage sync".to_string())
.spawn(move || loop {
let mut queue_accessor = UPLOAD_QUEUE.lock().unwrap();
log::debug!("Upload queue length: {}", queue_accessor.len());
let next_task = queue_accessor.pop();
drop(queue_accessor);
match next_task {
Some(task) => runtime.block_on(async {
// suppress warnings
let _ = (config, task, &relish_storage, max_concurrent_sync);
todo!("omitted for brevity")
}),
None => {
thread::sleep(Duration::from_secs(1));
continue;
}
}
})?;
Ok(Some(handle))
}
@@ -212,18 +212,12 @@ mod tests {
use crate::layered_repository::LayeredRepository;
use crate::walredo::{WalRedoError, WalRedoManager};
use crate::PageServerConf;
use hex_literal::hex;
use postgres_ffi::pg_constants;
use postgres_ffi::xlog_utils::SIZEOF_CHECKPOINT;
use std::fs;
use std::path::PathBuf;
use std::str::FromStr;
use zenith_utils::zid::ZTenantId;

const TIMELINE_ID: ZTimelineId =
ZTimelineId::from_array(hex!("11223344556677881122334455667788"));
const NEW_TIMELINE_ID: ZTimelineId =
ZTimelineId::from_array(hex!("AA223344556677881122334455667788"));

/// Arbitrary relation tag, for testing.
const TESTREL_A: RelishTag = RelishTag::Relation(RelTag {
spcnode: 0,
@@ -259,53 +253,39 @@ mod tests {
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);

struct RepoHarness {
conf: &'static PageServerConf,
tenant_id: ZTenantId,
}
fn get_test_repo(test_name: &str) -> Result<Box<dyn Repository>> {
let repo_dir = PageServerConf::test_repo_dir(test_name);
let _ = fs::remove_dir_all(&repo_dir);
fs::create_dir_all(&repo_dir)?;
fs::create_dir_all(&repo_dir.join("timelines"))?;

impl RepoHarness {
fn create(test_name: &'static str) -> Result<Self> {
let repo_dir = PageServerConf::test_repo_dir(test_name);
let _ = fs::remove_dir_all(&repo_dir);
fs::create_dir_all(&repo_dir)?;
fs::create_dir_all(&repo_dir.join("timelines"))?;
let conf = PageServerConf::dummy_conf(repo_dir);
// Make a static copy of the config. This can never be free'd, but that's
// OK in a test.
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
let tenantid = ZTenantId::generate();
fs::create_dir_all(conf.tenant_path(&tenantid)).unwrap();

let conf = PageServerConf::dummy_conf(repo_dir);
// Make a static copy of the config. This can never be free'd, but that's
// OK in a test.
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
let walredo_mgr = TestRedoManager {};

let tenant_id = ZTenantId::generate();
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
let repo = Box::new(LayeredRepository::new(
conf,
Arc::new(walredo_mgr),
tenantid,
));

Ok(Self { conf, tenant_id })
}

fn load(&self) -> Box<dyn Repository> {
let walredo_mgr = Arc::new(TestRedoManager);

Box::new(LayeredRepository::new(
self.conf,
walredo_mgr,
self.tenant_id,
false,
))
}

fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf {
self.conf.timeline_path(timeline_id, &self.tenant_id)
}
Ok(repo)
}

#[test]
fn test_relsize() -> Result<()> {
let repo = RepoHarness::create("test_relsize")?.load();
let repo = get_test_repo("test_relsize")?;
// get_timeline() with non-existent timeline id should fail
//repo.get_timeline("11223344556677881122334455667788");

// Create timeline to work on
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid)?;

tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
@@ -419,10 +399,11 @@ mod tests {
// and then created it again within the same layer.
#[test]
fn test_drop_extend() -> Result<()> {
let repo = RepoHarness::create("test_drop_extend")?.load();
let repo = get_test_repo("test_drop_extend")?;

// Create timeline to work on
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid)?;

tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
tline.advance_last_record_lsn(Lsn(0x20));
@@ -455,10 +436,11 @@ mod tests {
// and then extended it again within the same layer.
#[test]
fn test_truncate_extend() -> Result<()> {
let repo = RepoHarness::create("test_truncate_extend")?.load();
let repo = get_test_repo("test_truncate_extend")?;

// Create timeline to work on
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid)?;

//from storage_layer.rs
const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192;
@@ -555,8 +537,9 @@ mod tests {
/// split into multiple 1 GB segments in Postgres.
#[test]
fn test_large_rel() -> Result<()> {
let repo = RepoHarness::create("test_large_rel")?.load();
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
let repo = get_test_repo("test_large_rel")?;
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid)?;

let mut lsn = 0x10;
for blknum in 0..pg_constants::RELSEG_SIZE + 1 {
@@ -617,8 +600,9 @@ mod tests {
///
#[test]
fn test_list_rels_drop() -> Result<()> {
let repo = RepoHarness::create("test_list_rels_drop")?.load();
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
let repo = get_test_repo("test_list_rels_drop")?;
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid)?;
const TESTDB: u32 = 111;

// Import initial dummy checkpoint record, otherwise the get_timeline() call
@@ -636,8 +620,9 @@ mod tests {
assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&TESTREL_A));

// Create a branch, check that the relation is visible there
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
let newtimelineid = ZTimelineId::from_str("AA223344556677881122334455667788").unwrap();
repo.branch_timeline(timelineid, newtimelineid, Lsn(0x30))?;
let newtline = repo.get_timeline(newtimelineid)?;

assert!(newtline
.list_rels(0, TESTDB, Lsn(0x30))?
@@ -657,7 +642,7 @@ mod tests {

// Run checkpoint and garbage collection and check that it's still not visible
newtline.checkpoint()?;
repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?;
repo.gc_iteration(Some(newtimelineid), 0, true)?;

assert!(!newtline
.list_rels(0, TESTDB, Lsn(0x40))?
@@ -671,8 +656,9 @@ mod tests {
///
#[test]
fn test_branch() -> Result<()> {
let repo = RepoHarness::create("test_branch")?.load();
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
let repo = get_test_repo("test_branch")?;
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid)?;

// Import initial dummy checkpoint record, otherwise the get_timeline() call
// after branching fails below
@@ -690,8 +676,9 @@ mod tests {
assert_current_logical_size(&tline, Lsn(0x40));

// Branch the history, modify relation differently on the new timeline
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
let newtimelineid = ZTimelineId::from_str("AA223344556677881122334455667788").unwrap();
repo.branch_timeline(timelineid, newtimelineid, Lsn(0x30))?;
let newtline = repo.get_timeline(newtimelineid)?;

newtline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?;
newtline.advance_last_record_lsn(Lsn(0x40));
@@ -719,89 +706,8 @@ mod tests {
Ok(())
}

#[test]
fn corrupt_metadata() -> Result<()> {
const TEST_NAME: &str = "corrupt_metadata";
let harness = RepoHarness::create(TEST_NAME)?;
let repo = harness.load();

repo.create_empty_timeline(TIMELINE_ID)?;
drop(repo);

let metadata_path = harness.timeline_path(&TIMELINE_ID).join("metadata");

assert!(metadata_path.is_file());

let mut metadata_bytes = std::fs::read(&metadata_path)?;
assert_eq!(metadata_bytes.len(), 512);
metadata_bytes[512 - 4 - 2] ^= 1;
std::fs::write(metadata_path, metadata_bytes)?;

let new_repo = harness.load();
let err = new_repo.get_timeline(TIMELINE_ID).err().unwrap();
assert!(err.to_string().contains("checksum"));

Ok(())
}

#[test]
fn future_layerfiles() -> Result<()> {
const TEST_NAME: &str = "future_layerfiles";
let harness = RepoHarness::create(TEST_NAME)?;
let repo = harness.load();

repo.create_empty_timeline(TIMELINE_ID)?;
drop(repo);

let timeline_path = harness.timeline_path(&TIMELINE_ID);

let make_empty_file = |filename: &str| -> std::io::Result<()> {
let path = timeline_path.join(filename);

assert!(!path.exists());
std::fs::write(&path, &[])?;

Ok(())
};

let image_filename = format!("pg_control_0_{:016X}", 8000);
let delta_filename = format!("pg_control_0_{:016X}_{:016X}", 8000, 8008);

make_empty_file(&image_filename)?;
make_empty_file(&delta_filename)?;

let new_repo = harness.load();
new_repo.get_timeline(TIMELINE_ID).unwrap();
drop(new_repo);

let check_old = |filename: &str, num: u32| {
let path = timeline_path.join(filename);
assert!(!path.exists());

let backup_path = timeline_path.join(format!("{}.{}.old", filename, num));
assert!(backup_path.exists());
};

check_old(&image_filename, 0);
check_old(&delta_filename, 0);

make_empty_file(&image_filename)?;
make_empty_file(&delta_filename)?;

let new_repo = harness.load();
new_repo.get_timeline(TIMELINE_ID).unwrap();
drop(new_repo);

check_old(&image_filename, 0);
check_old(&delta_filename, 0);
check_old(&image_filename, 1);
check_old(&delta_filename, 1);

Ok(())
}

// Mock WAL redo manager that doesn't do much
struct TestRedoManager;
struct TestRedoManager {}

impl WalRedoManager for TestRedoManager {
fn request_redo(
@@ -11,7 +11,7 @@ use std::fs::File;
|
||||
use std::io::Read;
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use anyhow::Result;
|
||||
use bytes::{Buf, Bytes};
|
||||
|
||||
use crate::relish::*;
|
||||
@@ -173,7 +173,8 @@ fn import_relfile(
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
bail!("error reading file {}: {:#}", path.display(), e);
|
||||
error!("error reading file: {:?} ({})", path, e);
|
||||
break;
|
||||
}
|
||||
},
|
||||
};
|
||||
@@ -267,7 +268,8 @@ fn import_slru_file(timeline: &dyn Timeline, lsn: Lsn, slru: SlruKind, path: &Pa
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
bail!("error reading file {}: {:#}", path.display(), e);
|
||||
error!("error reading file: {:?} ({})", path, e);
|
||||
break;
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
@@ -9,84 +9,46 @@ use crate::PageServerConf;
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use log::info;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
lazy_static! {
|
||||
static ref REPOSITORY: Mutex<HashMap<ZTenantId, Arc<dyn Repository>>> =
|
||||
pub static ref REPOSITORY: Mutex<HashMap<ZTenantId, Arc<dyn Repository>>> =
|
||||
Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
fn access_repository() -> MutexGuard<'static, HashMap<ZTenantId, Arc<dyn Repository>>> {
|
||||
REPOSITORY.lock().unwrap()
|
||||
}
|
||||
|
||||
pub fn init(conf: &'static PageServerConf) {
|
||||
let mut m = access_repository();
|
||||
let mut m = REPOSITORY.lock().unwrap();
|
||||
|
||||
for dir_entry in fs::read_dir(conf.tenants_path()).unwrap() {
|
||||
let tenantid =
|
||||
ZTenantId::from_str(dir_entry.unwrap().file_name().to_str().unwrap()).unwrap();
|
||||
let repo = init_repo(conf, tenantid);
|
||||
|
||||
// Set up a WAL redo manager, for applying WAL records.
|
||||
let walredo_mgr = PostgresRedoManager::new(conf, tenantid);
|
||||
|
||||
// Set up an object repository, for actual data storage.
|
||||
let repo = Arc::new(LayeredRepository::new(
|
||||
conf,
|
||||
Arc::new(walredo_mgr),
|
||||
tenantid,
|
||||
));
|
||||
LayeredRepository::launch_checkpointer_thread(conf, repo.clone());
|
||||
LayeredRepository::launch_gc_thread(conf, repo.clone());
|
||||
|
||||
info!("initialized storage for tenant: {}", &tenantid);
|
||||
m.insert(tenantid, repo);
|
||||
}
|
||||
}
|
||||
|
||||
fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) -> Arc<LayeredRepository> {
|
||||
// Set up a WAL redo manager, for applying WAL records.
|
||||
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
|
||||
|
||||
// Set up an object repository, for actual data storage.
|
||||
let repo = Arc::new(LayeredRepository::new(
|
||||
conf,
|
||||
Arc::new(walredo_mgr),
|
||||
tenant_id,
|
||||
true,
|
||||
));
|
||||
LayeredRepository::launch_checkpointer_thread(conf, repo.clone());
|
||||
LayeredRepository::launch_gc_thread(conf, repo.clone());
|
||||
repo
|
||||
}
|
||||
|
||||
// TODO kb Currently unused function, will later be used when the relish storage downloads a new layer.
|
||||
// Relevant PR: https://github.com/zenithdb/zenith/pull/686
|
||||
pub fn register_relish_download(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
) {
|
||||
log::info!(
|
||||
"Registering new download, tenant id {}, timeline id: {}",
|
||||
tenant_id,
|
||||
timeline_id
|
||||
);
|
||||
match access_repository().entry(tenant_id) {
|
||||
Entry::Occupied(o) => init_timeline(o.get().as_ref(), timeline_id),
|
||||
Entry::Vacant(v) => {
|
||||
log::info!("New repo initialized");
|
||||
let new_repo = init_repo(conf, tenant_id);
|
||||
init_timeline(new_repo.as_ref(), timeline_id);
|
||||
v.insert(new_repo);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn init_timeline(repo: &dyn Repository, timeline_id: ZTimelineId) {
|
||||
match repo.get_timeline(timeline_id) {
|
||||
Ok(_timeline) => log::info!("Successfully initialized timeline {}", timeline_id),
|
||||
Err(e) => log::error!("Failed to init timeline {}, reason: {:#}", timeline_id, e),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn create_repository_for_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
) -> Result<()> {
|
||||
let mut m = access_repository();
|
||||
let mut m = REPOSITORY.lock().unwrap();
|
||||
|
||||
// First check that the tenant doesn't exist already
|
||||
if m.get(&tenantid).is_some() {
|
||||
@@ -101,13 +63,14 @@ pub fn create_repository_for_tenant(
|
||||
}
|
||||
|
||||
pub fn insert_repository_for_tenant(tenantid: ZTenantId, repo: Arc<dyn Repository>) {
|
||||
access_repository().insert(tenantid, repo);
|
||||
let o = &mut REPOSITORY.lock().unwrap();
|
||||
o.insert(tenantid, repo);
|
||||
}
|
||||
|
||||
pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<dyn Repository>> {
|
||||
access_repository()
|
||||
.get(&tenantid)
|
||||
.map(Arc::clone)
|
||||
let o = &REPOSITORY.lock().unwrap();
|
||||
o.get(&tenantid)
|
||||
.map(|repo| Arc::clone(repo))
|
||||
.ok_or_else(|| anyhow!("repository not found for tenant name {}", tenantid))
|
||||
}
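
[Editor's note] A minimal usage sketch of the registry accessors above, assuming only the signatures visible in this hunk (get_repository_for_tenant returning Result<Arc<dyn Repository>>, and Repository::get_timeline as used by init_timeline); it is not part of the original change:

// Hypothetical caller: resolve a tenant's repository from the global registry,
// then open one of its timelines. Errors propagate via anyhow::Result.
fn open_timeline(tenantid: ZTenantId, timelineid: ZTimelineId) -> Result<()> {
    let repo = get_repository_for_tenant(tenantid)?;
    let _timeline = repo.get_timeline(timelineid)?;
    Ok(())
}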
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ use crate::restore_local_repo;
|
||||
use crate::tenant_mgr;
|
||||
use crate::waldecoder::*;
|
||||
use crate::PageServerConf;
|
||||
use anyhow::{bail, Error, Result};
|
||||
use anyhow::{Error, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use postgres::fallible_iterator::FallibleIterator;
|
||||
@@ -20,7 +20,6 @@ use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::*;
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use postgres_types::PgLsn;
|
||||
use std::cell::Cell;
|
||||
use std::cmp::{max, min};
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
@@ -28,7 +27,6 @@ use std::str::FromStr;
|
||||
use std::sync::Mutex;
|
||||
use std::thread;
|
||||
use std::thread::sleep;
|
||||
use std::thread_local;
|
||||
use std::time::{Duration, SystemTime};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
@@ -46,13 +44,6 @@ lazy_static! {
|
||||
Mutex::new(HashMap::new());
|
||||
}

thread_local! {
// Boolean that is true only for WAL receiver threads
//
// This is used in `wait_lsn` to guard against usage that might lead to a deadlock.
pub(crate) static IS_WAL_RECEIVER: Cell<bool> = Cell::new(false);
}
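
[Editor's note] A sketch of how the IS_WAL_RECEIVER flag described above could be consulted inside wait_lsn to catch the deadlock-prone usage; the helper name and assertion are illustrative, not the actual implementation:

// Hypothetical guard: a WAL receiver thread must never block in wait_lsn()
// waiting for WAL that it is itself responsible for ingesting.
fn assert_not_wal_receiver() {
    IS_WAL_RECEIVER.with(|c| {
        assert!(!c.get(), "wait_lsn called from a WAL receiver thread");
    });
}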
|
||||
|
||||
// Launch a new WAL receiver, or tell one that's running about change in connection string
|
||||
pub fn launch_wal_receiver(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -73,10 +64,12 @@ pub fn launch_wal_receiver(
|
||||
receivers.insert(timelineid, receiver);
|
||||
|
||||
// Also launch a new thread to handle this connection
|
||||
//
|
||||
// NOTE: This thread name is checked in the assertion in wait_lsn. If you change
|
||||
// this, make sure you update the assertion too.
|
||||
let _walreceiver_thread = thread::Builder::new()
|
||||
.name("WAL receiver thread".into())
|
||||
.spawn(move || {
|
||||
IS_WAL_RECEIVER.with(|c| c.set(true));
|
||||
thread_main(conf, timelineid, tenantid);
|
||||
})
|
||||
.unwrap();
|
||||
@@ -165,7 +158,7 @@ fn walreceiver_main(
|
||||
let mut startpoint = last_rec_lsn;
|
||||
|
||||
if startpoint == Lsn(0) {
|
||||
bail!("No previous WAL position");
|
||||
error!("No previous WAL position");
|
||||
}
|
||||
|
||||
// There might be some padding after the last full record, skip it.
|
||||
|
||||
@@ -249,20 +249,164 @@ impl PostgresRedoManager {
|
||||
process: &mut PostgresRedoProcess,
|
||||
request: &WalRedoRequest,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let rel = request.rel;
|
||||
let blknum = request.blknum;
|
||||
let lsn = request.lsn;
|
||||
let base_img = request.base_img.clone();
|
||||
let records = &request.records;
|
||||
|
||||
let nrecords = records.len();
|
||||
|
||||
let start = Instant::now();
|
||||
|
||||
let apply_result = if let RelishTag::Relation(rel) = request.rel {
|
||||
let apply_result: Result<Bytes, Error>;
|
||||
if let RelishTag::Relation(rel) = rel {
|
||||
// Relational WAL records are applied using wal-redo-postgres
|
||||
let buf_tag = BufferTag {
|
||||
rel,
|
||||
blknum: request.blknum,
|
||||
};
|
||||
process
|
||||
.apply_wal_records(buf_tag, &request.base_img, &request.records)
|
||||
.await
|
||||
let buf_tag = BufferTag { rel, blknum };
|
||||
apply_result = process.apply_wal_records(buf_tag, base_img, records).await;
|
||||
} else {
|
||||
Ok(redo_nonrel(request))
|
||||
};
|
||||
// Non-relational WAL records are handled here, with custom code that has the
|
||||
// same effects as the corresponding Postgres WAL redo function.
|
||||
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
|
||||
let mut page = BytesMut::new();
|
||||
if let Some(fpi) = base_img {
|
||||
// If full-page image is provided, then use it...
|
||||
page.extend_from_slice(&fpi[..]);
|
||||
} else {
|
||||
// otherwise initialize page with zeros
|
||||
page.extend_from_slice(&ZERO_PAGE);
|
||||
}
|
||||
// Apply all collected WAL records
|
||||
for record in records {
|
||||
let mut buf = record.rec.clone();
|
||||
|
||||
WAL_REDO_RECORD_COUNTER.inc();
|
||||
|
||||
// 1. Parse XLogRecord struct
|
||||
// FIXME: refactor to avoid code duplication.
|
||||
let xlogrec = XLogRecord::from_bytes(&mut buf);
|
||||
|
||||
//move to main data
|
||||
// TODO probably, we should store some records in our special format
|
||||
// to avoid this weird parsing on replay
|
||||
let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
|
||||
if buf.remaining() > skip {
|
||||
buf.advance(skip);
|
||||
}
|
||||
|
||||
if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
|
||||
// Transaction manager stuff
|
||||
let rec_segno = match rel {
|
||||
RelishTag::Slru { slru, segno } => {
|
||||
if slru != SlruKind::Clog {
|
||||
panic!("Not valid XACT relish tag {:?}", rel);
|
||||
}
|
||||
segno
|
||||
}
|
||||
_ => panic!("Not valid XACT relish tag {:?}", rel),
|
||||
};
|
||||
let parsed_xact =
|
||||
XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
|
||||
if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
|
||||
|| parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
|
||||
{
|
||||
transaction_id_set_status(
|
||||
parsed_xact.xid,
|
||||
pg_constants::TRANSACTION_STATUS_COMMITTED,
|
||||
&mut page,
|
||||
);
|
||||
for subxact in &parsed_xact.subxacts {
|
||||
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
// only update xids on the requested page
|
||||
if rec_segno == segno && blknum == rpageno {
|
||||
transaction_id_set_status(
|
||||
*subxact,
|
||||
pg_constants::TRANSACTION_STATUS_COMMITTED,
|
||||
&mut page,
|
||||
);
|
||||
}
|
||||
}
|
||||
} else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
|
||||
|| parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
|
||||
{
|
||||
transaction_id_set_status(
|
||||
parsed_xact.xid,
|
||||
pg_constants::TRANSACTION_STATUS_ABORTED,
|
||||
&mut page,
|
||||
);
|
||||
for subxact in &parsed_xact.subxacts {
|
||||
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
// only update xids on the requested page
|
||||
if rec_segno == segno && blknum == rpageno {
|
||||
transaction_id_set_status(
|
||||
*subxact,
|
||||
pg_constants::TRANSACTION_STATUS_ABORTED,
|
||||
&mut page,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
|
||||
// Multixact operations
|
||||
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
|
||||
let xlrec = XlMultiXactCreate::decode(&mut buf);
|
||||
if let RelishTag::Slru {
|
||||
slru,
|
||||
segno: rec_segno,
|
||||
} = rel
|
||||
{
|
||||
if slru == SlruKind::MultiXactMembers {
|
||||
for i in 0..xlrec.nmembers {
|
||||
let pageno =
|
||||
i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
if segno == rec_segno && rpageno == blknum {
|
||||
// update only target block
|
||||
let offset = xlrec.moff + i;
|
||||
let memberoff = mx_offset_to_member_offset(offset);
|
||||
let flagsoff = mx_offset_to_flags_offset(offset);
|
||||
let bshift = mx_offset_to_flags_bitshift(offset);
|
||||
let mut flagsval =
|
||||
LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
|
||||
flagsval &= !(((1
|
||||
<< pg_constants::MXACT_MEMBER_BITS_PER_XACT)
|
||||
- 1)
|
||||
<< bshift);
|
||||
flagsval |= xlrec.members[i as usize].status << bshift;
|
||||
LittleEndian::write_u32(
|
||||
&mut page[flagsoff..flagsoff + 4],
|
||||
flagsval,
|
||||
);
|
||||
LittleEndian::write_u32(
|
||||
&mut page[memberoff..memberoff + 4],
|
||||
xlrec.members[i as usize].xid,
|
||||
);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Multixact offsets SLRU
|
||||
let offs = (xlrec.mid
|
||||
% pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
|
||||
* 4) as usize;
|
||||
LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
|
||||
}
|
||||
} else {
|
||||
panic!();
|
||||
}
|
||||
} else {
|
||||
panic!();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
apply_result = Ok::<Bytes, Error>(page.freeze());
|
||||
}
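
[Editor's note] The XID-to-SLRU arithmetic repeated in the branches above, pulled out as a small standalone sketch; the constants are the same pg_constants values used in the surrounding code, and only the (segno, rpageno) pair matching the requested block gets updated:

// Map a transaction id to the CLOG segment and page-in-segment holding its status bits.
fn clog_location(xid: u32) -> (u32, u32) {
    let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
    let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
    let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
    (segno, rpageno)
}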
|
||||
|
||||
let duration = start.elapsed();
|
||||
|
||||
@@ -270,13 +414,13 @@ impl PostgresRedoManager {
|
||||
|
||||
debug!(
|
||||
"applied {} WAL records in {} ms to reconstruct page image at LSN {}",
|
||||
request.records.len(),
|
||||
nrecords,
|
||||
duration.as_millis(),
|
||||
request.lsn
|
||||
lsn
|
||||
);
|
||||
|
||||
if let Err(e) = apply_result {
|
||||
error!("could not apply WAL records: {:#}", e);
|
||||
error!("could not apply WAL records: {}", e);
|
||||
result = Err(WalRedoError::IoError(e));
|
||||
} else {
|
||||
let img = apply_result.unwrap();
|
||||
@@ -314,7 +458,7 @@ impl PostgresRedoProcess {
|
||||
if datadir.exists() {
|
||||
info!("directory {:?} exists, removing", &datadir);
|
||||
if let Err(e) = fs::remove_dir_all(&datadir) {
|
||||
error!("could not remove old wal-redo-datadir: {:#}", e);
|
||||
error!("could not remove old wal-redo-datadir: {:?}", e);
|
||||
}
|
||||
}
|
||||
info!("running initdb in {:?}", datadir.display());
|
||||
@@ -397,11 +541,12 @@ impl PostgresRedoProcess {
|
||||
async fn apply_wal_records(
|
||||
&mut self,
|
||||
tag: BufferTag,
|
||||
base_img: &Option<Bytes>,
|
||||
base_img: Option<Bytes>,
|
||||
records: &[WALRecord],
|
||||
) -> Result<Bytes, std::io::Error> {
|
||||
let stdin = &mut self.stdin;
|
||||
let stdout = &mut self.stdout;
|
||||
// Buffer the writes to avoid a lot of small syscalls.
|
||||
let mut stdin = tokio::io::BufWriter::new(&mut self.stdin);
|
||||
|
||||
// We do three things simultaneously: send the old base image and WAL records to
|
||||
// the child process's stdin, read the result from child's stdout, and forward any logging
|
||||
@@ -411,47 +556,58 @@ impl PostgresRedoProcess {
|
||||
// 'f_stdout' below reads the result back. And 'f_stderr', which was spawned into the
|
||||
// tokio runtime in the 'launch' function already, forwards the logging.
|
||||
let f_stdin = async {
|
||||
let mut capacity = 1 + BEGIN_REDO_MSG_LEN;
|
||||
// Send base image, if any. (If the record initializes the page, previous page
|
||||
// version is not needed.)
|
||||
timeout(
|
||||
TIMEOUT,
|
||||
stdin.write_all(&build_begin_redo_for_block_msg(tag)),
|
||||
)
|
||||
.await??;
|
||||
if base_img.is_some() {
|
||||
capacity += 1 + PUSH_PAGE_MSG_LEN;
|
||||
}
|
||||
capacity += (1 + APPLY_MSG_HEADER_LEN) * records.len();
|
||||
capacity += records.iter().map(|rec| rec.rec.len()).sum::<usize>();
|
||||
capacity += 1 + GET_PAGE_MSG_LEN;
|
||||
|
||||
let mut buf = BytesMut::with_capacity(capacity);
|
||||
|
||||
build_begin_redo_for_block_msg(&mut buf, tag);
|
||||
|
||||
if let Some(base_img) = base_img.as_ref() {
|
||||
build_push_page_msg(&mut buf, tag, base_img);
|
||||
timeout(
|
||||
TIMEOUT,
|
||||
stdin.write_all(&build_push_page_msg(tag, base_img.unwrap())),
|
||||
)
|
||||
.await??;
|
||||
}
|
||||
|
||||
for record in records {
|
||||
build_apply_record_msg(&mut buf, record.lsn, &record.rec);
|
||||
// Send WAL records.
|
||||
for rec in records.iter() {
|
||||
let r = rec.clone();
|
||||
|
||||
WAL_REDO_RECORD_COUNTER.inc();
|
||||
|
||||
stdin
|
||||
.write_all(&build_apply_record_msg(r.lsn, r.rec))
|
||||
.await?;
|
||||
|
||||
//debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
|
||||
// r.lsn >> 32, r.lsn & 0xffff_ffff);
|
||||
}
|
||||
//debug!("sent {} WAL records to wal redo postgres process ({:X}/{:X}",
|
||||
// records.len(), lsn >> 32, lsn & 0xffff_ffff);
|
||||
|
||||
build_get_page_msg(&mut buf, tag);
|
||||
|
||||
debug_assert_eq!(capacity, buf.len());
|
||||
|
||||
timeout(TIMEOUT, stdin.write_all(&buf)).await??;
|
||||
// Send GetPage command to get the result back
|
||||
timeout(TIMEOUT, stdin.write_all(&build_get_page_msg(tag))).await??;
|
||||
timeout(TIMEOUT, stdin.flush()).await??;
|
||||
|
||||
//debug!("sent GetPage for {}", tag.blknum);
|
||||
Ok::<(), Error>(())
|
||||
};
|
||||
|
||||
// Read back new page image
|
||||
let f_stdout = async {
|
||||
let mut buf = vec![0u8; 8192];
|
||||
let mut buf = [0u8; 8192];
|
||||
|
||||
timeout(TIMEOUT, stdout.read_exact(&mut buf)).await??;
|
||||
//debug!("got response for {}", tag.blknum);
|
||||
Ok::<Vec<u8>, Error>(buf)
|
||||
Ok::<[u8; 8192], Error>(buf)
|
||||
};
|
||||
|
||||
let (buf, _) = tokio::try_join!(f_stdout, f_stdin)?;
|
||||
Ok::<Bytes, Error>(Bytes::from(buf))
|
||||
let res = tokio::try_join!(f_stdout, f_stdin)?;
|
||||
|
||||
let buf = res.0;
|
||||
|
||||
Ok::<Bytes, Error>(Bytes::from(std::vec::Vec::from(buf)))
|
||||
}
|
||||
}
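
[Editor's note] The concurrent stdin/stdout handling used by apply_wal_records, reduced to a hedged standalone sketch (names and signature are illustrative): the request is written and the 8192-byte reply is read at the same time via tokio::try_join!, so neither pipe can fill up and stall the wal-redo process:

use tokio::io::{AsyncReadExt, AsyncWriteExt};

// Sketch only: write a request to the child's stdin while concurrently reading
// the fixed-size page image from its stdout.
async fn roundtrip<W, R>(stdin: &mut W, stdout: &mut R, request: &[u8]) -> anyhow::Result<Vec<u8>>
where
    W: AsyncWriteExt + Unpin,
    R: AsyncReadExt + Unpin,
{
    let write = async {
        stdin.write_all(request).await?;
        stdin.flush().await?;
        Ok::<_, anyhow::Error>(())
    };
    let read = async {
        let mut page = vec![0u8; 8192];
        stdout.read_exact(&mut page).await?;
        Ok::<_, anyhow::Error>(page)
    };
    let (page, _) = tokio::try_join!(read, write)?;
    Ok(page)
}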
|
||||
|
||||
@@ -459,189 +615,81 @@ impl PostgresRedoProcess {
|
||||
// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
|
||||
// explanation of the protocol.
|
||||
|
||||
const TAG_LEN: usize = 4 * 4;
|
||||
const PAGE_SIZE: usize = 8192;
|
||||
const BEGIN_REDO_MSG_LEN: usize = 4 + 1 + TAG_LEN;
|
||||
const PUSH_PAGE_MSG_LEN: usize = 4 + 1 + TAG_LEN + PAGE_SIZE;
|
||||
const APPLY_MSG_HEADER_LEN: usize = 4 + 8;
|
||||
const GET_PAGE_MSG_LEN: usize = 4 + 1 + TAG_LEN;
|
||||
fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes {
|
||||
let len = 4 + 1 + 4 * 4;
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
fn build_begin_redo_for_block_msg(buf: &mut BytesMut, tag: BufferTag) {
|
||||
buf.put_u8(b'B');
|
||||
buf.put_u32(BEGIN_REDO_MSG_LEN as u32);
|
||||
|
||||
tag.ser_into(&mut buf.writer())
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
}
|
||||
|
||||
fn build_push_page_msg(buf: &mut BytesMut, tag: BufferTag, base_img: &Bytes) {
|
||||
debug_assert_eq!(base_img.len(), PAGE_SIZE);
|
||||
|
||||
buf.put_u8(b'P');
|
||||
buf.put_u32(PUSH_PAGE_MSG_LEN as u32);
|
||||
tag.ser_into(&mut buf.writer())
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
buf.extend(base_img);
|
||||
}
|
||||
|
||||
fn build_apply_record_msg(buf: &mut BytesMut, endlsn: Lsn, rec: &Bytes) {
|
||||
buf.put_u8(b'A');
|
||||
|
||||
let len = APPLY_MSG_HEADER_LEN + rec.len();
|
||||
buf.put_u32(len as u32);
|
||||
|
||||
buf.put_u64(endlsn.0);
|
||||
buf.extend(rec);
|
||||
}
|
||||
|
||||
fn build_get_page_msg(buf: &mut BytesMut, tag: BufferTag) {
|
||||
buf.put_u8(b'G');
|
||||
buf.put_u32(GET_PAGE_MSG_LEN as u32);
|
||||
tag.ser_into(&mut buf.writer())
|
||||
// FIXME: this is a temporary hack that should go away when we refactor
|
||||
// the postgres protocol serialization + handlers.
|
||||
//
|
||||
// BytesMut is a dynamic growable buffer, used a lot in tokio code but
|
||||
// not in the std library. To write to a BytesMut from a serde serializer,
|
||||
// we need to either:
|
||||
// - pre-allocate the required buffer space. This is annoying because we
|
||||
// shouldn't care what the exact serialized size is-- that's the
|
||||
// serializer's job.
|
||||
// - Or, we need to create a temporary "writer" (which implements the
|
||||
// `Write` trait). It's a bit awkward, because the writer consumes the
|
||||
// underlying BytesMut, and we need to extract it later with
|
||||
// `into_inner`.
|
||||
let mut writer = buf.writer();
|
||||
tag.ser_into(&mut writer)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
let buf = writer.into_inner();
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
fn redo_nonrel(request: &WalRedoRequest) -> Bytes {
|
||||
let rel = request.rel;
|
||||
let blknum = request.blknum;
|
||||
fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes {
|
||||
assert!(base_img.len() == 8192);
|
||||
|
||||
// Non-relational WAL records are handled here, with custom code that has the
|
||||
// same effects as the corresponding Postgres WAL redo function.
|
||||
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
|
||||
let mut page = BytesMut::new();
|
||||
if let Some(fpi) = &request.base_img {
|
||||
// If full-page image is provided, then use it...
|
||||
page.extend_from_slice(&fpi[..]);
|
||||
} else {
|
||||
// otherwise initialize page with zeros
|
||||
page.extend_from_slice(&ZERO_PAGE);
|
||||
}
|
||||
// Apply all collected WAL records
|
||||
for record in &request.records {
|
||||
let mut buf = record.rec.clone();
|
||||
let len = 4 + 1 + 4 * 4 + base_img.len();
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
WAL_REDO_RECORD_COUNTER.inc();
|
||||
buf.put_u8(b'P');
|
||||
buf.put_u32(len as u32);
|
||||
let mut writer = buf.writer();
|
||||
tag.ser_into(&mut writer)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
let mut buf = writer.into_inner();
|
||||
buf.put(base_img);
|
||||
|
||||
// 1. Parse XLogRecord struct
|
||||
// FIXME: refactor to avoid code duplication.
|
||||
let xlogrec = XLogRecord::from_bytes(&mut buf);
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
//move to main data
|
||||
// TODO probably, we should store some records in our special format
|
||||
// to avoid this weird parsing on replay
|
||||
let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
|
||||
if buf.remaining() > skip {
|
||||
buf.advance(skip);
|
||||
}
|
||||
|
||||
if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
|
||||
// Transaction manager stuff
|
||||
let rec_segno = match rel {
|
||||
RelishTag::Slru { slru, segno } => {
|
||||
assert!(
|
||||
slru == SlruKind::Clog,
|
||||
"Not valid XACT relish tag {:?}",
|
||||
rel
|
||||
);
|
||||
segno
|
||||
}
|
||||
_ => panic!("Not valid XACT relish tag {:?}", rel),
|
||||
};
|
||||
let parsed_xact = XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
|
||||
if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
|
||||
|| parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
|
||||
{
|
||||
transaction_id_set_status(
|
||||
parsed_xact.xid,
|
||||
pg_constants::TRANSACTION_STATUS_COMMITTED,
|
||||
&mut page,
|
||||
);
|
||||
for subxact in &parsed_xact.subxacts {
|
||||
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
// only update xids on the requested page
|
||||
if rec_segno == segno && blknum == rpageno {
|
||||
transaction_id_set_status(
|
||||
*subxact,
|
||||
pg_constants::TRANSACTION_STATUS_COMMITTED,
|
||||
&mut page,
|
||||
);
|
||||
}
|
||||
}
|
||||
} else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
|
||||
|| parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
|
||||
{
|
||||
transaction_id_set_status(
|
||||
parsed_xact.xid,
|
||||
pg_constants::TRANSACTION_STATUS_ABORTED,
|
||||
&mut page,
|
||||
);
|
||||
for subxact in &parsed_xact.subxacts {
|
||||
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
// only update xids on the requested page
|
||||
if rec_segno == segno && blknum == rpageno {
|
||||
transaction_id_set_status(
|
||||
*subxact,
|
||||
pg_constants::TRANSACTION_STATUS_ABORTED,
|
||||
&mut page,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
|
||||
// Multixact operations
|
||||
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
|
||||
let xlrec = XlMultiXactCreate::decode(&mut buf);
|
||||
if let RelishTag::Slru {
|
||||
slru,
|
||||
segno: rec_segno,
|
||||
} = rel
|
||||
{
|
||||
if slru == SlruKind::MultiXactMembers {
|
||||
for i in 0..xlrec.nmembers {
|
||||
let pageno = i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
if segno == rec_segno && rpageno == blknum {
|
||||
// update only target block
|
||||
let offset = xlrec.moff + i;
|
||||
let memberoff = mx_offset_to_member_offset(offset);
|
||||
let flagsoff = mx_offset_to_flags_offset(offset);
|
||||
let bshift = mx_offset_to_flags_bitshift(offset);
|
||||
let mut flagsval =
|
||||
LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
|
||||
flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT)
|
||||
- 1)
|
||||
<< bshift);
|
||||
flagsval |= xlrec.members[i as usize].status << bshift;
|
||||
LittleEndian::write_u32(
|
||||
&mut page[flagsoff..flagsoff + 4],
|
||||
flagsval,
|
||||
);
|
||||
LittleEndian::write_u32(
|
||||
&mut page[memberoff..memberoff + 4],
|
||||
xlrec.members[i as usize].xid,
|
||||
);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Multixact offsets SLRU
|
||||
let offs = (xlrec.mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32 * 4)
|
||||
as usize;
|
||||
LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
|
||||
}
|
||||
} else {
|
||||
panic!();
|
||||
}
|
||||
} else {
|
||||
panic!();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
page.freeze()
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
fn build_apply_record_msg(endlsn: Lsn, rec: Bytes) -> Bytes {
|
||||
let len = 4 + 8 + rec.len();
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'A');
|
||||
buf.put_u32(len as u32);
|
||||
buf.put_u64(endlsn.0);
|
||||
buf.put(rec);
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
fn build_get_page_msg(tag: BufferTag) -> Bytes {
|
||||
let len = 4 + 1 + 4 * 4;
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'G');
|
||||
buf.put_u32(len as u32);
|
||||
let mut writer = buf.writer();
|
||||
tag.ser_into(&mut writer)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
let buf = writer.into_inner();
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
}
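
[Editor's note] A hypothetical sketch of how the Bytes-returning builders in this hunk combine into one redo request, following the message order described in the protocol comment above (BeginRedoForBlock, optional PushPage, one ApplyRecord per WAL record, then GetPage); the WALRecord field names are taken from their use elsewhere in this diff:

// Assemble one request; the real caller writes these to the wal-redo
// process's stdin as shown in apply_wal_records.
fn build_redo_request(tag: BufferTag, base_img: Option<Bytes>, records: &[WALRecord]) -> Vec<Bytes> {
    let mut msgs = Vec::new();
    msgs.push(build_begin_redo_for_block_msg(tag)); // 'B'
    if let Some(img) = base_img {
        msgs.push(build_push_page_msg(tag, img)); // 'P'
    }
    for rec in records {
        msgs.push(build_apply_record_msg(rec.lsn, rec.rec.clone())); // 'A'
    }
    msgs.push(build_get_page_msg(tag)); // 'G'
    msgs
}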
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use anyhow::{bail, Context, Result};
|
||||
use anyhow::{bail, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::net::{SocketAddr, ToSocketAddrs};
|
||||
use std::net::{IpAddr, SocketAddr};
|
||||
|
||||
pub struct CPlaneApi {
|
||||
auth_endpoint: &'static str,
|
||||
@@ -8,7 +8,7 @@ pub struct CPlaneApi {
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct DatabaseInfo {
|
||||
pub host: String,
|
||||
pub host: IpAddr, // TODO: allow host name here too
|
||||
pub port: u16,
|
||||
pub dbname: String,
|
||||
pub user: String,
|
||||
@@ -16,13 +16,8 @@ pub struct DatabaseInfo {
|
||||
}
|
||||
|
||||
impl DatabaseInfo {
|
||||
pub fn socket_addr(&self) -> Result<SocketAddr> {
|
||||
let host_port = format!("{}:{}", self.host, self.port);
|
||||
host_port
|
||||
.to_socket_addrs()
|
||||
.with_context(|| format!("cannot resolve {} to SocketAddr", host_port))?
|
||||
.next()
|
||||
.ok_or_else(|| anyhow::Error::msg("cannot resolve at least one SocketAddr"))
|
||||
pub fn socket_addr(&self) -> SocketAddr {
|
||||
SocketAddr::new(self.host, self.port)
|
||||
}
|
||||
|
||||
pub fn conn_string(&self) -> String {
|
||||
@@ -41,14 +36,12 @@ impl CPlaneApi {
|
||||
pub fn authenticate_proxy_request(
|
||||
&self,
|
||||
user: &str,
|
||||
database: &str,
|
||||
md5_response: &[u8],
|
||||
salt: &[u8; 4],
|
||||
) -> Result<DatabaseInfo> {
|
||||
let mut url = reqwest::Url::parse(self.auth_endpoint)?;
|
||||
url.query_pairs_mut()
|
||||
.append_pair("login", user)
|
||||
.append_pair("database", database)
|
||||
.append_pair("md5response", std::str::from_utf8(md5_response)?)
|
||||
.append_pair("salt", &hex::encode(salt));
|
||||
|
||||
|
||||
@@ -176,7 +176,6 @@ impl ProxyConnection {
|
||||
|
||||
match self.cplane.authenticate_proxy_request(
|
||||
self.user.as_str(),
|
||||
self.database.as_str(),
|
||||
md5_response,
|
||||
&self.md5_salt,
|
||||
) {
|
||||
@@ -248,7 +247,7 @@ databases without opening the browser.
|
||||
|
||||
/// Create a TCP connection to a postgres database, authenticate with it, and receive the ReadyForQuery message
|
||||
async fn connect_to_db(db_info: DatabaseInfo) -> anyhow::Result<tokio::net::TcpStream> {
|
||||
let mut socket = tokio::net::TcpStream::connect(db_info.socket_addr()?).await?;
|
||||
let mut socket = tokio::net::TcpStream::connect(db_info.socket_addr()).await?;
|
||||
let config = db_info.conn_string().parse::<tokio_postgres::Config>()?;
|
||||
let _ = config.connect_raw(&mut socket, NoTls).await?;
|
||||
Ok(socket)
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
import subprocess
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
|
||||
import logging
|
||||
import fixtures.log_helper # configures loggers
|
||||
log = logging.getLogger('root')
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -13,7 +16,7 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
zenith_cli.run(["branch", "test_branch_behind", "empty"])
|
||||
|
||||
pgmain = postgres.create_start('test_branch_behind')
|
||||
print("postgres is running on 'test_branch_behind' branch")
|
||||
log.info("postgres is running on 'test_branch_behind' branch")
|
||||
|
||||
main_pg_conn = pgmain.connect()
|
||||
main_cur = main_pg_conn.cursor()
|
||||
@@ -27,7 +30,7 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
''')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_a = main_cur.fetchone()[0]
|
||||
print('LSN after 100 rows: ' + lsn_a)
|
||||
log.info(f'LSN after 100 rows: {lsn_a}')
|
||||
|
||||
# Insert some more rows. (This generates enough WAL to fill a few segments.)
|
||||
main_cur.execute('''
|
||||
@@ -37,7 +40,7 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
''')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_b = main_cur.fetchone()[0]
|
||||
print('LSN after 100100 rows: ' + lsn_b)
|
||||
log.info(f'LSN after 100100 rows: {lsn_b}')
|
||||
|
||||
# Branch at the point where only 100 rows were inserted
|
||||
zenith_cli.run(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a])
|
||||
@@ -52,7 +55,7 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_c = main_cur.fetchone()[0]
|
||||
print('LSN after 200100 rows: ' + lsn_c)
|
||||
log.info(f'LSN after 200100 rows: {lsn_c}')
|
||||
|
||||
# Branch at the point where only 200 rows were inserted
|
||||
zenith_cli.run(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b])
|
||||
@@ -89,4 +92,4 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
try:
|
||||
zenith_cli.run(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"])
|
||||
except subprocess.CalledProcessError:
|
||||
print("Branch creation with pre-initdb LSN failed (as expected)")
|
||||
log.info("Branch creation with pre-initdb LSN failed (as expected)")
|
||||
|
||||
@@ -5,6 +5,10 @@ from contextlib import closing
|
||||
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
|
||||
import logging
|
||||
import fixtures.log_helper # configures loggers
|
||||
log = logging.getLogger('root')
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
@@ -24,7 +28,7 @@ def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
]
|
||||
|
||||
pg = postgres.create_start('test_clog_truncate', config_lines=config)
|
||||
print('postgres is running on test_clog_truncate branch')
|
||||
log.info('postgres is running on test_clog_truncate branch')
|
||||
|
||||
# Install extension containing function needed for test
|
||||
pg.safe_psql('CREATE EXTENSION zenith_test_utils')
|
||||
@@ -33,22 +37,22 @@ def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute('select test_consume_xids(1000*1000*10);')
|
||||
print('xids consumed')
|
||||
log.info('xids consumed')
|
||||
|
||||
# call a checkpoint to trigger TruncateSubtrans
|
||||
cur.execute('CHECKPOINT;')
|
||||
|
||||
# ensure WAL flush
|
||||
cur.execute('select txid_current()')
|
||||
print(cur.fetchone())
|
||||
log.info(cur.fetchone())
|
||||
|
||||
# wait for autovacuum to truncate the pg_xact
|
||||
# XXX Is it worth to add a timeout here?
|
||||
pg_xact_0000_path = os.path.join(pg.pg_xact_dir_path(), '0000')
|
||||
print("pg_xact_0000_path = " + pg_xact_0000_path)
|
||||
log.info(f"pg_xact_0000_path = {pg_xact_0000_path}")
|
||||
|
||||
while os.path.isfile(pg_xact_0000_path):
|
||||
print("file exists. wait for truncation. " "pg_xact_0000_path = " + pg_xact_0000_path)
|
||||
log.info(f"file exists. wait for truncation. " "pg_xact_0000_path = {pg_xact_0000_path}")
|
||||
time.sleep(5)
|
||||
|
||||
# checkpoint to advance latest lsn
|
||||
@@ -59,14 +63,14 @@ def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
|
||||
lsn_after_truncation = cur.fetchone()[0]
|
||||
|
||||
# create new branch after clog truncation and start a compute node on it
|
||||
print('create branch at lsn_after_truncation ' + lsn_after_truncation)
|
||||
log.info(f'create branch at lsn_after_truncation {lsn_after_truncation}')
|
||||
zenith_cli.run(
|
||||
["branch", "test_clog_truncate_new", "test_clog_truncate@" + lsn_after_truncation])
|
||||
|
||||
pg2 = postgres.create_start('test_clog_truncate_new')
|
||||
print('postgres is running on test_clog_truncate_new branch')
|
||||
log.info('postgres is running on test_clog_truncate_new branch')
|
||||
|
||||
# check that new node doesn't contain truncated segment
|
||||
pg_xact_0000_path_new = os.path.join(pg2.pg_xact_dir_path(), '0000')
|
||||
print("pg_xact_0000_path_new = " + pg_xact_0000_path_new)
|
||||
log.info("pg_xact_0000_path_new = " + pg_xact_0000_path_new)
|
||||
assert os.path.isfile(pg_xact_0000_path_new) is False
|
||||
|
||||
@@ -2,6 +2,10 @@ from contextlib import closing
|
||||
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
|
||||
import logging
|
||||
import fixtures.log_helper # configures loggers
|
||||
log = logging.getLogger('root')
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
@@ -14,7 +18,7 @@ def test_config(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFact
|
||||
|
||||
# change config
|
||||
pg = postgres.create_start('test_config', config_lines=['log_min_messages=debug1'])
|
||||
print('postgres is running on test_config branch')
|
||||
log.info('postgres is running on test_config branch')
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
|
||||
@@ -4,6 +4,10 @@ import pathlib
|
||||
from contextlib import closing
|
||||
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory, ZenithCli, check_restored_datadir_content
|
||||
|
||||
import logging
|
||||
import fixtures.log_helper # configures loggers
|
||||
log = logging.getLogger('root')
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
@@ -19,7 +23,7 @@ def test_createdb(
|
||||
zenith_cli.run(["branch", "test_createdb", "empty"])
|
||||
|
||||
pg = postgres.create_start('test_createdb')
|
||||
print("postgres is running on 'test_createdb' branch")
|
||||
log.info("postgres is running on 'test_createdb' branch")
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
@@ -53,7 +57,7 @@ def test_dropdb(
|
||||
zenith_cli.run(["branch", "test_dropdb", "empty"])
|
||||
|
||||
pg = postgres.create_start('test_dropdb')
|
||||
print("postgres is running on 'test_dropdb' branch")
|
||||
log.info("postgres is running on 'test_dropdb' branch")
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
@@ -88,13 +92,13 @@ def test_dropdb(
|
||||
|
||||
# Test that database subdir exists on the branch before drop
|
||||
dbpath = pathlib.Path(pg_before.pgdata_dir) / 'base' / str(dboid)
|
||||
print(dbpath)
|
||||
log.info(dbpath)
|
||||
|
||||
assert os.path.isdir(dbpath) == True
|
||||
|
||||
# Test that database subdir doesn't exist on the branch after drop
|
||||
dbpath = pathlib.Path(pg_after.pgdata_dir) / 'base' / str(dboid)
|
||||
print(dbpath)
|
||||
log.info(dbpath)
|
||||
|
||||
assert os.path.isdir(dbpath) == False
|
||||
|
||||
|
||||
@@ -2,6 +2,10 @@ from contextlib import closing
|
||||
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
|
||||
import logging
|
||||
import fixtures.log_helper # configures loggers
|
||||
log = logging.getLogger('root')
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
@@ -12,7 +16,7 @@ def test_createuser(zenith_cli, pageserver: ZenithPageserver, postgres: Postgres
|
||||
zenith_cli.run(["branch", "test_createuser", "empty"])
|
||||
|
||||
pg = postgres.create_start('test_createuser')
|
||||
print("postgres is running on 'test_createuser' branch")
|
||||
log.info("postgres is running on 'test_createuser' branch")
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
|
||||
@@ -1,5 +1,9 @@
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content
|
||||
|
||||
import logging
|
||||
import fixtures.log_helper # configures loggers
|
||||
log = logging.getLogger('root')
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
@@ -15,7 +19,7 @@ def test_multixact(pageserver: ZenithPageserver, postgres: PostgresFactory,
|
||||
zenith_cli.run(["branch", "test_multixact", "empty"])
|
||||
pg = postgres.create_start('test_multixact')
|
||||
|
||||
print("postgres is running on 'test_multixact' branch")
|
||||
log.info("postgres is running on 'test_multixact' branch")
|
||||
pg_conn = pg.connect()
|
||||
cur = pg_conn.cursor()
|
||||
|
||||
@@ -55,7 +59,7 @@ def test_multixact(pageserver: ZenithPageserver, postgres: PostgresFactory,
|
||||
zenith_cli.run(["branch", "test_multixact_new", "test_multixact@" + lsn])
|
||||
pg_new = postgres.create_start('test_multixact_new')
|
||||
|
||||
print("postgres is running on 'test_multixact_new' branch")
|
||||
log.info("postgres is running on 'test_multixact_new' branch")
|
||||
pg_new_conn = pg_new.connect()
|
||||
cur_new = pg_new_conn.cursor()
|
||||
|
||||
|
||||
@@ -2,6 +2,10 @@ from contextlib import closing
|
||||
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
|
||||
import logging
|
||||
import fixtures.log_helper # configures loggers
|
||||
log = logging.getLogger('root')
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
#
|
||||
@@ -18,7 +22,7 @@ def test_old_request_lsn(zenith_cli, pageserver: ZenithPageserver, postgres: Pos
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_old_request_lsn", "empty"])
|
||||
pg = postgres.create_start('test_old_request_lsn')
|
||||
print('postgres is running on test_old_request_lsn branch')
|
||||
log.info('postgres is running on test_old_request_lsn branch')
|
||||
|
||||
pg_conn = pg.connect()
|
||||
cur = pg_conn.cursor()
|
||||
@@ -46,7 +50,7 @@ def test_old_request_lsn(zenith_cli, pageserver: ZenithPageserver, postgres: Pos
|
||||
from pg_settings where name = 'shared_buffers'
|
||||
''')
|
||||
row = cur.fetchone()
|
||||
print(f'shared_buffers is {row[0]}, table size {row[1]}');
|
||||
log.info(f'shared_buffers is {row[0]}, table size {row[1]}');
|
||||
assert int(row[0]) < int(row[1])
|
||||
|
||||
cur.execute('VACUUM foo');
|
||||
|
||||
@@ -6,6 +6,10 @@ from contextlib import closing
|
||||
from multiprocessing import Process, Value
|
||||
from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory
|
||||
|
||||
import logging
|
||||
import fixtures.log_helper # configures loggers
|
||||
log = logging.getLogger('root')
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
# Check that dead minority doesn't prevent the commits: execute insert n_inserts
|
||||
@@ -40,7 +44,7 @@ def test_pageserver_restart(zenith_cli, pageserver: ZenithPageserver, postgres:
|
||||
from pg_settings where name = 'shared_buffers'
|
||||
''')
|
||||
row = cur.fetchone()
|
||||
print("shared_buffers is {}, table size {}", row[0], row[1]);
|
||||
log.info(f"shared_buffers is {row[0]}, table size {row[1]}");
|
||||
assert int(row[0]) < int(row[1])
|
||||
|
||||
# Stop and restart pageserver. This is a more or less graceful shutdown, although
|
||||
|
||||
@@ -1,5 +1,9 @@
|
||||
from fixtures.zenith_fixtures import PostgresFactory
|
||||
|
||||
import logging
|
||||
import fixtures.log_helper # configures loggers
|
||||
log = logging.getLogger('root')
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
@@ -8,7 +12,7 @@ def test_pgbench(postgres: PostgresFactory, pg_bin, zenith_cli):
|
||||
zenith_cli.run(["branch", "test_pgbench", "empty"])
|
||||
|
||||
pg = postgres.create_start('test_pgbench')
|
||||
print("postgres is running on 'test_pgbench' branch")
|
||||
log.info("postgres is running on 'test_pgbench' branch")
|
||||
|
||||
connstr = pg.connstr()
|
||||
|
||||
|
||||
@@ -3,13 +3,20 @@ import pytest
|
||||
from contextlib import closing
|
||||
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory
|
||||
|
||||
import logging
|
||||
import fixtures.log_helper # configures loggers
|
||||
log = logging.getLogger('root')
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
#
|
||||
# Test restarting and recreating a postgres instance
|
||||
#
|
||||
@pytest.mark.parametrize('with_wal_acceptors', [False, True])
|
||||
# XXX: with_wal_acceptors=True fails now, would be fixed with
|
||||
# `postgres --sync-walkeepers` patches.
|
||||
#
|
||||
@pytest.mark.parametrize('with_wal_acceptors', [False])
|
||||
def test_restart_compute(
|
||||
zenith_cli,
|
||||
pageserver: ZenithPageserver,
|
||||
@@ -27,7 +34,7 @@ def test_restart_compute(
|
||||
|
||||
pg = postgres.create_start('test_restart_compute',
|
||||
wal_acceptors=wal_acceptor_connstrs)
|
||||
print("postgres is running on 'test_restart_compute' branch")
|
||||
log.info("postgres is running on 'test_restart_compute' branch")
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
@@ -36,7 +43,7 @@ def test_restart_compute(
|
||||
cur.execute('SELECT sum(key) FROM t')
|
||||
r = cur.fetchone()
|
||||
assert r == (5000050000, )
|
||||
print("res = ", r)
|
||||
log.info(f"res = {r}")
|
||||
|
||||
# Remove data directory and restart
|
||||
pg.stop_and_destroy().create_start('test_restart_compute',
|
||||
@@ -49,7 +56,7 @@ def test_restart_compute(
|
||||
cur.execute('SELECT sum(key) FROM t')
|
||||
r = cur.fetchone()
|
||||
assert r == (5000050000, )
|
||||
print("res = ", r)
|
||||
log.info(f"res = {r}")
|
||||
|
||||
# Insert another row
|
||||
cur.execute("INSERT INTO t VALUES (100001, 'payload2')")
|
||||
@@ -57,7 +64,7 @@ def test_restart_compute(
|
||||
|
||||
r = cur.fetchone()
|
||||
assert r == (100001, )
|
||||
print("res = ", r)
|
||||
log.info(f"res = {r}")
|
||||
|
||||
# Again remove data directory and restart
|
||||
pg.stop_and_destroy().create_start('test_restart_compute',
|
||||
@@ -72,7 +79,7 @@ def test_restart_compute(
|
||||
|
||||
r = cur.fetchone()
|
||||
assert r == (100001, )
|
||||
print("res = ", r)
|
||||
log.info(f"res = {r}")
|
||||
|
||||
# And again remove data directory and restart
|
||||
pg.stop_and_destroy().create_start('test_restart_compute',
|
||||
@@ -85,4 +92,4 @@ def test_restart_compute(
|
||||
|
||||
r = cur.fetchone()
|
||||
assert r == (100001, )
|
||||
print("res = ", r)
|
||||
log.info(f"res = {r}")
|
||||
|
||||
@@ -2,12 +2,16 @@ from contextlib import closing
|
||||
import psycopg2.extras
|
||||
import time;
|
||||
|
||||
import logging
|
||||
import fixtures.log_helper # configures loggers
|
||||
log = logging.getLogger('root')
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
def print_gc_result(row):
|
||||
print("GC duration {elapsed} ms".format_map(row));
|
||||
print(" REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, needed_as_tombstone {layer_relfiles_needed_as_tombstone}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}".format_map(row))
|
||||
print(" NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, needed_as_tombstone {layer_nonrelfiles_needed_as_tombstone}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}".format_map(row))
|
||||
log.info("GC duration {elapsed} ms".format_map(row));
|
||||
log.info(" REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, needed_as_tombstone {layer_relfiles_needed_as_tombstone}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}".format_map(row))
|
||||
log.info(" NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, needed_as_tombstone {layer_nonrelfiles_needed_as_tombstone}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}".format_map(row))
|
||||
|
||||
|
||||
#
|
||||
@@ -35,7 +39,7 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
|
||||
|
||||
cur.execute("select relfilenode from pg_class where oid = 'foo'::regclass");
|
||||
row = cur.fetchone();
|
||||
print("relfilenode is {}", row[0]);
|
||||
log.info(f"relfilenode is {row[0]}");
|
||||
|
||||
# Run GC, to clear out any garbage left behind in the catalogs by
|
||||
# the CREATE TABLE command. We want to have a clean slate with no garbage
|
||||
@@ -50,7 +54,7 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
|
||||
# update to confuse our numbers either.
|
||||
cur.execute("DELETE FROM foo")
|
||||
|
||||
print("Running GC before test")
|
||||
log.info("Running GC before test")
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row);
|
||||
@@ -61,7 +65,7 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
|
||||
# Insert a row and run GC. Checkpoint should freeze the layer
|
||||
# so that there is only the most recent image layer left for the rel,
|
||||
# removing the old image and delta layer.
|
||||
print("Inserting one row and running GC")
|
||||
log.info("Inserting one row and running GC")
|
||||
cur.execute("INSERT INTO foo VALUES (1)")
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
@@ -73,7 +77,7 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
|
||||
# Insert two more rows and run GC.
|
||||
# This should create new image and delta layer file with the new contents, and
|
||||
# then remove the old one image and the just-created delta layer.
|
||||
print("Inserting two more rows and running GC")
|
||||
log.info("Inserting two more rows and running GC")
|
||||
cur.execute("INSERT INTO foo VALUES (2)")
|
||||
cur.execute("INSERT INTO foo VALUES (3)")
|
||||
|
||||
@@ -85,7 +89,7 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
|
||||
assert row['layer_relfiles_dropped'] == 0
|
||||
|
||||
# Do it again. Should again create two new layer files and remove old ones.
|
||||
print("Inserting two more rows and running GC")
|
||||
log.info("Inserting two more rows and running GC")
|
||||
cur.execute("INSERT INTO foo VALUES (2)")
|
||||
cur.execute("INSERT INTO foo VALUES (3)")
|
||||
|
||||
@@ -97,7 +101,7 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
|
||||
assert row['layer_relfiles_dropped'] == 0
|
||||
|
||||
# Run GC again, with no changes in the database. Should not remove anything.
|
||||
print("Run GC again, with nothing to do")
|
||||
log.info("Run GC again, with nothing to do")
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row);
|
||||
@@ -108,7 +112,7 @@ def test_layerfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
|
||||
#
|
||||
# Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage
|
||||
#
|
||||
print("Drop table and run GC again");
|
||||
log.info("Drop table and run GC again");
|
||||
cur.execute("DROP TABLE foo")
|
||||
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
|
||||
@@ -3,6 +3,9 @@ from uuid import UUID
|
||||
import psycopg2.extras
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
|
||||
import logging
|
||||
import fixtures.log_helper # configures loggers
|
||||
log = logging.getLogger('root')
|
||||
|
||||
def test_timeline_size(
|
||||
zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin
|
||||
@@ -15,7 +18,7 @@ def test_timeline_size(
|
||||
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
|
||||
|
||||
pgmain = postgres.create_start("test_timeline_size")
|
||||
print("postgres is running on 'test_timeline_size' branch")
|
||||
log.info("postgres is running on 'test_timeline_size' branch")
|
||||
|
||||
with closing(pgmain.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
|
||||
@@ -2,6 +2,9 @@ import os
|
||||
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, PgBin
|
||||
|
||||
import logging
|
||||
import fixtures.log_helper # configures loggers
|
||||
log = logging.getLogger('root')
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -13,7 +16,7 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
|
||||
zenith_cli.run(["branch", "test_twophase", "empty"])
|
||||
|
||||
pg = postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5'])
|
||||
print("postgres is running on 'test_twophase' branch")
|
||||
log.info("postgres is running on 'test_twophase' branch")
|
||||
|
||||
conn = pg.connect()
|
||||
cur = conn.cursor()
|
||||
@@ -45,7 +48,7 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
|
||||
cur.execute('CHECKPOINT')
|
||||
|
||||
twophase_files = os.listdir(pg.pg_twophase_dir_path())
|
||||
print(twophase_files)
|
||||
log.info(twophase_files)
|
||||
assert len(twophase_files) == 4
|
||||
|
||||
cur.execute("COMMIT PREPARED 'insert_three'")
|
||||
@@ -53,7 +56,7 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
|
||||
cur.execute('CHECKPOINT')
|
||||
|
||||
twophase_files = os.listdir(pg.pg_twophase_dir_path())
|
||||
print(twophase_files)
|
||||
log.info(twophase_files)
|
||||
assert len(twophase_files) == 2
|
||||
|
||||
# Create a branch with the transaction in prepared state
|
||||
@@ -67,7 +70,7 @@ def test_twophase(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFa
|
||||
|
||||
# Check that we restored only needed twophase files
|
||||
twophase_files2 = os.listdir(pg2.pg_twophase_dir_path())
|
||||
print(twophase_files2)
|
||||
log.info(twophase_files2)
|
||||
assert twophase_files2.sort() == twophase_files.sort()
|
||||
|
||||
conn2 = pg2.connect()
|
||||
|
||||
@@ -1,5 +1,9 @@
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
|
||||
import logging
|
||||
import fixtures.log_helper # configures loggers
|
||||
log = logging.getLogger('root')
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
#
|
||||
@@ -11,7 +15,7 @@ def test_vm_bit_clear(pageserver: ZenithPageserver, postgres: PostgresFactory, p
|
||||
zenith_cli.run(["branch", "test_vm_bit_clear", "empty"])
|
||||
pg = postgres.create_start('test_vm_bit_clear')
|
||||
|
||||
print("postgres is running on 'test_vm_bit_clear' branch")
|
||||
log.info("postgres is running on 'test_vm_bit_clear' branch")
|
||||
pg_conn = pg.connect()
|
||||
cur = pg_conn.cursor()
|
||||
|
||||
@@ -63,7 +67,7 @@ def test_vm_bit_clear(pageserver: ZenithPageserver, postgres: PostgresFactory, p
|
||||
# server at the right point-in-time avoids that full-page image.
|
||||
pg_new = postgres.create_start('test_vm_bit_clear_new')
|
||||
|
||||
print("postgres is running on 'test_vm_bit_clear_new' branch")
|
||||
log.info("postgres is running on 'test_vm_bit_clear_new' branch")
|
||||
pg_new_conn = pg_new.connect()
|
||||
cur_new = pg_new_conn.cursor()
|
||||
|
||||
|
||||
@@ -10,6 +10,10 @@ from multiprocessing import Process, Value
|
||||
from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory, PgBin
|
||||
from fixtures.utils import lsn_to_hex, mkdir_if_needed
|
||||
|
||||
import logging
|
||||
import fixtures.log_helper # configures loggers
|
||||
log = logging.getLogger('root')
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
@@ -284,10 +288,10 @@ def test_sync_safekeepers(repo_dir: str, pg_bin: PgBin, wa_factory: WalAcceptorF
|
||||
)
|
||||
lsn_hex = lsn_to_hex(res["inserted_wal"]["end_lsn"])
|
||||
lsn_after_append.append(lsn_hex)
|
||||
print(f"safekeeper[{i}] lsn after append: {lsn_hex}")
|
||||
log.info(f"safekeeper[{i}] lsn after append: {lsn_hex}")
|
||||
|
||||
# run sync safekeepers
|
||||
lsn_after_sync = pg.sync_safekeepers()
|
||||
print(f"lsn after sync = {lsn_after_sync}")
|
||||
log.info(f"lsn after sync = {lsn_after_sync}")
|
||||
|
||||
assert all(lsn_after_sync == lsn for lsn in lsn_after_append)
|
||||
|
||||
@@ -4,7 +4,10 @@ import random

from fixtures.zenith_fixtures import WalAcceptor, WalAcceptorFactory, ZenithPageserver, PostgresFactory, Postgres
from typing import List
-from fixtures.utils import debug_print

+import logging
+import fixtures.log_helper  # configures loggers
+log = logging.getLogger('root.wal_acceptor_async')
+
pytest_plugins = ("fixtures.zenith_fixtures")

@@ -63,18 +66,18 @@ class WorkerStats(object):
        self.counters[worker_id] += 1

    def check_progress(self):
-        debug_print("Workers progress: {}".format(self.counters))
+        log.debug("Workers progress: {}".format(self.counters))

        # every worker should finish at least one tx
        assert all(cnt > 0 for cnt in self.counters)

        progress = sum(self.counters)
-        print('All workers made {} transactions'.format(progress))
+        log.info('All workers made {} transactions'.format(progress))


async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accounts, max_transfer):
    pg_conn = await pg.connect_async()
-    debug_print('Started worker {}'.format(worker_id))
+    log.debug('Started worker {}'.format(worker_id))

    while stats.running:
        from_uid = random.randint(0, n_accounts - 1)
@@ -84,9 +87,9 @@ async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accou
        await bank_transfer(pg_conn, from_uid, to_uid, amount)
        stats.inc_progress(worker_id)

-        debug_print('Executed transfer({}) {} => {}'.format(amount, from_uid, to_uid))
+        log.debug('Executed transfer({}) {} => {}'.format(amount, from_uid, to_uid))

-    debug_print('Finished worker {}'.format(worker_id))
+    log.debug('Finished worker {}'.format(worker_id))

    await pg_conn.close()

@@ -134,7 +137,7 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[WalAcceptor], n_

        victim.start()

-    print('Iterations are finished, exiting coroutines...')
+    log.info('Iterations are finished, exiting coroutines...')
    stats.running = False
    # await all workers
    await asyncio.gather(*workers)
@@ -3,6 +3,10 @@ import os
from fixtures.utils import mkdir_if_needed
from fixtures.zenith_fixtures import PageserverPort, PostgresFactory, check_restored_datadir_content

+import logging
+import fixtures.log_helper  # configures loggers
+log = logging.getLogger('root')
+
pytest_plugins = ("fixtures.zenith_fixtures")

@@ -38,7 +42,7 @@ def test_zenith_regress(postgres: PostgresFactory, pg_bin, zenith_cli, test_outp
        '--inputdir={}'.format(src_path),
    ]

-    print(pg_regress_command)
+    log.info(pg_regress_command)
    env = {
        'PGPORT': str(pg.port),
        'PGUSER': pg.username,
@@ -1,5 +1,3 @@
-from pprint import pprint
-
import os
import re
import timeit
@@ -136,8 +134,7 @@ class ZenithBenchmarker:
        # The metric should be an integer, as it's a number of bytes. But in general
        # all prometheus metrics are floats. So to be pedantic, read it as a float
        # and round to integer.
-        matches = re.search(r'^pageserver_disk_io_bytes{io_operation="write"} (\S+)$', all_metrics,
-                            re.MULTILINE)
+        matches = re.search(r'pageserver_disk_io_bytes{io_operation="write"} (\S+)', all_metrics)
        return int(round(float(matches.group(1))))

    @contextmanager
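The hunk above tightens how the benchmark fixture pulls the disk-write counter out of the pageserver's Prometheus text output. A minimal sketch of that parsing, using a made-up metrics payload (the real string comes from the pageserver's metrics endpoint):

```python
import re

# Hypothetical Prometheus text-format payload, for illustration only.
all_metrics = (
    'pageserver_disk_io_bytes{io_operation="read"} 1024\n'
    'pageserver_disk_io_bytes{io_operation="write"} 4096.0\n'
)

# Same pattern as the updated fixture: no ^...$ anchors, so re.MULTILINE is not needed.
matches = re.search(r'pageserver_disk_io_bytes{io_operation="write"} (\S+)', all_metrics)
assert matches is not None

# Prometheus exposes everything as a float; round back to an integer byte count.
written_bytes = int(round(float(matches.group(1))))
print(written_bytes)  # 4096
```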
37
test_runner/fixtures/log_helper.py
Normal file
@@ -0,0 +1,37 @@
+import logging
+import logging.config
+import time
+
+# timestamp in UTC+-00:00 aka GMT
+class UTCFormatter(logging.Formatter):
+    converter = time.gmtime
+
+LOGGING = {
+    "version": 1,
+    "formatters": {
+        "defaultFormatter": {
+            "()": UTCFormatter,
+            "format": "%(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
+            "datefmt": "%Y-%m-%d %H:%M:%S"
+        }
+    },
+    "handlers": {
+        "console": {
+            "class": "logging.StreamHandler",
+            "formatter": "defaultFormatter",
+            "level": "DEBUG",
+            "stream": "ext://sys.stderr"
+        }
+    },
+    "loggers": {
+        "root": {
+            "level": "DEBUG",
+            "handlers": ["console"]
+        },
+        "root.wal_acceptor_async": {
+            "level": "INFO"  # lot of logs on DEBUG level
+        }
+    }
+}
+
+logging.config.dictConfig(LOGGING)
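The new `fixtures/log_helper.py` above is imported purely for its side effect: it calls `logging.config.dictConfig()` at import time. A minimal sketch of how a test module picks it up, mirroring the import pattern added throughout this diff (the test body is illustrative):

```python
import logging

import fixtures.log_helper  # noqa: F401 -- importing this configures the loggers
log = logging.getLogger('root')


def test_logging_sketch():
    # Goes to stderr with a UTC timestamp, level, and file:line prefix.
    log.info('messages from tests are captured by pytest and shown on failure')
    # 'root' is at DEBUG, while 'root.wal_acceptor_async' is capped at INFO above.
    log.debug('debug output is enabled for the root logger')
```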
@@ -3,6 +3,9 @@ import subprocess

from typing import Any, List

+import logging
+import fixtures.log_helper  # configures loggers
+log = logging.getLogger('root')

def get_self_dir() -> str:
    """ Get the path to the directory where this script lives. """
@@ -39,7 +42,7 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:

    with open(stdout_filename, 'w') as stdout_f:
        with open(stderr_filename, 'w') as stderr_f:
-            print('(capturing output to "{}.stdout")'.format(base))
+            log.info('(capturing output to "{}.stdout")'.format(base))
            subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)

    return basepath
@@ -58,14 +61,6 @@ def global_counter() -> int:
    _global_counter += 1
    return _global_counter

-def debug_print(*args, **kwargs) -> None:
-    """ Print to the console if TEST_DEBUG_PRINT is set in env.
-
-    All parameters are passed to print().
-    """
-    if os.environ.get('TEST_DEBUG_PRINT') is not None:
-        print(*args, **kwargs)
-
def lsn_to_hex(num: int) -> str:
    """ Convert lsn from int to standard hex notation. """
    return "{:X}/{:X}".format(num >> 32, num & 0xffffffff)
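`lsn_to_hex` (kept above) splits a 64-bit LSN into the usual `high/low` hex pair that Postgres prints. A quick worked example with arbitrary values:

```python
def lsn_to_hex(num: int) -> str:
    """ Convert lsn from int to standard hex notation. """
    return "{:X}/{:X}".format(num >> 32, num & 0xffffffff)

# Low 32 bits only: the high word is 0.
assert lsn_to_hex(0x16B9188) == "0/16B9188"
# With a non-zero high word: 2 in the upper half, 0x5000028 in the lower half.
assert lsn_to_hex((0x2 << 32) | 0x5000028) == "2/5000028"
```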
@@ -27,6 +27,11 @@ from typing_extensions import Literal
import requests

from .utils import (get_self_dir, mkdir_if_needed, subprocess_capture)

+import logging
+import fixtures.log_helper  # configures loggers
+log = logging.getLogger('root')
+
"""
This file contains pytest fixtures. A fixture is a test resource that can be
summoned by placing its name in the test's arguments.
@@ -71,9 +76,7 @@ def pytest_configure(config):
        # This is bad; we don't want any of those processes polluting the
        # result of the test.
        # NOTE this shows as an internal pytest error, there might be a better way
-        raise Exception(
-            'Found interfering processes running. Stop all Zenith pageservers, nodes, WALs, as well as stand-alone Postgres.'
-        )
+        raise Exception('found interfering processes running')


def determine_scope(fixture_name: str, config: Any) -> str:
@@ -188,13 +191,13 @@ class ZenithCli:

        >>> result = zenith_cli.run(...)
        >>> assert result.stderr == ""
-        >>> print(result.stdout)
+        >>> log.info(result.stdout)
        """

        assert type(arguments) == list

        args = [self.bin_zenith] + arguments
-        print('Running command "{}"'.format(' '.join(args)))
+        log.info('Running command "{}"'.format(' '.join(args)))

        # Interceipt CalledProcessError and print more info
        try:
@@ -211,7 +214,7 @@ class ZenithCli:
            stdout: {exc.stdout}
            stderr: {exc.stderr}
            """
-            print(msg)
+            log.info(msg)

            raise Exception(msg) from exc

@@ -419,7 +422,7 @@ class ZenithPageserver(PgProtocol):
def pageserver_port(port_distributor: PortDistributor) -> PageserverPort:
    pg = port_distributor.get_port()
    http = port_distributor.get_port()
-    print(f"pageserver_port: pg={pg} http={http}")
+    log.info(f"pageserver_port: pg={pg} http={http}")
    return PageserverPort(pg=pg, http=http)

@@ -443,7 +446,7 @@ def pageserver(zenith_cli: ZenithCli, repo_dir: str, pageserver_port: Pageserver
    yield ps

    # After the yield comes any cleanup code we need.
-    print('Starting pageserver cleanup')
+    log.info('Starting pageserver cleanup')
    ps.stop()

class PgBin:
@@ -481,7 +484,7 @@ class PgBin:
        """

        self._fixpath(command)
-        print('Running command "{}"'.format(' '.join(command)))
+        log.info('Running command "{}"'.format(' '.join(command)))
        env = self._build_env(env)
        subprocess.run(command, env=env, cwd=cwd, check=True)

@@ -498,7 +501,7 @@ class PgBin:
        """

        self._fixpath(command)
-        print('Running command "{}"'.format(' '.join(command)))
+        log.info('Running command "{}"'.format(' '.join(command)))
        env = self._build_env(env)
        return subprocess_capture(self.log_dir, command, env=env, cwd=cwd, check=True, **kwargs)

@@ -566,12 +569,12 @@ class Postgres(PgProtocol):

        assert self.branch is not None

-        print(f"Starting postgres on branch {self.branch}")
+        log.info(f"Starting postgres on branch {self.branch}")

        run_result = self.zenith_cli.run(['pg', 'start', self.branch, f'--tenantid={self.tenant_id}', f'--port={self.port}'])
        self.running = True

-        print(f"stdout: {run_result.stdout}")
+        log.info(f"stdout: {run_result.stdout}")

        return self

@@ -786,7 +789,7 @@ def postgres(zenith_cli: ZenithCli, initial_tenant: str, repo_dir: str, pg_bin:
    yield pgfactory

    # After the yield comes any cleanup code we need.
-    print('Starting postgres cleanup')
+    log.info('Starting postgres cleanup')
    pgfactory.stop_all()

def read_pid(path: Path):
@@ -817,7 +820,7 @@ class WalAcceptor:
        # Tell page server it can receive WAL from this WAL safekeeper
        cmd.extend(["--pageserver", f"localhost:{self.pageserver_port}"])
        cmd.extend(["--recall", "1 second"])
-        print('Running command "{}"'.format(' '.join(cmd)))
+        log.info('Running command "{}"'.format(' '.join(cmd)))
        env = {'PAGESERVER_AUTH_TOKEN': self.auth_token} if self.auth_token else None
        subprocess.run(cmd, check=True, env=env)

@@ -846,10 +849,10 @@ class WalAcceptor:
        return pid

    def stop(self) -> 'WalAcceptor':
-        print('Stopping wal acceptor {}'.format(self.num))
+        log.info('Stopping wal acceptor {}'.format(self.num))
        pid = self.get_pid()
        if pid is None:
-            print("Wal acceptor {} is not running".format(self.num))
+            log.info("Wal acceptor {} is not running".format(self.num))
            return self

        try:
@@ -875,10 +878,10 @@ class WalAcceptor:
        conn.autocommit = True
        with conn.cursor() as cur:
            request_json = json.dumps(request)
-            print(f"JSON_CTRL request on port {self.port}: {request_json}")
+            log.info(f"JSON_CTRL request on port {self.port}: {request_json}")
            cur.execute("JSON_CTRL " + request_json)
            all = cur.fetchall()
-            print(f"JSON_CTRL response: {all[0][0]}")
+            log.info(f"JSON_CTRL response: {all[0][0]}")
            return json.loads(all[0][0])

class WalAcceptorFactory:
@@ -936,7 +939,7 @@ def wa_factory(zenith_binpath: str, repo_dir: str, pageserver_port: PageserverPo
    )
    yield wafactory
    # After the yield comes any cleanup code we need.
-    print('Starting wal acceptors cleanup')
+    log.info('Starting wal acceptors cleanup')
    wafactory.stop_all()

@@ -945,7 +948,7 @@ def base_dir() -> str:
    """ find the base directory (currently this is the git root) """

    base_dir = os.path.normpath(os.path.join(get_self_dir(), '../..'))
-    print('\nbase_dir is', base_dir)
+    log.info(f'base_dir is {base_dir}')
    return base_dir

@@ -974,7 +977,7 @@ def test_output_dir(request: Any, top_output_dir: str) -> str:
        test_name = 'shared'

    test_output_dir = os.path.join(top_output_dir, test_name)
-    print('test_output_dir is', test_output_dir)
+    log.info(f'test_output_dir is {test_output_dir}')
    shutil.rmtree(test_output_dir, ignore_errors=True)
    mkdir_if_needed(test_output_dir)
    return test_output_dir
@@ -1016,7 +1019,7 @@ def pg_distrib_dir(base_dir: str) -> str:
        pg_dir = env_postgres_bin
    else:
        pg_dir = os.path.normpath(os.path.join(base_dir, DEFAULT_POSTGRES_DIR))
-    print('postgres dir is', pg_dir)
+    log.info(f'postgres dir is {pg_dir}')
    if not os.path.exists(os.path.join(pg_dir, 'bin/postgres')):
        raise Exception('postgres not found at "{}"'.format(pg_dir))
    return pg_dir
@@ -1055,7 +1058,7 @@ def list_files_to_compare(pgdata_dir: str):
            pgdata_files.append(rel_file)

    pgdata_files.sort()
-    print(pgdata_files)
+    log.info(pgdata_files)
    return pgdata_files

# pg is the existing and running compute node, that we want to compare with a basebackup
@@ -1101,9 +1104,9 @@ def check_restored_datadir_content(zenith_cli: ZenithCli, test_output_dir: str,
                                    restored_dir_path,
                                    pgdata_files,
                                    shallow=False)
-    print('filecmp result mismatch and error lists:')
-    print(mismatch)
-    print(error)
+    log.info('filecmp result mismatch and error lists:')
+    log.info(mismatch)
+    log.info(error)

    for f in mismatch:
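Most of the fixtures touched above follow pytest's yield-fixture shape: everything before `yield` is setup, everything after it runs as cleanup once the dependent tests finish. A stripped-down sketch of that shape (the server class and fixture name are stand-ins, not the real fixtures):

```python
import logging

import pytest

log = logging.getLogger('root')


class FakeServer:
    """Stand-in for a pageserver-like resource; only here to keep the sketch self-contained."""
    def stop(self):
        log.info('fake server stopped')


@pytest.fixture(scope='session')
def fake_pageserver():
    ps = FakeServer()   # setup runs once per session
    yield ps            # tests that request the fixture execute here

    # After the yield comes any cleanup code we need.
    log.info('Starting fake pageserver cleanup')
    ps.stop()


def test_uses_fixture(fake_pageserver):
    assert fake_pageserver is not None
```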
@@ -2,6 +2,10 @@ import os
from contextlib import closing
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver

+import logging
+import fixtures.log_helper  # configures loggers
+log = logging.getLogger('root')
+
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")

def get_timeline_size(repo_dir: str, tenantid: str, timelineid: str):
@@ -31,7 +35,7 @@ def test_bulk_insert(postgres: PostgresFactory, pageserver: ZenithPageserver, pg
    zenith_cli.run(["branch", "test_bulk_insert", "empty"])

    pg = postgres.create_start('test_bulk_insert')
-    print("postgres is running on 'test_bulk_insert' branch")
+    log.info("postgres is running on 'test_bulk_insert' branch")

    # Open a connection directly to the page server that we'll use to force
    # flushing the layers to disk
@@ -1,58 +0,0 @@
-import timeit
-import pytest
-
-from fixtures.zenith_fixtures import (
-    TenantFactory,
-    ZenithCli,
-    PostgresFactory,
-)
-
-pytest_plugins = ("fixtures.benchmark_fixture")
-
-# Run bulk tenant creation test.
-#
-# Collects metrics:
-#
-# 1. Time to create {1,10,50} tenants
-# 2. Average creation time per tenant
-
-
-@pytest.mark.parametrize('tenants_count', [1, 5, 10])
-@pytest.mark.parametrize('use_wal_acceptors', ['with_wa', 'without_wa'])
-def test_bulk_tenant_create(
-    zenith_cli: ZenithCli,
-    tenant_factory: TenantFactory,
-    postgres: PostgresFactory,
-    wa_factory,
-    use_wal_acceptors: str,
-    tenants_count: int,
-    zenbenchmark,
-):
-    """Measure tenant creation time (with and without wal acceptors)"""
-
-    time_slices = []
-
-    for i in range(tenants_count):
-        start = timeit.default_timer()
-
-        tenant = tenant_factory.create()
-        zenith_cli.run([
-            "branch", f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}", "main",
-            f"--tenantid={tenant}"
-        ])
-
-        if use_wal_acceptors == 'with_wa':
-            wa_factory.start_n_new(3)
-
-        pg_tenant = postgres.create_start(
-            f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}",
-            tenant,
-            wal_acceptors=wa_factory.get_connstrs() if use_wal_acceptors == 'with_wa' else None,
-        )
-
-        end = timeit.default_timer()
-        time_slices.append(end - start)
-
-        pg_tenant.stop()
-
-    zenbenchmark.record('tenant_creation_time', sum(time_slices) / len(time_slices), 's')
@@ -2,6 +2,10 @@ import os
from contextlib import closing
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver

+import logging
+import fixtures.log_helper  # configures loggers
+log = logging.getLogger('root')
+
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")

def get_timeline_size(repo_dir: str, tenantid: str, timelineid: str):
@@ -31,7 +35,7 @@ def test_pgbench(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin
    zenith_cli.run(["branch", "test_pgbench_perf", "empty"])

    pg = postgres.create_start('test_pgbench_perf')
-    print("postgres is running on 'test_pgbench_perf' branch")
+    log.info("postgres is running on 'test_pgbench_perf' branch")

    # Open a connection directly to the page server that we'll use to force
    # flushing the layers to disk
@@ -1,6 +1,10 @@
import pytest
import os

+import logging
+import fixtures.log_helper  # configures loggers
+log = logging.getLogger('root')
+
pytest_plugins = ("fixtures.zenith_fixtures")
"""
Use this test to see what happens when tests fail.
@@ -22,7 +26,7 @@ def test_broken(zenith_cli, pageserver, postgres, pg_bin):
    zenith_cli.run(["branch", "test_broken", "empty"])

    postgres.create_start("test_broken")
-    print('postgres is running')
+    log.info('postgres is running')

-    print('THIS NEXT COMMAND WILL FAIL:')
+    log.info('THIS NEXT COMMAND WILL FAIL:')
    pg_bin.run('pgbench -i_am_a_broken_test'.split())
2
vendor/postgres
vendored
Submodule vendor/postgres updated: 56c561aa77...fbb29f3d1a
@@ -76,43 +76,6 @@ safekeepers.
See README_PROTO.md for a more detailed desription of the consensus
protocol. spec/ contains TLA+ specification of it.

-# Q&A
-
-Q: Why have a separate service instead of connecting Page Server directly to a
-primary PostgreSQL node?
-A: Page Server is a single server which can be lost. As our primary
-fault-tolerant storage is S3, we do not want to wait for it before
-committing a transaction. The WAL service acts as a temporary fault-tolerant
-storage for recent data before it gets to the Page Server and then finally
-to S3. Whenever WALs and pages are committed to S3, WAL's storage can be
-trimmed.
-
-Q: What if the compute node evicts a page, needs it back, but the page is yet
-to reach the Page Server?
-A: If the compute node has evicted a page, all changes from that page are
-already committed, i.e. they are saved on majority of WAL safekeepers. These
-WAL records will eventually reach the Page Server. The Page Server notes
-that the compute note requests pages with a very recent LSN and will not
-respond to the compute node until it a corresponding WAL is received from WAL
-safekeepers.
-
-Q: How long may Page Server wait for?
-A: Not too long, hopefully. If a page is evicted, it probably was not used for
-a while, so the WAL service have had enough time to push changes to the Page
-Server. There may be issues if there is no backpressure and compute node with
-WAL service run ahead of Page Server, though.
-There is no backpressure right now, so you may even see some spurious
-timeouts in tests.
-
-Q: How do WAL safekeepers communicate with each other?
-A: They may only send each other messages via the compute node, they never
-communicate directly with each other.
-
-Q: Why have a consensus algorithm if there is only a single compute node?
-A: Actually there may be moments with multiple PostgreSQL nodes running at the
-same time. E.g. we are bringing one up and one down. We would like to avoid
-simultaneous writes from different nodes, so there should be a consensus on
-who is the primary node.
-
# Terminology

@@ -275,8 +275,8 @@ impl AcceptorProposerMessage {
pub trait Storage {
    /// Persist safekeeper state on disk, optionally syncing it.
    fn persist(&mut self, s: &SafeKeeperState, sync: bool) -> Result<()>;
-    /// Write piece of wal in buf to disk and sync it.
-    fn write_wal(&mut self, server: &ServerInfo, startpos: Lsn, buf: &[u8]) -> Result<()>;
+    /// Write piece of wal in buf to disk.
+    fn write_wal(&mut self, s: &SafeKeeperState, startpos: Lsn, buf: &[u8]) -> Result<()>;
}

/// SafeKeeper which consumes events (messages from compute) and provides
@@ -423,7 +423,7 @@ where
        let mut last_rec_lsn = Lsn(0);
        if !msg.wal_data.is_empty() {
            self.storage
-                .write_wal(&self.s.server, msg.h.begin_lsn, &msg.wal_data)?;
+                .write_wal(&self.s, msg.h.begin_lsn, &msg.wal_data)?;

            // figure out last record's end lsn for reporting (if we got the
            // whole record)
@@ -546,7 +546,7 @@ mod tests {
        Ok(())
    }

-    fn write_wal(&mut self, _server: &ServerInfo, _startpos: Lsn, _buf: &[u8]) -> Result<()> {
+    fn write_wal(&mut self, _s: &SafeKeeperState, _startpos: Lsn, _buf: &[u8]) -> Result<()> {
        Ok(())
    }
}
@@ -18,8 +18,8 @@ use zenith_utils::zid::{ZTenantId, ZTimelineId};

use crate::replication::{HotStandbyFeedback, END_REPLICATION_MARKER};
use crate::safekeeper::{
-    AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, ServerInfo,
-    Storage, SK_FORMAT_VERSION, SK_MAGIC,
+    AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, Storage,
+    SK_FORMAT_VERSION, SK_MAGIC,
};
use crate::WalAcceptorConf;
use postgres_ffi::xlog_utils::{XLogFileName, XLOG_BLCKSZ};
@@ -32,7 +32,7 @@ struct SharedState {
    sk: SafeKeeper<FileStorage>,
-    /// For receiving-sending wal cooperation
-    notified_commit_lsn: Lsn,
+    /// quorum commit LSN we've notified walsenders about
+    commit_lsn: Lsn,
    /// combined hot standby feedback from all replicas
    hs_feedback: HotStandbyFeedback,
}
@@ -72,7 +72,7 @@ impl SharedState {
        };

        Ok(Self {
-            notified_commit_lsn: Lsn(0),
+            commit_lsn: Lsn(0),
            sk: SafeKeeper::new(Lsn(flush_lsn), tli, storage, state),
            hs_feedback: HotStandbyFeedback {
                ts: 0,
@@ -186,7 +186,7 @@ impl Timeline {
    pub fn wait_for_lsn(&self, lsn: Lsn) -> Lsn {
        let mut shared_state = self.mutex.lock().unwrap();
        loop {
-            let commit_lsn = shared_state.notified_commit_lsn;
+            let commit_lsn = shared_state.commit_lsn;
            // This must be `>`, not `>=`.
            if commit_lsn > lsn {
                return commit_lsn;
@@ -198,8 +198,8 @@ impl Timeline {
    // Notify caught-up WAL senders about new WAL data received
    pub fn notify_wal_senders(&self, commit_lsn: Lsn) {
        let mut shared_state = self.mutex.lock().unwrap();
-        if shared_state.notified_commit_lsn < commit_lsn {
-            shared_state.notified_commit_lsn = commit_lsn;
+        if shared_state.commit_lsn < commit_lsn {
+            shared_state.commit_lsn = commit_lsn;
            self.cond.notify_all();
        }
    }
@@ -337,14 +337,14 @@ impl Storage for FileStorage {
        Ok(())
    }

-    fn write_wal(&mut self, server: &ServerInfo, startpos: Lsn, buf: &[u8]) -> Result<()> {
+    fn write_wal(&mut self, s: &SafeKeeperState, startpos: Lsn, buf: &[u8]) -> Result<()> {
        let mut bytes_left: usize = buf.len();
        let mut bytes_written: usize = 0;
        let mut partial;
        let mut start_pos = startpos;
        const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];
-        let wal_seg_size = server.wal_seg_size as usize;
-        let ztli = server.ztli;
+        let wal_seg_size = s.server.wal_seg_size as usize;
+        let ztli = s.server.ztli;

        /* Extract WAL location for this block */
        let mut xlogoff = start_pos.segment_offset(wal_seg_size) as usize;
@@ -365,7 +365,7 @@ impl Storage for FileStorage {
            /* Open file */
            let segno = start_pos.segment_number(wal_seg_size);
            // note: we basically don't support changing pg timeline
-            let wal_file_name = XLogFileName(server.tli, segno, wal_seg_size);
+            let wal_file_name = XLogFileName(s.server.tli, segno, wal_seg_size);
            let wal_file_path = self
                .conf
                .data_dir
@@ -7,4 +7,3 @@ edition = "2018"
prometheus = {version = "0.12", default_features=false} # removes protobuf dependency
libc = "0.2"
lazy_static = "1.4"
-once_cell = "1.8.0"
@@ -3,7 +3,6 @@
//! Otherwise, we might not see all metrics registered via
//! a default registry.
use lazy_static::lazy_static;
-use once_cell::race::OnceBox;
pub use prometheus::{exponential_buckets, linear_buckets};
pub use prometheus::{register_histogram, Histogram};
pub use prometheus::{register_histogram_vec, HistogramVec};
@@ -25,29 +24,9 @@ pub fn gather() -> Vec<prometheus::proto::MetricFamily> {
    prometheus::gather()
}

-static COMMON_METRICS_PREFIX: OnceBox<&str> = OnceBox::new();
-
-/// Sets a prefix which will be used for all common metrics, typically a service
-/// name like 'pageserver'. Should be executed exactly once in the beginning of
-/// any executable which uses common metrics.
-pub fn set_common_metrics_prefix(prefix: &'static str) {
-    COMMON_METRICS_PREFIX.set(prefix.into()).unwrap();
-}
-
-/// Prepends a prefix to a common metric name so they are distinguished between
-/// different services, see https://github.com/zenithdb/zenith/pull/681
-/// A call to set_common_metrics_prefix() is necessary prior to calling this.
-pub fn new_common_metric_name(unprefixed_metric_name: &str) -> String {
-    format!(
-        "{}_{}",
-        COMMON_METRICS_PREFIX.get().unwrap(),
-        unprefixed_metric_name
-    )
-}
-
lazy_static! {
    static ref DISK_IO_BYTES: IntGaugeVec = register_int_gauge_vec!(
-        new_common_metric_name("disk_io_bytes"),
+        "pageserver_disk_io_bytes",
        "Bytes written and read from disk, grouped by the operation (read|write)",
        &["io_operation"]
    )
@@ -38,4 +38,3 @@ rustls-split = "0.2.1"
hex-literal = "0.3"
bytes = "1.0"
webpki = "0.21"
-tempfile = "3.2"
@@ -1,125 +0,0 @@
-use std::{
-    fs::{self, File},
-    io,
-    path::Path,
-};
-
-/// Similar to [`std::fs::create_dir`], except we fsync the
-/// created directory and its parent.
-pub fn create_dir(path: impl AsRef<Path>) -> io::Result<()> {
-    let path = path.as_ref();
-
-    fs::create_dir(path)?;
-    File::open(path)?.sync_all()?;
-
-    if let Some(parent) = path.parent() {
-        File::open(parent)?.sync_all()
-    } else {
-        Err(io::Error::new(
-            io::ErrorKind::InvalidInput,
-            "can't find parent",
-        ))
-    }
-}
-
-/// Similar to [`std::fs::create_dir_all`], except we fsync all
-/// newly created directories and the pre-existing parent.
-pub fn create_dir_all(path: impl AsRef<Path>) -> io::Result<()> {
-    let mut path = path.as_ref();
-
-    let mut dirs_to_create = Vec::new();
-
-    // Figure out which directories we need to create.
-    loop {
-        match path.metadata() {
-            Ok(metadata) if metadata.is_dir() => break,
-            Ok(_) => {
-                return Err(io::Error::new(
-                    io::ErrorKind::AlreadyExists,
-                    format!("non-directory found in path: {:?}", path),
-                ));
-            }
-            Err(ref e) if e.kind() == io::ErrorKind::NotFound => {}
-            Err(e) => return Err(e),
-        }
-
-        dirs_to_create.push(path);
-
-        match path.parent() {
-            Some(parent) => path = parent,
-            None => {
-                return Err(io::Error::new(
-                    io::ErrorKind::InvalidInput,
-                    "can't find parent",
-                ))
-            }
-        }
-    }
-
-    // Create directories from parent to child.
-    for &path in dirs_to_create.iter().rev() {
-        fs::create_dir(path)?;
-    }
-
-    // Fsync the created directories from child to parent.
-    for &path in dirs_to_create.iter() {
-        File::open(path)?.sync_all()?;
-    }
-
-    // If we created any new directories, fsync the parent.
-    if !dirs_to_create.is_empty() {
-        File::open(path)?.sync_all()?;
-    }
-
-    Ok(())
-}
-
-#[cfg(test)]
-mod tests {
-    use tempfile::tempdir;
-
-    use super::*;
-
-    #[test]
-    fn test_create_dir_fsyncd() {
-        let dir = tempdir().unwrap();
-
-        let existing_dir_path = dir.path();
-        let err = create_dir(existing_dir_path).unwrap_err();
-        assert_eq!(err.kind(), io::ErrorKind::AlreadyExists);
-
-        let child_dir = existing_dir_path.join("child");
-        create_dir(child_dir).unwrap();
-
-        let nested_child_dir = existing_dir_path.join("child1").join("child2");
-        let err = create_dir(nested_child_dir).unwrap_err();
-        assert_eq!(err.kind(), io::ErrorKind::NotFound);
-    }
-
-    #[test]
-    fn test_create_dir_all_fsyncd() {
-        let dir = tempdir().unwrap();
-
-        let existing_dir_path = dir.path();
-        create_dir_all(existing_dir_path).unwrap();
-
-        let child_dir = existing_dir_path.join("child");
-        assert!(!child_dir.exists());
-        create_dir_all(&child_dir).unwrap();
-        assert!(child_dir.exists());
-
-        let nested_child_dir = existing_dir_path.join("child1").join("child2");
-        assert!(!nested_child_dir.exists());
-        create_dir_all(&nested_child_dir).unwrap();
-        assert!(nested_child_dir.exists());
-
-        let file_path = existing_dir_path.join("file");
-        std::fs::write(&file_path, b"").unwrap();
-
-        let err = create_dir_all(&file_path).unwrap_err();
-        assert_eq!(err.kind(), io::ErrorKind::AlreadyExists);
-
-        let invalid_dir_path = file_path.join("folder");
-        create_dir_all(&invalid_dir_path).unwrap_err();
-    }
-}
@@ -9,14 +9,14 @@ use routerify::ext::RequestExt;
use routerify::RequestInfo;
use routerify::{Middleware, Router, RouterBuilder, RouterService};
use std::net::TcpListener;
-use zenith_metrics::{new_common_metric_name, register_int_counter, IntCounter};
+use zenith_metrics::{register_int_counter, IntCounter};
use zenith_metrics::{Encoder, TextEncoder};

use super::error::ApiError;

lazy_static! {
    static ref SERVE_METRICS_COUNT: IntCounter = register_int_counter!(
-        new_common_metric_name("serve_metrics_count"),
+        "pageserver_serve_metrics_count",
        "Number of metric requests made"
    )
    .expect("failed to define a metric");
@@ -18,9 +18,6 @@ pub mod pq_proto;
// dealing with connstring parsing and handy access to it's parts
pub mod connstring;

-// helper functions for creating and fsyncing directories/trees
-pub mod crashsafe_dir;
-
// common authentication routines
pub mod auth;
@@ -192,7 +192,9 @@ impl AtomicLsn {
    /// This operation will panic on overflow.
    pub fn fetch_add(&self, val: u64) -> Lsn {
        let prev = self.inner.fetch_add(val, Ordering::AcqRel);
-        assert!(prev.checked_add(val).is_some(), "AtomicLsn overflow");
+        if prev.checked_add(val).is_none() {
+            panic!("AtomicLsn overflow");
+        }
        Lsn(prev)
    }
@@ -78,10 +78,6 @@ macro_rules! zid_newtype {
        pub fn generate() -> Self {
            $t(ZId::generate())
        }
-
-        pub const fn from_array(b: [u8; 16]) -> Self {
-            $t(ZId(b))
-        }
    }

    impl FromStr for $t {