mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-18 04:40:37 +00:00
Compare commits
5 Commits
batch-fsyn
...
wip-perf
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ea56b4a36a | ||
|
|
7cf7215ce2 | ||
|
|
23713eb44f | ||
|
|
28675739de | ||
|
|
ea90d102e2 |
5
Cargo.lock
generated
5
Cargo.lock
generated
@@ -208,8 +208,6 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "bookfile"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "efa3e2086414e1bbecbc10730f265e5b079ab4ea0b830e7219a70dab6471e753"
|
||||
dependencies = [
|
||||
"aversion",
|
||||
"byteorder",
|
||||
@@ -1195,6 +1193,7 @@ dependencies = [
|
||||
"hyper",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"nix",
|
||||
"postgres",
|
||||
"postgres-protocol",
|
||||
"postgres-types",
|
||||
@@ -1213,7 +1212,6 @@ dependencies = [
|
||||
"tokio",
|
||||
"toml",
|
||||
"tracing",
|
||||
"url",
|
||||
"workspace_hack",
|
||||
"zenith_metrics",
|
||||
"zenith_utils",
|
||||
@@ -2578,7 +2576,6 @@ dependencies = [
|
||||
"hyper",
|
||||
"jsonwebtoken",
|
||||
"lazy_static",
|
||||
"nix",
|
||||
"postgres",
|
||||
"rand",
|
||||
"routerify",
|
||||
|
||||
@@ -5,7 +5,7 @@ authors = ["Stas Kelvich <stas@zenith.tech>"]
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
bookfile = "^0.3"
|
||||
bookfile = { path = "../../bookfile" }
|
||||
chrono = "0.4.19"
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
@@ -37,7 +37,7 @@ async-trait = "0.1"
|
||||
const_format = "0.2.21"
|
||||
tracing = "0.1.27"
|
||||
signal-hook = {version = "0.3.10", features = ["extended-siginfo"] }
|
||||
url = "2"
|
||||
nix = "0.23"
|
||||
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
zenith_metrics = { path = "../zenith_metrics" }
|
||||
|
||||
@@ -5,12 +5,13 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
env,
|
||||
net::TcpListener,
|
||||
path::{Path, PathBuf},
|
||||
str::FromStr,
|
||||
thread,
|
||||
};
|
||||
use tracing::*;
|
||||
use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType, tcp_listener};
|
||||
use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType};
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use signal_hook::consts::signal::*;
|
||||
@@ -27,6 +28,7 @@ use daemonize::Daemonize;
|
||||
|
||||
use pageserver::{
|
||||
branches, defaults::*, http, page_service, relish_storage, tenant_mgr, PageServerConf,
|
||||
layered_repository,
|
||||
RelishStorageConfig, RelishStorageKind, S3Config, LOG_FILE_NAME,
|
||||
};
|
||||
use zenith_utils::http::endpoint;
|
||||
@@ -43,7 +45,6 @@ struct CfgFileParams {
|
||||
checkpoint_period: Option<String>,
|
||||
gc_horizon: Option<String>,
|
||||
gc_period: Option<String>,
|
||||
open_mem_limit: Option<String>,
|
||||
pg_distrib_dir: Option<String>,
|
||||
auth_validation_public_key_path: Option<String>,
|
||||
auth_type: Option<String>,
|
||||
@@ -105,7 +106,6 @@ impl CfgFileParams {
|
||||
checkpoint_period: get_arg("checkpoint_period"),
|
||||
gc_horizon: get_arg("gc_horizon"),
|
||||
gc_period: get_arg("gc_period"),
|
||||
open_mem_limit: get_arg("open_mem_limit"),
|
||||
pg_distrib_dir: get_arg("postgres-distrib"),
|
||||
auth_validation_public_key_path: get_arg("auth-validation-public-key-path"),
|
||||
auth_type: get_arg("auth-type"),
|
||||
@@ -124,7 +124,6 @@ impl CfgFileParams {
|
||||
checkpoint_period: self.checkpoint_period.or(other.checkpoint_period),
|
||||
gc_horizon: self.gc_horizon.or(other.gc_horizon),
|
||||
gc_period: self.gc_period.or(other.gc_period),
|
||||
open_mem_limit: self.open_mem_limit.or(other.open_mem_limit),
|
||||
pg_distrib_dir: self.pg_distrib_dir.or(other.pg_distrib_dir),
|
||||
auth_validation_public_key_path: self
|
||||
.auth_validation_public_key_path
|
||||
@@ -169,11 +168,6 @@ impl CfgFileParams {
|
||||
None => DEFAULT_GC_PERIOD,
|
||||
};
|
||||
|
||||
let open_mem_limit: usize = match self.open_mem_limit.as_ref() {
|
||||
Some(open_mem_limit_str) => open_mem_limit_str.parse()?,
|
||||
None => DEFAULT_OPEN_MEM_LIMIT,
|
||||
};
|
||||
|
||||
let pg_distrib_dir = match self.pg_distrib_dir.as_ref() {
|
||||
Some(pg_distrib_dir_str) => PathBuf::from(pg_distrib_dir_str),
|
||||
None => env::current_dir()?.join("tmp_install"),
|
||||
@@ -245,7 +239,6 @@ impl CfgFileParams {
|
||||
checkpoint_period,
|
||||
gc_horizon,
|
||||
gc_period,
|
||||
open_mem_limit,
|
||||
|
||||
superuser: String::from(DEFAULT_SUPERUSER),
|
||||
|
||||
@@ -316,12 +309,6 @@ fn main() -> Result<()> {
|
||||
.takes_value(true)
|
||||
.help("Interval between garbage collector iterations"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("open_mem_limit")
|
||||
.long("open_mem_limit")
|
||||
.takes_value(true)
|
||||
.help("Amount of memory reserved for buffering incoming WAL"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("workdir")
|
||||
.short("D")
|
||||
@@ -494,13 +481,13 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
"Starting pageserver http handler on {}",
|
||||
conf.listen_http_addr
|
||||
);
|
||||
let http_listener = tcp_listener::bind(conf.listen_http_addr.clone())?;
|
||||
let http_listener = TcpListener::bind(conf.listen_http_addr.clone())?;
|
||||
|
||||
info!(
|
||||
"Starting pageserver pg protocol handler on {}",
|
||||
conf.listen_pg_addr
|
||||
);
|
||||
let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;
|
||||
let pageserver_listener = TcpListener::bind(conf.listen_pg_addr.clone())?;
|
||||
|
||||
if conf.daemonize {
|
||||
info!("daemonizing...");
|
||||
@@ -563,6 +550,8 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type)
|
||||
})?;
|
||||
|
||||
let global_job_thread = layered_repository::launch_global_job_thread(conf);
|
||||
|
||||
for info in SignalsInfo::<WithOrigin>::new(TERM_SIGNALS)?.into_iter() {
|
||||
match info.signal {
|
||||
SIGQUIT => {
|
||||
@@ -591,6 +580,12 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
.expect("thread panicked")
|
||||
.expect("thread exited with an error");
|
||||
}
|
||||
|
||||
// Shut down global job thread
|
||||
global_job_thread
|
||||
.join()
|
||||
.expect("thread panicked");
|
||||
|
||||
info!("Pageserver shut down successfully completed");
|
||||
exit(0);
|
||||
}
|
||||
@@ -616,7 +611,6 @@ mod tests {
|
||||
checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
|
||||
gc_horizon: Some("gc_horizon_VALUE".to_string()),
|
||||
gc_period: Some("gc_period_VALUE".to_string()),
|
||||
open_mem_limit: Some("open_mem_limit_VALUE".to_string()),
|
||||
pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
|
||||
auth_validation_public_key_path: Some(
|
||||
"auth_validation_public_key_path_VALUE".to_string(),
|
||||
@@ -640,7 +634,6 @@ checkpoint_distance = 'checkpoint_distance_VALUE'
|
||||
checkpoint_period = 'checkpoint_period_VALUE'
|
||||
gc_horizon = 'gc_horizon_VALUE'
|
||||
gc_period = 'gc_period_VALUE'
|
||||
open_mem_limit = 'open_mem_limit_VALUE'
|
||||
pg_distrib_dir = 'pg_distrib_dir_VALUE'
|
||||
auth_validation_public_key_path = 'auth_validation_public_key_path_VALUE'
|
||||
auth_type = 'auth_type_VALUE'
|
||||
@@ -675,7 +668,6 @@ local_path = 'relish_storage_local_VALUE'
|
||||
checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
|
||||
gc_horizon: Some("gc_horizon_VALUE".to_string()),
|
||||
gc_period: Some("gc_period_VALUE".to_string()),
|
||||
open_mem_limit: Some("open_mem_limit_VALUE".to_string()),
|
||||
pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
|
||||
auth_validation_public_key_path: Some(
|
||||
"auth_validation_public_key_path_VALUE".to_string(),
|
||||
@@ -702,7 +694,6 @@ checkpoint_distance = 'checkpoint_distance_VALUE'
|
||||
checkpoint_period = 'checkpoint_period_VALUE'
|
||||
gc_horizon = 'gc_horizon_VALUE'
|
||||
gc_period = 'gc_period_VALUE'
|
||||
open_mem_limit = 'open_mem_limit_VALUE'
|
||||
pg_distrib_dir = 'pg_distrib_dir_VALUE'
|
||||
auth_validation_public_key_path = 'auth_validation_public_key_path_VALUE'
|
||||
auth_type = 'auth_type_VALUE'
|
||||
|
||||
@@ -23,7 +23,6 @@ use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use crate::tenant_mgr;
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::CheckpointConfig;
|
||||
use crate::{repository::Repository, PageServerConf};
|
||||
use crate::{restore_local_repo, LOG_FILE_NAME};
|
||||
|
||||
@@ -36,7 +35,7 @@ pub struct BranchInfo {
|
||||
pub ancestor_id: Option<String>,
|
||||
pub ancestor_lsn: Option<String>,
|
||||
pub current_logical_size: usize,
|
||||
pub current_logical_size_non_incremental: Option<usize>,
|
||||
pub current_logical_size_non_incremental: usize,
|
||||
}
|
||||
|
||||
impl BranchInfo {
|
||||
@@ -45,7 +44,6 @@ impl BranchInfo {
|
||||
conf: &PageServerConf,
|
||||
tenantid: &ZTenantId,
|
||||
repo: &Arc<dyn Repository>,
|
||||
include_non_incremental_logical_size: bool,
|
||||
) -> Result<Self> {
|
||||
let name = path
|
||||
.as_ref()
|
||||
@@ -80,14 +78,6 @@ impl BranchInfo {
|
||||
);
|
||||
}
|
||||
|
||||
// non incremental size calculation can be heavy, so let it be optional
|
||||
// needed for tests to check size calculation
|
||||
let current_logical_size_non_incremental = include_non_incremental_logical_size
|
||||
.then(|| {
|
||||
timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn())
|
||||
})
|
||||
.transpose()?;
|
||||
|
||||
Ok(BranchInfo {
|
||||
name,
|
||||
timeline_id,
|
||||
@@ -95,7 +85,8 @@ impl BranchInfo {
|
||||
ancestor_id,
|
||||
ancestor_lsn,
|
||||
current_logical_size: timeline.get_current_logical_size(),
|
||||
current_logical_size_non_incremental,
|
||||
current_logical_size_non_incremental: timeline
|
||||
.get_current_logical_size_non_incremental(timeline.get_last_record_lsn())?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -239,7 +230,7 @@ fn bootstrap_timeline(
|
||||
timeline.writer().as_ref(),
|
||||
lsn,
|
||||
)?;
|
||||
timeline.checkpoint(CheckpointConfig::Forced)?;
|
||||
timeline.checkpoint_forced()?;
|
||||
|
||||
println!(
|
||||
"created initial timeline {} timeline.lsn {}",
|
||||
@@ -257,11 +248,7 @@ fn bootstrap_timeline(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn get_branches(
|
||||
conf: &PageServerConf,
|
||||
tenantid: &ZTenantId,
|
||||
include_non_incremental_logical_size: bool,
|
||||
) -> Result<Vec<BranchInfo>> {
|
||||
pub(crate) fn get_branches(conf: &PageServerConf, tenantid: &ZTenantId) -> Result<Vec<BranchInfo>> {
|
||||
let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?;
|
||||
|
||||
// Each branch has a corresponding record (text file) in the refs/branches
|
||||
@@ -271,13 +258,7 @@ pub(crate) fn get_branches(
|
||||
std::fs::read_dir(&branches_dir)?
|
||||
.map(|dir_entry_res| {
|
||||
let dir_entry = dir_entry_res?;
|
||||
BranchInfo::from_path(
|
||||
dir_entry.path(),
|
||||
conf,
|
||||
tenantid,
|
||||
&repo,
|
||||
include_non_incremental_logical_size,
|
||||
)
|
||||
BranchInfo::from_path(dir_entry.path(), conf, tenantid, &repo)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
@@ -339,7 +320,7 @@ pub(crate) fn create_branch(
|
||||
ancestor_id: None,
|
||||
ancestor_lsn: None,
|
||||
current_logical_size: 0,
|
||||
current_logical_size_non_incremental: Some(0),
|
||||
current_logical_size_non_incremental: 0,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -25,11 +25,6 @@ paths:
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
- name: include-non-incremental-logical-size
|
||||
in: query
|
||||
schema:
|
||||
type: string
|
||||
description: Controls calculation of current_logical_size_non_incremental
|
||||
get:
|
||||
description: Get branches for tenant
|
||||
responses:
|
||||
@@ -78,11 +73,6 @@ paths:
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: include-non-incremental-logical-size
|
||||
in: query
|
||||
schema:
|
||||
type: string
|
||||
description: Controls calculation of current_logical_size_non_incremental
|
||||
get:
|
||||
description: Get branches for tenant
|
||||
responses:
|
||||
@@ -270,6 +260,7 @@ components:
|
||||
- timeline_id
|
||||
- latest_valid_lsn
|
||||
- current_logical_size
|
||||
- current_logical_size_non_incremental
|
||||
properties:
|
||||
name:
|
||||
type: string
|
||||
|
||||
@@ -86,59 +86,31 @@ async fn branch_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
Ok(json_response(StatusCode::CREATED, response_data)?)
|
||||
}
|
||||
|
||||
// Gate non incremental logical size calculation behind a flag
|
||||
// after pgbench -i -s100 calculation took 28ms so if multiplied by the number of timelines
|
||||
// and tenants it can take noticeable amount of time. Also the value currently used only in tests
|
||||
fn get_include_non_incremental_logical_size(request: &Request<Body>) -> bool {
|
||||
request
|
||||
.uri()
|
||||
.query()
|
||||
.map(|v| {
|
||||
url::form_urlencoded::parse(v.as_bytes())
|
||||
.into_owned()
|
||||
.any(|(param, _)| param == "include-non-incremental-logical-size")
|
||||
})
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
async fn branch_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
|
||||
let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
|
||||
|
||||
check_permission(&request, Some(tenantid))?;
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("branch_list", tenant = %tenantid).entered();
|
||||
crate::branches::get_branches(
|
||||
get_config(&request),
|
||||
&tenantid,
|
||||
include_non_incremental_logical_size,
|
||||
)
|
||||
crate::branches::get_branches(get_config(&request), &tenantid)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
Ok(json_response(StatusCode::OK, response_data)?)
|
||||
}
|
||||
|
||||
// TODO add to swagger
|
||||
async fn branch_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let branch_name: String = get_request_param(&request, "branch_name")?.to_string();
|
||||
let conf = get_state(&request).conf;
|
||||
let path = conf.branch_path(&branch_name, &tenantid);
|
||||
|
||||
let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("branch_detail", tenant = %tenantid, branch=%branch_name).entered();
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
BranchInfo::from_path(
|
||||
path,
|
||||
conf,
|
||||
&tenantid,
|
||||
&repo,
|
||||
include_non_incremental_logical_size,
|
||||
)
|
||||
BranchInfo::from_path(path, conf, &tenantid, &repo)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
@@ -16,12 +16,13 @@ use bookfile::Book;
|
||||
use bytes::Bytes;
|
||||
use lazy_static::lazy_static;
|
||||
use postgres_ffi::pg_constants::BLCKSZ;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::*;
|
||||
use zenith_utils::batch_fsync::BatchFsync;
|
||||
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::{BTreeSet, HashSet};
|
||||
use std::convert::TryInto;
|
||||
use std::fs;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::Write;
|
||||
@@ -29,9 +30,9 @@ use std::ops::{Bound::Included, Deref};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
use std::thread::JoinHandle;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use self::metadata::{metadata_path, TimelineMetadata};
|
||||
use crate::relish::*;
|
||||
use crate::relish_storage::schedule_timeline_upload;
|
||||
use crate::repository::{GcResult, Repository, Timeline, TimelineWriter, WALRecord};
|
||||
@@ -39,7 +40,6 @@ use crate::tenant_mgr;
|
||||
use crate::walreceiver;
|
||||
use crate::walreceiver::IS_WAL_RECEIVER;
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::CheckpointConfig;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
|
||||
@@ -47,6 +47,7 @@ use zenith_metrics::{
|
||||
register_histogram, register_int_gauge_vec, Histogram, IntGauge, IntGaugeVec,
|
||||
};
|
||||
use zenith_metrics::{register_histogram_vec, HistogramVec};
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::crashsafe_dir;
|
||||
use zenith_utils::lsn::{AtomicLsn, Lsn, RecordLsn};
|
||||
use zenith_utils::seqwait::SeqWait;
|
||||
@@ -54,24 +55,24 @@ use zenith_utils::seqwait::SeqWait;
|
||||
mod blob;
|
||||
mod delta_layer;
|
||||
mod filename;
|
||||
mod global_layer_map;
|
||||
mod image_layer;
|
||||
mod inmemory_layer;
|
||||
mod interval_tree;
|
||||
mod layer_map;
|
||||
pub mod metadata;
|
||||
mod jobs;
|
||||
mod page_versions;
|
||||
mod storage_layer;
|
||||
|
||||
use delta_layer::DeltaLayer;
|
||||
use image_layer::ImageLayer;
|
||||
|
||||
use global_layer_map::{LayerId, GLOBAL_LAYER_MAP};
|
||||
use inmemory_layer::InMemoryLayer;
|
||||
use layer_map::LayerMap;
|
||||
use layer_map::{LayerId, LayerMap};
|
||||
use storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, SegmentTag, RELISH_SEG_SIZE,
|
||||
};
|
||||
use jobs::{GlobalJob, schedule_job};
|
||||
pub use jobs::launch_global_job_thread;
|
||||
|
||||
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
|
||||
|
||||
@@ -113,6 +114,8 @@ lazy_static! {
|
||||
.expect("failed to define a metric");
|
||||
}
|
||||
|
||||
/// The name of the metadata file pageserver creates per timeline.
|
||||
pub const METADATA_FILE_NAME: &str = "metadata";
|
||||
/// Parts of the `.zenith/tenants/<tenantid>/timelines/<timelineid>` directory prefix.
|
||||
pub const TENANTS_SEGMENT_NAME: &str = "tenants";
|
||||
pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
|
||||
@@ -129,10 +132,17 @@ pub struct LayeredRepository {
|
||||
/// Makes every repo's timelines back up their files to remote storage,
|
||||
/// when they get frozen.
|
||||
upload_relishes: bool,
|
||||
|
||||
is_gc_scheduled: Mutex<bool>,
|
||||
}
|
||||
|
||||
/// Public interface
|
||||
impl Repository for LayeredRepository {
|
||||
|
||||
/// Returns `self` as a reference to the concrete `LayeredRepository` type.
fn upgrade_to_layered_repository(&self) -> &crate::layered_repository::LayeredRepository {
|
||||
self
|
||||
}
|
||||
|
||||
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>> {
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
|
||||
@@ -145,7 +155,12 @@ impl Repository for LayeredRepository {
|
||||
// Create the timeline directory, and write initial metadata to file.
|
||||
crashsafe_dir::create_dir_all(self.conf.timeline_path(&timelineid, &self.tenantid))?;
|
||||
|
||||
let metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0));
|
||||
let metadata = TimelineMetadata {
|
||||
disk_consistent_lsn: Lsn(0),
|
||||
prev_record_lsn: None,
|
||||
ancestor_timeline: None,
|
||||
ancestor_lsn: Lsn(0),
|
||||
};
|
||||
Self::save_metadata(self.conf, timelineid, self.tenantid, &metadata, true)?;
|
||||
|
||||
let timeline = LayeredTimeline::new(
|
||||
@@ -184,7 +199,12 @@ impl Repository for LayeredRepository {
|
||||
// Create the metadata file, noting the ancestor of the new timeline.
|
||||
// There is initially no data in it, but all the read-calls know to look
|
||||
// into the ancestor.
|
||||
let metadata = TimelineMetadata::new(start_lsn, dst_prev, Some(src), start_lsn);
|
||||
let metadata = TimelineMetadata {
|
||||
disk_consistent_lsn: start_lsn,
|
||||
prev_record_lsn: dst_prev,
|
||||
ancestor_timeline: Some(src),
|
||||
ancestor_lsn: start_lsn,
|
||||
};
|
||||
crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenantid))?;
|
||||
Self::save_metadata(self.conf, dst, self.tenantid, &metadata, true)?;
|
||||
|
||||
@@ -196,33 +216,31 @@ impl Repository for LayeredRepository {
|
||||
/// Public entry point to GC. All the logic is in the private
|
||||
/// gc_iteration_internal function, this public facade just wraps it for
|
||||
/// metrics collection.
|
||||
fn gc_iteration(
|
||||
fn gc_manual(
|
||||
&self,
|
||||
target_timelineid: Option<ZTimelineId>,
|
||||
horizon: u64,
|
||||
checkpoint_before_gc: bool,
|
||||
) -> Result<GcResult> {
|
||||
STORAGE_TIME
|
||||
.with_label_values(&["gc"])
|
||||
.with_label_values(&["gc_manual"])
|
||||
.observe_closure_duration(|| {
|
||||
self.gc_iteration_internal(target_timelineid, horizon, checkpoint_before_gc)
|
||||
})
|
||||
}
|
||||
|
||||
fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()> {
|
||||
{
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
fn gc_scheduled(&self) -> Result<GcResult> {
|
||||
let result = STORAGE_TIME
|
||||
.with_label_values(&["gc_scheduled"])
|
||||
.observe_closure_duration(|| {
|
||||
self.gc_iteration_internal(None, self.conf.gc_horizon, false)
|
||||
});
|
||||
|
||||
for (timelineid, timeline) in timelines.iter() {
|
||||
let _entered =
|
||||
info_span!("checkpoint", timeline = %timelineid, tenant = %self.tenantid)
|
||||
.entered();
|
||||
let mut guard = self.is_gc_scheduled.lock().unwrap();
|
||||
|
||||
timeline.checkpoint(cconf)?;
|
||||
}
|
||||
}
|
||||
*guard = false;
|
||||
|
||||
Ok(())
|
||||
result
|
||||
}
|
||||
|
||||
// Wait for all threads to complete and persist repository data before pageserver shutdown.
|
||||
@@ -234,7 +252,7 @@ impl Repository for LayeredRepository {
|
||||
walreceiver::stop_wal_receiver(*timelineid);
|
||||
// Wait for syncing data to disk
|
||||
trace!("repo shutdown. checkpoint timeline {}", timelineid);
|
||||
timeline.checkpoint(CheckpointConfig::Forced)?;
|
||||
timeline.checkpoint_forced()?;
|
||||
|
||||
//TODO Wait for walredo process to shutdown too
|
||||
}
|
||||
@@ -256,14 +274,14 @@ impl LayeredRepository {
|
||||
Some(timeline) => Ok(timeline.clone()),
|
||||
None => {
|
||||
let metadata = Self::load_metadata(self.conf, timelineid, self.tenantid)?;
|
||||
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
||||
let disk_consistent_lsn = metadata.disk_consistent_lsn;
|
||||
|
||||
// Recurse to look up the ancestor timeline.
|
||||
//
|
||||
// TODO: If you have a very deep timeline history, this could become
|
||||
// expensive. Perhaps delay this until we need to look up a page in
|
||||
// ancestor.
|
||||
let ancestor = if let Some(ancestor_timelineid) = metadata.ancestor_timeline() {
|
||||
let ancestor = if let Some(ancestor_timelineid) = metadata.ancestor_timeline {
|
||||
Some(self.get_timeline_locked(ancestor_timelineid, timelines)?)
|
||||
} else {
|
||||
None
|
||||
@@ -312,9 +330,56 @@ impl LayeredRepository {
|
||||
timelines: Mutex::new(HashMap::new()),
|
||||
walredo_mgr,
|
||||
upload_relishes,
|
||||
is_gc_scheduled: Mutex::new(false),
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Launch the checkpointer thread in given repository.
|
||||
///
|
||||
pub fn launch_checkpointer_thread(
|
||||
conf: &'static PageServerConf,
|
||||
rc: Arc<LayeredRepository>,
|
||||
) -> JoinHandle<()> {
|
||||
std::thread::Builder::new()
|
||||
.name("Checkpointer thread".into())
|
||||
.spawn(move || {
|
||||
// FIXME: relaunch it? Panic is not good.
|
||||
rc.checkpoint_loop(conf).expect("Checkpointer thread died");
|
||||
})
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
///
|
||||
/// Checkpointer thread's main loop
|
||||
///
|
||||
fn checkpoint_loop(&self, conf: &'static PageServerConf) -> Result<()> {
|
||||
while !tenant_mgr::shutdown_requested() {
|
||||
std::thread::sleep(conf.checkpoint_period);
|
||||
info!("checkpointer thread for tenant {} waking up", self.tenantid);
|
||||
|
||||
// checkpoint timelines that have accumulated more than CHECKPOINT_DISTANCE
|
||||
// bytes of WAL since last checkpoint.
|
||||
{
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
for (timelineid, timeline) in timelines.iter() {
|
||||
let _entered =
|
||||
info_span!("checkpoint", timeline = %timelineid, tenant = %self.tenantid)
|
||||
.entered();
|
||||
|
||||
STORAGE_TIME
|
||||
.with_label_values(&["checkpoint_timed"])
|
||||
.observe_closure_duration(|| {
|
||||
timeline.checkpoint_internal(conf.checkpoint_distance, false)
|
||||
})?
|
||||
}
|
||||
// release lock on 'timelines'
|
||||
}
|
||||
}
|
||||
trace!("Checkpointer thread shut down");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Save timeline metadata to file
|
||||
fn save_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -468,7 +533,7 @@ impl LayeredRepository {
|
||||
// so that they too can be garbage collected. That's
|
||||
// used in tests, so we want as deterministic results as possible.
|
||||
if checkpoint_before_gc {
|
||||
timeline.checkpoint(CheckpointConfig::Forced)?;
|
||||
timeline.checkpoint_forced()?;
|
||||
info!("timeline {} checkpoint_before_gc done", timelineid);
|
||||
}
|
||||
|
||||
@@ -483,6 +548,66 @@ impl LayeredRepository {
|
||||
}
|
||||
}
|
||||
|
||||
/// Metadata stored on disk for each timeline
|
||||
///
|
||||
/// The fields correspond to the values we hold in memory, in LayeredTimeline.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub struct TimelineMetadata {
|
||||
/// [`Lsn`] that corresponds to the corresponding timeline directory
|
||||
/// contents, stored locally in the pageserver workdir.
|
||||
pub disk_consistent_lsn: Lsn,
|
||||
|
||||
// This is only set if we know it. We track it in memory when the page
|
||||
// server is running, but we only track the value corresponding to
|
||||
// 'last_record_lsn', not 'disk_consistent_lsn' which can lag behind by a
|
||||
// lot. We only store it in the metadata file when we flush *all* the
|
||||
// in-memory data so that 'last_record_lsn' is the same as
|
||||
// 'disk_consistent_lsn'. That's OK, because after page server restart, as
|
||||
// soon as we reprocess at least one record, we will have a valid
|
||||
// 'prev_record_lsn' value in memory again. This is only really needed when
|
||||
// doing a clean shutdown, so that there is no more WAL beyond
|
||||
// 'disk_consistent_lsn'
|
||||
pub prev_record_lsn: Option<Lsn>,
|
||||
|
||||
pub ancestor_timeline: Option<ZTimelineId>,
|
||||
pub ancestor_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl TimelineMetadata {
|
||||
pub fn from_bytes(metadata_bytes: &[u8]) -> anyhow::Result<Self> {
|
||||
ensure!(
|
||||
metadata_bytes.len() == METADATA_MAX_SAFE_SIZE,
|
||||
"metadata bytes size is wrong"
|
||||
);
|
||||
|
||||
let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE];
|
||||
let calculated_checksum = crc32c::crc32c(data);
|
||||
|
||||
let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] =
|
||||
metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?;
|
||||
let expected_checksum = u32::from_le_bytes(*checksum_bytes);
|
||||
ensure!(
|
||||
calculated_checksum == expected_checksum,
|
||||
"metadata checksum mismatch"
|
||||
);
|
||||
|
||||
let data = TimelineMetadata::des_prefix(data)?;
|
||||
assert!(data.disk_consistent_lsn.is_aligned());
|
||||
|
||||
Ok(data)
|
||||
}
|
||||
|
||||
pub fn to_bytes(&self) -> anyhow::Result<Vec<u8>> {
|
||||
let mut metadata_bytes = TimelineMetadata::ser(self)?;
|
||||
assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE);
|
||||
metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8);
|
||||
|
||||
let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]);
|
||||
metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum));
|
||||
Ok(metadata_bytes)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LayeredTimeline {
|
||||
conf: &'static PageServerConf,
|
||||
|
||||
@@ -551,10 +676,18 @@ pub struct LayeredTimeline {
|
||||
/// Must always be acquired before the layer map/individual layer lock
|
||||
/// to avoid deadlock.
|
||||
write_lock: Mutex<()>,
|
||||
|
||||
is_checkpoint_scheduled: Mutex<bool>,
|
||||
last_gc: Mutex<Option<Lsn>>,
|
||||
}
|
||||
|
||||
/// Public interface functions
|
||||
impl Timeline for LayeredTimeline {
|
||||
|
||||
/// Returns `self` as a reference to the concrete `LayeredTimeline` type.
fn upgrade_to_layered_timeline(&self) -> &crate::layered_repository::LayeredTimeline {
|
||||
self
|
||||
}
|
||||
|
||||
fn get_ancestor_lsn(&self) -> Lsn {
|
||||
self.ancestor_lsn
|
||||
}
|
||||
@@ -729,15 +862,23 @@ impl Timeline for LayeredTimeline {
|
||||
/// Public entry point for checkpoint(). All the logic is in the private
|
||||
/// checkpoint_internal function, this public facade just wraps it for
|
||||
/// metrics collection.
|
||||
fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()> {
|
||||
match cconf {
|
||||
CheckpointConfig::Forced => STORAGE_TIME
|
||||
.with_label_values(&["forced checkpoint"])
|
||||
.observe_closure_duration(|| self.checkpoint_internal(0)),
|
||||
CheckpointConfig::Distance(distance) => STORAGE_TIME
|
||||
.with_label_values(&["checkpoint"])
|
||||
.observe_closure_duration(|| self.checkpoint_internal(distance)),
|
||||
}
|
||||
fn checkpoint_forced(&self) -> Result<()> {
|
||||
STORAGE_TIME
|
||||
.with_label_values(&["checkpoint_forced"])
|
||||
//pass checkpoint_distance=0 to force checkpoint
|
||||
.observe_closure_duration(|| self.checkpoint_internal(0, true))
|
||||
}
|
||||
|
||||
fn checkpoint_scheduled(&self) -> Result<()> {
|
||||
|
||||
let result = STORAGE_TIME
|
||||
.with_label_values(&["checkpoint_scheduled"])
|
||||
.observe_closure_duration(|| self.checkpoint_internal(self.conf.checkpoint_distance, false));
|
||||
|
||||
let mut guard = self.is_checkpoint_scheduled.lock().unwrap();
|
||||
*guard = false;
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
fn get_last_record_lsn(&self) -> Lsn {
|
||||
@@ -801,10 +942,6 @@ impl Timeline for LayeredTimeline {
|
||||
_write_guard: self.write_lock.lock().unwrap(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns `self` as a reference to the concrete `LayeredTimeline` type.
fn upgrade_to_layered_timeline(&self) -> &crate::layered_repository::LayeredTimeline {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
impl LayeredTimeline {
|
||||
@@ -835,18 +972,21 @@ impl LayeredTimeline {
|
||||
|
||||
// initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'.
|
||||
last_record_lsn: SeqWait::new(RecordLsn {
|
||||
last: metadata.disk_consistent_lsn(),
|
||||
prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)),
|
||||
last: metadata.disk_consistent_lsn,
|
||||
prev: metadata.prev_record_lsn.unwrap_or(Lsn(0)),
|
||||
}),
|
||||
disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0),
|
||||
disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn.0),
|
||||
|
||||
ancestor_timeline: ancestor,
|
||||
ancestor_lsn: metadata.ancestor_lsn(),
|
||||
ancestor_lsn: metadata.ancestor_lsn,
|
||||
current_logical_size: AtomicUsize::new(current_logical_size),
|
||||
current_logical_size_gauge,
|
||||
upload_relishes,
|
||||
|
||||
write_lock: Mutex::new(()),
|
||||
|
||||
is_checkpoint_scheduled: Mutex::new(false),
|
||||
last_gc: Mutex::new(None),
|
||||
};
|
||||
Ok(timeline)
|
||||
}
|
||||
@@ -1015,7 +1155,7 @@ impl LayeredTimeline {
|
||||
///
|
||||
/// Get a handle to the latest layer for appending.
|
||||
///
|
||||
fn get_layer_for_write(&self, seg: SegmentTag, lsn: Lsn) -> Result<Arc<InMemoryLayer>> {
|
||||
fn get_layer_for_write(&self, seg: SegmentTag, lsn: Lsn) -> Result<Arc<dyn Layer>> {
|
||||
let mut layers = self.layers.lock().unwrap();
|
||||
|
||||
assert!(lsn.is_aligned());
|
||||
@@ -1030,7 +1170,9 @@ impl LayeredTimeline {
|
||||
|
||||
// Do we have a layer open for writing already?
|
||||
let layer;
|
||||
if let Some(open_layer) = layers.get_open(&seg) {
|
||||
if let Some(open_layer_arc) = layers.get_open(&seg) {
|
||||
let open_layer = open_layer_arc.upgrade_to_inmemory_layer().expect("open layer is not an in-memory layer");
|
||||
|
||||
if open_layer.get_start_lsn() > lsn {
|
||||
bail!("unexpected open layer in the future");
|
||||
}
|
||||
@@ -1055,7 +1197,7 @@ impl LayeredTimeline {
|
||||
lsn,
|
||||
)?;
|
||||
} else {
|
||||
return Ok(open_layer);
|
||||
return Ok(open_layer_arc);
|
||||
}
|
||||
}
|
||||
// No writeable layer for this relation. Create one.
|
||||
@@ -1120,7 +1262,7 @@ impl LayeredTimeline {
|
||||
/// Flush to disk all data that was written with the put_* functions
|
||||
///
|
||||
/// NOTE: This has nothing to do with checkpoint in PostgreSQL.
|
||||
fn checkpoint_internal(&self, checkpoint_distance: u64) -> Result<()> {
|
||||
fn checkpoint_internal(&self, checkpoint_distance: u64, forced: bool) -> Result<()> {
|
||||
let mut write_guard = self.write_lock.lock().unwrap();
|
||||
let mut layers = self.layers.lock().unwrap();
|
||||
|
||||
@@ -1146,14 +1288,16 @@ impl LayeredTimeline {
|
||||
// a lot of memory and/or aren't receiving much updates anymore.
|
||||
let mut disk_consistent_lsn = last_record_lsn;
|
||||
|
||||
let mut batch_fsync = BatchFsync::default();
|
||||
|
||||
let mut layer_uploads = Vec::new();
|
||||
while let Some((oldest_layer_id, oldest_layer, oldest_generation)) =
|
||||
layers.peek_oldest_open()
|
||||
{
|
||||
while let Some((oldest_layer_id, oldest_layer_arc, oldest_generation)) = layers.peek_oldest_open() {
|
||||
let oldest_layer = oldest_layer_arc.upgrade_to_inmemory_layer().expect("open layer is not an in-memory layer");
|
||||
|
||||
let oldest_pending_lsn = oldest_layer.get_oldest_pending_lsn();
|
||||
|
||||
if tenant_mgr::shutdown_requested() && !forced {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Does this layer need freezing?
|
||||
//
|
||||
// Write out all in-memory layers that contain WAL older than CHECKPOINT_DISTANCE.
|
||||
@@ -1179,8 +1323,8 @@ impl LayeredTimeline {
|
||||
drop(layers);
|
||||
drop(write_guard);
|
||||
|
||||
let mut this_layer_uploads = self.evict_layer(oldest_layer_id, &mut batch_fsync)?;
|
||||
layer_uploads.append(&mut this_layer_uploads);
|
||||
// Evict the layer
|
||||
self.evict_layer(oldest_layer_id)?;
|
||||
|
||||
write_guard = self.write_lock.lock().unwrap();
|
||||
layers = self.layers.lock().unwrap();
|
||||
@@ -1189,23 +1333,13 @@ impl LayeredTimeline {
|
||||
// Call unload() on all frozen layers, to release memory.
|
||||
// This shouldn't be much memory, as only metadata is slurped
|
||||
// into memory.
|
||||
for layer in layers.iter_historic_layers() {
|
||||
for (_layer_id, layer) in layers.iter_historic_layers() {
|
||||
layer.unload()?;
|
||||
}
|
||||
|
||||
drop(layers);
|
||||
drop(write_guard);
|
||||
|
||||
if !layer_uploads.is_empty() {
|
||||
// We must fsync the timeline dir to ensure the directory entries for
|
||||
// new layer files are durable
|
||||
let timeline_dir =
|
||||
File::open(self.conf.timeline_path(&self.timelineid, &self.tenantid))?;
|
||||
batch_fsync.add(timeline_dir)?;
|
||||
}
|
||||
|
||||
batch_fsync.done()?;
|
||||
|
||||
// If we were able to advance 'disk_consistent_lsn', save it the metadata file.
|
||||
// After crash, we will restart WAL streaming and processing from that point.
|
||||
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
|
||||
@@ -1226,12 +1360,12 @@ impl LayeredTimeline {
|
||||
|
||||
let ancestor_timelineid = self.ancestor_timeline.as_ref().map(|x| x.timelineid);
|
||||
|
||||
let metadata = TimelineMetadata::new(
|
||||
let metadata = TimelineMetadata {
|
||||
disk_consistent_lsn,
|
||||
ondisk_prev_record_lsn,
|
||||
ancestor_timelineid,
|
||||
self.ancestor_lsn,
|
||||
);
|
||||
prev_record_lsn: ondisk_prev_record_lsn,
|
||||
ancestor_timeline: ancestor_timelineid,
|
||||
ancestor_lsn: self.ancestor_lsn,
|
||||
};
|
||||
|
||||
LayeredRepository::save_metadata(
|
||||
self.conf,
|
||||
@@ -1251,53 +1385,129 @@ impl LayeredTimeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn evict_layer(&self, layer_id: LayerId, batch_fsync: &mut BatchFsync) -> Result<Vec<PathBuf>> {
|
||||
// Mark the layer as no longer accepting writes and record the end_lsn.
|
||||
// This happens in-place, no new layers are created now.
|
||||
// We call `get_last_record_lsn` again, which may be different from the
|
||||
// original load, as we may have released the write lock since then.
|
||||
pub fn schedule_checkpoint_if_needed(&self) -> Result<()> {
|
||||
|
||||
let mut guard = self.is_checkpoint_scheduled.lock().unwrap();
|
||||
if *guard == true {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let RecordLsn {
|
||||
last: last_record_lsn,
|
||||
prev: _prev_record_lsn,
|
||||
} = self.last_record_lsn.load();
|
||||
|
||||
let mut layers = self.layers.lock().unwrap();
|
||||
if let Some((_oldest_layer_id, oldest_layer_arc, _oldest_generation)) = layers.peek_oldest_open() {
|
||||
let oldest_layer = oldest_layer_arc.upgrade_to_inmemory_layer().expect("open layer is not an in-memory layer");
|
||||
let oldest_pending_lsn = oldest_layer.get_oldest_pending_lsn();
|
||||
let distance = last_record_lsn.widening_sub(oldest_pending_lsn);
|
||||
if distance > self.conf.checkpoint_distance.into() {
|
||||
schedule_job(GlobalJob::CheckpointTimeline(self.tenantid, self.timelineid));
|
||||
*guard = true;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn schedule_gc_if_needed(&self) -> Result<()> {
|
||||
|
||||
let RecordLsn {
|
||||
last: last_record_lsn,
|
||||
prev: _prev_record_lsn,
|
||||
} = self.last_record_lsn.load();
|
||||
|
||||
let gc_needed = {
|
||||
let last_gc = self.last_gc.lock().unwrap();
|
||||
|
||||
if let Some(last_gc) = *last_gc {
|
||||
let distance = last_record_lsn.widening_sub(last_gc);
|
||||
if distance > std::cmp::max(10*1024*1024, self.conf.gc_horizon / 2) as i128 {
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
} else {
|
||||
true
|
||||
}
|
||||
};
|
||||
|
||||
if !gc_needed {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let repo = tenant_mgr::get_repository_for_tenant(self.tenantid)?;
|
||||
let repo = repo.upgrade_to_layered_repository();
|
||||
|
||||
let mut gc_scheduled = repo.is_gc_scheduled.lock().unwrap();
|
||||
if *gc_scheduled == true {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
schedule_job(GlobalJob::GarbageCollect(self.tenantid));
|
||||
*gc_scheduled = true;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn evict_layer(&self, layer_id: LayerId) -> Result<()> {
|
||||
let mut write_guard = self.write_lock.lock().unwrap();
|
||||
let mut layers = self.layers.lock().unwrap();
|
||||
|
||||
let mut layer_uploads = Vec::new();
|
||||
if let Some(victim_layer_arc) = layers.get_with_id(layer_id) {
|
||||
|
||||
let global_layer_map = GLOBAL_LAYER_MAP.read().unwrap();
|
||||
if let Some(oldest_layer) = global_layer_map.get(&layer_id) {
|
||||
drop(global_layer_map);
|
||||
oldest_layer.freeze(self.get_last_record_lsn());
|
||||
if let Some(victim_layer) = victim_layer_arc.upgrade_to_inmemory_layer() {
|
||||
|
||||
// The layer is no longer open, update the layer map to reflect this.
|
||||
// We will replace it with on-disk historics below.
|
||||
layers.remove_open(layer_id);
|
||||
layers.insert_historic(oldest_layer.clone());
|
||||
// Mark the layer as no longer accepting writes and record the end_lsn.
|
||||
// This happens in-place, no new layers are created now.
|
||||
// We call `get_last_record_lsn` again, which may be different from the
|
||||
// original load, as we may have released the write lock since then.
|
||||
victim_layer.freeze(self.get_last_record_lsn());
|
||||
|
||||
// Write the now-frozen layer to disk. That could take a while, so release the lock while we do it
|
||||
drop(layers);
|
||||
drop(write_guard);
|
||||
// The layer is no longer open, update the layer map to reflect this.
|
||||
// We will replace it with on-disk historics below.
|
||||
layers.remove(layer_id);
|
||||
|
||||
let new_historics = oldest_layer.write_to_disk(self, batch_fsync)?;
|
||||
let frozen_layer_id = layers.insert_historic(victim_layer_arc.clone());
|
||||
|
||||
write_guard = self.write_lock.lock().unwrap();
|
||||
layers = self.layers.lock().unwrap();
|
||||
// Write the now-frozen layer to disk. That could take a while, so release the lock while we do it
|
||||
drop(layers);
|
||||
drop(write_guard);
|
||||
|
||||
// Finally, replace the frozen in-memory layer with the new on-disk layers
|
||||
layers.remove_historic(oldest_layer);
|
||||
let new_historics = victim_layer.write_to_disk(self)?;
|
||||
let created_historics = !new_historics.is_empty();
|
||||
|
||||
// Add the historics to the LayerMap
|
||||
for delta_layer in new_historics.delta_layers {
|
||||
layer_uploads.push(delta_layer.path());
|
||||
layers.insert_historic(Arc::new(delta_layer));
|
||||
}
|
||||
for image_layer in new_historics.image_layers {
|
||||
layer_uploads.push(image_layer.path());
|
||||
layers.insert_historic(Arc::new(image_layer));
|
||||
write_guard = self.write_lock.lock().unwrap();
|
||||
layers = self.layers.lock().unwrap();
|
||||
|
||||
// Finally, replace the frozen in-memory layer with the new on-disk layers
|
||||
layers.remove(frozen_layer_id);
|
||||
|
||||
// Add the historics to the LayerMap
|
||||
for delta_layer in new_historics.delta_layers {
|
||||
// FIXME layer_uploads.push(delta_layer.path());
|
||||
layers.insert_historic(Arc::new(delta_layer));
|
||||
}
|
||||
for image_layer in new_historics.image_layers {
|
||||
// FIXME layer_uploads.push(image_layer.path());
|
||||
layers.insert_historic(Arc::new(image_layer));
|
||||
}
|
||||
|
||||
if created_historics {
|
||||
// We must fsync the timeline dir to ensure the directory entries for
|
||||
// new layer files are durable
|
||||
//
|
||||
// TODO: it's inefficient to do this after every eviction, if we're evicting
|
||||
// a lot of layers.
|
||||
let timeline_dir =
|
||||
File::open(self.conf.timeline_path(&self.timelineid, &self.tenantid))?;
|
||||
timeline_dir.sync_all()?;
|
||||
}
|
||||
drop(layers);
|
||||
drop(write_guard);
|
||||
}
|
||||
}
|
||||
drop(layers);
|
||||
drop(write_guard);
|
||||
|
||||
Ok(layer_uploads)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
@@ -1332,7 +1542,7 @@ impl LayeredTimeline {
|
||||
|
||||
debug!("retain_lsns: {:?}", retain_lsns);
|
||||
|
||||
let mut layers_to_remove: Vec<Arc<dyn Layer>> = Vec::new();
|
||||
let mut layers_to_remove: Vec<LayerId> = Vec::new();
|
||||
|
||||
// Scan all on-disk layers in the timeline.
|
||||
//
|
||||
@@ -1343,7 +1553,7 @@ impl LayeredTimeline {
|
||||
// 4. this layer doesn't serve as a tombstone for some older layer;
|
||||
//
|
||||
let mut layers = self.layers.lock().unwrap();
|
||||
'outer: for l in layers.iter_historic_layers() {
|
||||
'outer: for (layer_id, l) in layers.iter_historic_layers() {
|
||||
let seg = l.get_seg_tag();
|
||||
|
||||
if seg.rel.is_relation() {
|
||||
@@ -1354,7 +1564,7 @@ impl LayeredTimeline {
|
||||
|
||||
// 1. Is it newer than cutoff point?
|
||||
if l.get_end_lsn() > cutoff {
|
||||
info!(
|
||||
trace!(
|
||||
"keeping {} {}-{} because it's newer than cutoff {}",
|
||||
seg,
|
||||
l.get_start_lsn(),
|
||||
@@ -1373,7 +1583,7 @@ impl LayeredTimeline {
|
||||
for retain_lsn in &retain_lsns {
|
||||
// start_lsn is inclusive and end_lsn is exclusive
|
||||
if l.get_start_lsn() <= *retain_lsn && *retain_lsn < l.get_end_lsn() {
|
||||
info!(
|
||||
trace!(
|
||||
"keeping {} {}-{} because it's needed by branch point {}",
|
||||
seg,
|
||||
l.get_start_lsn(),
|
||||
@@ -1392,7 +1602,7 @@ impl LayeredTimeline {
|
||||
// 3. Is there a later on-disk layer for this relation?
|
||||
if !l.is_dropped() && !layers.newer_image_layer_exists(l.get_seg_tag(), l.get_end_lsn())
|
||||
{
|
||||
info!(
|
||||
trace!(
|
||||
"keeping {} {}-{} because it is the latest layer",
|
||||
seg,
|
||||
l.get_start_lsn(),
|
||||
@@ -1461,7 +1671,7 @@ impl LayeredTimeline {
|
||||
}
|
||||
|
||||
if is_tombstone {
|
||||
info!(
|
||||
trace!(
|
||||
"keeping {} {}-{} because this layer servers as a tombstome for older layer",
|
||||
seg,
|
||||
l.get_start_lsn(),
|
||||
@@ -1485,27 +1695,32 @@ impl LayeredTimeline {
|
||||
l.get_end_lsn(),
|
||||
l.is_dropped()
|
||||
);
|
||||
layers_to_remove.push(Arc::clone(&l));
|
||||
layers_to_remove.push(layer_id);
|
||||
}
|
||||
|
||||
// Actually delete the layers from disk and remove them from the map.
|
||||
// (couldn't do this in the loop above, because you cannot modify a collection
|
||||
// while iterating it. BTreeMap::retain() would be another option)
|
||||
for doomed_layer in layers_to_remove {
|
||||
doomed_layer.delete()?;
|
||||
layers.remove_historic(doomed_layer.clone());
|
||||
for doomed_layer_id in layers_to_remove {
|
||||
if let Some(doomed_layer) = layers.get_with_id(doomed_layer_id) {
|
||||
|
||||
match (
|
||||
doomed_layer.is_dropped(),
|
||||
doomed_layer.get_seg_tag().rel.is_relation(),
|
||||
) {
|
||||
(true, true) => result.ondisk_relfiles_dropped += 1,
|
||||
(true, false) => result.ondisk_nonrelfiles_dropped += 1,
|
||||
(false, true) => result.ondisk_relfiles_removed += 1,
|
||||
(false, false) => result.ondisk_nonrelfiles_removed += 1,
|
||||
doomed_layer.delete()?;
|
||||
layers.remove(doomed_layer_id);
|
||||
match (
|
||||
doomed_layer.is_dropped(),
|
||||
doomed_layer.get_seg_tag().rel.is_relation(),
|
||||
) {
|
||||
(true, true) => result.ondisk_relfiles_dropped += 1,
|
||||
(true, false) => result.ondisk_nonrelfiles_dropped += 1,
|
||||
(false, true) => result.ondisk_relfiles_removed += 1,
|
||||
(false, false) => result.ondisk_nonrelfiles_removed += 1,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut guard = self.last_gc.lock().unwrap();
|
||||
*guard = Some(cutoff);
|
||||
|
||||
result.elapsed = now.elapsed();
|
||||
Ok(result)
|
||||
}
|
||||
@@ -1698,9 +1913,10 @@ impl<'a> TimelineWriter for LayeredTimelineWriter<'a> {
|
||||
|
||||
let seg = SegmentTag::from_blknum(rel, blknum);
|
||||
let layer = self.tl.get_layer_for_write(seg, lsn)?;
|
||||
let delta_size = layer.put_wal_record(lsn, blknum, rec);
|
||||
let delta_size = layer.upgrade_to_inmemory_layer().unwrap().put_wal_record(lsn, blknum, rec);
|
||||
self.tl
|
||||
.increase_current_logical_size(delta_size * BLCKSZ as u32);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1717,7 +1933,7 @@ impl<'a> TimelineWriter for LayeredTimelineWriter<'a> {
|
||||
let seg = SegmentTag::from_blknum(rel, blknum);
|
||||
|
||||
let layer = self.tl.get_layer_for_write(seg, lsn)?;
|
||||
let delta_size = layer.put_page_image(blknum, lsn, img);
|
||||
let delta_size = layer.upgrade_to_inmemory_layer().unwrap().put_page_image(blknum, lsn, img);
|
||||
|
||||
self.tl
|
||||
.increase_current_logical_size(delta_size * BLCKSZ as u32);
|
||||
@@ -1762,7 +1978,7 @@ impl<'a> TimelineWriter for LayeredTimelineWriter<'a> {
|
||||
};
|
||||
|
||||
let layer = self.tl.get_layer_for_write(seg, lsn)?;
|
||||
layer.drop_segment(lsn);
|
||||
layer.upgrade_to_inmemory_layer().unwrap().drop_segment(lsn);
|
||||
}
|
||||
|
||||
// Truncate the last remaining segment to the specified size
|
||||
@@ -1772,7 +1988,7 @@ impl<'a> TimelineWriter for LayeredTimelineWriter<'a> {
|
||||
segno: last_remain_seg,
|
||||
};
|
||||
let layer = self.tl.get_layer_for_write(seg, lsn)?;
|
||||
layer.put_truncation(lsn, relsize % RELISH_SEG_SIZE)
|
||||
layer.upgrade_to_inmemory_layer().unwrap().put_truncation(lsn, relsize % RELISH_SEG_SIZE)
|
||||
}
|
||||
self.tl
|
||||
.decrease_current_logical_size((oldsize - relsize) * BLCKSZ as u32);
|
||||
@@ -1800,7 +2016,7 @@ impl<'a> TimelineWriter for LayeredTimelineWriter<'a> {
|
||||
segno: remove_segno,
|
||||
};
|
||||
let layer = self.tl.get_layer_for_write(seg, lsn)?;
|
||||
layer.drop_segment(lsn);
|
||||
layer.upgrade_to_inmemory_layer().unwrap().drop_segment(lsn);
|
||||
}
|
||||
self.tl
|
||||
.decrease_current_logical_size(oldsize * BLCKSZ as u32);
|
||||
@@ -1814,7 +2030,7 @@ impl<'a> TimelineWriter for LayeredTimelineWriter<'a> {
|
||||
// TODO handle TwoPhase relishes
|
||||
let seg = SegmentTag::from_blknum(rel, 0);
|
||||
let layer = self.tl.get_layer_for_write(seg, lsn)?;
|
||||
layer.drop_segment(lsn);
|
||||
layer.upgrade_to_inmemory_layer().unwrap().drop_segment(lsn);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -1848,6 +2064,15 @@ pub fn dump_layerfile_from_path(path: &Path) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn metadata_path(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
) -> PathBuf {
|
||||
conf.timeline_path(&timelineid, &tenantid)
|
||||
.join(METADATA_FILE_NAME)
|
||||
}
|
||||
|
||||
/// Add a suffix to a layer file's name: .{num}.old
|
||||
/// Uses the first available num (starts at 0)
|
||||
fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> {
|
||||
@@ -1867,36 +2092,3 @@ fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> {
|
||||
path
|
||||
))
|
||||
}
|
||||
|
||||
//----- Global layer management
|
||||
|
||||
/// Check if too much memory is being used by open layers. If so, evict
|
||||
pub fn evict_layer_if_needed(conf: &PageServerConf) -> Result<()> {
|
||||
// Keep evicting layers until we are below the memory threshold.
|
||||
let mut global_layer_map = GLOBAL_LAYER_MAP.read().unwrap();
|
||||
while let Some((layer_id, layer)) = global_layer_map.find_victim_if_needed(conf.open_mem_limit)
|
||||
{
|
||||
drop(global_layer_map);
|
||||
let tenantid = layer.get_tenant_id();
|
||||
let timelineid = layer.get_timeline_id();
|
||||
|
||||
let _entered =
|
||||
info_span!("global evict", timeline = %timelineid, tenant = %tenantid).entered();
|
||||
info!("evicting {}", layer.filename().display());
|
||||
drop(layer);
|
||||
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
|
||||
|
||||
let mut batch_fsync = BatchFsync::default();
|
||||
|
||||
timeline
|
||||
.upgrade_to_layered_timeline()
|
||||
.evict_layer(layer_id, &mut batch_fsync)?;
|
||||
|
||||
batch_fsync.done()?;
|
||||
|
||||
global_layer_map = GLOBAL_LAYER_MAP.read().unwrap();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -6,8 +6,8 @@ use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct BlobRange {
|
||||
offset: u64,
|
||||
size: usize,
|
||||
pub offset: u64,
|
||||
pub size: usize,
|
||||
}
|
||||
|
||||
pub fn read_blob(reader: &BoundedReader<&'_ File>, range: &BlobRange) -> Result<Vec<u8>> {
|
||||
|
||||
@@ -42,13 +42,13 @@ use crate::layered_repository::filename::{DeltaFileName, PathOrConf};
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag,
|
||||
};
|
||||
use crate::vfd::VirtualFile;
|
||||
use crate::waldecoder;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use zenith_utils::batch_fsync::BatchFsync;
|
||||
use zenith_utils::vec_map::VecMap;
|
||||
// avoid binding to Write (conflicts with std::io::Write)
|
||||
// while being able to use std::fmt::Write's methods
|
||||
@@ -146,6 +146,8 @@ pub struct DeltaLayerInner {
|
||||
|
||||
/// `relsizes` tracks the size of the relation at different points in time.
|
||||
relsizes: VecMap<Lsn, u32>,
|
||||
|
||||
vfile: VirtualFile,
|
||||
}
|
||||
|
||||
impl Layer for DeltaLayer {
|
||||
@@ -191,9 +193,11 @@ impl Layer for DeltaLayer {
|
||||
{
|
||||
// Open the file and lock the metadata in memory
|
||||
// TODO: avoid opening the file for each read
|
||||
let (_path, book) = self.open_book()?;
|
||||
let mut inner = self.load()?;
|
||||
let file = inner.vfile.open()?;
|
||||
let book = Book::new(file)?;
|
||||
|
||||
let page_version_reader = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
|
||||
let inner = self.load()?;
|
||||
|
||||
// Scan the metadata BTreeMap backwards, starting from the given entry.
|
||||
let minkey = (blknum, Lsn(0));
|
||||
@@ -226,6 +230,9 @@ impl Layer for DeltaLayer {
|
||||
}
|
||||
|
||||
// release metadata lock and close the file
|
||||
|
||||
let file = book.close();
|
||||
inner.vfile.cache(file);
|
||||
}
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
@@ -365,12 +372,23 @@ impl DeltaLayer {
|
||||
dropped: bool,
|
||||
page_versions: impl Iterator<Item = (u32, Lsn, &'a PageVersion)>,
|
||||
relsizes: VecMap<Lsn, u32>,
|
||||
batch_fsync: &mut BatchFsync,
|
||||
) -> Result<DeltaLayer> {
|
||||
if seg.rel.is_blocky() {
|
||||
assert!(!relsizes.is_empty());
|
||||
}
|
||||
|
||||
let path = Self::path_for(
|
||||
&PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
&DeltaFileName {
|
||||
seg: seg,
|
||||
start_lsn: start_lsn,
|
||||
end_lsn: end_lsn,
|
||||
dropped: dropped,
|
||||
}
|
||||
);
|
||||
|
||||
let delta_layer = DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
@@ -383,6 +401,7 @@ impl DeltaLayer {
|
||||
loaded: true,
|
||||
page_version_metas: VecMap::default(),
|
||||
relsizes,
|
||||
vfile: VirtualFile::new(&path),
|
||||
}),
|
||||
};
|
||||
let mut inner = delta_layer.inner.lock().unwrap();
|
||||
@@ -438,8 +457,7 @@ impl DeltaLayer {
|
||||
|
||||
// This flushes the underlying 'buf_writer'.
|
||||
let writer = book.close()?;
|
||||
let file = writer.into_inner()?;
|
||||
batch_fsync.add(file)?;
|
||||
writer.get_ref().sync_all()?;
|
||||
|
||||
trace!("saved {}", &path.display());
|
||||
|
||||
@@ -503,11 +521,9 @@ impl DeltaLayer {
|
||||
|
||||
debug!("loaded from {}", &path.display());
|
||||
|
||||
*inner = DeltaLayerInner {
|
||||
loaded: true,
|
||||
page_version_metas,
|
||||
relsizes,
|
||||
};
|
||||
inner.loaded = true;
|
||||
inner.page_version_metas = page_version_metas;
|
||||
inner.relsizes = relsizes;
|
||||
|
||||
Ok(inner)
|
||||
}
|
||||
@@ -519,6 +535,13 @@ impl DeltaLayer {
|
||||
tenantid: ZTenantId,
|
||||
filename: &DeltaFileName,
|
||||
) -> DeltaLayer {
|
||||
let path = Self::path_for(
|
||||
&PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
&filename,
|
||||
);
|
||||
|
||||
DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
@@ -531,6 +554,7 @@ impl DeltaLayer {
|
||||
loaded: false,
|
||||
page_version_metas: VecMap::default(),
|
||||
relsizes: VecMap::default(),
|
||||
vfile: VirtualFile::new(&path),
|
||||
}),
|
||||
}
|
||||
}
|
||||
@@ -541,7 +565,7 @@ impl DeltaLayer {
|
||||
pub fn new_for_path(path: &Path, book: &Book<File>) -> Result<Self> {
|
||||
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
|
||||
let summary = Summary::des(&chapter)?;
|
||||
|
||||
|
||||
Ok(DeltaLayer {
|
||||
path_or_conf: PathOrConf::Path(path.to_path_buf()),
|
||||
timelineid: summary.timelineid,
|
||||
@@ -554,6 +578,7 @@ impl DeltaLayer {
|
||||
loaded: false,
|
||||
page_version_metas: VecMap::default(),
|
||||
relsizes: VecMap::default(),
|
||||
vfile: VirtualFile::new(path),
|
||||
}),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -13,7 +13,7 @@ use anyhow::Result;
|
||||
use log::*;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use super::metadata::METADATA_FILE_NAME;
|
||||
use super::METADATA_FILE_NAME;
|
||||
|
||||
// Note: LayeredTimeline::load_layer_map() relies on this sort order
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
|
||||
|
||||
@@ -1,209 +0,0 @@
|
||||
//!
|
||||
//! Global registry of open layers.
|
||||
//!
|
||||
//! Whenever a new in-memory layer is created to hold incoming WAL, it is registered
|
||||
//! in [`GLOBAL_LAYER_MAP`], so that we can keep track of the total number of
|
||||
//! in-memory layers in the system, and know when we need to evict some to release
|
||||
//! memory.
|
||||
//!
|
||||
//! Each layer is assigned a unique ID when it's registered in the global registry.
|
||||
//! The ID can be used to relocate the layer later, without having to hold locks.
|
||||
//!
|
||||
|
||||
use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering};
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
use super::inmemory_layer::InMemoryLayer;
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
const MAX_USAGE_COUNT: u8 = 5;
|
||||
|
||||
lazy_static! {
|
||||
pub static ref GLOBAL_LAYER_MAP: RwLock<OpenLayers> = RwLock::new(OpenLayers::default());
|
||||
}
|
||||
|
||||
///
|
||||
/// How much memory is being used by all the open layers? This is used to trigger
|
||||
/// freezing and evicting an open layer to disk.
|
||||
///
|
||||
/// This is only a rough approximation, it leaves out a lot of things like malloc()
|
||||
/// overhead. But as long there is enough "slop" and it's not set too close to the RAM
|
||||
/// size on the system, it's good enough.
|
||||
pub static GLOBAL_OPEN_MEM_USAGE: AtomicUsize = AtomicUsize::new(0);
|
||||
|
||||
// TODO these types can probably be smaller
/// Handle to a layer registered in [`OpenLayers`].
///
/// It pairs the index of the slot holding the layer with the slot's tag at the
/// time the ID was handed out. Lookups compare the tag, so an ID whose slot has
/// since been vacated no longer resolves.
#[derive(PartialEq, Eq, Clone, Copy)]
pub struct LayerId {
    /// Position of the slot in the `OpenLayers::slots` vector.
    index: usize,
    /// Copy of the slot's tag, to avoid the ABA problem.
    tag: u64,
}
|
||||
|
||||
enum SlotData {
|
||||
Occupied(Arc<InMemoryLayer>),
|
||||
/// Vacant slots form a linked list, the value is the index
|
||||
/// of the next vacant slot in the list.
|
||||
Vacant(Option<usize>),
|
||||
}
|
||||
|
||||
struct Slot {
|
||||
tag: u64,
|
||||
data: SlotData,
|
||||
usage_count: AtomicU8, // for clock algorithm
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct OpenLayers {
|
||||
slots: Vec<Slot>,
|
||||
num_occupied: usize,
|
||||
next_victim: AtomicUsize,
|
||||
|
||||
// Head of free-slot list.
|
||||
next_empty_slot_idx: Option<usize>,
|
||||
}
|
||||
|
||||
impl OpenLayers {
|
||||
pub fn insert(&mut self, layer: Arc<InMemoryLayer>) -> LayerId {
|
||||
let slot_idx = match self.next_empty_slot_idx {
|
||||
Some(slot_idx) => slot_idx,
|
||||
None => {
|
||||
let idx = self.slots.len();
|
||||
self.slots.push(Slot {
|
||||
tag: 0,
|
||||
data: SlotData::Vacant(None),
|
||||
usage_count: AtomicU8::new(0),
|
||||
});
|
||||
idx
|
||||
}
|
||||
};
|
||||
let slots_len = self.slots.len();
|
||||
|
||||
let slot = &mut self.slots[slot_idx];
|
||||
|
||||
match slot.data {
|
||||
SlotData::Occupied(_) => {
|
||||
panic!("an occupied slot was in the free list");
|
||||
}
|
||||
SlotData::Vacant(next_empty_slot_idx) => {
|
||||
self.next_empty_slot_idx = next_empty_slot_idx;
|
||||
}
|
||||
}
|
||||
|
||||
slot.data = SlotData::Occupied(layer);
|
||||
slot.usage_count.store(1, Ordering::Relaxed);
|
||||
|
||||
self.num_occupied += 1;
|
||||
assert!(self.num_occupied <= slots_len);
|
||||
|
||||
LayerId {
|
||||
index: slot_idx,
|
||||
tag: slot.tag,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, layer_id: &LayerId) -> Option<Arc<InMemoryLayer>> {
|
||||
let slot = self.slots.get(layer_id.index)?; // TODO should out of bounds indexes just panic?
|
||||
if slot.tag != layer_id.tag {
|
||||
return None;
|
||||
}
|
||||
|
||||
if let SlotData::Occupied(layer) = &slot.data {
|
||||
let _ = slot.usage_count.fetch_update(
|
||||
Ordering::Relaxed,
|
||||
Ordering::Relaxed,
|
||||
|old_usage_count| {
|
||||
if old_usage_count < MAX_USAGE_COUNT {
|
||||
Some(old_usage_count + 1)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
},
|
||||
);
|
||||
Some(Arc::clone(layer))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Find a victim layer to evict, if the total memory usage of all open layers
|
||||
/// is larger than 'limit'
|
||||
pub fn find_victim_if_needed(&self, limit: usize) -> Option<(LayerId, Arc<InMemoryLayer>)> {
|
||||
let mem_usage = GLOBAL_OPEN_MEM_USAGE.load(Ordering::Relaxed);
|
||||
|
||||
if mem_usage > limit {
|
||||
self.find_victim()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn find_victim(&self) -> Option<(LayerId, Arc<InMemoryLayer>)> {
|
||||
if self.num_occupied == 0 {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Run the clock algorithm.
|
||||
//
|
||||
// FIXME: It's theoretically possible that a constant stream of get() requests
|
||||
// comes in faster than we advance the clock hand, so that this never finishes.
|
||||
loop {
|
||||
// FIXME: Because we interpret the clock hand variable modulo slots.len(), the
|
||||
// hand effectively jumps to a more or less random place whenever the array is
|
||||
// expanded. That's relatively harmless, it just leads to a non-optimal choice
|
||||
// of victim. Also, in a server that runs for long enough, the array should reach
|
||||
// a steady-state size and not grow anymore.
|
||||
let next_victim = self.next_victim.fetch_add(1, Ordering::Relaxed) % self.slots.len();
|
||||
|
||||
let slot = &self.slots[next_victim];
|
||||
|
||||
if let SlotData::Occupied(data) = &slot.data {
|
||||
fn update_fn(old_usage_count: u8) -> Option<u8> {
|
||||
if old_usage_count > 0 {
|
||||
Some(old_usage_count - 1)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
if slot
|
||||
.usage_count
|
||||
.fetch_update(Ordering::Relaxed, Ordering::Relaxed, update_fn)
|
||||
.is_err()
|
||||
{
|
||||
// Found a slot with usage_count == 0. Return it.
|
||||
return Some((
|
||||
LayerId {
|
||||
index: next_victim,
|
||||
tag: slot.tag,
|
||||
},
|
||||
Arc::clone(data),
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO this won't be a public API in the future
|
||||
pub fn remove(&mut self, layer_id: &LayerId) {
|
||||
let slot = &mut self.slots[layer_id.index];
|
||||
|
||||
if slot.tag != layer_id.tag {
|
||||
return;
|
||||
}
|
||||
|
||||
match &slot.data {
|
||||
SlotData::Occupied(_layer) => {
|
||||
// TODO evict the layer
|
||||
}
|
||||
SlotData::Vacant(_) => unimplemented!(),
|
||||
}
|
||||
|
||||
slot.data = SlotData::Vacant(self.next_empty_slot_idx);
|
||||
self.next_empty_slot_idx = Some(layer_id.index);
|
||||
|
||||
assert!(self.num_occupied > 0);
|
||||
self.num_occupied -= 1;
|
||||
|
||||
slot.tag = slot.tag.wrapping_add(1);
|
||||
}
|
||||
}
|
||||
@@ -29,6 +29,7 @@ use crate::layered_repository::LayeredTimeline;
|
||||
use crate::layered_repository::RELISH_SEG_SIZE;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use crate::vfd::VirtualFile;
|
||||
use anyhow::{anyhow, bail, ensure, Result};
|
||||
use bytes::Bytes;
|
||||
use log::*;
|
||||
@@ -36,10 +37,9 @@ use serde::{Deserialize, Serialize};
|
||||
use std::convert::TryInto;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::io::{BufWriter, Read, Seek, SeekFrom, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Mutex, MutexGuard};
|
||||
use zenith_utils::batch_fsync::BatchFsync;
|
||||
|
||||
use bookfile::{Book, BookWriter};
|
||||
|
||||
@@ -111,6 +111,8 @@ pub struct ImageLayerInner {
|
||||
|
||||
/// Derived from filename and bookfile chapter metadata
|
||||
image_type: ImageType,
|
||||
|
||||
vfile: VirtualFile,
|
||||
}
|
||||
|
||||
impl Layer for ImageLayer {
|
||||
@@ -152,11 +154,12 @@ impl Layer for ImageLayer {
|
||||
) -> Result<PageReconstructResult> {
|
||||
assert!(lsn >= self.lsn);
|
||||
|
||||
let inner = self.load()?;
|
||||
let mut inner = self.load()?;
|
||||
|
||||
let base_blknum = blknum % RELISH_SEG_SIZE;
|
||||
|
||||
let (_path, book) = self.open_book()?;
|
||||
let mut file = inner.vfile.open()?;
|
||||
let mut book = Book::new(&mut file)?;
|
||||
|
||||
let buf = match &inner.image_type {
|
||||
ImageType::Blocky { num_blocks } => {
|
||||
@@ -167,17 +170,20 @@ impl Layer for ImageLayer {
|
||||
let mut buf = vec![0u8; BLOCK_SIZE];
|
||||
let offset = BLOCK_SIZE as u64 * base_blknum as u64;
|
||||
|
||||
let chapter = book.chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
|
||||
chapter.read_exact_at(&mut buf, offset)?;
|
||||
let mut chapter = book.exclusive_chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
|
||||
chapter.seek(SeekFrom::Start(offset))?;
|
||||
chapter.read_exact(&mut buf)?;
|
||||
|
||||
buf
|
||||
}
|
||||
ImageType::NonBlocky => {
|
||||
ensure!(base_blknum == 0);
|
||||
book.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?.into_vec()
|
||||
book.exclusive_read_chapter(NONBLOCKY_IMAGE_CHAPTER)?.into_vec()
|
||||
}
|
||||
};
|
||||
|
||||
inner.vfile.cache(file);
|
||||
|
||||
reconstruct_data.page_img = Some(Bytes::from(buf));
|
||||
Ok(PageReconstructResult::Complete)
|
||||
}
|
||||
@@ -262,7 +268,6 @@ impl ImageLayer {
|
||||
seg: SegmentTag,
|
||||
lsn: Lsn,
|
||||
base_images: Vec<Bytes>,
|
||||
batch_fsync: &mut BatchFsync,
|
||||
) -> Result<ImageLayer> {
|
||||
let image_type = if seg.rel.is_blocky() {
|
||||
let num_blocks: u32 = base_images.len().try_into()?;
|
||||
@@ -272,6 +277,16 @@ impl ImageLayer {
|
||||
ImageType::NonBlocky
|
||||
};
|
||||
|
||||
let path = Self::path_for(
|
||||
&PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
&ImageFileName {
|
||||
seg: seg,
|
||||
lsn: lsn,
|
||||
}
|
||||
);
|
||||
|
||||
let layer = ImageLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
@@ -281,12 +296,12 @@ impl ImageLayer {
|
||||
inner: Mutex::new(ImageLayerInner {
|
||||
loaded: true,
|
||||
image_type: image_type.clone(),
|
||||
vfile: VirtualFile::new(&path),
|
||||
}),
|
||||
};
|
||||
let inner = layer.inner.lock().unwrap();
|
||||
|
||||
// Write the images into a file
|
||||
let path = layer.path();
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let file = File::create(&path)?;
|
||||
@@ -322,8 +337,7 @@ impl ImageLayer {
|
||||
|
||||
// This flushes the underlying 'buf_writer'.
|
||||
let writer = book.close()?;
|
||||
let file = writer.into_inner()?;
|
||||
batch_fsync.add(file)?;
|
||||
writer.get_ref().sync_all()?;
|
||||
|
||||
trace!("saved {}", path.display());
|
||||
|
||||
@@ -339,7 +353,6 @@ impl ImageLayer {
|
||||
timeline: &LayeredTimeline,
|
||||
src: &dyn Layer,
|
||||
lsn: Lsn,
|
||||
batch_fsync: &mut BatchFsync,
|
||||
) -> Result<ImageLayer> {
|
||||
let seg = src.get_seg_tag();
|
||||
let timelineid = timeline.timelineid;
|
||||
@@ -368,15 +381,7 @@ impl ImageLayer {
|
||||
base_images.push(img);
|
||||
}
|
||||
|
||||
Self::create(
|
||||
conf,
|
||||
timelineid,
|
||||
timeline.tenantid,
|
||||
seg,
|
||||
lsn,
|
||||
base_images,
|
||||
batch_fsync,
|
||||
)
|
||||
Self::create(conf, timelineid, timeline.tenantid, seg, lsn, base_images)
|
||||
}
|
||||
|
||||
///
|
||||
@@ -390,7 +395,8 @@ impl ImageLayer {
|
||||
return Ok(inner);
|
||||
}
|
||||
|
||||
let (path, book) = self.open_book()?;
|
||||
|
||||
let book = Book::new(inner.vfile.open()?)?;
|
||||
|
||||
match &self.path_or_conf {
|
||||
PathOrConf::Conf(_) => {
|
||||
@@ -428,12 +434,10 @@ impl ImageLayer {
|
||||
ImageType::NonBlocky
|
||||
};
|
||||
|
||||
debug!("loaded from {}", &path.display());
|
||||
debug!("loaded from {}", &self.path().display());
|
||||
|
||||
*inner = ImageLayerInner {
|
||||
loaded: true,
|
||||
image_type,
|
||||
};
|
||||
inner.loaded = true;
|
||||
inner.image_type = image_type;
|
||||
|
||||
Ok(inner)
|
||||
}
|
||||
@@ -454,6 +458,14 @@ impl ImageLayer {
|
||||
tenantid: ZTenantId,
|
||||
filename: &ImageFileName,
|
||||
) -> ImageLayer {
|
||||
|
||||
let path = Self::path_for(
|
||||
&PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
filename,
|
||||
);
|
||||
|
||||
ImageLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
@@ -463,6 +475,7 @@ impl ImageLayer {
|
||||
inner: Mutex::new(ImageLayerInner {
|
||||
loaded: false,
|
||||
image_type: ImageType::Blocky { num_blocks: 0 },
|
||||
vfile: VirtualFile::new(&path),
|
||||
}),
|
||||
}
|
||||
}
|
||||
@@ -483,6 +496,7 @@ impl ImageLayer {
|
||||
inner: Mutex::new(ImageLayerInner {
|
||||
loaded: false,
|
||||
image_type: ImageType::Blocky { num_blocks: 0 },
|
||||
vfile: VirtualFile::new(path),
|
||||
}),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
//! are held in a BTreeMap, and there's another BTreeMap to track the size of the relation.
|
||||
//!
|
||||
use crate::layered_repository::filename::DeltaFileName;
|
||||
use crate::layered_repository::global_layer_map::GLOBAL_OPEN_MEM_USAGE;
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag, RELISH_SEG_SIZE,
|
||||
};
|
||||
@@ -17,9 +16,7 @@ use anyhow::{bail, ensure, Result};
|
||||
use bytes::Bytes;
|
||||
use log::*;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use zenith_utils::batch_fsync::BatchFsync;
|
||||
use zenith_utils::vec_map::VecMap;
|
||||
|
||||
use zenith_utils::lsn::Lsn;
|
||||
@@ -72,15 +69,6 @@ pub struct InMemoryLayerInner {
|
||||
/// a non-blocky rel, 'segsizes' is not used and is always empty.
|
||||
///
|
||||
segsizes: VecMap<Lsn, u32>,
|
||||
|
||||
/// Approximate amount of memory used by this layer.
|
||||
///
|
||||
/// TODO: This is currently a very crude metric, we don't take into account allocator
|
||||
/// overhead, memory fragmentation, memory used by the VecMaps, nor many other things.
|
||||
/// Just the actual # of bytes of a page image (8 kB) or the size of a WAL record.
|
||||
///
|
||||
/// Whenever this is changed, you must also modify GLOBAL_OPEN_MEM_USAGE accordingly!
|
||||
mem_usage: usize,
|
||||
}
|
||||
|
||||
impl InMemoryLayerInner {
|
||||
@@ -101,16 +89,12 @@ impl InMemoryLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for InMemoryLayerInner {
|
||||
fn drop(&mut self) {
|
||||
if self.mem_usage > 0 {
|
||||
GLOBAL_OPEN_MEM_USAGE.fetch_sub(self.mem_usage, Ordering::Relaxed);
|
||||
self.mem_usage = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Layer for InMemoryLayer {
|
||||
|
||||
fn upgrade_to_inmemory_layer(&self) -> Option<&InMemoryLayer> {
|
||||
Some(self)
|
||||
}
|
||||
|
||||
// An in-memory layer doesn't really have a filename as it's not stored on disk,
|
||||
// but we construct a filename as if it was a delta layer
|
||||
fn filename(&self) -> PathBuf {
|
||||
@@ -306,6 +290,12 @@ pub struct LayersOnDisk {
|
||||
pub image_layers: Vec<ImageLayer>,
|
||||
}
|
||||
|
||||
impl LayersOnDisk {
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.delta_layers.is_empty() && self.image_layers.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
impl InMemoryLayer {
|
||||
/// Return the oldest page version that's stored in this layer
|
||||
pub fn get_oldest_pending_lsn(&self) -> Lsn {
|
||||
@@ -349,7 +339,6 @@ impl InMemoryLayer {
|
||||
dropped: false,
|
||||
page_versions: PageVersions::default(),
|
||||
segsizes,
|
||||
mem_usage: 0,
|
||||
}),
|
||||
})
|
||||
}
|
||||
@@ -396,13 +385,6 @@ impl InMemoryLayer {
|
||||
|
||||
inner.assert_writeable();
|
||||
|
||||
let mut mem_usage = 0;
|
||||
if let Some(img) = &pv.page_image {
|
||||
mem_usage += img.len();
|
||||
} else if let Some(rec) = &pv.record {
|
||||
mem_usage += rec.rec.len();
|
||||
}
|
||||
|
||||
let old = inner.page_versions.append_or_update_last(blknum, lsn, pv);
|
||||
|
||||
if old.is_some() {
|
||||
@@ -413,9 +395,6 @@ impl InMemoryLayer {
|
||||
);
|
||||
}
|
||||
|
||||
inner.mem_usage += mem_usage;
|
||||
GLOBAL_OPEN_MEM_USAGE.fetch_add(mem_usage, Ordering::Relaxed);
|
||||
|
||||
// Also update the relation size, if this extended the relation.
|
||||
if self.seg.rel.is_blocky() {
|
||||
let newsize = blknum - self.seg.segno * RELISH_SEG_SIZE + 1;
|
||||
@@ -551,7 +530,6 @@ impl InMemoryLayer {
|
||||
dropped: false,
|
||||
page_versions: PageVersions::default(),
|
||||
segsizes,
|
||||
mem_usage: 0,
|
||||
}),
|
||||
})
|
||||
}
|
||||
@@ -581,16 +559,6 @@ impl InMemoryLayer {
|
||||
for (_blk, lsn, _pv) in inner.page_versions.ordered_page_version_iter(None) {
|
||||
assert!(lsn <= end_lsn);
|
||||
}
|
||||
|
||||
// It's a bit premature to subtract the global mem usage here already.
|
||||
// This layer consumes memory until it's written out to disk and dropped.
|
||||
// But GLOBAL_OPEN_MEM_USAGE is used to trigger layer eviction, if there are
|
||||
// too many open layers, and from that point of view this should no longer be
|
||||
// counted against the global mem usage.
|
||||
if inner.mem_usage > 0 {
|
||||
GLOBAL_OPEN_MEM_USAGE.fetch_sub(inner.mem_usage, Ordering::Relaxed);
|
||||
inner.mem_usage = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -602,11 +570,7 @@ impl InMemoryLayer {
|
||||
/// WAL records between start and end LSN. (The delta layer is not needed
|
||||
/// when a new relish is created with a single LSN, so that the start and
|
||||
/// end LSN are the same.)
|
||||
pub fn write_to_disk(
|
||||
&self,
|
||||
timeline: &LayeredTimeline,
|
||||
batch_fsync: &mut BatchFsync,
|
||||
) -> Result<LayersOnDisk> {
|
||||
pub fn write_to_disk(&self, timeline: &LayeredTimeline) -> Result<LayersOnDisk> {
|
||||
trace!(
|
||||
"write_to_disk {} get_end_lsn is {}",
|
||||
self.filename().display(),
|
||||
@@ -636,7 +600,6 @@ impl InMemoryLayer {
|
||||
true,
|
||||
inner.page_versions.ordered_page_version_iter(None),
|
||||
inner.segsizes.clone(),
|
||||
batch_fsync,
|
||||
)?;
|
||||
trace!(
|
||||
"freeze: created delta layer for dropped segment {} {}-{}",
|
||||
@@ -674,7 +637,6 @@ impl InMemoryLayer {
|
||||
false,
|
||||
page_versions,
|
||||
segsizes,
|
||||
batch_fsync,
|
||||
)?;
|
||||
delta_layers.push(delta_layer);
|
||||
trace!(
|
||||
@@ -691,7 +653,7 @@ impl InMemoryLayer {
|
||||
|
||||
// Write a new base image layer at the cutoff point
|
||||
let image_layer =
|
||||
ImageLayer::create_from_src(self.conf, timeline, self, end_lsn_inclusive, batch_fsync)?;
|
||||
ImageLayer::create_from_src(self.conf, timeline, self, end_lsn_inclusive)?;
|
||||
trace!(
|
||||
"freeze: created image layer {} at {}",
|
||||
self.seg,
|
||||
|
||||
@@ -41,23 +41,22 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt::Debug;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub struct IntervalTree<I: ?Sized>
|
||||
pub struct IntervalTree<I>
|
||||
where
|
||||
I: IntervalItem,
|
||||
{
|
||||
points: BTreeMap<I::Key, Point<I>>,
|
||||
}
|
||||
|
||||
struct Point<I: ?Sized> {
|
||||
struct Point<I> {
|
||||
/// All intervals that contain this point, in no particular order.
|
||||
///
|
||||
/// We assume that there aren't a lot of overlapping intervals, so that this vector
|
||||
/// never grows very large. If that assumption doesn't hold, we could keep this ordered
|
||||
/// by the end bound, to speed up `search`. But as long as there are only a few elements,
|
||||
/// a linear search is OK.
|
||||
elements: Vec<Arc<I>>,
|
||||
elements: Vec<I>,
|
||||
}
|
||||
|
||||
/// Abstraction for an interval that can be stored in the tree
|
||||
@@ -75,14 +74,14 @@ pub trait IntervalItem {
|
||||
}
|
||||
}
|
||||
|
||||
impl<I: ?Sized> IntervalTree<I>
|
||||
impl<I> IntervalTree<I>
|
||||
where
|
||||
I: IntervalItem,
|
||||
I: IntervalItem + PartialEq + Clone,
|
||||
{
|
||||
/// Return an element that contains 'key', or precedes it.
|
||||
///
|
||||
/// If there are multiple candidates, returns the one with the highest 'end' key.
|
||||
pub fn search(&self, key: I::Key) -> Option<Arc<I>> {
|
||||
pub fn search(&self, key: I::Key) -> Option<&I> {
|
||||
// Find the greatest point that precedes or is equal to the search key. If there is
|
||||
// none, returns None.
|
||||
let (_, p) = self.points.range(..=key).next_back()?;
|
||||
@@ -100,7 +99,7 @@ where
|
||||
}
|
||||
})
|
||||
.unwrap();
|
||||
Some(Arc::clone(highest_item))
|
||||
Some(highest_item)
|
||||
}
|
||||
|
||||
/// Iterate over all items with start bound >= 'key'
|
||||
@@ -119,7 +118,7 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, item: Arc<I>) {
|
||||
pub fn insert(&mut self, item: &I) {
|
||||
let start_key = item.start_key();
|
||||
let end_key = item.end_key();
|
||||
assert!(start_key < end_key);
|
||||
@@ -133,18 +132,18 @@ where
|
||||
found_start_point = true;
|
||||
// It is an error to insert the same item to the tree twice.
|
||||
assert!(
|
||||
!point.elements.iter().any(|x| Arc::ptr_eq(x, &item)),
|
||||
!point.elements.iter().any(|x| x == item),
|
||||
"interval is already in the tree"
|
||||
);
|
||||
}
|
||||
point.elements.push(Arc::clone(&item));
|
||||
point.elements.push(item.clone());
|
||||
}
|
||||
if !found_start_point {
|
||||
// Create a new Point for the starting point
|
||||
|
||||
// Look at the previous point, and copy over elements that overlap with this
|
||||
// new point
|
||||
let mut new_elements: Vec<Arc<I>> = Vec::new();
|
||||
let mut new_elements: Vec<I> = Vec::new();
|
||||
if let Some((_, prev_point)) = self.points.range(..start_key).next_back() {
|
||||
let overlapping_prev_elements = prev_point
|
||||
.elements
|
||||
@@ -154,7 +153,7 @@ where
|
||||
|
||||
new_elements.extend(overlapping_prev_elements);
|
||||
}
|
||||
new_elements.push(item);
|
||||
new_elements.push(item.clone());
|
||||
|
||||
let new_point = Point {
|
||||
elements: new_elements,
|
||||
@@ -163,7 +162,7 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remove(&mut self, item: &Arc<I>) {
|
||||
pub fn remove(&mut self, item: &I) {
|
||||
// range search points
|
||||
let start_key = item.start_key();
|
||||
let end_key = item.end_key();
|
||||
@@ -176,7 +175,7 @@ where
|
||||
found_start_point = true;
|
||||
}
|
||||
let len_before = point.elements.len();
|
||||
point.elements.retain(|other| !Arc::ptr_eq(other, item));
|
||||
point.elements.retain(|other| other != item);
|
||||
let len_after = point.elements.len();
|
||||
assert_eq!(len_after + 1, len_before);
|
||||
if len_after == 0 {
|
||||
@@ -191,19 +190,19 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
pub struct IntervalIter<'a, I: ?Sized>
|
||||
pub struct IntervalIter<'a, I>
|
||||
where
|
||||
I: IntervalItem,
|
||||
{
|
||||
point_iter: std::collections::btree_map::Range<'a, I::Key, Point<I>>,
|
||||
elem_iter: Option<(I::Key, std::slice::Iter<'a, Arc<I>>)>,
|
||||
elem_iter: Option<(I::Key, std::slice::Iter<'a, I>)>,
|
||||
}
|
||||
|
||||
impl<'a, I> Iterator for IntervalIter<'a, I>
|
||||
where
|
||||
I: IntervalItem + ?Sized,
|
||||
I: IntervalItem,
|
||||
{
|
||||
type Item = Arc<I>;
|
||||
type Item = &'a I;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
// Iterate over all elements in all the points in 'point_iter'. To avoid
|
||||
@@ -214,7 +213,7 @@ where
|
||||
if let Some((point_key, elem_iter)) = &mut self.elem_iter {
|
||||
for elem in elem_iter {
|
||||
if elem.start_key() == *point_key {
|
||||
return Some(Arc::clone(elem));
|
||||
return Some(elem);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -230,7 +229,7 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
impl<I: ?Sized> Default for IntervalTree<I>
|
||||
impl<I> Default for IntervalTree<I>
|
||||
where
|
||||
I: IntervalItem,
|
||||
{
|
||||
@@ -246,7 +245,7 @@ mod tests {
|
||||
use super::*;
|
||||
use std::fmt;
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
struct MockItem {
|
||||
start_key: u32,
|
||||
end_key: u32,
|
||||
@@ -288,7 +287,7 @@ mod tests {
|
||||
tree: &IntervalTree<MockItem>,
|
||||
key: u32,
|
||||
expected: &[&str],
|
||||
) -> Option<Arc<MockItem>> {
|
||||
) -> Option<MockItem> {
|
||||
if let Some(v) = tree.search(key) {
|
||||
let vstr = v.to_string();
|
||||
|
||||
@@ -299,7 +298,7 @@ mod tests {
|
||||
key, v, expected,
|
||||
);
|
||||
|
||||
Some(v)
|
||||
Some(v.clone())
|
||||
} else {
|
||||
assert!(
|
||||
expected.is_empty(),
|
||||
@@ -331,12 +330,12 @@ mod tests {
|
||||
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
|
||||
|
||||
// Simple, non-overlapping ranges.
|
||||
tree.insert(Arc::new(MockItem::new(10, 11)));
|
||||
tree.insert(Arc::new(MockItem::new(11, 12)));
|
||||
tree.insert(Arc::new(MockItem::new(12, 13)));
|
||||
tree.insert(Arc::new(MockItem::new(18, 19)));
|
||||
tree.insert(Arc::new(MockItem::new(17, 18)));
|
||||
tree.insert(Arc::new(MockItem::new(15, 16)));
|
||||
tree.insert(&MockItem::new(10, 11));
|
||||
tree.insert(&MockItem::new(11, 12));
|
||||
tree.insert(&MockItem::new(12, 13));
|
||||
tree.insert(&MockItem::new(18, 19));
|
||||
tree.insert(&MockItem::new(17, 18));
|
||||
tree.insert(&MockItem::new(15, 16));
|
||||
|
||||
assert_search(&tree, 9, &[]);
|
||||
assert_search(&tree, 10, &["10-11"]);
|
||||
@@ -370,13 +369,13 @@ mod tests {
|
||||
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
|
||||
|
||||
// Overlapping items
|
||||
tree.insert(Arc::new(MockItem::new(22, 24)));
|
||||
tree.insert(Arc::new(MockItem::new(23, 25)));
|
||||
let x24_26 = Arc::new(MockItem::new(24, 26));
|
||||
tree.insert(Arc::clone(&x24_26));
|
||||
let x26_28 = Arc::new(MockItem::new(26, 28));
|
||||
tree.insert(Arc::clone(&x26_28));
|
||||
tree.insert(Arc::new(MockItem::new(25, 27)));
|
||||
tree.insert(&MockItem::new(22, 24));
|
||||
tree.insert(&MockItem::new(23, 25));
|
||||
let x24_26 = MockItem::new(24, 26);
|
||||
tree.insert(&x24_26);
|
||||
let x26_28 = MockItem::new(26, 28);
|
||||
tree.insert(&x26_28);
|
||||
tree.insert(&MockItem::new(25, 27));
|
||||
|
||||
assert_search(&tree, 22, &["22-24"]);
|
||||
assert_search(&tree, 23, &["22-24", "23-25"]);
|
||||
@@ -403,10 +402,10 @@ mod tests {
|
||||
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
|
||||
|
||||
// Items containing other items
|
||||
tree.insert(Arc::new(MockItem::new(31, 39)));
|
||||
tree.insert(Arc::new(MockItem::new(32, 34)));
|
||||
tree.insert(Arc::new(MockItem::new(33, 35)));
|
||||
tree.insert(Arc::new(MockItem::new(30, 40)));
|
||||
tree.insert(&MockItem::new(31, 39));
|
||||
tree.insert(&MockItem::new(32, 34));
|
||||
tree.insert(&MockItem::new(33, 35));
|
||||
tree.insert(&MockItem::new(30, 40));
|
||||
|
||||
assert_search(&tree, 30, &["30-40"]);
|
||||
assert_search(&tree, 31, &["30-40", "31-39"]);
|
||||
@@ -427,16 +426,16 @@ mod tests {
|
||||
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
|
||||
|
||||
// Duplicate keys
|
||||
let item_a = Arc::new(MockItem::new_str(55, 56, "a"));
|
||||
tree.insert(Arc::clone(&item_a));
|
||||
let item_b = Arc::new(MockItem::new_str(55, 56, "b"));
|
||||
tree.insert(Arc::clone(&item_b));
|
||||
let item_c = Arc::new(MockItem::new_str(55, 56, "c"));
|
||||
tree.insert(Arc::clone(&item_c));
|
||||
let item_d = Arc::new(MockItem::new_str(54, 56, "d"));
|
||||
tree.insert(Arc::clone(&item_d));
|
||||
let item_e = Arc::new(MockItem::new_str(55, 57, "e"));
|
||||
tree.insert(Arc::clone(&item_e));
|
||||
let item_a = MockItem::new_str(55, 56, "a");
|
||||
tree.insert(&item_a);
|
||||
let item_b = MockItem::new_str(55, 56, "b");
|
||||
tree.insert(&item_b);
|
||||
let item_c = MockItem::new_str(55, 56, "c");
|
||||
tree.insert(&item_c);
|
||||
let item_d = MockItem::new_str(54, 56, "d");
|
||||
tree.insert(&item_d);
|
||||
let item_e = MockItem::new_str(55, 57, "e");
|
||||
tree.insert(&item_e);
|
||||
|
||||
dump_tree(&tree);
|
||||
|
||||
@@ -461,8 +460,8 @@ mod tests {
|
||||
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
|
||||
|
||||
// Inserting the same item twice is not cool
|
||||
let item = Arc::new(MockItem::new(1, 2));
|
||||
tree.insert(Arc::clone(&item));
|
||||
tree.insert(Arc::clone(&item)); // fails assertion
|
||||
let item = MockItem::new(1, 2);
|
||||
tree.insert(&item);
|
||||
tree.insert(&item); // fails assertion
|
||||
}
|
||||
}
|
||||
|
||||
129
pageserver/src/layered_repository/jobs.rs
Normal file
129
pageserver/src/layered_repository/jobs.rs
Normal file
@@ -0,0 +1,129 @@
|
||||
use crate::tenant_mgr;
|
||||
use crate::layered_repository::layer_map;
|
||||
use crate::PageServerConf;
|
||||
|
||||
use anyhow::Result;
|
||||
use lazy_static::lazy_static;
|
||||
use tracing::*;
|
||||
|
||||
use std::collections::VecDeque;
|
||||
use std::sync::Mutex;
|
||||
use std::thread::JoinHandle;
|
||||
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
// Global queue of pending background jobs. `schedule_job` pushes onto it and
// `global_job_loop` pops from it; all access goes through the mutex.
lazy_static! {
|
||||
static ref JOB_QUEUE: Mutex<GlobalJobQueue> = Mutex::new(GlobalJobQueue::default());
|
||||
}
|
||||
|
||||
|
||||
/// The state protected by `JOB_QUEUE`.
#[derive(Default)]
|
||||
struct GlobalJobQueue {
|
||||
// Pending jobs in FIFO order: producers `push_back`, the job thread `pop_front`s.
jobs: VecDeque<GlobalJob>,
|
||||
}
|
||||
|
||||
/// A unit of background work that can be handed to `schedule_job` and is later
/// executed by the global job thread.
pub enum GlobalJob {
|
||||
// To release memory: freeze and write out in-memory layers picked by
// `layer_map::find_victim`.
EvictSomeLayer,
|
||||
|
||||
// To advance 'disk_consistent_lsn': runs `checkpoint_scheduled()` on the
// given tenant's timeline.
CheckpointTimeline(ZTenantId, ZTimelineId),
|
||||
|
||||
// To free up disk space: runs `gc_scheduled()` on the given tenant's repository.
GarbageCollect(ZTenantId),
|
||||
}
|
||||
|
||||
pub fn schedule_job(job: GlobalJob) {
|
||||
let mut queue = JOB_QUEUE.lock().unwrap();
|
||||
queue.jobs.push_back(job);
|
||||
}
|
||||
|
||||
///
|
||||
/// Launch the global job handler thread
|
||||
///
|
||||
/// TODO: This ought to be a pool of threads
|
||||
///
|
||||
pub fn launch_global_job_thread(conf: &'static PageServerConf) -> JoinHandle<()> {
|
||||
std::thread::Builder::new()
|
||||
.name("Global Job thread".into())
|
||||
.spawn(move || {
|
||||
// FIXME: relaunch it? Panic is not good.
|
||||
|
||||
global_job_loop(conf).expect("Global job thread died");
|
||||
})
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
pub fn global_job_loop(conf: &'static PageServerConf) -> Result<()> {
|
||||
while !tenant_mgr::shutdown_requested() {
|
||||
std::thread::sleep(conf.checkpoint_period);
|
||||
info!("global job thread waking up");
|
||||
|
||||
let mut queue = JOB_QUEUE.lock().unwrap();
|
||||
while let Some(job) = queue.jobs.pop_front() {
|
||||
drop(queue);
|
||||
|
||||
let result = match job {
|
||||
GlobalJob::EvictSomeLayer => {
|
||||
evict_layer()
|
||||
},
|
||||
GlobalJob::CheckpointTimeline(tenantid, timelineid) => {
|
||||
checkpoint_timeline(tenantid, timelineid)
|
||||
}
|
||||
GlobalJob::GarbageCollect(tenantid) => {
|
||||
gc_tenant(tenantid)
|
||||
}
|
||||
};
|
||||
|
||||
if let Err(err) = result {
|
||||
error!("job ended in error: {:#}", err);
|
||||
}
|
||||
|
||||
queue = JOB_QUEUE.lock().unwrap();
|
||||
}
|
||||
}
|
||||
trace!("Checkpointer thread shut down");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Freeze and write out an in-memory layer
|
||||
fn evict_layer() -> Result<()>
|
||||
{
|
||||
// Pick a victim
|
||||
while let Some(layer_id) = layer_map::find_victim() {
|
||||
let victim_layer = match layer_map::get_layer(layer_id) {
|
||||
Some(l) => l,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
let tenantid = victim_layer.get_tenant_id();
|
||||
let timelineid = victim_layer.get_timeline_id();
|
||||
|
||||
let _entered = info_span!("global evict", timeline = %timelineid, tenant = %tenantid)
|
||||
.entered();
|
||||
|
||||
info!("evicting {}", victim_layer.filename().display());
|
||||
|
||||
drop(victim_layer);
|
||||
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
|
||||
|
||||
timeline.upgrade_to_layered_timeline().evict_layer(layer_id)?
|
||||
}
|
||||
info!("no more eviction needed");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn checkpoint_timeline(tenantid: ZTenantId, timelineid: ZTimelineId) -> Result<()> {
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
|
||||
|
||||
timeline.checkpoint_scheduled()
|
||||
}
|
||||
|
||||
fn gc_tenant(tenantid: ZTenantId) -> Result<()> {
|
||||
let tenant = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
|
||||
tenant.gc_scheduled()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -9,6 +9,23 @@
|
||||
//! new image and delta layers and corresponding files are written to disk.
|
||||
//!
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Global layer registry:
|
||||
//
|
||||
// Every layer is inserted into the global registry, and assigned an ID
|
||||
//
|
||||
// The global registry tracks memory usage and usage count for each layer
|
||||
//
|
||||
//
|
||||
// In addition to that, there is a per-timeline LayerMap, used for lookups
|
||||
//
|
||||
//
|
||||
|
||||
|
||||
|
||||
use crate::layered_repository::{schedule_job, GlobalJob};
|
||||
use crate::layered_repository::interval_tree::{IntervalItem, IntervalIter, IntervalTree};
|
||||
use crate::layered_repository::storage_layer::{Layer, SegmentTag};
|
||||
use crate::layered_repository::InMemoryLayer;
|
||||
@@ -17,12 +34,11 @@ use anyhow::Result;
|
||||
use lazy_static::lazy_static;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
use std::sync::Arc;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use tracing::*;
|
||||
use zenith_metrics::{register_int_gauge, IntGauge};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use super::global_layer_map::{LayerId, GLOBAL_LAYER_MAP};
|
||||
|
||||
lazy_static! {
|
||||
static ref NUM_INMEMORY_LAYERS: IntGauge =
|
||||
register_int_gauge!("pageserver_inmemory_layers", "Number of layers in memory")
|
||||
@@ -30,6 +46,176 @@ lazy_static! {
|
||||
static ref NUM_ONDISK_LAYERS: IntGauge =
|
||||
register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk")
|
||||
.expect("failed to define a metric");
|
||||
|
||||
// Global layer map
|
||||
static ref LAYERS: Mutex<GlobalLayerMap> = Mutex::new(GlobalLayerMap::new());
|
||||
}
|
||||
|
||||
// Once the number of open (in-memory) layers in the global registry reaches
// this, `insert_open` schedules an eviction job and `find_victim` starts
// handing out victims.
const MAX_OPEN_LAYERS: usize = 10;
|
||||
|
||||
// NOTE(review): not referenced in the code visible here; presumably intended
// as a limit on loaded historic layers -- confirm.
const MAX_LOADED_LAYERS: usize = 100;
|
||||
|
||||
// One slot in the global layer registry.
struct GlobalLayerEntry {
|
||||
// Copied into the `LayerId` handed out for this slot.
tag: u64, // to fix ABA problem
|
||||
// The layer itself; `remove` takes it out and leaves `None` in the slot.
layer: Option<Arc<dyn Layer>>,
|
||||
// Recent-use counter for the clock algorithm: starts at 1 on insert, is
// bumped by `get` (up to 5) and decremented by `find_victim`.
usage_count: u32,
|
||||
}
|
||||
|
||||
// Global registry of all layers, addressed by `LayerId`.
//
// Open (in-memory) layers get negative ids: id `-n` lives in `open_layers[n - 1]`.
// Historic (on-disk) layers get positive ids: id `n` lives in `historic_layers[n - 1]`.
struct GlobalLayerMap {
|
||||
// Slots for open layers; new slots are appended by `insert_open`.
open_layers: Vec<GlobalLayerEntry>,
|
||||
// Current position of the clock hand in `open_layers`, used by `find_victim`.
clock_arm: u32,
|
||||
|
||||
// Number of open layers currently present: incremented by `insert_open`,
// decremented when `remove` takes an open layer out of its slot.
num_open_layers: usize,
|
||||
// Set when an `EvictSomeLayer` job has been scheduled, so that only one is
// queued at a time.
eviction_scheduled: bool,
|
||||
|
||||
// Slots for historic layers; new slots are appended by `insert_historic`.
historic_layers: Vec<GlobalLayerEntry>,
|
||||
}
|
||||
|
||||
impl GlobalLayerMap {
|
||||
pub fn new() -> GlobalLayerMap {
|
||||
GlobalLayerMap {
|
||||
open_layers: Vec::new(),
|
||||
clock_arm: 0,
|
||||
historic_layers: Vec::new(),
|
||||
eviction_scheduled: false,
|
||||
num_open_layers: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&mut self, layer_id: LayerId) -> Option<Arc<dyn Layer>> {
|
||||
let e = if layer_id.is_historic() {
|
||||
let idx = (layer_id.index - 1) as usize;
|
||||
&mut self.historic_layers[idx]
|
||||
} else {
|
||||
let idx = ((-layer_id.index) - 1) as usize;
|
||||
&mut self.open_layers[idx]
|
||||
};
|
||||
if e.usage_count < 5 {
|
||||
e.usage_count += 1;
|
||||
}
|
||||
|
||||
e.layer.clone()
|
||||
}
|
||||
|
||||
pub fn insert_open(&mut self, layer: Arc<InMemoryLayer>) -> LayerId {
|
||||
let index = -(self.open_layers.len() as isize + 1);
|
||||
|
||||
let entry = GlobalLayerEntry {
|
||||
layer: Some(layer),
|
||||
usage_count: 1,
|
||||
tag: 1,
|
||||
};
|
||||
let tag = entry.tag;
|
||||
self.open_layers.push(entry);
|
||||
self.num_open_layers += 1;
|
||||
|
||||
NUM_INMEMORY_LAYERS.inc();
|
||||
|
||||
if !self.eviction_scheduled && self.num_open_layers >= MAX_OPEN_LAYERS {
|
||||
info!("scheduling global eviction");
|
||||
schedule_job(GlobalJob::EvictSomeLayer);
|
||||
self.eviction_scheduled = true;
|
||||
}
|
||||
|
||||
LayerId {
|
||||
index,
|
||||
tag,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) -> LayerId {
|
||||
let index = self.historic_layers.len() as isize + 1;
|
||||
|
||||
let entry = GlobalLayerEntry {
|
||||
layer: Some(layer),
|
||||
usage_count: 1,
|
||||
tag: 1,
|
||||
};
|
||||
let tag = entry.tag;
|
||||
self.historic_layers.push(entry);
|
||||
|
||||
NUM_ONDISK_LAYERS.inc();
|
||||
|
||||
LayerId {
|
||||
index,
|
||||
tag,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remove(&mut self, layer_id: LayerId) -> Option<Arc<dyn Layer>> {
|
||||
let old_layer;
|
||||
|
||||
if layer_id.is_historic() {
|
||||
let idx = (layer_id.index - 1) as usize;
|
||||
old_layer = self.historic_layers[idx].layer.take();
|
||||
if old_layer.is_some() {
|
||||
NUM_ONDISK_LAYERS.dec();
|
||||
}
|
||||
} else {
|
||||
let idx = ((-layer_id.index) - 1) as usize;
|
||||
old_layer = self.open_layers[idx].layer.take();
|
||||
if old_layer.is_some() {
|
||||
NUM_INMEMORY_LAYERS.dec();
|
||||
self.num_open_layers -= 1;
|
||||
}
|
||||
}
|
||||
old_layer
|
||||
}
|
||||
|
||||
pub fn find_victim(&mut self) -> Option<LayerId> {
|
||||
// run the clock algorithm among all open layers
|
||||
for _ in 0..self.open_layers.len() * 5 {
|
||||
self.clock_arm += 1;
|
||||
if self.clock_arm >= self.open_layers.len() as u32 {
|
||||
self.clock_arm = 0;
|
||||
}
|
||||
let next = self.clock_arm as usize;
|
||||
|
||||
if self.open_layers[next].usage_count == 0 {
|
||||
return Some(LayerId {
|
||||
index: -((next + 1) as isize),
|
||||
tag: self.open_layers[next].tag,
|
||||
});
|
||||
} else {
|
||||
self.open_layers[next].usage_count -= 1;
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn find_victim() -> Option<LayerId> {
|
||||
let mut l = LAYERS.lock().unwrap();
|
||||
|
||||
if l.num_open_layers >= MAX_OPEN_LAYERS {
|
||||
if let Some(x) = l.find_victim() {
|
||||
info!("found victim out of {} open layers", l.num_open_layers);
|
||||
Some(x)
|
||||
} else {
|
||||
info!("no victim found at {} open layers", l.num_open_layers);
|
||||
None
|
||||
}
|
||||
} else {
|
||||
info!("no victim needed at {} open layers", l.num_open_layers);
|
||||
l.eviction_scheduled = false;
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_layer(layer_id: LayerId) -> Option<Arc<dyn Layer>> {
|
||||
LAYERS.lock().unwrap().get(layer_id)
|
||||
}
|
||||
|
||||
/// Handle to a layer in the global layer map.
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct LayerId {
    /// Slot number; the sign tells which kind of layer this is.
    /// Positive values denote historic layers, all others open layers.
    index: isize,
    /// Tag of the slot at the time the id was handed out.
    tag: u64,
}

impl LayerId {
    /// Does this id refer to a historic layer, i.e. is its index positive?
    pub fn is_historic(&self) -> bool {
        self.index.is_positive()
    }
}
|
||||
|
||||
///
|
||||
@@ -43,7 +229,7 @@ pub struct LayerMap {
|
||||
/// All in-memory layers, ordered by 'oldest_pending_lsn' and generation
|
||||
/// of each layer. This allows easy access to the in-memory layer that
|
||||
/// contains the oldest WAL record.
|
||||
open_layers: BinaryHeap<OpenLayerEntry>,
|
||||
open_layers: BinaryHeap<OpenLayerHeapEntry>,
|
||||
|
||||
/// Generation number, used to distinguish newly inserted entries in the
|
||||
/// binary heap from older entries during checkpoint.
|
||||
@@ -63,25 +249,32 @@ impl LayerMap {
|
||||
segentry.get(lsn)
|
||||
}
|
||||
|
||||
pub fn get_with_id(&self, layer_id: LayerId) -> Option<Arc<dyn Layer>> {
|
||||
// TODO: check that it belongs to this tenant+timeline
|
||||
LAYERS.lock().unwrap().get(layer_id)
|
||||
}
|
||||
|
||||
///
|
||||
/// Get the open layer for given segment for writing. Or None if no open
|
||||
/// layer exists.
|
||||
///
|
||||
pub fn get_open(&self, tag: &SegmentTag) -> Option<Arc<InMemoryLayer>> {
|
||||
pub fn get_open(&self, tag: &SegmentTag) -> Option<Arc<dyn Layer>> {
|
||||
let segentry = self.segs.get(tag)?;
|
||||
|
||||
segentry
|
||||
.open_layer_id
|
||||
.and_then(|layer_id| GLOBAL_LAYER_MAP.read().unwrap().get(&layer_id))
|
||||
let (layer_id, _start_lsn) = segentry.open?;
|
||||
LAYERS.lock().unwrap().get(layer_id)
|
||||
}
|
||||
|
||||
///
|
||||
/// Insert an open in-memory layer
|
||||
///
|
||||
pub fn insert_open(&mut self, layer: Arc<InMemoryLayer>) {
|
||||
|
||||
let layer_id = LAYERS.lock().unwrap().insert_open(Arc::clone(&layer));
|
||||
|
||||
let segentry = self.segs.entry(layer.get_seg_tag()).or_default();
|
||||
|
||||
let layer_id = segentry.update_open(Arc::clone(&layer));
|
||||
segentry.update_open(layer_id, layer.get_start_lsn());
|
||||
|
||||
let oldest_pending_lsn = layer.get_oldest_pending_lsn();
|
||||
|
||||
@@ -91,69 +284,52 @@ impl LayerMap {
|
||||
assert!(oldest_pending_lsn.is_aligned());
|
||||
|
||||
// Also add it to the binary heap
|
||||
let open_layer_entry = OpenLayerEntry {
|
||||
let open_layer_entry = OpenLayerHeapEntry {
|
||||
oldest_pending_lsn: layer.get_oldest_pending_lsn(),
|
||||
layer_id,
|
||||
generation: self.current_generation,
|
||||
};
|
||||
self.open_layers.push(open_layer_entry);
|
||||
|
||||
NUM_INMEMORY_LAYERS.inc();
|
||||
}
|
||||
|
||||
/// Remove an open in-memory layer
|
||||
pub fn remove_open(&mut self, layer_id: LayerId) {
|
||||
// Note: we don't try to remove the entry from the binary heap.
|
||||
// It will be removed lazily by peek_oldest_open() when it's made it to
|
||||
// the top of the heap.
|
||||
/// Remove a layer
|
||||
pub fn remove(&mut self, layer_id: LayerId) {
|
||||
if let Some(layer) = LAYERS.lock().unwrap().remove(layer_id) {
|
||||
// Also remove it from the SegEntry of this segment
|
||||
if layer_id.is_historic() {
|
||||
let tag = layer.get_seg_tag();
|
||||
|
||||
let layer_opt = {
|
||||
let mut global_map = GLOBAL_LAYER_MAP.write().unwrap();
|
||||
let layer_opt = global_map.get(&layer_id);
|
||||
global_map.remove(&layer_id);
|
||||
// TODO it's bad that a ref can still exist after being evicted from cache
|
||||
layer_opt
|
||||
};
|
||||
|
||||
if let Some(layer) = layer_opt {
|
||||
let mut segentry = self.segs.get_mut(&layer.get_seg_tag()).unwrap();
|
||||
|
||||
if segentry.open_layer_id == Some(layer_id) {
|
||||
// Also remove it from the SegEntry of this segment
|
||||
segentry.open_layer_id = None;
|
||||
if let Some(segentry) = self.segs.get_mut(&tag) {
|
||||
segentry.historic.remove(&HistoricLayerIntervalTreeEntry::new(layer_id, layer));
|
||||
}
|
||||
} else {
|
||||
// We could have already updated segentry.open for
|
||||
// dropped (non-writeable) layer. This is fine.
|
||||
assert!(!layer.is_writeable());
|
||||
assert!(layer.is_dropped());
|
||||
let segtag = layer.get_seg_tag();
|
||||
let mut segentry = self.segs.get_mut(&segtag).unwrap();
|
||||
if let Some(open) = segentry.open {
|
||||
if open.0 == layer_id {
|
||||
segentry.open = None;
|
||||
}
|
||||
} else {
|
||||
// We could have already updated segentry.open for
|
||||
// dropped (non-writeable) layer. This is fine.
|
||||
//assert!(!layer.is_writeable());
|
||||
//assert!(layer.is_dropped());
|
||||
}
|
||||
}
|
||||
|
||||
NUM_INMEMORY_LAYERS.dec();
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Insert an on-disk layer
|
||||
///
|
||||
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) -> LayerId {
|
||||
|
||||
let layer_id = LAYERS.lock().unwrap().insert_historic(Arc::clone(&layer));
|
||||
|
||||
let segentry = self.segs.entry(layer.get_seg_tag()).or_default();
|
||||
segentry.insert_historic(layer);
|
||||
segentry.insert_historic(layer_id, layer);
|
||||
|
||||
NUM_ONDISK_LAYERS.inc();
|
||||
}
|
||||
|
||||
///
|
||||
/// Remove an on-disk layer from the map.
|
||||
///
|
||||
/// This should be called when the corresponding file on disk has been deleted.
|
||||
///
|
||||
pub fn remove_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||
let tag = layer.get_seg_tag();
|
||||
|
||||
if let Some(segentry) = self.segs.get_mut(&tag) {
|
||||
segentry.historic.remove(&layer);
|
||||
}
|
||||
NUM_ONDISK_LAYERS.dec();
|
||||
layer_id
|
||||
}
|
||||
|
||||
// List relations along with a flag that marks if they exist at the given lsn.
|
||||
@@ -214,12 +390,13 @@ impl LayerMap {
|
||||
}
|
||||
|
||||
/// Return the oldest in-memory layer, along with its generation number.
|
||||
pub fn peek_oldest_open(&mut self) -> Option<(LayerId, Arc<InMemoryLayer>, u64)> {
|
||||
let global_map = GLOBAL_LAYER_MAP.read().unwrap();
|
||||
pub fn peek_oldest_open(&mut self) -> Option<(LayerId, Arc<dyn Layer>, u64)> {
|
||||
|
||||
while let Some(oldest_entry) = self.open_layers.peek() {
|
||||
if let Some(layer) = global_map.get(&oldest_entry.layer_id) {
|
||||
return Some((oldest_entry.layer_id, layer, oldest_entry.generation));
|
||||
if let Some(layer) = LAYERS.lock().unwrap().get(oldest_entry.layer_id) {
|
||||
return Some((oldest_entry.layer_id,
|
||||
layer,
|
||||
oldest_entry.generation));
|
||||
} else {
|
||||
self.open_layers.pop();
|
||||
}
|
||||
@@ -242,17 +419,14 @@ impl LayerMap {
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
/// debugging function to print out the contents of the layer map
|
||||
#[allow(unused)]
|
||||
pub fn dump(&self) -> Result<()> {
|
||||
println!("Begin dump LayerMap");
|
||||
for (seg, segentry) in self.segs.iter() {
|
||||
if let Some(open) = &segentry.open_layer_id {
|
||||
if let Some(layer) = GLOBAL_LAYER_MAP.read().unwrap().get(open) {
|
||||
layer.dump()?;
|
||||
} else {
|
||||
println!("layer not found in global map");
|
||||
}
|
||||
if let Some(open) = &segentry.open {
|
||||
open.dump()?;
|
||||
}
|
||||
|
||||
for layer in segentry.historic.iter() {
|
||||
@@ -262,16 +436,39 @@ impl LayerMap {
|
||||
println!("End dump LayerMap");
|
||||
Ok(())
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
impl IntervalItem for dyn Layer {
|
||||
#[derive(Clone)]
|
||||
struct HistoricLayerIntervalTreeEntry {
|
||||
layer_id: LayerId,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl HistoricLayerIntervalTreeEntry {
|
||||
fn new(layer_id: LayerId, layer: Arc<dyn Layer>) -> HistoricLayerIntervalTreeEntry{
|
||||
HistoricLayerIntervalTreeEntry {
|
||||
layer_id,
|
||||
start_lsn: layer.get_start_lsn(),
|
||||
end_lsn: layer.get_end_lsn(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for HistoricLayerIntervalTreeEntry {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.layer_id == other.layer_id
|
||||
}
|
||||
}
|
||||
impl IntervalItem for HistoricLayerIntervalTreeEntry {
|
||||
type Key = Lsn;
|
||||
|
||||
fn start_key(&self) -> Lsn {
|
||||
self.get_start_lsn()
|
||||
self.start_lsn
|
||||
}
|
||||
fn end_key(&self) -> Lsn {
|
||||
self.get_end_lsn()
|
||||
self.end_lsn
|
||||
}
|
||||
}
|
||||
|
||||
@@ -285,8 +482,8 @@ impl IntervalItem for dyn Layer {
|
||||
/// IntervalTree.
|
||||
#[derive(Default)]
|
||||
struct SegEntry {
|
||||
open_layer_id: Option<LayerId>,
|
||||
historic: IntervalTree<dyn Layer>,
|
||||
open: Option<(LayerId, Lsn)>,
|
||||
historic: IntervalTree<HistoricLayerIntervalTreeEntry>,
|
||||
}
|
||||
|
||||
impl SegEntry {
|
||||
@@ -301,14 +498,19 @@ impl SegEntry {
|
||||
}
|
||||
|
||||
pub fn get(&self, lsn: Lsn) -> Option<Arc<dyn Layer>> {
|
||||
if let Some(open_layer_id) = &self.open_layer_id {
|
||||
let open_layer = GLOBAL_LAYER_MAP.read().unwrap().get(open_layer_id)?;
|
||||
if open_layer.get_start_lsn() <= lsn {
|
||||
return Some(open_layer);
|
||||
if let Some(open) = &self.open {
|
||||
if let Some(layer) = LAYERS.lock().unwrap().get(open.0) {
|
||||
if layer.get_start_lsn() <= lsn {
|
||||
return Some(layer);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.historic.search(lsn)
|
||||
if let Some(historic) = self.historic.search(lsn) {
|
||||
Some(LAYERS.lock().unwrap().get(historic.layer_id).unwrap())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn newer_image_layer_exists(&self, lsn: Lsn) -> bool {
|
||||
@@ -317,26 +519,25 @@ impl SegEntry {
|
||||
|
||||
self.historic
|
||||
.iter_newer(lsn)
|
||||
.any(|layer| !layer.is_incremental())
|
||||
.any(|e| {
|
||||
let layer = LAYERS.lock().unwrap().get(e.layer_id).unwrap();
|
||||
!layer.is_incremental()
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
// Set new open layer for a SegEntry.
|
||||
// It's ok to rewrite previous open layer,
|
||||
// but only if it is not writeable anymore.
|
||||
pub fn update_open(&mut self, layer: Arc<InMemoryLayer>) -> LayerId {
|
||||
if let Some(prev_open_layer_id) = &self.open_layer_id {
|
||||
if let Some(prev_open_layer) = GLOBAL_LAYER_MAP.read().unwrap().get(prev_open_layer_id)
|
||||
{
|
||||
assert!(!prev_open_layer.is_writeable());
|
||||
}
|
||||
pub fn update_open(&mut self, layer_id: LayerId, start_lsn: Lsn) {
|
||||
if let Some(_prev_open) = &self.open {
|
||||
//assert!(!prev_open.is_writeable());
|
||||
}
|
||||
let open_layer_id = GLOBAL_LAYER_MAP.write().unwrap().insert(layer);
|
||||
self.open_layer_id = Some(open_layer_id);
|
||||
open_layer_id
|
||||
self.open = Some((layer_id, start_lsn));
|
||||
}
|
||||
|
||||
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||
self.historic.insert(layer);
|
||||
pub fn insert_historic(&mut self, layer_id: LayerId, layer: Arc<dyn Layer>) {
|
||||
self.historic.insert(&HistoricLayerIntervalTreeEntry::new(layer_id, layer));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -346,12 +547,12 @@ impl SegEntry {
|
||||
/// The generation number associated with each entry can be used to distinguish
|
||||
/// recently-added entries (i.e after last call to increment_generation()) from older
|
||||
/// entries with the same 'oldest_pending_lsn'.
|
||||
struct OpenLayerEntry {
|
||||
oldest_pending_lsn: Lsn, // copy of layer.get_oldest_pending_lsn()
|
||||
generation: u64,
|
||||
layer_id: LayerId,
|
||||
struct OpenLayerHeapEntry {
|
||||
pub oldest_pending_lsn: Lsn, // copy of layer.get_oldest_pending_lsn()
|
||||
pub generation: u64,
|
||||
pub layer_id: LayerId,
|
||||
}
|
||||
impl Ord for OpenLayerEntry {
|
||||
impl Ord for OpenLayerHeapEntry {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
|
||||
// to get that. Entries with identical oldest_pending_lsn are ordered by generation
|
||||
@@ -361,32 +562,33 @@ impl Ord for OpenLayerEntry {
|
||||
.then_with(|| other.generation.cmp(&self.generation))
|
||||
}
|
||||
}
|
||||
impl PartialOrd for OpenLayerEntry {
|
||||
impl PartialOrd for OpenLayerHeapEntry {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
impl PartialEq for OpenLayerEntry {
|
||||
impl PartialEq for OpenLayerHeapEntry {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.cmp(other) == Ordering::Equal
|
||||
}
|
||||
}
|
||||
impl Eq for OpenLayerEntry {}
|
||||
impl Eq for OpenLayerHeapEntry {}
|
||||
|
||||
/// Iterator returned by LayerMap::iter_historic_layers()
|
||||
pub struct HistoricLayerIter<'a> {
|
||||
seg_iter: std::collections::hash_map::Iter<'a, SegmentTag, SegEntry>,
|
||||
iter: Option<IntervalIter<'a, dyn Layer>>,
|
||||
iter: Option<IntervalIter<'a, HistoricLayerIntervalTreeEntry>>,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for HistoricLayerIter<'a> {
|
||||
type Item = Arc<dyn Layer>;
|
||||
type Item = (LayerId, Arc<dyn Layer>);
|
||||
|
||||
fn next(&mut self) -> std::option::Option<<Self as std::iter::Iterator>::Item> {
|
||||
loop {
|
||||
if let Some(x) = &mut self.iter {
|
||||
if let Some(x) = x.next() {
|
||||
return Some(Arc::clone(&x));
|
||||
let layer = LAYERS.lock().unwrap().get(x.layer_id).unwrap();
|
||||
return Some((x.layer_id, layer));
|
||||
}
|
||||
}
|
||||
if let Some((_tag, segentry)) = self.seg_iter.next() {
|
||||
@@ -460,7 +662,7 @@ mod tests {
|
||||
let (layer_id, l, generation) = layers.peek_oldest_open().unwrap();
|
||||
assert!(l.get_seg_tag().segno == expected_segno);
|
||||
assert!(generation == expected_generation);
|
||||
layers.remove_open(layer_id);
|
||||
layers.remove(layer_id);
|
||||
};
|
||||
|
||||
assert_pop_layer(0, gen1); // 0x100
|
||||
|
||||
@@ -1,202 +0,0 @@
|
||||
//! Every image of a certain timeline from [`crate::layered_repository::LayeredRepository`]
|
||||
//! has a metadata that needs to be stored persistently.
|
||||
//!
|
||||
//! Later, the file gets is used in [`crate::relish_storage::storage_sync`] as a part of
|
||||
//! external storage import and export operations.
|
||||
//!
|
||||
//! The module contains all structs and related helper methods related to timeline metadata.
|
||||
|
||||
use std::{convert::TryInto, path::PathBuf};
|
||||
|
||||
use anyhow::ensure;
|
||||
use zenith_utils::{
|
||||
bin_ser::BeSer,
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
layered_repository::{METADATA_CHECKSUM_SIZE, METADATA_MAX_DATA_SIZE, METADATA_MAX_SAFE_SIZE},
|
||||
PageServerConf,
|
||||
};
|
||||
|
||||
/// The name of the metadata file pageserver creates per timeline.
|
||||
pub const METADATA_FILE_NAME: &str = "metadata";
|
||||
|
||||
/// Metadata stored on disk for each timeline
|
||||
///
|
||||
/// The fields correspond to the values we hold in memory, in LayeredTimeline.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub struct TimelineMetadata {
|
||||
disk_consistent_lsn: Lsn,
|
||||
// This is only set if we know it. We track it in memory when the page
|
||||
// server is running, but we only track the value corresponding to
|
||||
// 'last_record_lsn', not 'disk_consistent_lsn' which can lag behind by a
|
||||
// lot. We only store it in the metadata file when we flush *all* the
|
||||
// in-memory data so that 'last_record_lsn' is the same as
|
||||
// 'disk_consistent_lsn'. That's OK, because after page server restart, as
|
||||
// soon as we reprocess at least one record, we will have a valid
|
||||
// 'prev_record_lsn' value in memory again. This is only really needed when
|
||||
// doing a clean shutdown, so that there is no more WAL beyond
|
||||
// 'disk_consistent_lsn'
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
ancestor_timeline: Option<ZTimelineId>,
|
||||
ancestor_lsn: Lsn,
|
||||
}
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain timeline's metadata file should be located.
|
||||
pub fn metadata_path(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
) -> PathBuf {
|
||||
conf.timeline_path(&timelineid, &tenantid)
|
||||
.join(METADATA_FILE_NAME)
|
||||
}
|
||||
|
||||
impl TimelineMetadata {
|
||||
pub fn new(
|
||||
disk_consistent_lsn: Lsn,
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
ancestor_timeline: Option<ZTimelineId>,
|
||||
ancestor_lsn: Lsn,
|
||||
) -> Self {
|
||||
Self {
|
||||
disk_consistent_lsn,
|
||||
prev_record_lsn,
|
||||
ancestor_timeline,
|
||||
ancestor_lsn,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_bytes(metadata_bytes: &[u8]) -> anyhow::Result<Self> {
|
||||
ensure!(
|
||||
metadata_bytes.len() == METADATA_MAX_SAFE_SIZE,
|
||||
"metadata bytes size is wrong"
|
||||
);
|
||||
|
||||
let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE];
|
||||
let calculated_checksum = crc32c::crc32c(data);
|
||||
|
||||
let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] =
|
||||
metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?;
|
||||
let expected_checksum = u32::from_le_bytes(*checksum_bytes);
|
||||
ensure!(
|
||||
calculated_checksum == expected_checksum,
|
||||
"metadata checksum mismatch"
|
||||
);
|
||||
|
||||
let data = TimelineMetadata::from(serialize::DeTimelineMetadata::des_prefix(data)?);
|
||||
assert!(data.disk_consistent_lsn.is_aligned());
|
||||
|
||||
Ok(data)
|
||||
}
|
||||
|
||||
pub fn to_bytes(&self) -> anyhow::Result<Vec<u8>> {
|
||||
let serializeable_metadata = serialize::SeTimelineMetadata::from(self);
|
||||
let mut metadata_bytes = serialize::SeTimelineMetadata::ser(&serializeable_metadata)?;
|
||||
assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE);
|
||||
metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8);
|
||||
|
||||
let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]);
|
||||
metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum));
|
||||
Ok(metadata_bytes)
|
||||
}
|
||||
|
||||
/// [`Lsn`] that corresponds to the corresponding timeline directory
|
||||
/// contents, stored locally in the pageserver workdir.
|
||||
pub fn disk_consistent_lsn(&self) -> Lsn {
|
||||
self.disk_consistent_lsn
|
||||
}
|
||||
|
||||
pub fn prev_record_lsn(&self) -> Option<Lsn> {
|
||||
self.prev_record_lsn
|
||||
}
|
||||
|
||||
pub fn ancestor_timeline(&self) -> Option<ZTimelineId> {
|
||||
self.ancestor_timeline
|
||||
}
|
||||
|
||||
pub fn ancestor_lsn(&self) -> Lsn {
|
||||
self.ancestor_lsn
|
||||
}
|
||||
}
|
||||
|
||||
/// This module is for direct conversion of metadata to bytes and back.
|
||||
/// For a certain metadata, besides the conversion a few verification steps has to
|
||||
/// be done, so all serde derives are hidden from the user, to avoid accidental
|
||||
/// verification-less metadata creation.
|
||||
mod serialize {
|
||||
use serde::{Deserialize, Serialize};
|
||||
use zenith_utils::{lsn::Lsn, zid::ZTimelineId};
|
||||
|
||||
use super::TimelineMetadata;
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub(super) struct SeTimelineMetadata<'a> {
|
||||
disk_consistent_lsn: &'a Lsn,
|
||||
prev_record_lsn: &'a Option<Lsn>,
|
||||
ancestor_timeline: &'a Option<ZTimelineId>,
|
||||
ancestor_lsn: &'a Lsn,
|
||||
}
|
||||
|
||||
impl<'a> From<&'a TimelineMetadata> for SeTimelineMetadata<'a> {
|
||||
fn from(other: &'a TimelineMetadata) -> Self {
|
||||
Self {
|
||||
disk_consistent_lsn: &other.disk_consistent_lsn,
|
||||
prev_record_lsn: &other.prev_record_lsn,
|
||||
ancestor_timeline: &other.ancestor_timeline,
|
||||
ancestor_lsn: &other.ancestor_lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
pub(super) struct DeTimelineMetadata {
|
||||
disk_consistent_lsn: Lsn,
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
ancestor_timeline: Option<ZTimelineId>,
|
||||
ancestor_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl From<DeTimelineMetadata> for TimelineMetadata {
|
||||
fn from(other: DeTimelineMetadata) -> Self {
|
||||
Self {
|
||||
disk_consistent_lsn: other.disk_consistent_lsn,
|
||||
prev_record_lsn: other.prev_record_lsn,
|
||||
ancestor_timeline: other.ancestor_timeline,
|
||||
ancestor_lsn: other.ancestor_lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::repository::repo_harness::TIMELINE_ID;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn metadata_serializes_correctly() {
|
||||
let original_metadata = TimelineMetadata {
|
||||
disk_consistent_lsn: Lsn(0x200),
|
||||
prev_record_lsn: Some(Lsn(0x100)),
|
||||
ancestor_timeline: Some(TIMELINE_ID),
|
||||
ancestor_lsn: Lsn(0),
|
||||
};
|
||||
|
||||
let metadata_bytes = original_metadata
|
||||
.to_bytes()
|
||||
.expect("Should serialize correct metadata to bytes");
|
||||
|
||||
let deserialized_metadata = TimelineMetadata::from_bytes(&metadata_bytes)
|
||||
.expect("Should deserialize its own bytes");
|
||||
|
||||
assert_eq!(
|
||||
deserialized_metadata, original_metadata,
|
||||
"Metadata that was serialized to bytes and deserialized back should not change"
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -2,6 +2,7 @@
|
||||
//! Common traits and structs for layers
|
||||
//!
|
||||
|
||||
use crate::layered_repository::InMemoryLayer;
|
||||
use crate::relish::RelishTag;
|
||||
use crate::repository::WALRecord;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
@@ -104,6 +105,12 @@ pub enum PageReconstructResult {
|
||||
/// in-memory and on-disk layers.
|
||||
///
|
||||
pub trait Layer: Send + Sync {
|
||||
|
||||
fn upgrade_to_inmemory_layer(&self) -> Option<&InMemoryLayer> {
|
||||
None
|
||||
}
|
||||
|
||||
/// Identify the timeline this relish belongs to
|
||||
fn get_tenant_id(&self) -> ZTenantId;
|
||||
|
||||
/// Identify the timeline this relish belongs to
|
||||
|
||||
@@ -18,10 +18,10 @@ pub mod relish_storage;
|
||||
pub mod repository;
|
||||
pub mod restore_local_repo;
|
||||
pub mod tenant_mgr;
|
||||
pub mod tenant_threads;
|
||||
pub mod waldecoder;
|
||||
pub mod walreceiver;
|
||||
pub mod walredo;
|
||||
pub mod vfd;
|
||||
|
||||
pub mod defaults {
|
||||
use const_format::formatcp;
|
||||
@@ -43,8 +43,6 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
|
||||
pub const DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS: usize = 100;
|
||||
|
||||
pub const DEFAULT_OPEN_MEM_LIMIT: usize = 128 * 1024 * 1024;
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
@@ -73,8 +71,6 @@ pub struct PageServerConf {
|
||||
pub gc_period: Duration,
|
||||
pub superuser: String,
|
||||
|
||||
pub open_mem_limit: usize,
|
||||
|
||||
// Repository directory, relative to current working directory.
|
||||
// Normally, the page server changes the current working directory
|
||||
// to the repository, and 'workdir' is always '.'. But we don't do
|
||||
@@ -157,7 +153,6 @@ impl PageServerConf {
|
||||
checkpoint_period: Duration::from_secs(10),
|
||||
gc_horizon: defaults::DEFAULT_GC_HORIZON,
|
||||
gc_period: Duration::from_secs(10),
|
||||
open_mem_limit: defaults::DEFAULT_OPEN_MEM_LIMIT,
|
||||
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
|
||||
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
|
||||
superuser: "zenith_admin".to_string(),
|
||||
@@ -170,15 +165,6 @@ impl PageServerConf {
|
||||
}
|
||||
}
|
||||
|
||||
/// Config for the Repository checkpointer
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum CheckpointConfig {
|
||||
// Flush in-memory data that is older than this
|
||||
Distance(u64),
|
||||
// Flush all in-memory data
|
||||
Forced,
|
||||
}
|
||||
|
||||
/// External relish storage configuration, enough for creating a client for that storage.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RelishStorageConfig {
|
||||
|
||||
@@ -630,9 +630,7 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
|
||||
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
|
||||
|
||||
// since these handlers for tenant/branch commands are deprecated (in favor of http based ones)
|
||||
// just use false in place of include non incremental logical size
|
||||
let branches = crate::branches::get_branches(self.conf, &tenantid, false)?;
|
||||
let branches = crate::branches::get_branches(self.conf, &tenantid)?;
|
||||
let branches_buf = serde_json::to_vec(&branches)?;
|
||||
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
@@ -692,7 +690,7 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
|
||||
let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?;
|
||||
let result = repo.gc_manual(Some(timelineid), gc_horizon, true)?;
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::int8_col(b"layer_relfiles_total"),
|
||||
|
||||
@@ -16,9 +16,8 @@ use anyhow::{bail, Context};
|
||||
use tokio::{fs, io};
|
||||
use tracing::*;
|
||||
|
||||
use crate::layered_repository::metadata::METADATA_FILE_NAME;
|
||||
|
||||
use super::{parse_ids_from_path, strip_path_prefix, RelishStorage, RemoteRelishInfo};
|
||||
use crate::layered_repository::METADATA_FILE_NAME;
|
||||
|
||||
pub struct LocalFs {
|
||||
pageserver_workdir: &'static Path,
|
||||
@@ -215,7 +214,6 @@ async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()>
|
||||
#[cfg(test)]
|
||||
mod pure_tests {
|
||||
use crate::{
|
||||
layered_repository::metadata::METADATA_FILE_NAME,
|
||||
relish_storage::test_utils::{
|
||||
custom_tenant_id_path, custom_timeline_id_path, relative_timeline_path,
|
||||
},
|
||||
|
||||
@@ -11,7 +11,7 @@ use anyhow::Context;
|
||||
use s3::{bucket::Bucket, creds::Credentials, region::Region};
|
||||
|
||||
use crate::{
|
||||
layered_repository::metadata::METADATA_FILE_NAME,
|
||||
layered_repository::METADATA_FILE_NAME,
|
||||
relish_storage::{parse_ids_from_path, strip_path_prefix, RelishStorage, RemoteRelishInfo},
|
||||
S3Config,
|
||||
};
|
||||
|
||||
@@ -75,7 +75,7 @@ use tracing::*;
|
||||
|
||||
use super::{RelishStorage, RemoteRelishInfo};
|
||||
use crate::{
|
||||
layered_repository::metadata::{metadata_path, TimelineMetadata},
|
||||
layered_repository::{metadata_path, TimelineMetadata},
|
||||
tenant_mgr::register_relish_download,
|
||||
PageServerConf,
|
||||
};
|
||||
@@ -151,9 +151,7 @@ struct RemoteTimeline {
|
||||
|
||||
impl RemoteTimeline {
|
||||
fn disk_consistent_lsn(&self) -> Option<Lsn> {
|
||||
self.metadata
|
||||
.as_ref()
|
||||
.map(|meta| meta.disk_consistent_lsn())
|
||||
self.metadata.as_ref().map(|meta| meta.disk_consistent_lsn)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -335,7 +333,7 @@ fn latest_timelines(
|
||||
if latest_timeline_id != &remote_timeline_id
|
||||
&& timeline_metadata
|
||||
.as_ref()
|
||||
.map(|metadata| metadata.disk_consistent_lsn())
|
||||
.map(|metadata| metadata.disk_consistent_lsn)
|
||||
< remote_timeline_data.disk_consistent_lsn()
|
||||
{
|
||||
*latest_timeline_id = remote_timeline_id;
|
||||
@@ -520,8 +518,8 @@ async fn upload_timeline<'a, P, S: 'static + RelishStorage<RelishStoragePath = P
|
||||
match &uploaded_timeline_files.metadata {
|
||||
None => debug!("Partially uploaded timeline found, downloading missing files only"),
|
||||
Some(remote_metadata) => {
|
||||
let new_lsn = new_upload.metadata.disk_consistent_lsn();
|
||||
let remote_lsn = remote_metadata.disk_consistent_lsn();
|
||||
let new_lsn = new_upload.metadata.disk_consistent_lsn;
|
||||
let remote_lsn = remote_metadata.disk_consistent_lsn;
|
||||
match new_lsn.cmp(&remote_lsn) {
|
||||
Ordering::Equal | Ordering::Less => {
|
||||
warn!(
|
||||
@@ -905,7 +903,7 @@ mod tests {
|
||||
|
||||
let new_upload_metadata = dummy_metadata(Lsn(0x20));
|
||||
assert!(
|
||||
new_upload_metadata.disk_consistent_lsn() < first_upload_metadata.disk_consistent_lsn()
|
||||
new_upload_metadata.disk_consistent_lsn < first_upload_metadata.disk_consistent_lsn
|
||||
);
|
||||
let new_upload =
|
||||
create_local_timeline(&repo_harness, TIMELINE_ID, &["b", "c"], new_upload_metadata)?;
|
||||
@@ -929,8 +927,7 @@ mod tests {
|
||||
)?;
|
||||
let second_paths = second_timeline.layers.clone();
|
||||
assert!(
|
||||
first_upload_metadata.disk_consistent_lsn()
|
||||
< second_upload_metadata.disk_consistent_lsn()
|
||||
first_upload_metadata.disk_consistent_lsn < second_upload_metadata.disk_consistent_lsn
|
||||
);
|
||||
ensure_correct_timeline_upload(
|
||||
&repo_harness,
|
||||
@@ -958,8 +955,7 @@ mod tests {
|
||||
|
||||
let third_upload_metadata = dummy_metadata(Lsn(0x50));
|
||||
assert!(
|
||||
second_upload_metadata.disk_consistent_lsn()
|
||||
< third_upload_metadata.disk_consistent_lsn()
|
||||
second_upload_metadata.disk_consistent_lsn < third_upload_metadata.disk_consistent_lsn
|
||||
);
|
||||
let third_timeline = create_local_timeline(
|
||||
&repo_harness,
|
||||
@@ -1253,7 +1249,7 @@ mod tests {
|
||||
while let Some(task) = queue_accessor.pop() {
|
||||
let task_lsn = match &task {
|
||||
SyncTask::Upload(LocalTimeline { metadata, .. }) => {
|
||||
Some(metadata.disk_consistent_lsn())
|
||||
Some(metadata.disk_consistent_lsn)
|
||||
}
|
||||
SyncTask::UrgentDownload(remote_timeline) | SyncTask::Download(remote_timeline) => {
|
||||
remote_timeline.disk_consistent_lsn()
|
||||
@@ -1261,8 +1257,8 @@ mod tests {
|
||||
};
|
||||
|
||||
if let Some(task_lsn) = task_lsn {
|
||||
if task_lsn == smaller_lsn_metadata.disk_consistent_lsn()
|
||||
|| task_lsn == bigger_lsn_metadata.disk_consistent_lsn()
|
||||
if task_lsn == smaller_lsn_metadata.disk_consistent_lsn
|
||||
|| task_lsn == bigger_lsn_metadata.disk_consistent_lsn
|
||||
{
|
||||
ordered_tasks.push(task);
|
||||
}
|
||||
@@ -1553,6 +1549,11 @@ mod tests {
|
||||
}
|
||||
|
||||
fn dummy_metadata(disk_consistent_lsn: Lsn) -> TimelineMetadata {
|
||||
TimelineMetadata::new(disk_consistent_lsn, None, None, Lsn(0))
|
||||
TimelineMetadata {
|
||||
disk_consistent_lsn,
|
||||
prev_record_lsn: None,
|
||||
ancestor_timeline: None,
|
||||
ancestor_lsn: Lsn(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use crate::relish::*;
|
||||
use crate::CheckpointConfig;
|
||||
use anyhow::Result;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -25,25 +24,25 @@ pub trait Repository: Send + Sync {
|
||||
/// Branch a timeline
|
||||
fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>;
|
||||
|
||||
/// perform one garbage collection iteration, removing old data files from disk.
|
||||
/// this funtion is periodically called by gc thread.
|
||||
/// also it can be explicitly requested through page server api 'do_gc' command.
|
||||
/// perform one garbage collection iteration.
|
||||
/// garbage collection is periodically performed by gc thread,
|
||||
/// but it can be explicitly requested through page server api.
|
||||
///
|
||||
/// 'timelineid' specifies the timeline to GC, or None for all.
|
||||
/// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval).
|
||||
/// `checkpoint_before_gc` parameter is used to force compaction of storage before CG
|
||||
/// to make tests more deterministic.
|
||||
/// TODO Do we still need it or we can call checkpoint explicitly in tests where needed?
|
||||
fn gc_iteration(
|
||||
fn gc_manual(
|
||||
&self,
|
||||
timelineid: Option<ZTimelineId>,
|
||||
horizon: u64,
|
||||
checkpoint_before_gc: bool,
|
||||
) -> Result<GcResult>;
|
||||
|
||||
/// perform one checkpoint iteration, flushing in-memory data on disk.
|
||||
/// this function is periodically called by checkponter thread.
|
||||
fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()>;
|
||||
fn gc_scheduled(&self) -> Result<GcResult>;
|
||||
|
||||
fn upgrade_to_layered_repository(&self) -> &crate::layered_repository::LayeredRepository;
|
||||
}
|
||||
|
||||
///
|
||||
@@ -149,7 +148,9 @@ pub trait Timeline: Send + Sync {
|
||||
///
|
||||
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
|
||||
/// know anything about them here in the repository.
|
||||
fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()>;
|
||||
fn checkpoint_forced(&self) -> Result<()>;
|
||||
|
||||
fn checkpoint_scheduled(&self) -> Result<()>;
|
||||
|
||||
/// Retrieve current logical size of the timeline
|
||||
///
|
||||
@@ -161,7 +162,6 @@ pub trait Timeline: Send + Sync {
|
||||
/// Used in tests to ensure thet incremental and non incremental variants match.
|
||||
fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize>;
|
||||
|
||||
/// An escape hatch to allow "casting" a generic Timeline to LayeredTimeline.
|
||||
fn upgrade_to_layered_timeline(&self) -> &crate::layered_repository::LayeredTimeline;
|
||||
}
|
||||
|
||||
@@ -326,10 +326,9 @@ pub mod repo_harness {
|
||||
#[allow(clippy::bool_assert_comparison)]
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::layered_repository::metadata::METADATA_FILE_NAME;
|
||||
|
||||
use super::repo_harness::*;
|
||||
use super::*;
|
||||
use crate::layered_repository::METADATA_FILE_NAME;
|
||||
use postgres_ffi::{pg_constants, xlog_utils::SIZEOF_CHECKPOINT};
|
||||
|
||||
/// Arbitrary relation tag, for testing.
|
||||
@@ -723,8 +722,8 @@ mod tests {
|
||||
.contains(&TESTREL_A));
|
||||
|
||||
// Run checkpoint and garbage collection and check that it's still not visible
|
||||
newtline.checkpoint(CheckpointConfig::Forced)?;
|
||||
repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?;
|
||||
newtline.checkpoint_forced()?;
|
||||
repo.gc_manual(Some(NEW_TIMELINE_ID), 0, true)?;
|
||||
|
||||
assert!(!newtline
|
||||
.list_rels(0, TESTDB, Lsn(0x40))?
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
use crate::branches;
|
||||
use crate::layered_repository::LayeredRepository;
|
||||
use crate::repository::{Repository, Timeline};
|
||||
use crate::tenant_threads;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::PageServerConf;
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
@@ -17,6 +16,7 @@ use std::fs;
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
use std::thread::JoinHandle;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
lazy_static! {
|
||||
@@ -28,7 +28,7 @@ struct Tenant {
|
||||
repo: Option<Arc<dyn Repository>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, Copy)]
|
||||
pub enum TenantState {
|
||||
// This tenant only exists in cloud storage. It cannot be accessed.
|
||||
CloudOnly,
|
||||
@@ -41,12 +41,10 @@ pub enum TenantState {
|
||||
// This tenant exists on local disk, and the layer map has been loaded into memory.
|
||||
// The local disk might have some newer files that don't exist in cloud storage yet.
|
||||
Active,
|
||||
// Tenant is active, but there is no walreceiver connection.
|
||||
Idle,
|
||||
// This tenant exists on local disk, and the layer map has been loaded into memory.
|
||||
// The local disk might have some newer files that don't exist in cloud storage yet.
|
||||
// The tenant can no longer be accessed for anything other than graceful shutdown.
|
||||
Stopping,
|
||||
//Stopping,
|
||||
}
|
||||
|
||||
impl fmt::Display for TenantState {
|
||||
@@ -55,8 +53,6 @@ impl fmt::Display for TenantState {
|
||||
TenantState::CloudOnly => f.write_str("CloudOnly"),
|
||||
TenantState::Downloading => f.write_str("Downloading"),
|
||||
TenantState::Active => f.write_str("Active"),
|
||||
TenantState::Idle => f.write_str("Idle"),
|
||||
TenantState::Stopping => f.write_str("Stopping"),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -65,6 +61,18 @@ fn access_tenants() -> MutexGuard<'static, HashMap<ZTenantId, Tenant>> {
|
||||
TENANTS.lock().unwrap()
|
||||
}
|
||||
|
||||
/// Join handles of the background threads recorded for one tenant.
///
/// A field is `None` when no handle is currently stored for that thread.
struct TenantHandleEntry {
    /// Handle of the tenant's checkpointer thread.
    checkpointer_handle: Option<JoinHandle<()>>,
    /// Handle of the tenant's GC thread.
    gc_handle: Option<JoinHandle<()>>,
}
|
||||
|
||||
// Logically these handles belong to Repository,
|
||||
// but it's just simpler to store them separately
|
||||
lazy_static! {
|
||||
static ref TENANT_HANDLES: Mutex<HashMap<ZTenantId, TenantHandleEntry>> =
|
||||
Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
static SHUTDOWN_REQUESTED: AtomicBool = AtomicBool::new(false);
|
||||
|
||||
pub fn init(conf: &'static PageServerConf) {
|
||||
@@ -101,7 +109,7 @@ fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) {
|
||||
let mut m = access_tenants();
|
||||
let tenant = m.get_mut(&tenant_id).unwrap();
|
||||
tenant.repo = Some(repo);
|
||||
tenant.state = TenantState::Idle;
|
||||
tenant.state = TenantState::Active;
|
||||
}
|
||||
|
||||
pub fn register_relish_download(
|
||||
@@ -123,14 +131,10 @@ pub fn register_relish_download(
|
||||
});
|
||||
tenant.state = TenantState::Downloading;
|
||||
match &tenant.repo {
|
||||
Some(repo) => {
|
||||
init_timeline(repo.as_ref(), timeline_id);
|
||||
tenant.state = TenantState::Idle;
|
||||
return;
|
||||
}
|
||||
Some(repo) => init_timeline(repo.as_ref(), timeline_id),
|
||||
None => log::warn!("Initialize new repo"),
|
||||
}
|
||||
tenant.state = TenantState::Idle;
|
||||
tenant.state = TenantState::Active;
|
||||
}
|
||||
|
||||
// init repo updates Tenant state
|
||||
@@ -151,23 +155,27 @@ pub fn shutdown_requested() -> bool {
|
||||
SHUTDOWN_REQUESTED.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
pub fn stop_tenant_threads(tenantid: ZTenantId) {
|
||||
let mut handles = TENANT_HANDLES.lock().unwrap();
|
||||
if let Some(h) = handles.get_mut(&tenantid) {
|
||||
h.checkpointer_handle.take().map(JoinHandle::join);
|
||||
debug!("checkpointer for tenant {} has stopped", tenantid);
|
||||
h.gc_handle.take().map(JoinHandle::join);
|
||||
debug!("gc for tenant {} has stopped", tenantid);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn shutdown_all_tenants() -> Result<()> {
|
||||
SHUTDOWN_REQUESTED.swap(true, Ordering::Relaxed);
|
||||
|
||||
let tenantids = list_tenantids()?;
|
||||
|
||||
for tenantid in &tenantids {
|
||||
set_tenant_state(*tenantid, TenantState::Stopping)?;
|
||||
}
|
||||
|
||||
for tenantid in tenantids {
|
||||
// Wait for checkpointer and GC to finish their job
|
||||
tenant_threads::wait_for_tenant_threads_to_stop(tenantid);
|
||||
|
||||
stop_tenant_threads(tenantid);
|
||||
let repo = get_repository_for_tenant(tenantid)?;
|
||||
debug!("shutdown tenant {}", tenantid);
|
||||
repo.shutdown()?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -194,38 +202,11 @@ pub fn create_repository_for_tenant(
|
||||
let mut m = access_tenants();
|
||||
let tenant = m.get_mut(&tenantid).unwrap();
|
||||
tenant.repo = Some(repo);
|
||||
tenant.state = TenantState::Idle;
|
||||
tenant.state = TenantState::Active;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// If tenant is not found in the repository, return CloudOnly state
|
||||
pub fn get_tenant_state(tenantid: ZTenantId) -> TenantState {
|
||||
let m = access_tenants();
|
||||
match m.get(&tenantid) {
|
||||
Some(tenant) => tenant.state,
|
||||
None => TenantState::CloudOnly,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_tenant_state(tenantid: ZTenantId, newstate: TenantState) -> Result<TenantState> {
|
||||
let mut m = access_tenants();
|
||||
let tenant = m.get_mut(&tenantid);
|
||||
|
||||
match tenant {
|
||||
Some(tenant) => {
|
||||
if newstate == TenantState::Idle && tenant.state != TenantState::Active {
|
||||
// Only Active tenant can become Idle
|
||||
return Ok(tenant.state);
|
||||
}
|
||||
info!("set_tenant_state: {} -> {}", tenant.state, newstate);
|
||||
tenant.state = newstate;
|
||||
Ok(tenant.state)
|
||||
}
|
||||
None => bail!("Tenant not found for tenant {}", tenantid),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<dyn Repository>> {
|
||||
let m = access_tenants();
|
||||
let tenant = m
|
||||
|
||||
@@ -1,149 +0,0 @@
|
||||
//! This module contains functions to serve per-tenant background processes,
|
||||
//! such as checkpointer and GC
|
||||
use crate::tenant_mgr;
|
||||
use crate::tenant_mgr::TenantState;
|
||||
use crate::CheckpointConfig;
|
||||
use crate::PageServerConf;
|
||||
use anyhow::Result;
|
||||
use lazy_static::lazy_static;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Mutex;
|
||||
use std::thread::JoinHandle;
|
||||
use std::time::Duration;
|
||||
use tracing::*;
|
||||
use zenith_metrics::{register_int_gauge_vec, IntGaugeVec};
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
/// Per-tenant record of the background thread handles.
///
/// Both fields start out as `None` and are filled in when the corresponding
/// thread is spawned.
struct TenantHandleEntry {
    /// Join handle of the checkpointer thread, if one is stored.
    checkpointer_handle: Option<JoinHandle<()>>,
    /// Join handle of the GC thread, if one is stored.
    gc_handle: Option<JoinHandle<()>>,
}
|
||||
|
||||
// Preserve handles to wait for thread completion
|
||||
// at shutdown
|
||||
lazy_static! {
|
||||
static ref TENANT_HANDLES: Mutex<HashMap<ZTenantId, TenantHandleEntry>> =
|
||||
Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
lazy_static! {
    // Gauge counting live tenant threads, labelled by thread type.
    // Registration failure is treated as fatal.
    static ref TENANT_THREADS_COUNT: IntGaugeVec = {
        let registered = register_int_gauge_vec!(
            "tenant_threads_count",
            "Number of live tenant threads",
            &["tenant_thread_type"]
        );
        registered.expect("failed to define a metric")
    };
}
|
||||
|
||||
// Launch checkpointer and GC for the tenant.
|
||||
// It's possible that the threads are running already,
|
||||
// if so, just don't spawn new ones.
|
||||
pub fn start_tenant_threads(conf: &'static PageServerConf, tenantid: ZTenantId) {
|
||||
let mut handles = TENANT_HANDLES.lock().unwrap();
|
||||
let h = handles
|
||||
.entry(tenantid)
|
||||
.or_insert_with(|| TenantHandleEntry {
|
||||
checkpointer_handle: None,
|
||||
gc_handle: None,
|
||||
});
|
||||
|
||||
if h.checkpointer_handle.is_none() {
|
||||
h.checkpointer_handle = std::thread::Builder::new()
|
||||
.name("Checkpointer thread".into())
|
||||
.spawn(move || {
|
||||
checkpoint_loop(tenantid, conf).expect("Checkpointer thread died");
|
||||
})
|
||||
.ok();
|
||||
}
|
||||
|
||||
if h.gc_handle.is_none() {
|
||||
h.gc_handle = std::thread::Builder::new()
|
||||
.name("GC thread".into())
|
||||
.spawn(move || {
|
||||
gc_loop(tenantid, conf).expect("GC thread died");
|
||||
})
|
||||
.ok();
|
||||
}
|
||||
}
|
||||
|
||||
pub fn wait_for_tenant_threads_to_stop(tenantid: ZTenantId) {
|
||||
let mut handles = TENANT_HANDLES.lock().unwrap();
|
||||
if let Some(h) = handles.get_mut(&tenantid) {
|
||||
h.checkpointer_handle.take().map(JoinHandle::join);
|
||||
trace!("checkpointer for tenant {} has stopped", tenantid);
|
||||
h.gc_handle.take().map(JoinHandle::join);
|
||||
trace!("gc for tenant {} has stopped", tenantid);
|
||||
}
|
||||
handles.remove(&tenantid);
|
||||
}
|
||||
|
||||
///
|
||||
/// Checkpointer thread's main loop
|
||||
///
|
||||
fn checkpoint_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
|
||||
let gauge = TENANT_THREADS_COUNT.with_label_values(&["checkpointer"]);
|
||||
gauge.inc();
|
||||
scopeguard::defer! {
|
||||
gauge.dec();
|
||||
}
|
||||
|
||||
loop {
|
||||
if tenant_mgr::get_tenant_state(tenantid) != TenantState::Active {
|
||||
break;
|
||||
}
|
||||
|
||||
std::thread::sleep(conf.checkpoint_period);
|
||||
trace!("checkpointer thread for tenant {} waking up", tenantid);
|
||||
|
||||
// checkpoint timelines that have accumulated more than CHECKPOINT_DISTANCE
|
||||
// bytes of WAL since last checkpoint.
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
repo.checkpoint_iteration(CheckpointConfig::Distance(conf.checkpoint_distance))?;
|
||||
}
|
||||
|
||||
trace!(
|
||||
"checkpointer thread stopped for tenant {} state is {}",
|
||||
tenantid,
|
||||
tenant_mgr::get_tenant_state(tenantid)
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// GC thread's main loop
|
||||
///
|
||||
fn gc_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
|
||||
let gauge = TENANT_THREADS_COUNT.with_label_values(&["gc"]);
|
||||
gauge.inc();
|
||||
scopeguard::defer! {
|
||||
gauge.dec();
|
||||
}
|
||||
|
||||
loop {
|
||||
if tenant_mgr::get_tenant_state(tenantid) != TenantState::Active {
|
||||
break;
|
||||
}
|
||||
|
||||
trace!("gc thread for tenant {} waking up", tenantid);
|
||||
|
||||
// Garbage collect old files that are not needed for PITR anymore
|
||||
if conf.gc_horizon > 0 {
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
repo.gc_iteration(None, conf.gc_horizon, false).unwrap();
|
||||
}
|
||||
|
||||
// TODO Write it in more adequate way using
|
||||
// condvar.wait_timeout() or something
|
||||
let mut sleep_time = conf.gc_period.as_secs();
|
||||
while sleep_time > 0 && tenant_mgr::get_tenant_state(tenantid) == TenantState::Active {
|
||||
sleep_time -= 1;
|
||||
std::thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
||||
trace!(
|
||||
"GC thread stopped for tenant {} state is {}",
|
||||
tenantid,
|
||||
tenant_mgr::get_tenant_state(tenantid)
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
155
pageserver/src/vfd.rs
Normal file
155
pageserver/src/vfd.rs
Normal file
@@ -0,0 +1,155 @@
|
||||
use std::fs::File;
|
||||
use std::io::Seek;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Mutex;
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
const INVALID_TAG: u64 = u64::MAX;
|
||||
|
||||
struct OpenFiles {
|
||||
next: usize,
|
||||
files: Vec<OpenFile>,
|
||||
}
|
||||
|
||||
lazy_static! {
    // The global handle table; it starts out with no slots.
    static ref OPEN_FILES: Mutex<OpenFiles> = {
        let empty = OpenFiles {
            next: 0,
            files: Vec::new(),
        };
        Mutex::new(empty)
    };
}
|
||||
|
||||
/// One slot of the global handle table.
struct OpenFile {
    /// Incremented every time a file is stored in this slot; a `VirtualFile`
    /// compares it with its own `tag` to tell whether the slot is still its own.
    tag: u64,
    /// The cached handle, or `None` while the slot holds no file.
    file: Option<File>,
}
|
||||
|
||||
/// A file identified by its path whose open handle can be parked in the
/// global handle table between uses.
pub struct VirtualFile {
    /// Index of the table slot this handle last cached a file in.
    vfd: usize,
    /// Tag the slot had when this handle cached its file there; `INVALID_TAG`
    /// on a freshly created handle.
    tag: u64,

    /// Path used to open the file when no cached handle is available.
    path: PathBuf,
}
|
||||
|
||||
impl VirtualFile {
|
||||
|
||||
pub fn new(path: &Path) -> VirtualFile {
|
||||
VirtualFile {
|
||||
vfd: 0,
|
||||
tag: INVALID_TAG,
|
||||
path: path.to_path_buf(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn open(&mut self) -> std::io::Result<File> {
|
||||
|
||||
let mut l = OPEN_FILES.lock().unwrap();
|
||||
|
||||
if self.vfd < l.files.len() && l.files[self.vfd].tag == self.tag {
|
||||
|
||||
if let Some(mut file) = l.files[self.vfd].file.take() {
|
||||
// return cached File
|
||||
file.rewind()?;
|
||||
return Ok(file);
|
||||
}
|
||||
}
|
||||
File::open(&self.path)
|
||||
}
|
||||
|
||||
pub fn cache(&mut self, file: File) {
|
||||
|
||||
let mut l = OPEN_FILES.lock().unwrap();
|
||||
|
||||
let next = if l.next >= l.files.len() {
|
||||
if l.files.len() < 100 {
|
||||
l.files.push(OpenFile {
|
||||
tag: 0,
|
||||
file: None
|
||||
});
|
||||
l.files.len() - 1
|
||||
} else {
|
||||
// wrap around
|
||||
0
|
||||
}
|
||||
} else {
|
||||
l.next
|
||||
};
|
||||
l.next = next + 1;
|
||||
|
||||
l.files[next].file.replace(file);
|
||||
l.files[next].tag += 1;
|
||||
|
||||
self.vfd = next;
|
||||
self.tag = l.files[next].tag;
|
||||
|
||||
drop(l);
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for VirtualFile {
|
||||
fn drop(&mut self) {
|
||||
|
||||
// Close file if it's still open
|
||||
|
||||
if self.tag != INVALID_TAG {
|
||||
let mut l = OPEN_FILES.lock().unwrap();
|
||||
|
||||
if self.vfd < l.files.len() && l.files[self.vfd].tag == self.tag {
|
||||
l.files[self.vfd].file.take();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::PageServerConf;
    use std::io::Read;

    /// Opens `vfile`, checks that it reads back `expected` from the start,
    /// and hands the file back to the cache.
    fn read_and_cache(vfile: &mut VirtualFile, expected: &str) -> anyhow::Result<()> {
        let mut file = vfile.open()?;
        let mut content = String::new();
        file.read_to_string(&mut content)?;

        assert!(content == expected);

        vfile.cache(file);
        Ok(())
    }

    /// Creates many more files than the cache has slots and reads each of
    /// them, re-reading the first file after every step.
    #[test]
    fn test_vfd() -> anyhow::Result<()> {
        let test_dir = PageServerConf::test_repo_dir("test_vfd");
        let _ = std::fs::remove_dir_all(&test_dir);
        std::fs::create_dir_all(&test_dir)?;

        let mut vfiles = Vec::new();
        for i in 0..2000 {
            let path = test_dir.join(format!("vfd_test{}", i));
            let content = format!("foobar{}", i);

            std::fs::write(&path, &content)?;

            vfiles.push((VirtualFile::new(&path), path, content));
        }

        for i in 0..vfiles.len() {
            {
                let (vfile, _path, expected) = &mut vfiles[i];
                read_and_cache(vfile, expected.as_str())?;
            }

            let (vfile, _path, expected) = &mut vfiles[0];
            read_and_cache(vfile, expected.as_str())?;
        }

        Ok(())
    }
}
|
||||
@@ -5,12 +5,9 @@
|
||||
//!
|
||||
//! We keep one WAL receiver active per timeline.
|
||||
|
||||
use crate::layered_repository;
|
||||
use crate::relish::*;
|
||||
use crate::restore_local_repo;
|
||||
use crate::tenant_mgr;
|
||||
use crate::tenant_mgr::TenantState;
|
||||
use crate::tenant_threads;
|
||||
use crate::waldecoder::*;
|
||||
use crate::PageServerConf;
|
||||
use anyhow::{bail, Error, Result};
|
||||
@@ -41,7 +38,6 @@ use zenith_utils::zid::ZTimelineId;
|
||||
struct WalReceiverEntry {
|
||||
wal_producer_connstr: String,
|
||||
wal_receiver_handle: Option<JoinHandle<()>>,
|
||||
tenantid: ZTenantId,
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
@@ -69,23 +65,6 @@ pub fn stop_wal_receiver(timelineid: ZTimelineId) {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn drop_wal_receiver(timelineid: ZTimelineId, tenantid: ZTenantId) {
|
||||
let mut receivers = WAL_RECEIVERS.lock().unwrap();
|
||||
receivers.remove(&timelineid);
|
||||
|
||||
// Check if it was the last walreceiver of the tenant.
|
||||
// TODO now we store one WalReceiverEntry per timeline,
|
||||
// so this iterator looks a bit strange.
|
||||
for (_timelineid, entry) in receivers.iter() {
|
||||
if entry.tenantid == tenantid {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// When last walreceiver of the tenant is gone, change state to Idle
|
||||
tenant_mgr::set_tenant_state(tenantid, TenantState::Idle).unwrap();
|
||||
}
|
||||
|
||||
// Launch a new WAL receiver, or tell one that's running about change in connection string
|
||||
pub fn launch_wal_receiver(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -111,13 +90,8 @@ pub fn launch_wal_receiver(
|
||||
let receiver = WalReceiverEntry {
|
||||
wal_producer_connstr: wal_producer_connstr.into(),
|
||||
wal_receiver_handle: Some(wal_receiver_handle),
|
||||
tenantid,
|
||||
};
|
||||
receivers.insert(timelineid, receiver);
|
||||
|
||||
// Update tenant state and start tenant threads, if they are not running yet.
|
||||
tenant_mgr::set_tenant_state(tenantid, TenantState::Active).unwrap();
|
||||
tenant_threads::start_tenant_threads(conf, tenantid);
|
||||
}
|
||||
};
|
||||
}
|
||||
@@ -140,15 +114,11 @@ fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid:
|
||||
let _enter = info_span!("WAL receiver", timeline = %timelineid, tenant = %tenantid).entered();
|
||||
info!("WAL receiver thread started");
|
||||
|
||||
let mut retry_count = 10;
|
||||
|
||||
//
|
||||
// Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server,
|
||||
// and start streaming WAL from it. If the connection is lost, keep retrying.
|
||||
// TODO How long should we retry in case of losing connection?
|
||||
// Should we retry at all or we can wait for the next callmemaybe request?
|
||||
//
|
||||
while !tenant_mgr::shutdown_requested() && retry_count > 0 {
|
||||
while !tenant_mgr::shutdown_requested() {
|
||||
// Look up the current WAL producer address
|
||||
let wal_producer_connstr = get_wal_producer_connstr(timelineid);
|
||||
|
||||
@@ -159,24 +129,14 @@ fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid:
|
||||
"WAL streaming connection failed ({}), retrying in 1 second",
|
||||
e
|
||||
);
|
||||
retry_count -= 1;
|
||||
sleep(Duration::from_secs(1));
|
||||
} else {
|
||||
info!(
|
||||
"walreceiver disconnected tenant {}, timelineid {}",
|
||||
tenantid, timelineid
|
||||
);
|
||||
break;
|
||||
}
|
||||
}
|
||||
info!("WAL streaming shut down");
|
||||
// Drop it from list of active WAL_RECEIVERS
|
||||
// so that the next callmemaybe request launches a new thread
|
||||
drop_wal_receiver(timelineid, tenantid);
|
||||
debug!("WAL streaming shut down");
|
||||
}
|
||||
|
||||
fn walreceiver_main(
|
||||
conf: &PageServerConf,
|
||||
_conf: &PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
wal_producer_connstr: &str,
|
||||
tenantid: ZTenantId,
|
||||
@@ -291,14 +251,14 @@ fn walreceiver_main(
|
||||
last_rec_lsn = lsn;
|
||||
}
|
||||
|
||||
timeline.upgrade_to_layered_timeline().schedule_checkpoint_if_needed()?;
|
||||
timeline.upgrade_to_layered_timeline().schedule_gc_if_needed()?;
|
||||
|
||||
if !caught_up && endlsn >= end_of_wal {
|
||||
info!("caught up at LSN {}", endlsn);
|
||||
caught_up = true;
|
||||
}
|
||||
|
||||
// Release memory if needed
|
||||
layered_repository::evict_layer_if_needed(conf)?;
|
||||
|
||||
Some(endlsn)
|
||||
}
|
||||
|
||||
@@ -343,7 +303,6 @@ fn walreceiver_main(
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -28,19 +28,21 @@ use std::fs::OpenOptions;
|
||||
use std::io::prelude::*;
|
||||
use std::io::Error;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use std::process::{ChildStdin, ChildStdout, ChildStderr, Command};
|
||||
use std::process::Stdio;
|
||||
use std::sync::Mutex;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||
use tokio::process::{ChildStdin, ChildStdout, Command};
|
||||
use tokio::time::timeout;
|
||||
use zenith_metrics::{register_histogram, register_int_counter, Histogram, IntCounter};
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
use std::os::unix::io::AsRawFd;
|
||||
use nix::poll::*;
|
||||
|
||||
use crate::relish::*;
|
||||
use crate::repository::WALRecord;
|
||||
use crate::waldecoder::XlMultiXactCreate;
|
||||
@@ -133,14 +135,14 @@ lazy_static! {
|
||||
/// perform WAL replay. Only one thread can use the process at a time,
|
||||
/// that is controlled by the Mutex. In the future, we might want to
|
||||
/// launch a pool of processes to allow concurrent replay of multiple
|
||||
/// records.
|
||||
/// records. FIXME we have a pool now
|
||||
///
|
||||
pub struct PostgresRedoManager {
|
||||
tenantid: ZTenantId,
|
||||
conf: &'static PageServerConf,
|
||||
|
||||
runtime: tokio::runtime::Runtime,
|
||||
process: Mutex<Option<PostgresRedoProcess>>,
|
||||
processes: Vec<Mutex<Option<PostgresRedoProcess>>>,
|
||||
next: AtomicUsize,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -210,21 +212,18 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
end_time = Instant::now();
|
||||
WAL_REDO_TIME.observe(end_time.duration_since(start_time).as_secs_f64());
|
||||
} else {
|
||||
let mut process_guard = self.process.lock().unwrap();
|
||||
let process_no = self.next.fetch_add(1, Ordering::AcqRel) % self.processes.len();
|
||||
let mut process_guard = self.processes[process_no].lock().unwrap();
|
||||
let lock_time = Instant::now();
|
||||
|
||||
// launch the WAL redo process on first use
|
||||
if process_guard.is_none() {
|
||||
let p = self
|
||||
.runtime
|
||||
.block_on(PostgresRedoProcess::launch(self.conf, &self.tenantid))?;
|
||||
let p = PostgresRedoProcess::launch(self.conf, process_no, &self.tenantid)?;
|
||||
*process_guard = Some(p);
|
||||
}
|
||||
let process = process_guard.as_mut().unwrap();
|
||||
|
||||
result = self
|
||||
.runtime
|
||||
.block_on(self.handle_apply_request_postgres(process, &request));
|
||||
result = self.handle_apply_request_postgres(process, &request);
|
||||
|
||||
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
|
||||
end_time = Instant::now();
|
||||
@@ -240,27 +239,24 @@ impl PostgresRedoManager {
|
||||
/// Create a new PostgresRedoManager.
|
||||
///
|
||||
pub fn new(conf: &'static PageServerConf, tenantid: ZTenantId) -> PostgresRedoManager {
|
||||
// We block on waiting for requests on the walredo request channel, but
|
||||
// use async I/O to communicate with the child process. Initialize the
|
||||
// runtime for the async part.
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
let mut processes: Vec<Mutex<Option<PostgresRedoProcess>>> = Vec::new();
|
||||
for _ in 1..10 {
|
||||
processes.push(Mutex::new(None));
|
||||
}
|
||||
|
||||
// The actual process is launched lazily, on first request.
|
||||
PostgresRedoManager {
|
||||
runtime,
|
||||
tenantid,
|
||||
conf,
|
||||
process: Mutex::new(None),
|
||||
processes,
|
||||
next: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Process one request for WAL redo using wal-redo postgres
|
||||
///
|
||||
async fn handle_apply_request_postgres(
|
||||
fn handle_apply_request_postgres(
|
||||
&self,
|
||||
process: &mut PostgresRedoProcess,
|
||||
request: &WalRedoRequest,
|
||||
@@ -278,14 +274,14 @@ impl PostgresRedoManager {
|
||||
if let RelishTag::Relation(rel) = request.rel {
|
||||
// Relational WAL records are applied using wal-redo-postgres
|
||||
let buf_tag = BufferTag { rel, blknum };
|
||||
apply_result = process.apply_wal_records(buf_tag, base_img, records).await;
|
||||
apply_result = process.apply_wal_records(buf_tag, base_img, records);
|
||||
|
||||
let duration = start.elapsed();
|
||||
|
||||
debug!(
|
||||
"postgres applied {} WAL records in {} ms to reconstruct page image at LSN {}",
|
||||
trace!(
|
||||
"postgres applied {} WAL records in {} us to reconstruct page image at LSN {}",
|
||||
nrecords,
|
||||
duration.as_millis(),
|
||||
duration.as_micros(),
|
||||
lsn
|
||||
);
|
||||
|
||||
@@ -471,20 +467,22 @@ impl PostgresRedoManager {
|
||||
struct PostgresRedoProcess {
|
||||
stdin: ChildStdin,
|
||||
stdout: ChildStdout,
|
||||
stderr: ChildStderr,
|
||||
}
|
||||
|
||||
impl PostgresRedoProcess {
|
||||
//
|
||||
// Start postgres binary in special WAL redo mode.
|
||||
//
|
||||
async fn launch(
|
||||
fn launch(
|
||||
conf: &PageServerConf,
|
||||
process_no: usize,
|
||||
tenantid: &ZTenantId,
|
||||
) -> Result<PostgresRedoProcess, Error> {
|
||||
// FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
|
||||
// just create one with constant name. That fails if you try to launch more than
|
||||
// one WAL redo manager concurrently.
|
||||
let datadir = conf.tenant_path(tenantid).join("wal-redo-datadir");
|
||||
let datadir = conf.tenant_path(tenantid).join(format!("wal-redo-datadir-{}", process_no));
|
||||
|
||||
// Create empty data directory for wal-redo postgres, deleting old one first.
|
||||
if datadir.exists() {
|
||||
@@ -501,7 +499,6 @@ impl PostgresRedoProcess {
|
||||
.env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
.output()
|
||||
.await
|
||||
.expect("failed to execute initdb");
|
||||
|
||||
if !initdb.status.success() {
|
||||
@@ -538,102 +535,106 @@ impl PostgresRedoProcess {
|
||||
datadir.display()
|
||||
);
|
||||
|
||||
let stdin = child.stdin.take().expect("failed to open child's stdin");
|
||||
let stderr = child.stderr.take().expect("failed to open child's stderr");
|
||||
let stdout = child.stdout.take().expect("failed to open child's stdout");
|
||||
|
||||
// This async block reads the child's stderr, and forwards it to the logger
|
||||
let f_stderr = async {
|
||||
let mut stderr_buffered = tokio::io::BufReader::new(stderr);
|
||||
|
||||
let mut line = String::new();
|
||||
loop {
|
||||
let res = stderr_buffered.read_line(&mut line).await;
|
||||
if res.is_err() {
|
||||
debug!("could not convert line to utf-8");
|
||||
continue;
|
||||
}
|
||||
if res.unwrap() == 0 {
|
||||
break;
|
||||
}
|
||||
error!("wal-redo-postgres: {}", line.trim());
|
||||
line.clear();
|
||||
}
|
||||
Ok::<(), Error>(())
|
||||
};
|
||||
tokio::spawn(f_stderr);
|
||||
|
||||
Ok(PostgresRedoProcess { stdin, stdout })
|
||||
let stdin = child.stdin.take().unwrap();
|
||||
let stdout = child.stdout.take().unwrap();
|
||||
let stderr = child.stderr.take().unwrap();
|
||||
Ok(PostgresRedoProcess { stdin, stdout, stderr })
|
||||
}
|
||||
|
||||
//
|
||||
// Apply given WAL records ('records') over an old page image. Returns
|
||||
// new page image.
|
||||
//
|
||||
async fn apply_wal_records(
|
||||
fn apply_wal_records(
|
||||
&mut self,
|
||||
tag: BufferTag,
|
||||
base_img: Option<Bytes>,
|
||||
records: &[(Lsn, WALRecord)],
|
||||
) -> Result<Bytes, std::io::Error> {
|
||||
let stdout = &mut self.stdout;
|
||||
// Buffer the writes to avoid a lot of small syscalls.
|
||||
let mut stdin = tokio::io::BufWriter::new(&mut self.stdin);
|
||||
|
||||
// We do three things simultaneously: send the old base image and WAL records to
|
||||
// the child process's stdin, read the result from child's stdout, and forward any logging
|
||||
// information that the child writes to its stderr to the page server's log.
|
||||
//
|
||||
// 'f_stdin' handles writing the base image and WAL records to the child process.
|
||||
// 'f_stdout' below reads the result back. And 'f_stderr', which was spawned into the
|
||||
// tokio runtime in the 'launch' function already, forwards the logging.
|
||||
let f_stdin = async {
|
||||
// Send base image, if any. (If the record initializes the page, previous page
|
||||
// version is not needed.)
|
||||
timeout(
|
||||
TIMEOUT,
|
||||
stdin.write_all(&build_begin_redo_for_block_msg(tag)),
|
||||
)
|
||||
.await??;
|
||||
if let Some(img) = base_img {
|
||||
timeout(TIMEOUT, stdin.write_all(&build_push_page_msg(tag, &img))).await??;
|
||||
// Send base image, if any. (If the record initializes the page, previous page
|
||||
// version is not needed.)
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
build_begin_redo_for_block_msg(tag, &mut buf);
|
||||
if let Some(img) = base_img {
|
||||
build_push_page_msg(tag, &img, &mut buf);
|
||||
}
|
||||
|
||||
// Send WAL records.
|
||||
for (lsn, rec) in records.iter() {
|
||||
WAL_REDO_RECORD_COUNTER.inc();
|
||||
|
||||
build_apply_record_msg(*lsn, &rec.rec, &mut buf);
|
||||
|
||||
//debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
|
||||
// r.lsn >> 32, r.lsn & 0xffff_ffff);
|
||||
}
|
||||
//debug!("sent {} WAL records to wal redo postgres process ({:X}/{:X}",
|
||||
// records.len(), lsn >> 32, lsn & 0xffff_ffff);
|
||||
|
||||
// Send GetPage command to get the result back
|
||||
build_get_page_msg(tag, &mut buf);
|
||||
|
||||
|
||||
// The input is now in 'buf'.
|
||||
|
||||
let mut nwrite = 0;
|
||||
|
||||
let mut resultbuf = Vec::new();
|
||||
resultbuf.resize(8192, 0);
|
||||
|
||||
let mut nresult = 0;
|
||||
|
||||
let mut pollfds = [
|
||||
PollFd::new(self.stdout.as_raw_fd(), PollFlags::POLLIN),
|
||||
PollFd::new(self.stderr.as_raw_fd(), PollFlags::POLLIN),
|
||||
PollFd::new(self.stdin.as_raw_fd(), PollFlags::POLLOUT),
|
||||
];
|
||||
|
||||
// Do a blind write first
|
||||
let n = self.stdin.write(&buf[nwrite..])?;
|
||||
nwrite += n;
|
||||
|
||||
while nresult < 8192 {
|
||||
|
||||
let nfds = if nwrite < buf.len() {
|
||||
3
|
||||
} else {
|
||||
2
|
||||
};
|
||||
nix::poll::poll(&mut pollfds[0..nfds], TIMEOUT.as_millis() as i32)?;
|
||||
|
||||
// We do three things simultaneously: send the old base image and WAL records to
|
||||
// the child process's stdin, read the result from child's stdout, and forward any logging
|
||||
// information that the child writes to its stderr to the page server's log.
|
||||
//
|
||||
// 'f_stdin' handles writing the base image and WAL records to the child process.
|
||||
// 'f_stdout' below reads the result back. And 'f_stderr', which was spawned into the
|
||||
// tokio runtime in the 'launch' function already, forwards the logging.
|
||||
if nwrite < buf.len() && !pollfds[2].revents().unwrap().is_empty() {
|
||||
// stdin
|
||||
let n = self.stdin.write(&buf[nwrite..])?;
|
||||
nwrite += n;
|
||||
}
|
||||
if !pollfds[0].revents().unwrap().is_empty() {
|
||||
// stdout
|
||||
// Read back new page image
|
||||
let n = self.stdout.read(&mut resultbuf[nresult..])?;
|
||||
|
||||
// Send WAL records.
|
||||
for (lsn, rec) in records.iter() {
|
||||
WAL_REDO_RECORD_COUNTER.inc();
|
||||
|
||||
stdin
|
||||
.write_all(&build_apply_record_msg(*lsn, &rec.rec))
|
||||
.await?;
|
||||
|
||||
//debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
|
||||
// r.lsn >> 32, r.lsn & 0xffff_ffff);
|
||||
nresult += n;
|
||||
}
|
||||
//debug!("sent {} WAL records to wal redo postgres process ({:X}/{:X}",
|
||||
// records.len(), lsn >> 32, lsn & 0xffff_ffff);
|
||||
if !pollfds[1].revents().unwrap().is_empty() {
|
||||
// stderr
|
||||
let mut readbuf: [u8; 16384] = [0; 16384];
|
||||
|
||||
// Send GetPage command to get the result back
|
||||
timeout(TIMEOUT, stdin.write_all(&build_get_page_msg(tag))).await??;
|
||||
timeout(TIMEOUT, stdin.flush()).await??;
|
||||
let n = self.stderr.read(&mut readbuf)?;
|
||||
|
||||
error!("wal-redo-postgres: {}", String::from_utf8_lossy(&readbuf[0..n]));
|
||||
}
|
||||
//debug!("sent GetPage for {}", tag.blknum);
|
||||
Ok::<(), Error>(())
|
||||
};
|
||||
}
|
||||
|
||||
// Read back new page image
|
||||
let f_stdout = async {
|
||||
let mut buf = [0u8; 8192];
|
||||
|
||||
timeout(TIMEOUT, stdout.read_exact(&mut buf)).await??;
|
||||
//debug!("got response for {}", tag.blknum);
|
||||
Ok::<[u8; 8192], Error>(buf)
|
||||
};
|
||||
|
||||
let res = tokio::try_join!(f_stdout, f_stdin)?;
|
||||
|
||||
let buf = res.0;
|
||||
|
||||
Ok::<Bytes, Error>(Bytes::from(std::vec::Vec::from(buf)))
|
||||
Ok(Bytes::from(Vec::from(resultbuf)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -641,62 +642,50 @@ impl PostgresRedoProcess {
|
||||
// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
|
||||
// explanation of the protocol.
|
||||
|
||||
fn build_begin_redo_for_block_msg(tag: BufferTag) -> Vec<u8> {
|
||||
fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec<u8>) {
|
||||
let len = 4 + 1 + 4 * 4;
|
||||
let mut buf = Vec::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'B');
|
||||
buf.put_u32(len as u32);
|
||||
|
||||
tag.ser_into(&mut buf)
|
||||
tag.ser_into(buf)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf
|
||||
//debug_assert!(buf.len() == 1 + len);
|
||||
}
|
||||
|
||||
fn build_push_page_msg(tag: BufferTag, base_img: &[u8]) -> Vec<u8> {
|
||||
fn build_push_page_msg(tag: BufferTag, base_img: &[u8], buf: &mut Vec<u8>) {
|
||||
assert!(base_img.len() == 8192);
|
||||
|
||||
let len = 4 + 1 + 4 * 4 + base_img.len();
|
||||
let mut buf = Vec::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'P');
|
||||
buf.put_u32(len as u32);
|
||||
tag.ser_into(&mut buf)
|
||||
tag.ser_into(buf)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
buf.put(base_img);
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf
|
||||
//debug_assert!(buf.len() - oldlen == 1 + len);
|
||||
}
|
||||
|
||||
fn build_apply_record_msg(endlsn: Lsn, rec: &[u8]) -> Vec<u8> {
|
||||
fn build_apply_record_msg(endlsn: Lsn, rec: &[u8], buf: &mut Vec<u8>) {
|
||||
let len = 4 + 8 + rec.len();
|
||||
let mut buf: Vec<u8> = Vec::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'A');
|
||||
buf.put_u32(len as u32);
|
||||
buf.put_u64(endlsn.0);
|
||||
buf.put(rec);
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf
|
||||
//debug_assert!(buf.len() - oldlen == 1 + len);
|
||||
}
|
||||
|
||||
fn build_get_page_msg(tag: BufferTag) -> Vec<u8> {
|
||||
fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
|
||||
let len = 4 + 1 + 4 * 4;
|
||||
let mut buf = Vec::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'G');
|
||||
buf.put_u32(len as u32);
|
||||
tag.ser_into(&mut buf)
|
||||
tag.ser_into(buf)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf
|
||||
//debug_assert!(buf.len() == 1 + len);
|
||||
}
|
||||
|
||||
@@ -15,13 +15,6 @@ pub struct DatabaseInfo {
|
||||
pub password: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct ProxyAuthResult {
|
||||
pub ready: bool,
|
||||
pub error: Option<String>,
|
||||
pub conn_info: Option<DatabaseInfo>,
|
||||
}
|
||||
|
||||
impl DatabaseInfo {
|
||||
pub fn socket_addr(&self) -> Result<SocketAddr> {
|
||||
let host_port = format!("{}:{}", self.host, self.port);
|
||||
@@ -62,25 +55,22 @@ impl CPlaneApi {
|
||||
database: &str,
|
||||
md5_response: &[u8],
|
||||
salt: &[u8; 4],
|
||||
psql_session_id: &str,
|
||||
) -> Result<ProxyAuthResult> {
|
||||
) -> Result<DatabaseInfo> {
|
||||
let mut url = reqwest::Url::parse(self.auth_endpoint)?;
|
||||
url.query_pairs_mut()
|
||||
.append_pair("login", user)
|
||||
.append_pair("database", database)
|
||||
.append_pair("md5response", std::str::from_utf8(md5_response)?)
|
||||
.append_pair("salt", &hex::encode(salt))
|
||||
.append_pair("psql_session_id", psql_session_id);
|
||||
.append_pair("salt", &hex::encode(salt));
|
||||
|
||||
println!("cplane request: {}", url.as_str());
|
||||
|
||||
let resp = reqwest::blocking::get(url)?;
|
||||
|
||||
if resp.status().is_success() {
|
||||
let auth_info: ProxyAuthResult = serde_json::from_str(resp.text()?.as_str())?;
|
||||
println!("got auth info: #{:?}", auth_info);
|
||||
|
||||
Ok(auth_info)
|
||||
let conn_info: DatabaseInfo = serde_json::from_str(resp.text()?.as_str())?;
|
||||
println!("got conn info: #{:?}", conn_info);
|
||||
Ok(conn_info)
|
||||
} else {
|
||||
bail!("Auth failed")
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
///
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
net::SocketAddr,
|
||||
net::{SocketAddr, TcpListener},
|
||||
sync::{mpsc, Arc, Mutex},
|
||||
thread,
|
||||
};
|
||||
@@ -17,7 +17,6 @@ use clap::{App, Arg, ArgMatches};
|
||||
|
||||
use cplane_api::DatabaseInfo;
|
||||
use rustls::{internal::pemfile, NoClientAuth, ProtocolVersion, ServerConfig};
|
||||
use zenith_utils::tcp_listener;
|
||||
|
||||
mod cplane_api;
|
||||
mod mgmt;
|
||||
@@ -141,10 +140,10 @@ fn main() -> anyhow::Result<()> {
|
||||
|
||||
// Check that we can bind to address before further initialization
|
||||
println!("Starting proxy on {}", state.conf.proxy_address);
|
||||
let pageserver_listener = tcp_listener::bind(state.conf.proxy_address)?;
|
||||
let pageserver_listener = TcpListener::bind(state.conf.proxy_address)?;
|
||||
|
||||
println!("Starting mgmt on {}", state.conf.mgmt_address);
|
||||
let mgmt_listener = tcp_listener::bind(state.conf.mgmt_address)?;
|
||||
let mgmt_listener = TcpListener::bind(state.conf.mgmt_address)?;
|
||||
|
||||
let threads = [
|
||||
// Spawn a thread to listen for connections. It will spawn further threads
|
||||
|
||||
@@ -75,12 +75,8 @@ pub fn proxy_conn_main(
|
||||
// This will set conn.existing_user and we can decide on next actions
|
||||
conn.handle_startup()?;
|
||||
|
||||
let mut psql_session_id_buf = [0u8; 8];
|
||||
rand::thread_rng().fill(&mut psql_session_id_buf);
|
||||
conn.psql_session_id = hex::encode(psql_session_id_buf);
|
||||
|
||||
// both scenarious here should end up producing database connection string
|
||||
let conn_info = if conn.is_existing_user() {
|
||||
let db_info = if conn.is_existing_user() {
|
||||
conn.handle_existing_user()?
|
||||
} else {
|
||||
conn.handle_new_user()?
|
||||
@@ -88,7 +84,7 @@ pub fn proxy_conn_main(
|
||||
|
||||
// XXX: move that inside handle_new_user/handle_existing_user to be able to
|
||||
// report wrong connection error.
|
||||
proxy_pass(conn.pgb, conn_info)
|
||||
proxy_pass(conn.pgb, db_info)
|
||||
}
|
||||
|
||||
impl ProxyConnection {
|
||||
@@ -160,21 +156,6 @@ impl ProxyConnection {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Wait for proxy kick form the console with conninfo
|
||||
fn wait_for_conninfo(&mut self) -> anyhow::Result<DatabaseInfo> {
|
||||
let (tx, rx) = channel::<anyhow::Result<DatabaseInfo>>();
|
||||
let _ = self
|
||||
.state
|
||||
.waiters
|
||||
.lock()
|
||||
.unwrap()
|
||||
.insert(self.psql_session_id.clone(), tx);
|
||||
|
||||
// Wait for web console response
|
||||
// TODO: respond with error to client
|
||||
rx.recv()?
|
||||
}
|
||||
|
||||
fn handle_existing_user(&mut self) -> anyhow::Result<DatabaseInfo> {
|
||||
// ask password
|
||||
rand::thread_rng().fill(&mut self.md5_salt);
|
||||
@@ -201,41 +182,14 @@ impl ProxyConnection {
|
||||
self.database.as_str(),
|
||||
md5_response,
|
||||
&self.md5_salt,
|
||||
&self.psql_session_id,
|
||||
) {
|
||||
Err(e) => {
|
||||
self.pgb.write_message(&BeMessage::ErrorResponse(format!(
|
||||
"cannot authenticate proxy: {}",
|
||||
e
|
||||
)))?;
|
||||
self.pgb
|
||||
.write_message(&BeMessage::ErrorResponse(format!("{}", e)))?;
|
||||
|
||||
bail!("auth failed: {}", e);
|
||||
}
|
||||
|
||||
Ok(auth_info) => {
|
||||
let conn_info = if auth_info.ready {
|
||||
// Cluster is ready, so just take `conn_info` and respond to the client.
|
||||
auth_info
|
||||
.conn_info
|
||||
.expect("conn_info should be provided with ready cluster")
|
||||
} else {
|
||||
match auth_info.error {
|
||||
Some(e) => {
|
||||
self.pgb.write_message(&BeMessage::ErrorResponse(format!(
|
||||
"cannot authenticate proxy: {}",
|
||||
e
|
||||
)))?;
|
||||
|
||||
bail!("auth failed: {}", e);
|
||||
}
|
||||
None => {
|
||||
// Cluster exists, but isn't active, await its start and proxy kick
|
||||
// with `conn_info`.
|
||||
self.wait_for_conninfo()?
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Ok(conn_info) => {
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::AuthenticationOk)?;
|
||||
self.pgb
|
||||
@@ -251,6 +205,10 @@ impl ProxyConnection {
|
||||
}
|
||||
|
||||
fn handle_new_user(&mut self) -> anyhow::Result<DatabaseInfo> {
|
||||
let mut psql_session_id_buf = [0u8; 8];
|
||||
rand::thread_rng().fill(&mut psql_session_id_buf);
|
||||
self.psql_session_id = hex::encode(psql_session_id_buf);
|
||||
|
||||
let hello_message = format!("☀️ Welcome to Zenith!
|
||||
|
||||
To proceed with database creation, open the following link:
|
||||
@@ -269,15 +227,25 @@ databases without opening the browser.
|
||||
self.pgb
|
||||
.write_message(&BeMessage::NoticeResponse(hello_message))?;
|
||||
|
||||
// We requested the DB creation from the console. Now wait for conninfo
|
||||
let conn_info = self.wait_for_conninfo()?;
|
||||
// await for database creation
|
||||
let (tx, rx) = channel::<anyhow::Result<DatabaseInfo>>();
|
||||
let _ = self
|
||||
.state
|
||||
.waiters
|
||||
.lock()
|
||||
.unwrap()
|
||||
.insert(self.psql_session_id.clone(), tx);
|
||||
|
||||
// Wait for web console response
|
||||
// XXX: respond with error to client
|
||||
let dbinfo = rx.recv()??;
|
||||
|
||||
self.pgb.write_message_noflush(&BeMessage::NoticeResponse(
|
||||
"Connecting to database.".to_string(),
|
||||
))?;
|
||||
self.pgb.write_message(&BeMessage::ReadyForQuery)?;
|
||||
|
||||
Ok(conn_info)
|
||||
Ok(dbinfo)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -610,9 +610,7 @@ class ZenithPageserverHttpClient(requests.Session):
|
||||
return res_json
|
||||
|
||||
def branch_detail(self, tenant_id: uuid.UUID, name: str) -> Dict[Any, Any]:
|
||||
res = self.get(
|
||||
f"http://localhost:{self.port}/v1/branch/{tenant_id.hex}/{name}?include-non-incremental-logical-size=1",
|
||||
)
|
||||
res = self.get(f"http://localhost:{self.port}/v1/branch/{tenant_id.hex}/{name}", )
|
||||
res.raise_for_status()
|
||||
res_json = res.json()
|
||||
assert isinstance(res_json, dict)
|
||||
|
||||
@@ -7,10 +7,11 @@ use const_format::formatcp;
|
||||
use daemonize::Daemonize;
|
||||
use log::*;
|
||||
use std::env;
|
||||
use std::net::TcpListener;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::thread;
|
||||
use zenith_utils::http::endpoint;
|
||||
use zenith_utils::{logging, tcp_listener};
|
||||
use zenith_utils::logging;
|
||||
|
||||
use walkeeper::defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR};
|
||||
use walkeeper::http;
|
||||
@@ -131,13 +132,13 @@ fn main() -> Result<()> {
|
||||
fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
let log_file = logging::init("safekeeper.log", conf.daemonize)?;
|
||||
|
||||
let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| {
|
||||
let http_listener = TcpListener::bind(conf.listen_http_addr.clone()).map_err(|e| {
|
||||
error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
|
||||
e
|
||||
})?;
|
||||
|
||||
info!("Starting safekeeper on {}", conf.listen_pg_addr);
|
||||
let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
|
||||
let pg_listener = TcpListener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
|
||||
error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
|
||||
e
|
||||
})?;
|
||||
|
||||
@@ -133,12 +133,9 @@ impl<'pg> ReceiveWalConn<'pg> {
|
||||
// Add far as replication in postgres is initiated by receiver, we should use callme mechanism
|
||||
let conf = swh.conf.clone();
|
||||
let timelineid = swh.timeline.get().timelineid;
|
||||
let _ = thread::Builder::new()
|
||||
.name("request_callback thread".into())
|
||||
.spawn(move || {
|
||||
request_callback(conf, timelineid, tenant_id);
|
||||
})
|
||||
.unwrap();
|
||||
thread::spawn(move || {
|
||||
request_callback(conf, timelineid, tenant_id);
|
||||
});
|
||||
}
|
||||
|
||||
loop {
|
||||
|
||||
@@ -173,14 +173,11 @@ impl ReplicationConn {
|
||||
let bg_timeline = Arc::clone(swh.timeline.get());
|
||||
let bg_stream_in = self.stream_in.take().unwrap();
|
||||
|
||||
let _ = thread::Builder::new()
|
||||
.name("HotStandbyFeedback thread".into())
|
||||
.spawn(move || {
|
||||
if let Err(err) = Self::background_thread(bg_stream_in, bg_timeline) {
|
||||
error!("Replication background thread failed: {}", err);
|
||||
}
|
||||
})
|
||||
.unwrap();
|
||||
thread::spawn(move || {
|
||||
if let Err(err) = Self::background_thread(bg_stream_in, bg_timeline) {
|
||||
error!("Replication background thread failed: {}", err);
|
||||
}
|
||||
});
|
||||
|
||||
let (mut start_pos, mut stop_pos) = Self::parse_start_stop(cmd)?;
|
||||
|
||||
|
||||
@@ -19,10 +19,7 @@ use lazy_static::lazy_static;
|
||||
|
||||
use crate::replication::HotStandbyFeedback;
|
||||
use postgres_ffi::xlog_utils::MAX_SEND_SIZE;
|
||||
use zenith_metrics::{
|
||||
register_gauge_vec, register_histogram_vec, Gauge, GaugeVec, Histogram, HistogramVec,
|
||||
DISK_WRITE_SECONDS_BUCKETS,
|
||||
};
|
||||
use zenith_metrics::{register_gauge_vec, Gauge, GaugeVec};
|
||||
use zenith_utils::bin_ser::LeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::pq_proto::SystemId;
|
||||
@@ -302,27 +299,11 @@ lazy_static! {
|
||||
&["ztli"]
|
||||
)
|
||||
.expect("Failed to register safekeeper_commit_lsn gauge vec");
|
||||
static ref WRITE_WAL_BYTES: HistogramVec = register_histogram_vec!(
|
||||
"safekeeper_write_wal_bytes",
|
||||
"Bytes written to WAL in a single request, grouped by timeline",
|
||||
&["timeline_id"],
|
||||
vec![1.0, 10.0, 100.0, 1024.0, 8192.0, 128.0 * 1024.0, 1024.0 * 1024.0, 10.0 * 1024.0 * 1024.0]
|
||||
)
|
||||
.expect("Failed to register safekeeper_write_wal_bytes histogram vec");
|
||||
static ref WRITE_WAL_SECONDS: HistogramVec = register_histogram_vec!(
|
||||
"safekeeper_write_wal_seconds",
|
||||
"Seconds spent writing and syncing WAL to a disk in a single request, grouped by timeline",
|
||||
&["timeline_id"],
|
||||
DISK_WRITE_SECONDS_BUCKETS.to_vec()
|
||||
)
|
||||
.expect("Failed to register safekeeper_write_wal_seconds histogram vec");
|
||||
}
|
||||
|
||||
struct SafeKeeperMetrics {
|
||||
flush_lsn: Gauge,
|
||||
commit_lsn: Gauge,
|
||||
write_wal_bytes: Histogram,
|
||||
write_wal_seconds: Histogram,
|
||||
}
|
||||
|
||||
impl SafeKeeperMetrics {
|
||||
@@ -331,8 +312,6 @@ impl SafeKeeperMetrics {
|
||||
SafeKeeperMetrics {
|
||||
flush_lsn: FLUSH_LSN_GAUGE.with_label_values(&[&ztli_str]),
|
||||
commit_lsn: COMMIT_LSN_GAUGE.with_label_values(&[&ztli_str]),
|
||||
write_wal_bytes: WRITE_WAL_BYTES.with_label_values(&[&ztli_str]),
|
||||
write_wal_seconds: WRITE_WAL_SECONDS.with_label_values(&[&ztli_str]),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -340,8 +319,6 @@ impl SafeKeeperMetrics {
|
||||
SafeKeeperMetrics {
|
||||
flush_lsn: FLUSH_LSN_GAUGE.with_label_values(&["n/a"]),
|
||||
commit_lsn: COMMIT_LSN_GAUGE.with_label_values(&["n/a"]),
|
||||
write_wal_bytes: WRITE_WAL_BYTES.with_label_values(&["n/a"]),
|
||||
write_wal_seconds: WRITE_WAL_SECONDS.with_label_values(&["n/a"]),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -495,14 +472,8 @@ where
|
||||
// do the job
|
||||
let mut last_rec_lsn = Lsn(0);
|
||||
if !msg.wal_data.is_empty() {
|
||||
self.metrics
|
||||
.write_wal_bytes
|
||||
.observe(msg.wal_data.len() as f64);
|
||||
{
|
||||
let _timer = self.metrics.write_wal_seconds.start_timer();
|
||||
self.storage
|
||||
.write_wal(&self.s.server, msg.h.begin_lsn, &msg.wal_data)?;
|
||||
}
|
||||
self.storage
|
||||
.write_wal(&self.s.server, msg.h.begin_lsn, &msg.wal_data)?;
|
||||
|
||||
// figure out last record's end lsn for reporting (if we got the
|
||||
// whole record)
|
||||
|
||||
@@ -12,7 +12,6 @@ use std::fs::{self, File, OpenOptions};
|
||||
use std::io::{Seek, SeekFrom, Write};
|
||||
use std::sync::{Arc, Condvar, Mutex};
|
||||
use std::time::Duration;
|
||||
use zenith_metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS};
|
||||
use zenith_utils::bin_ser::LeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
@@ -74,23 +73,6 @@ pub enum CreateControlFile {
|
||||
False,
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref PERSIST_SYNC_CONTROL_FILE_SECONDS: HistogramVec = register_histogram_vec!(
|
||||
"safekeeper_persist_sync_control_file_seconds",
|
||||
"Seconds to persist and sync control file, grouped by timeline",
|
||||
&["timeline_id"],
|
||||
DISK_WRITE_SECONDS_BUCKETS.to_vec()
|
||||
)
|
||||
.expect("Failed to register safekeeper_persist_sync_control_file_seconds histogram vec");
|
||||
static ref PERSIST_NOSYNC_CONTROL_FILE_SECONDS: HistogramVec = register_histogram_vec!(
|
||||
"safekeeper_persist_nosync_control_file_seconds",
|
||||
"Seconds to persist and sync control file, grouped by timeline",
|
||||
&["timeline_id"],
|
||||
DISK_WRITE_SECONDS_BUCKETS.to_vec()
|
||||
)
|
||||
.expect("Failed to register safekeeper_persist_nosync_control_file_seconds histogram vec");
|
||||
}
|
||||
|
||||
impl SharedState {
|
||||
/// Get combined stateof all alive replicas
|
||||
pub fn get_replicas_state(&self) -> ReplicaState {
|
||||
@@ -127,14 +109,9 @@ impl SharedState {
|
||||
create: CreateControlFile,
|
||||
) -> Result<Self> {
|
||||
let (cf, state) = SharedState::load_control_file(conf, timelineid, create)?;
|
||||
let timelineid_str = format!("{}", timelineid);
|
||||
let storage = FileStorage {
|
||||
control_file: cf,
|
||||
conf: conf.clone(),
|
||||
persist_sync_control_file_seconds: PERSIST_SYNC_CONTROL_FILE_SECONDS
|
||||
.with_label_values(&[&timelineid_str]),
|
||||
persist_nosync_control_file_seconds: PERSIST_NOSYNC_CONTROL_FILE_SECONDS
|
||||
.with_label_values(&[&timelineid_str]),
|
||||
};
|
||||
let (flush_lsn, tli) = if state.server.wal_seg_size != 0 {
|
||||
let wal_dir = conf.workdir.join(format!("{}", timelineid));
|
||||
@@ -399,18 +376,10 @@ impl GlobalTimelines {
|
||||
struct FileStorage {
|
||||
control_file: File,
|
||||
conf: SafeKeeperConf,
|
||||
persist_sync_control_file_seconds: Histogram,
|
||||
persist_nosync_control_file_seconds: Histogram,
|
||||
}
|
||||
|
||||
impl Storage for FileStorage {
|
||||
fn persist(&mut self, s: &SafeKeeperState, sync: bool) -> Result<()> {
|
||||
let _timer = if sync {
|
||||
&self.persist_sync_control_file_seconds
|
||||
} else {
|
||||
&self.persist_nosync_control_file_seconds
|
||||
}
|
||||
.start_timer();
|
||||
self.control_file.seek(SeekFrom::Start(0))?;
|
||||
s.ser_into(&mut self.control_file)?;
|
||||
if sync {
|
||||
|
||||
@@ -18,15 +18,11 @@ pub fn thread_main(conf: SafeKeeperConf, listener: TcpListener) -> Result<()> {
|
||||
Ok((socket, peer_addr)) => {
|
||||
debug!("accepted connection from {}", peer_addr);
|
||||
let conf = conf.clone();
|
||||
|
||||
let _ = thread::Builder::new()
|
||||
.name("WAL service thread".into())
|
||||
.spawn(move || {
|
||||
if let Err(err) = handle_socket(socket, conf) {
|
||||
error!("connection handler exited: {}", err);
|
||||
}
|
||||
})
|
||||
.unwrap();
|
||||
thread::spawn(move || {
|
||||
if let Err(err) = handle_socket(socket, conf) {
|
||||
error!("connection handler exited: {}", err);
|
||||
}
|
||||
});
|
||||
}
|
||||
Err(e) => error!("Failed to accept connection: {}", e),
|
||||
}
|
||||
|
||||
@@ -426,10 +426,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result
|
||||
pageserver.tenant_create(tenantid)?;
|
||||
println!("tenant successfully created on the pageserver");
|
||||
}
|
||||
|
||||
(sub_name, _) => {
|
||||
bail!("Unexpected tenant subcommand '{}'", sub_name)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -458,16 +455,13 @@ fn handle_branch(branch_match: &ArgMatches, env: &local_env::LocalEnv) -> Result
|
||||
}
|
||||
|
||||
fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
let (sub_name, sub_args) = pg_match.subcommand();
|
||||
let sub_args = sub_args.expect("no pg subcommand");
|
||||
|
||||
let mut cplane = ComputeControlPlane::load(env.clone())?;
|
||||
|
||||
// All subcommands take an optional --tenantid option
|
||||
let tenantid = get_tenantid(sub_args, env)?;
|
||||
let tenantid = get_tenantid(pg_match, env)?;
|
||||
|
||||
match sub_name {
|
||||
"list" => {
|
||||
match pg_match.subcommand() {
|
||||
("list", Some(_list_match)) => {
|
||||
let branch_infos = get_branch_infos(env, &tenantid).unwrap_or_else(|e| {
|
||||
eprintln!("Failed to load branch info: {}", e);
|
||||
HashMap::new()
|
||||
@@ -497,21 +491,21 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
);
|
||||
}
|
||||
}
|
||||
"create" => {
|
||||
let node_name = sub_args.value_of("node").unwrap_or("main");
|
||||
let timeline_name = sub_args.value_of("timeline").unwrap_or(node_name);
|
||||
("create", Some(create_match)) => {
|
||||
let node_name = create_match.value_of("node").unwrap_or("main");
|
||||
let timeline_name = create_match.value_of("timeline").unwrap_or(node_name);
|
||||
|
||||
let port: Option<u16> = match sub_args.value_of("port") {
|
||||
let port: Option<u16> = match create_match.value_of("port") {
|
||||
Some(p) => Some(p.parse()?),
|
||||
None => None,
|
||||
};
|
||||
cplane.new_node(tenantid, node_name, timeline_name, port)?;
|
||||
}
|
||||
"start" => {
|
||||
let node_name = sub_args.value_of("node").unwrap_or("main");
|
||||
let timeline_name = sub_args.value_of("timeline");
|
||||
("start", Some(start_match)) => {
|
||||
let node_name = start_match.value_of("node").unwrap_or("main");
|
||||
let timeline_name = start_match.value_of("timeline");
|
||||
|
||||
let port: Option<u16> = match sub_args.value_of("port") {
|
||||
let port: Option<u16> = match start_match.value_of("port") {
|
||||
Some(p) => Some(p.parse()?),
|
||||
None => None,
|
||||
};
|
||||
@@ -547,9 +541,9 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
node.start(&auth_token)?;
|
||||
}
|
||||
}
|
||||
"stop" => {
|
||||
let node_name = sub_args.value_of("node").unwrap_or("main");
|
||||
let destroy = sub_args.is_present("destroy");
|
||||
("stop", Some(stop_match)) => {
|
||||
let node_name = stop_match.value_of("node").unwrap_or("main");
|
||||
let destroy = stop_match.is_present("destroy");
|
||||
|
||||
let node = cplane
|
||||
.nodes
|
||||
@@ -558,9 +552,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
node.stop(destroy)?;
|
||||
}
|
||||
|
||||
_ => {
|
||||
bail!("Unexpected pg subcommand '{}'", sub_name)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -599,9 +591,7 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
|
||||
}
|
||||
}
|
||||
|
||||
(sub_name, _) => {
|
||||
bail!("Unexpected pageserver subcommand '{}'", sub_name)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -74,10 +74,6 @@ lazy_static! {
|
||||
.expect("Failed to register maxrss_kb int gauge");
|
||||
}
|
||||
|
||||
pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
|
||||
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
|
||||
];
|
||||
|
||||
// Records I/O stats in a "cross-platform" way.
|
||||
// Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats.
|
||||
// An alternative is to read procfs (`/proc/[pid]/io`) which does not work under macOS at all, hence abandoned.
|
||||
|
||||
@@ -19,7 +19,6 @@ thiserror = "1.0"
|
||||
tokio = "1.11"
|
||||
tracing = "0.1"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
nix = "0.23.0"
|
||||
|
||||
zenith_metrics = { path = "../zenith_metrics" }
|
||||
workspace_hack = { path = "../workspace_hack" }
|
||||
|
||||
@@ -1,43 +0,0 @@
|
||||
use std::{fs::File, io};
|
||||
|
||||
const MAX_PENDING_FILES: usize = 100;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct BatchFsync {
|
||||
pending: Vec<File>,
|
||||
done: bool,
|
||||
}
|
||||
|
||||
impl BatchFsync {
|
||||
pub fn add(&mut self, file: File) -> io::Result<()> {
|
||||
if self.pending.len() == MAX_PENDING_FILES {
|
||||
self.sync_batch()?;
|
||||
}
|
||||
|
||||
self.pending.push(file);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Must be called before drop.
|
||||
pub fn done(mut self) -> io::Result<()> {
|
||||
self.done = true;
|
||||
self.sync_batch()
|
||||
}
|
||||
|
||||
fn sync_batch(&mut self) -> io::Result<()> {
|
||||
// TODO parallelize
|
||||
for pending_file in self.pending.drain(..) {
|
||||
pending_file.sync_all()?;
|
||||
}
|
||||
self.pending.clear();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for BatchFsync {
|
||||
fn drop(&mut self) {
|
||||
assert!(self.done);
|
||||
}
|
||||
}
|
||||
@@ -40,9 +40,3 @@ pub mod logging;
|
||||
|
||||
// Misc
|
||||
pub mod accum;
|
||||
|
||||
// Utility for binding TcpListeners with proper socket options.
|
||||
pub mod tcp_listener;
|
||||
|
||||
// Call fsync for many files at once.
|
||||
pub mod batch_fsync;
|
||||
|
||||
@@ -1,16 +0,0 @@
|
||||
use std::{
|
||||
io,
|
||||
net::{TcpListener, ToSocketAddrs},
|
||||
os::unix::prelude::AsRawFd,
|
||||
};
|
||||
|
||||
use nix::sys::socket::{setsockopt, sockopt::ReuseAddr};
|
||||
|
||||
/// Bind a [`TcpListener`] to addr with `SO_REUSEADDR` set to true.
|
||||
pub fn bind<A: ToSocketAddrs>(addr: A) -> io::Result<TcpListener> {
|
||||
let listener = TcpListener::bind(addr)?;
|
||||
|
||||
setsockopt(listener.as_raw_fd(), ReuseAddr, &true)?;
|
||||
|
||||
Ok(listener)
|
||||
}
|
||||
Reference in New Issue
Block a user