mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-31 20:10:38 +00:00
Compare commits
43 Commits
skyzh/reti
...
hackathon/
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ea3a87105c | ||
|
|
33ad0dc177 | ||
|
|
168a6a87d7 | ||
|
|
11e77815af | ||
|
|
39849cd1ff | ||
|
|
87de28bc62 | ||
|
|
6736557ea6 | ||
|
|
d6ae925739 | ||
|
|
133745c005 | ||
|
|
c62f1cc87f | ||
|
|
ae263e5adf | ||
|
|
31ca007fb3 | ||
|
|
7b6a888c24 | ||
|
|
08705d1b8c | ||
|
|
2cc0b392e8 | ||
|
|
60169ad59d | ||
|
|
ee2a6bad93 | ||
|
|
e9525d1f52 | ||
|
|
e757bc9469 | ||
|
|
7089e34070 | ||
|
|
fd12dd942f | ||
|
|
ebddda5b7f | ||
|
|
efe03d5a1c | ||
|
|
850421ec06 | ||
|
|
6dfbf49128 | ||
|
|
708322ce3c | ||
|
|
99fa1c3600 | ||
|
|
0205ce1849 | ||
|
|
1a9b54f1d9 | ||
|
|
3f43823a9b | ||
|
|
a046717a24 | ||
|
|
7a1397cf37 | ||
|
|
75310fe441 | ||
|
|
ecfa3d9de9 | ||
|
|
3d9001d83f | ||
|
|
1a874a3e86 | ||
|
|
c4fe6641c1 | ||
|
|
c7187be8a1 | ||
|
|
83dd7f559c | ||
|
|
80512e2779 | ||
|
|
3916810f20 | ||
|
|
c43e664ff5 | ||
|
|
b37da32c6f |
1
.github/workflows/build_and_test.yml
vendored
1
.github/workflows/build_and_test.yml
vendored
@@ -286,6 +286,7 @@ jobs:
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||
SYNC_AFTER_EACH_TEST: true
|
||||
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
|
||||
13
Cargo.lock
generated
13
Cargo.lock
generated
@@ -2727,6 +2727,12 @@ dependencies = [
|
||||
"hashbrown 0.14.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "indoc"
|
||||
version = "2.0.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5"
|
||||
|
||||
[[package]]
|
||||
name = "infer"
|
||||
version = "0.2.3"
|
||||
@@ -3701,6 +3707,7 @@ dependencies = [
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"hyper 0.14.26",
|
||||
"indoc",
|
||||
"itertools 0.10.5",
|
||||
"md5",
|
||||
"metrics",
|
||||
@@ -3766,6 +3773,7 @@ dependencies = [
|
||||
"bincode",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"camino",
|
||||
"chrono",
|
||||
"const_format",
|
||||
"enum-map",
|
||||
@@ -3773,11 +3781,16 @@ dependencies = [
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"itertools 0.10.5",
|
||||
"nix 0.27.1",
|
||||
"postgres_backend",
|
||||
"postgres_ffi",
|
||||
"rand 0.8.5",
|
||||
"remote_storage",
|
||||
"reqwest 0.12.4",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"storage_broker",
|
||||
"strum",
|
||||
"strum_macros",
|
||||
"thiserror",
|
||||
|
||||
@@ -103,6 +103,7 @@ humantime-serde = "1.1.1"
|
||||
hyper = "0.14"
|
||||
tokio-tungstenite = "0.20.0"
|
||||
indexmap = "2"
|
||||
indoc = "2"
|
||||
inotify = "0.10.2"
|
||||
ipnet = "2.9.0"
|
||||
itertools = "0.10"
|
||||
|
||||
@@ -64,6 +64,12 @@ brew install protobuf openssl flex bison icu4c pkg-config
|
||||
echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc
|
||||
```
|
||||
|
||||
If you get errors about missing `m4` you may have to install it manually:
|
||||
```
|
||||
brew install m4
|
||||
brew link --force m4
|
||||
```
|
||||
|
||||
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
||||
```
|
||||
# recommended approach from https://www.rust-lang.org/tools/install
|
||||
|
||||
@@ -653,6 +653,11 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
anyhow!("Found no timeline id for branch name '{ancestor_branch_name}'")
|
||||
})?;
|
||||
|
||||
let pg_version = branch_match
|
||||
.get_one::<u32>("pg-version")
|
||||
.copied()
|
||||
.context("Failed to parse postgres version from the argument string")?;
|
||||
|
||||
let start_lsn = branch_match
|
||||
.get_one::<String>("ancestor-start-lsn")
|
||||
.map(|lsn_str| Lsn::from_str(lsn_str))
|
||||
@@ -665,7 +670,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
ancestor_timeline_id: Some(ancestor_timeline_id),
|
||||
existing_initdb_timeline_id: None,
|
||||
ancestor_start_lsn: start_lsn,
|
||||
pg_version: None,
|
||||
pg_version: Some(pg_version),
|
||||
};
|
||||
let timeline_info = storage_controller
|
||||
.tenant_timeline_create(tenant_id, create_req)
|
||||
@@ -1583,6 +1588,7 @@ fn cli() -> Command {
|
||||
.subcommand(Command::new("branch")
|
||||
.about("Create a new timeline, using another timeline as a base, copying its data")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(pg_version_arg.clone())
|
||||
.arg(branch_name_arg.clone())
|
||||
.arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name")
|
||||
.help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false))
|
||||
|
||||
@@ -165,6 +165,9 @@ pub struct NeonStorageControllerConf {
|
||||
pub split_threshold: Option<u64>,
|
||||
|
||||
pub max_secondary_lag_bytes: Option<u64>,
|
||||
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub heartbeat_interval: Duration,
|
||||
}
|
||||
|
||||
impl NeonStorageControllerConf {
|
||||
@@ -172,6 +175,9 @@ impl NeonStorageControllerConf {
|
||||
const DEFAULT_MAX_OFFLINE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
|
||||
|
||||
const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30);
|
||||
|
||||
// Very tight heartbeat interval to speed up tests
|
||||
const DEFAULT_HEARTBEAT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(100);
|
||||
}
|
||||
|
||||
impl Default for NeonStorageControllerConf {
|
||||
@@ -183,6 +189,7 @@ impl Default for NeonStorageControllerConf {
|
||||
database_url: None,
|
||||
split_threshold: None,
|
||||
max_secondary_lag_bytes: None,
|
||||
heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -181,6 +181,23 @@ impl PageServerNode {
|
||||
);
|
||||
io::stdout().flush()?;
|
||||
|
||||
// If the config file we got as a CLI argument includes the `availability_zone`
|
||||
// config, then use that to populate the `metadata.json` file for the pageserver.
|
||||
// In production the deployment orchestrator does this for us.
|
||||
let az_id = conf
|
||||
.other
|
||||
.get("availability_zone")
|
||||
.map(|toml| {
|
||||
let az_str = toml.to_string();
|
||||
// Trim the (") chars from the toml representation
|
||||
if az_str.starts_with('"') && az_str.ends_with('"') {
|
||||
az_str[1..az_str.len() - 1].to_string()
|
||||
} else {
|
||||
az_str
|
||||
}
|
||||
})
|
||||
.unwrap_or("local".to_string());
|
||||
|
||||
let config = self
|
||||
.pageserver_init_make_toml(conf)
|
||||
.context("make pageserver toml")?;
|
||||
@@ -216,6 +233,7 @@ impl PageServerNode {
|
||||
let (_http_host, http_port) =
|
||||
parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
|
||||
let http_port = http_port.unwrap_or(9898);
|
||||
|
||||
// Intentionally hand-craft JSON: this acts as an implicit format compat test
|
||||
// in case the pageserver-side structure is edited, and reflects the real life
|
||||
// situation: the metadata is written by some other script.
|
||||
@@ -226,7 +244,10 @@ impl PageServerNode {
|
||||
postgres_port: self.pg_connection_config.port(),
|
||||
http_host: "localhost".to_string(),
|
||||
http_port,
|
||||
other: HashMap::new(),
|
||||
other: HashMap::from([(
|
||||
"availability_zone_id".to_string(),
|
||||
serde_json::json!(az_id),
|
||||
)]),
|
||||
})
|
||||
.unwrap(),
|
||||
)
|
||||
|
||||
@@ -437,6 +437,8 @@ impl StorageController {
|
||||
&humantime::Duration::from(self.config.max_offline).to_string(),
|
||||
"--max-warming-up-interval",
|
||||
&humantime::Duration::from(self.config.max_warming_up).to_string(),
|
||||
"--heartbeat-interval",
|
||||
&humantime::Duration::from(self.config.heartbeat_interval).to_string(),
|
||||
"--address-for-peers",
|
||||
&address_for_peers.to_string(),
|
||||
]
|
||||
|
||||
5
demo0.sh
Executable file
5
demo0.sh
Executable file
@@ -0,0 +1,5 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -o xtrace # Print each command before execution
|
||||
|
||||
PGPASSWORD=password psql -h localhost -U postgres -p 8432 -d dockercplane -c "select name, postgres_version from branches where deleted=false;"
|
||||
38
demo1.sh
Executable file
38
demo1.sh
Executable file
@@ -0,0 +1,38 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -o xtrace # Print each command before execution
|
||||
|
||||
cargo neon stop
|
||||
rm -rf .neon
|
||||
|
||||
sleep 4
|
||||
|
||||
cargo neon init
|
||||
|
||||
sleep 3
|
||||
|
||||
cargo neon start
|
||||
sleep 3
|
||||
|
||||
export TENANT_ID=14719455a7fbf1d257f427377d096cc2
|
||||
cargo neon tenant create --pg-version 15 --tenant-id $TENANT_ID
|
||||
|
||||
sleep 1
|
||||
|
||||
cargo neon endpoint create main --pg-version 15 --tenant-id $TENANT_ID
|
||||
|
||||
sleep 1
|
||||
|
||||
cargo neon endpoint start main
|
||||
cargo neon endpoint list --tenant-id $TENANT_ID
|
||||
|
||||
sleep 3
|
||||
|
||||
./pg_install/v15/bin/pgbench -i -s 10 -p 55432 -h 127.0.0.1 -U cloud_admin postgres
|
||||
|
||||
# This endpoint runs on version 15
|
||||
psql -p 55434 -h 127.0.0.1 -U cloud_admin postgres -c "select version();"
|
||||
psql -p 55432 -h 127.0.0.1 -U cloud_admin postgres -c "select pg_current_wal_lsn()"
|
||||
psql -p 55432 -h 127.0.0.1 -U cloud_admin postgres -c "\d+"
|
||||
|
||||
|
||||
41
demo2.sh
Executable file
41
demo2.sh
Executable file
@@ -0,0 +1,41 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -o xtrace # Print each command before execution
|
||||
|
||||
# stop endpoint. Right now this is important, because pg_upgrade will start it
|
||||
# This is not strictly needed, so with some hacking we can implement upgrade without a pause.
|
||||
|
||||
cargo neon endpoint stop main
|
||||
cargo neon endpoint list --tenant-id $TENANT_ID
|
||||
|
||||
# Let's create branch with new major postgres version
|
||||
# !This is the feature that we developed during the hackathon!
|
||||
# everything else is setup and checks
|
||||
|
||||
cargo neon timeline branch --tenant-id $TENANT_ID --pg-version 16 --branch-name branch_16
|
||||
|
||||
# create and start endpoint on it
|
||||
cargo neon endpoint create ep_16 --pg-version 16 --tenant-id $TENANT_ID --branch-name branch_16
|
||||
|
||||
cargo neon endpoint start ep_16
|
||||
|
||||
# let's ensure that this new endpoint runs on a new version
|
||||
psql -p 55434 -h 127.0.0.1 -U cloud_admin postgres -c "select version();"
|
||||
|
||||
psql -p 55434 -h 127.0.0.1 -U cloud_admin postgres -c "select pg_current_wal_lsn()"
|
||||
|
||||
|
||||
# This will show 0 bytes size for all user relations
|
||||
# This is a known issue.
|
||||
# New timeline doesn't have these extensions, we will read them from parent.
|
||||
# Now relsize cache for them is also empty. After SeqScan this size cache fill be correct.
|
||||
# We need to copy the relsize cache from parent timeline.
|
||||
|
||||
psql -p 55434 -h 127.0.0.1 -U cloud_admin postgres -c "\d+"
|
||||
|
||||
# And as you can see, there is some data in the new endpoint.
|
||||
psql -p 55434 -h 127.0.0.1 -U cloud_admin postgres -c "select count(*) from pgbench_accounts;"
|
||||
psql -p 55434 -h 127.0.0.1 -U cloud_admin postgres -c "select count(*) from pgbench_branches;"
|
||||
psql -p 55434 -h 127.0.0.1 -U cloud_admin postgres -c "select count(*) from pgbench_tellers;"
|
||||
|
||||
psql -p 55434 -h 127.0.0.1 -U cloud_admin postgres -c "\d+"
|
||||
@@ -4,6 +4,10 @@ version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
# See pageserver/Cargo.toml
|
||||
testing = ["dep:nix"]
|
||||
|
||||
[dependencies]
|
||||
serde.workspace = true
|
||||
serde_with.workspace = true
|
||||
@@ -23,6 +27,12 @@ thiserror.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
chrono = { workspace = true, features = ["serde"] }
|
||||
itertools.workspace = true
|
||||
storage_broker.workspace = true
|
||||
camino = {workspace = true, features = ["serde1"]}
|
||||
remote_storage.workspace = true
|
||||
postgres_backend.workspace = true
|
||||
nix = {workspace = true, optional = true}
|
||||
reqwest.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
bincode.workspace = true
|
||||
|
||||
@@ -1,15 +1,28 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use const_format::formatcp;
|
||||
use camino::Utf8PathBuf;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
use const_format::formatcp;
|
||||
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
|
||||
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
|
||||
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
|
||||
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
|
||||
|
||||
use postgres_backend::AuthType;
|
||||
use remote_storage::RemoteStorageConfig;
|
||||
use serde_with::serde_as;
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
num::{NonZeroU64, NonZeroUsize},
|
||||
str::FromStr,
|
||||
time::Duration,
|
||||
};
|
||||
use utils::logging::LogFormat;
|
||||
|
||||
use crate::models::ImageCompressionAlgorithm;
|
||||
use crate::models::LsnLease;
|
||||
|
||||
// Certain metadata (e.g. externally-addressable name, AZ) is delivered
|
||||
// as a separate structure. This information is not neeed by the pageserver
|
||||
// itself, it is only used for registering the pageserver with the control
|
||||
@@ -29,3 +42,511 @@ pub struct NodeMetadata {
|
||||
#[serde(flatten)]
|
||||
pub other: HashMap<String, serde_json::Value>,
|
||||
}
|
||||
|
||||
/// `pageserver.toml`
|
||||
///
|
||||
/// We use serde derive with `#[serde(default)]` to generate a deserializer
|
||||
/// that fills in the default values for each config field.
|
||||
///
|
||||
/// If there cannot be a static default value because we need to make runtime
|
||||
/// checks to determine the default, make it an `Option` (which defaults to None).
|
||||
/// The runtime check should be done in the consuming crate, i.e., `pageserver`.
|
||||
#[serde_as]
|
||||
#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)]
|
||||
#[serde(default, deny_unknown_fields)]
|
||||
pub struct ConfigToml {
|
||||
// types mapped 1:1 into the runtime PageServerConfig type
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_http_addr: String,
|
||||
pub availability_zone: Option<String>,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub wait_lsn_timeout: Duration,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub wal_redo_timeout: Duration,
|
||||
pub superuser: String,
|
||||
pub page_cache_size: usize,
|
||||
pub max_file_descriptors: usize,
|
||||
pub pg_distrib_dir: Option<Utf8PathBuf>,
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
pub http_auth_type: AuthType,
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
pub pg_auth_type: AuthType,
|
||||
pub auth_validation_public_key_path: Option<Utf8PathBuf>,
|
||||
pub remote_storage: Option<RemoteStorageConfig>,
|
||||
pub tenant_config: TenantConfigToml,
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
pub broker_endpoint: storage_broker::Uri,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub broker_keepalive_interval: Duration,
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
pub log_format: LogFormat,
|
||||
pub concurrent_tenant_warmup: NonZeroUsize,
|
||||
pub concurrent_tenant_size_logical_size_queries: NonZeroUsize,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub metric_collection_interval: Duration,
|
||||
pub metric_collection_endpoint: Option<reqwest::Url>,
|
||||
pub metric_collection_bucket: Option<RemoteStorageConfig>,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub synthetic_size_calculation_interval: Duration,
|
||||
pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
|
||||
pub test_remote_failures: u64,
|
||||
pub ondemand_download_behavior_treat_error_as_warn: bool,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub background_task_maximum_delay: Duration,
|
||||
pub control_plane_api: Option<reqwest::Url>,
|
||||
pub control_plane_api_token: Option<String>,
|
||||
pub control_plane_emergency_mode: bool,
|
||||
pub heatmap_upload_concurrency: usize,
|
||||
pub secondary_download_concurrency: usize,
|
||||
pub virtual_file_io_engine: Option<crate::models::virtual_file::IoEngineKind>,
|
||||
pub ingest_batch_size: u64,
|
||||
pub max_vectored_read_bytes: MaxVectoredReadBytes,
|
||||
pub image_compression: ImageCompressionAlgorithm,
|
||||
pub ephemeral_bytes_per_memory_kb: usize,
|
||||
pub l0_flush: Option<crate::models::L0FlushConfig>,
|
||||
pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
|
||||
pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode,
|
||||
pub io_buffer_alignment: usize,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct DiskUsageEvictionTaskConfig {
|
||||
pub max_usage_pct: utils::serde_percent::Percent,
|
||||
pub min_avail_bytes: u64,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub period: Duration,
|
||||
#[cfg(feature = "testing")]
|
||||
pub mock_statvfs: Option<statvfs::mock::Behavior>,
|
||||
/// Select sorting for evicted layers
|
||||
#[serde(default)]
|
||||
pub eviction_order: EvictionOrder,
|
||||
}
|
||||
|
||||
pub mod statvfs {
|
||||
pub mod mock {
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(tag = "type")]
|
||||
pub enum Behavior {
|
||||
Success {
|
||||
blocksize: u64,
|
||||
total_blocks: u64,
|
||||
name_filter: Option<utils::serde_regex::Regex>,
|
||||
},
|
||||
#[cfg(feature = "testing")]
|
||||
Failure { mocked_error: MockedError },
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[allow(clippy::upper_case_acronyms)]
|
||||
pub enum MockedError {
|
||||
EIO,
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
impl From<MockedError> for nix::Error {
|
||||
fn from(e: MockedError) -> Self {
|
||||
match e {
|
||||
MockedError::EIO => nix::Error::EIO,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(tag = "type", content = "args")]
|
||||
pub enum EvictionOrder {
|
||||
RelativeAccessed {
|
||||
highest_layer_count_loses_first: bool,
|
||||
},
|
||||
}
|
||||
|
||||
impl Default for EvictionOrder {
|
||||
fn default() -> Self {
|
||||
Self::RelativeAccessed {
|
||||
highest_layer_count_loses_first: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(
|
||||
Eq,
|
||||
PartialEq,
|
||||
Debug,
|
||||
Copy,
|
||||
Clone,
|
||||
strum_macros::EnumString,
|
||||
strum_macros::Display,
|
||||
serde_with::DeserializeFromStr,
|
||||
serde_with::SerializeDisplay,
|
||||
)]
|
||||
#[strum(serialize_all = "kebab-case")]
|
||||
pub enum GetVectoredImpl {
|
||||
Sequential,
|
||||
Vectored,
|
||||
}
|
||||
|
||||
#[derive(
|
||||
Eq,
|
||||
PartialEq,
|
||||
Debug,
|
||||
Copy,
|
||||
Clone,
|
||||
strum_macros::EnumString,
|
||||
strum_macros::Display,
|
||||
serde_with::DeserializeFromStr,
|
||||
serde_with::SerializeDisplay,
|
||||
)]
|
||||
#[strum(serialize_all = "kebab-case")]
|
||||
pub enum GetImpl {
|
||||
Legacy,
|
||||
Vectored,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct MaxVectoredReadBytes(pub NonZeroUsize);
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
|
||||
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
|
||||
pub enum CompactL0Phase1ValueAccess {
|
||||
/// The old way.
|
||||
PageCachedBlobIo,
|
||||
/// The new way.
|
||||
StreamingKmerge {
|
||||
/// If set, we run both the old way and the new way, validate that
|
||||
/// they are identical (=> [`CompactL0BypassPageCacheValidation`]),
|
||||
/// and if the validation fails,
|
||||
/// - in tests: fail them with a panic or
|
||||
/// - in prod, log a rate-limited warning and use the old way's results.
|
||||
///
|
||||
/// If not set, we only run the new way and trust its results.
|
||||
validate: Option<CompactL0BypassPageCacheValidation>,
|
||||
},
|
||||
}
|
||||
|
||||
/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`].
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
pub enum CompactL0BypassPageCacheValidation {
|
||||
/// Validate that the series of (key, lsn) pairs are the same.
|
||||
KeyLsn,
|
||||
/// Validate that the entire output of old and new way is identical.
|
||||
KeyLsnValue,
|
||||
}
|
||||
|
||||
impl Default for CompactL0Phase1ValueAccess {
|
||||
fn default() -> Self {
|
||||
CompactL0Phase1ValueAccess::StreamingKmerge {
|
||||
// TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident
|
||||
validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A tenant's calcuated configuration, which is the result of merging a
|
||||
/// tenant's TenantConfOpt with the global TenantConf from PageServerConf.
|
||||
///
|
||||
/// For storing and transmitting individual tenant's configuration, see
|
||||
/// TenantConfOpt.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(deny_unknown_fields, default)]
|
||||
pub struct TenantConfigToml {
|
||||
// Flush out an inmemory layer, if it's holding WAL older than this
|
||||
// This puts a backstop on how much WAL needs to be re-digested if the
|
||||
// page server crashes.
|
||||
// This parameter actually determines L0 layer file size.
|
||||
pub checkpoint_distance: u64,
|
||||
// Inmemory layer is also flushed at least once in checkpoint_timeout to
|
||||
// eventually upload WAL after activity is stopped.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub checkpoint_timeout: Duration,
|
||||
// Target file size, when creating image and delta layers.
|
||||
// This parameter determines L1 layer file size.
|
||||
pub compaction_target_size: u64,
|
||||
// How often to check if there's compaction work to be done.
|
||||
// Duration::ZERO means automatic compaction is disabled.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub compaction_period: Duration,
|
||||
// Level0 delta layer threshold for compaction.
|
||||
pub compaction_threshold: usize,
|
||||
pub compaction_algorithm: crate::models::CompactionAlgorithmSettings,
|
||||
// Determines how much history is retained, to allow
|
||||
// branching and read replicas at an older point in time.
|
||||
// The unit is #of bytes of WAL.
|
||||
// Page versions older than this are garbage collected away.
|
||||
pub gc_horizon: u64,
|
||||
// Interval at which garbage collection is triggered.
|
||||
// Duration::ZERO means automatic GC is disabled
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub gc_period: Duration,
|
||||
// Delta layer churn threshold to create L1 image layers.
|
||||
pub image_creation_threshold: usize,
|
||||
// Determines how much history is retained, to allow
|
||||
// branching and read replicas at an older point in time.
|
||||
// The unit is time.
|
||||
// Page versions older than this are garbage collected away.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub pitr_interval: Duration,
|
||||
/// Maximum amount of time to wait while opening a connection to receive wal, before erroring.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub walreceiver_connect_timeout: Duration,
|
||||
/// Considers safekeepers stalled after no WAL updates were received longer than this threshold.
|
||||
/// A stalled safekeeper will be changed to a newer one when it appears.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub lagging_wal_timeout: Duration,
|
||||
/// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold.
|
||||
/// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
|
||||
/// to avoid eager reconnects.
|
||||
pub max_lsn_wal_lag: NonZeroU64,
|
||||
pub eviction_policy: crate::models::EvictionPolicy,
|
||||
pub min_resident_size_override: Option<u64>,
|
||||
// See the corresponding metric's help string.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub evictions_low_residence_duration_metric_threshold: Duration,
|
||||
|
||||
/// If non-zero, the period between uploads of a heatmap from attached tenants. This
|
||||
/// may be disabled if a Tenant will not have secondary locations: only secondary
|
||||
/// locations will use the heatmap uploaded by attached locations.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub heatmap_period: Duration,
|
||||
|
||||
/// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup
|
||||
pub lazy_slru_download: bool,
|
||||
|
||||
pub timeline_get_throttle: crate::models::ThrottleConfig,
|
||||
|
||||
// How much WAL must be ingested before checking again whether a new image layer is required.
|
||||
// Expresed in multiples of checkpoint distance.
|
||||
pub image_layer_creation_check_threshold: u8,
|
||||
|
||||
/// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
|
||||
/// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
|
||||
/// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
|
||||
/// file is written.
|
||||
pub switch_aux_file_policy: crate::models::AuxFilePolicy,
|
||||
|
||||
/// The length for an explicit LSN lease request.
|
||||
/// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub lsn_lease_length: Duration,
|
||||
|
||||
/// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request.
|
||||
/// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub lsn_lease_length_for_ts: Duration,
|
||||
}
|
||||
|
||||
pub mod defaults {
|
||||
use crate::models::ImageCompressionAlgorithm;
|
||||
|
||||
pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
|
||||
|
||||
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
|
||||
pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
|
||||
|
||||
pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
|
||||
|
||||
pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
|
||||
pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
|
||||
|
||||
pub const DEFAULT_LOG_FORMAT: &str = "plain";
|
||||
|
||||
pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8;
|
||||
|
||||
pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = 1;
|
||||
|
||||
pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
|
||||
pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
|
||||
pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
|
||||
pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
|
||||
|
||||
pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
|
||||
pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1;
|
||||
|
||||
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
||||
|
||||
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
|
||||
|
||||
pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
|
||||
ImageCompressionAlgorithm::Zstd { level: Some(1) };
|
||||
|
||||
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
|
||||
|
||||
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
|
||||
|
||||
pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
|
||||
}
|
||||
|
||||
impl Default for ConfigToml {
|
||||
fn default() -> Self {
|
||||
use defaults::*;
|
||||
|
||||
Self {
|
||||
listen_pg_addr: (DEFAULT_PG_LISTEN_ADDR.to_string()),
|
||||
listen_http_addr: (DEFAULT_HTTP_LISTEN_ADDR.to_string()),
|
||||
availability_zone: (None),
|
||||
wait_lsn_timeout: (humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
|
||||
.expect("cannot parse default wait lsn timeout")),
|
||||
wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
|
||||
.expect("cannot parse default wal redo timeout")),
|
||||
superuser: (DEFAULT_SUPERUSER.to_string()),
|
||||
page_cache_size: (DEFAULT_PAGE_CACHE_SIZE),
|
||||
max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS),
|
||||
pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formely, this was std::env::current_dir()
|
||||
http_auth_type: (AuthType::Trust),
|
||||
pg_auth_type: (AuthType::Trust),
|
||||
auth_validation_public_key_path: (None),
|
||||
remote_storage: None,
|
||||
broker_endpoint: (storage_broker::DEFAULT_ENDPOINT
|
||||
.parse()
|
||||
.expect("failed to parse default broker endpoint")),
|
||||
broker_keepalive_interval: (humantime::parse_duration(
|
||||
storage_broker::DEFAULT_KEEPALIVE_INTERVAL,
|
||||
)
|
||||
.expect("cannot parse default keepalive interval")),
|
||||
log_format: (LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
|
||||
|
||||
concurrent_tenant_warmup: (NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
|
||||
.expect("Invalid default constant")),
|
||||
concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(1).unwrap(),
|
||||
metric_collection_interval: (humantime::parse_duration(
|
||||
DEFAULT_METRIC_COLLECTION_INTERVAL,
|
||||
)
|
||||
.expect("cannot parse default metric collection interval")),
|
||||
synthetic_size_calculation_interval: (humantime::parse_duration(
|
||||
DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL,
|
||||
)
|
||||
.expect("cannot parse default synthetic size calculation interval")),
|
||||
metric_collection_endpoint: (DEFAULT_METRIC_COLLECTION_ENDPOINT),
|
||||
|
||||
metric_collection_bucket: (None),
|
||||
|
||||
disk_usage_based_eviction: (None),
|
||||
|
||||
test_remote_failures: (0),
|
||||
|
||||
ondemand_download_behavior_treat_error_as_warn: (false),
|
||||
|
||||
background_task_maximum_delay: (humantime::parse_duration(
|
||||
DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
|
||||
)
|
||||
.unwrap()),
|
||||
|
||||
control_plane_api: (None),
|
||||
control_plane_api_token: (None),
|
||||
control_plane_emergency_mode: (false),
|
||||
|
||||
heatmap_upload_concurrency: (DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
|
||||
secondary_download_concurrency: (DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
|
||||
|
||||
ingest_batch_size: (DEFAULT_INGEST_BATCH_SIZE),
|
||||
|
||||
virtual_file_io_engine: None,
|
||||
|
||||
max_vectored_read_bytes: (MaxVectoredReadBytes(
|
||||
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
|
||||
)),
|
||||
image_compression: (DEFAULT_IMAGE_COMPRESSION),
|
||||
ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||
l0_flush: None,
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(),
|
||||
|
||||
io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
|
||||
tenant_config: TenantConfigToml::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub mod tenant_conf_defaults {
|
||||
|
||||
// FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
|
||||
// would be more appropriate. But a low value forces the code to be exercised more,
|
||||
// which is good for now to trigger bugs.
|
||||
// This parameter actually determines L0 layer file size.
|
||||
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
|
||||
pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m";
|
||||
|
||||
// FIXME the below configs are only used by legacy algorithm. The new algorithm
|
||||
// has different parameters.
|
||||
|
||||
// Target file size, when creating image and delta layers.
|
||||
// This parameter determines L1 layer file size.
|
||||
pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
|
||||
|
||||
pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
|
||||
pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
|
||||
pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm =
|
||||
crate::models::CompactionAlgorithm::Legacy;
|
||||
|
||||
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||
|
||||
// Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger.
|
||||
// If there's a need to decrease this value, first make sure that GC
|
||||
// doesn't hold a layer map write lock for non-trivial operations.
|
||||
// Relevant: https://github.com/neondatabase/neon/issues/3394
|
||||
pub const DEFAULT_GC_PERIOD: &str = "1 hr";
|
||||
pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
|
||||
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
|
||||
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
|
||||
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
|
||||
// The default limit on WAL lag should be set to avoid causing disconnects under high throughput
|
||||
// scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for
|
||||
// throughputs up to 1GiB/s per timeline.
|
||||
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
|
||||
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
|
||||
// By default ingest enough WAL for two new L0 layers before checking if new image
|
||||
// image layers should be created.
|
||||
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
|
||||
|
||||
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
||||
}
|
||||
|
||||
impl Default for TenantConfigToml {
|
||||
fn default() -> Self {
|
||||
use tenant_conf_defaults::*;
|
||||
Self {
|
||||
checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
|
||||
checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT)
|
||||
.expect("cannot parse default checkpoint timeout"),
|
||||
compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE,
|
||||
compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
|
||||
.expect("cannot parse default compaction period"),
|
||||
compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
|
||||
compaction_algorithm: crate::models::CompactionAlgorithmSettings {
|
||||
kind: DEFAULT_COMPACTION_ALGORITHM,
|
||||
},
|
||||
gc_horizon: DEFAULT_GC_HORIZON,
|
||||
gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
|
||||
.expect("cannot parse default gc period"),
|
||||
image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
|
||||
pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
|
||||
.expect("cannot parse default PITR interval"),
|
||||
walreceiver_connect_timeout: humantime::parse_duration(
|
||||
DEFAULT_WALRECEIVER_CONNECT_TIMEOUT,
|
||||
)
|
||||
.expect("cannot parse default walreceiver connect timeout"),
|
||||
lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT)
|
||||
.expect("cannot parse default walreceiver lagging wal timeout"),
|
||||
max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
|
||||
.expect("cannot parse default max walreceiver Lsn wal lag"),
|
||||
eviction_policy: crate::models::EvictionPolicy::NoEviction,
|
||||
min_resident_size_override: None,
|
||||
evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
|
||||
DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
|
||||
)
|
||||
.expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
|
||||
heatmap_period: Duration::ZERO,
|
||||
lazy_slru_download: false,
|
||||
timeline_get_throttle: crate::models::ThrottleConfig::disabled(),
|
||||
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
|
||||
switch_aux_file_policy: crate::models::AuxFilePolicy::default_tenant_config(),
|
||||
lsn_lease_length: LsnLease::DEFAULT_LENGTH,
|
||||
lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ pub use utilization::PageserverUtilization;
|
||||
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
fmt::Display,
|
||||
io::{BufRead, Read},
|
||||
num::{NonZeroU32, NonZeroU64, NonZeroUsize},
|
||||
str::FromStr,
|
||||
@@ -435,7 +436,9 @@ pub enum CompactionAlgorithm {
|
||||
Tiered,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[derive(
|
||||
Debug, Clone, Copy, PartialEq, Eq, serde_with::DeserializeFromStr, serde_with::SerializeDisplay,
|
||||
)]
|
||||
pub enum ImageCompressionAlgorithm {
|
||||
// Disabled for writes, support decompressing during read path
|
||||
Disabled,
|
||||
@@ -470,11 +473,33 @@ impl FromStr for ImageCompressionAlgorithm {
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for ImageCompressionAlgorithm {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
ImageCompressionAlgorithm::Disabled => write!(f, "disabled"),
|
||||
ImageCompressionAlgorithm::Zstd { level } => {
|
||||
if let Some(level) = level {
|
||||
write!(f, "zstd({})", level)
|
||||
} else {
|
||||
write!(f, "zstd")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct CompactionAlgorithmSettings {
|
||||
pub kind: CompactionAlgorithm,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
|
||||
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
|
||||
pub enum L0FlushConfig {
|
||||
#[serde(rename_all = "snake_case")]
|
||||
Direct { max_concurrency: NonZeroUsize },
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct EvictionPolicyLayerAccessThreshold {
|
||||
#[serde(with = "humantime_serde")]
|
||||
@@ -716,9 +741,17 @@ pub struct TimelineInfo {
|
||||
pub pg_version: u32,
|
||||
|
||||
pub state: TimelineState,
|
||||
pub is_archived: bool,
|
||||
|
||||
pub walreceiver_status: String,
|
||||
|
||||
// ALWAYS add new fields at the end of the struct with `Option` to ensure forward/backward compatibility.
|
||||
// Backward compatibility: you will get a JSON not containing the newly-added field.
|
||||
// Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does
|
||||
// not deny unknown fields by default so it's safe to set the field to some value, though it won't be
|
||||
// read.
|
||||
/// The last aux file policy being used on this timeline
|
||||
pub last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
pub is_archived: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@@ -1648,21 +1681,33 @@ mod tests {
|
||||
#[test]
|
||||
fn test_image_compression_algorithm_parsing() {
|
||||
use ImageCompressionAlgorithm::*;
|
||||
assert_eq!(
|
||||
ImageCompressionAlgorithm::from_str("disabled").unwrap(),
|
||||
Disabled
|
||||
);
|
||||
assert_eq!(
|
||||
ImageCompressionAlgorithm::from_str("zstd").unwrap(),
|
||||
Zstd { level: None }
|
||||
);
|
||||
assert_eq!(
|
||||
ImageCompressionAlgorithm::from_str("zstd(18)").unwrap(),
|
||||
Zstd { level: Some(18) }
|
||||
);
|
||||
assert_eq!(
|
||||
ImageCompressionAlgorithm::from_str("zstd(-3)").unwrap(),
|
||||
Zstd { level: Some(-3) }
|
||||
);
|
||||
let cases = [
|
||||
("disabled", Disabled),
|
||||
("zstd", Zstd { level: None }),
|
||||
("zstd(18)", Zstd { level: Some(18) }),
|
||||
("zstd(-3)", Zstd { level: Some(-3) }),
|
||||
];
|
||||
|
||||
for (display, expected) in cases {
|
||||
assert_eq!(
|
||||
ImageCompressionAlgorithm::from_str(display).unwrap(),
|
||||
expected,
|
||||
"parsing works"
|
||||
);
|
||||
assert_eq!(format!("{expected}"), display, "Display FromStr roundtrip");
|
||||
|
||||
let ser = serde_json::to_string(&expected).expect("serialization");
|
||||
assert_eq!(
|
||||
serde_json::from_str::<ImageCompressionAlgorithm>(&ser).unwrap(),
|
||||
expected,
|
||||
"serde roundtrip"
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
serde_json::Value::String(display.to_string()),
|
||||
serde_json::to_value(expected).unwrap(),
|
||||
"Display is the serde serialization"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -235,6 +235,31 @@ timeout = '5s'";
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_storage_class_serde_roundtrip() {
|
||||
let classes = [
|
||||
None,
|
||||
Some(StorageClass::Standard),
|
||||
Some(StorageClass::IntelligentTiering),
|
||||
];
|
||||
for class in classes {
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct Wrapper {
|
||||
#[serde(
|
||||
deserialize_with = "deserialize_storage_class",
|
||||
serialize_with = "serialize_storage_class"
|
||||
)]
|
||||
class: Option<StorageClass>,
|
||||
}
|
||||
let wrapped = Wrapper {
|
||||
class: class.clone(),
|
||||
};
|
||||
let serialized = serde_json::to_string(&wrapped).unwrap();
|
||||
let deserialized: Wrapper = serde_json::from_str(&serialized).unwrap();
|
||||
assert_eq!(class, deserialized.class);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_azure_parsing() {
|
||||
let toml = "\
|
||||
|
||||
@@ -5,7 +5,9 @@ use metrics::{IntCounter, IntCounterVec};
|
||||
use once_cell::sync::Lazy;
|
||||
use strum_macros::{EnumString, EnumVariantNames};
|
||||
|
||||
#[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)]
|
||||
#[derive(
|
||||
EnumString, strum_macros::Display, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy,
|
||||
)]
|
||||
#[strum(serialize_all = "snake_case")]
|
||||
pub enum LogFormat {
|
||||
Plain,
|
||||
@@ -274,6 +276,14 @@ impl From<String> for SecretString {
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for SecretString {
|
||||
type Err = std::convert::Infallible;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
Ok(Self(s.to_string()))
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for SecretString {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "[SECRET]")
|
||||
|
||||
@@ -23,7 +23,7 @@ where
|
||||
for (i, item) in iter.enumerate() {
|
||||
visitor(item);
|
||||
|
||||
if i + 1 % interval == 0 {
|
||||
if (i + 1) % interval == 0 {
|
||||
tokio::task::yield_now().await;
|
||||
if cancel.is_cancelled() {
|
||||
return Err(YieldingLoopError::Cancelled);
|
||||
|
||||
@@ -8,7 +8,7 @@ license.workspace = true
|
||||
default = []
|
||||
# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
|
||||
# which adds some runtime cost to run tests on outage conditions
|
||||
testing = ["fail/failpoints"]
|
||||
testing = ["fail/failpoints", "pageserver_api/testing" ]
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
@@ -101,6 +101,7 @@ procfs.workspace = true
|
||||
criterion.workspace = true
|
||||
hex-literal.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
|
||||
indoc.workspace = true
|
||||
|
||||
[[bench]]
|
||||
name = "bench_layer_map"
|
||||
|
||||
@@ -4,7 +4,7 @@ use bytes::Bytes;
|
||||
use camino::Utf8PathBuf;
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use pageserver::{
|
||||
config::{defaults::DEFAULT_IO_BUFFER_ALIGNMENT, PageServerConf},
|
||||
config::PageServerConf,
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
l0_flush::{L0FlushConfig, L0FlushGlobalState},
|
||||
page_cache,
|
||||
@@ -167,7 +167,7 @@ fn criterion_benchmark(c: &mut Criterion) {
|
||||
virtual_file::init(
|
||||
16384,
|
||||
virtual_file::io_engine_for_bench(),
|
||||
DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
);
|
||||
page_cache::init(conf.page_cache_size);
|
||||
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
|
||||
use anyhow::Result;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
|
||||
use pageserver::context::{DownloadBehavior, RequestContext};
|
||||
use pageserver::task_mgr::TaskKind;
|
||||
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
@@ -148,7 +147,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
|
||||
pageserver::virtual_file::init(
|
||||
10,
|
||||
virtual_file::api::IoEngineKind::StdFs,
|
||||
DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
);
|
||||
pageserver::page_cache::init(100);
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@ use std::path::{Path, PathBuf};
|
||||
use anyhow::Result;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use clap::Subcommand;
|
||||
use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
|
||||
use pageserver::context::{DownloadBehavior, RequestContext};
|
||||
use pageserver::task_mgr::TaskKind;
|
||||
use pageserver::tenant::block_io::BlockCursor;
|
||||
@@ -194,7 +193,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
pageserver::virtual_file::init(
|
||||
10,
|
||||
virtual_file::api::IoEngineKind::StdFs,
|
||||
DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
);
|
||||
pageserver::page_cache::init(100);
|
||||
|
||||
|
||||
@@ -20,14 +20,13 @@ use clap::{Parser, Subcommand};
|
||||
use index_part::IndexPartCmd;
|
||||
use layers::LayerCmd;
|
||||
use pageserver::{
|
||||
config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
page_cache,
|
||||
task_mgr::TaskKind,
|
||||
tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
|
||||
virtual_file,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_api::{config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, shard::TenantShardId};
|
||||
use postgres_ffi::ControlFileData;
|
||||
use remote_storage::{RemotePath, RemoteStorageConfig};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
use std::env;
|
||||
use std::env::{var, VarError};
|
||||
use std::io::Read;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
@@ -223,27 +224,15 @@ fn initialize_config(
|
||||
}
|
||||
};
|
||||
|
||||
let config: toml_edit::Document = match std::fs::File::open(cfg_file_path) {
|
||||
Ok(mut f) => {
|
||||
let md = f.metadata().context("stat config file")?;
|
||||
if md.is_file() {
|
||||
let mut s = String::new();
|
||||
f.read_to_string(&mut s).context("read config file")?;
|
||||
s.parse().context("parse config file toml")?
|
||||
} else {
|
||||
anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}");
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
anyhow::bail!("open pageserver config: {e}: {cfg_file_path}");
|
||||
}
|
||||
};
|
||||
|
||||
debug!("Using pageserver toml: {config}");
|
||||
|
||||
// Construct the runtime representation
|
||||
let conf = PageServerConf::parse_and_validate(identity.id, &config, workdir)
|
||||
.context("Failed to parse pageserver configuration")?;
|
||||
let config_file_contents =
|
||||
std::fs::read_to_string(cfg_file_path).context("read config file from filesystem")?;
|
||||
let config_toml = serde_path_to_error::deserialize(
|
||||
toml_edit::de::Deserializer::from_str(&config_file_contents)
|
||||
.context("build toml deserializer")?,
|
||||
)
|
||||
.context("deserialize config toml")?;
|
||||
let conf = PageServerConf::parse_and_validate(identity.id, config_toml, workdir)
|
||||
.context("runtime-validation of config toml")?;
|
||||
|
||||
Ok(Box::leak(Box::new(conf)))
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,6 +1,8 @@
|
||||
//! Periodically collect consumption metrics for all active tenants
|
||||
//! and push them to a HTTP endpoint.
|
||||
use crate::config::PageServerConf;
|
||||
use crate::consumption_metrics::metrics::MetricsKey;
|
||||
use crate::consumption_metrics::upload::KeyGen as _;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::size::CalculateSyntheticSizeError;
|
||||
@@ -8,6 +10,7 @@ use crate::tenant::tasks::BackgroundLoopKind;
|
||||
use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant};
|
||||
use camino::Utf8PathBuf;
|
||||
use consumption_metrics::EventType;
|
||||
use itertools::Itertools as _;
|
||||
use pageserver_api::models::TenantState;
|
||||
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig};
|
||||
use reqwest::Url;
|
||||
@@ -19,9 +22,8 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::id::NodeId;
|
||||
|
||||
mod metrics;
|
||||
use crate::consumption_metrics::metrics::MetricsKey;
|
||||
mod disk_cache;
|
||||
mod metrics;
|
||||
mod upload;
|
||||
|
||||
const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
|
||||
@@ -143,6 +145,12 @@ async fn collect_metrics(
|
||||
// these are point in time, with variable "now"
|
||||
let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await;
|
||||
|
||||
// Pre-generate event idempotency keys, to reuse them across the bucket
|
||||
// and HTTP sinks.
|
||||
let idempotency_keys = std::iter::repeat_with(|| node_id.as_str().generate())
|
||||
.take(metrics.len())
|
||||
.collect_vec();
|
||||
|
||||
let metrics = Arc::new(metrics);
|
||||
|
||||
// why not race cancellation here? because we are one of the last tasks, and if we are
|
||||
@@ -161,8 +169,14 @@ async fn collect_metrics(
|
||||
}
|
||||
|
||||
if let Some(bucket_client) = &bucket_client {
|
||||
let res =
|
||||
upload::upload_metrics_bucket(bucket_client, &cancel, &node_id, &metrics).await;
|
||||
let res = upload::upload_metrics_bucket(
|
||||
bucket_client,
|
||||
&cancel,
|
||||
&node_id,
|
||||
&metrics,
|
||||
&idempotency_keys,
|
||||
)
|
||||
.await;
|
||||
if let Err(e) = res {
|
||||
tracing::error!("failed to upload to S3: {e:#}");
|
||||
}
|
||||
@@ -174,9 +188,9 @@ async fn collect_metrics(
|
||||
&client,
|
||||
metric_collection_endpoint,
|
||||
&cancel,
|
||||
&node_id,
|
||||
&metrics,
|
||||
&mut cached_metrics,
|
||||
&idempotency_keys,
|
||||
)
|
||||
.await;
|
||||
if let Err(e) = res {
|
||||
|
||||
@@ -24,16 +24,16 @@ pub(super) async fn upload_metrics_http(
|
||||
client: &reqwest::Client,
|
||||
metric_collection_endpoint: &reqwest::Url,
|
||||
cancel: &CancellationToken,
|
||||
node_id: &str,
|
||||
metrics: &[RawMetric],
|
||||
cached_metrics: &mut Cache,
|
||||
idempotency_keys: &[IdempotencyKey<'_>],
|
||||
) -> anyhow::Result<()> {
|
||||
let mut uploaded = 0;
|
||||
let mut failed = 0;
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, node_id);
|
||||
let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys);
|
||||
|
||||
while let Some(res) = iter.next() {
|
||||
let (chunk, body) = res?;
|
||||
@@ -87,6 +87,7 @@ pub(super) async fn upload_metrics_bucket(
|
||||
cancel: &CancellationToken,
|
||||
node_id: &str,
|
||||
metrics: &[RawMetric],
|
||||
idempotency_keys: &[IdempotencyKey<'_>],
|
||||
) -> anyhow::Result<()> {
|
||||
if metrics.is_empty() {
|
||||
// Skip uploads if we have no metrics, so that readers don't have to handle the edge case
|
||||
@@ -106,7 +107,7 @@ pub(super) async fn upload_metrics_bucket(
|
||||
|
||||
// Serialize and write into compressed buffer
|
||||
let started_at = std::time::Instant::now();
|
||||
for res in serialize_in_chunks(CHUNK_SIZE, metrics, node_id) {
|
||||
for res in serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys) {
|
||||
let (_chunk, body) = res?;
|
||||
gzip_writer.write_all(&body).await?;
|
||||
}
|
||||
@@ -134,29 +135,31 @@ pub(super) async fn upload_metrics_bucket(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// The return type is quite ugly, but we gain testability in isolation
|
||||
fn serialize_in_chunks<'a, F>(
|
||||
/// Serializes the input metrics as JSON in chunks of chunk_size. The provided
|
||||
/// idempotency keys are injected into the corresponding metric events (reused
|
||||
/// across different metrics sinks), and must have the same length as input.
|
||||
fn serialize_in_chunks<'a>(
|
||||
chunk_size: usize,
|
||||
input: &'a [RawMetric],
|
||||
factory: F,
|
||||
idempotency_keys: &'a [IdempotencyKey<'a>],
|
||||
) -> impl ExactSizeIterator<Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>> + 'a
|
||||
where
|
||||
F: KeyGen<'a> + 'a,
|
||||
{
|
||||
use bytes::BufMut;
|
||||
|
||||
struct Iter<'a, F> {
|
||||
assert_eq!(input.len(), idempotency_keys.len());
|
||||
|
||||
struct Iter<'a> {
|
||||
inner: std::slice::Chunks<'a, RawMetric>,
|
||||
idempotency_keys: std::slice::Iter<'a, IdempotencyKey<'a>>,
|
||||
chunk_size: usize,
|
||||
|
||||
// write to a BytesMut so that we can cheaply clone the frozen Bytes for retries
|
||||
buffer: bytes::BytesMut,
|
||||
// chunk amount of events are reused to produce the serialized document
|
||||
scratch: Vec<Event<Ids, Name>>,
|
||||
factory: F,
|
||||
}
|
||||
|
||||
impl<'a, F: KeyGen<'a>> Iterator for Iter<'a, F> {
|
||||
impl<'a> Iterator for Iter<'a> {
|
||||
type Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
@@ -167,17 +170,14 @@ where
|
||||
self.scratch.extend(
|
||||
chunk
|
||||
.iter()
|
||||
.map(|raw_metric| raw_metric.as_event(&self.factory.generate())),
|
||||
.zip(&mut self.idempotency_keys)
|
||||
.map(|(raw_metric, key)| raw_metric.as_event(key)),
|
||||
);
|
||||
} else {
|
||||
// next rounds: update_in_place to reuse allocations
|
||||
assert_eq!(self.scratch.len(), self.chunk_size);
|
||||
self.scratch
|
||||
.iter_mut()
|
||||
.zip(chunk.iter())
|
||||
.for_each(|(slot, raw_metric)| {
|
||||
raw_metric.update_in_place(slot, &self.factory.generate())
|
||||
});
|
||||
itertools::izip!(self.scratch.iter_mut(), chunk, &mut self.idempotency_keys)
|
||||
.for_each(|(slot, raw_metric, key)| raw_metric.update_in_place(slot, key));
|
||||
}
|
||||
|
||||
let res = serde_json::to_writer(
|
||||
@@ -198,18 +198,19 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, F: KeyGen<'a>> ExactSizeIterator for Iter<'a, F> {}
|
||||
impl<'a> ExactSizeIterator for Iter<'a> {}
|
||||
|
||||
let buffer = bytes::BytesMut::new();
|
||||
let inner = input.chunks(chunk_size);
|
||||
let idempotency_keys = idempotency_keys.iter();
|
||||
let scratch = Vec::new();
|
||||
|
||||
Iter {
|
||||
inner,
|
||||
idempotency_keys,
|
||||
chunk_size,
|
||||
buffer,
|
||||
scratch,
|
||||
factory,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -268,7 +269,7 @@ impl RawMetricExt for RawMetric {
|
||||
}
|
||||
}
|
||||
|
||||
trait KeyGen<'a>: Copy {
|
||||
pub(crate) trait KeyGen<'a> {
|
||||
fn generate(&self) -> IdempotencyKey<'a>;
|
||||
}
|
||||
|
||||
@@ -389,7 +390,10 @@ mod tests {
|
||||
let examples = metric_samples();
|
||||
assert!(examples.len() > 1);
|
||||
|
||||
let factory = FixedGen::new(Utc::now(), "1", 42);
|
||||
let now = Utc::now();
|
||||
let idempotency_keys = (0..examples.len())
|
||||
.map(|i| FixedGen::new(now, "1", i as u16).generate())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// need to use Event here because serde_json::Value uses default hashmap, not linked
|
||||
// hashmap
|
||||
@@ -398,13 +402,13 @@ mod tests {
|
||||
events: Vec<Event<Ids, Name>>,
|
||||
}
|
||||
|
||||
let correct = serialize_in_chunks(examples.len(), &examples, factory)
|
||||
let correct = serialize_in_chunks(examples.len(), &examples, &idempotency_keys)
|
||||
.map(|res| res.unwrap().1)
|
||||
.flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
for chunk_size in 1..examples.len() {
|
||||
let actual = serialize_in_chunks(chunk_size, &examples, factory)
|
||||
let actual = serialize_in_chunks(chunk_size, &examples, &idempotency_keys)
|
||||
.map(|res| res.unwrap().1)
|
||||
.flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
@@ -41,19 +41,15 @@
|
||||
// - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl
|
||||
// reading these fields. We use the Debug impl for semi-structured logging, though.
|
||||
|
||||
use std::{
|
||||
sync::Arc,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
use std::{sync::Arc, time::SystemTime};
|
||||
|
||||
use anyhow::Context;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_api::{config::DiskUsageEvictionTaskConfig, shard::TenantShardId};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde::Serialize;
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info, instrument, warn, Instrument};
|
||||
use utils::serde_percent::Percent;
|
||||
use utils::{completion, id::TimelineId};
|
||||
|
||||
use crate::{
|
||||
@@ -69,23 +65,9 @@ use crate::{
|
||||
CancellableTask, DiskUsageEvictionTask,
|
||||
};
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct DiskUsageEvictionTaskConfig {
|
||||
pub max_usage_pct: Percent,
|
||||
pub min_avail_bytes: u64,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub period: Duration,
|
||||
#[cfg(feature = "testing")]
|
||||
pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
|
||||
/// Select sorting for evicted layers
|
||||
#[serde(default)]
|
||||
pub eviction_order: EvictionOrder,
|
||||
}
|
||||
|
||||
/// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
|
||||
/// partitioning.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(tag = "type", content = "args")]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum EvictionOrder {
|
||||
/// Order the layers to be evicted by how recently they have been accessed relatively within
|
||||
/// the set of resident layers of a tenant.
|
||||
@@ -96,23 +78,22 @@ pub enum EvictionOrder {
|
||||
/// we read tenants is deterministic. If we find the need to use this as `false`, we need
|
||||
/// to ensure nondeterminism by adding in a random number to break the
|
||||
/// `relative_last_activity==0.0` ties.
|
||||
#[serde(default = "default_highest_layer_count_loses_first")]
|
||||
highest_layer_count_loses_first: bool,
|
||||
},
|
||||
}
|
||||
|
||||
impl Default for EvictionOrder {
|
||||
fn default() -> Self {
|
||||
Self::RelativeAccessed {
|
||||
highest_layer_count_loses_first: true,
|
||||
impl From<pageserver_api::config::EvictionOrder> for EvictionOrder {
|
||||
fn from(value: pageserver_api::config::EvictionOrder) -> Self {
|
||||
match value {
|
||||
pageserver_api::config::EvictionOrder::RelativeAccessed {
|
||||
highest_layer_count_loses_first,
|
||||
} => Self::RelativeAccessed {
|
||||
highest_layer_count_loses_first,
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn default_highest_layer_count_loses_first() -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
impl EvictionOrder {
|
||||
fn sort(&self, candidates: &mut [(EvictionPartition, EvictionCandidate)]) {
|
||||
use EvictionOrder::*;
|
||||
@@ -295,7 +276,7 @@ async fn disk_usage_eviction_task_iteration(
|
||||
storage,
|
||||
usage_pre,
|
||||
tenant_manager,
|
||||
task_config.eviction_order,
|
||||
task_config.eviction_order.into(),
|
||||
cancel,
|
||||
)
|
||||
.await;
|
||||
@@ -1257,7 +1238,6 @@ mod filesystem_level_usage {
|
||||
|
||||
#[test]
|
||||
fn max_usage_pct_pressure() {
|
||||
use super::EvictionOrder;
|
||||
use super::Usage as _;
|
||||
use std::time::Duration;
|
||||
use utils::serde_percent::Percent;
|
||||
@@ -1269,7 +1249,7 @@ mod filesystem_level_usage {
|
||||
period: Duration::MAX,
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
eviction_order: EvictionOrder::default(),
|
||||
eviction_order: pageserver_api::config::EvictionOrder::default(),
|
||||
},
|
||||
total_bytes: 100_000,
|
||||
avail_bytes: 0,
|
||||
|
||||
@@ -17,6 +17,7 @@ use hyper::header;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
|
||||
use pageserver_api::models::IngestAuxFilesRequest;
|
||||
use pageserver_api::models::ListAuxFilesRequest;
|
||||
@@ -467,9 +468,11 @@ async fn build_timeline_info_common(
|
||||
pg_version: timeline.pg_version,
|
||||
|
||||
state,
|
||||
is_archived,
|
||||
is_archived: Some(is_archived),
|
||||
|
||||
walreceiver_status,
|
||||
|
||||
last_aux_file_policy: timeline.last_aux_file_policy.load(),
|
||||
};
|
||||
Ok(info)
|
||||
}
|
||||
@@ -536,7 +539,11 @@ async fn timeline_create_handler(
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
if let Some(ancestor_id) = request_data.ancestor_timeline_id.as_ref() {
|
||||
tracing::info!(%ancestor_id, "starting to branch");
|
||||
if let Some(pg_version) = request_data.pg_version.as_ref() {
|
||||
tracing::info!(%pg_version, %ancestor_id, "starting to branch");
|
||||
} else {
|
||||
tracing::info!(%ancestor_id, "starting to branch");
|
||||
}
|
||||
} else {
|
||||
tracing::info!("bootstrapping");
|
||||
}
|
||||
@@ -2073,7 +2080,7 @@ async fn disk_usage_eviction_run(
|
||||
evict_bytes: u64,
|
||||
|
||||
#[serde(default)]
|
||||
eviction_order: crate::disk_usage_eviction_task::EvictionOrder,
|
||||
eviction_order: pageserver_api::config::EvictionOrder,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, serde::Serialize)]
|
||||
@@ -2109,7 +2116,7 @@ async fn disk_usage_eviction_run(
|
||||
&state.remote_storage,
|
||||
usage,
|
||||
&state.tenant_manager,
|
||||
config.eviction_order,
|
||||
config.eviction_order.into(),
|
||||
&cancel,
|
||||
)
|
||||
.await;
|
||||
@@ -2316,6 +2323,31 @@ async fn post_tracing_event_handler(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn force_aux_policy_switch_handler(
|
||||
mut r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&r, None)?;
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&r, "timeline_id")?;
|
||||
let policy: AuxFilePolicy = json_request(&mut r).await?;
|
||||
|
||||
let state = get_state(&r);
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
timeline
|
||||
.do_switch_aux_policy(policy)
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn put_io_engine_handler(
|
||||
mut r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -3030,6 +3062,10 @@ pub fn make_router(
|
||||
.put("/v1/io_alignment", |r| {
|
||||
api_handler(r, put_io_alignment_handler)
|
||||
})
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
|
||||
|r| api_handler(r, force_aux_policy_switch_handler),
|
||||
)
|
||||
.get("/v1/utilization", |r| api_handler(r, get_utilization))
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",
|
||||
|
||||
@@ -19,6 +19,7 @@ use crate::metrics::WAL_INGEST;
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::walingest::WalIngest;
|
||||
use crate::walrecord::decode_wal_record;
|
||||
use crate::walrecord::DecodedWALRecord;
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
use postgres_ffi::pg_constants;
|
||||
@@ -53,6 +54,8 @@ pub async fn import_timeline_from_postgres_datadir(
|
||||
tline: &Timeline,
|
||||
pgdata_path: &Utf8Path,
|
||||
pgdata_lsn: Lsn,
|
||||
change_control_file_lsn: bool,
|
||||
src_timeline: Option<&Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
let mut pg_control: Option<ControlFileData> = None;
|
||||
@@ -75,8 +78,23 @@ pub async fn import_timeline_from_postgres_datadir(
|
||||
|
||||
let mut file = tokio::fs::File::open(absolute_path).await?;
|
||||
let len = metadata.len() as usize;
|
||||
if let Some(control_file) =
|
||||
import_file(&mut modification, relative_path, &mut file, len, ctx).await?
|
||||
let new_checkpoint_lsn = if change_control_file_lsn {
|
||||
Some(pgdata_lsn)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// if this is import after pg_upgrade, skip all user data files
|
||||
// relfilenode > FirstNormalObjectId of the new cluster
|
||||
if let Some(control_file) = import_file(
|
||||
&mut modification,
|
||||
relative_path,
|
||||
&mut file,
|
||||
len,
|
||||
new_checkpoint_lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?
|
||||
{
|
||||
pg_control = Some(control_file);
|
||||
}
|
||||
@@ -84,6 +102,37 @@ pub async fn import_timeline_from_postgres_datadir(
|
||||
}
|
||||
}
|
||||
|
||||
// // if we're importing after pg_upgrade
|
||||
// // also copy metadata for all relations that were not copied
|
||||
// // from the parent timeline
|
||||
// if let Some(src_timeline) = src_timeline {
|
||||
// for ((spcnode, dbnode), _) in src_timeline
|
||||
// .list_dbdirs(pgdata_lsn, ctx)
|
||||
// .await
|
||||
// .with_context(|| format!("Failed to list_dbdirs for src_timeline"))?
|
||||
// {
|
||||
// let rels = src_timeline
|
||||
// .list_rels(spcnode, dbnode, Version::Lsn(pgdata_lsn), ctx)
|
||||
// .await
|
||||
// .with_context(|| format!("Failed to list_rels for src_timeline"))?;
|
||||
|
||||
// let new_rels = tline
|
||||
// .list_rels(spcnode, dbnode, Version::Lsn(pgdata_lsn), ctx)
|
||||
// .await
|
||||
// .with_context(|| format!("Failed to list_rels for new_timeline"))?;
|
||||
|
||||
// for rel in rels {
|
||||
// if !new_rels.contains(&rel) {
|
||||
// let nblocks = src_timeline
|
||||
// .get_rel_size(rel, Version::Lsn(pgdata_lsn), ctx)
|
||||
// .await
|
||||
// .with_context(|| format!("Failed to get_rel_size for src_timeline"))?;
|
||||
// // TODO insert relation size into the new timeline's cache
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
// We're done importing all the data files.
|
||||
modification.commit(ctx).await?;
|
||||
|
||||
@@ -93,6 +142,10 @@ pub async fn import_timeline_from_postgres_datadir(
|
||||
pg_control.state == DBState_DB_SHUTDOWNED,
|
||||
"Postgres cluster was not shut down cleanly"
|
||||
);
|
||||
info!("pg_control: {:?}", pg_control);
|
||||
info!("checkpoint: {:?}", pg_control.checkPoint);
|
||||
info!("pgdata_lsn: {:?}", pgdata_lsn.0);
|
||||
info!("checkpoint redo: {:?}", pg_control.checkPointCopy.redo);
|
||||
ensure!(
|
||||
pg_control.checkPointCopy.redo == pgdata_lsn.0,
|
||||
"unexpected checkpoint REDO pointer"
|
||||
@@ -101,18 +154,46 @@ pub async fn import_timeline_from_postgres_datadir(
|
||||
// Import WAL. This is needed even when starting from a shutdown checkpoint, because
|
||||
// this reads the checkpoint record itself, advancing the tip of the timeline to
|
||||
// *after* the checkpoint record. And crucially, it initializes the 'prev_lsn'.
|
||||
import_wal(
|
||||
&pgdata_path.join("pg_wal"),
|
||||
tline,
|
||||
Lsn(pg_control.checkPointCopy.redo),
|
||||
pgdata_lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
if !change_control_file_lsn {
|
||||
import_wal(
|
||||
&pgdata_path.join("pg_wal"),
|
||||
tline,
|
||||
Lsn(pg_control.checkPointCopy.redo),
|
||||
pgdata_lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn is_user_relfile(path: &Path) -> bool {
|
||||
let filename = &path
|
||||
.file_name()
|
||||
.expect("missing rel filename")
|
||||
.to_string_lossy();
|
||||
let (relnode, _, _) = parse_relfilename(filename)
|
||||
.map_err(|e| {
|
||||
warn!("unrecognized file in postgres datadir: {:?} ({})", path, e);
|
||||
e
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
// if this is import after pg_upgrade, skip all user data files
|
||||
// relfilenode > FirstNormalObjectId of the new cluster
|
||||
|
||||
// THIS IS WRONG
|
||||
// if catalog relation was vacuumed with vacuum full, it will have a new relfilenode
|
||||
// which will be greater than FirstNormalObjectId
|
||||
// Use pg_relfilemap decide if the relation is a catalog relation
|
||||
if relnode > pg_constants::FIRST_NORMAL_OBJECT_ID {
|
||||
//
|
||||
return true;
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
|
||||
async fn import_rel(
|
||||
modification: &mut DatadirModification<'_>,
|
||||
@@ -310,11 +391,13 @@ async fn import_wal(
|
||||
|
||||
let mut nrecords = 0;
|
||||
let mut modification = tline.begin_modification(last_lsn);
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
while last_lsn <= endpoint {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
decode_wal_record(recdata, &mut decoded, tline.pg_version)?;
|
||||
|
||||
walingest
|
||||
.ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
|
||||
.ingest_record(decoded, lsn, &mut modification, ctx)
|
||||
.await?;
|
||||
WAL_INGEST.records_committed.inc();
|
||||
|
||||
@@ -364,8 +447,15 @@ pub async fn import_basebackup_from_tar(
|
||||
|
||||
match header.entry_type() {
|
||||
tokio_tar::EntryType::Regular => {
|
||||
if let Some(res) =
|
||||
import_file(&mut modification, file_path.as_ref(), &mut entry, len, ctx).await?
|
||||
if let Some(res) = import_file(
|
||||
&mut modification,
|
||||
file_path.as_ref(),
|
||||
&mut entry,
|
||||
len,
|
||||
None,
|
||||
ctx,
|
||||
)
|
||||
.await?
|
||||
{
|
||||
// We found the pg_control file.
|
||||
pg_control = Some(res);
|
||||
@@ -449,11 +539,12 @@ pub async fn import_wal_from_tar(
|
||||
waldecoder.feed_bytes(&bytes[offset..]);
|
||||
|
||||
let mut modification = tline.begin_modification(last_lsn);
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
while last_lsn <= end_lsn {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
decode_wal_record(recdata, &mut decoded, tline.pg_version)?;
|
||||
walingest
|
||||
.ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
|
||||
.ingest_record(decoded, lsn, &mut modification, ctx)
|
||||
.await?;
|
||||
modification.commit(ctx).await?;
|
||||
last_lsn = lsn;
|
||||
@@ -489,6 +580,7 @@ async fn import_file(
|
||||
file_path: &Path,
|
||||
reader: &mut (impl AsyncRead + Send + Sync + Unpin),
|
||||
len: usize,
|
||||
new_checkpoint_lsn: Option<Lsn>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Option<ControlFileData>> {
|
||||
let file_name = match file_path.file_name() {
|
||||
@@ -502,6 +594,13 @@ async fn import_file(
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
if file_name == "pg_internal.init" {
|
||||
// tar archives on macOs, created without COPYFILE_DISABLE=1 env var
|
||||
// will contain "fork files", skip them.
|
||||
info!("skipping pg_internal.init");
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
if file_path.starts_with("global") {
|
||||
let spcnode = postgres_ffi::pg_constants::GLOBALTABLESPACE_OID;
|
||||
let dbnode = 0;
|
||||
@@ -511,7 +610,14 @@ async fn import_file(
|
||||
let bytes = read_all_bytes(reader).await?;
|
||||
|
||||
// Extract the checkpoint record and import it separately.
|
||||
let pg_control = ControlFileData::decode(&bytes[..])?;
|
||||
let mut pg_control = ControlFileData::decode(&bytes[..])?;
|
||||
|
||||
if let Some(checkpoint_lsn) = new_checkpoint_lsn {
|
||||
// If we're not changing the checkpoint LSN, use the one from the control file.
|
||||
pg_control.checkPoint = checkpoint_lsn.0;
|
||||
pg_control.checkPointCopy.redo = checkpoint_lsn.0;
|
||||
};
|
||||
|
||||
let checkpoint_bytes = pg_control.checkPointCopy.encode()?;
|
||||
modification.put_checkpoint(checkpoint_bytes)?;
|
||||
debug!("imported control file");
|
||||
@@ -531,8 +637,16 @@ async fn import_file(
|
||||
debug!("ignored PG_VERSION file");
|
||||
}
|
||||
_ => {
|
||||
import_rel(modification, file_path, spcnode, dbnode, reader, len, ctx).await?;
|
||||
debug!("imported rel creation");
|
||||
// if this is import after pg_upgrade, skip all user data files
|
||||
// relfilenode > FirstNormalObjectId of the new cluster
|
||||
// TODO Implement import_rel_from_old_version that will copy
|
||||
// relation metadata and cached size from the parent timeline
|
||||
if is_user_relfile(file_path) && new_checkpoint_lsn.is_some() {
|
||||
info!("after pg_restore skipping {:?}", file_path);
|
||||
} else {
|
||||
import_rel(modification, file_path, spcnode, dbnode, reader, len, ctx).await?;
|
||||
debug!("imported rel creation");
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if file_path.starts_with("base") {
|
||||
@@ -556,8 +670,14 @@ async fn import_file(
|
||||
debug!("ignored PG_VERSION file");
|
||||
}
|
||||
_ => {
|
||||
import_rel(modification, file_path, spcnode, dbnode, reader, len, ctx).await?;
|
||||
debug!("imported rel creation");
|
||||
// if this is import after pg_upgrade, skip all user data files
|
||||
// relfilenode > FirstNormalObjectId of the new cluster
|
||||
if is_user_relfile(file_path) && new_checkpoint_lsn.is_some() {
|
||||
info!("after pg_restore skipping {:?}", file_path);
|
||||
} else {
|
||||
import_rel(modification, file_path, spcnode, dbnode, reader, len, ctx).await?;
|
||||
debug!("imported rel creation");
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if file_path.starts_with("pg_xact") {
|
||||
|
||||
@@ -1,9 +1,7 @@
|
||||
use std::{num::NonZeroUsize, sync::Arc};
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
|
||||
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
pub enum L0FlushConfig {
|
||||
#[serde(rename_all = "snake_case")]
|
||||
Direct { max_concurrency: NonZeroUsize },
|
||||
}
|
||||
|
||||
@@ -16,6 +14,16 @@ impl Default for L0FlushConfig {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<pageserver_api::models::L0FlushConfig> for L0FlushConfig {
|
||||
fn from(config: pageserver_api::models::L0FlushConfig) -> Self {
|
||||
match config {
|
||||
pageserver_api::models::L0FlushConfig::Direct { max_concurrency } => {
|
||||
Self::Direct { max_concurrency }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct L0FlushGlobalState(Arc<Inner>);
|
||||
|
||||
|
||||
@@ -12,16 +12,17 @@ use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use crate::{aux_file, repository::*};
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use anyhow::{ensure, Context};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use enum_map::Enum;
|
||||
use pageserver_api::key::{
|
||||
dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
|
||||
relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
|
||||
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
|
||||
CompactKey, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
|
||||
CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
|
||||
};
|
||||
use pageserver_api::keyspace::SparseKeySpace;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
@@ -32,7 +33,7 @@ use std::ops::ControlFlow;
|
||||
use std::ops::Range;
|
||||
use strum::IntoEnumIterator;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, trace, warn};
|
||||
use tracing::{debug, info, trace, warn};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
use utils::pausable_failpoint;
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
@@ -167,7 +168,9 @@ impl Timeline {
|
||||
DatadirModification {
|
||||
tline: self,
|
||||
pending_lsns: Vec::new(),
|
||||
pending_updates: HashMap::new(),
|
||||
pending_metadata_pages: HashMap::new(),
|
||||
pending_data_pages: Vec::new(),
|
||||
pending_zero_data_pages: Default::default(),
|
||||
pending_deletions: Vec::new(),
|
||||
pending_nblocks: 0,
|
||||
pending_directory_entries: Vec::new(),
|
||||
@@ -666,6 +669,21 @@ impl Timeline {
|
||||
self.get(CHECKPOINT_KEY, lsn, ctx).await
|
||||
}
|
||||
|
||||
async fn list_aux_files_v1(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||
match self.get(AUX_FILES_KEY, lsn, ctx).await {
|
||||
Ok(buf) => Ok(AuxFilesDirectory::des(&buf)?.files),
|
||||
Err(e) => {
|
||||
// This is expected: historical databases do not have the key.
|
||||
debug!("Failed to get info about AUX files: {}", e);
|
||||
Ok(HashMap::new())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn list_aux_files_v2(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
@@ -696,7 +714,10 @@ impl Timeline {
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), PageReconstructError> {
|
||||
self.list_aux_files_v2(lsn, ctx).await?;
|
||||
let current_policy = self.last_aux_file_policy.load();
|
||||
if let Some(AuxFilePolicy::V2) | Some(AuxFilePolicy::CrossValidation) = current_policy {
|
||||
self.list_aux_files_v2(lsn, ctx).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -705,7 +726,51 @@ impl Timeline {
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||
self.list_aux_files_v2(lsn, ctx).await
|
||||
let current_policy = self.last_aux_file_policy.load();
|
||||
match current_policy {
|
||||
Some(AuxFilePolicy::V1) => {
|
||||
let res = self.list_aux_files_v1(lsn, ctx).await?;
|
||||
let empty_str = if res.is_empty() { ", empty" } else { "" };
|
||||
warn!(
|
||||
"this timeline is using deprecated aux file policy V1 (policy=v1{empty_str})"
|
||||
);
|
||||
Ok(res)
|
||||
}
|
||||
None => {
|
||||
let res = self.list_aux_files_v1(lsn, ctx).await?;
|
||||
if !res.is_empty() {
|
||||
warn!("this timeline is using deprecated aux file policy V1 (policy=None)");
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
|
||||
Some(AuxFilePolicy::CrossValidation) => {
|
||||
let v1_result = self.list_aux_files_v1(lsn, ctx).await;
|
||||
let v2_result = self.list_aux_files_v2(lsn, ctx).await;
|
||||
match (v1_result, v2_result) {
|
||||
(Ok(v1), Ok(v2)) => {
|
||||
if v1 != v2 {
|
||||
tracing::error!(
|
||||
"unmatched aux file v1 v2 result:\nv1 {v1:?}\nv2 {v2:?}"
|
||||
);
|
||||
return Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
"unmatched aux file v1 v2 result"
|
||||
)));
|
||||
}
|
||||
Ok(v1)
|
||||
}
|
||||
(Ok(_), Err(v2)) => {
|
||||
tracing::error!("aux file v1 returns Ok while aux file v2 returns an err");
|
||||
Err(v2)
|
||||
}
|
||||
(Err(v1), Ok(_)) => {
|
||||
tracing::error!("aux file v2 returns Ok while aux file v1 returns an err");
|
||||
Err(v1)
|
||||
}
|
||||
(Err(_), Err(v2)) => Err(v2),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn get_replorigins(
|
||||
@@ -847,6 +912,9 @@ impl Timeline {
|
||||
|
||||
result.add_key(CONTROLFILE_KEY);
|
||||
result.add_key(CHECKPOINT_KEY);
|
||||
if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
|
||||
result.add_key(AUX_FILES_KEY);
|
||||
}
|
||||
|
||||
// Add extra keyspaces in the test cases. Some test cases write keys into the storage without
|
||||
// creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace`
|
||||
@@ -969,10 +1037,24 @@ pub struct DatadirModification<'a> {
|
||||
// The put-functions add the modifications here, and they are flushed to the
|
||||
// underlying key-value store by the 'finish' function.
|
||||
pending_lsns: Vec<Lsn>,
|
||||
pending_updates: HashMap<Key, Vec<(Lsn, usize, Value)>>,
|
||||
pending_deletions: Vec<(Range<Key>, Lsn)>,
|
||||
pending_nblocks: i64,
|
||||
|
||||
/// Metadata writes, indexed by key so that they can be read from not-yet-committed modifications
|
||||
/// while ingesting subsequent records. See [`Self::is_data_key`] for the definition of 'metadata'.
|
||||
pending_metadata_pages: HashMap<CompactKey, Vec<(Lsn, usize, Value)>>,
|
||||
|
||||
/// Data writes, ready to be flushed into an ephemeral layer. See [`Self::is_data_key`] for
|
||||
/// which keys are stored here.
|
||||
pending_data_pages: Vec<(CompactKey, Lsn, usize, Value)>,
|
||||
|
||||
// Sometimes during ingest, for example when extending a relation, we would like to write a zero page. However,
|
||||
// if we encounter a write from postgres in the same wal record, we will drop this entry.
|
||||
//
|
||||
// Unlike other 'pending' fields, this does not last until the next call to commit(): it is flushed
|
||||
// at the end of each wal record, and all these writes implicitly are at lsn Self::lsn
|
||||
pending_zero_data_pages: HashSet<CompactKey>,
|
||||
|
||||
/// For special "directory" keys that store key-value maps, track the size of the map
|
||||
/// if it was updated in this modification.
|
||||
pending_directory_entries: Vec<(DirectoryKind, usize)>,
|
||||
@@ -996,6 +1078,10 @@ impl<'a> DatadirModification<'a> {
|
||||
self.pending_bytes
|
||||
}
|
||||
|
||||
pub(crate) fn has_dirty_data_pages(&self) -> bool {
|
||||
(!self.pending_data_pages.is_empty()) || (!self.pending_zero_data_pages.is_empty())
|
||||
}
|
||||
|
||||
/// Set the current lsn
|
||||
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
|
||||
ensure!(
|
||||
@@ -1004,6 +1090,10 @@ impl<'a> DatadirModification<'a> {
|
||||
lsn,
|
||||
self.lsn
|
||||
);
|
||||
|
||||
// If we are advancing LSN, then state from previous wal record should have been flushed.
|
||||
assert!(self.pending_zero_data_pages.is_empty());
|
||||
|
||||
if lsn > self.lsn {
|
||||
self.pending_lsns.push(self.lsn);
|
||||
self.lsn = lsn;
|
||||
@@ -1011,6 +1101,17 @@ impl<'a> DatadirModification<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// In this context, 'metadata' means keys that are only read by the pageserver internally, and 'data' means
|
||||
/// keys that represent literal blocks that postgres can read. So data includes relation blocks and
|
||||
/// SLRU blocks, which are read directly by postgres, and everything else is considered metadata.
|
||||
///
|
||||
/// The distinction is important because data keys are handled on a fast path where dirty writes are
|
||||
/// not readable until this modification is committed, whereas metadata keys are visible for read
|
||||
/// via [`Self::get`] as soon as their record has been ingested.
|
||||
fn is_data_key(key: &Key) -> bool {
|
||||
key.is_rel_block_key() || key.is_slru_block_key()
|
||||
}
|
||||
|
||||
/// Initialize a completely new repository.
|
||||
///
|
||||
/// This inserts the directory metadata entries that are assumed to
|
||||
@@ -1022,6 +1123,9 @@ impl<'a> DatadirModification<'a> {
|
||||
self.pending_directory_entries.push((DirectoryKind::Db, 0));
|
||||
self.put(DBDIR_KEY, Value::Image(buf.into()));
|
||||
|
||||
// Create AuxFilesDirectory
|
||||
self.init_aux_dir()?;
|
||||
|
||||
let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
|
||||
xids: HashSet::new(),
|
||||
})?;
|
||||
@@ -1115,6 +1219,31 @@ impl<'a> DatadirModification<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn put_rel_page_image_zero(&mut self, rel: RelTag, blknum: BlockNumber) {
|
||||
self.pending_zero_data_pages
|
||||
.insert(rel_block_to_key(rel, blknum).to_compact());
|
||||
self.pending_bytes += ZERO_PAGE.len();
|
||||
}
|
||||
|
||||
pub(crate) fn put_slru_page_image_zero(
|
||||
&mut self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
blknum: BlockNumber,
|
||||
) {
|
||||
self.pending_zero_data_pages
|
||||
.insert(slru_block_to_key(kind, segno, blknum).to_compact());
|
||||
self.pending_bytes += ZERO_PAGE.len();
|
||||
}
|
||||
|
||||
/// Call this at the end of each WAL record.
|
||||
pub(crate) fn on_record_end(&mut self) {
|
||||
let pending_zero_data_pages = std::mem::take(&mut self.pending_zero_data_pages);
|
||||
for key in pending_zero_data_pages {
|
||||
self.put_data(key, Value::Image(ZERO_PAGE.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
/// Store a relmapper file (pg_filenode.map) in the repository
|
||||
pub async fn put_relmap_file(
|
||||
&mut self,
|
||||
@@ -1134,6 +1263,9 @@ impl<'a> DatadirModification<'a> {
|
||||
// 'true', now write the updated 'dbdirs' map back.
|
||||
let buf = DbDirectory::ser(&dbdir)?;
|
||||
self.put(DBDIR_KEY, Value::Image(buf.into()));
|
||||
|
||||
// Create AuxFilesDirectory as well
|
||||
self.init_aux_dir()?;
|
||||
}
|
||||
if r.is_none() {
|
||||
// Create RelDirectory
|
||||
@@ -1494,60 +1626,200 @@ impl<'a> DatadirModification<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn init_aux_dir(&mut self) -> anyhow::Result<()> {
|
||||
if let AuxFilePolicy::V2 = self.tline.get_switch_aux_file_policy() {
|
||||
return Ok(());
|
||||
}
|
||||
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
|
||||
files: HashMap::new(),
|
||||
})?;
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::AuxFiles, 0));
|
||||
self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn put_file(
|
||||
&mut self,
|
||||
path: &str,
|
||||
content: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let key = aux_file::encode_aux_file_key(path);
|
||||
// retrieve the key from the engine
|
||||
let old_val = match self.get(key, ctx).await {
|
||||
Ok(val) => Some(val),
|
||||
Err(PageReconstructError::MissingKey(_)) => None,
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
|
||||
aux_file::decode_file_value(old_val)?
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
let mut other_files = Vec::with_capacity(files.len());
|
||||
let mut modifying_file = None;
|
||||
for file @ (p, content) in files {
|
||||
if path == p {
|
||||
assert!(
|
||||
modifying_file.is_none(),
|
||||
"duplicated entries found for {}",
|
||||
path
|
||||
);
|
||||
modifying_file = Some(content);
|
||||
let switch_policy = self.tline.get_switch_aux_file_policy();
|
||||
|
||||
let policy = {
|
||||
let current_policy = self.tline.last_aux_file_policy.load();
|
||||
// Allowed switch path:
|
||||
// * no aux files -> v1/v2/cross-validation
|
||||
// * cross-validation->v2
|
||||
|
||||
let current_policy = if current_policy.is_none() {
|
||||
// This path will only be hit once per tenant: we will decide the final policy in this code block.
|
||||
// The next call to `put_file` will always have `last_aux_file_policy != None`.
|
||||
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
|
||||
let aux_files_key_v1 = self.tline.list_aux_files_v1(lsn, ctx).await?;
|
||||
if aux_files_key_v1.is_empty() {
|
||||
None
|
||||
} else {
|
||||
warn!("this timeline is using deprecated aux file policy V1 (detected existing v1 files)");
|
||||
self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
|
||||
Some(AuxFilePolicy::V1)
|
||||
}
|
||||
} else {
|
||||
other_files.push(file);
|
||||
current_policy
|
||||
};
|
||||
|
||||
if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) {
|
||||
self.tline.do_switch_aux_policy(switch_policy)?;
|
||||
info!(current=?current_policy, next=?switch_policy, "switching aux file policy");
|
||||
switch_policy
|
||||
} else {
|
||||
// This branch handles non-valid migration path, and the case that switch_policy == current_policy.
|
||||
// And actually, because the migration path always allow unspecified -> *, this unwrap_or will never be hit.
|
||||
current_policy.unwrap_or(AuxFilePolicy::default_tenant_config())
|
||||
}
|
||||
};
|
||||
|
||||
if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy {
|
||||
let key = aux_file::encode_aux_file_key(path);
|
||||
// retrieve the key from the engine
|
||||
let old_val = match self.get(key, ctx).await {
|
||||
Ok(val) => Some(val),
|
||||
Err(PageReconstructError::MissingKey(_)) => None,
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
|
||||
aux_file::decode_file_value(old_val)?
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
let mut other_files = Vec::with_capacity(files.len());
|
||||
let mut modifying_file = None;
|
||||
for file @ (p, content) in files {
|
||||
if path == p {
|
||||
assert!(
|
||||
modifying_file.is_none(),
|
||||
"duplicated entries found for {}",
|
||||
path
|
||||
);
|
||||
modifying_file = Some(content);
|
||||
} else {
|
||||
other_files.push(file);
|
||||
}
|
||||
}
|
||||
let mut new_files = other_files;
|
||||
match (modifying_file, content.is_empty()) {
|
||||
(Some(old_content), false) => {
|
||||
self.tline
|
||||
.aux_file_size_estimator
|
||||
.on_update(old_content.len(), content.len());
|
||||
new_files.push((path, content));
|
||||
}
|
||||
(Some(old_content), true) => {
|
||||
self.tline
|
||||
.aux_file_size_estimator
|
||||
.on_remove(old_content.len());
|
||||
// not adding the file key to the final `new_files` vec.
|
||||
}
|
||||
(None, false) => {
|
||||
self.tline.aux_file_size_estimator.on_add(content.len());
|
||||
new_files.push((path, content));
|
||||
}
|
||||
(None, true) => warn!("removing non-existing aux file: {}", path),
|
||||
}
|
||||
let new_val = aux_file::encode_file_value(&new_files)?;
|
||||
self.put(key, Value::Image(new_val.into()));
|
||||
}
|
||||
let mut new_files = other_files;
|
||||
match (modifying_file, content.is_empty()) {
|
||||
(Some(old_content), false) => {
|
||||
self.tline
|
||||
.aux_file_size_estimator
|
||||
.on_update(old_content.len(), content.len());
|
||||
new_files.push((path, content));
|
||||
|
||||
if let AuxFilePolicy::V1 | AuxFilePolicy::CrossValidation = policy {
|
||||
let file_path = path.to_string();
|
||||
let content = if content.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(Bytes::copy_from_slice(content))
|
||||
};
|
||||
|
||||
let n_files;
|
||||
let mut aux_files = self.tline.aux_files.lock().await;
|
||||
if let Some(mut dir) = aux_files.dir.take() {
|
||||
// We already updated aux files in `self`: emit a delta and update our latest value.
|
||||
dir.upsert(file_path.clone(), content.clone());
|
||||
n_files = dir.files.len();
|
||||
if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::Image(Bytes::from(
|
||||
AuxFilesDirectory::ser(&dir).context("serialize")?,
|
||||
)),
|
||||
);
|
||||
aux_files.n_deltas = 0;
|
||||
} else {
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }),
|
||||
);
|
||||
aux_files.n_deltas += 1;
|
||||
}
|
||||
aux_files.dir = Some(dir);
|
||||
} else {
|
||||
// Check if the AUX_FILES_KEY is initialized
|
||||
match self.get(AUX_FILES_KEY, ctx).await {
|
||||
Ok(dir_bytes) => {
|
||||
let mut dir = AuxFilesDirectory::des(&dir_bytes)?;
|
||||
// Key is already set, we may append a delta
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::WalRecord(NeonWalRecord::AuxFile {
|
||||
file_path: file_path.clone(),
|
||||
content: content.clone(),
|
||||
}),
|
||||
);
|
||||
dir.upsert(file_path, content);
|
||||
n_files = dir.files.len();
|
||||
aux_files.dir = Some(dir);
|
||||
}
|
||||
Err(
|
||||
e @ (PageReconstructError::Cancelled
|
||||
| PageReconstructError::AncestorLsnTimeout(_)),
|
||||
) => {
|
||||
// Important that we do not interpret a shutdown error as "not found" and thereby
|
||||
// reset the map.
|
||||
return Err(e.into());
|
||||
}
|
||||
// Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but
|
||||
// the original code assumes all other errors are missing keys. Therefore, we keep the code path
|
||||
// the same for now, though in theory, we should only match the `MissingKey` variant.
|
||||
Err(
|
||||
e @ (PageReconstructError::Other(_)
|
||||
| PageReconstructError::WalRedo(_)
|
||||
| PageReconstructError::MissingKey(_)),
|
||||
) => {
|
||||
// Key is missing, we must insert an image as the basis for subsequent deltas.
|
||||
|
||||
if !matches!(e, PageReconstructError::MissingKey(_)) {
|
||||
let e = utils::error::report_compact_sources(&e);
|
||||
tracing::warn!("treating error as if it was a missing key: {}", e);
|
||||
}
|
||||
|
||||
let mut dir = AuxFilesDirectory {
|
||||
files: HashMap::new(),
|
||||
};
|
||||
dir.upsert(file_path, content);
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::Image(Bytes::from(
|
||||
AuxFilesDirectory::ser(&dir).context("serialize")?,
|
||||
)),
|
||||
);
|
||||
n_files = 1;
|
||||
aux_files.dir = Some(dir);
|
||||
}
|
||||
}
|
||||
}
|
||||
(Some(old_content), true) => {
|
||||
self.tline
|
||||
.aux_file_size_estimator
|
||||
.on_remove(old_content.len());
|
||||
// not adding the file key to the final `new_files` vec.
|
||||
}
|
||||
(None, false) => {
|
||||
self.tline.aux_file_size_estimator.on_add(content.len());
|
||||
new_files.push((path, content));
|
||||
}
|
||||
(None, true) => warn!("removing non-existing aux file: {}", path),
|
||||
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::AuxFiles, n_files));
|
||||
}
|
||||
let new_val = aux_file::encode_file_value(&new_files)?;
|
||||
self.put(key, Value::Image(new_val.into()));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1570,7 +1842,7 @@ impl<'a> DatadirModification<'a> {
|
||||
/// retains all the metadata, but data pages are flushed. That's again OK
|
||||
/// for bulk import, where you are just loading data pages and won't try to
|
||||
/// modify the same pages twice.
|
||||
pub async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
pub(crate) async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
// Unless we have accumulated a decent amount of changes, it's not worth it
|
||||
// to scan through the pending_updates list.
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
@@ -1581,31 +1853,11 @@ impl<'a> DatadirModification<'a> {
|
||||
let mut writer = self.tline.writer().await;
|
||||
|
||||
// Flush relation and SLRU data blocks, keep metadata.
|
||||
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
|
||||
for (key, values) in self.pending_updates.drain() {
|
||||
if !key.is_valid_key_on_write_path() {
|
||||
bail!(
|
||||
"the request contains data not supported by pageserver at TimelineWriter::put: {}", key
|
||||
);
|
||||
}
|
||||
let mut write_batch = Vec::new();
|
||||
for (lsn, value_ser_size, value) in values {
|
||||
if key.is_rel_block_key() || key.is_slru_block_key() {
|
||||
// This bails out on first error without modifying pending_updates.
|
||||
// That's Ok, cf this function's doc comment.
|
||||
write_batch.push((key.to_compact(), lsn, value_ser_size, value));
|
||||
} else {
|
||||
retained_pending_updates.entry(key).or_default().push((
|
||||
lsn,
|
||||
value_ser_size,
|
||||
value,
|
||||
));
|
||||
}
|
||||
}
|
||||
writer.put_batch(write_batch, ctx).await?;
|
||||
}
|
||||
let pending_data_pages = std::mem::take(&mut self.pending_data_pages);
|
||||
|
||||
self.pending_updates = retained_pending_updates;
|
||||
// This bails out on first error without modifying pending_updates.
|
||||
// That's Ok, cf this function's doc comment.
|
||||
writer.put_batch(pending_data_pages, ctx).await?;
|
||||
self.pending_bytes = 0;
|
||||
|
||||
if pending_nblocks != 0 {
|
||||
@@ -1626,29 +1878,31 @@ impl<'a> DatadirModification<'a> {
|
||||
/// All the modifications in this atomic update are stamped by the specified LSN.
|
||||
///
|
||||
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
// Commit should never be called mid-wal-record
|
||||
assert!(self.pending_zero_data_pages.is_empty());
|
||||
|
||||
let mut writer = self.tline.writer().await;
|
||||
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
self.pending_nblocks = 0;
|
||||
|
||||
if !self.pending_updates.is_empty() {
|
||||
// Ordering: the items in this batch do not need to be in any global order, but values for
|
||||
// a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on
|
||||
// this to do efficient updates to its index.
|
||||
let batch: Vec<(CompactKey, Lsn, usize, Value)> = self
|
||||
.pending_updates
|
||||
// Ordering: the items in this batch do not need to be in any global order, but values for
|
||||
// a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on
|
||||
// this to do efficient updates to its index.
|
||||
let mut write_batch = std::mem::take(&mut self.pending_data_pages);
|
||||
|
||||
write_batch.extend(
|
||||
self.pending_metadata_pages
|
||||
.drain()
|
||||
.flat_map(|(key, values)| {
|
||||
values.into_iter().map(move |(lsn, val_ser_size, value)| {
|
||||
if !key.is_valid_key_on_write_path() {
|
||||
bail!("the request contains data not supported by pageserver at TimelineWriter::put: {}", key);
|
||||
}
|
||||
Ok((key.to_compact(), lsn, val_ser_size, value))
|
||||
})
|
||||
})
|
||||
.collect::<anyhow::Result<Vec<_>>>()?;
|
||||
values
|
||||
.into_iter()
|
||||
.map(move |(lsn, value_size, value)| (key, lsn, value_size, value))
|
||||
}),
|
||||
);
|
||||
|
||||
writer.put_batch(batch, ctx).await?;
|
||||
if !write_batch.is_empty() {
|
||||
writer.put_batch(write_batch, ctx).await?;
|
||||
}
|
||||
|
||||
if !self.pending_deletions.is_empty() {
|
||||
@@ -1679,43 +1933,103 @@ impl<'a> DatadirModification<'a> {
|
||||
}
|
||||
|
||||
pub(crate) fn len(&self) -> usize {
|
||||
self.pending_updates.len() + self.pending_deletions.len()
|
||||
self.pending_metadata_pages.len()
|
||||
+ self.pending_data_pages.len()
|
||||
+ self.pending_deletions.len()
|
||||
}
|
||||
|
||||
// Internal helper functions to batch the modifications
|
||||
|
||||
/// Read a page from the Timeline we are writing to. For metadata pages, this passes through
|
||||
/// a cache in Self, which makes writes earlier in this modification visible to WAL records later
|
||||
/// in the modification.
|
||||
///
|
||||
/// For data pages, reads pass directly to the owning Timeline: any ingest code which reads a data
|
||||
/// page must ensure that the pages they read are already committed in Timeline, for example
|
||||
/// DB create operations are always preceded by a call to commit(). This is special cased because
|
||||
/// it's rare: all the 'normal' WAL operations will only read metadata pages such as relation sizes,
|
||||
/// and not data pages.
|
||||
async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
|
||||
// Have we already updated the same key? Read the latest pending updated
|
||||
// version in that case.
|
||||
//
|
||||
// Note: we don't check pending_deletions. It is an error to request a
|
||||
// value that has been removed, deletion only avoids leaking storage.
|
||||
if let Some(values) = self.pending_updates.get(&key) {
|
||||
if let Some((_, _, value)) = values.last() {
|
||||
return if let Value::Image(img) = value {
|
||||
Ok(img.clone())
|
||||
} else {
|
||||
// Currently, we never need to read back a WAL record that we
|
||||
// inserted in the same "transaction". All the metadata updates
|
||||
// work directly with Images, and we never need to read actual
|
||||
// data pages. We could handle this if we had to, by calling
|
||||
// the walredo manager, but let's keep it simple for now.
|
||||
Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
"unexpected pending WAL record"
|
||||
)))
|
||||
};
|
||||
if !Self::is_data_key(&key) {
|
||||
// Have we already updated the same key? Read the latest pending updated
|
||||
// version in that case.
|
||||
//
|
||||
// Note: we don't check pending_deletions. It is an error to request a
|
||||
// value that has been removed, deletion only avoids leaking storage.
|
||||
if let Some(values) = self.pending_metadata_pages.get(&key.to_compact()) {
|
||||
if let Some((_, _, value)) = values.last() {
|
||||
return if let Value::Image(img) = value {
|
||||
Ok(img.clone())
|
||||
} else {
|
||||
// Currently, we never need to read back a WAL record that we
|
||||
// inserted in the same "transaction". All the metadata updates
|
||||
// work directly with Images, and we never need to read actual
|
||||
// data pages. We could handle this if we had to, by calling
|
||||
// the walredo manager, but let's keep it simple for now.
|
||||
Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
"unexpected pending WAL record"
|
||||
)))
|
||||
};
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// This is an expensive check, so we only do it in debug mode. If reading a data key,
|
||||
// this key should never be present in pending_data_pages. We ensure this by committing
|
||||
// modifications before ingesting DB create operations, which are the only kind that reads
|
||||
// data pages during ingest.
|
||||
if cfg!(debug_assertions) {
|
||||
for (dirty_key, _, _, _) in &self.pending_data_pages {
|
||||
debug_assert!(&key.to_compact() != dirty_key);
|
||||
}
|
||||
|
||||
debug_assert!(!self.pending_zero_data_pages.contains(&key.to_compact()))
|
||||
}
|
||||
}
|
||||
|
||||
// Metadata page cache miss, or we're reading a data page.
|
||||
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
|
||||
self.tline.get(key, lsn, ctx).await
|
||||
}
|
||||
|
||||
/// Only used during unit tests, force putting a key into the modification.
|
||||
#[cfg(test)]
|
||||
pub(crate) fn put_for_test(&mut self, key: Key, val: Value) {
|
||||
self.put(key, val);
|
||||
}
|
||||
|
||||
fn put(&mut self, key: Key, val: Value) {
|
||||
let values = self.pending_updates.entry(key).or_default();
|
||||
if Self::is_data_key(&key) {
|
||||
self.put_data(key.to_compact(), val)
|
||||
} else {
|
||||
self.put_metadata(key.to_compact(), val)
|
||||
}
|
||||
}
|
||||
|
||||
fn put_data(&mut self, key: CompactKey, val: Value) {
|
||||
let val_serialized_size = val.serialized_size().unwrap() as usize;
|
||||
|
||||
// If this page was previously zero'd in the same WalRecord, then drop the previous zero page write. This
|
||||
// is an optimization that avoids persisting both the zero page generated by us (e.g. during a relation extend),
|
||||
// and the subsequent postgres-originating write
|
||||
if self.pending_zero_data_pages.remove(&key) {
|
||||
self.pending_bytes -= ZERO_PAGE.len();
|
||||
}
|
||||
|
||||
self.pending_bytes += val_serialized_size;
|
||||
self.pending_data_pages
|
||||
.push((key, self.lsn, val_serialized_size, val))
|
||||
}
|
||||
|
||||
fn put_metadata(&mut self, key: CompactKey, val: Value) {
|
||||
let values = self.pending_metadata_pages.entry(key).or_default();
|
||||
// Replace the previous value if it exists at the same lsn
|
||||
if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() {
|
||||
if *last_lsn == self.lsn {
|
||||
// Update the pending_bytes contribution from this entry, and update the serialized size in place
|
||||
self.pending_bytes -= *last_value_ser_size;
|
||||
*last_value_ser_size = val.serialized_size().unwrap() as usize;
|
||||
self.pending_bytes += *last_value_ser_size;
|
||||
|
||||
// Use the latest value, this replaces any earlier write to the same (key,lsn), such as much
|
||||
// have been generated by synthesized zero page writes prior to the first real write to a page.
|
||||
*last_value = val;
|
||||
return;
|
||||
}
|
||||
@@ -1787,6 +2101,21 @@ struct RelDirectory {
|
||||
rels: HashSet<(Oid, u8)>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Default, PartialEq)]
|
||||
pub(crate) struct AuxFilesDirectory {
|
||||
pub(crate) files: HashMap<String, Bytes>,
|
||||
}
|
||||
|
||||
impl AuxFilesDirectory {
|
||||
pub(crate) fn upsert(&mut self, key: String, value: Option<Bytes>) {
|
||||
if let Some(value) = value {
|
||||
self.files.insert(key, value);
|
||||
} else {
|
||||
self.files.remove(&key);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct RelSizeEntry {
|
||||
nblocks: u32,
|
||||
|
||||
@@ -60,32 +60,7 @@ pub mod mock {
|
||||
use regex::Regex;
|
||||
use tracing::log::info;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(tag = "type")]
|
||||
pub enum Behavior {
|
||||
Success {
|
||||
blocksize: u64,
|
||||
total_blocks: u64,
|
||||
name_filter: Option<utils::serde_regex::Regex>,
|
||||
},
|
||||
Failure {
|
||||
mocked_error: MockedError,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[allow(clippy::upper_case_acronyms)]
|
||||
pub enum MockedError {
|
||||
EIO,
|
||||
}
|
||||
|
||||
impl From<MockedError> for nix::Error {
|
||||
fn from(e: MockedError) -> Self {
|
||||
match e {
|
||||
MockedError::EIO => nix::Error::EIO,
|
||||
}
|
||||
}
|
||||
}
|
||||
pub use pageserver_api::config::statvfs::mock::Behavior;
|
||||
|
||||
pub fn get(tenants_dir: &Utf8Path, behavior: &Behavior) -> nix::Result<Statvfs> {
|
||||
info!("running mocked statvfs");
|
||||
@@ -116,6 +91,7 @@ pub mod mock {
|
||||
block_size: *blocksize,
|
||||
})
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
Behavior::Failure { mocked_error } => Err((*mocked_error).into()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,6 +20,7 @@ use futures::stream::FuturesUnordered;
|
||||
use futures::FutureExt;
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::models;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::TimelineArchivalState;
|
||||
use pageserver_api::models::TimelineState;
|
||||
use pageserver_api::models::TopTenantShardItem;
|
||||
@@ -32,6 +33,7 @@ use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::TimeoutOrCancel;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
use std::process::Stdio;
|
||||
use std::sync::Weak;
|
||||
use std::time::SystemTime;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
@@ -47,6 +49,7 @@ use utils::completion;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::failpoint_support;
|
||||
use utils::fs_ext;
|
||||
use utils::id::TenantId;
|
||||
use utils::pausable_failpoint;
|
||||
use utils::sync::gate::Gate;
|
||||
use utils::sync::gate::GateGuard;
|
||||
@@ -83,6 +86,7 @@ use crate::metrics::{
|
||||
remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN,
|
||||
TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
|
||||
};
|
||||
use crate::pgdatadir_mapping;
|
||||
use crate::repository::GcResult;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
@@ -683,6 +687,7 @@ impl Tenant {
|
||||
index_part: Option<IndexPart>,
|
||||
metadata: TimelineMetadata,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
_ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let tenant_id = self.tenant_shard_id;
|
||||
@@ -693,6 +698,10 @@ impl Tenant {
|
||||
ancestor.clone(),
|
||||
resources,
|
||||
CreateTimelineCause::Load,
|
||||
// This could be derived from ancestor branch + index part. Though the only caller of `timeline_init_and_sync` is `load_remote_timeline`,
|
||||
// there will potentially be other caller of this function in the future, and we don't know whether `index_part` or `ancestor` takes precedence.
|
||||
// Therefore, we pass this field explicitly for now, and remove it once we fully migrate to aux file v2.
|
||||
last_aux_file_policy,
|
||||
)?;
|
||||
let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
|
||||
anyhow::ensure!(
|
||||
@@ -707,6 +716,10 @@ impl Tenant {
|
||||
|
||||
if let Some(index_part) = index_part.as_ref() {
|
||||
timeline.remote_client.init_upload_queue(index_part)?;
|
||||
|
||||
timeline
|
||||
.last_aux_file_policy
|
||||
.store(index_part.last_aux_file_policy());
|
||||
} else {
|
||||
// No data on the remote storage, but we have local metadata file. We can end up
|
||||
// here with timeline_create being interrupted before finishing index part upload.
|
||||
@@ -1277,12 +1290,15 @@ impl Tenant {
|
||||
None
|
||||
};
|
||||
|
||||
let last_aux_file_policy = index_part.last_aux_file_policy();
|
||||
|
||||
self.timeline_init_and_sync(
|
||||
timeline_id,
|
||||
resources,
|
||||
Some(index_part),
|
||||
remote_metadata,
|
||||
ancestor,
|
||||
last_aux_file_policy,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -1499,6 +1515,7 @@ impl Tenant {
|
||||
create_guard,
|
||||
initdb_lsn,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -1695,15 +1712,31 @@ impl Tenant {
|
||||
WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown,
|
||||
})?;
|
||||
}
|
||||
|
||||
self.branch_timeline(
|
||||
&ancestor_timeline,
|
||||
new_timeline_id,
|
||||
ancestor_start_lsn,
|
||||
create_guard,
|
||||
ctx,
|
||||
)
|
||||
.await?
|
||||
// hackathon hackaneon single click postgres upgrade
|
||||
if pg_version > ancestor_timeline.pg_version {
|
||||
let old_pg_version = ancestor_timeline.pg_version;
|
||||
tracing::info!("Upgrading timeline {new_timeline_id} from version {old_pg_version} to {pg_version}");
|
||||
// add new stuff here
|
||||
self.branch_timeline(
|
||||
&ancestor_timeline,
|
||||
new_timeline_id,
|
||||
ancestor_start_lsn,
|
||||
create_guard,
|
||||
ctx,
|
||||
pg_version,
|
||||
)
|
||||
.await?
|
||||
} else {
|
||||
self.branch_timeline(
|
||||
&ancestor_timeline,
|
||||
new_timeline_id,
|
||||
ancestor_start_lsn,
|
||||
create_guard,
|
||||
ctx,
|
||||
ancestor_timeline.pg_version,
|
||||
)
|
||||
.await?
|
||||
}
|
||||
}
|
||||
None => {
|
||||
self.bootstrap_timeline(
|
||||
@@ -2660,6 +2693,7 @@ impl Tenant {
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
resources: TimelineResources,
|
||||
cause: CreateTimelineCause,
|
||||
last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let state = match cause {
|
||||
CreateTimelineCause::Load => {
|
||||
@@ -2688,6 +2722,7 @@ impl Tenant {
|
||||
resources,
|
||||
pg_version,
|
||||
state,
|
||||
last_aux_file_policy,
|
||||
self.cancel.child_token(),
|
||||
);
|
||||
|
||||
@@ -3213,9 +3248,17 @@ impl Tenant {
|
||||
start_lsn: Option<Lsn>,
|
||||
timeline_create_guard: TimelineCreateGuard<'_>,
|
||||
ctx: &RequestContext,
|
||||
pg_version: u32,
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_create_guard, ctx)
|
||||
.await
|
||||
self.branch_timeline_impl(
|
||||
src_timeline,
|
||||
dst_id,
|
||||
start_lsn,
|
||||
timeline_create_guard,
|
||||
ctx,
|
||||
pg_version,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn branch_timeline_impl(
|
||||
@@ -3224,7 +3267,8 @@ impl Tenant {
|
||||
dst_id: TimelineId,
|
||||
start_lsn: Option<Lsn>,
|
||||
timeline_create_guard: TimelineCreateGuard<'_>,
|
||||
_ctx: &RequestContext,
|
||||
ctx: &RequestContext,
|
||||
pg_version: u32,
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
let src_id = src_timeline.timeline_id;
|
||||
|
||||
@@ -3300,7 +3344,7 @@ impl Tenant {
|
||||
start_lsn,
|
||||
*src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer?
|
||||
src_timeline.initdb_lsn,
|
||||
src_timeline.pg_version,
|
||||
pg_version,
|
||||
);
|
||||
|
||||
let uninitialized_timeline = self
|
||||
@@ -3310,9 +3354,121 @@ impl Tenant {
|
||||
timeline_create_guard,
|
||||
start_lsn + 1,
|
||||
Some(Arc::clone(src_timeline)),
|
||||
src_timeline.last_aux_file_policy.load(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
if pg_version != src_timeline.pg_version {
|
||||
info!(
|
||||
"branching timeline {dst_id} from timeline {src_id} with different pg_version: {pg_version}",
|
||||
);
|
||||
|
||||
let timeline_id = dst_id;
|
||||
|
||||
// prepare pgdata for the new timeline
|
||||
let timelines_path = self.conf.timelines_path(&self.tenant_shard_id);
|
||||
let pgdata_path = path_with_suffix_extension(
|
||||
timelines_path.join(format!("basebackup-{timeline_id}")),
|
||||
TEMP_FILE_SUFFIX,
|
||||
);
|
||||
|
||||
if pgdata_path.exists() {
|
||||
fs::remove_dir_all(&pgdata_path).with_context(|| {
|
||||
format!("Failed to remove already existing initdb directory: {pgdata_path}")
|
||||
})?;
|
||||
}
|
||||
// this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it
|
||||
// scopeguard::defer! {
|
||||
// if let Err(e) = fs::remove_dir_all(&pgdata_path) {
|
||||
// // this is unlikely, but we will remove the directory on pageserver restart or another bootstrap call
|
||||
// error!("Failed to remove temporary initdb directory '{pgdata_path}': {e}");
|
||||
// }
|
||||
// }
|
||||
|
||||
run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to initdb {timeline_id} with pg_version {pg_version} at {pgdata_path}"
|
||||
)
|
||||
})?;
|
||||
|
||||
// TODO
|
||||
// do pg_upgrade bits here
|
||||
// Rust is not the most convenient for writing this,
|
||||
// So just call the pg_upgrade in the subprocess.
|
||||
// In the future we can turn it into API call to some service that will do the work
|
||||
//
|
||||
// 1. start postgres on a parent timeline at the start_lsn, using neon_local (now this is hardcoded)
|
||||
// 2. run pg_upgrade using neon_local for old version and freshly created pgdata for new version
|
||||
run_pg_upgrade(
|
||||
self.conf,
|
||||
&pgdata_path,
|
||||
src_timeline.pg_version,
|
||||
pg_version,
|
||||
src_timeline.timeline_id,
|
||||
self.tenant_shard_id.tenant_id,
|
||||
start_lsn,
|
||||
&self.cancel,
|
||||
).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to pg_upgrade {timeline_id} with pg_version {pg_version} at {pgdata_path}"
|
||||
)
|
||||
})?;
|
||||
|
||||
let contolfile_lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align();
|
||||
let start_lsn = start_lsn.align();
|
||||
// choose the max of controlfile_lsn and start_lsn
|
||||
//
|
||||
// It is possible that the controlfile_lsn is ahead of the start_lsn,
|
||||
// especially for small databases
|
||||
// In that case, we need to start from the controlfile_lsn.
|
||||
// Otherwise we will have LSN on the pages larger that the lsn of the branch.
|
||||
// And this will lead to the error, when compute will try to flush the page
|
||||
// with the lsn larger than the branch lsn.
|
||||
//
|
||||
// ERROR : xlog flush request %X/%X is not satisfied --- flushed only to %X/%X
|
||||
//
|
||||
// We got another problem here - a gap between the
|
||||
// branching_lsn (where we diverged with the parent) and pgdata_lsn (import lsn of the new timeline)
|
||||
// We should teach the wal-redo to skip all the records between these two points.
|
||||
// Otherwise we will see some updates from the parent timeline in the new timeline
|
||||
let pgdata_lsn = std::cmp::max(contolfile_lsn, start_lsn);
|
||||
assert!(pgdata_lsn.is_aligned());
|
||||
|
||||
// TODO why do we need these lines?
|
||||
let tenant_shard_id = uninitialized_timeline.owning_tenant.tenant_shard_id;
|
||||
let unfinished_timeline = uninitialized_timeline.raw_timeline()?;
|
||||
|
||||
// Flush the new layer files to disk, before we make the timeline as available to
|
||||
// the outside world.
|
||||
//
|
||||
// Flush loop needs to be spawned in order to be able to flush.
|
||||
unfinished_timeline.maybe_spawn_flush_loop();
|
||||
|
||||
import_datadir::import_timeline_from_postgres_datadir(
|
||||
unfinished_timeline,
|
||||
&pgdata_path,
|
||||
pgdata_lsn,
|
||||
true,
|
||||
Some(src_timeline),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}")
|
||||
})?;
|
||||
|
||||
unfinished_timeline
|
||||
.freeze_and_flush()
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to flush after pgdatadir import for timeline {tenant_shard_id}/{timeline_id}"
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
let new_timeline = uninitialized_timeline.finish_creation()?;
|
||||
|
||||
// Root timeline gets its layers during creation and uploads them along with the metadata.
|
||||
@@ -3503,6 +3659,7 @@ impl Tenant {
|
||||
timeline_create_guard,
|
||||
pgdata_lsn,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -3519,6 +3676,8 @@ impl Tenant {
|
||||
unfinished_timeline,
|
||||
&pgdata_path,
|
||||
pgdata_lsn,
|
||||
false,
|
||||
None,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -3574,6 +3733,7 @@ impl Tenant {
|
||||
create_guard: TimelineCreateGuard<'a>,
|
||||
start_lsn: Lsn,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
) -> anyhow::Result<UninitializedTimeline> {
|
||||
let tenant_shard_id = self.tenant_shard_id;
|
||||
|
||||
@@ -3589,6 +3749,7 @@ impl Tenant {
|
||||
ancestor,
|
||||
resources,
|
||||
CreateTimelineCause::Load,
|
||||
last_aux_file_policy,
|
||||
)
|
||||
.context("Failed to create timeline data structure")?;
|
||||
|
||||
@@ -3874,6 +4035,95 @@ async fn run_initdb(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Run pg_upgrade from the old cluster to the new cluster.
|
||||
async fn run_pg_upgrade(
|
||||
conf: &'static PageServerConf,
|
||||
new_pgdata: &Utf8Path,
|
||||
old_pg_version: u32,
|
||||
new_pg_version: u32,
|
||||
_parent_timeline_id: TimelineId,
|
||||
_tenant_id: TenantId,
|
||||
_start_lsn: Lsn, // this is where we need to start compute for parent timeline to dump the data
|
||||
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), InitdbError> {
|
||||
//let old_bin_path = conf.pg_bin_dir(old_pg_version).map_err(InitdbError::Other)?;
|
||||
let pg_upgrade_bin_path = conf
|
||||
.pg_bin_dir(new_pg_version)
|
||||
.map_err(InitdbError::Other)?
|
||||
.join("pg_upgrade");
|
||||
|
||||
let pg_upgrade_lib_dir = conf
|
||||
.pg_lib_dir(new_pg_version)
|
||||
.map_err(InitdbError::Other)?;
|
||||
|
||||
info!(
|
||||
"running {} in pgdata {} from version {} to {}",
|
||||
pg_upgrade_bin_path, new_pgdata, old_pg_version, new_pg_version,
|
||||
);
|
||||
|
||||
// TODO
|
||||
// start ad-hoc compute for parent timeline to connect and dump the data
|
||||
// inspired by the script https://github.com/neondatabase/cloud/pull/17267/files
|
||||
// and neon_local
|
||||
|
||||
// We test with neon_local, so let's hardcode it for now
|
||||
let base_dir = "/home/ana/work/neon/";
|
||||
let old_pgdata = format!("{}/.neon/endpoints/main/pgdata", base_dir);
|
||||
|
||||
let pg_upgrade_command = tokio::process::Command::new(&pg_upgrade_bin_path)
|
||||
.current_dir(base_dir)
|
||||
.args(["-b", format!("{}pg_install/v15/bin/", base_dir).as_str()])
|
||||
.args(["-B", format!("{}pg_install/v16/bin/", base_dir).as_str()])
|
||||
.args(["-d", old_pgdata.as_ref()])
|
||||
.args(["-D", new_pgdata.as_ref()])
|
||||
.args(["--username", &conf.superuser])
|
||||
.args(["--socketdir", "/tmp"])
|
||||
.args([
|
||||
"--neon_start",
|
||||
format!("{}target/debug/neon_local endpoint start main", base_dir).as_str(),
|
||||
])
|
||||
.args([
|
||||
"--neon_stop",
|
||||
format!("{}target/debug/neon_local endpoint stop main", base_dir).as_str(),
|
||||
])
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &pg_upgrade_lib_dir)
|
||||
.env("DYLD_LIBRARY_PATH", &pg_upgrade_lib_dir)
|
||||
.env("PGPORTOLD", "55432")
|
||||
.env("PGPORTNEW", "55433")
|
||||
.stdin(std::process::Stdio::null())
|
||||
// stdout invocation produces the same output every time, we don't need it
|
||||
.stdout(std::process::Stdio::null())
|
||||
// we would be interested in the stderr output, if there was any
|
||||
.stderr(std::process::Stdio::piped())
|
||||
.spawn()?;
|
||||
|
||||
// print pg_upgrade_command
|
||||
info!("{:?}", pg_upgrade_command);
|
||||
|
||||
// Ideally we'd select here with the cancellation token, but the problem is that
|
||||
// we can't safely terminate initdb: it launches processes of its own, and killing
|
||||
// initdb doesn't kill them. After we return from this function, we want the target
|
||||
// directory to be able to be cleaned up.
|
||||
// See https://github.com/neondatabase/neon/issues/6385
|
||||
let pg_upgrade_output = pg_upgrade_command.wait_with_output().await?;
|
||||
if !pg_upgrade_output.status.success() {
|
||||
return Err(InitdbError::Failed(
|
||||
pg_upgrade_output.status,
|
||||
pg_upgrade_output.stderr,
|
||||
));
|
||||
}
|
||||
|
||||
// This isn't true cancellation support, see above. Still return an error to
|
||||
// excercise the cancellation code path.
|
||||
if cancel.is_cancelled() {
|
||||
return Err(InitdbError::Cancelled);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Dump contents of a layer file to stdout.
|
||||
pub async fn dump_layerfile_from_path(
|
||||
path: &Utf8Path,
|
||||
@@ -4177,6 +4427,7 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::keyspace::KeySpaceAccum;
|
||||
use crate::pgdatadir_mapping::AuxFilesDirectory;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::harness::*;
|
||||
use crate::tenant::timeline::CompactFlags;
|
||||
@@ -4185,7 +4436,7 @@ mod tests {
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use hex_literal::hex;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE};
|
||||
use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
|
||||
use rand::{thread_rng, Rng};
|
||||
@@ -4194,6 +4445,7 @@ mod tests {
|
||||
use tests::timeline::{GetVectoredError, ShutdownMode};
|
||||
use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
|
||||
use timeline::{DeltaLayerTestDesc, GcInfo};
|
||||
use utils::bin_ser::BeSer;
|
||||
use utils::id::TenantId;
|
||||
|
||||
static TEST_KEY: Lazy<Key> =
|
||||
@@ -5990,9 +6242,16 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_aux_file_e2e() {
|
||||
let harness = TenantHarness::create("test_aux_file_e2e").await.unwrap();
|
||||
async fn test_branch_copies_dirty_aux_file_flag() {
|
||||
let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag")
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// the default aux file policy to switch is v2 if not set by the admins
|
||||
assert_eq!(
|
||||
harness.tenant_conf.switch_aux_file_policy,
|
||||
AuxFilePolicy::default_tenant_config()
|
||||
);
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let mut lsn = Lsn(0x08);
|
||||
@@ -6002,6 +6261,9 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// no aux file is written at this point, so the persistent flag should be unset
|
||||
assert_eq!(tline.last_aux_file_policy.load(), None);
|
||||
|
||||
{
|
||||
lsn += 8;
|
||||
let mut modification = tline.begin_modification(lsn);
|
||||
@@ -6012,6 +6274,30 @@ mod tests {
|
||||
modification.commit(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
// there is no tenant manager to pass the configuration through, so lets mimic it
|
||||
tenant.set_new_location_config(
|
||||
AttachedTenantConf::try_from(LocationConf::attached_single(
|
||||
TenantConfOpt {
|
||||
switch_aux_file_policy: Some(AuxFilePolicy::V2),
|
||||
..Default::default()
|
||||
},
|
||||
tenant.generation,
|
||||
&pageserver_api::models::ShardParameters::default(),
|
||||
))
|
||||
.unwrap(),
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline.get_switch_aux_file_policy(),
|
||||
AuxFilePolicy::V2,
|
||||
"wanted state has been updated"
|
||||
);
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
Some(AuxFilePolicy::V2),
|
||||
"aux file is written with switch_aux_file_policy unset (which is v2), so we should use v2 there"
|
||||
);
|
||||
|
||||
// we can read everything from the storage
|
||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
assert_eq!(
|
||||
@@ -6029,6 +6315,12 @@ mod tests {
|
||||
modification.commit(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
Some(AuxFilePolicy::V2),
|
||||
"keep v2 storage format when new files are written"
|
||||
);
|
||||
|
||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test2"),
|
||||
@@ -6040,9 +6332,321 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// child copies the last flag even if that is not on remote storage yet
|
||||
assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2);
|
||||
assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V2));
|
||||
|
||||
let files = child.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
assert_eq!(files.get("pg_logical/mappings/test1"), None);
|
||||
assert_eq!(files.get("pg_logical/mappings/test2"), None);
|
||||
|
||||
// even if we crash here without flushing parent timeline with it's new
|
||||
// last_aux_file_policy we are safe, because child was never meant to access ancestor's
|
||||
// files. the ancestor can even switch back to V1 because of a migration safely.
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn aux_file_policy_switch() {
|
||||
let mut harness = TenantHarness::create("aux_file_policy_switch")
|
||||
.await
|
||||
.unwrap();
|
||||
harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let mut lsn = Lsn(0x08);
|
||||
|
||||
let tline: Arc<Timeline> = tenant
|
||||
.create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
None,
|
||||
"no aux file is written so it should be unset"
|
||||
);
|
||||
|
||||
{
|
||||
lsn += 8;
|
||||
let mut modification = tline.begin_modification(lsn);
|
||||
modification
|
||||
.put_file("pg_logical/mappings/test1", b"first", &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
modification.commit(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
// there is no tenant manager to pass the configuration through, so lets mimic it
|
||||
tenant.set_new_location_config(
|
||||
AttachedTenantConf::try_from(LocationConf::attached_single(
|
||||
TenantConfOpt {
|
||||
switch_aux_file_policy: Some(AuxFilePolicy::V2),
|
||||
..Default::default()
|
||||
},
|
||||
tenant.generation,
|
||||
&pageserver_api::models::ShardParameters::default(),
|
||||
))
|
||||
.unwrap(),
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline.get_switch_aux_file_policy(),
|
||||
AuxFilePolicy::V2,
|
||||
"wanted state has been updated"
|
||||
);
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
Some(AuxFilePolicy::CrossValidation),
|
||||
"dirty index_part.json reflected state is yet to be updated"
|
||||
);
|
||||
|
||||
// we can still read the auxfile v1 before we ingest anything new
|
||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test1"),
|
||||
Some(&bytes::Bytes::from_static(b"first"))
|
||||
);
|
||||
|
||||
{
|
||||
lsn += 8;
|
||||
let mut modification = tline.begin_modification(lsn);
|
||||
modification
|
||||
.put_file("pg_logical/mappings/test2", b"second", &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
modification.commit(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
Some(AuxFilePolicy::V2),
|
||||
"ingesting a file should apply the wanted switch state when applicable"
|
||||
);
|
||||
|
||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test1"),
|
||||
Some(&bytes::Bytes::from_static(b"first")),
|
||||
"cross validation writes to both v1 and v2 so this should be available in v2"
|
||||
);
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test2"),
|
||||
Some(&bytes::Bytes::from_static(b"second"))
|
||||
);
|
||||
|
||||
// mimic again by trying to flip it from V2 to V1 (not switched to while ingesting a file)
|
||||
tenant.set_new_location_config(
|
||||
AttachedTenantConf::try_from(LocationConf::attached_single(
|
||||
TenantConfOpt {
|
||||
switch_aux_file_policy: Some(AuxFilePolicy::V1),
|
||||
..Default::default()
|
||||
},
|
||||
tenant.generation,
|
||||
&pageserver_api::models::ShardParameters::default(),
|
||||
))
|
||||
.unwrap(),
|
||||
);
|
||||
|
||||
{
|
||||
lsn += 8;
|
||||
let mut modification = tline.begin_modification(lsn);
|
||||
modification
|
||||
.put_file("pg_logical/mappings/test2", b"third", &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
modification.commit(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
assert_eq!(
|
||||
tline.get_switch_aux_file_policy(),
|
||||
AuxFilePolicy::V1,
|
||||
"wanted state has been updated again, even if invalid request"
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
Some(AuxFilePolicy::V2),
|
||||
"ingesting a file should apply the wanted switch state when applicable"
|
||||
);
|
||||
|
||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test1"),
|
||||
Some(&bytes::Bytes::from_static(b"first"))
|
||||
);
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test2"),
|
||||
Some(&bytes::Bytes::from_static(b"third"))
|
||||
);
|
||||
|
||||
// mimic again by trying to flip it from from V1 to V2 (not switched to while ingesting a file)
|
||||
tenant.set_new_location_config(
|
||||
AttachedTenantConf::try_from(LocationConf::attached_single(
|
||||
TenantConfOpt {
|
||||
switch_aux_file_policy: Some(AuxFilePolicy::V2),
|
||||
..Default::default()
|
||||
},
|
||||
tenant.generation,
|
||||
&pageserver_api::models::ShardParameters::default(),
|
||||
))
|
||||
.unwrap(),
|
||||
);
|
||||
|
||||
{
|
||||
lsn += 8;
|
||||
let mut modification = tline.begin_modification(lsn);
|
||||
modification
|
||||
.put_file("pg_logical/mappings/test3", b"last", &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
modification.commit(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
assert_eq!(tline.get_switch_aux_file_policy(), AuxFilePolicy::V2);
|
||||
|
||||
assert_eq!(tline.last_aux_file_policy.load(), Some(AuxFilePolicy::V2));
|
||||
|
||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test1"),
|
||||
Some(&bytes::Bytes::from_static(b"first"))
|
||||
);
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test2"),
|
||||
Some(&bytes::Bytes::from_static(b"third"))
|
||||
);
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test3"),
|
||||
Some(&bytes::Bytes::from_static(b"last"))
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn aux_file_policy_force_switch() {
|
||||
let mut harness = TenantHarness::create("aux_file_policy_force_switch")
|
||||
.await
|
||||
.unwrap();
|
||||
harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let mut lsn = Lsn(0x08);
|
||||
|
||||
let tline: Arc<Timeline> = tenant
|
||||
.create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
None,
|
||||
"no aux file is written so it should be unset"
|
||||
);
|
||||
|
||||
{
|
||||
lsn += 8;
|
||||
let mut modification = tline.begin_modification(lsn);
|
||||
modification
|
||||
.put_file("pg_logical/mappings/test1", b"first", &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
modification.commit(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
tline.do_switch_aux_policy(AuxFilePolicy::V2).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
Some(AuxFilePolicy::V2),
|
||||
"dirty index_part.json reflected state is yet to be updated"
|
||||
);
|
||||
|
||||
// lose all data from v1
|
||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
assert_eq!(files.get("pg_logical/mappings/test1"), None);
|
||||
|
||||
{
|
||||
lsn += 8;
|
||||
let mut modification = tline.begin_modification(lsn);
|
||||
modification
|
||||
.put_file("pg_logical/mappings/test2", b"second", &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
modification.commit(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
// read data ingested in v2
|
||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test2"),
|
||||
Some(&bytes::Bytes::from_static(b"second"))
|
||||
);
|
||||
// lose all data from v1
|
||||
assert_eq!(files.get("pg_logical/mappings/test1"), None);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn aux_file_policy_auto_detect() {
|
||||
let mut harness = TenantHarness::create("aux_file_policy_auto_detect")
|
||||
.await
|
||||
.unwrap();
|
||||
harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to cross-validation mode
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let mut lsn = Lsn(0x08);
|
||||
|
||||
let tline: Arc<Timeline> = tenant
|
||||
.create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
None,
|
||||
"no aux file is written so it should be unset"
|
||||
);
|
||||
|
||||
{
|
||||
lsn += 8;
|
||||
let mut modification = tline.begin_modification(lsn);
|
||||
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
|
||||
files: vec![(
|
||||
"test_file".to_string(),
|
||||
Bytes::copy_from_slice(b"test_file"),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect(),
|
||||
})
|
||||
.unwrap();
|
||||
modification.put_for_test(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
|
||||
modification.commit(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
{
|
||||
lsn += 8;
|
||||
let mut modification = tline.begin_modification(lsn);
|
||||
modification
|
||||
.put_file("pg_logical/mappings/test1", b"first", &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
modification.commit(&ctx).await.unwrap();
|
||||
}
|
||||
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
Some(AuxFilePolicy::V1),
|
||||
"keep using v1 because there are aux files writting with v1"
|
||||
);
|
||||
|
||||
// we can still read the auxfile v1
|
||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
assert_eq!(
|
||||
files.get("pg_logical/mappings/test1"),
|
||||
Some(&bytes::Bytes::from_static(b"first"))
|
||||
);
|
||||
assert_eq!(
|
||||
files.get("test_file"),
|
||||
Some(&bytes::Bytes::from_static(b"test_file"))
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
@@ -9,11 +9,10 @@
|
||||
//! may lead to a data loss.
|
||||
//!
|
||||
use anyhow::bail;
|
||||
pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::CompactionAlgorithm;
|
||||
use pageserver_api::models::CompactionAlgorithmSettings;
|
||||
use pageserver_api::models::EvictionPolicy;
|
||||
use pageserver_api::models::LsnLease;
|
||||
use pageserver_api::models::{self, ThrottleConfig};
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
|
||||
use serde::de::IntoDeserializer;
|
||||
@@ -23,50 +22,6 @@ use std::num::NonZeroU64;
|
||||
use std::time::Duration;
|
||||
use utils::generation::Generation;
|
||||
|
||||
pub mod defaults {
|
||||
|
||||
// FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
|
||||
// would be more appropriate. But a low value forces the code to be exercised more,
|
||||
// which is good for now to trigger bugs.
|
||||
// This parameter actually determines L0 layer file size.
|
||||
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
|
||||
pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m";
|
||||
|
||||
// FIXME the below configs are only used by legacy algorithm. The new algorithm
|
||||
// has different parameters.
|
||||
|
||||
// Target file size, when creating image and delta layers.
|
||||
// This parameter determines L1 layer file size.
|
||||
pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
|
||||
|
||||
pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
|
||||
pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
|
||||
pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm =
|
||||
super::CompactionAlgorithm::Legacy;
|
||||
|
||||
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||
|
||||
// Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger.
|
||||
// If there's a need to decrease this value, first make sure that GC
|
||||
// doesn't hold a layer map write lock for non-trivial operations.
|
||||
// Relevant: https://github.com/neondatabase/neon/issues/3394
|
||||
pub const DEFAULT_GC_PERIOD: &str = "1 hr";
|
||||
pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
|
||||
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
|
||||
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
|
||||
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
|
||||
// The default limit on WAL lag should be set to avoid causing disconnects under high throughput
|
||||
// scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for
|
||||
// throughputs up to 1GiB/s per timeline.
|
||||
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
|
||||
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
|
||||
// By default ingest enough WAL for two new L0 layers before checking if new image
|
||||
// image layers should be created.
|
||||
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
|
||||
|
||||
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
||||
}
|
||||
|
||||
#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub(crate) enum AttachmentMode {
|
||||
/// Our generation is current as far as we know, and as far as we know we are the only attached
|
||||
@@ -281,96 +236,20 @@ impl LocationConf {
|
||||
}
|
||||
}
|
||||
|
||||
/// A tenant's calcuated configuration, which is the result of merging a
|
||||
/// tenant's TenantConfOpt with the global TenantConf from PageServerConf.
|
||||
///
|
||||
/// For storing and transmitting individual tenant's configuration, see
|
||||
/// TenantConfOpt.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct TenantConf {
|
||||
// Flush out an inmemory layer, if it's holding WAL older than this
|
||||
// This puts a backstop on how much WAL needs to be re-digested if the
|
||||
// page server crashes.
|
||||
// This parameter actually determines L0 layer file size.
|
||||
pub checkpoint_distance: u64,
|
||||
// Inmemory layer is also flushed at least once in checkpoint_timeout to
|
||||
// eventually upload WAL after activity is stopped.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub checkpoint_timeout: Duration,
|
||||
// Target file size, when creating image and delta layers.
|
||||
// This parameter determines L1 layer file size.
|
||||
pub compaction_target_size: u64,
|
||||
// How often to check if there's compaction work to be done.
|
||||
// Duration::ZERO means automatic compaction is disabled.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub compaction_period: Duration,
|
||||
// Level0 delta layer threshold for compaction.
|
||||
pub compaction_threshold: usize,
|
||||
pub compaction_algorithm: CompactionAlgorithmSettings,
|
||||
// Determines how much history is retained, to allow
|
||||
// branching and read replicas at an older point in time.
|
||||
// The unit is #of bytes of WAL.
|
||||
// Page versions older than this are garbage collected away.
|
||||
pub gc_horizon: u64,
|
||||
// Interval at which garbage collection is triggered.
|
||||
// Duration::ZERO means automatic GC is disabled
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub gc_period: Duration,
|
||||
// Delta layer churn threshold to create L1 image layers.
|
||||
pub image_creation_threshold: usize,
|
||||
// Determines how much history is retained, to allow
|
||||
// branching and read replicas at an older point in time.
|
||||
// The unit is time.
|
||||
// Page versions older than this are garbage collected away.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub pitr_interval: Duration,
|
||||
/// Maximum amount of time to wait while opening a connection to receive wal, before erroring.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub walreceiver_connect_timeout: Duration,
|
||||
/// Considers safekeepers stalled after no WAL updates were received longer than this threshold.
|
||||
/// A stalled safekeeper will be changed to a newer one when it appears.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub lagging_wal_timeout: Duration,
|
||||
/// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold.
|
||||
/// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
|
||||
/// to avoid eager reconnects.
|
||||
pub max_lsn_wal_lag: NonZeroU64,
|
||||
pub eviction_policy: EvictionPolicy,
|
||||
pub min_resident_size_override: Option<u64>,
|
||||
// See the corresponding metric's help string.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub evictions_low_residence_duration_metric_threshold: Duration,
|
||||
|
||||
/// If non-zero, the period between uploads of a heatmap from attached tenants. This
|
||||
/// may be disabled if a Tenant will not have secondary locations: only secondary
|
||||
/// locations will use the heatmap uploaded by attached locations.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub heatmap_period: Duration,
|
||||
|
||||
/// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup
|
||||
pub lazy_slru_download: bool,
|
||||
|
||||
pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
|
||||
|
||||
// How much WAL must be ingested before checking again whether a new image layer is required.
|
||||
// Expresed in multiples of checkpoint distance.
|
||||
pub image_layer_creation_check_threshold: u8,
|
||||
|
||||
/// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
|
||||
/// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
|
||||
/// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
|
||||
/// file is written.
|
||||
pub switch_aux_file_policy: AuxFilePolicy,
|
||||
|
||||
/// The length for an explicit LSN lease request.
|
||||
/// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub lsn_lease_length: Duration,
|
||||
|
||||
/// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request.
|
||||
/// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub lsn_lease_length_for_ts: Duration,
|
||||
impl Default for LocationConf {
|
||||
// TODO: this should be removed once tenant loading can guarantee that we are never
|
||||
// loading from a directory without a configuration.
|
||||
// => tech debt since https://github.com/neondatabase/neon/issues/1555
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
mode: LocationMode::Attached(AttachedLocationConfig {
|
||||
generation: Generation::none(),
|
||||
attach_mode: AttachmentMode::Single,
|
||||
}),
|
||||
tenant_conf: TenantConfOpt::default(),
|
||||
shard: ShardIdentity::unsharded(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Same as TenantConf, but this struct preserves the information about
|
||||
@@ -545,51 +424,6 @@ impl TenantConfOpt {
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for TenantConf {
|
||||
fn default() -> Self {
|
||||
use defaults::*;
|
||||
Self {
|
||||
checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
|
||||
checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT)
|
||||
.expect("cannot parse default checkpoint timeout"),
|
||||
compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE,
|
||||
compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
|
||||
.expect("cannot parse default compaction period"),
|
||||
compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
|
||||
compaction_algorithm: CompactionAlgorithmSettings {
|
||||
kind: DEFAULT_COMPACTION_ALGORITHM,
|
||||
},
|
||||
gc_horizon: DEFAULT_GC_HORIZON,
|
||||
gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
|
||||
.expect("cannot parse default gc period"),
|
||||
image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
|
||||
pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
|
||||
.expect("cannot parse default PITR interval"),
|
||||
walreceiver_connect_timeout: humantime::parse_duration(
|
||||
DEFAULT_WALRECEIVER_CONNECT_TIMEOUT,
|
||||
)
|
||||
.expect("cannot parse default walreceiver connect timeout"),
|
||||
lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT)
|
||||
.expect("cannot parse default walreceiver lagging wal timeout"),
|
||||
max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
|
||||
.expect("cannot parse default max walreceiver Lsn wal lag"),
|
||||
eviction_policy: EvictionPolicy::NoEviction,
|
||||
min_resident_size_override: None,
|
||||
evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
|
||||
DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
|
||||
)
|
||||
.expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
|
||||
heatmap_period: Duration::ZERO,
|
||||
lazy_slru_download: false,
|
||||
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
|
||||
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
|
||||
switch_aux_file_policy: AuxFilePolicy::default_tenant_config(),
|
||||
lsn_lease_length: LsnLease::DEFAULT_LENGTH,
|
||||
lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
|
||||
type Error = anyhow::Error;
|
||||
|
||||
|
||||
@@ -187,7 +187,7 @@ use camino::Utf8Path;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
|
||||
pub(crate) use download::download_initdb_tar_zst;
|
||||
use pageserver_api::models::TimelineArchivalState;
|
||||
use pageserver_api::models::{AuxFilePolicy, TimelineArchivalState};
|
||||
use pageserver_api::shard::{ShardIndex, TenantShardId};
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -628,6 +628,18 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Launch an index-file upload operation in the background, with only the `aux_file_policy` flag updated.
|
||||
pub(crate) fn schedule_index_upload_for_aux_file_policy_update(
|
||||
self: &Arc<Self>,
|
||||
last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
upload_queue.dirty.last_aux_file_policy = last_aux_file_policy;
|
||||
self.schedule_index_upload(upload_queue)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Launch an index-file upload operation in the background, with only the `archived_at` field updated.
|
||||
///
|
||||
/// Returns whether it is required to wait for the queue to be empty to ensure that the change is uploaded,
|
||||
|
||||
@@ -70,8 +70,6 @@ pub struct IndexPart {
|
||||
///
|
||||
/// None means no aux files have been written to the storage before the point
|
||||
/// when this flag is introduced.
|
||||
///
|
||||
/// This field is deprecated as part of the aux v1 retirement.
|
||||
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||
pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
|
||||
}
|
||||
@@ -134,6 +132,10 @@ impl IndexPart {
|
||||
pub(crate) fn example() -> Self {
|
||||
Self::empty(TimelineMetadata::example())
|
||||
}
|
||||
|
||||
pub(crate) fn last_aux_file_policy(&self) -> Option<AuxFilePolicy> {
|
||||
self.last_aux_file_policy
|
||||
}
|
||||
}
|
||||
|
||||
/// Metadata gathered for each of the layer files.
|
||||
|
||||
@@ -39,7 +39,7 @@ use crate::tenant::disk_btree::{
|
||||
use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
VectoredReadCoalesceMode, VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::PageReconstructError;
|
||||
@@ -52,6 +52,7 @@ use bytes::BytesMut;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use futures::StreamExt;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::config::MaxVectoredReadBytes;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::ImageCompressionAlgorithm;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
|
||||
@@ -34,8 +34,7 @@ use crate::tenant::disk_btree::{
|
||||
};
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
VectoredReadPlanner,
|
||||
BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
|
||||
@@ -46,6 +45,7 @@ use bytes::{Bytes, BytesMut};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use hex;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::config::MaxVectoredReadBytes;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::shard::{ShardIdentity, TenantShardId};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
|
||||
@@ -215,7 +215,7 @@ impl IndexEntry {
|
||||
|
||||
const _ASSERT_DEFAULT_CHECKPOINT_DISTANCE_IS_VALID: () = {
|
||||
let res = Self::validate_checkpoint_distance(
|
||||
crate::tenant::config::defaults::DEFAULT_CHECKPOINT_DISTANCE,
|
||||
pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE,
|
||||
);
|
||||
if res.is_err() {
|
||||
panic!("default checkpoint distance is valid")
|
||||
@@ -692,8 +692,13 @@ impl InMemoryLayer {
|
||||
let vec_map = inner.index.entry(key).or_default();
|
||||
let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
warn!("Key {} at {} already exists", key, lsn);
|
||||
// This should not break anything, but is unexpected: ingestion code aims to filter out
|
||||
// multiple writes to the same key at the same LSN. This happens in cases where our
|
||||
// ingenstion code generates some write like an empty page, and we see a write from postgres
|
||||
// to the same key in the same wal record. If one such write makes it through, we
|
||||
// index the most recent write, implicitly ignoring the earlier write. We log a warning
|
||||
// because this case is unexpected, and we would like tests to fail if this happens.
|
||||
warn!("Key {} at {} written twice at same LSN", key, lsn);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -10,7 +10,6 @@ use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::metrics::TENANT_TASK_EVENTS;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::config::defaults::DEFAULT_COMPACTION_PERIOD;
|
||||
use crate::tenant::throttle::Stats;
|
||||
use crate::tenant::timeline::CompactionError;
|
||||
use crate::tenant::{Tenant, TenantState};
|
||||
@@ -456,9 +455,11 @@ async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken
|
||||
|
||||
// If compaction period is set to zero (to disable it), then we will use a reasonable default
|
||||
let period = if period == Duration::ZERO {
|
||||
humantime::Duration::from_str(DEFAULT_COMPACTION_PERIOD)
|
||||
.unwrap()
|
||||
.into()
|
||||
humantime::Duration::from_str(
|
||||
pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD,
|
||||
)
|
||||
.unwrap()
|
||||
.into()
|
||||
} else {
|
||||
period
|
||||
};
|
||||
|
||||
@@ -27,7 +27,7 @@ use pageserver_api::{
|
||||
},
|
||||
keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
|
||||
models::{
|
||||
AuxFilePolicy, CompactionAlgorithm, CompactionAlgorithmSettings,
|
||||
AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, CompactionAlgorithmSettings,
|
||||
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
|
||||
InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState,
|
||||
},
|
||||
@@ -66,7 +66,6 @@ use std::{
|
||||
use crate::{
|
||||
aux_file::AuxFileSizeEstimator,
|
||||
tenant::{
|
||||
config::defaults::DEFAULT_PITR_INTERVAL,
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
metadata::TimelineMetadata,
|
||||
storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc},
|
||||
@@ -96,12 +95,13 @@ use crate::{
|
||||
use crate::{
|
||||
metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize,
|
||||
};
|
||||
use crate::{
|
||||
pgdatadir_mapping::DirectoryKind,
|
||||
virtual_file::{MaybeFatalIo, VirtualFile},
|
||||
};
|
||||
use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
|
||||
use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey};
|
||||
use crate::{
|
||||
pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
|
||||
virtual_file::{MaybeFatalIo, VirtualFile},
|
||||
};
|
||||
use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::keyspace::{KeyPartitioning, KeySpace};
|
||||
@@ -202,6 +202,11 @@ pub struct TimelineResources {
|
||||
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
|
||||
}
|
||||
|
||||
pub(crate) struct AuxFilesState {
|
||||
pub(crate) dir: Option<AuxFilesDirectory>,
|
||||
pub(crate) n_deltas: usize,
|
||||
}
|
||||
|
||||
/// The relation size cache caches relation sizes at the end of the timeline. It speeds up WAL
|
||||
/// ingestion considerably, because WAL ingestion needs to check on most records if the record
|
||||
/// implicitly extends the relation. At startup, `complete_as_of` is initialized to the current end
|
||||
@@ -405,9 +410,15 @@ pub struct Timeline {
|
||||
crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
|
||||
>,
|
||||
|
||||
/// Keep aux directory cache to avoid it's reconstruction on each update
|
||||
pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
|
||||
|
||||
/// Size estimator for aux file v2
|
||||
pub(crate) aux_file_size_estimator: AuxFileSizeEstimator,
|
||||
|
||||
/// Indicate whether aux file v2 storage is enabled.
|
||||
pub(crate) last_aux_file_policy: AtomicAuxFilePolicy,
|
||||
|
||||
/// Some test cases directly place keys into the timeline without actually modifying the directory
|
||||
/// keys (i.e., DB_DIR). The test cases creating such keys will put the keyspaces here, so that
|
||||
/// these keys won't get garbage-collected during compaction/GC. This field only modifies the dense
|
||||
@@ -1938,7 +1949,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
// TODO(yuchen): remove unused flag after implementing https://github.com/neondatabase/neon/issues/8072
|
||||
#[allow(dead_code)]
|
||||
#[allow(unused)]
|
||||
pub(crate) fn get_lsn_lease_length_for_ts(&self) -> Duration {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
@@ -1947,8 +1958,6 @@ impl Timeline {
|
||||
.unwrap_or(self.conf.default_tenant_conf.lsn_lease_length_for_ts)
|
||||
}
|
||||
|
||||
/// TODO(chi): remove after retiring aux read path
|
||||
#[allow(dead_code)]
|
||||
pub(crate) fn get_switch_aux_file_policy(&self) -> AuxFilePolicy {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
@@ -2089,6 +2098,7 @@ impl Timeline {
|
||||
resources: TimelineResources,
|
||||
pg_version: u32,
|
||||
state: TimelineState,
|
||||
aux_file_policy: Option<AuxFilePolicy>,
|
||||
cancel: CancellationToken,
|
||||
) -> Arc<Self> {
|
||||
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
||||
@@ -2215,8 +2225,15 @@ impl Timeline {
|
||||
|
||||
timeline_get_throttle: resources.timeline_get_throttle,
|
||||
|
||||
aux_files: tokio::sync::Mutex::new(AuxFilesState {
|
||||
dir: None,
|
||||
n_deltas: 0,
|
||||
}),
|
||||
|
||||
aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics),
|
||||
|
||||
last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy),
|
||||
|
||||
#[cfg(test)]
|
||||
extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),
|
||||
|
||||
@@ -2225,6 +2242,10 @@ impl Timeline {
|
||||
handles: Default::default(),
|
||||
};
|
||||
|
||||
if aux_file_policy == Some(AuxFilePolicy::V1) {
|
||||
warn!("this timeline is using deprecated aux file policy V1 (when loading the timeline)");
|
||||
}
|
||||
|
||||
result.repartition_threshold =
|
||||
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
||||
|
||||
@@ -4387,6 +4408,14 @@ impl Timeline {
|
||||
) -> Result<(), detach_ancestor::Error> {
|
||||
detach_ancestor::complete(self, tenant, attempt, ctx).await
|
||||
}
|
||||
|
||||
/// Switch aux file policy and schedule upload to the index part.
|
||||
pub(crate) fn do_switch_aux_policy(&self, policy: AuxFilePolicy) -> anyhow::Result<()> {
|
||||
self.last_aux_file_policy.store(Some(policy));
|
||||
self.remote_client
|
||||
.schedule_index_upload_for_aux_file_policy_update(Some(policy))?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for Timeline {
|
||||
@@ -5049,14 +5078,14 @@ impl Timeline {
|
||||
|
||||
// If we have a page image, and no WAL, we're all set
|
||||
if data.records.is_empty() {
|
||||
if let Some((img_lsn, img)) = &data.img {
|
||||
if let Some((img_lsn, img)) = data.img {
|
||||
trace!(
|
||||
"found page image for key {} at {}, no WAL redo required, req LSN {}",
|
||||
key,
|
||||
img_lsn,
|
||||
request_lsn,
|
||||
);
|
||||
Ok(img.clone())
|
||||
Ok(img)
|
||||
} else {
|
||||
Err(PageReconstructError::from(anyhow!(
|
||||
"base image for {key} at {request_lsn} not found"
|
||||
@@ -5067,33 +5096,138 @@ impl Timeline {
|
||||
//
|
||||
// If we don't have a base image, then the oldest WAL record better initialize
|
||||
// the page
|
||||
if data.img.is_none() && !data.records.first().unwrap().1.will_init() {
|
||||
Err(PageReconstructError::from(anyhow!(
|
||||
"Base image for {} at {} not found, but got {} WAL records",
|
||||
key,
|
||||
request_lsn,
|
||||
data.records.len()
|
||||
)))
|
||||
} else {
|
||||
if data.img.is_some() {
|
||||
|
||||
let have_img = data.img.is_some();
|
||||
let will_init = data
|
||||
.records
|
||||
.first()
|
||||
.map(|(_, rec)| rec.will_init())
|
||||
.expect("already checked to have records");
|
||||
|
||||
match (have_img, will_init) {
|
||||
(false, false) => {
|
||||
return Err(PageReconstructError::from(anyhow!(
|
||||
"Base image for {} at {} not found, but got {} WAL records",
|
||||
key,
|
||||
request_lsn,
|
||||
data.records.len()
|
||||
)))
|
||||
}
|
||||
(true, _) => {
|
||||
trace!(
|
||||
"found {} WAL records and a base image for {} at {}, performing WAL redo",
|
||||
data.records.len(),
|
||||
key,
|
||||
request_lsn
|
||||
);
|
||||
} else {
|
||||
}
|
||||
(false, _) => {
|
||||
assert!(will_init, "already checked above");
|
||||
trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn);
|
||||
};
|
||||
let res = self
|
||||
.walredo_mgr
|
||||
.as_ref()
|
||||
.context("timeline has no walredo manager")
|
||||
.map_err(PageReconstructError::WalRedo)?
|
||||
.request_redo(key, request_lsn, data.img, data.records, self.pg_version)
|
||||
}
|
||||
}
|
||||
|
||||
let oldest_lsn = data
|
||||
.records
|
||||
.first()
|
||||
.map(|(lsn, _)| lsn)
|
||||
.expect("again, checked");
|
||||
|
||||
// walk up the ancestry until we have found an ancestor covering the lsn range
|
||||
let ancestry = std::iter::successors(Some(self), |tl| tl.ancestor_timeline.as_deref())
|
||||
// 100 - initdb R pg14
|
||||
// 150 - branch S pg14
|
||||
// 200 - branch T pg15
|
||||
// 250 - branch U pg15
|
||||
// 300 - branch V pg16
|
||||
//
|
||||
// oldest_lsn = 155:
|
||||
// get [V pg16, U pg15(one_more=true), T pg15(one_more=true), S pg14(one_more=false)]
|
||||
.take_while({
|
||||
let mut one_more = true;
|
||||
|
||||
move |tl| {
|
||||
if *oldest_lsn < tl.ancestor_lsn {
|
||||
assert!(one_more);
|
||||
true
|
||||
} else {
|
||||
let prev = one_more;
|
||||
one_more = false;
|
||||
prev
|
||||
}
|
||||
}
|
||||
})
|
||||
// remove consecutive same pg_versions, which might be all in case we can use the
|
||||
// same timeline for all reconstruction.
|
||||
// [V pg16, U pg15, T pg15, S pg14] => [V pg16, T pg15, S pg14]
|
||||
.fold(Vec::<&Timeline>::with_capacity(4), |mut acc, next| {
|
||||
if acc
|
||||
.last()
|
||||
.map(|tl| tl.pg_version == next.pg_version)
|
||||
.unwrap_or(false)
|
||||
{
|
||||
// overwrite with an earlier timeline; additionally we only allow upgrades,
|
||||
// so we cannot go backwards like pg14 (branch) pg15 (branch) pg14
|
||||
*acc.last_mut().unwrap() = next;
|
||||
} else {
|
||||
acc.push(next);
|
||||
}
|
||||
acc
|
||||
});
|
||||
|
||||
// shifted for the purpose of timeline_pairs
|
||||
let later_timelines = ancestry
|
||||
.iter()
|
||||
.rev()
|
||||
.skip(1)
|
||||
.map(Some)
|
||||
.chain(std::iter::once(None));
|
||||
|
||||
// zip older and later timelines into pair, which we then use to select parts of
|
||||
// wal records to be executed on which version walredo
|
||||
let timeline_pairs = ancestry.iter().rev().zip(later_timelines);
|
||||
|
||||
let mgr = self
|
||||
.walredo_mgr
|
||||
.as_ref()
|
||||
.context("timeline has no walredo manager")
|
||||
.map_err(PageReconstructError::WalRedo)?;
|
||||
|
||||
let mut img = data.img.clone();
|
||||
let mut records = &data.records[..];
|
||||
|
||||
for (older, later) in timeline_pairs {
|
||||
let scratch = records
|
||||
.iter()
|
||||
.take_while(|(lsn, _)| {
|
||||
// if there is no later, take all remaining
|
||||
later.map(|later| lsn < &later.ancestor_lsn).unwrap_or(true)
|
||||
})
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
records = &records[scratch.len()..];
|
||||
|
||||
if later.is_none() {
|
||||
assert!(records.is_empty());
|
||||
}
|
||||
|
||||
// if we don't have any records for this timeline (which is possible)
|
||||
// go to the previous one
|
||||
if scratch.is_empty() {
|
||||
tracing::info!("no records for timeline {}", older.timeline_id);
|
||||
continue;
|
||||
}
|
||||
|
||||
// this is only used for logging on the next round
|
||||
let last_lsn = scratch.last().unwrap().0;
|
||||
|
||||
// is request_lsn ok? it's not used for anything important, just logging.
|
||||
let res = mgr
|
||||
.request_redo(key, request_lsn, img, scratch, older.pg_version)
|
||||
.await;
|
||||
let img = match res {
|
||||
Ok(img) => img,
|
||||
|
||||
img = match res {
|
||||
Ok(img) => Some((last_lsn, img)),
|
||||
Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
|
||||
Err(walredo::Error::Other(e)) => {
|
||||
return Err(PageReconstructError::WalRedo(
|
||||
@@ -5101,8 +5235,9 @@ impl Timeline {
|
||||
))
|
||||
}
|
||||
};
|
||||
Ok(img)
|
||||
}
|
||||
|
||||
Ok(img.unwrap().1)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -19,6 +19,7 @@ use bytes::Bytes;
|
||||
use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::config::{CompactL0BypassPageCacheValidation, CompactL0Phase1ValueAccess};
|
||||
use pageserver_api::key::KEY_SIZE;
|
||||
use pageserver_api::keyspace::ShardedRange;
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
|
||||
@@ -29,7 +30,6 @@ use utils::id::TimelineId;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache;
|
||||
use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD};
|
||||
use crate::tenant::remote_timeline_client::WaitCompletionError;
|
||||
use crate::tenant::storage_layer::merge_iterator::MergeIterator;
|
||||
use crate::tenant::storage_layer::split_writer::{
|
||||
@@ -43,6 +43,9 @@ use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
|
||||
use crate::tenant::timeline::{Layer, ResidentLayer};
|
||||
use crate::tenant::DeltaLayer;
|
||||
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
|
||||
use pageserver_api::config::tenant_conf_defaults::{
|
||||
DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD,
|
||||
};
|
||||
|
||||
use crate::keyspace::KeySpace;
|
||||
use crate::repository::{Key, Value};
|
||||
@@ -1433,43 +1436,6 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
|
||||
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
|
||||
pub enum CompactL0Phase1ValueAccess {
|
||||
/// The old way.
|
||||
PageCachedBlobIo,
|
||||
/// The new way.
|
||||
StreamingKmerge {
|
||||
/// If set, we run both the old way and the new way, validate that
|
||||
/// they are identical (=> [`CompactL0BypassPageCacheValidation`]),
|
||||
/// and if the validation fails,
|
||||
/// - in tests: fail them with a panic or
|
||||
/// - in prod, log a rate-limited warning and use the old way's results.
|
||||
///
|
||||
/// If not set, we only run the new way and trust its results.
|
||||
validate: Option<CompactL0BypassPageCacheValidation>,
|
||||
},
|
||||
}
|
||||
|
||||
/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`].
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
pub enum CompactL0BypassPageCacheValidation {
|
||||
/// Validate that the series of (key, lsn) pairs are the same.
|
||||
KeyLsn,
|
||||
/// Validate that the entire output of old and new way is identical.
|
||||
KeyLsnValue,
|
||||
}
|
||||
|
||||
impl Default for CompactL0Phase1ValueAccess {
|
||||
fn default() -> Self {
|
||||
CompactL0Phase1ValueAccess::StreamingKmerge {
|
||||
// TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident
|
||||
validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
/// Entry point for new tiered compaction algorithm.
|
||||
///
|
||||
|
||||
@@ -286,6 +286,8 @@ impl DeleteTimelineFlow {
|
||||
// Important. We dont pass ancestor above because it can be missing.
|
||||
// Thus we need to skip the validation here.
|
||||
CreateTimelineCause::Delete,
|
||||
// Aux file policy is not needed for deletion, assuming deletion does not read aux keyspace
|
||||
None,
|
||||
)
|
||||
.context("create_timeline_struct")?;
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ use crate::{
|
||||
task_mgr::{TaskKind, WALRECEIVER_RUNTIME},
|
||||
tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
|
||||
walingest::WalIngest,
|
||||
walrecord::DecodedWALRecord,
|
||||
walrecord::{decode_wal_record, DecodedWALRecord},
|
||||
};
|
||||
use postgres_backend::is_expected_io_error;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
@@ -312,10 +312,25 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
waldecoder.feed_bytes(data);
|
||||
|
||||
{
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
let mut modification = timeline.begin_modification(startlsn);
|
||||
let mut uncommitted_records = 0;
|
||||
let mut filtered_records = 0;
|
||||
|
||||
async fn commit(
|
||||
modification: &mut DatadirModification<'_>,
|
||||
uncommitted: &mut u64,
|
||||
filtered: &mut u64,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
WAL_INGEST
|
||||
.records_committed
|
||||
.inc_by(*uncommitted - *filtered);
|
||||
modification.commit(ctx).await?;
|
||||
*uncommitted = 0;
|
||||
*filtered = 0;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
// It is important to deal with the aligned records as lsn in getPage@LSN is
|
||||
// aligned and can be several bytes bigger. Without this alignment we are
|
||||
@@ -324,9 +339,28 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
|
||||
}
|
||||
|
||||
// Deserialize WAL record
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
decode_wal_record(recdata, &mut decoded, modification.tline.pg_version)?;
|
||||
|
||||
if decoded.is_dbase_create_copy(timeline.pg_version)
|
||||
&& uncommitted_records > 0
|
||||
{
|
||||
// Special case: legacy PG database creations operate by reading pages from a 'template' database:
|
||||
// these are the only kinds of WAL record that require reading data blocks while ingesting. Ensure
|
||||
// all earlier writes of data blocks are visible by committing any modification in flight.
|
||||
commit(
|
||||
&mut modification,
|
||||
&mut uncommitted_records,
|
||||
&mut filtered_records,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
// Ingest the records without immediately committing them.
|
||||
let ingested = walingest
|
||||
.ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
|
||||
.ingest_record(decoded, lsn, &mut modification, &ctx)
|
||||
.await
|
||||
.with_context(|| format!("could not ingest record at {lsn}"))?;
|
||||
if !ingested {
|
||||
@@ -349,21 +383,25 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
|| modification.approx_pending_bytes()
|
||||
> DatadirModification::MAX_PENDING_BYTES
|
||||
{
|
||||
WAL_INGEST
|
||||
.records_committed
|
||||
.inc_by(uncommitted_records - filtered_records);
|
||||
modification.commit(&ctx).await?;
|
||||
uncommitted_records = 0;
|
||||
filtered_records = 0;
|
||||
commit(
|
||||
&mut modification,
|
||||
&mut uncommitted_records,
|
||||
&mut filtered_records,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
// Commit the remaining records.
|
||||
if uncommitted_records > 0 {
|
||||
WAL_INGEST
|
||||
.records_committed
|
||||
.inc_by(uncommitted_records - filtered_records);
|
||||
modification.commit(&ctx).await?;
|
||||
commit(
|
||||
&mut modification,
|
||||
&mut uncommitted_records,
|
||||
&mut filtered_records,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -16,7 +16,6 @@
|
||||
//! Note that the vectored blob api does *not* go through the page cache.
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::num::NonZeroUsize;
|
||||
|
||||
use bytes::BytesMut;
|
||||
use pageserver_api::key::Key;
|
||||
@@ -29,9 +28,6 @@ use crate::context::RequestContext;
|
||||
use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK};
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
|
||||
pub struct MaxVectoredReadBytes(pub NonZeroUsize);
|
||||
|
||||
/// Metadata bundled with the start and end offset of a blob.
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub struct BlobMeta {
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
//! This is similar to PostgreSQL's virtual file descriptor facility in
|
||||
//! src/backend/storage/file/fd.c
|
||||
//!
|
||||
use crate::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
|
||||
use crate::context::RequestContext;
|
||||
use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC};
|
||||
|
||||
@@ -19,6 +18,7 @@ use crate::tenant::TENANTS_SEGMENT_NAME;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use once_cell::sync::OnceCell;
|
||||
use owned_buffers_io::io_buf_ext::FullSlice;
|
||||
use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::fs::File;
|
||||
use std::io::{Error, ErrorKind, Seek, SeekFrom};
|
||||
|
||||
@@ -84,9 +84,14 @@ pub(crate) fn get() -> IoEngine {
|
||||
}
|
||||
},
|
||||
Err(std::env::VarError::NotPresent) => {
|
||||
crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE
|
||||
.parse()
|
||||
.unwrap()
|
||||
#[cfg(target_os = "linux")]
|
||||
{
|
||||
IoEngineKind::TokioEpollUring
|
||||
}
|
||||
#[cfg(not(target_os = "linux"))]
|
||||
{
|
||||
IoEngineKind::StdFs
|
||||
}
|
||||
}
|
||||
Err(std::env::VarError::NotUnicode(_)) => {
|
||||
panic!("env var {env_var_name} is not unicode");
|
||||
|
||||
@@ -57,6 +57,7 @@ use utils::lsn::Lsn;
|
||||
|
||||
pub struct WalIngest {
|
||||
shard: ShardIdentity,
|
||||
pg_version: u32,
|
||||
checkpoint: CheckPoint,
|
||||
checkpoint_modified: bool,
|
||||
warn_ingest_lag: WarnIngestLag,
|
||||
@@ -82,6 +83,7 @@ impl WalIngest {
|
||||
|
||||
Ok(WalIngest {
|
||||
shard: *timeline.get_shard_identity(),
|
||||
pg_version: timeline.pg_version,
|
||||
checkpoint,
|
||||
checkpoint_modified: false,
|
||||
warn_ingest_lag: WarnIngestLag {
|
||||
@@ -104,10 +106,9 @@ impl WalIngest {
|
||||
///
|
||||
pub async fn ingest_record(
|
||||
&mut self,
|
||||
recdata: Bytes,
|
||||
decoded: DecodedWALRecord,
|
||||
lsn: Lsn,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
decoded: &mut DecodedWALRecord,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<bool> {
|
||||
WAL_INGEST.records_received.inc();
|
||||
@@ -115,7 +116,12 @@ impl WalIngest {
|
||||
let prev_len = modification.len();
|
||||
|
||||
modification.set_lsn(lsn)?;
|
||||
decode_wal_record(recdata, decoded, pg_version)?;
|
||||
|
||||
if decoded.is_dbase_create_copy(self.pg_version) {
|
||||
// Records of this type should always be preceded by a commit(), as they
|
||||
// rely on reading data pages back from the Timeline.
|
||||
assert!(!modification.has_dirty_data_pages());
|
||||
}
|
||||
|
||||
let mut buf = decoded.record.clone();
|
||||
buf.advance(decoded.main_data_offset);
|
||||
@@ -133,11 +139,11 @@ impl WalIngest {
|
||||
pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
|
||||
// Heap AM records need some special handling, because they modify VM pages
|
||||
// without registering them with the standard mechanism.
|
||||
self.ingest_heapam_record(&mut buf, modification, decoded, ctx)
|
||||
self.ingest_heapam_record(&mut buf, modification, &decoded, ctx)
|
||||
.await?;
|
||||
}
|
||||
pg_constants::RM_NEON_ID => {
|
||||
self.ingest_neonrmgr_record(&mut buf, modification, decoded, ctx)
|
||||
self.ingest_neonrmgr_record(&mut buf, modification, &decoded, ctx)
|
||||
.await?;
|
||||
}
|
||||
// Handle other special record types
|
||||
@@ -325,7 +331,7 @@ impl WalIngest {
|
||||
}
|
||||
pg_constants::RM_RELMAP_ID => {
|
||||
let xlrec = XlRelmapUpdate::decode(&mut buf);
|
||||
self.ingest_relmap_page(modification, &xlrec, decoded, ctx)
|
||||
self.ingest_relmap_page(modification, &xlrec, &decoded, ctx)
|
||||
.await?;
|
||||
}
|
||||
pg_constants::RM_XLOG_ID => {
|
||||
@@ -470,7 +476,7 @@ impl WalIngest {
|
||||
|
||||
continue;
|
||||
}
|
||||
self.ingest_decoded_block(modification, lsn, decoded, blk, ctx)
|
||||
self.ingest_decoded_block(modification, lsn, &decoded, blk, ctx)
|
||||
.await?;
|
||||
}
|
||||
|
||||
@@ -486,6 +492,8 @@ impl WalIngest {
|
||||
// until commit() is called to flush the data into the repository and update
|
||||
// the latest LSN.
|
||||
|
||||
modification.on_record_end();
|
||||
|
||||
Ok(modification.len() > prev_len)
|
||||
}
|
||||
|
||||
@@ -557,6 +565,7 @@ impl WalIngest {
|
||||
page_set_lsn(&mut image, lsn)
|
||||
}
|
||||
assert_eq!(image.len(), BLCKSZ as usize);
|
||||
|
||||
self.put_rel_page_image(modification, rel, blk.blkno, image.freeze(), ctx)
|
||||
.await?;
|
||||
} else {
|
||||
@@ -1195,7 +1204,7 @@ impl WalIngest {
|
||||
if rec.blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 {
|
||||
// Tail of last remaining FSM page has to be zeroed.
|
||||
// We are not precise here and instead of digging in FSM bitmap format just clear the whole page.
|
||||
modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?;
|
||||
modification.put_rel_page_image_zero(rel, fsm_physical_page_no);
|
||||
fsm_physical_page_no += 1;
|
||||
}
|
||||
let nblocks = get_relsize(modification, rel, ctx).await?;
|
||||
@@ -1217,7 +1226,7 @@ impl WalIngest {
|
||||
if rec.blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 {
|
||||
// Tail of last remaining vm page has to be zeroed.
|
||||
// We are not precise here and instead of digging in VM bitmap format just clear the whole page.
|
||||
modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?;
|
||||
modification.put_rel_page_image_zero(rel, vm_page_no);
|
||||
vm_page_no += 1;
|
||||
}
|
||||
let nblocks = get_relsize(modification, rel, ctx).await?;
|
||||
@@ -1687,7 +1696,7 @@ impl WalIngest {
|
||||
continue;
|
||||
}
|
||||
|
||||
modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?;
|
||||
modification.put_rel_page_image_zero(rel, gap_blknum);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
@@ -1753,7 +1762,7 @@ impl WalIngest {
|
||||
|
||||
// fill the gap with zeros
|
||||
for gap_blknum in old_nblocks..blknum {
|
||||
modification.put_slru_page_image(kind, segno, gap_blknum, ZERO_PAGE.clone())?;
|
||||
modification.put_slru_page_image_zero(kind, segno, gap_blknum);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
@@ -1827,21 +1836,25 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx)
|
||||
.await?;
|
||||
m.on_record_end();
|
||||
m.commit(&ctx).await?;
|
||||
let mut m = tline.begin_modification(Lsn(0x30));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 3"), &ctx)
|
||||
.await?;
|
||||
m.on_record_end();
|
||||
m.commit(&ctx).await?;
|
||||
let mut m = tline.begin_modification(Lsn(0x40));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1 at 4"), &ctx)
|
||||
.await?;
|
||||
m.on_record_end();
|
||||
m.commit(&ctx).await?;
|
||||
let mut m = tline.begin_modification(Lsn(0x50));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 2, test_img("foo blk 2 at 5"), &ctx)
|
||||
.await?;
|
||||
m.on_record_end();
|
||||
m.commit(&ctx).await?;
|
||||
|
||||
assert_current_logical_size(&tline, Lsn(0x50));
|
||||
@@ -1983,6 +1996,7 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1"), &ctx)
|
||||
.await?;
|
||||
m.on_record_end();
|
||||
m.commit(&ctx).await?;
|
||||
assert_eq!(
|
||||
tline
|
||||
@@ -2008,6 +2022,7 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 1500, test_img("foo blk 1500"), &ctx)
|
||||
.await?;
|
||||
m.on_record_end();
|
||||
m.commit(&ctx).await?;
|
||||
assert_eq!(
|
||||
tline
|
||||
@@ -2409,7 +2424,6 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
let mut modification = tline.begin_modification(startpoint);
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
println!("decoding {} bytes", bytes.len() - xlogoff);
|
||||
|
||||
// Decode and ingest wal. We process the wal in chunks because
|
||||
@@ -2417,8 +2431,10 @@ mod tests {
|
||||
for chunk in bytes[xlogoff..].chunks(50) {
|
||||
decoder.feed_bytes(chunk);
|
||||
while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() {
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
decode_wal_record(recdata, &mut decoded, modification.tline.pg_version).unwrap();
|
||||
walingest
|
||||
.ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
|
||||
.ingest_record(decoded, lsn, &mut modification, &ctx)
|
||||
.instrument(span.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
@@ -160,6 +160,30 @@ pub struct DecodedWALRecord {
|
||||
pub origin_id: u16,
|
||||
}
|
||||
|
||||
impl DecodedWALRecord {
|
||||
/// Check if this WAL record represents a legacy "copy" database creation, which populates new relations
|
||||
/// by reading other existing relations' data blocks. This is more complex to apply than new-style database
|
||||
/// creations which simply include all the desired blocks in the WAL, so we need a helper function to detect this case.
|
||||
pub(crate) fn is_dbase_create_copy(&self, pg_version: u32) -> bool {
|
||||
if self.xl_rmid == pg_constants::RM_DBASE_ID {
|
||||
let info = self.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
match pg_version {
|
||||
14 => {
|
||||
// Postgres 14 database creations are always the legacy kind
|
||||
info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE
|
||||
}
|
||||
15 => info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY,
|
||||
16 => info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY,
|
||||
_ => {
|
||||
panic!("Unsupported postgres version {pg_version}")
|
||||
}
|
||||
}
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct RelFileNode {
|
||||
|
||||
@@ -78,7 +78,11 @@ pub struct PostgresRedoManager {
|
||||
/// # Shutdown
|
||||
///
|
||||
/// See [`Self::launched_processes`].
|
||||
redo_process: heavier_once_cell::OnceCell<ProcessOnceCell>,
|
||||
///
|
||||
/// # Different pg versions
|
||||
///
|
||||
/// We run a own quiesced process for each version (pg14, pg15, pg16 and maybe pg17).
|
||||
processes: [heavier_once_cell::OnceCell<ProcessOnceCell>; 4],
|
||||
|
||||
/// Gate that is entered when launching a walredo process and held open
|
||||
/// until the process has been `kill()`ed and `wait()`ed upon.
|
||||
@@ -215,10 +219,18 @@ impl PostgresRedoManager {
|
||||
chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?)
|
||||
})
|
||||
},
|
||||
process: self.redo_process.get().and_then(|p| match &*p {
|
||||
ProcessOnceCell::Spawned(p) => Some(WalRedoManagerProcessStatus { pid: p.id() }),
|
||||
ProcessOnceCell::ManagerShutDown => None,
|
||||
}),
|
||||
process: self
|
||||
.processes
|
||||
.iter()
|
||||
.filter_map(|p| {
|
||||
p.get().and_then(|p| match &*p {
|
||||
ProcessOnceCell::Spawned(p) => {
|
||||
Some(WalRedoManagerProcessStatus { pid: p.id() })
|
||||
}
|
||||
ProcessOnceCell::ManagerShutDown => None,
|
||||
})
|
||||
})
|
||||
.next(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -236,7 +248,7 @@ impl PostgresRedoManager {
|
||||
tenant_shard_id,
|
||||
conf,
|
||||
last_redo_at: std::sync::Mutex::default(),
|
||||
redo_process: heavier_once_cell::OnceCell::default(),
|
||||
processes: Default::default(),
|
||||
launched_processes: utils::sync::gate::Gate::default(),
|
||||
}
|
||||
}
|
||||
@@ -256,26 +268,31 @@ impl PostgresRedoManager {
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
pub async fn shutdown(&self) -> bool {
|
||||
// prevent new processes from being spawned
|
||||
let maybe_permit = match self.redo_process.get_or_init_detached().await {
|
||||
Ok(guard) => {
|
||||
if matches!(&*guard, ProcessOnceCell::ManagerShutDown) {
|
||||
None
|
||||
} else {
|
||||
let (proc, permit) = guard.take_and_deinit();
|
||||
drop(proc); // this just drops the Arc, its refcount may not be zero yet
|
||||
Some(permit)
|
||||
let mut it_was_us = false;
|
||||
for process in self.processes.iter() {
|
||||
// prevent new processes from being spawned
|
||||
let maybe_permit = match process.get_or_init_detached().await {
|
||||
Ok(guard) => {
|
||||
if matches!(&*guard, ProcessOnceCell::ManagerShutDown) {
|
||||
None
|
||||
} else {
|
||||
let (proc, permit) = guard.take_and_deinit();
|
||||
drop(proc); // this just drops the Arc, its refcount may not be zero yet
|
||||
Some(permit)
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(permit) => Some(permit),
|
||||
};
|
||||
let it_was_us = if let Some(permit) = maybe_permit {
|
||||
self.redo_process
|
||||
.set(ProcessOnceCell::ManagerShutDown, permit);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
};
|
||||
Err(permit) => Some(permit),
|
||||
};
|
||||
let i_cant_see_why_this = if let Some(permit) = maybe_permit {
|
||||
process.set(ProcessOnceCell::ManagerShutDown, permit);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
// TODO: or is correct?
|
||||
it_was_us |= i_cant_see_why_this;
|
||||
}
|
||||
// wait for ongoing requests to drain and the refcounts of all Arc<WalRedoProcess> that
|
||||
// we ever launched to drop to zero, which when it happens synchronously kill()s & wait()s
|
||||
// for the underlying process.
|
||||
@@ -291,7 +308,10 @@ impl PostgresRedoManager {
|
||||
if let Some(last_redo_at) = *g {
|
||||
if last_redo_at.elapsed() >= idle_timeout {
|
||||
drop(g);
|
||||
drop(self.redo_process.get().map(|guard| guard.take_and_deinit()));
|
||||
|
||||
self.processes.iter().for_each(|c| {
|
||||
drop(c.get().map(|guard| guard.take_and_deinit()));
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -314,13 +334,23 @@ impl PostgresRedoManager {
|
||||
wal_redo_timeout: Duration,
|
||||
pg_version: u32,
|
||||
) -> Result<Bytes, Error> {
|
||||
assert!(
|
||||
(14..=17).contains(&pg_version),
|
||||
"this should be an enum, but no: {pg_version}"
|
||||
);
|
||||
*(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
|
||||
|
||||
let (rel, blknum) = key.to_rel_block().context("invalid record")?;
|
||||
const MAX_RETRY_ATTEMPTS: u32 = 1;
|
||||
let mut n_attempts = 0u32;
|
||||
loop {
|
||||
let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await {
|
||||
// handling multiple processes idea: just support N versions here, but the caller
|
||||
// splits per parent_lsn in the case that:
|
||||
// - reconstruct_data spans two versions
|
||||
// - reconstruct_data went to parent???
|
||||
let process = &self.processes[(pg_version - 14) as usize];
|
||||
|
||||
let proc: Arc<Process> = match process.get_or_init_detached().await {
|
||||
Ok(guard) => match &*guard {
|
||||
ProcessOnceCell::Spawned(proc) => Arc::clone(proc),
|
||||
ProcessOnceCell::ManagerShutDown => {
|
||||
@@ -332,11 +362,11 @@ impl PostgresRedoManager {
|
||||
// acquire guard before spawning process, so that we don't spawn new processes
|
||||
// if the gate is already closed.
|
||||
let _launched_processes_guard = match self.launched_processes.enter() {
|
||||
Ok(guard) => guard,
|
||||
Err(GateError::GateClosed) => unreachable!(
|
||||
"shutdown sets the once cell to `ManagerShutDown` state before closing the gate"
|
||||
),
|
||||
};
|
||||
Ok(guard) => guard,
|
||||
Err(GateError::GateClosed) => unreachable!(
|
||||
"shutdown sets the once cell to `ManagerShutDown` state before closing the gate"
|
||||
),
|
||||
};
|
||||
let proc = Arc::new(Process {
|
||||
process: process::WalRedoProcess::launch(
|
||||
self.conf,
|
||||
@@ -353,8 +383,7 @@ impl PostgresRedoManager {
|
||||
pid = proc.id(),
|
||||
"launched walredo process"
|
||||
);
|
||||
self.redo_process
|
||||
.set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit);
|
||||
process.set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit);
|
||||
proc
|
||||
}
|
||||
};
|
||||
@@ -419,7 +448,7 @@ impl PostgresRedoManager {
|
||||
// than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
|
||||
// we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
|
||||
// This probably needs revisiting at some later point.
|
||||
match self.redo_process.get() {
|
||||
match process.get() {
|
||||
None => (),
|
||||
Some(guard) => {
|
||||
match &*guard {
|
||||
@@ -448,9 +477,7 @@ impl PostgresRedoManager {
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Process a batch of WAL records using bespoken Neon code.
|
||||
///
|
||||
/// Process a batch of WAL records using bespoke Neon code.
|
||||
fn apply_batch_neon(
|
||||
&self,
|
||||
key: Key,
|
||||
@@ -471,7 +498,7 @@ impl PostgresRedoManager {
|
||||
|
||||
// Apply all the WAL records in the batch
|
||||
for (record_lsn, record) in records.iter() {
|
||||
self.apply_record_neon(key, &mut page, *record_lsn, record)?;
|
||||
apply_neon::apply_in_neon(record, *record_lsn, key, &mut page)?;
|
||||
}
|
||||
// Success!
|
||||
let duration = start_time.elapsed();
|
||||
@@ -488,18 +515,6 @@ impl PostgresRedoManager {
|
||||
|
||||
Ok(page.freeze())
|
||||
}
|
||||
|
||||
fn apply_record_neon(
|
||||
&self,
|
||||
key: Key,
|
||||
page: &mut BytesMut,
|
||||
record_lsn: Lsn,
|
||||
record: &NeonWalRecord,
|
||||
) -> anyhow::Result<()> {
|
||||
apply_neon::apply_in_neon(record, record_lsn, key, page)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
use crate::pgdatadir_mapping::AuxFilesDirectory;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Context;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::BytesMut;
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::reltag::SlruKind;
|
||||
use postgres_ffi::pg_constants;
|
||||
@@ -12,6 +13,7 @@ use postgres_ffi::v14::nonrelfile_utils::{
|
||||
};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use tracing::*;
|
||||
use utils::bin_ser::BeSer;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// Can this request be served by neon redo functions
|
||||
@@ -234,8 +236,13 @@ pub(crate) fn apply_in_neon(
|
||||
LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid);
|
||||
}
|
||||
}
|
||||
NeonWalRecord::AuxFile { .. } => {
|
||||
// No-op: this record will never be created in aux v2.
|
||||
NeonWalRecord::AuxFile { file_path, content } => {
|
||||
let mut dir = AuxFilesDirectory::des(page)?;
|
||||
dir.upsert(file_path.clone(), content.clone());
|
||||
|
||||
page.clear();
|
||||
let mut writer = page.writer();
|
||||
dir.ser_into(&mut writer)?;
|
||||
}
|
||||
#[cfg(test)]
|
||||
NeonWalRecord::Test {
|
||||
@@ -243,7 +250,6 @@ pub(crate) fn apply_in_neon(
|
||||
clear,
|
||||
will_init,
|
||||
} => {
|
||||
use bytes::BufMut;
|
||||
if *will_init {
|
||||
assert!(*clear, "init record must be clear to ensure correctness");
|
||||
}
|
||||
@@ -255,3 +261,59 @@ pub(crate) fn apply_in_neon(
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::key::AUX_FILES_KEY;
|
||||
|
||||
use super::*;
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Test [`apply_in_neon`]'s handling of NeonWalRecord::AuxFile
|
||||
#[test]
|
||||
fn apply_aux_file_deltas() -> anyhow::Result<()> {
|
||||
let base_dir = AuxFilesDirectory {
|
||||
files: HashMap::from([
|
||||
("two".to_string(), Bytes::from_static(b"content0")),
|
||||
("three".to_string(), Bytes::from_static(b"contentX")),
|
||||
]),
|
||||
};
|
||||
let base_image = AuxFilesDirectory::ser(&base_dir)?;
|
||||
|
||||
let deltas = vec![
|
||||
// Insert
|
||||
NeonWalRecord::AuxFile {
|
||||
file_path: "one".to_string(),
|
||||
content: Some(Bytes::from_static(b"content1")),
|
||||
},
|
||||
// Update
|
||||
NeonWalRecord::AuxFile {
|
||||
file_path: "two".to_string(),
|
||||
content: Some(Bytes::from_static(b"content99")),
|
||||
},
|
||||
// Delete
|
||||
NeonWalRecord::AuxFile {
|
||||
file_path: "three".to_string(),
|
||||
content: None,
|
||||
},
|
||||
];
|
||||
|
||||
let file_path = AUX_FILES_KEY;
|
||||
let mut page = BytesMut::from_iter(base_image);
|
||||
|
||||
for record in deltas {
|
||||
apply_in_neon(&record, Lsn(8), file_path, &mut page)?;
|
||||
}
|
||||
|
||||
let reconstructed = AuxFilesDirectory::des(&page)?;
|
||||
let expect = HashMap::from([
|
||||
("one".to_string(), Bytes::from_static(b"content1")),
|
||||
("two".to_string(), Bytes::from_static(b"content99")),
|
||||
]);
|
||||
|
||||
assert_eq!(reconstructed.files, expect);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
69
poetry.lock
generated
69
poetry.lock
generated
@@ -985,43 +985,38 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "cryptography"
|
||||
version = "42.0.4"
|
||||
version = "43.0.1"
|
||||
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:ffc73996c4fca3d2b6c1c8c12bfd3ad00def8621da24f547626bf06441400449"},
|
||||
{file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:db4b65b02f59035037fde0998974d84244a64c3265bdef32a827ab9b63d61b18"},
|
||||
{file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad9c385ba8ee025bb0d856714f71d7840020fe176ae0229de618f14dae7a6e2"},
|
||||
{file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69b22ab6506a3fe483d67d1ed878e1602bdd5912a134e6202c1ec672233241c1"},
|
||||
{file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:e09469a2cec88fb7b078e16d4adec594414397e8879a4341c6ace96013463d5b"},
|
||||
{file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3e970a2119507d0b104f0a8e281521ad28fc26f2820687b3436b8c9a5fcf20d1"},
|
||||
{file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:e53dc41cda40b248ebc40b83b31516487f7db95ab8ceac1f042626bc43a2f992"},
|
||||
{file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:c3a5cbc620e1e17009f30dd34cb0d85c987afd21c41a74352d1719be33380885"},
|
||||
{file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6bfadd884e7280df24d26f2186e4e07556a05d37393b0f220a840b083dc6a824"},
|
||||
{file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:01911714117642a3f1792c7f376db572aadadbafcd8d75bb527166009c9f1d1b"},
|
||||
{file = "cryptography-42.0.4-cp37-abi3-win32.whl", hash = "sha256:fb0cef872d8193e487fc6bdb08559c3aa41b659a7d9be48b2e10747f47863925"},
|
||||
{file = "cryptography-42.0.4-cp37-abi3-win_amd64.whl", hash = "sha256:c1f25b252d2c87088abc8bbc4f1ecbf7c919e05508a7e8628e6875c40bc70923"},
|
||||
{file = "cryptography-42.0.4-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:15a1fb843c48b4a604663fa30af60818cd28f895572386e5f9b8a665874c26e7"},
|
||||
{file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1327f280c824ff7885bdeef8578f74690e9079267c1c8bd7dc5cc5aa065ae52"},
|
||||
{file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ffb03d419edcab93b4b19c22ee80c007fb2d708429cecebf1dd3258956a563a"},
|
||||
{file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:1df6fcbf60560d2113b5ed90f072dc0b108d64750d4cbd46a21ec882c7aefce9"},
|
||||
{file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:44a64043f743485925d3bcac548d05df0f9bb445c5fcca6681889c7c3ab12764"},
|
||||
{file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3c6048f217533d89f2f8f4f0fe3044bf0b2090453b7b73d0b77db47b80af8dff"},
|
||||
{file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6d0fbe73728c44ca3a241eff9aefe6496ab2656d6e7a4ea2459865f2e8613257"},
|
||||
{file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:887623fe0d70f48ab3f5e4dbf234986b1329a64c066d719432d0698522749929"},
|
||||
{file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ce8613beaffc7c14f091497346ef117c1798c202b01153a8cc7b8e2ebaaf41c0"},
|
||||
{file = "cryptography-42.0.4-cp39-abi3-win32.whl", hash = "sha256:810bcf151caefc03e51a3d61e53335cd5c7316c0a105cc695f0959f2c638b129"},
|
||||
{file = "cryptography-42.0.4-cp39-abi3-win_amd64.whl", hash = "sha256:a0298bdc6e98ca21382afe914c642620370ce0470a01e1bef6dd9b5354c36854"},
|
||||
{file = "cryptography-42.0.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5f8907fcf57392cd917892ae83708761c6ff3c37a8e835d7246ff0ad251d9298"},
|
||||
{file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:12d341bd42cdb7d4937b0cabbdf2a94f949413ac4504904d0cdbdce4a22cbf88"},
|
||||
{file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1cdcdbd117681c88d717437ada72bdd5be9de117f96e3f4d50dab3f59fd9ab20"},
|
||||
{file = "cryptography-42.0.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0e89f7b84f421c56e7ff69f11c441ebda73b8a8e6488d322ef71746224c20fce"},
|
||||
{file = "cryptography-42.0.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f1e85a178384bf19e36779d91ff35c7617c885da487d689b05c1366f9933ad74"},
|
||||
{file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d2a27aca5597c8a71abbe10209184e1a8e91c1fd470b5070a2ea60cafec35bcd"},
|
||||
{file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4e36685cb634af55e0677d435d425043967ac2f3790ec652b2b88ad03b85c27b"},
|
||||
{file = "cryptography-42.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f47be41843200f7faec0683ad751e5ef11b9a56a220d57f300376cd8aba81660"},
|
||||
{file = "cryptography-42.0.4.tar.gz", hash = "sha256:831a4b37accef30cccd34fcb916a5d7b5be3cbbe27268a02832c3e450aea39cb"},
|
||||
{file = "cryptography-43.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:8385d98f6a3bf8bb2d65a73e17ed87a3ba84f6991c155691c51112075f9ffc5d"},
|
||||
{file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27e613d7077ac613e399270253259d9d53872aaf657471473ebfc9a52935c062"},
|
||||
{file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68aaecc4178e90719e95298515979814bda0cbada1256a4485414860bd7ab962"},
|
||||
{file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:de41fd81a41e53267cb020bb3a7212861da53a7d39f863585d13ea11049cf277"},
|
||||
{file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f98bf604c82c416bc829e490c700ca1553eafdf2912a91e23a79d97d9801372a"},
|
||||
{file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:61ec41068b7b74268fa86e3e9e12b9f0c21fcf65434571dbb13d954bceb08042"},
|
||||
{file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:014f58110f53237ace6a408b5beb6c427b64e084eb451ef25a28308270086494"},
|
||||
{file = "cryptography-43.0.1-cp37-abi3-win32.whl", hash = "sha256:2bd51274dcd59f09dd952afb696bf9c61a7a49dfc764c04dd33ef7a6b502a1e2"},
|
||||
{file = "cryptography-43.0.1-cp37-abi3-win_amd64.whl", hash = "sha256:666ae11966643886c2987b3b721899d250855718d6d9ce41b521252a17985f4d"},
|
||||
{file = "cryptography-43.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:ac119bb76b9faa00f48128b7f5679e1d8d437365c5d26f1c2c3f0da4ce1b553d"},
|
||||
{file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1bbcce1a551e262dfbafb6e6252f1ae36a248e615ca44ba302df077a846a8806"},
|
||||
{file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58d4e9129985185a06d849aa6df265bdd5a74ca6e1b736a77959b498e0505b85"},
|
||||
{file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d03a475165f3134f773d1388aeb19c2d25ba88b6a9733c5c590b9ff7bbfa2e0c"},
|
||||
{file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:511f4273808ab590912a93ddb4e3914dfd8a388fed883361b02dea3791f292e1"},
|
||||
{file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:80eda8b3e173f0f247f711eef62be51b599b5d425c429b5d4ca6a05e9e856baa"},
|
||||
{file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:38926c50cff6f533f8a2dae3d7f19541432610d114a70808f0926d5aaa7121e4"},
|
||||
{file = "cryptography-43.0.1-cp39-abi3-win32.whl", hash = "sha256:a575913fb06e05e6b4b814d7f7468c2c660e8bb16d8d5a1faf9b33ccc569dd47"},
|
||||
{file = "cryptography-43.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:d75601ad10b059ec832e78823b348bfa1a59f6b8d545db3a24fd44362a1564cb"},
|
||||
{file = "cryptography-43.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ea25acb556320250756e53f9e20a4177515f012c9eaea17eb7587a8c4d8ae034"},
|
||||
{file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c1332724be35d23a854994ff0b66530119500b6053d0bd3363265f7e5e77288d"},
|
||||
{file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fba1007b3ef89946dbbb515aeeb41e30203b004f0b4b00e5e16078b518563289"},
|
||||
{file = "cryptography-43.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5b43d1ea6b378b54a1dc99dd8a2b5be47658fe9a7ce0a58ff0b55f4b43ef2b84"},
|
||||
{file = "cryptography-43.0.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:88cce104c36870d70c49c7c8fd22885875d950d9ee6ab54df2745f83ba0dc365"},
|
||||
{file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:9d3cdb25fa98afdd3d0892d132b8d7139e2c087da1712041f6b762e4f807cc96"},
|
||||
{file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e710bf40870f4db63c3d7d929aa9e09e4e7ee219e703f949ec4073b4294f6172"},
|
||||
{file = "cryptography-43.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7c05650fe8023c5ed0d46793d4b7d7e6cd9c04e68eabe5b0aeea836e37bdcec2"},
|
||||
{file = "cryptography-43.0.1.tar.gz", hash = "sha256:203e92a75716d8cfb491dc47c79e17d0d9207ccffcbcb35f598fbe463ae3444d"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -1034,7 +1029,7 @@ nox = ["nox"]
|
||||
pep8test = ["check-sdist", "click", "mypy", "ruff"]
|
||||
sdist = ["build"]
|
||||
ssh = ["bcrypt (>=3.1.5)"]
|
||||
test = ["certifi", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
|
||||
test = ["certifi", "cryptography-vectors (==43.0.1)", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
|
||||
test-randomorder = ["pytest-randomly"]
|
||||
|
||||
[[package]]
|
||||
@@ -1110,13 +1105,13 @@ dotenv = ["python-dotenv"]
|
||||
|
||||
[[package]]
|
||||
name = "flask-cors"
|
||||
version = "4.0.1"
|
||||
version = "5.0.0"
|
||||
description = "A Flask extension adding a decorator for CORS support"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "Flask_Cors-4.0.1-py2.py3-none-any.whl", hash = "sha256:f2a704e4458665580c074b714c4627dd5a306b333deb9074d0b1794dfa2fb677"},
|
||||
{file = "flask_cors-4.0.1.tar.gz", hash = "sha256:eeb69b342142fdbf4766ad99357a7f3876a2ceb77689dc10ff912aac06c389e4"},
|
||||
{file = "Flask_Cors-5.0.0-py2.py3-none-any.whl", hash = "sha256:b9e307d082a9261c100d8fb0ba909eec6a228ed1b60a8315fd85f783d61910bc"},
|
||||
{file = "flask_cors-5.0.0.tar.gz", hash = "sha256:5aadb4b950c4e93745034594d9f3ea6591f734bb3662e16e255ffbf5e89c88ef"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
||||
@@ -35,14 +35,17 @@ pub fn new_client() -> ClientWithMiddleware {
|
||||
.build()
|
||||
}
|
||||
|
||||
pub(crate) fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware {
|
||||
pub(crate) fn new_client_with_timeout(
|
||||
request_timeout: Duration,
|
||||
total_retry_duration: Duration,
|
||||
) -> ClientWithMiddleware {
|
||||
let timeout_client = reqwest::ClientBuilder::new()
|
||||
.timeout(default_timout)
|
||||
.timeout(request_timeout)
|
||||
.build()
|
||||
.expect("Failed to create http client with timeout");
|
||||
|
||||
let retry_policy =
|
||||
ExponentialBackoff::builder().build_with_total_retry_duration(default_timout);
|
||||
ExponentialBackoff::builder().build_with_total_retry_duration(total_retry_duration);
|
||||
|
||||
reqwest_middleware::ClientBuilder::new(timeout_client)
|
||||
.with(reqwest_tracing::TracingMiddleware::default())
|
||||
|
||||
@@ -33,7 +33,8 @@ use uuid::{NoContext, Timestamp};
|
||||
|
||||
const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
|
||||
|
||||
const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
|
||||
const HTTP_REPORTING_REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
const HTTP_REPORTING_RETRY_DURATION: Duration = Duration::from_secs(60);
|
||||
|
||||
/// Key that uniquely identifies the object, this metric describes.
|
||||
/// Currently, endpoint_id is enough, but this may change later,
|
||||
@@ -223,7 +224,10 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infall
|
||||
info!("metrics collector has shut down");
|
||||
}
|
||||
|
||||
let http_client = http::new_client_with_timeout(DEFAULT_HTTP_REPORTING_TIMEOUT);
|
||||
let http_client = http::new_client_with_timeout(
|
||||
HTTP_REPORTING_REQUEST_TIMEOUT,
|
||||
HTTP_REPORTING_RETRY_DURATION,
|
||||
);
|
||||
let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();
|
||||
|
||||
let mut prev = Utc::now();
|
||||
|
||||
@@ -86,7 +86,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
}
|
||||
|
||||
/// Subscribe and fetch all the interesting data from the broker.
|
||||
#[instrument(name = "broker pull", skip_all)]
|
||||
#[instrument(name = "broker_pull", skip_all)]
|
||||
async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
|
||||
let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?;
|
||||
|
||||
|
||||
@@ -389,6 +389,25 @@ async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
|
||||
/// Unevict timeline and remove uploaded partial segment(s) from the remote storage.
|
||||
/// Successfull response returns list of segments existed before the deletion.
|
||||
/// Aimed for one-off usage not normally needed.
|
||||
async fn timeline_backup_partial_reset(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let ttid = TenantTimelineId::new(
|
||||
parse_request_param(&request, "tenant_id")?,
|
||||
parse_request_param(&request, "timeline_id")?,
|
||||
);
|
||||
check_permission(&request, Some(ttid.tenant_id))?;
|
||||
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
|
||||
let response = tli
|
||||
.backup_partial_reset()
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
|
||||
/// Used only in tests to hand craft required data.
|
||||
async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let ttid = TenantTimelineId::new(
|
||||
@@ -607,6 +626,10 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
|
||||
.get("/v1/tenant/:tenant_id/timeline/:timeline_id/digest", |r| {
|
||||
request_span(r, timeline_digest_handler)
|
||||
})
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/backup_partial_reset",
|
||||
|r| request_span(r, timeline_backup_partial_reset),
|
||||
)
|
||||
.post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| {
|
||||
request_span(r, record_safekeeper_info)
|
||||
})
|
||||
|
||||
@@ -183,10 +183,10 @@ impl WalResidentTimeline {
|
||||
"Replacing uploaded partial segment in in-mem control file: {replace:?}"
|
||||
);
|
||||
|
||||
let remote_timeline_path = wal_backup::remote_timeline_path(&self.tli.ttid)?;
|
||||
let remote_timeline_path = &self.tli.remote_path;
|
||||
wal_backup::copy_partial_segment(
|
||||
&replace.previous.remote_path(&remote_timeline_path),
|
||||
&replace.current.remote_path(&remote_timeline_path),
|
||||
&replace.previous.remote_path(remote_timeline_path),
|
||||
&replace.current.remote_path(remote_timeline_path),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
@@ -35,7 +35,7 @@ use crate::{
|
||||
|
||||
/// Entrypoint for per timeline task which always runs, checking whether
|
||||
/// recovery for this safekeeper is needed and starting it if so.
|
||||
#[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
|
||||
#[instrument(name = "recovery", skip_all, fields(ttid = %tli.ttid))]
|
||||
pub async fn recovery_main(tli: WalResidentTimeline, conf: SafeKeeperConf) {
|
||||
info!("started");
|
||||
|
||||
|
||||
@@ -875,6 +875,29 @@ where
|
||||
return Ok(Some(AcceptorProposerMessage::AppendResponse(resp)));
|
||||
}
|
||||
|
||||
// Disallow any non-sequential writes, which can result in gaps or
|
||||
// overwrites. If we need to move the pointer, ProposerElected message
|
||||
// should have truncated WAL first accordingly. Note that the first
|
||||
// condition (WAL rewrite) is quite expected in real world; it happens
|
||||
// when walproposer reconnects to safekeeper and writes some more data
|
||||
// while first connection still gets some packets later. It might be
|
||||
// better to not log this as error! above.
|
||||
let write_lsn = self.wal_store.write_lsn();
|
||||
if write_lsn > msg.h.begin_lsn {
|
||||
bail!(
|
||||
"append request rewrites WAL written before, write_lsn={}, msg lsn={}",
|
||||
write_lsn,
|
||||
msg.h.begin_lsn
|
||||
);
|
||||
}
|
||||
if write_lsn < msg.h.begin_lsn && write_lsn != Lsn(0) {
|
||||
bail!(
|
||||
"append request creates gap in written WAL, write_lsn={}, msg lsn={}",
|
||||
write_lsn,
|
||||
msg.h.begin_lsn,
|
||||
);
|
||||
}
|
||||
|
||||
// Now we know that we are in the same term as the proposer,
|
||||
// processing the message.
|
||||
|
||||
@@ -960,10 +983,7 @@ mod tests {
|
||||
use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE};
|
||||
|
||||
use super::*;
|
||||
use crate::{
|
||||
state::{EvictionState, PersistedPeers, TimelinePersistentState},
|
||||
wal_storage::Storage,
|
||||
};
|
||||
use crate::state::{EvictionState, PersistedPeers, TimelinePersistentState};
|
||||
use std::{ops::Deref, str::FromStr, time::Instant};
|
||||
|
||||
// fake storage for tests
|
||||
@@ -1003,6 +1023,10 @@ mod tests {
|
||||
}
|
||||
|
||||
impl wal_storage::Storage for DummyWalStore {
|
||||
fn write_lsn(&self) -> Lsn {
|
||||
self.lsn
|
||||
}
|
||||
|
||||
fn flush_lsn(&self) -> Lsn {
|
||||
self.lsn
|
||||
}
|
||||
@@ -1076,7 +1100,7 @@ mod tests {
|
||||
let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap();
|
||||
|
||||
let mut ar_hdr = AppendRequestHeader {
|
||||
term: 1,
|
||||
term: 2,
|
||||
term_start_lsn: Lsn(3),
|
||||
begin_lsn: Lsn(1),
|
||||
end_lsn: Lsn(2),
|
||||
@@ -1090,24 +1114,29 @@ mod tests {
|
||||
};
|
||||
|
||||
let pem = ProposerElected {
|
||||
term: 1,
|
||||
start_streaming_at: Lsn(3),
|
||||
term_history: TermHistory(vec![TermLsn {
|
||||
term: 1,
|
||||
lsn: Lsn(3),
|
||||
}]),
|
||||
timeline_start_lsn: Lsn(0),
|
||||
term: 2,
|
||||
start_streaming_at: Lsn(1),
|
||||
term_history: TermHistory(vec![
|
||||
TermLsn {
|
||||
term: 1,
|
||||
lsn: Lsn(1),
|
||||
},
|
||||
TermLsn {
|
||||
term: 2,
|
||||
lsn: Lsn(3),
|
||||
},
|
||||
]),
|
||||
timeline_start_lsn: Lsn(1),
|
||||
};
|
||||
sk.process_msg(&ProposerAcceptorMessage::Elected(pem))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// check that AppendRequest before term_start_lsn doesn't switch last_log_term.
|
||||
let resp = sk
|
||||
.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
|
||||
.await;
|
||||
assert!(resp.is_ok());
|
||||
assert_eq!(sk.get_last_log_term(), 0);
|
||||
sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(sk.get_last_log_term(), 1);
|
||||
|
||||
// but record at term_start_lsn does the switch
|
||||
ar_hdr.begin_lsn = Lsn(2);
|
||||
@@ -1116,12 +1145,63 @@ mod tests {
|
||||
h: ar_hdr,
|
||||
wal_data: Bytes::from_static(b"b"),
|
||||
};
|
||||
let resp = sk
|
||||
.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
|
||||
.await;
|
||||
assert!(resp.is_ok());
|
||||
sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %)
|
||||
assert_eq!(sk.get_last_log_term(), 1);
|
||||
sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(sk.get_last_log_term(), 2);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_non_consecutive_write() {
|
||||
let storage = InMemoryState {
|
||||
persisted_state: test_sk_state(),
|
||||
};
|
||||
let wal_store = DummyWalStore { lsn: Lsn(0) };
|
||||
|
||||
let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap();
|
||||
|
||||
let pem = ProposerElected {
|
||||
term: 1,
|
||||
start_streaming_at: Lsn(1),
|
||||
term_history: TermHistory(vec![TermLsn {
|
||||
term: 1,
|
||||
lsn: Lsn(1),
|
||||
}]),
|
||||
timeline_start_lsn: Lsn(1),
|
||||
};
|
||||
sk.process_msg(&ProposerAcceptorMessage::Elected(pem))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let ar_hdr = AppendRequestHeader {
|
||||
term: 1,
|
||||
term_start_lsn: Lsn(3),
|
||||
begin_lsn: Lsn(1),
|
||||
end_lsn: Lsn(2),
|
||||
commit_lsn: Lsn(0),
|
||||
truncate_lsn: Lsn(0),
|
||||
proposer_uuid: [0; 16],
|
||||
};
|
||||
let append_request = AppendRequest {
|
||||
h: ar_hdr.clone(),
|
||||
wal_data: Bytes::from_static(b"b"),
|
||||
};
|
||||
|
||||
// do write ending at 2, it should be ok
|
||||
sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
|
||||
.await
|
||||
.unwrap();
|
||||
let mut ar_hrd2 = ar_hdr.clone();
|
||||
ar_hrd2.begin_lsn = Lsn(4);
|
||||
ar_hrd2.end_lsn = Lsn(5);
|
||||
let append_request = AppendRequest {
|
||||
h: ar_hdr,
|
||||
wal_data: Bytes::from_static(b"b"),
|
||||
};
|
||||
// and now starting at 4, it must fail
|
||||
sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
|
||||
.await
|
||||
.unwrap_err();
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use remote_storage::RemotePath;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::fs::{self};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -36,7 +37,7 @@ use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, Tim
|
||||
use crate::timeline_guard::ResidenceGuard;
|
||||
use crate::timeline_manager::{AtomicStatus, ManagerCtl};
|
||||
use crate::timelines_set::TimelinesSet;
|
||||
use crate::wal_backup::{self};
|
||||
use crate::wal_backup::{self, remote_timeline_path};
|
||||
use crate::wal_backup_partial::PartialRemoteSegment;
|
||||
use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
|
||||
|
||||
@@ -469,6 +470,7 @@ impl From<TimelineError> for ApiError {
|
||||
/// It also holds SharedState and provides mutually exclusive access to it.
|
||||
pub struct Timeline {
|
||||
pub ttid: TenantTimelineId,
|
||||
pub remote_path: RemotePath,
|
||||
|
||||
/// Used to broadcast commit_lsn updates to all background jobs.
|
||||
commit_lsn_watch_tx: watch::Sender<Lsn>,
|
||||
@@ -519,8 +521,10 @@ impl Timeline {
|
||||
let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0);
|
||||
|
||||
let walreceivers = WalReceivers::new();
|
||||
let remote_path = remote_timeline_path(&ttid)?;
|
||||
Ok(Timeline {
|
||||
ttid,
|
||||
remote_path,
|
||||
commit_lsn_watch_tx,
|
||||
commit_lsn_watch_rx,
|
||||
term_flush_lsn_watch_tx,
|
||||
@@ -557,8 +561,10 @@ impl Timeline {
|
||||
TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn);
|
||||
|
||||
let walreceivers = WalReceivers::new();
|
||||
let remote_path = remote_timeline_path(&ttid)?;
|
||||
Ok(Timeline {
|
||||
ttid,
|
||||
remote_path,
|
||||
commit_lsn_watch_tx,
|
||||
commit_lsn_watch_rx,
|
||||
term_flush_lsn_watch_tx,
|
||||
@@ -902,6 +908,10 @@ impl Timeline {
|
||||
|
||||
Ok(WalResidentTimeline::new(self.clone(), guard))
|
||||
}
|
||||
|
||||
pub async fn backup_partial_reset(self: &Arc<Self>) -> Result<Vec<String>> {
|
||||
self.manager_ctl.backup_partial_reset().await
|
||||
}
|
||||
}
|
||||
|
||||
/// This is a guard that allows to read/write disk timeline state.
|
||||
|
||||
@@ -28,28 +28,38 @@ impl Manager {
|
||||
/// - control file is flushed (no next event scheduled)
|
||||
/// - no WAL residence guards
|
||||
/// - no pushes to the broker
|
||||
/// - partial WAL backup is uploaded
|
||||
/// - last partial WAL segment is uploaded
|
||||
/// - all local segments before the uploaded partial are committed and uploaded
|
||||
pub(crate) fn ready_for_eviction(
|
||||
&self,
|
||||
next_event: &Option<tokio::time::Instant>,
|
||||
state: &StateSnapshot,
|
||||
) -> bool {
|
||||
self.backup_task.is_none()
|
||||
let ready = self.backup_task.is_none()
|
||||
&& self.recovery_task.is_none()
|
||||
&& self.wal_removal_task.is_none()
|
||||
&& self.partial_backup_task.is_none()
|
||||
&& self.partial_backup_uploaded.is_some()
|
||||
&& next_event.is_none()
|
||||
&& self.access_service.is_empty()
|
||||
&& !self.tli_broker_active.get()
|
||||
// Partial segment of current flush_lsn is uploaded up to this flush_lsn.
|
||||
&& !wal_backup_partial::needs_uploading(state, &self.partial_backup_uploaded)
|
||||
// And it is the next one after the last removed. Given that local
|
||||
// WAL is removed only after it is uploaded to s3 (and pageserver
|
||||
// advancing remote_consistent_lsn) which happens only after WAL is
|
||||
// committed, true means all this is done.
|
||||
//
|
||||
// This also works for the first segment despite last_removed_segno
|
||||
// being 0 on init because this 0 triggers run of wal_removal_task
|
||||
// on success of which manager updates the horizon.
|
||||
&& self
|
||||
.partial_backup_uploaded
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.flush_lsn
|
||||
.segment_number(self.wal_seg_size)
|
||||
== self.last_removed_segno + 1
|
||||
== self.last_removed_segno + 1;
|
||||
ready
|
||||
}
|
||||
|
||||
/// Evict the timeline to remote storage.
|
||||
@@ -83,7 +93,8 @@ impl Manager {
|
||||
info!("successfully evicted timeline");
|
||||
}
|
||||
|
||||
/// Restore evicted timeline from remote storage.
|
||||
/// Attempt to restore evicted timeline from remote storage; it must be
|
||||
/// offloaded.
|
||||
#[instrument(name = "unevict_timeline", skip_all)]
|
||||
pub(crate) async fn unevict_timeline(&mut self) {
|
||||
assert!(self.is_offloaded);
|
||||
@@ -167,7 +178,7 @@ async fn redownload_partial_segment(
|
||||
partial: &PartialRemoteSegment,
|
||||
) -> anyhow::Result<()> {
|
||||
let tmp_file = mgr.tli.timeline_dir().join("remote_partial.tmp");
|
||||
let remote_segfile = remote_segment_path(mgr, partial)?;
|
||||
let remote_segfile = remote_segment_path(mgr, partial);
|
||||
|
||||
debug!(
|
||||
"redownloading partial segment: {} -> {}",
|
||||
@@ -252,7 +263,7 @@ async fn do_validation(
|
||||
);
|
||||
}
|
||||
|
||||
let remote_segfile = remote_segment_path(mgr, partial)?;
|
||||
let remote_segfile = remote_segment_path(mgr, partial);
|
||||
let mut remote_reader: std::pin::Pin<Box<dyn AsyncRead + Send + Sync>> =
|
||||
wal_backup::read_object(&remote_segfile, 0).await?;
|
||||
|
||||
@@ -279,12 +290,8 @@ fn local_segment_path(mgr: &Manager, partial: &PartialRemoteSegment) -> Utf8Path
|
||||
local_partial_segfile
|
||||
}
|
||||
|
||||
fn remote_segment_path(
|
||||
mgr: &Manager,
|
||||
partial: &PartialRemoteSegment,
|
||||
) -> anyhow::Result<RemotePath> {
|
||||
let remote_timeline_path = wal_backup::remote_timeline_path(&mgr.tli.ttid)?;
|
||||
Ok(partial.remote_path(&remote_timeline_path))
|
||||
fn remote_segment_path(mgr: &Manager, partial: &PartialRemoteSegment) -> RemotePath {
|
||||
partial.remote_path(&mgr.tli.remote_path)
|
||||
}
|
||||
|
||||
/// Compare first `n` bytes of two readers. If the bytes differ, return an error.
|
||||
|
||||
@@ -11,12 +11,14 @@ use std::{
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use futures::channel::oneshot;
|
||||
use postgres_ffi::XLogSegNo;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::{
|
||||
task::{JoinError, JoinHandle},
|
||||
time::Instant,
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, info, info_span, instrument, warn, Instrument};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -33,7 +35,7 @@ use crate::{
|
||||
timeline_guard::{AccessService, GuardId, ResidenceGuard},
|
||||
timelines_set::{TimelineSetGuard, TimelinesSet},
|
||||
wal_backup::{self, WalBackupTaskHandle},
|
||||
wal_backup_partial::{self, PartialRemoteSegment},
|
||||
wal_backup_partial::{self, PartialBackup, PartialRemoteSegment},
|
||||
SafeKeeperConf,
|
||||
};
|
||||
|
||||
@@ -96,6 +98,8 @@ pub enum ManagerCtlMessage {
|
||||
GuardRequest(tokio::sync::oneshot::Sender<anyhow::Result<ResidenceGuard>>),
|
||||
/// Request to drop the guard.
|
||||
GuardDrop(GuardId),
|
||||
/// Request to reset uploaded partial backup state.
|
||||
BackupPartialReset(oneshot::Sender<anyhow::Result<Vec<String>>>),
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ManagerCtlMessage {
|
||||
@@ -103,6 +107,7 @@ impl std::fmt::Debug for ManagerCtlMessage {
|
||||
match self {
|
||||
ManagerCtlMessage::GuardRequest(_) => write!(f, "GuardRequest"),
|
||||
ManagerCtlMessage::GuardDrop(id) => write!(f, "GuardDrop({:?})", id),
|
||||
ManagerCtlMessage::BackupPartialReset(_) => write!(f, "BackupPartialReset"),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -143,6 +148,19 @@ impl ManagerCtl {
|
||||
.and_then(std::convert::identity)
|
||||
}
|
||||
|
||||
/// Request timeline manager to reset uploaded partial segment state and
|
||||
/// wait for the result.
|
||||
pub async fn backup_partial_reset(&self) -> anyhow::Result<Vec<String>> {
|
||||
let (tx, rx) = oneshot::channel();
|
||||
self.manager_tx
|
||||
.send(ManagerCtlMessage::BackupPartialReset(tx))
|
||||
.expect("manager task is not running");
|
||||
match rx.await {
|
||||
Ok(res) => res,
|
||||
Err(_) => anyhow::bail!("timeline manager is gone"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Must be called exactly once to bootstrap the manager.
|
||||
pub fn bootstrap_manager(
|
||||
&self,
|
||||
@@ -181,7 +199,8 @@ pub(crate) struct Manager {
|
||||
pub(crate) wal_removal_task: Option<JoinHandle<anyhow::Result<u64>>>,
|
||||
|
||||
// partial backup
|
||||
pub(crate) partial_backup_task: Option<JoinHandle<Option<PartialRemoteSegment>>>,
|
||||
pub(crate) partial_backup_task:
|
||||
Option<(JoinHandle<Option<PartialRemoteSegment>>, CancellationToken)>,
|
||||
pub(crate) partial_backup_uploaded: Option<PartialRemoteSegment>,
|
||||
|
||||
// misc
|
||||
@@ -302,12 +321,12 @@ pub async fn main_task(
|
||||
_ = sleep_until(&next_event) => {
|
||||
// we were waiting for some event (e.g. cfile save)
|
||||
}
|
||||
res = await_task_finish(&mut mgr.wal_removal_task) => {
|
||||
res = await_task_finish(mgr.wal_removal_task.as_mut()) => {
|
||||
// WAL removal task finished
|
||||
mgr.wal_removal_task = None;
|
||||
mgr.update_wal_removal_end(res);
|
||||
}
|
||||
res = await_task_finish(&mut mgr.partial_backup_task) => {
|
||||
res = await_task_finish(mgr.partial_backup_task.as_mut().map(|(handle, _)| handle)) => {
|
||||
// partial backup task finished
|
||||
mgr.partial_backup_task = None;
|
||||
mgr.update_partial_backup_end(res);
|
||||
@@ -335,8 +354,9 @@ pub async fn main_task(
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(partial_backup_task) = &mut mgr.partial_backup_task {
|
||||
if let Err(e) = partial_backup_task.await {
|
||||
if let Some((handle, cancel)) = &mut mgr.partial_backup_task {
|
||||
cancel.cancel();
|
||||
if let Err(e) = handle.await {
|
||||
warn!("partial backup task failed: {:?}", e);
|
||||
}
|
||||
}
|
||||
@@ -560,11 +580,14 @@ impl Manager {
|
||||
}
|
||||
|
||||
// Get WalResidentTimeline and start partial backup task.
|
||||
self.partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task(
|
||||
let cancel = CancellationToken::new();
|
||||
let handle = tokio::spawn(wal_backup_partial::main_task(
|
||||
self.wal_resident_timeline(),
|
||||
self.conf.clone(),
|
||||
self.global_rate_limiter.clone(),
|
||||
)));
|
||||
cancel.clone(),
|
||||
));
|
||||
self.partial_backup_task = Some((handle, cancel));
|
||||
}
|
||||
|
||||
/// Update the state after partial WAL backup task finished.
|
||||
@@ -579,6 +602,39 @@ impl Manager {
|
||||
}
|
||||
}
|
||||
|
||||
/// Reset partial backup state and remove its remote storage data. Since it
|
||||
/// might concurrently uploading something, cancel the task first.
|
||||
async fn backup_partial_reset(&mut self) -> anyhow::Result<Vec<String>> {
|
||||
info!("resetting partial backup state");
|
||||
// Force unevict timeline if it is evicted before erasing partial backup
|
||||
// state. The intended use of this function is to drop corrupted remote
|
||||
// state; we haven't enabled local files deletion yet anywhere,
|
||||
// so direct switch is safe.
|
||||
if self.is_offloaded {
|
||||
self.tli.switch_to_present().await?;
|
||||
// switch manager state as soon as possible
|
||||
self.is_offloaded = false;
|
||||
}
|
||||
|
||||
if let Some((handle, cancel)) = &mut self.partial_backup_task {
|
||||
cancel.cancel();
|
||||
info!("cancelled partial backup task, awaiting it");
|
||||
// we're going to reset .partial_backup_uploaded to None anyway, so ignore the result
|
||||
handle.await.ok();
|
||||
self.partial_backup_task = None;
|
||||
}
|
||||
|
||||
let tli = self.wal_resident_timeline();
|
||||
let mut partial_backup = PartialBackup::new(tli, self.conf.clone()).await;
|
||||
// Reset might fail e.g. when cfile is already reset but s3 removal
|
||||
// failed, so set manager state to None beforehand. In any case caller
|
||||
// is expected to retry until success.
|
||||
self.partial_backup_uploaded = None;
|
||||
let res = partial_backup.reset().await?;
|
||||
info!("reset is done");
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
/// Handle message arrived from ManagerCtl.
|
||||
async fn handle_message(&mut self, msg: Option<ManagerCtlMessage>) {
|
||||
debug!("received manager message: {:?}", msg);
|
||||
@@ -602,6 +658,16 @@ impl Manager {
|
||||
Some(ManagerCtlMessage::GuardDrop(guard_id)) => {
|
||||
self.access_service.drop_guard(guard_id);
|
||||
}
|
||||
Some(ManagerCtlMessage::BackupPartialReset(tx)) => {
|
||||
info!("resetting uploaded partial backup state");
|
||||
let res = self.backup_partial_reset().await;
|
||||
if let Err(ref e) = res {
|
||||
warn!("failed to reset partial backup state: {:?}", e);
|
||||
}
|
||||
if tx.send(res).is_err() {
|
||||
warn!("failed to send partial backup reset result, receiver dropped");
|
||||
}
|
||||
}
|
||||
None => {
|
||||
// can't happen, we're holding the sender
|
||||
unreachable!();
|
||||
@@ -619,7 +685,11 @@ async fn sleep_until(option: &Option<tokio::time::Instant>) {
|
||||
}
|
||||
}
|
||||
|
||||
async fn await_task_finish<T>(option: &mut Option<JoinHandle<T>>) -> Result<T, JoinError> {
|
||||
/// Future that resolves when the task is finished or never if the task is None.
|
||||
///
|
||||
/// Note: it accepts Option<&mut> instead of &mut Option<> because mapping the
|
||||
/// option to get the latter is hard.
|
||||
async fn await_task_finish<T>(option: Option<&mut JoinHandle<T>>) -> Result<T, JoinError> {
|
||||
if let Some(task) = option {
|
||||
task.await
|
||||
} else {
|
||||
|
||||
@@ -203,7 +203,7 @@ struct WalBackupTask {
|
||||
}
|
||||
|
||||
/// Offload single timeline.
|
||||
#[instrument(name = "WAL backup", skip_all, fields(ttid = %tli.ttid))]
|
||||
#[instrument(name = "wal_backup", skip_all, fields(ttid = %tli.ttid))]
|
||||
async fn backup_task_main(
|
||||
tli: WalResidentTimeline,
|
||||
parallel_jobs: usize,
|
||||
@@ -315,7 +315,7 @@ async fn backup_lsn_range(
|
||||
anyhow::bail!("parallel_jobs must be >= 1");
|
||||
}
|
||||
|
||||
let remote_timeline_path = remote_timeline_path(&timeline.ttid)?;
|
||||
let remote_timeline_path = &timeline.remote_path;
|
||||
let start_lsn = *backup_lsn;
|
||||
let segments = get_segments(start_lsn, end_lsn, wal_seg_size);
|
||||
|
||||
@@ -328,11 +328,7 @@ async fn backup_lsn_range(
|
||||
loop {
|
||||
let added_task = match iter.next() {
|
||||
Some(s) => {
|
||||
uploads.push_back(backup_single_segment(
|
||||
s,
|
||||
timeline_dir,
|
||||
&remote_timeline_path,
|
||||
));
|
||||
uploads.push_back(backup_single_segment(s, timeline_dir, remote_timeline_path));
|
||||
true
|
||||
}
|
||||
None => false,
|
||||
|
||||
@@ -22,6 +22,7 @@ use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
|
||||
use remote_storage::RemotePath;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info, instrument, warn};
|
||||
use utils::{id::NodeId, lsn::Lsn};
|
||||
|
||||
@@ -31,7 +32,7 @@ use crate::{
|
||||
safekeeper::Term,
|
||||
timeline::WalResidentTimeline,
|
||||
timeline_manager::StateSnapshot,
|
||||
wal_backup::{self, remote_timeline_path},
|
||||
wal_backup::{self},
|
||||
SafeKeeperConf,
|
||||
};
|
||||
|
||||
@@ -145,7 +146,7 @@ impl State {
|
||||
}
|
||||
}
|
||||
|
||||
struct PartialBackup {
|
||||
pub struct PartialBackup {
|
||||
wal_seg_size: usize,
|
||||
tli: WalResidentTimeline,
|
||||
conf: SafeKeeperConf,
|
||||
@@ -155,8 +156,25 @@ struct PartialBackup {
|
||||
state: State,
|
||||
}
|
||||
|
||||
// Read-only methods for getting segment names
|
||||
impl PartialBackup {
|
||||
pub async fn new(tli: WalResidentTimeline, conf: SafeKeeperConf) -> PartialBackup {
|
||||
let (_, persistent_state) = tli.get_state().await;
|
||||
let wal_seg_size = tli.get_wal_seg_size().await;
|
||||
|
||||
let local_prefix = tli.get_timeline_dir();
|
||||
let remote_timeline_path = tli.remote_path.clone();
|
||||
|
||||
PartialBackup {
|
||||
wal_seg_size,
|
||||
tli,
|
||||
state: persistent_state.partial_backup,
|
||||
conf,
|
||||
local_prefix,
|
||||
remote_timeline_path,
|
||||
}
|
||||
}
|
||||
|
||||
// Read-only methods for getting segment names
|
||||
fn segno(&self, lsn: Lsn) -> XLogSegNo {
|
||||
lsn.segment_number(self.wal_seg_size)
|
||||
}
|
||||
@@ -297,6 +315,18 @@ impl PartialBackup {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Prepend to the given segments remote prefix and delete them from the
|
||||
// remote storage.
|
||||
async fn delete_segments(&self, segments_to_delete: &Vec<String>) -> anyhow::Result<()> {
|
||||
info!("deleting objects: {:?}", segments_to_delete);
|
||||
let mut objects_to_delete = vec![];
|
||||
for seg in segments_to_delete.iter() {
|
||||
let remote_path = self.remote_timeline_path.join(seg);
|
||||
objects_to_delete.push(remote_path);
|
||||
}
|
||||
wal_backup::delete_objects(&objects_to_delete).await
|
||||
}
|
||||
|
||||
/// Delete all non-Uploaded segments from the remote storage. There should be only one
|
||||
/// Uploaded segment at a time.
|
||||
#[instrument(name = "gc", skip_all)]
|
||||
@@ -329,15 +359,8 @@ impl PartialBackup {
|
||||
);
|
||||
}
|
||||
|
||||
info!("deleting objects: {:?}", segments_to_delete);
|
||||
let mut objects_to_delete = vec![];
|
||||
for seg in segments_to_delete.iter() {
|
||||
let remote_path = self.remote_timeline_path.join(seg);
|
||||
objects_to_delete.push(remote_path);
|
||||
}
|
||||
|
||||
// removing segments from remote storage
|
||||
wal_backup::delete_objects(&objects_to_delete).await?;
|
||||
// execute the deletion
|
||||
self.delete_segments(&segments_to_delete).await?;
|
||||
|
||||
// now we can update the state on disk
|
||||
let new_state = {
|
||||
@@ -349,6 +372,27 @@ impl PartialBackup {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Remove uploaded segment(s) from the state and remote storage. Aimed for
|
||||
/// manual intervention, not normally needed.
|
||||
/// Returns list of segments which potentially existed in the remote storage.
|
||||
pub async fn reset(&mut self) -> anyhow::Result<Vec<String>> {
|
||||
let segments_to_delete = self
|
||||
.state
|
||||
.segments
|
||||
.iter()
|
||||
.map(|seg| seg.name.clone())
|
||||
.collect();
|
||||
|
||||
// First reset cfile state, and only then objects themselves. If the
|
||||
// later fails we might leave some garbage behind; that's ok for this
|
||||
// single time usage.
|
||||
let new_state = State { segments: vec![] };
|
||||
self.commit_state(new_state).await?;
|
||||
|
||||
self.delete_segments(&segments_to_delete).await?;
|
||||
Ok(segments_to_delete)
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if everything is uploaded and partial backup task doesn't need to run.
|
||||
@@ -372,38 +416,21 @@ pub(crate) fn needs_uploading(
|
||||
///
|
||||
/// When there is nothing more to do and the last segment was successfully uploaded, the task
|
||||
/// returns PartialRemoteSegment, to signal readiness for offloading the timeline.
|
||||
#[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))]
|
||||
#[instrument(name = "partial_backup", skip_all, fields(ttid = %tli.ttid))]
|
||||
pub async fn main_task(
|
||||
tli: WalResidentTimeline,
|
||||
conf: SafeKeeperConf,
|
||||
limiter: RateLimiter,
|
||||
cancel: CancellationToken,
|
||||
) -> Option<PartialRemoteSegment> {
|
||||
debug!("started");
|
||||
let await_duration = conf.partial_backup_timeout;
|
||||
let mut first_iteration = true;
|
||||
|
||||
let (_, persistent_state) = tli.get_state().await;
|
||||
let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
|
||||
let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx();
|
||||
let wal_seg_size = tli.get_wal_seg_size().await;
|
||||
|
||||
let local_prefix = tli.get_timeline_dir();
|
||||
let remote_timeline_path = match remote_timeline_path(&tli.ttid) {
|
||||
Ok(path) => path,
|
||||
Err(e) => {
|
||||
error!("failed to create remote path: {:?}", e);
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
let mut backup = PartialBackup {
|
||||
wal_seg_size,
|
||||
tli,
|
||||
state: persistent_state.partial_backup,
|
||||
conf,
|
||||
local_prefix,
|
||||
remote_timeline_path,
|
||||
};
|
||||
let mut backup = PartialBackup::new(tli, conf).await;
|
||||
|
||||
debug!("state: {:?}", backup.state);
|
||||
|
||||
@@ -433,6 +460,10 @@ pub async fn main_task(
|
||||
&& flush_lsn_rx.borrow().term == seg.term
|
||||
{
|
||||
// we have nothing to do, the last segment is already uploaded
|
||||
debug!(
|
||||
"exiting, uploaded up to term={} flush_lsn={} commit_lsn={}",
|
||||
seg.term, seg.flush_lsn, seg.commit_lsn
|
||||
);
|
||||
return Some(seg.clone());
|
||||
}
|
||||
}
|
||||
@@ -444,6 +475,10 @@ pub async fn main_task(
|
||||
info!("timeline canceled");
|
||||
return None;
|
||||
}
|
||||
_ = cancel.cancelled() => {
|
||||
info!("task canceled");
|
||||
return None;
|
||||
}
|
||||
_ = flush_lsn_rx.changed() => {}
|
||||
}
|
||||
}
|
||||
@@ -470,6 +505,10 @@ pub async fn main_task(
|
||||
info!("timeline canceled");
|
||||
return None;
|
||||
}
|
||||
_ = cancel.cancelled() => {
|
||||
info!("task canceled");
|
||||
return None;
|
||||
}
|
||||
_ = commit_lsn_rx.changed() => {}
|
||||
_ = flush_lsn_rx.changed() => {
|
||||
let segno = backup.segno(flush_lsn_rx.borrow().lsn);
|
||||
@@ -492,7 +531,13 @@ pub async fn main_task(
|
||||
}
|
||||
|
||||
// limit concurrent uploads
|
||||
let _upload_permit = limiter.acquire_partial_backup().await;
|
||||
let _upload_permit = tokio::select! {
|
||||
acq = limiter.acquire_partial_backup() => acq,
|
||||
_ = cancel.cancelled() => {
|
||||
info!("task canceled");
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
let prepared = backup.prepare_upload().await;
|
||||
if let Some(seg) = &uploaded_segment {
|
||||
|
||||
@@ -37,6 +37,8 @@ use pq_proto::SystemId;
|
||||
use utils::{id::TenantTimelineId, lsn::Lsn};
|
||||
|
||||
pub trait Storage {
|
||||
// Last written LSN.
|
||||
fn write_lsn(&self) -> Lsn;
|
||||
/// LSN of last durably stored WAL record.
|
||||
fn flush_lsn(&self) -> Lsn;
|
||||
|
||||
@@ -329,6 +331,10 @@ impl PhysicalStorage {
|
||||
}
|
||||
|
||||
impl Storage for PhysicalStorage {
|
||||
// Last written LSN.
|
||||
fn write_lsn(&self) -> Lsn {
|
||||
self.write_lsn
|
||||
}
|
||||
/// flush_lsn returns LSN of last durably stored WAL record.
|
||||
fn flush_lsn(&self) -> Lsn {
|
||||
self.flush_record_lsn
|
||||
|
||||
@@ -175,6 +175,10 @@ impl DiskWALStorage {
|
||||
}
|
||||
|
||||
impl wal_storage::Storage for DiskWALStorage {
|
||||
// Last written LSN.
|
||||
fn write_lsn(&self) -> Lsn {
|
||||
self.write_lsn
|
||||
}
|
||||
/// LSN of last durably stored WAL record.
|
||||
fn flush_lsn(&self) -> Lsn {
|
||||
self.flush_record_lsn
|
||||
|
||||
@@ -0,0 +1,2 @@
|
||||
-- This file should undo anything in `up.sql`
|
||||
DROP TABLE safekeepers;
|
||||
@@ -0,0 +1,15 @@
|
||||
-- started out as a copy of cplane schema, removed the unnecessary columns.
|
||||
CREATE TABLE safekeepers (
|
||||
-- the surrogate identifier defined by control plane database sequence
|
||||
id BIGINT PRIMARY KEY,
|
||||
region_id TEXT NOT NULL,
|
||||
version BIGINT NOT NULL,
|
||||
-- the natural id on whatever cloud platform, not needed in storage controller
|
||||
-- instance_id TEXT UNIQUE NOT NULL,
|
||||
host TEXT NOT NULL,
|
||||
port INTEGER NOT NULL,
|
||||
active BOOLEAN NOT NULL DEFAULT false,
|
||||
-- projects_count INTEGER NOT NULL DEFAULT 0,
|
||||
http_port INTEGER NOT NULL,
|
||||
availability_zone_id TEXT NOT NULL
|
||||
);
|
||||
@@ -2,6 +2,7 @@ use crate::metrics::{
|
||||
HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup,
|
||||
METRICS_REGISTRY,
|
||||
};
|
||||
use crate::persistence::SafekeeperPersistence;
|
||||
use crate::reconciler::ReconcileError;
|
||||
use crate::service::{LeadershipStatus, Service, STARTUP_RECONCILE_TIMEOUT};
|
||||
use anyhow::Context;
|
||||
@@ -101,7 +102,7 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
|
||||
|
||||
let validate_req = json_request::<ValidateRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
json_response(StatusCode::OK, state.service.validate(validate_req))
|
||||
json_response(StatusCode::OK, state.service.validate(validate_req).await?)
|
||||
}
|
||||
|
||||
/// Call into this before attaching a tenant to a pageserver, to acquire a generation number
|
||||
@@ -767,6 +768,55 @@ impl From<ReconcileError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the safekeeper record by instance id, or 404.
|
||||
///
|
||||
/// Not used by anything except manual testing.
|
||||
async fn handle_get_safekeeper(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let id = parse_request_param::<i64>(&req, "id")?;
|
||||
|
||||
let state = get_state(&req);
|
||||
|
||||
let res = state.service.get_safekeeper(id).await;
|
||||
|
||||
match res {
|
||||
Ok(b) => json_response(StatusCode::OK, b),
|
||||
Err(crate::persistence::DatabaseError::Query(diesel::result::Error::NotFound)) => {
|
||||
Err(ApiError::NotFound("unknown instance_id".into()))
|
||||
}
|
||||
Err(other) => Err(other.into()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Used as part of deployment scripts.
|
||||
///
|
||||
/// Assumes information is only relayed to storage controller after first selecting an unique id on
|
||||
/// control plane database, which means we have an id field in the request and payload.
|
||||
async fn handle_upsert_safekeeper(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let body = json_request::<SafekeeperPersistence>(&mut req).await?;
|
||||
let id = parse_request_param::<i64>(&req, "id")?;
|
||||
|
||||
if id != body.id {
|
||||
// it should be repeated
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"id mismatch: url={id:?}, body={:?}",
|
||||
body.id
|
||||
)));
|
||||
}
|
||||
|
||||
let state = get_state(&req);
|
||||
|
||||
state.service.upsert_safekeeper(body).await?;
|
||||
|
||||
Ok(Response::builder()
|
||||
.status(StatusCode::NO_CONTENT)
|
||||
.body(Body::empty())
|
||||
.unwrap())
|
||||
}
|
||||
|
||||
/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
|
||||
/// be allowed to run if Service has finished its initial reconciliation.
|
||||
async fn tenant_service_handler<R, H>(
|
||||
@@ -1127,6 +1177,13 @@ pub fn make_router(
|
||||
.put("/control/v1/step_down", |r| {
|
||||
named_request_span(r, handle_step_down, RequestName("control_v1_step_down"))
|
||||
})
|
||||
.get("/control/v1/safekeeper/:id", |r| {
|
||||
named_request_span(r, handle_get_safekeeper, RequestName("v1_safekeeper"))
|
||||
})
|
||||
.post("/control/v1/safekeeper/:id", |r| {
|
||||
// id is in the body
|
||||
named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper"))
|
||||
})
|
||||
// Tenant operations
|
||||
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
|
||||
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
|
||||
|
||||
@@ -11,8 +11,8 @@ use storage_controller::metrics::preinitialize_metrics;
|
||||
use storage_controller::persistence::Persistence;
|
||||
use storage_controller::service::chaos_injector::ChaosInjector;
|
||||
use storage_controller::service::{
|
||||
Config, Service, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT,
|
||||
RECONCILER_CONCURRENCY_DEFAULT,
|
||||
Config, Service, HEARTBEAT_INTERVAL_DEFAULT, MAX_OFFLINE_INTERVAL_DEFAULT,
|
||||
MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT,
|
||||
};
|
||||
use tokio::signal::unix::SignalKind;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -104,6 +104,10 @@ struct Cli {
|
||||
// a pageserver
|
||||
#[arg(long)]
|
||||
max_secondary_lag_bytes: Option<u64>,
|
||||
|
||||
// Period with which to send heartbeats to registered nodes
|
||||
#[arg(long)]
|
||||
heartbeat_interval: Option<humantime::Duration>,
|
||||
}
|
||||
|
||||
enum StrictMode {
|
||||
@@ -285,6 +289,10 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
split_threshold: args.split_threshold,
|
||||
neon_local_repo_dir: args.neon_local_repo_dir,
|
||||
max_secondary_lag_bytes: args.max_secondary_lag_bytes,
|
||||
heartbeat_interval: args
|
||||
.heartbeat_interval
|
||||
.map(humantime::Duration::into)
|
||||
.unwrap_or(HEARTBEAT_INTERVAL_DEFAULT),
|
||||
address_for_peers: args.address_for_peers,
|
||||
start_as_candidate: args.start_as_candidate,
|
||||
http_service_port: args.listen.port() as i32,
|
||||
|
||||
@@ -8,6 +8,7 @@ use self::split_state::SplitState;
|
||||
use diesel::pg::PgConnection;
|
||||
use diesel::prelude::*;
|
||||
use diesel::Connection;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::controller_api::MetadataHealthRecord;
|
||||
use pageserver_api::controller_api::ShardSchedulingPolicy;
|
||||
use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
|
||||
@@ -91,7 +92,8 @@ pub(crate) enum DatabaseOperation {
|
||||
Detach,
|
||||
ReAttach,
|
||||
IncrementGeneration,
|
||||
PeekGenerations,
|
||||
TenantGenerations,
|
||||
ShardGenerations,
|
||||
ListTenantShards,
|
||||
InsertTenantShards,
|
||||
UpdateTenantShard,
|
||||
@@ -544,13 +546,13 @@ impl Persistence {
|
||||
/// If the tenant doesn't exist, an empty vector is returned.
|
||||
///
|
||||
/// Output is sorted by shard number
|
||||
pub(crate) async fn peek_generations(
|
||||
pub(crate) async fn tenant_generations(
|
||||
&self,
|
||||
filter_tenant_id: TenantId,
|
||||
) -> Result<Vec<ShardGenerationState>, DatabaseError> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
let rows = self
|
||||
.with_measured_conn(DatabaseOperation::PeekGenerations, move |conn| {
|
||||
.with_measured_conn(DatabaseOperation::TenantGenerations, move |conn| {
|
||||
let result = tenant_shards
|
||||
.filter(tenant_id.eq(filter_tenant_id.to_string()))
|
||||
.select(TenantShardPersistence::as_select())
|
||||
@@ -572,6 +574,64 @@ impl Persistence {
|
||||
.collect())
|
||||
}
|
||||
|
||||
/// Read the generation number of specific tenant shards
|
||||
///
|
||||
/// Output is unsorted. Output may not include values for all inputs, if they are missing in the database.
|
||||
pub(crate) async fn shard_generations(
|
||||
&self,
|
||||
mut tenant_shard_ids: impl Iterator<Item = &TenantShardId>,
|
||||
) -> Result<Vec<(TenantShardId, Option<Generation>)>, DatabaseError> {
|
||||
let mut rows = Vec::with_capacity(tenant_shard_ids.size_hint().0);
|
||||
|
||||
// We will chunk our input to avoid composing arbitrarily long `IN` clauses. Typically we are
|
||||
// called with a single digit number of IDs, but in principle we could be called with tens
|
||||
// of thousands (all the shards on one pageserver) from the generation validation API.
|
||||
loop {
|
||||
// A modest hardcoded chunk size to handle typical cases in a single query but never generate particularly
|
||||
// large query strings.
|
||||
let chunk_ids = tenant_shard_ids.by_ref().take(32);
|
||||
|
||||
// Compose a comma separated list of tuples for matching on (tenant_id, shard_number, shard_count)
|
||||
let in_clause = chunk_ids
|
||||
.map(|tsid| {
|
||||
format!(
|
||||
"('{}', {}, {})",
|
||||
tsid.tenant_id, tsid.shard_number.0, tsid.shard_count.0
|
||||
)
|
||||
})
|
||||
.join(",");
|
||||
|
||||
// We are done when our iterator gives us nothing to filter on
|
||||
if in_clause.is_empty() {
|
||||
break;
|
||||
}
|
||||
|
||||
let chunk_rows = self
|
||||
.with_measured_conn(DatabaseOperation::ShardGenerations, move |conn| {
|
||||
// diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because
|
||||
// the inputs are strongly typed and cannot carry any user-supplied raw string content.
|
||||
let result : Vec<TenantShardPersistence> = diesel::sql_query(
|
||||
format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str()
|
||||
).load(conn)?;
|
||||
|
||||
Ok(result)
|
||||
})
|
||||
.await?;
|
||||
rows.extend(chunk_rows.into_iter())
|
||||
}
|
||||
|
||||
Ok(rows
|
||||
.into_iter()
|
||||
.map(|tsp| {
|
||||
(
|
||||
tsp.get_tenant_shard_id()
|
||||
.expect("Bad tenant ID in database"),
|
||||
tsp.generation.map(|g| Generation::new(g as u32)),
|
||||
)
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
|
||||
#[allow(non_local_definitions)]
|
||||
/// For use when updating a persistent property of a tenant, such as its config or placement_policy.
|
||||
///
|
||||
@@ -938,10 +998,54 @@ impl Persistence {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn safekeeper_get(
|
||||
&self,
|
||||
id: i64,
|
||||
) -> Result<SafekeeperPersistence, DatabaseError> {
|
||||
use crate::schema::safekeepers::dsl::{id as id_column, safekeepers};
|
||||
self.with_conn(move |conn| -> DatabaseResult<SafekeeperPersistence> {
|
||||
Ok(safekeepers
|
||||
.filter(id_column.eq(&id))
|
||||
.select(SafekeeperPersistence::as_select())
|
||||
.get_result(conn)?)
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn safekeeper_upsert(
|
||||
&self,
|
||||
record: SafekeeperPersistence,
|
||||
) -> Result<(), DatabaseError> {
|
||||
use crate::schema::safekeepers::dsl::*;
|
||||
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
let bind = record.as_insert_or_update();
|
||||
|
||||
let inserted_updated = diesel::insert_into(safekeepers)
|
||||
.values(&bind)
|
||||
.on_conflict(id)
|
||||
.do_update()
|
||||
.set(&bind)
|
||||
.execute(conn)?;
|
||||
|
||||
if inserted_updated != 1 {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
"unexpected number of rows ({})",
|
||||
inserted_updated
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
|
||||
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
|
||||
#[derive(
|
||||
QueryableByName, Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq,
|
||||
)]
|
||||
#[diesel(table_name = crate::schema::tenant_shards)]
|
||||
pub(crate) struct TenantShardPersistence {
|
||||
#[serde(default)]
|
||||
@@ -1073,3 +1177,47 @@ pub(crate) struct ControllerPersistence {
|
||||
pub(crate) address: String,
|
||||
pub(crate) started_at: chrono::DateTime<chrono::Utc>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Queryable, Selectable, Eq, PartialEq, Debug, Clone)]
|
||||
#[diesel(table_name = crate::schema::safekeepers)]
|
||||
pub(crate) struct SafekeeperPersistence {
|
||||
pub(crate) id: i64,
|
||||
pub(crate) region_id: String,
|
||||
/// 1 is special, it means just created (not currently posted to storcon).
|
||||
/// Zero or negative is not really expected.
|
||||
/// Otherwise the number from `release-$(number_of_commits_on_branch)` tag.
|
||||
pub(crate) version: i64,
|
||||
pub(crate) host: String,
|
||||
pub(crate) port: i32,
|
||||
pub(crate) active: bool,
|
||||
pub(crate) http_port: i32,
|
||||
pub(crate) availability_zone_id: String,
|
||||
}
|
||||
|
||||
impl SafekeeperPersistence {
|
||||
fn as_insert_or_update(&self) -> InsertUpdateSafekeeper<'_> {
|
||||
InsertUpdateSafekeeper {
|
||||
id: self.id,
|
||||
region_id: &self.region_id,
|
||||
version: self.version,
|
||||
host: &self.host,
|
||||
port: self.port,
|
||||
active: self.active,
|
||||
http_port: self.http_port,
|
||||
availability_zone_id: &self.availability_zone_id,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Insertable, AsChangeset)]
|
||||
#[diesel(table_name = crate::schema::safekeepers)]
|
||||
struct InsertUpdateSafekeeper<'a> {
|
||||
id: i64,
|
||||
region_id: &'a str,
|
||||
version: i64,
|
||||
host: &'a str,
|
||||
port: i32,
|
||||
active: bool,
|
||||
http_port: i32,
|
||||
availability_zone_id: &'a str,
|
||||
}
|
||||
|
||||
@@ -17,6 +17,7 @@ use utils::failpoint_support;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{NodeId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::pausable_failpoint;
|
||||
use utils::sync::gate::GateGuard;
|
||||
|
||||
use crate::compute_hook::{ComputeHook, NotifyError};
|
||||
@@ -593,6 +594,8 @@ impl Reconciler {
|
||||
notify_attempts += 1;
|
||||
}
|
||||
|
||||
pausable_failpoint!("reconciler-live-migrate-post-notify");
|
||||
|
||||
// Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Attached(0), then
|
||||
// this location will be deleted in the general case reconciliation that runs after this.
|
||||
let origin_secondary_conf = build_location_config(
|
||||
|
||||
@@ -45,3 +45,17 @@ diesel::table! {
|
||||
}
|
||||
|
||||
diesel::allow_tables_to_appear_in_same_query!(controllers, metadata_health, nodes, tenant_shards,);
|
||||
|
||||
diesel::table! {
|
||||
safekeepers {
|
||||
id -> Int8,
|
||||
region_id -> Text,
|
||||
version -> Int8,
|
||||
instance_id -> Text,
|
||||
host -> Text,
|
||||
port -> Int4,
|
||||
active -> Bool,
|
||||
http_port -> Int4,
|
||||
availability_zone_id -> Text,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -121,6 +121,9 @@ pub const MAX_OFFLINE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30);
|
||||
/// being handled on the pageserver side.
|
||||
pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300);
|
||||
|
||||
/// How often to send heartbeats to registered nodes?
|
||||
pub const HEARTBEAT_INTERVAL_DEFAULT: Duration = Duration::from_secs(5);
|
||||
|
||||
#[derive(Clone, strum_macros::Display)]
|
||||
enum TenantOperations {
|
||||
Create,
|
||||
@@ -326,6 +329,8 @@ pub struct Config {
|
||||
// upgraded to primary.
|
||||
pub max_secondary_lag_bytes: Option<u64>,
|
||||
|
||||
pub heartbeat_interval: Duration,
|
||||
|
||||
pub address_for_peers: Option<Uri>,
|
||||
|
||||
pub start_as_candidate: bool,
|
||||
@@ -909,9 +914,7 @@ impl Service {
|
||||
async fn spawn_heartbeat_driver(&self) {
|
||||
self.startup_complete.clone().wait().await;
|
||||
|
||||
const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(5);
|
||||
|
||||
let mut interval = tokio::time::interval(HEARTBEAT_INTERVAL);
|
||||
let mut interval = tokio::time::interval(self.config.heartbeat_interval);
|
||||
while !self.cancel.is_cancelled() {
|
||||
tokio::select! {
|
||||
_ = interval.tick() => { }
|
||||
@@ -1851,37 +1854,74 @@ impl Service {
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
pub(crate) fn validate(&self, validate_req: ValidateRequest) -> ValidateResponse {
|
||||
let locked = self.inner.read().unwrap();
|
||||
pub(crate) async fn validate(
|
||||
&self,
|
||||
validate_req: ValidateRequest,
|
||||
) -> Result<ValidateResponse, DatabaseError> {
|
||||
// Fast in-memory check: we may reject validation on anything that doesn't match our
|
||||
// in-memory generation for a shard
|
||||
let in_memory_result = {
|
||||
let mut in_memory_result = Vec::new();
|
||||
let locked = self.inner.read().unwrap();
|
||||
for req_tenant in validate_req.tenants {
|
||||
if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) {
|
||||
let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen));
|
||||
tracing::info!(
|
||||
"handle_validate: {}(gen {}): valid={valid} (latest {:?})",
|
||||
req_tenant.id,
|
||||
req_tenant.gen,
|
||||
tenant_shard.generation
|
||||
);
|
||||
|
||||
in_memory_result.push((req_tenant.id, Generation::new(req_tenant.gen), valid));
|
||||
} else {
|
||||
// This is legal: for example during a shard split the pageserver may still
|
||||
// have deletions in its queue from the old pre-split shard, or after deletion
|
||||
// of a tenant that was busy with compaction/gc while being deleted.
|
||||
tracing::info!(
|
||||
"Refusing deletion validation for missing shard {}",
|
||||
req_tenant.id
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
in_memory_result
|
||||
};
|
||||
|
||||
// Database calls to confirm validity for anything that passed the in-memory check. We must do this
|
||||
// in case of controller split-brain, where some other controller process might have incremented the generation.
|
||||
let db_generations = self
|
||||
.persistence
|
||||
.shard_generations(in_memory_result.iter().filter_map(|i| {
|
||||
if i.2 {
|
||||
Some(&i.0)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}))
|
||||
.await?;
|
||||
let db_generations = db_generations.into_iter().collect::<HashMap<_, _>>();
|
||||
|
||||
let mut response = ValidateResponse {
|
||||
tenants: Vec::new(),
|
||||
};
|
||||
|
||||
for req_tenant in validate_req.tenants {
|
||||
if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) {
|
||||
let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen));
|
||||
tracing::info!(
|
||||
"handle_validate: {}(gen {}): valid={valid} (latest {:?})",
|
||||
req_tenant.id,
|
||||
req_tenant.gen,
|
||||
tenant_shard.generation
|
||||
);
|
||||
response.tenants.push(ValidateResponseTenant {
|
||||
id: req_tenant.id,
|
||||
valid,
|
||||
});
|
||||
for (tenant_shard_id, validate_generation, valid) in in_memory_result.into_iter() {
|
||||
let valid = if valid {
|
||||
let db_generation = db_generations.get(&tenant_shard_id);
|
||||
db_generation == Some(&Some(validate_generation))
|
||||
} else {
|
||||
// After tenant deletion, we may approve any validation. This avoids
|
||||
// spurious warnings on the pageserver if it has pending LSN updates
|
||||
// at the point a deletion happens.
|
||||
response.tenants.push(ValidateResponseTenant {
|
||||
id: req_tenant.id,
|
||||
valid: true,
|
||||
});
|
||||
}
|
||||
// If in-memory state says it's invalid, trust that. It's always safe to fail a validation, at worst
|
||||
// this prevents a pageserver from cleaning up an object in S3.
|
||||
false
|
||||
};
|
||||
|
||||
response.tenants.push(ValidateResponseTenant {
|
||||
id: tenant_shard_id,
|
||||
valid,
|
||||
})
|
||||
}
|
||||
response
|
||||
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_create(
|
||||
@@ -3176,7 +3216,7 @@ impl Service {
|
||||
// run concurrently with reconciliations, and it is not guaranteed that the node we find here
|
||||
// will still be the latest when we're done: we will check generations again at the end of
|
||||
// this function to handle that.
|
||||
let generations = self.persistence.peek_generations(tenant_id).await?;
|
||||
let generations = self.persistence.tenant_generations(tenant_id).await?;
|
||||
|
||||
if generations
|
||||
.iter()
|
||||
@@ -3233,7 +3273,7 @@ impl Service {
|
||||
// Post-check: are all the generations of all the shards the same as they were initially? This proves that
|
||||
// our remote operation executed on the latest generation and is therefore persistent.
|
||||
{
|
||||
let latest_generations = self.persistence.peek_generations(tenant_id).await?;
|
||||
let latest_generations = self.persistence.tenant_generations(tenant_id).await?;
|
||||
if latest_generations
|
||||
.into_iter()
|
||||
.map(
|
||||
@@ -6257,9 +6297,13 @@ impl Service {
|
||||
node_id: NodeId,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<(), OperationError> {
|
||||
// TODO(vlad): Currently this operates on the assumption that all
|
||||
// secondaries are warm. This is not always true (e.g. we just migrated the
|
||||
// tenant). Take that into consideration by checking the secondary status.
|
||||
const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20);
|
||||
const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5);
|
||||
let reconciler_config = ReconcilerConfigBuilder::new()
|
||||
.secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT)
|
||||
.secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT)
|
||||
.build();
|
||||
|
||||
let mut tids_to_promote = self.fill_node_plan(node_id);
|
||||
let mut waiters = Vec::new();
|
||||
|
||||
@@ -6327,9 +6371,11 @@ impl Service {
|
||||
node_id
|
||||
);
|
||||
|
||||
if let Some(waiter) =
|
||||
self.maybe_reconcile_shard(tenant_shard, nodes)
|
||||
{
|
||||
if let Some(waiter) = self.maybe_configured_reconcile_shard(
|
||||
tenant_shard,
|
||||
nodes,
|
||||
reconciler_config,
|
||||
) {
|
||||
waiters.push(waiter);
|
||||
}
|
||||
}
|
||||
@@ -6473,4 +6519,18 @@ impl Service {
|
||||
|
||||
global_observed
|
||||
}
|
||||
|
||||
pub(crate) async fn get_safekeeper(
|
||||
&self,
|
||||
id: i64,
|
||||
) -> Result<crate::persistence::SafekeeperPersistence, DatabaseError> {
|
||||
self.persistence.safekeeper_get(id).await
|
||||
}
|
||||
|
||||
pub(crate) async fn upsert_safekeeper(
|
||||
&self,
|
||||
record: crate::persistence::SafekeeperPersistence,
|
||||
) -> Result<(), DatabaseError> {
|
||||
self.persistence.safekeeper_upsert(record).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -422,7 +422,7 @@ fn stream_objects_with_retries<'a>(
|
||||
let yield_err = if err.is_permanent() {
|
||||
true
|
||||
} else {
|
||||
let backoff_time = 1 << trial.max(5);
|
||||
let backoff_time = 1 << trial.min(5);
|
||||
tokio::time::sleep(Duration::from_secs(backoff_time)).await;
|
||||
trial += 1;
|
||||
trial == MAX_RETRIES - 1
|
||||
@@ -473,7 +473,7 @@ async fn list_objects_with_retries(
|
||||
s3_target.delimiter,
|
||||
DisplayErrorContext(e),
|
||||
);
|
||||
let backoff_time = 1 << trial.max(5);
|
||||
let backoff_time = 1 << trial.min(5);
|
||||
tokio::time::sleep(Duration::from_secs(backoff_time)).await;
|
||||
}
|
||||
}
|
||||
@@ -492,7 +492,7 @@ async fn download_object_with_retries(
|
||||
Ok(response) => response,
|
||||
Err(e) => {
|
||||
error!("Failed to download object for key {key}: {e}");
|
||||
let backoff_time = 1 << trial.max(5);
|
||||
let backoff_time = 1 << trial.min(5);
|
||||
tokio::time::sleep(Duration::from_secs(backoff_time)).await;
|
||||
continue;
|
||||
}
|
||||
@@ -508,7 +508,7 @@ async fn download_object_with_retries(
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to stream object body for key {key}: {e}");
|
||||
let backoff_time = 1 << trial.max(5);
|
||||
let backoff_time = 1 << trial.min(5);
|
||||
tokio::time::sleep(Duration::from_secs(backoff_time)).await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
import os
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from contextlib import _GeneratorContextManager, contextmanager
|
||||
|
||||
@@ -8,6 +10,7 @@ import pytest
|
||||
from _pytest.fixtures import FixtureRequest
|
||||
|
||||
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnv,
|
||||
PgBin,
|
||||
@@ -333,3 +336,26 @@ def neon_with_baseline(request: FixtureRequest) -> PgCompare:
|
||||
fixture = request.getfixturevalue(request.param)
|
||||
assert isinstance(fixture, PgCompare), f"test error: fixture {fixture} is not PgCompare"
|
||||
return fixture
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def sync_after_each_test():
|
||||
# The fixture calls `sync(2)` after each test if `SYNC_AFTER_EACH_TEST` env var is `true`
|
||||
#
|
||||
# In CI, `SYNC_AFTER_EACH_TEST` is set to `true` only for benchmarks (`test_runner/performance`)
|
||||
# that are run on self-hosted runners because some of these tests are pretty write-heavy
|
||||
# and create issues to start the processes within 10s
|
||||
key = "SYNC_AFTER_EACH_TEST"
|
||||
enabled = os.environ.get(key) == "true"
|
||||
|
||||
yield
|
||||
|
||||
if not enabled:
|
||||
# regress test, or running locally
|
||||
return
|
||||
|
||||
start = time.time()
|
||||
# we only run benches on unices, the method might not exist on windows
|
||||
os.sync()
|
||||
elapsed = time.time() - start
|
||||
log.info(f"called sync after test {elapsed=}")
|
||||
|
||||
@@ -24,7 +24,20 @@ from functools import cached_property, partial
|
||||
from itertools import chain, product
|
||||
from pathlib import Path
|
||||
from types import TracebackType
|
||||
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union, cast
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
Dict,
|
||||
Iterable,
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
Type,
|
||||
TypeVar,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
from urllib.parse import quote, urlparse
|
||||
|
||||
import asyncpg
|
||||
@@ -90,6 +103,8 @@ from fixtures.utils import AuxFileStore as AuxFileStore # reexport
|
||||
|
||||
from .neon_api import NeonAPI, NeonApiEndpoint
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
"""
|
||||
This file contains pytest fixtures. A fixture is a test resource that can be
|
||||
summoned by placing its name in the test's arguments.
|
||||
@@ -1164,6 +1179,8 @@ class NeonEnv:
|
||||
"listen_http_addr": f"localhost:{pageserver_port.http}",
|
||||
"pg_auth_type": pg_auth_type,
|
||||
"http_auth_type": http_auth_type,
|
||||
# Default which can be overriden with `NeonEnvBuilder.pageserver_config_override`
|
||||
"availability_zone": "us-east-2a",
|
||||
}
|
||||
if self.pageserver_virtual_file_io_engine is not None:
|
||||
ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine
|
||||
@@ -1192,11 +1209,7 @@ class NeonEnv:
|
||||
|
||||
# Create a corresponding NeonPageserver object
|
||||
self.pageservers.append(
|
||||
NeonPageserver(
|
||||
self,
|
||||
ps_id,
|
||||
port=pageserver_port,
|
||||
)
|
||||
NeonPageserver(self, ps_id, port=pageserver_port, az_id=ps_cfg["availability_zone"])
|
||||
)
|
||||
cfg["pageservers"].append(ps_cfg)
|
||||
|
||||
@@ -2400,6 +2413,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
"listen_http_port": node.service_port.http,
|
||||
"listen_pg_addr": "localhost",
|
||||
"listen_pg_port": node.service_port.pg,
|
||||
"availability_zone_id": node.az_id,
|
||||
}
|
||||
log.info(f"node_register({body})")
|
||||
self.request(
|
||||
@@ -2846,6 +2860,29 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
|
||||
raise AssertionError("unreachable")
|
||||
|
||||
def on_safekeeper_deploy(self, id: int, body: dict[str, Any]):
|
||||
self.request(
|
||||
"POST",
|
||||
f"{self.api}/control/v1/safekeeper/{id}",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
json=body,
|
||||
)
|
||||
|
||||
def get_safekeeper(self, id: int) -> Optional[dict[str, Any]]:
|
||||
try:
|
||||
response = self.request(
|
||||
"GET",
|
||||
f"{self.api}/control/v1/safekeeper/{id}",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
json = response.json()
|
||||
assert isinstance(json, dict)
|
||||
return json
|
||||
except StorageControllerApiException as e:
|
||||
if e.status_code == 404:
|
||||
return None
|
||||
raise e
|
||||
|
||||
def __enter__(self) -> "NeonStorageController":
|
||||
return self
|
||||
|
||||
@@ -2923,10 +2960,11 @@ class NeonPageserver(PgProtocol, LogUtils):
|
||||
|
||||
TEMP_FILE_SUFFIX = "___temp"
|
||||
|
||||
def __init__(self, env: NeonEnv, id: int, port: PageserverPort):
|
||||
def __init__(self, env: NeonEnv, id: int, port: PageserverPort, az_id: str):
|
||||
super().__init__(host="localhost", port=port.pg, user="cloud_admin")
|
||||
self.env = env
|
||||
self.id = id
|
||||
self.az_id = az_id
|
||||
self.running = False
|
||||
self.service_port = port
|
||||
self.version = env.get_binary_version("pageserver")
|
||||
@@ -2963,16 +3001,17 @@ class NeonPageserver(PgProtocol, LogUtils):
|
||||
def config_toml_path(self) -> Path:
|
||||
return self.workdir / "pageserver.toml"
|
||||
|
||||
def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], None]):
|
||||
def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], T]) -> T:
|
||||
"""
|
||||
Edit the pageserver's config toml file in place.
|
||||
"""
|
||||
path = self.config_toml_path
|
||||
with open(path, "r") as f:
|
||||
config = toml.load(f)
|
||||
edit_fn(config)
|
||||
res = edit_fn(config)
|
||||
with open(path, "w") as f:
|
||||
toml.dump(config, f)
|
||||
return res
|
||||
|
||||
def patch_config_toml_nonrecursive(self, patch: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
@@ -4553,6 +4592,8 @@ class Safekeeper(LogUtils):
|
||||
def timeline_dir(self, tenant_id, timeline_id) -> Path:
|
||||
return self.data_dir / str(tenant_id) / str(timeline_id)
|
||||
|
||||
# List partial uploaded segments of this safekeeper. Works only for
|
||||
# RemoteStorageKind.LOCAL_FS.
|
||||
def list_uploaded_segments(self, tenant_id: TenantId, timeline_id: TimelineId):
|
||||
tline_path = (
|
||||
self.env.repo_dir
|
||||
@@ -4562,9 +4603,11 @@ class Safekeeper(LogUtils):
|
||||
/ str(timeline_id)
|
||||
)
|
||||
assert isinstance(self.env.safekeepers_remote_storage, LocalFsStorage)
|
||||
return self._list_segments_in_dir(
|
||||
segs = self._list_segments_in_dir(
|
||||
tline_path, lambda name: ".metadata" not in name and ".___temp" not in name
|
||||
)
|
||||
mysegs = [s for s in segs if f"sk{self.id}" in s]
|
||||
return mysegs
|
||||
|
||||
def list_segments(self, tenant_id, timeline_id) -> List[str]:
|
||||
"""
|
||||
|
||||
@@ -109,9 +109,6 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
|
||||
# controller's attempts to notify the endpoint).
|
||||
".*reconciler.*neon_local notification hook failed.*",
|
||||
".*reconciler.*neon_local error.*",
|
||||
# Neon local does not provide pageserver with an AZ
|
||||
# TODO: remove this once neon local does so
|
||||
".*registering without specific availability zone id.*",
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -174,6 +174,22 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def debug_dump_timeline(
|
||||
self, timeline_id: TimelineId, params: Optional[Dict[str, str]] = None
|
||||
) -> Any:
|
||||
params = params or {}
|
||||
params["timeline_id"] = str(timeline_id)
|
||||
dump = self.debug_dump(params)
|
||||
return dump["timelines"][0]
|
||||
|
||||
def get_partial_backup(self, timeline_id: TimelineId) -> Any:
|
||||
dump = self.debug_dump_timeline(timeline_id, {"dump_control_file": "true"})
|
||||
return dump["control_file"]["partial_backup"]
|
||||
|
||||
def get_eviction_state(self, timeline_id: TimelineId) -> Any:
|
||||
dump = self.debug_dump_timeline(timeline_id, {"dump_control_file": "true"})
|
||||
return dump["control_file"]["eviction_state"]
|
||||
|
||||
def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]:
|
||||
res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body)
|
||||
res.raise_for_status()
|
||||
@@ -228,6 +244,14 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def backup_partial_reset(self, tenant_id: TenantId, timeline_id: TimelineId):
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/backup_partial_reset",
|
||||
json={},
|
||||
)
|
||||
res.raise_for_status()
|
||||
return res.json()
|
||||
|
||||
def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body):
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}",
|
||||
|
||||
76
test_runner/regress/test_aux_files.py
Normal file
76
test_runner/regress/test_aux_files.py
Normal file
@@ -0,0 +1,76 @@
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
AuxFileStore,
|
||||
NeonEnvBuilder,
|
||||
logical_replication_sync,
|
||||
)
|
||||
|
||||
|
||||
def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg):
|
||||
env = neon_env_builder.init_start()
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
tenant_config = client.tenant_config(tenant_id).effective_config
|
||||
tenant_config["switch_aux_file_policy"] = AuxFileStore.V2
|
||||
client.set_tenant_config(tenant_id, tenant_config)
|
||||
# aux file v2 is enabled on the write path, so for now, it should be unset (or null)
|
||||
assert (
|
||||
client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)["last_aux_file_policy"]
|
||||
is None
|
||||
)
|
||||
|
||||
pg_conn = endpoint.connect()
|
||||
cur = pg_conn.cursor()
|
||||
|
||||
cur.execute("create table t(pk integer primary key, payload integer)")
|
||||
cur.execute(
|
||||
"CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120));"
|
||||
)
|
||||
cur.execute("create publication pub1 for table t, replication_example")
|
||||
|
||||
# now start subscriber, aux files will be created at this point. TODO: find better ways of testing aux files (i.e., neon_test_utils)
|
||||
# instead of going through the full logical replication process.
|
||||
vanilla_pg.start()
|
||||
vanilla_pg.safe_psql("create table t(pk integer primary key, payload integer)")
|
||||
vanilla_pg.safe_psql(
|
||||
"CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120), testcolumn1 int, testcolumn2 int, testcolumn3 int);"
|
||||
)
|
||||
connstr = endpoint.connstr().replace("'", "''")
|
||||
log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}")
|
||||
vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
|
||||
|
||||
# Wait logical replication channel to be established
|
||||
logical_replication_sync(vanilla_pg, endpoint)
|
||||
vanilla_pg.stop()
|
||||
endpoint.stop()
|
||||
|
||||
with env.pageserver.http_client() as client:
|
||||
# aux file v2 flag should be enabled at this point
|
||||
assert (
|
||||
client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"]
|
||||
== AuxFileStore.V2
|
||||
)
|
||||
with env.pageserver.http_client() as client:
|
||||
tenant_config = client.tenant_config(tenant_id).effective_config
|
||||
tenant_config["switch_aux_file_policy"] = "V1"
|
||||
client.set_tenant_config(tenant_id, tenant_config)
|
||||
# the flag should still be enabled
|
||||
assert (
|
||||
client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[
|
||||
"last_aux_file_policy"
|
||||
]
|
||||
== AuxFileStore.V2
|
||||
)
|
||||
env.pageserver.restart()
|
||||
with env.pageserver.http_client() as client:
|
||||
# aux file v2 flag should be persisted
|
||||
assert (
|
||||
client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[
|
||||
"last_aux_file_policy"
|
||||
]
|
||||
== AuxFileStore.V2
|
||||
)
|
||||
@@ -142,11 +142,10 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
|
||||
# We will start a pageserver with no control_plane_api set, so it won't be able to self-register
|
||||
env.storage_controller.node_register(env.pageserver)
|
||||
|
||||
replaced_config = env.pageserver.patch_config_toml_nonrecursive(
|
||||
{
|
||||
"control_plane_api": "",
|
||||
}
|
||||
)
|
||||
def remove_control_plane_api_field(config):
|
||||
return config.pop("control_plane_api")
|
||||
|
||||
control_plane_api = env.pageserver.edit_config_toml(remove_control_plane_api_field)
|
||||
env.pageserver.start()
|
||||
env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"})
|
||||
|
||||
@@ -179,7 +178,11 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
env.pageserver.stop()
|
||||
# Starting without the override that disabled control_plane_api
|
||||
env.pageserver.patch_config_toml_nonrecursive(replaced_config)
|
||||
env.pageserver.patch_config_toml_nonrecursive(
|
||||
{
|
||||
"control_plane_api": control_plane_api,
|
||||
}
|
||||
)
|
||||
env.pageserver.start()
|
||||
|
||||
generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False)
|
||||
|
||||
@@ -31,7 +31,7 @@ from fixtures.pageserver.utils import (
|
||||
remote_storage_delete_key,
|
||||
timeline_delete_wait_completed,
|
||||
)
|
||||
from fixtures.pg_version import PgVersion
|
||||
from fixtures.pg_version import PgVersion, run_only_on_default_postgres
|
||||
from fixtures.port_distributor import PortDistributor
|
||||
from fixtures.remote_storage import RemoteStorageKind, s3_storage
|
||||
from fixtures.storage_controller_proxy import StorageControllerProxy
|
||||
@@ -2330,3 +2330,185 @@ def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder)
|
||||
connect=0, # Disable retries: we want to see the 503
|
||||
)
|
||||
).timeline_create(PgVersion.NOT_SET, tenant_id, create_timeline_id)
|
||||
|
||||
|
||||
def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
A correctness edge case: while we are live migrating and a shard's generation is
|
||||
visible to the Reconciler but not to the central Service, the generation validation
|
||||
API should still prevent stale generations from doing deletions.
|
||||
"""
|
||||
neon_env_builder.num_pageservers = 2
|
||||
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
|
||||
TENANT_CONF = {
|
||||
# small checkpointing and compaction targets to ensure we generate many upload operations
|
||||
"checkpoint_distance": 128 * 1024,
|
||||
"compaction_threshold": 1,
|
||||
"compaction_target_size": 128 * 1024,
|
||||
# disable background compaction and GC. We invoke it manually when we want it to happen.
|
||||
"gc_period": "0s",
|
||||
"compaction_period": "0s",
|
||||
}
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
env.neon_cli.create_tenant(tenant_id, timeline_id)
|
||||
env.storage_controller.pageserver_api().set_tenant_config(tenant_id, TENANT_CONF)
|
||||
|
||||
# Write enough data that a compaction would do some work (deleting some L0s)
|
||||
workload = Workload(env, tenant_id, timeline_id)
|
||||
workload.init()
|
||||
workload.write_rows(64)
|
||||
for _i in range(0, 2):
|
||||
workload.churn_rows(64, upload=False)
|
||||
|
||||
# Upload but don't compact
|
||||
origin_pageserver = env.get_tenant_pageserver(tenant_id)
|
||||
dest_ps_id = [p.id for p in env.pageservers if p.id != origin_pageserver.id][0]
|
||||
origin_pageserver.http_client().timeline_checkpoint(
|
||||
tenant_id, timeline_id, wait_until_uploaded=True, compact=False
|
||||
)
|
||||
|
||||
# Start a compaction that will pause on a failpoint.
|
||||
compaction_failpoint = "before-upload-index-pausable"
|
||||
origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "pause"))
|
||||
|
||||
# This failpoint can also cause migration code to time out trying to politely flush
|
||||
# during migrations
|
||||
origin_pageserver.allowed_errors.append(".*Timed out waiting for flush to remote storage.*")
|
||||
|
||||
try:
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
|
||||
compact_fut = executor.submit(
|
||||
origin_pageserver.http_client().timeline_compact,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
wait_until_uploaded=True,
|
||||
)
|
||||
|
||||
# Let the compaction start and then get stuck uploading an index: when we live migrate, the new generation's
|
||||
# index will be initialized from the pre-compaction index, referencing layers that the compaction will try to delete
|
||||
def has_hit_compaction_failpoint():
|
||||
assert origin_pageserver.log_contains(f"at failpoint {compaction_failpoint}")
|
||||
|
||||
wait_until(10, 1, has_hit_compaction_failpoint)
|
||||
|
||||
# While the compaction is running, start a live migration which will pause long enough for the compaction to sleep,
|
||||
# after incrementing generation and attaching the new location
|
||||
migration_failpoint = "reconciler-live-migrate-post-notify"
|
||||
env.storage_controller.configure_failpoints((migration_failpoint, "pause"))
|
||||
migrate_fut = executor.submit(
|
||||
env.storage_controller.tenant_shard_migrate,
|
||||
TenantShardId(tenant_id, 0, 0),
|
||||
dest_ps_id,
|
||||
)
|
||||
|
||||
def has_hit_migration_failpoint():
|
||||
assert env.storage_controller.log_contains(f"at failpoint {migration_failpoint}")
|
||||
|
||||
# Long wait because the migration will have to time out during transition to AttachedStale
|
||||
# before it reaches this point. The timeout is because the AttachedStale transition includes
|
||||
# a flush of remote storage, and if the compaction already enqueued an index upload this cannot
|
||||
# make progress.
|
||||
wait_until(60, 1, has_hit_migration_failpoint)
|
||||
|
||||
# Origin pageserver has succeeded with compaction before the migration completed. It has done all the writes it wanted to do in its own (stale) generation
|
||||
origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "off"))
|
||||
compact_fut.result()
|
||||
origin_pageserver.http_client().deletion_queue_flush(execute=True)
|
||||
|
||||
# Eventually migration completes
|
||||
env.storage_controller.configure_failpoints((migration_failpoint, "off"))
|
||||
migrate_fut.result()
|
||||
except:
|
||||
# Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown
|
||||
env.storage_controller.configure_failpoints((migration_failpoint, "off"))
|
||||
origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "off"))
|
||||
raise
|
||||
|
||||
# Ensure the destination of the migration writes an index, so that if it has corrupt state that is
|
||||
# visible to the scrubber.
|
||||
workload.write_rows(1, upload=False)
|
||||
env.get_pageserver(dest_ps_id).http_client().timeline_checkpoint(
|
||||
tenant_id, timeline_id, wait_until_uploaded=True, compact=False
|
||||
)
|
||||
|
||||
# The destination of the live migration would now have a corrupt index (referencing deleted L0s) if
|
||||
# the controller had not properly applied validation rules.
|
||||
healthy, _summary = env.storage_scrubber.scan_metadata()
|
||||
try:
|
||||
log.info(f"scrubbed, healthy={healthy}")
|
||||
assert healthy
|
||||
except:
|
||||
# On failures, we want to report them FAIL during the test, not as ERROR during teardown
|
||||
neon_env_builder.enable_scrub_on_exit = False
|
||||
raise
|
||||
|
||||
|
||||
@run_only_on_default_postgres("this is like a 'unit test' against storcon db")
|
||||
def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
|
||||
fake_id = 5
|
||||
|
||||
target = env.storage_controller
|
||||
|
||||
assert target.get_safekeeper(fake_id) is None
|
||||
|
||||
body = {
|
||||
"active": True,
|
||||
"id": fake_id,
|
||||
"created_at": "2023-10-25T09:11:25Z",
|
||||
"updated_at": "2024-08-28T11:32:43Z",
|
||||
"region_id": "aws-us-east-2",
|
||||
"host": "safekeeper-333.us-east-2.aws.neon.build",
|
||||
"port": 6401,
|
||||
"http_port": 7676,
|
||||
"version": 5957,
|
||||
"availability_zone_id": "us-east-2b",
|
||||
}
|
||||
|
||||
target.on_safekeeper_deploy(fake_id, body)
|
||||
|
||||
inserted = target.get_safekeeper(fake_id)
|
||||
assert inserted is not None
|
||||
assert eq_safekeeper_records(body, inserted)
|
||||
|
||||
# error out if pk is changed (unexpected)
|
||||
with pytest.raises(StorageControllerApiException) as exc:
|
||||
different_pk = dict(body)
|
||||
different_pk["id"] = 4
|
||||
assert different_pk["id"] != body["id"]
|
||||
target.on_safekeeper_deploy(fake_id, different_pk)
|
||||
assert exc.value.status_code == 400
|
||||
|
||||
inserted_again = target.get_safekeeper(fake_id)
|
||||
assert inserted_again is not None
|
||||
assert eq_safekeeper_records(inserted, inserted_again)
|
||||
|
||||
# the most common case, version goes up:
|
||||
assert isinstance(body["version"], int)
|
||||
body["version"] += 1
|
||||
target.on_safekeeper_deploy(fake_id, body)
|
||||
inserted_now = target.get_safekeeper(fake_id)
|
||||
assert inserted_now is not None
|
||||
|
||||
assert eq_safekeeper_records(body, inserted_now)
|
||||
|
||||
|
||||
def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool:
|
||||
compared = [dict(a), dict(b)]
|
||||
|
||||
masked_keys = ["created_at", "updated_at"]
|
||||
|
||||
for d in compared:
|
||||
# keep deleting these in case we are comparing the body as it will be uploaded by real scripts
|
||||
for key in masked_keys:
|
||||
if key in d:
|
||||
del d[key]
|
||||
|
||||
return compared[0] == compared[1]
|
||||
|
||||
@@ -217,6 +217,13 @@ def test_scrubber_physical_gc_ancestors(
|
||||
workload.init()
|
||||
workload.write_rows(100)
|
||||
|
||||
# Issue a deletion queue flush so that the parent shard can't leave behind layers
|
||||
# that will look like unexpected garbage to the scrubber
|
||||
for pre_split_shard in env.storage_controller.locate(tenant_id):
|
||||
env.get_pageserver(pre_split_shard["node_id"]).http_client().deletion_queue_flush(
|
||||
execute=True
|
||||
)
|
||||
|
||||
new_shard_count = 4
|
||||
assert shard_count is None or new_shard_count > shard_count
|
||||
shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
|
||||
@@ -321,6 +328,10 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder
|
||||
workload.write_rows(100, upload=False)
|
||||
workload.stop()
|
||||
|
||||
# Issue a deletion queue flush so that the parent shard can't leave behind layers
|
||||
# that will look like unexpected garbage to the scrubber
|
||||
env.get_tenant_pageserver(tenant_id).http_client().deletion_queue_flush(execute=True)
|
||||
|
||||
new_shard_count = 4
|
||||
shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
|
||||
for shard in shards:
|
||||
|
||||
@@ -372,8 +372,10 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder):
|
||||
tenant_id: TenantId = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
# Multiple creation requests which race will generate this error
|
||||
# Multiple creation requests which race will generate this error on the pageserver
|
||||
# and storage controller respectively
|
||||
env.pageserver.allowed_errors.append(".*Conflict: Tenant is already being modified.*")
|
||||
env.storage_controller.allowed_errors.append(".*Conflict: Tenant is already being modified.*")
|
||||
|
||||
# Tenant creation requests which arrive out of order will generate complaints about
|
||||
# generation nubmers out of order.
|
||||
|
||||
@@ -733,7 +733,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
# We will run with the limit set to 1, so that once we have one tenant stuck
|
||||
# in a pausable failpoint, the rest are prevented from proceeding through warmup.
|
||||
neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'"
|
||||
neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1"
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
@@ -984,7 +984,7 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
|
||||
def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'"
|
||||
neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1"
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
@@ -1062,7 +1062,7 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder):
|
||||
@pytest.mark.parametrize("activation_method", ["endpoint", "branch", "delete"])
|
||||
def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_method: str):
|
||||
# env.initial_tenant will take up this permit when attaching with lazy because of a failpoint activated after restart
|
||||
neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'"
|
||||
neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1"
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
|
||||
@@ -72,6 +72,17 @@ def wait_lsn_force_checkpoint(
|
||||
wait_lsn_force_checkpoint_at(lsn, tenant_id, timeline_id, ps, pageserver_conn_options)
|
||||
|
||||
|
||||
def wait_lsn_force_checkpoint_at_sk(
|
||||
safekeeper: Safekeeper,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
ps: NeonPageserver,
|
||||
pageserver_conn_options=None,
|
||||
):
|
||||
sk_flush_lsn = safekeeper.get_flush_lsn(tenant_id, timeline_id)
|
||||
wait_lsn_force_checkpoint_at(sk_flush_lsn, tenant_id, timeline_id, ps, pageserver_conn_options)
|
||||
|
||||
|
||||
def wait_lsn_force_checkpoint_at(
|
||||
lsn: Lsn,
|
||||
tenant_id: TenantId,
|
||||
@@ -79,6 +90,10 @@ def wait_lsn_force_checkpoint_at(
|
||||
ps: NeonPageserver,
|
||||
pageserver_conn_options=None,
|
||||
):
|
||||
"""
|
||||
Wait until pageserver receives given lsn, force checkpoint and wait for
|
||||
upload, i.e. remote_consistent_lsn advancement.
|
||||
"""
|
||||
pageserver_conn_options = pageserver_conn_options or {}
|
||||
|
||||
auth_token = None
|
||||
@@ -2330,6 +2345,77 @@ def test_s3_eviction(
|
||||
assert event_metrics_seen
|
||||
|
||||
|
||||
# Test resetting uploaded partial segment state.
|
||||
def test_backup_partial_reset(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.num_safekeepers = 1
|
||||
neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage())
|
||||
# We want to upload/evict quickly, but not too quickly to check that s3 is
|
||||
# empty before next round of upload happens.
|
||||
# Note: this test fails with --delete-offloaded-wal, this is expected.
|
||||
neon_env_builder.safekeeper_extra_opts = [
|
||||
"--enable-offload",
|
||||
"--partial-backup-timeout",
|
||||
"1s",
|
||||
"--control-file-save-interval",
|
||||
"1s",
|
||||
"--eviction-min-resident=1s",
|
||||
]
|
||||
# XXX: pageserver currently connects to safekeeper as long as connection
|
||||
# manager doesn't remove its entry (default lagging_wal_timeout is 10s),
|
||||
# causing uneviction. It should be fixed to not reconnect if last
|
||||
# remote_consistent_lsn is communicated and there is nothing to fetch. Make
|
||||
# value lower to speed up the test.
|
||||
initial_tenant_conf = {
|
||||
"lagging_wal_timeout": "1s",
|
||||
}
|
||||
env = neon_env_builder.init_start(initial_tenant_conf)
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
endpoint = env.endpoints.create("main")
|
||||
endpoint.start()
|
||||
endpoint.safe_psql("create table t(key int, value text)")
|
||||
endpoint.stop()
|
||||
sk = env.safekeepers[0]
|
||||
# eviction won't happen until remote_consistent_lsn catches up.
|
||||
wait_lsn_force_checkpoint_at_sk(sk, tenant_id, timeline_id, env.pageserver)
|
||||
|
||||
http_cli = env.safekeepers[0].http_client()
|
||||
|
||||
# wait until eviction happens
|
||||
def evicted():
|
||||
eviction_state = http_cli.get_eviction_state(timeline_id)
|
||||
log.info(f"eviction_state: {eviction_state}")
|
||||
if isinstance(eviction_state, str) and eviction_state == "Present":
|
||||
raise Exception("eviction didn't happen yet")
|
||||
|
||||
wait_until(30, 1, evicted)
|
||||
# it must have uploaded something
|
||||
uploaded_segs = sk.list_uploaded_segments(tenant_id, timeline_id)
|
||||
log.info(f"uploaded segments before reset: {uploaded_segs}")
|
||||
assert len(uploaded_segs) > 0
|
||||
|
||||
reset_res = http_cli.backup_partial_reset(tenant_id, timeline_id)
|
||||
log.info(f"reset res: {reset_res}")
|
||||
|
||||
# Backup_partial_reset must have reset the state and dropped s3 segment.
|
||||
#
|
||||
# Note: if listing takes more than --partial-backup-timeout test becomes
|
||||
# flaky because file might be reuploaded. With local fs it shouldn't be an
|
||||
# issue, but can add retry if this appears.
|
||||
uploaded_segs = sk.list_uploaded_segments(tenant_id, timeline_id)
|
||||
log.info(f"uploaded segments after reset: {uploaded_segs}")
|
||||
assert len(uploaded_segs) == 0
|
||||
|
||||
# calling second time should be ok
|
||||
http_cli.backup_partial_reset(tenant_id, timeline_id)
|
||||
|
||||
# inserting data should be ok
|
||||
endpoint.start()
|
||||
endpoint.safe_psql("insert into t values(1, 'hehe')")
|
||||
|
||||
|
||||
def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Verify that pulling timeline from a SK with an uploaded partial segment
|
||||
@@ -2357,7 +2443,16 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde
|
||||
"--eviction-min-resident=500ms",
|
||||
]
|
||||
|
||||
env = neon_env_builder.init_start(initial_tenant_conf={"checkpoint_timeout": "100ms"})
|
||||
# XXX: pageserver currently connects to safekeeper as long as connection
|
||||
# manager doesn't remove its entry (default lagging_wal_timeout is 10s),
|
||||
# causing uneviction. It should be fixed to not reconnect if last
|
||||
# remote_consistent_lsn is communicated and there is nothing to fetch. Until
|
||||
# this is fixed make value lower to speed up the test.
|
||||
initial_tenant_conf = {
|
||||
"lagging_wal_timeout": "1s",
|
||||
"checkpoint_timeout": "100ms",
|
||||
}
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=initial_tenant_conf)
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
@@ -2421,7 +2516,7 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde
|
||||
endpoint.start(safekeepers=[2, 3])
|
||||
|
||||
def new_partial_segment_uploaded():
|
||||
segs = src_sk.list_uploaded_segments(tenant_id, timeline_id)
|
||||
segs = dst_sk.list_uploaded_segments(tenant_id, timeline_id)
|
||||
for seg in segs:
|
||||
if "partial" in seg and "sk3" in seg:
|
||||
return seg
|
||||
|
||||
2
vendor/postgres-v14
vendored
2
vendor/postgres-v14
vendored
Submodule vendor/postgres-v14 updated: 48388a5b59...a317b9b5b9
2
vendor/postgres-v15
vendored
2
vendor/postgres-v15
vendored
Submodule vendor/postgres-v15 updated: 8aa1ded772...49d5e576a5
2
vendor/postgres-v16
vendored
2
vendor/postgres-v16
vendored
Submodule vendor/postgres-v16 updated: 95132feffe...d1fabbb548
6
vendor/revisions.json
vendored
6
vendor/revisions.json
vendored
@@ -1,14 +1,14 @@
|
||||
{
|
||||
"v16": [
|
||||
"16.4",
|
||||
"95132feffe277ce84309d93a42e9aadfd2cb0437"
|
||||
"6e9a4ff6249ac02b8175054b7b3f7dfb198be48b"
|
||||
],
|
||||
"v15": [
|
||||
"15.8",
|
||||
"8aa1ded7726d416ac8e02600aad387a353478fc7"
|
||||
"49d5e576a56e4cc59cd6a6a0791b2324b9fa675e"
|
||||
],
|
||||
"v14": [
|
||||
"14.13",
|
||||
"48388a5b597c81c09e28c016650a7156b48717a1"
|
||||
"a317b9b5b96978b49e78986697f3dd80d06f99a7"
|
||||
]
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user