mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-03 13:30:38 +00:00
Compare commits
38 Commits
diko/rcgen
...
arpad/remo
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3bc3f71418 | ||
|
|
19bea5fd0c | ||
|
|
5be94e28c4 | ||
|
|
63a106021a | ||
|
|
9a6ace9bde | ||
|
|
8c77ccfc01 | ||
|
|
cbd2fc2395 | ||
|
|
028a191040 | ||
|
|
8cce27bedb | ||
|
|
90b706cd96 | ||
|
|
057ce115de | ||
|
|
e85607eed8 | ||
|
|
437071888e | ||
|
|
148b3701cf | ||
|
|
daebe50e19 | ||
|
|
e0ee6fbeff | ||
|
|
307fa2ceb7 | ||
|
|
a338984dc7 | ||
|
|
8936a7abd8 | ||
|
|
946e971df8 | ||
|
|
d109bf8c1d | ||
|
|
4f7b2cdd4f | ||
|
|
66f56ddaec | ||
|
|
fd16caa7d0 | ||
|
|
ff5a527167 | ||
|
|
c66444ea15 | ||
|
|
88f01c1ca1 | ||
|
|
a6937a3281 | ||
|
|
3c8565a194 | ||
|
|
979fa0682b | ||
|
|
8884865bca | ||
|
|
4c4e33bc2e | ||
|
|
342607473a | ||
|
|
9c37bfc90a | ||
|
|
52dee408dc | ||
|
|
5487a20b72 | ||
|
|
f06d721a98 | ||
|
|
2e35f23085 |
1
.github/actionlint.yml
vendored
1
.github/actionlint.yml
vendored
@@ -6,6 +6,7 @@ self-hosted-runner:
|
||||
- small
|
||||
- small-metal
|
||||
- small-arm64
|
||||
- unit-perf
|
||||
- us-east-2
|
||||
config-variables:
|
||||
- AWS_ECR_REGION
|
||||
|
||||
@@ -70,6 +70,7 @@ runs:
|
||||
|
||||
- name: Install Allure
|
||||
shell: bash -euxo pipefail {0}
|
||||
working-directory: /tmp
|
||||
run: |
|
||||
if ! which allure; then
|
||||
ALLURE_ZIP=allure-${ALLURE_VERSION}.zip
|
||||
|
||||
4
.github/workflows/build_and_test.yml
vendored
4
.github/workflows/build_and_test.yml
vendored
@@ -284,7 +284,7 @@ jobs:
|
||||
statuses: write
|
||||
contents: write
|
||||
pull-requests: write
|
||||
runs-on: [ self-hosted, small-metal ]
|
||||
runs-on: [ self-hosted, unit-perf ]
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
credentials:
|
||||
@@ -1271,7 +1271,7 @@ jobs:
|
||||
exit 1
|
||||
|
||||
deploy:
|
||||
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
|
||||
needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, trigger-custom-extensions-build-and-wait ]
|
||||
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod`
|
||||
if: ${{ contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) && !failure() && !cancelled() }}
|
||||
permissions:
|
||||
|
||||
12
Cargo.lock
generated
12
Cargo.lock
generated
@@ -2837,6 +2837,7 @@ dependencies = [
|
||||
"utils",
|
||||
"uuid",
|
||||
"workspace_hack",
|
||||
"x509-cert",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5494,6 +5495,17 @@ version = "1.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c707298afce11da2efef2f600116fa93ffa7a032b5d7b628aa17711ec81383ca"
|
||||
|
||||
[[package]]
|
||||
name = "remote_keys"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"aws-config",
|
||||
"aws-sdk-kms",
|
||||
"aws-smithy-types",
|
||||
"utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "remote_storage"
|
||||
version = "0.1.0"
|
||||
|
||||
@@ -30,6 +30,7 @@ members = [
|
||||
"libs/tenant_size_model",
|
||||
"libs/metrics",
|
||||
"libs/postgres_connection",
|
||||
"libs/remote_keys",
|
||||
"libs/remote_storage",
|
||||
"libs/tracing-utils",
|
||||
"libs/postgres_ffi/wal_craft",
|
||||
|
||||
@@ -29,13 +29,12 @@
|
||||
//! ```sh
|
||||
//! compute_ctl -D /var/db/postgres/compute \
|
||||
//! -C 'postgresql://cloud_admin@localhost/postgres' \
|
||||
//! -S /var/db/postgres/specs/current.json \
|
||||
//! -c /var/db/postgres/configs/config.json \
|
||||
//! -b /usr/local/bin/postgres \
|
||||
//! -r http://pg-ext-s3-gateway \
|
||||
//! ```
|
||||
use std::ffi::OsString;
|
||||
use std::fs::File;
|
||||
use std::path::Path;
|
||||
use std::process::exit;
|
||||
use std::sync::mpsc;
|
||||
use std::thread;
|
||||
@@ -43,8 +42,7 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use clap::Parser;
|
||||
use compute_api::responses::ComputeCtlConfig;
|
||||
use compute_api::spec::ComputeSpec;
|
||||
use compute_api::responses::ComputeConfig;
|
||||
use compute_tools::compute::{
|
||||
BUILD_TAG, ComputeNode, ComputeNodeParams, forward_termination_signal,
|
||||
};
|
||||
@@ -118,8 +116,10 @@ struct Cli {
|
||||
#[arg(long)]
|
||||
pub set_disk_quota_for_fs: Option<String>,
|
||||
|
||||
#[arg(short = 'S', long, group = "spec-path")]
|
||||
pub spec_path: Option<OsString>,
|
||||
// TODO(tristan957): remove alias after compatibility tests are no longer
|
||||
// an issue
|
||||
#[arg(short = 'c', long, alias = "spec-path")]
|
||||
pub config: Option<OsString>,
|
||||
|
||||
#[arg(short = 'i', long, group = "compute-id")]
|
||||
pub compute_id: String,
|
||||
@@ -127,8 +127,9 @@ struct Cli {
|
||||
#[arg(
|
||||
short = 'p',
|
||||
long,
|
||||
conflicts_with = "spec-path",
|
||||
value_name = "CONTROL_PLANE_API_BASE_URL"
|
||||
conflicts_with = "config",
|
||||
value_name = "CONTROL_PLANE_API_BASE_URL",
|
||||
requires = "compute-id"
|
||||
)]
|
||||
pub control_plane_uri: Option<String>,
|
||||
}
|
||||
@@ -138,7 +139,7 @@ fn main() -> Result<()> {
|
||||
|
||||
let scenario = failpoint_support::init();
|
||||
|
||||
// For historical reasons, the main thread that processes the spec and launches postgres
|
||||
// For historical reasons, the main thread that processes the config and launches postgres
|
||||
// is synchronous, but we always have this tokio runtime available and we "enter" it so
|
||||
// that you can use tokio::spawn() and tokio::runtime::Handle::current().block_on(...)
|
||||
// from all parts of compute_ctl.
|
||||
@@ -154,7 +155,7 @@ fn main() -> Result<()> {
|
||||
|
||||
let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
|
||||
|
||||
let cli_spec = try_spec_from_cli(&cli)?;
|
||||
let config = get_config(&cli)?;
|
||||
|
||||
let compute_node = ComputeNode::new(
|
||||
ComputeNodeParams {
|
||||
@@ -175,8 +176,7 @@ fn main() -> Result<()> {
|
||||
#[cfg(target_os = "linux")]
|
||||
vm_monitor_addr: cli.vm_monitor_addr,
|
||||
},
|
||||
cli_spec.spec,
|
||||
cli_spec.compute_ctl_config,
|
||||
config,
|
||||
)?;
|
||||
|
||||
let exit_code = compute_node.run()?;
|
||||
@@ -201,27 +201,17 @@ async fn init() -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
|
||||
// First, read spec from the path if provided
|
||||
if let Some(ref spec_path) = cli.spec_path {
|
||||
let file = File::open(Path::new(spec_path))?;
|
||||
return Ok(CliSpecParams {
|
||||
spec: Some(serde_json::from_reader(file)?),
|
||||
compute_ctl_config: ComputeCtlConfig::default(),
|
||||
});
|
||||
fn get_config(cli: &Cli) -> Result<ComputeConfig> {
|
||||
// First, read the config from the path if provided
|
||||
if let Some(ref config) = cli.config {
|
||||
let file = File::open(config)?;
|
||||
return Ok(serde_json::from_reader(&file)?);
|
||||
}
|
||||
|
||||
if cli.control_plane_uri.is_none() {
|
||||
panic!("must specify --control-plane-uri");
|
||||
};
|
||||
|
||||
// If the spec wasn't provided in the CLI arguments, then retrieve it from
|
||||
// If the config wasn't provided in the CLI arguments, then retrieve it from
|
||||
// the control plane
|
||||
match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
|
||||
Ok(resp) => Ok(CliSpecParams {
|
||||
spec: resp.0,
|
||||
compute_ctl_config: resp.1,
|
||||
}),
|
||||
match get_config_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
|
||||
Ok(config) => Ok(config),
|
||||
Err(e) => {
|
||||
error!(
|
||||
"cannot get response from control plane: {}\n\
|
||||
@@ -233,13 +223,6 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
|
||||
}
|
||||
}
|
||||
|
||||
struct CliSpecParams {
|
||||
/// If a spec was provided via CLI or file, the [`ComputeSpec`]
|
||||
spec: Option<ComputeSpec>,
|
||||
#[allow(dead_code)]
|
||||
compute_ctl_config: ComputeCtlConfig,
|
||||
}
|
||||
|
||||
fn deinit_and_exit(exit_code: Option<i32>) -> ! {
|
||||
// Shutdown trace pipeline gracefully, so that it has a chance to send any
|
||||
// pending traces before we exit. Shutting down OTEL tracing provider may
|
||||
|
||||
@@ -11,7 +11,7 @@ use std::{env, fs};
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::{DateTime, Utc};
|
||||
use compute_api::privilege::Privilege;
|
||||
use compute_api::responses::{ComputeCtlConfig, ComputeMetrics, ComputeStatus};
|
||||
use compute_api::responses::{ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus};
|
||||
use compute_api::spec::{
|
||||
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent,
|
||||
};
|
||||
@@ -303,11 +303,7 @@ struct StartVmMonitorResult {
|
||||
}
|
||||
|
||||
impl ComputeNode {
|
||||
pub fn new(
|
||||
params: ComputeNodeParams,
|
||||
cli_spec: Option<ComputeSpec>,
|
||||
compute_ctl_config: ComputeCtlConfig,
|
||||
) -> Result<Self> {
|
||||
pub fn new(params: ComputeNodeParams, config: ComputeConfig) -> Result<Self> {
|
||||
let connstr = params.connstr.as_str();
|
||||
let conn_conf = postgres::config::Config::from_str(connstr)
|
||||
.context("cannot build postgres config from connstr")?;
|
||||
@@ -315,8 +311,8 @@ impl ComputeNode {
|
||||
.context("cannot build tokio postgres config from connstr")?;
|
||||
|
||||
let mut new_state = ComputeState::new();
|
||||
if let Some(cli_spec) = cli_spec {
|
||||
let pspec = ParsedSpec::try_from(cli_spec).map_err(|msg| anyhow::anyhow!(msg))?;
|
||||
if let Some(spec) = config.spec {
|
||||
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
|
||||
new_state.pspec = Some(pspec);
|
||||
}
|
||||
|
||||
@@ -327,7 +323,7 @@ impl ComputeNode {
|
||||
state: Mutex::new(new_state),
|
||||
state_changed: Condvar::new(),
|
||||
ext_download_progress: RwLock::new(HashMap::new()),
|
||||
compute_ctl_config,
|
||||
compute_ctl_config: config.compute_ctl_config,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -634,19 +630,23 @@ impl ComputeNode {
|
||||
});
|
||||
}
|
||||
|
||||
// Configure and start rsyslog for HIPAA if necessary
|
||||
if let ComputeAudit::Hipaa = pspec.spec.audit_log_level {
|
||||
let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
|
||||
if remote_endpoint.is_empty() {
|
||||
anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
|
||||
// Configure and start rsyslog for compliance audit logging
|
||||
match pspec.spec.audit_log_level {
|
||||
ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
|
||||
let remote_endpoint =
|
||||
std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
|
||||
if remote_endpoint.is_empty() {
|
||||
anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
|
||||
}
|
||||
|
||||
let log_directory_path = Path::new(&self.params.pgdata).join("log");
|
||||
let log_directory_path = log_directory_path.to_string_lossy().to_string();
|
||||
configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
|
||||
|
||||
// Launch a background task to clean up the audit logs
|
||||
launch_pgaudit_gc(log_directory_path);
|
||||
}
|
||||
|
||||
let log_directory_path = Path::new(&self.params.pgdata).join("log");
|
||||
let log_directory_path = log_directory_path.to_string_lossy().to_string();
|
||||
configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
|
||||
|
||||
// Launch a background task to clean up the audit logs
|
||||
launch_pgaudit_gc(log_directory_path);
|
||||
_ => {}
|
||||
}
|
||||
|
||||
// Configure and start rsyslog for Postgres logs export
|
||||
|
||||
@@ -178,7 +178,7 @@ pub fn write_postgres_conf(
|
||||
// and don't allow the user or the control plane admin to change them.
|
||||
match spec.audit_log_level {
|
||||
ComputeAudit::Disabled => {}
|
||||
ComputeAudit::Log => {
|
||||
ComputeAudit::Log | ComputeAudit::Base => {
|
||||
writeln!(file, "# Managed by compute_ctl base audit settings: start")?;
|
||||
writeln!(file, "pgaudit.log='ddl,role'")?;
|
||||
// Disable logging of catalog queries to reduce the noise
|
||||
@@ -202,16 +202,20 @@ pub fn write_postgres_conf(
|
||||
}
|
||||
writeln!(file, "# Managed by compute_ctl base audit settings: end")?;
|
||||
}
|
||||
ComputeAudit::Hipaa => {
|
||||
ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
|
||||
writeln!(
|
||||
file,
|
||||
"# Managed by compute_ctl compliance audit settings: begin"
|
||||
)?;
|
||||
// This log level is very verbose
|
||||
// but this is necessary for HIPAA compliance.
|
||||
// Exclude 'misc' category, because it doesn't contain anythig relevant.
|
||||
writeln!(file, "pgaudit.log='all, -misc'")?;
|
||||
writeln!(file, "pgaudit.log_parameter=on")?;
|
||||
// Enable logging of parameters.
|
||||
// This is very verbose and may contain sensitive data.
|
||||
if spec.audit_log_level == ComputeAudit::Full {
|
||||
writeln!(file, "pgaudit.log_parameter=on")?;
|
||||
writeln!(file, "pgaudit.log='all'")?;
|
||||
} else {
|
||||
writeln!(file, "pgaudit.log_parameter=off")?;
|
||||
writeln!(file, "pgaudit.log='all, -misc'")?;
|
||||
}
|
||||
// Disable logging of catalog queries
|
||||
// The catalog doesn't contain sensitive data, so we don't need to audit it.
|
||||
writeln!(file, "pgaudit.log_catalog=off")?;
|
||||
|
||||
@@ -11,7 +11,7 @@ use futures::future::BoxFuture;
|
||||
use http::{Request, Response, StatusCode};
|
||||
use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet};
|
||||
use tower_http::auth::AsyncAuthorizeRequest;
|
||||
use tracing::warn;
|
||||
use tracing::{debug, warn};
|
||||
|
||||
use crate::http::{JsonResponse, extract::RequestId};
|
||||
|
||||
@@ -92,7 +92,7 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
|
||||
if data.claims.compute_id != compute_id {
|
||||
return Err(JsonResponse::error(
|
||||
StatusCode::UNAUTHORIZED,
|
||||
"invalid claims in authorization token",
|
||||
"invalid compute ID in authorization token claims",
|
||||
));
|
||||
}
|
||||
|
||||
@@ -112,12 +112,14 @@ impl Authorize {
|
||||
token: &str,
|
||||
validation: &Validation,
|
||||
) -> Result<TokenData<ComputeClaims>> {
|
||||
debug!("verifying token {}", token);
|
||||
|
||||
for jwk in jwks.keys.iter() {
|
||||
let decoding_key = match DecodingKey::from_jwk(jwk) {
|
||||
Ok(key) => key,
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"Failed to construct decoding key from {}: {}",
|
||||
"failed to construct decoding key from {}: {}",
|
||||
jwk.common.key_id.as_ref().unwrap(),
|
||||
e
|
||||
);
|
||||
@@ -130,7 +132,7 @@ impl Authorize {
|
||||
Ok(data) => return Ok(data),
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"Failed to decode authorization token using {}: {}",
|
||||
"failed to decode authorization token using {}: {}",
|
||||
jwk.common.key_id.as_ref().unwrap(),
|
||||
e
|
||||
);
|
||||
@@ -140,6 +142,6 @@ impl Authorize {
|
||||
}
|
||||
}
|
||||
|
||||
Err(anyhow!("Failed to verify authorization token"))
|
||||
Err(anyhow!("failed to verify authorization token"))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,13 +19,13 @@ pub(crate) static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
// but for all our APIs we defined a 'slug'/method/operationId in the OpenAPI spec.
|
||||
// And it's fair to call it a 'RPC' (Remote Procedure Call).
|
||||
pub enum CPlaneRequestRPC {
|
||||
GetSpec,
|
||||
GetConfig,
|
||||
}
|
||||
|
||||
impl CPlaneRequestRPC {
|
||||
pub fn as_str(&self) -> &str {
|
||||
match self {
|
||||
CPlaneRequestRPC::GetSpec => "GetSpec",
|
||||
CPlaneRequestRPC::GetConfig => "GetConfig",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,9 +3,8 @@ use std::path::Path;
|
||||
|
||||
use anyhow::{Result, anyhow, bail};
|
||||
use compute_api::responses::{
|
||||
ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse,
|
||||
ComputeConfig, ControlPlaneComputeStatus, ControlPlaneConfigResponse,
|
||||
};
|
||||
use compute_api::spec::ComputeSpec;
|
||||
use reqwest::StatusCode;
|
||||
use tokio_postgres::Client;
|
||||
use tracing::{error, info, instrument};
|
||||
@@ -21,7 +20,7 @@ use crate::params::PG_HBA_ALL_MD5;
|
||||
fn do_control_plane_request(
|
||||
uri: &str,
|
||||
jwt: &str,
|
||||
) -> Result<ControlPlaneSpecResponse, (bool, String, String)> {
|
||||
) -> Result<ControlPlaneConfigResponse, (bool, String, String)> {
|
||||
let resp = reqwest::blocking::Client::new()
|
||||
.get(uri)
|
||||
.header("Authorization", format!("Bearer {}", jwt))
|
||||
@@ -29,14 +28,14 @@ fn do_control_plane_request(
|
||||
.map_err(|e| {
|
||||
(
|
||||
true,
|
||||
format!("could not perform spec request to control plane: {:?}", e),
|
||||
format!("could not perform request to control plane: {:?}", e),
|
||||
UNKNOWN_HTTP_STATUS.to_string(),
|
||||
)
|
||||
})?;
|
||||
|
||||
let status = resp.status();
|
||||
match status {
|
||||
StatusCode::OK => match resp.json::<ControlPlaneSpecResponse>() {
|
||||
StatusCode::OK => match resp.json::<ControlPlaneConfigResponse>() {
|
||||
Ok(spec_resp) => Ok(spec_resp),
|
||||
Err(e) => Err((
|
||||
true,
|
||||
@@ -69,40 +68,35 @@ fn do_control_plane_request(
|
||||
}
|
||||
}
|
||||
|
||||
/// Request spec from the control-plane by compute_id. If `NEON_CONTROL_PLANE_TOKEN`
|
||||
/// env variable is set, it will be used for authorization.
|
||||
pub fn get_spec_from_control_plane(
|
||||
base_uri: &str,
|
||||
compute_id: &str,
|
||||
) -> Result<(Option<ComputeSpec>, ComputeCtlConfig)> {
|
||||
/// Request config from the control-plane by compute_id. If
|
||||
/// `NEON_CONTROL_PLANE_TOKEN` env variable is set, it will be used for
|
||||
/// authorization.
|
||||
pub fn get_config_from_control_plane(base_uri: &str, compute_id: &str) -> Result<ComputeConfig> {
|
||||
let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec");
|
||||
let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
|
||||
Ok(v) => v,
|
||||
Err(_) => "".to_string(),
|
||||
};
|
||||
let jwt: String = std::env::var("NEON_CONTROL_PLANE_TOKEN").unwrap_or_default();
|
||||
let mut attempt = 1;
|
||||
|
||||
info!("getting spec from control plane: {}", cp_uri);
|
||||
info!("getting config from control plane: {}", cp_uri);
|
||||
|
||||
// Do 3 attempts to get spec from the control plane using the following logic:
|
||||
// - network error -> then retry
|
||||
// - compute id is unknown or any other error -> bail out
|
||||
// - no spec for compute yet (Empty state) -> return Ok(None)
|
||||
// - got spec -> return Ok(Some(spec))
|
||||
// - got config -> return Ok(Some(config))
|
||||
while attempt < 4 {
|
||||
let result = match do_control_plane_request(&cp_uri, &jwt) {
|
||||
Ok(spec_resp) => {
|
||||
Ok(config_resp) => {
|
||||
CPLANE_REQUESTS_TOTAL
|
||||
.with_label_values(&[
|
||||
CPlaneRequestRPC::GetSpec.as_str(),
|
||||
CPlaneRequestRPC::GetConfig.as_str(),
|
||||
&StatusCode::OK.to_string(),
|
||||
])
|
||||
.inc();
|
||||
match spec_resp.status {
|
||||
ControlPlaneComputeStatus::Empty => Ok((None, spec_resp.compute_ctl_config)),
|
||||
match config_resp.status {
|
||||
ControlPlaneComputeStatus::Empty => Ok(config_resp.into()),
|
||||
ControlPlaneComputeStatus::Attached => {
|
||||
if let Some(spec) = spec_resp.spec {
|
||||
Ok((Some(spec), spec_resp.compute_ctl_config))
|
||||
if config_resp.spec.is_some() {
|
||||
Ok(config_resp.into())
|
||||
} else {
|
||||
bail!("compute is attached, but spec is empty")
|
||||
}
|
||||
@@ -111,7 +105,7 @@ pub fn get_spec_from_control_plane(
|
||||
}
|
||||
Err((retry, msg, status)) => {
|
||||
CPLANE_REQUESTS_TOTAL
|
||||
.with_label_values(&[CPlaneRequestRPC::GetSpec.as_str(), &status])
|
||||
.with_label_values(&[CPlaneRequestRPC::GetConfig.as_str(), &status])
|
||||
.inc();
|
||||
if retry {
|
||||
Err(anyhow!(msg))
|
||||
@@ -122,7 +116,7 @@ pub fn get_spec_from_control_plane(
|
||||
};
|
||||
|
||||
if let Err(e) = &result {
|
||||
error!("attempt {} to get spec failed with: {}", attempt, e);
|
||||
error!("attempt {} to get config failed with: {}", attempt, e);
|
||||
} else {
|
||||
return result;
|
||||
}
|
||||
@@ -133,13 +127,13 @@ pub fn get_spec_from_control_plane(
|
||||
|
||||
// All attempts failed, return error.
|
||||
Err(anyhow::anyhow!(
|
||||
"Exhausted all attempts to retrieve the spec from the control plane"
|
||||
"Exhausted all attempts to retrieve the config from the control plane"
|
||||
))
|
||||
}
|
||||
|
||||
/// Check `pg_hba.conf` and update if needed to allow external connections.
|
||||
pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
|
||||
// XXX: consider making it a part of spec.json
|
||||
// XXX: consider making it a part of config.json
|
||||
let pghba_path = pgdata_path.join("pg_hba.conf");
|
||||
|
||||
if config::line_in_file(&pghba_path, PG_HBA_ALL_MD5)? {
|
||||
@@ -153,7 +147,7 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
|
||||
|
||||
/// Create a standby.signal file
|
||||
pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
|
||||
// XXX: consider making it a part of spec.json
|
||||
// XXX: consider making it a part of config.json
|
||||
let signalfile = pgdata_path.join("standby.signal");
|
||||
|
||||
if !signalfile.exists() {
|
||||
|
||||
@@ -278,12 +278,12 @@ impl ComputeNode {
|
||||
// so that all config operations are audit logged.
|
||||
match spec.audit_log_level
|
||||
{
|
||||
ComputeAudit::Hipaa => {
|
||||
ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
|
||||
phases.push(CreatePgauditExtension);
|
||||
phases.push(CreatePgauditlogtofileExtension);
|
||||
phases.push(DisablePostgresDBPgAudit);
|
||||
}
|
||||
ComputeAudit::Log => {
|
||||
ComputeAudit::Log | ComputeAudit::Base => {
|
||||
phases.push(CreatePgauditExtension);
|
||||
phases.push(DisablePostgresDBPgAudit);
|
||||
}
|
||||
|
||||
@@ -29,7 +29,7 @@
|
||||
//! compute.log - log output of `compute_ctl` and `postgres`
|
||||
//! endpoint.json - serialized `EndpointConf` struct
|
||||
//! postgresql.conf - postgresql settings
|
||||
//! spec.json - passed to `compute_ctl`
|
||||
//! config.json - passed to `compute_ctl`
|
||||
//! pgdata/
|
||||
//! postgresql.conf - copy of postgresql.conf created by `compute_ctl`
|
||||
//! zenith.signal
|
||||
@@ -46,7 +46,9 @@ use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
|
||||
|
||||
use anyhow::{Context, Result, anyhow, bail};
|
||||
use compute_api::requests::ConfigurationRequest;
|
||||
use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse};
|
||||
use compute_api::responses::{
|
||||
ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse,
|
||||
};
|
||||
use compute_api::spec::{
|
||||
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
|
||||
RemoteExtSpec, Role,
|
||||
@@ -619,90 +621,101 @@ impl Endpoint {
|
||||
remote_extensions = None;
|
||||
};
|
||||
|
||||
// Create spec file
|
||||
let mut spec = ComputeSpec {
|
||||
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
|
||||
format_version: 1.0,
|
||||
operation_uuid: None,
|
||||
features: self.features.clone(),
|
||||
swap_size_bytes: None,
|
||||
disk_quota_bytes: None,
|
||||
disable_lfc_resizing: None,
|
||||
cluster: Cluster {
|
||||
cluster_id: None, // project ID: not used
|
||||
name: None, // project name: not used
|
||||
state: None,
|
||||
roles: if create_test_user {
|
||||
vec![Role {
|
||||
// Create config file
|
||||
let config = {
|
||||
let mut spec = ComputeSpec {
|
||||
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
|
||||
format_version: 1.0,
|
||||
operation_uuid: None,
|
||||
features: self.features.clone(),
|
||||
swap_size_bytes: None,
|
||||
disk_quota_bytes: None,
|
||||
disable_lfc_resizing: None,
|
||||
cluster: Cluster {
|
||||
cluster_id: None, // project ID: not used
|
||||
name: None, // project name: not used
|
||||
state: None,
|
||||
roles: if create_test_user {
|
||||
vec![Role {
|
||||
name: PgIdent::from_str("test").unwrap(),
|
||||
encrypted_password: None,
|
||||
options: None,
|
||||
}]
|
||||
} else {
|
||||
Vec::new()
|
||||
},
|
||||
databases: if create_test_user {
|
||||
vec![Database {
|
||||
name: PgIdent::from_str("neondb").unwrap(),
|
||||
owner: PgIdent::from_str("test").unwrap(),
|
||||
options: None,
|
||||
restrict_conn: false,
|
||||
invalid: false,
|
||||
}]
|
||||
} else {
|
||||
Vec::new()
|
||||
},
|
||||
settings: None,
|
||||
postgresql_conf: Some(postgresql_conf.clone()),
|
||||
},
|
||||
delta_operations: None,
|
||||
tenant_id: Some(self.tenant_id),
|
||||
timeline_id: Some(self.timeline_id),
|
||||
project_id: None,
|
||||
branch_id: None,
|
||||
endpoint_id: Some(self.endpoint_id.clone()),
|
||||
mode: self.mode,
|
||||
pageserver_connstring: Some(pageserver_connstring),
|
||||
safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: auth_token.clone(),
|
||||
remote_extensions,
|
||||
pgbouncer_settings: None,
|
||||
shard_stripe_size: Some(shard_stripe_size),
|
||||
local_proxy_config: None,
|
||||
reconfigure_concurrency: self.reconfigure_concurrency,
|
||||
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
|
||||
audit_log_level: ComputeAudit::Disabled,
|
||||
logs_export_host: None::<String>,
|
||||
};
|
||||
|
||||
// this strange code is needed to support respec() in tests
|
||||
if self.cluster.is_some() {
|
||||
debug!("Cluster is already set in the endpoint spec, using it");
|
||||
spec.cluster = self.cluster.clone().unwrap();
|
||||
|
||||
debug!("spec.cluster {:?}", spec.cluster);
|
||||
|
||||
// fill missing fields again
|
||||
if create_test_user {
|
||||
spec.cluster.roles.push(Role {
|
||||
name: PgIdent::from_str("test").unwrap(),
|
||||
encrypted_password: None,
|
||||
options: None,
|
||||
}]
|
||||
} else {
|
||||
Vec::new()
|
||||
},
|
||||
databases: if create_test_user {
|
||||
vec![Database {
|
||||
});
|
||||
spec.cluster.databases.push(Database {
|
||||
name: PgIdent::from_str("neondb").unwrap(),
|
||||
owner: PgIdent::from_str("test").unwrap(),
|
||||
options: None,
|
||||
restrict_conn: false,
|
||||
invalid: false,
|
||||
}]
|
||||
} else {
|
||||
Vec::new()
|
||||
},
|
||||
settings: None,
|
||||
postgresql_conf: Some(postgresql_conf.clone()),
|
||||
},
|
||||
delta_operations: None,
|
||||
tenant_id: Some(self.tenant_id),
|
||||
timeline_id: Some(self.timeline_id),
|
||||
project_id: None,
|
||||
branch_id: None,
|
||||
endpoint_id: Some(self.endpoint_id.clone()),
|
||||
mode: self.mode,
|
||||
pageserver_connstring: Some(pageserver_connstring),
|
||||
safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: auth_token.clone(),
|
||||
remote_extensions,
|
||||
pgbouncer_settings: None,
|
||||
shard_stripe_size: Some(shard_stripe_size),
|
||||
local_proxy_config: None,
|
||||
reconfigure_concurrency: self.reconfigure_concurrency,
|
||||
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
|
||||
audit_log_level: ComputeAudit::Disabled,
|
||||
logs_export_host: None::<String>,
|
||||
});
|
||||
}
|
||||
spec.cluster.postgresql_conf = Some(postgresql_conf);
|
||||
}
|
||||
|
||||
ComputeConfig {
|
||||
spec: Some(spec),
|
||||
compute_ctl_config: ComputeCtlConfig::default(),
|
||||
}
|
||||
};
|
||||
|
||||
// this strange code is needed to support respec() in tests
|
||||
if self.cluster.is_some() {
|
||||
debug!("Cluster is already set in the endpoint spec, using it");
|
||||
spec.cluster = self.cluster.clone().unwrap();
|
||||
|
||||
debug!("spec.cluster {:?}", spec.cluster);
|
||||
|
||||
// fill missing fields again
|
||||
if create_test_user {
|
||||
spec.cluster.roles.push(Role {
|
||||
name: PgIdent::from_str("test").unwrap(),
|
||||
encrypted_password: None,
|
||||
options: None,
|
||||
});
|
||||
spec.cluster.databases.push(Database {
|
||||
name: PgIdent::from_str("neondb").unwrap(),
|
||||
owner: PgIdent::from_str("test").unwrap(),
|
||||
options: None,
|
||||
restrict_conn: false,
|
||||
invalid: false,
|
||||
});
|
||||
}
|
||||
spec.cluster.postgresql_conf = Some(postgresql_conf);
|
||||
}
|
||||
|
||||
// TODO(tristan957): Remove the write to spec.json after compatibility
|
||||
// tests work themselves out
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&config.spec)?)?;
|
||||
let config_path = self.endpoint_path().join("config.json");
|
||||
std::fs::write(config_path, serde_json::to_string_pretty(&config)?)?;
|
||||
|
||||
// Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it.
|
||||
let logfile = std::fs::OpenOptions::new()
|
||||
@@ -710,6 +723,16 @@ impl Endpoint {
|
||||
.append(true)
|
||||
.open(self.endpoint_path().join("compute.log"))?;
|
||||
|
||||
// TODO(tristan957): Remove when compatibility tests are no longer an
|
||||
// issue
|
||||
let old_compute_ctl = {
|
||||
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
|
||||
let help_output = cmd.arg("--help").output()?;
|
||||
let help_output = String::from_utf8_lossy(&help_output.stdout);
|
||||
|
||||
!help_output.contains("--config")
|
||||
};
|
||||
|
||||
// Launch compute_ctl
|
||||
let conn_str = self.connstr("cloud_admin", "postgres");
|
||||
println!("Starting postgres node at '{}'", conn_str);
|
||||
@@ -728,9 +751,18 @@ impl Endpoint {
|
||||
])
|
||||
.args(["--pgdata", self.pgdata().to_str().unwrap()])
|
||||
.args(["--connstr", &conn_str])
|
||||
// TODO(tristan957): Change this to --config when compatibility tests
|
||||
// are no longer an issue
|
||||
.args([
|
||||
"--spec-path",
|
||||
self.endpoint_path().join("spec.json").to_str().unwrap(),
|
||||
self.endpoint_path()
|
||||
.join(if old_compute_ctl {
|
||||
"spec.json"
|
||||
} else {
|
||||
"config.json"
|
||||
})
|
||||
.to_str()
|
||||
.unwrap(),
|
||||
])
|
||||
.args([
|
||||
"--pgbin",
|
||||
@@ -873,10 +905,12 @@ impl Endpoint {
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
safekeepers: Option<Vec<NodeId>>,
|
||||
) -> Result<()> {
|
||||
let mut spec: ComputeSpec = {
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
let file = std::fs::File::open(spec_path)?;
|
||||
serde_json::from_reader(file)?
|
||||
let (mut spec, compute_ctl_config) = {
|
||||
let config_path = self.endpoint_path().join("config.json");
|
||||
let file = std::fs::File::open(config_path)?;
|
||||
let config: ComputeConfig = serde_json::from_reader(file)?;
|
||||
|
||||
(config.spec.unwrap(), config.compute_ctl_config)
|
||||
};
|
||||
|
||||
let postgresql_conf = self.read_postgresql_conf()?;
|
||||
@@ -926,7 +960,7 @@ impl Endpoint {
|
||||
.body(
|
||||
serde_json::to_string(&ConfigurationRequest {
|
||||
spec,
|
||||
compute_ctl_config: ComputeCtlConfig::default(),
|
||||
compute_ctl_config,
|
||||
})
|
||||
.unwrap(),
|
||||
)
|
||||
|
||||
@@ -980,7 +980,7 @@ fn generate_ssl_ca_cert(cert_path: &Path, key_path: &Path) -> anyhow::Result<()>
|
||||
// -out rootCA.crt -keyout rootCA.key
|
||||
let keygen_output = Command::new("openssl")
|
||||
.args([
|
||||
"req", "-x509", "-newkey", "rsa:2048", "-nodes", "-days", "36500",
|
||||
"req", "-x509", "-newkey", "ed25519", "-nodes", "-days", "36500",
|
||||
])
|
||||
.args(["-subj", "/CN=Neon Local CA"])
|
||||
.args(["-out", cert_path.to_str().unwrap()])
|
||||
@@ -1010,7 +1010,7 @@ fn generate_ssl_cert(
|
||||
// -subj "/CN=localhost" -addext "subjectAltName=DNS:localhost,IP:127.0.0.1"
|
||||
let keygen_output = Command::new("openssl")
|
||||
.args(["req", "-new", "-nodes"])
|
||||
.args(["-newkey", "rsa:2048"])
|
||||
.args(["-newkey", "ed25519"])
|
||||
.args(["-subj", "/CN=localhost"])
|
||||
.args(["-addext", "subjectAltName=DNS:localhost,IP:127.0.0.1"])
|
||||
.args(["-keyout", key_path.to_str().unwrap()])
|
||||
|
||||
@@ -535,6 +535,11 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'gc_compaction_enabled' as bool")?,
|
||||
gc_compaction_verification: settings
|
||||
.remove("gc_compaction_verification")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'gc_compaction_verification' as bool")?,
|
||||
gc_compaction_initial_threshold_kb: settings
|
||||
.remove("gc_compaction_initial_threshold_kb")
|
||||
.map(|x| x.parse::<u64>())
|
||||
|
||||
@@ -13,7 +13,9 @@ use pageserver_api::controller_api::{
|
||||
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
|
||||
TenantCreateResponse, TenantLocateResponse,
|
||||
};
|
||||
use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo};
|
||||
use pageserver_api::models::{
|
||||
TenantConfig, TenantConfigRequest, TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
||||
use postgres_backend::AuthType;
|
||||
@@ -82,7 +84,8 @@ impl NeonStorageControllerStopArgs {
|
||||
pub struct AttachHookRequest {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
pub node_id: Option<NodeId>,
|
||||
pub generation_override: Option<i32>,
|
||||
pub generation_override: Option<i32>, // only new tenants
|
||||
pub config: Option<TenantConfig>, // only new tenants
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -805,6 +808,7 @@ impl StorageController {
|
||||
tenant_shard_id,
|
||||
node_id: Some(pageserver_id),
|
||||
generation_override: None,
|
||||
config: None,
|
||||
};
|
||||
|
||||
let response = self
|
||||
|
||||
@@ -11,8 +11,8 @@ generate_id() {
|
||||
|
||||
PG_VERSION=${PG_VERSION:-14}
|
||||
|
||||
SPEC_FILE_ORG=/var/db/postgres/specs/spec.json
|
||||
SPEC_FILE=/tmp/spec.json
|
||||
CONFIG_FILE_ORG=/var/db/postgres/configs/config.json
|
||||
CONFIG_FILE=/tmp/config.json
|
||||
|
||||
echo "Waiting pageserver become ready."
|
||||
while ! nc -z pageserver 6400; do
|
||||
@@ -20,7 +20,7 @@ while ! nc -z pageserver 6400; do
|
||||
done
|
||||
echo "Page server is ready."
|
||||
|
||||
cp ${SPEC_FILE_ORG} ${SPEC_FILE}
|
||||
cp ${CONFIG_FILE_ORG} ${CONFIG_FILE}
|
||||
|
||||
if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then
|
||||
tenant_id=${TENANT_ID}
|
||||
@@ -73,17 +73,27 @@ else
|
||||
ulid_extension=ulid
|
||||
fi
|
||||
echo "Adding pgx_ulid"
|
||||
shared_libraries=$(jq -r '.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${SPEC_FILE})
|
||||
sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${SPEC_FILE}
|
||||
shared_libraries=$(jq -r '.spec.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${CONFIG_FILE})
|
||||
sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${CONFIG_FILE}
|
||||
echo "Overwrite tenant id and timeline id in spec file"
|
||||
sed -i "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE}
|
||||
sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE}
|
||||
sed -i "s/TENANT_ID/${tenant_id}/" ${CONFIG_FILE}
|
||||
sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE}
|
||||
|
||||
cat ${SPEC_FILE}
|
||||
cat ${CONFIG_FILE}
|
||||
|
||||
# TODO(tristan957): Remove these workarounds for backwards compatibility after
|
||||
# the next compute release. That includes these next few lines and the
|
||||
# --spec-path in the compute_ctl invocation.
|
||||
if compute_ctl --help | grep --quiet -- '--config'; then
|
||||
SPEC_PATH="$CONFIG_FILE"
|
||||
else
|
||||
jq '.spec' < "$CONFIG_FILE" > /tmp/spec.json
|
||||
SPEC_PATH=/tmp/spec.json
|
||||
fi
|
||||
|
||||
echo "Start compute node"
|
||||
/usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
|
||||
-C "postgresql://cloud_admin@localhost:55433/postgres" \
|
||||
-b /usr/local/bin/postgres \
|
||||
--compute-id "compute-$RANDOM" \
|
||||
-S ${SPEC_FILE}
|
||||
--spec-path "$SPEC_PATH"
|
||||
|
||||
@@ -0,0 +1,148 @@
|
||||
{
|
||||
"spec": {
|
||||
"format_version": 1.0,
|
||||
|
||||
"timestamp": "2022-10-12T18:00:00.000Z",
|
||||
"operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
|
||||
|
||||
"cluster": {
|
||||
"cluster_id": "docker_compose",
|
||||
"name": "docker_compose_test",
|
||||
"state": "restarted",
|
||||
"roles": [
|
||||
{
|
||||
"name": "cloud_admin",
|
||||
"encrypted_password": "b093c0d3b281ba6da1eacc608620abd8",
|
||||
"options": null
|
||||
}
|
||||
],
|
||||
"databases": [
|
||||
],
|
||||
"settings": [
|
||||
{
|
||||
"name": "fsync",
|
||||
"value": "off",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "wal_level",
|
||||
"value": "logical",
|
||||
"vartype": "enum"
|
||||
},
|
||||
{
|
||||
"name": "wal_log_hints",
|
||||
"value": "on",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "log_connections",
|
||||
"value": "on",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "port",
|
||||
"value": "55433",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "shared_buffers",
|
||||
"value": "1MB",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "max_connections",
|
||||
"value": "100",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "listen_addresses",
|
||||
"value": "0.0.0.0",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "max_wal_senders",
|
||||
"value": "10",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "max_replication_slots",
|
||||
"value": "10",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "wal_sender_timeout",
|
||||
"value": "5s",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "wal_keep_size",
|
||||
"value": "0",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "password_encryption",
|
||||
"value": "md5",
|
||||
"vartype": "enum"
|
||||
},
|
||||
{
|
||||
"name": "restart_after_crash",
|
||||
"value": "off",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "synchronous_standby_names",
|
||||
"value": "walproposer",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "shared_preload_libraries",
|
||||
"value": "neon,pg_cron,timescaledb,pg_stat_statements",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "neon.safekeepers",
|
||||
"value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "neon.timeline_id",
|
||||
"value": "TIMELINE_ID",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "neon.tenant_id",
|
||||
"value": "TENANT_ID",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "neon.pageserver_connstring",
|
||||
"value": "host=pageserver port=6400",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "max_replication_write_lag",
|
||||
"value": "500MB",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "max_replication_flush_lag",
|
||||
"value": "10GB",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "cron.database",
|
||||
"value": "postgres",
|
||||
"vartype": "string"
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
"delta_operations": [
|
||||
]
|
||||
},
|
||||
"compute_ctl_config": {
|
||||
"jwks": {
|
||||
"keys": []
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,141 +0,0 @@
|
||||
{
|
||||
"format_version": 1.0,
|
||||
|
||||
"timestamp": "2022-10-12T18:00:00.000Z",
|
||||
"operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
|
||||
|
||||
"cluster": {
|
||||
"cluster_id": "docker_compose",
|
||||
"name": "docker_compose_test",
|
||||
"state": "restarted",
|
||||
"roles": [
|
||||
{
|
||||
"name": "cloud_admin",
|
||||
"encrypted_password": "b093c0d3b281ba6da1eacc608620abd8",
|
||||
"options": null
|
||||
}
|
||||
],
|
||||
"databases": [
|
||||
],
|
||||
"settings": [
|
||||
{
|
||||
"name": "fsync",
|
||||
"value": "off",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "wal_level",
|
||||
"value": "logical",
|
||||
"vartype": "enum"
|
||||
},
|
||||
{
|
||||
"name": "wal_log_hints",
|
||||
"value": "on",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "log_connections",
|
||||
"value": "on",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "port",
|
||||
"value": "55433",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "shared_buffers",
|
||||
"value": "1MB",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "max_connections",
|
||||
"value": "100",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "listen_addresses",
|
||||
"value": "0.0.0.0",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "max_wal_senders",
|
||||
"value": "10",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "max_replication_slots",
|
||||
"value": "10",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "wal_sender_timeout",
|
||||
"value": "5s",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "wal_keep_size",
|
||||
"value": "0",
|
||||
"vartype": "integer"
|
||||
},
|
||||
{
|
||||
"name": "password_encryption",
|
||||
"value": "md5",
|
||||
"vartype": "enum"
|
||||
},
|
||||
{
|
||||
"name": "restart_after_crash",
|
||||
"value": "off",
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "synchronous_standby_names",
|
||||
"value": "walproposer",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "shared_preload_libraries",
|
||||
"value": "neon,pg_cron,timescaledb,pg_stat_statements",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "neon.safekeepers",
|
||||
"value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "neon.timeline_id",
|
||||
"value": "TIMELINE_ID",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "neon.tenant_id",
|
||||
"value": "TENANT_ID",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "neon.pageserver_connstring",
|
||||
"value": "host=pageserver port=6400",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "max_replication_write_lag",
|
||||
"value": "500MB",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "max_replication_flush_lag",
|
||||
"value": "10GB",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "cron.database",
|
||||
"value": "postgres",
|
||||
"vartype": "string"
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
"delta_operations": [
|
||||
]
|
||||
}
|
||||
@@ -159,7 +159,7 @@ services:
|
||||
#- RUST_BACKTRACE=1
|
||||
# Mount the test files directly, for faster editing cycle.
|
||||
volumes:
|
||||
- ./compute_wrapper/var/db/postgres/specs/:/var/db/postgres/specs/
|
||||
- ./compute_wrapper/var/db/postgres/configs/:/var/db/postgres/configs/
|
||||
- ./compute_wrapper/shell/:/shell/
|
||||
ports:
|
||||
- 55433:55433 # pg protocol handler
|
||||
|
||||
0
explained_queries.sql
Normal file
0
explained_queries.sql
Normal file
@@ -14,6 +14,32 @@ pub struct GenericAPIError {
|
||||
pub error: String,
|
||||
}
|
||||
|
||||
/// All configuration parameters necessary for a compute. When
|
||||
/// [`ComputeConfig::spec`] is provided, it means that the compute is attached
|
||||
/// to a tenant. [`ComputeConfig::compute_ctl_config`] will always be provided
|
||||
/// and contains parameters necessary for operating `compute_ctl` independently
|
||||
/// of whether a tenant is attached to the compute or not.
|
||||
///
|
||||
/// This also happens to be the body of `compute_ctl`'s /configure request.
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
pub struct ComputeConfig {
|
||||
/// The compute spec
|
||||
pub spec: Option<ComputeSpec>,
|
||||
|
||||
/// The compute_ctl configuration
|
||||
#[allow(dead_code)]
|
||||
pub compute_ctl_config: ComputeCtlConfig,
|
||||
}
|
||||
|
||||
impl From<ControlPlaneConfigResponse> for ComputeConfig {
|
||||
fn from(value: ControlPlaneConfigResponse) -> Self {
|
||||
Self {
|
||||
spec: value.spec,
|
||||
compute_ctl_config: value.compute_ctl_config,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct ExtensionInstallResponse {
|
||||
pub extension: PgIdent,
|
||||
@@ -161,7 +187,7 @@ pub struct TlsConfig {
|
||||
|
||||
/// Response of the `/computes/{compute_id}/spec` control-plane API.
|
||||
#[derive(Deserialize, Debug)]
|
||||
pub struct ControlPlaneSpecResponse {
|
||||
pub struct ControlPlaneConfigResponse {
|
||||
pub spec: Option<ComputeSpec>,
|
||||
pub status: ControlPlaneComputeStatus,
|
||||
pub compute_ctl_config: ComputeCtlConfig,
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
//! `ComputeSpec` represents the contents of the spec.json file.
|
||||
//!
|
||||
//! The spec.json file is used to pass information to 'compute_ctl'. It contains
|
||||
//! all the information needed to start up the right version of PostgreSQL,
|
||||
//! and connect it to the storage nodes.
|
||||
//! The ComputeSpec contains all the information needed to start up
|
||||
//! the right version of PostgreSQL, and connect it to the storage nodes.
|
||||
//! It can be passed as part of the `config.json`, or the control plane can
|
||||
//! provide it by calling the compute_ctl's `/compute_ctl` endpoint, or
|
||||
//! compute_ctl can fetch it by calling the control plane's API.
|
||||
use std::collections::HashMap;
|
||||
|
||||
use indexmap::IndexMap;
|
||||
@@ -165,13 +165,7 @@ pub struct ComputeSpec {
|
||||
#[serde(default)] // Default false
|
||||
pub drop_subscriptions_before_start: bool,
|
||||
|
||||
/// Log level for audit logging:
|
||||
///
|
||||
/// Disabled - no audit logging. This is the default.
|
||||
/// log - log masked statements to the postgres log using pgaudit extension
|
||||
/// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension
|
||||
///
|
||||
/// Extensions should be present in shared_preload_libraries
|
||||
/// Log level for compute audit logging
|
||||
#[serde(default)]
|
||||
pub audit_log_level: ComputeAudit,
|
||||
|
||||
@@ -295,14 +289,25 @@ impl ComputeMode {
|
||||
}
|
||||
|
||||
/// Log level for audit logging
|
||||
/// Disabled, log, hipaa
|
||||
/// Default is Disabled
|
||||
#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
|
||||
pub enum ComputeAudit {
|
||||
#[default]
|
||||
Disabled,
|
||||
// Deprecated, use Base instead
|
||||
Log,
|
||||
// (pgaudit.log = 'ddl', pgaudit.log_parameter='off')
|
||||
// logged to the standard postgresql log stream
|
||||
Base,
|
||||
// Deprecated, use Full or Extended instead
|
||||
Hipaa,
|
||||
// (pgaudit.log = 'all, -misc', pgaudit.log_parameter='off')
|
||||
// logged to separate files collected by rsyslog
|
||||
// into dedicated log storage with strict access
|
||||
Extended,
|
||||
// (pgaudit.log='all', pgaudit.log_parameter='on'),
|
||||
// logged to separate files collected by rsyslog
|
||||
// into dedicated log storage with strict access.
|
||||
Full,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
|
||||
|
||||
@@ -30,6 +30,7 @@ tokio.workspace = true
|
||||
tracing.workspace = true
|
||||
url.workspace = true
|
||||
uuid.workspace = true
|
||||
x509-cert.workspace = true
|
||||
|
||||
# to use tokio channels as streams, this is faster to compile than async_stream
|
||||
# why is it only here? no other crate should use it, streams are rarely needed.
|
||||
|
||||
@@ -4,6 +4,8 @@ use futures::StreamExt;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use hyper0::Body;
|
||||
use hyper0::server::conn::Http;
|
||||
use metrics::{IntCounterVec, register_int_counter_vec};
|
||||
use once_cell::sync::Lazy;
|
||||
use routerify::{RequestService, RequestServiceBuilder};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_rustls::TlsAcceptor;
|
||||
@@ -26,6 +28,24 @@ pub struct Server {
|
||||
tls_acceptor: Option<TlsAcceptor>,
|
||||
}
|
||||
|
||||
static CONNECTION_STARTED_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"http_server_connection_started_total",
|
||||
"Number of established http/https connections",
|
||||
&["scheme"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static CONNECTION_ERROR_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"http_server_connection_errors_total",
|
||||
"Number of occured connection errors by type",
|
||||
&["type"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
impl Server {
|
||||
pub fn new(
|
||||
request_service: Arc<RequestServiceBuilder<Body, ApiError>>,
|
||||
@@ -60,6 +80,15 @@ impl Server {
|
||||
false
|
||||
}
|
||||
|
||||
let tcp_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tcp"]);
|
||||
let tls_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tls"]);
|
||||
let http_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["http"]);
|
||||
let https_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["https"]);
|
||||
let panic_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["panic"]);
|
||||
|
||||
let http_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["http"]);
|
||||
let https_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["https"]);
|
||||
|
||||
let mut connections = FuturesUnordered::new();
|
||||
loop {
|
||||
tokio::select! {
|
||||
@@ -67,6 +96,7 @@ impl Server {
|
||||
let (tcp_stream, remote_addr) = match stream {
|
||||
Ok(stream) => stream,
|
||||
Err(err) => {
|
||||
tcp_error_cnt.inc();
|
||||
if !suppress_io_error(&err) {
|
||||
info!("Failed to accept TCP connection: {err:#}");
|
||||
}
|
||||
@@ -78,11 +108,18 @@ impl Server {
|
||||
let tls_acceptor = self.tls_acceptor.clone();
|
||||
let cancel = cancel.clone();
|
||||
|
||||
let tls_error_cnt = tls_error_cnt.clone();
|
||||
let http_error_cnt = http_error_cnt.clone();
|
||||
let https_error_cnt = https_error_cnt.clone();
|
||||
let http_connection_cnt = http_connection_cnt.clone();
|
||||
let https_connection_cnt = https_connection_cnt.clone();
|
||||
|
||||
connections.push(tokio::spawn(
|
||||
async move {
|
||||
match tls_acceptor {
|
||||
Some(tls_acceptor) => {
|
||||
// Handle HTTPS connection.
|
||||
https_connection_cnt.inc();
|
||||
let tls_stream = tokio::select! {
|
||||
tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream,
|
||||
_ = cancel.cancelled() => return,
|
||||
@@ -90,6 +127,7 @@ impl Server {
|
||||
let tls_stream = match tls_stream {
|
||||
Ok(tls_stream) => tls_stream,
|
||||
Err(err) => {
|
||||
tls_error_cnt.inc();
|
||||
if !suppress_io_error(&err) {
|
||||
info!(%remote_addr, "Failed to accept TLS connection: {err:#}");
|
||||
}
|
||||
@@ -97,6 +135,7 @@ impl Server {
|
||||
}
|
||||
};
|
||||
if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await {
|
||||
https_error_cnt.inc();
|
||||
if !suppress_hyper_error(&err) {
|
||||
info!(%remote_addr, "Failed to serve HTTPS connection: {err:#}");
|
||||
}
|
||||
@@ -104,7 +143,9 @@ impl Server {
|
||||
}
|
||||
None => {
|
||||
// Handle HTTP connection.
|
||||
http_connection_cnt.inc();
|
||||
if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await {
|
||||
http_error_cnt.inc();
|
||||
if !suppress_hyper_error(&err) {
|
||||
info!(%remote_addr, "Failed to serve HTTP connection: {err:#}");
|
||||
}
|
||||
@@ -115,6 +156,7 @@ impl Server {
|
||||
}
|
||||
Some(conn) = connections.next() => {
|
||||
if let Err(err) = conn {
|
||||
panic_error_cnt.inc();
|
||||
error!("Connection panicked: {err:#}");
|
||||
}
|
||||
}
|
||||
@@ -122,6 +164,7 @@ impl Server {
|
||||
// Wait for graceful shutdown of all connections.
|
||||
while let Some(conn) = connections.next().await {
|
||||
if let Err(err) = conn {
|
||||
panic_error_cnt.inc();
|
||||
error!("Connection panicked: {err:#}");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,11 +3,14 @@ use std::{sync::Arc, time::Duration};
|
||||
use anyhow::Context;
|
||||
use arc_swap::ArcSwap;
|
||||
use camino::Utf8Path;
|
||||
use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec};
|
||||
use once_cell::sync::Lazy;
|
||||
use rustls::{
|
||||
pki_types::{CertificateDer, PrivateKeyDer},
|
||||
pki_types::{CertificateDer, PrivateKeyDer, UnixTime},
|
||||
server::{ClientHello, ResolvesServerCert},
|
||||
sign::CertifiedKey,
|
||||
};
|
||||
use x509_cert::der::Reader;
|
||||
|
||||
pub async fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result<Vec<CertificateDer<'static>>> {
|
||||
let cert_data = tokio::fs::read(filename)
|
||||
@@ -53,6 +56,76 @@ pub async fn load_certified_key(
|
||||
Ok(certified_key)
|
||||
}
|
||||
|
||||
/// rustls's CertifiedKey with extra parsed fields used for metrics.
|
||||
struct ParsedCertifiedKey {
|
||||
certified_key: CertifiedKey,
|
||||
expiration_time: UnixTime,
|
||||
}
|
||||
|
||||
/// Parse expiration time from an X509 certificate.
|
||||
fn parse_expiration_time(cert: &CertificateDer<'_>) -> anyhow::Result<UnixTime> {
|
||||
let parsed_cert = x509_cert::der::SliceReader::new(cert)
|
||||
.context("Failed to parse cerficiate")?
|
||||
.decode::<x509_cert::Certificate>()
|
||||
.context("Failed to parse cerficiate")?;
|
||||
|
||||
Ok(UnixTime::since_unix_epoch(
|
||||
parsed_cert
|
||||
.tbs_certificate
|
||||
.validity
|
||||
.not_after
|
||||
.to_unix_duration(),
|
||||
))
|
||||
}
|
||||
|
||||
async fn load_and_parse_certified_key(
|
||||
key_filename: &Utf8Path,
|
||||
cert_filename: &Utf8Path,
|
||||
) -> anyhow::Result<ParsedCertifiedKey> {
|
||||
let certified_key = load_certified_key(key_filename, cert_filename).await?;
|
||||
let expiration_time = parse_expiration_time(certified_key.end_entity_cert()?)?;
|
||||
Ok(ParsedCertifiedKey {
|
||||
certified_key,
|
||||
expiration_time,
|
||||
})
|
||||
}
|
||||
|
||||
static CERT_EXPIRATION_TIME: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"tls_certs_expiration_time_seconds",
|
||||
"Expiration time of the loaded certificate since unix epoch in seconds",
|
||||
&["resolver_name"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static CERT_RELOAD_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"tls_certs_reload_started_total",
|
||||
"Number of certificate reload loop iterations started",
|
||||
&["resolver_name"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static CERT_RELOAD_UPDATED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"tls_certs_reload_updated_total",
|
||||
"Number of times the certificate was updated to the new one",
|
||||
&["resolver_name"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static CERT_RELOAD_FAILED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"tls_certs_reload_failed_total",
|
||||
"Number of times the certificate reload failed",
|
||||
&["resolver_name"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
/// Implementation of [`rustls::server::ResolvesServerCert`] which reloads certificates from
|
||||
/// the disk periodically.
|
||||
#[derive(Debug)]
|
||||
@@ -63,16 +136,28 @@ pub struct ReloadingCertificateResolver {
|
||||
impl ReloadingCertificateResolver {
|
||||
/// Creates a new Resolver by loading certificate and private key from FS and
|
||||
/// creating tokio::task to reload them with provided reload_period.
|
||||
/// resolver_name is used as metric's label.
|
||||
pub async fn new(
|
||||
resolver_name: &str,
|
||||
key_filename: &Utf8Path,
|
||||
cert_filename: &Utf8Path,
|
||||
reload_period: Duration,
|
||||
) -> anyhow::Result<Arc<Self>> {
|
||||
// Create metrics for current resolver.
|
||||
let cert_expiration_time = CERT_EXPIRATION_TIME.with_label_values(&[resolver_name]);
|
||||
let cert_reload_started_counter =
|
||||
CERT_RELOAD_STARTED_COUNTER.with_label_values(&[resolver_name]);
|
||||
let cert_reload_updated_counter =
|
||||
CERT_RELOAD_UPDATED_COUNTER.with_label_values(&[resolver_name]);
|
||||
let cert_reload_failed_counter =
|
||||
CERT_RELOAD_FAILED_COUNTER.with_label_values(&[resolver_name]);
|
||||
|
||||
let parsed_key = load_and_parse_certified_key(key_filename, cert_filename).await?;
|
||||
|
||||
let this = Arc::new(Self {
|
||||
certified_key: ArcSwap::from_pointee(
|
||||
load_certified_key(key_filename, cert_filename).await?,
|
||||
),
|
||||
certified_key: ArcSwap::from_pointee(parsed_key.certified_key),
|
||||
});
|
||||
cert_expiration_time.set(parsed_key.expiration_time.as_secs());
|
||||
|
||||
tokio::spawn({
|
||||
let weak_this = Arc::downgrade(&this);
|
||||
@@ -88,17 +173,22 @@ impl ReloadingCertificateResolver {
|
||||
Some(this) => this,
|
||||
None => break, // Resolver has been destroyed, exit.
|
||||
};
|
||||
match load_certified_key(&key_filename, &cert_filename).await {
|
||||
Ok(new_certified_key) => {
|
||||
if new_certified_key.cert == this.certified_key.load().cert {
|
||||
cert_reload_started_counter.inc();
|
||||
|
||||
match load_and_parse_certified_key(&key_filename, &cert_filename).await {
|
||||
Ok(parsed_key) => {
|
||||
if parsed_key.certified_key.cert == this.certified_key.load().cert {
|
||||
tracing::debug!("Certificate has not changed since last reloading");
|
||||
} else {
|
||||
tracing::info!("Certificate has been reloaded");
|
||||
this.certified_key.store(Arc::new(new_certified_key));
|
||||
this.certified_key.store(Arc::new(parsed_key.certified_key));
|
||||
cert_expiration_time.set(parsed_key.expiration_time.as_secs());
|
||||
cert_reload_updated_counter.inc();
|
||||
}
|
||||
last_reload_failed = false;
|
||||
}
|
||||
Err(err) => {
|
||||
cert_reload_failed_counter.inc();
|
||||
// Note: Reloading certs may fail if it conflicts with the script updating
|
||||
// the files at the same time. Warn only if the error is persistent.
|
||||
if last_reload_failed {
|
||||
|
||||
@@ -207,6 +207,10 @@ pub struct PageServicePipeliningConfigPipelined {
|
||||
/// Causes runtime errors if larger than max get_vectored batch size.
|
||||
pub max_batch_size: NonZeroUsize,
|
||||
pub execution: PageServiceProtocolPipelinedExecutionStrategy,
|
||||
// The default below is such that new versions of the software can start
|
||||
// with the old configuration.
|
||||
#[serde(default)]
|
||||
pub batching: PageServiceProtocolPipelinedBatchingStrategy,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -216,6 +220,19 @@ pub enum PageServiceProtocolPipelinedExecutionStrategy {
|
||||
Tasks,
|
||||
}
|
||||
|
||||
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
pub enum PageServiceProtocolPipelinedBatchingStrategy {
|
||||
/// All get page requests in a batch will be at the same LSN
|
||||
#[default]
|
||||
UniformLsn,
|
||||
/// Get page requests in a batch may be at different LSN
|
||||
///
|
||||
/// One key cannot be present more than once at different LSNs in
|
||||
/// the same batch.
|
||||
ScatteredLsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(tag = "mode", rename_all = "kebab-case")]
|
||||
pub enum GetVectoredConcurrentIo {
|
||||
@@ -452,6 +469,8 @@ pub struct TenantConfigToml {
|
||||
// gc-compaction related configs
|
||||
/// Enable automatic gc-compaction trigger on this tenant.
|
||||
pub gc_compaction_enabled: bool,
|
||||
/// Enable verification of gc-compaction results.
|
||||
pub gc_compaction_verification: bool,
|
||||
/// The initial threshold for gc-compaction in KB. Once the total size of layers below the gc-horizon is above this threshold,
|
||||
/// gc-compaction will be triggered.
|
||||
pub gc_compaction_initial_threshold_kb: u64,
|
||||
@@ -613,9 +632,12 @@ impl Default for ConfigToml {
|
||||
page_service_pipelining: if !cfg!(test) {
|
||||
PageServicePipeliningConfig::Serial
|
||||
} else {
|
||||
// Do not turn this into the default until scattered reads have been
|
||||
// validated and rolled-out fully.
|
||||
PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
|
||||
max_batch_size: NonZeroUsize::new(32).unwrap(),
|
||||
execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures,
|
||||
batching: PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn,
|
||||
})
|
||||
},
|
||||
get_vectored_concurrent_io: if !cfg!(test) {
|
||||
@@ -692,6 +714,7 @@ pub mod tenant_conf_defaults {
|
||||
// image layers should be created.
|
||||
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
|
||||
pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false;
|
||||
pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true;
|
||||
pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB
|
||||
pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
|
||||
}
|
||||
@@ -746,6 +769,7 @@ impl Default for TenantConfigToml {
|
||||
wal_receiver_protocol_override: None,
|
||||
rel_size_v2_enabled: false,
|
||||
gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
|
||||
gc_compaction_verification: DEFAULT_GC_COMPACTION_VERIFICATION,
|
||||
gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
|
||||
gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
|
||||
sampling_ratio: None,
|
||||
|
||||
@@ -7,7 +7,8 @@ use std::time::{Duration, Instant};
|
||||
/// API (`/control/v1` prefix). Implemented by the server
|
||||
/// in [`storage_controller::http`]
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::models::{PageserverUtilization, ShardParameters, TenantConfig};
|
||||
use crate::shard::{ShardStripeSize, TenantShardId};
|
||||
@@ -499,6 +500,15 @@ pub struct SafekeeperSchedulingPolicyRequest {
|
||||
pub scheduling_policy: SkSchedulingPolicy,
|
||||
}
|
||||
|
||||
/// Import request for safekeeper timelines.
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct TimelineImportRequest {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub start_lsn: Lsn,
|
||||
pub sk_set: Vec<NodeId>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use serde_json;
|
||||
|
||||
@@ -927,7 +927,7 @@ impl Key {
|
||||
|
||||
/// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`.
|
||||
#[inline(always)]
|
||||
pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> {
|
||||
pub fn to_rel_block(self) -> Result<(RelTag, BlockNumber), ToRelBlockError> {
|
||||
Ok(match self.field1 {
|
||||
0x00 => (
|
||||
RelTag {
|
||||
@@ -938,7 +938,7 @@ impl Key {
|
||||
},
|
||||
self.field6,
|
||||
),
|
||||
_ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
|
||||
_ => return Err(ToRelBlockError(self.field1)),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -951,6 +951,17 @@ impl std::str::FromStr for Key {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct ToRelBlockError(u8);
|
||||
|
||||
impl fmt::Display for ToRelBlockError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "unexpected value kind 0x{:02x}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for ToRelBlockError {}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::str::FromStr;
|
||||
|
||||
@@ -576,6 +576,8 @@ pub struct TenantConfigPatch {
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub gc_compaction_enabled: FieldPatch<bool>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub gc_compaction_verification: FieldPatch<bool>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub gc_compaction_initial_threshold_kb: FieldPatch<u64>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub gc_compaction_ratio_percent: FieldPatch<u64>,
|
||||
@@ -696,6 +698,9 @@ pub struct TenantConfig {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub gc_compaction_enabled: Option<bool>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub gc_compaction_verification: Option<bool>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub gc_compaction_initial_threshold_kb: Option<u64>,
|
||||
|
||||
@@ -744,6 +749,7 @@ impl TenantConfig {
|
||||
mut wal_receiver_protocol_override,
|
||||
mut rel_size_v2_enabled,
|
||||
mut gc_compaction_enabled,
|
||||
mut gc_compaction_verification,
|
||||
mut gc_compaction_initial_threshold_kb,
|
||||
mut gc_compaction_ratio_percent,
|
||||
mut sampling_ratio,
|
||||
@@ -835,6 +841,9 @@ impl TenantConfig {
|
||||
patch
|
||||
.gc_compaction_enabled
|
||||
.apply(&mut gc_compaction_enabled);
|
||||
patch
|
||||
.gc_compaction_verification
|
||||
.apply(&mut gc_compaction_verification);
|
||||
patch
|
||||
.gc_compaction_initial_threshold_kb
|
||||
.apply(&mut gc_compaction_initial_threshold_kb);
|
||||
@@ -876,6 +885,7 @@ impl TenantConfig {
|
||||
wal_receiver_protocol_override,
|
||||
rel_size_v2_enabled,
|
||||
gc_compaction_enabled,
|
||||
gc_compaction_verification,
|
||||
gc_compaction_initial_threshold_kb,
|
||||
gc_compaction_ratio_percent,
|
||||
sampling_ratio,
|
||||
@@ -974,6 +984,9 @@ impl TenantConfig {
|
||||
gc_compaction_enabled: self
|
||||
.gc_compaction_enabled
|
||||
.unwrap_or(global_conf.gc_compaction_enabled),
|
||||
gc_compaction_verification: self
|
||||
.gc_compaction_verification
|
||||
.unwrap_or(global_conf.gc_compaction_verification),
|
||||
gc_compaction_initial_threshold_kb: self
|
||||
.gc_compaction_initial_threshold_kb
|
||||
.unwrap_or(global_conf.gc_compaction_initial_threshold_kb),
|
||||
|
||||
12
libs/remote_keys/Cargo.toml
Normal file
12
libs/remote_keys/Cargo.toml
Normal file
@@ -0,0 +1,12 @@
|
||||
[package]
|
||||
name = "remote_keys"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
aws-smithy-types.workspace = true
|
||||
aws-sdk-kms.workspace = true
|
||||
aws-config.workspace = true
|
||||
utils.workspace = true
|
||||
47
libs/remote_keys/src/aws_keys.rs
Normal file
47
libs/remote_keys/src/aws_keys.rs
Normal file
@@ -0,0 +1,47 @@
|
||||
use aws_config::BehaviorVersion;
|
||||
|
||||
use crate::KeyId;
|
||||
|
||||
pub struct AwsRemoteKeyClient {
|
||||
client: aws_sdk_kms::Client,
|
||||
}
|
||||
|
||||
impl AwsRemoteKeyClient {
|
||||
pub async fn new() -> Self {
|
||||
let sdk_config = aws_config::defaults(BehaviorVersion::v2024_03_28())
|
||||
.retry_config(
|
||||
aws_config::retry::RetryConfig::standard()
|
||||
.with_max_attempts(5) // Retry up to 5 times
|
||||
.with_initial_backoff(std::time::Duration::from_millis(200)) // Start with 200ms delay
|
||||
.with_max_backoff(std::time::Duration::from_secs(5)), // Cap at 5 seconds
|
||||
)
|
||||
.load()
|
||||
.await;
|
||||
let client = aws_sdk_kms::Client::new(&sdk_config);
|
||||
Self { client }
|
||||
}
|
||||
|
||||
pub async fn decrypt(&self, key_id: &KeyId, ciphertext: impl Into<Vec<u8>>) -> Vec<u8> {
|
||||
let output = self
|
||||
.client
|
||||
.decrypt()
|
||||
.key_id(&key_id.0)
|
||||
.ciphertext_blob(aws_smithy_types::Blob::new(ciphertext.into()))
|
||||
.send()
|
||||
.await
|
||||
.expect("decrypt");
|
||||
output.plaintext.expect("plaintext").into_inner()
|
||||
}
|
||||
|
||||
pub async fn encrypt(&self, key_id: &KeyId, ciphertext: impl Into<Vec<u8>>) -> Vec<u8> {
|
||||
let output = self
|
||||
.client
|
||||
.encrypt()
|
||||
.key_id(&key_id.0)
|
||||
.plaintext(aws_smithy_types::Blob::new(ciphertext.into()))
|
||||
.send()
|
||||
.await
|
||||
.expect("decrypt");
|
||||
output.ciphertext_blob.expect("ciphertext").into_inner()
|
||||
}
|
||||
}
|
||||
6
libs/remote_keys/src/lib.rs
Normal file
6
libs/remote_keys/src/lib.rs
Normal file
@@ -0,0 +1,6 @@
|
||||
mod aws_keys;
|
||||
pub use aws_keys::AwsRemoteKeyClient;
|
||||
|
||||
/// A string uniquely identifying a key
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct KeyId(pub String);
|
||||
@@ -10,6 +10,8 @@ default = []
|
||||
# which adds some runtime cost to run tests on outage conditions
|
||||
testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"]
|
||||
|
||||
fuzz-read-path = ["testing"]
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
arc-swap.workspace = true
|
||||
|
||||
@@ -126,7 +126,7 @@ async fn ingest(
|
||||
max_concurrency: NonZeroUsize::new(1).unwrap(),
|
||||
});
|
||||
let (_desc, path) = layer
|
||||
.write_to_disk(&ctx, None, l0_flush_state.inner())
|
||||
.write_to_disk(&ctx, None, l0_flush_state.inner(), &gate, cancel.clone())
|
||||
.await?
|
||||
.unwrap();
|
||||
tokio::fs::remove_file(path).await?;
|
||||
|
||||
@@ -34,7 +34,7 @@ use utils::lsn::Lsn;
|
||||
use crate::context::RequestContext;
|
||||
use crate::pgdatadir_mapping::Version;
|
||||
use crate::tenant::storage_layer::IoConcurrency;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::timeline::{GetVectoredError, VersionedKeySpaceQuery};
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -353,9 +353,10 @@ where
|
||||
let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
|
||||
|
||||
for part in slru_partitions.parts {
|
||||
let query = VersionedKeySpaceQuery::uniform(part, self.lsn);
|
||||
let blocks = self
|
||||
.timeline
|
||||
.get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx)
|
||||
.get_vectored(query, self.io_concurrency.clone(), self.ctx)
|
||||
.await?;
|
||||
|
||||
for (key, block) in blocks {
|
||||
|
||||
@@ -455,6 +455,7 @@ fn start_pageserver(
|
||||
let tls_server_config = if conf.listen_https_addr.is_some() || conf.enable_tls_page_service_api
|
||||
{
|
||||
let resolver = BACKGROUND_RUNTIME.block_on(ReloadingCertificateResolver::new(
|
||||
"main",
|
||||
&conf.ssl_key_file,
|
||||
&conf.ssl_cert_file,
|
||||
conf.ssl_cert_reload_period,
|
||||
|
||||
@@ -3253,7 +3253,7 @@ async fn ingest_aux_files(
|
||||
modification
|
||||
.put_file(&fname, content.as_bytes(), &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
}
|
||||
modification
|
||||
.commit(&ctx)
|
||||
|
||||
@@ -27,7 +27,7 @@ use crate::context::RequestContext;
|
||||
use crate::metrics::WAL_INGEST;
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::walingest::WalIngest;
|
||||
use crate::walingest::{WalIngest, WalIngestErrorKind};
|
||||
|
||||
// Returns checkpoint LSN from controlfile
|
||||
pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
|
||||
@@ -157,9 +157,9 @@ async fn import_rel(
|
||||
.put_rel_creation(rel, nblocks as u32, ctx)
|
||||
.await
|
||||
{
|
||||
match e {
|
||||
RelationError::AlreadyExists => {
|
||||
debug!("Relation {} already exist. We must be extending it.", rel)
|
||||
match e.kind {
|
||||
WalIngestErrorKind::RelationAlreadyExists(rel) => {
|
||||
debug!("Relation {rel} already exists. We must be extending it.")
|
||||
}
|
||||
_ => return Err(e.into()),
|
||||
}
|
||||
|
||||
@@ -17,7 +17,7 @@ use metrics::{
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::config::{
|
||||
PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
|
||||
PageServiceProtocolPipelinedExecutionStrategy,
|
||||
PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
|
||||
};
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -1714,6 +1714,28 @@ pub enum SmgrQueryType {
|
||||
Test,
|
||||
}
|
||||
|
||||
#[derive(
|
||||
Debug,
|
||||
Clone,
|
||||
Copy,
|
||||
IntoStaticStr,
|
||||
strum_macros::EnumCount,
|
||||
strum_macros::EnumIter,
|
||||
strum_macros::FromRepr,
|
||||
enum_map::Enum,
|
||||
)]
|
||||
#[strum(serialize_all = "snake_case")]
|
||||
pub enum GetPageBatchBreakReason {
|
||||
BatchFull,
|
||||
NonBatchableRequest,
|
||||
NonUniformLsn,
|
||||
SamePageAtDifferentLsn,
|
||||
NonUniformTimeline,
|
||||
ExecutorSteal,
|
||||
#[cfg(feature = "testing")]
|
||||
NonUniformKey,
|
||||
}
|
||||
|
||||
pub(crate) struct SmgrQueryTimePerTimeline {
|
||||
global_started: [IntCounter; SmgrQueryType::COUNT],
|
||||
global_latency: [Histogram; SmgrQueryType::COUNT],
|
||||
@@ -1725,6 +1747,8 @@ pub(crate) struct SmgrQueryTimePerTimeline {
|
||||
per_timeline_flush_in_progress_micros: IntCounter,
|
||||
global_batch_wait_time: Histogram,
|
||||
per_timeline_batch_wait_time: Histogram,
|
||||
global_batch_break_reason: [IntCounter; GetPageBatchBreakReason::COUNT],
|
||||
per_timeline_batch_break_reason: GetPageBatchBreakReasonTimelineMetrics,
|
||||
throttling: Arc<tenant_throttling::Pagestream>,
|
||||
}
|
||||
|
||||
@@ -1858,12 +1882,55 @@ static PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::n
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
// it's a counter, but, name is prepared to extend it to a histogram of queue depth
|
||||
"pageserver_page_service_batch_break_reason_global",
|
||||
"Reason for breaking batches of get page requests",
|
||||
&["reason"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
struct GetPageBatchBreakReasonTimelineMetrics {
|
||||
map: EnumMap<GetPageBatchBreakReason, IntCounter>,
|
||||
}
|
||||
|
||||
impl GetPageBatchBreakReasonTimelineMetrics {
|
||||
fn new(tenant_id: &str, shard_slug: &str, timeline_id: &str) -> Self {
|
||||
GetPageBatchBreakReasonTimelineMetrics {
|
||||
map: EnumMap::from_array(std::array::from_fn(|reason_idx| {
|
||||
let reason = GetPageBatchBreakReason::from_usize(reason_idx);
|
||||
PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE.with_label_values(&[
|
||||
tenant_id,
|
||||
shard_slug,
|
||||
timeline_id,
|
||||
reason.into(),
|
||||
])
|
||||
})),
|
||||
}
|
||||
}
|
||||
|
||||
fn inc(&self, reason: GetPageBatchBreakReason) {
|
||||
self.map[reason].inc()
|
||||
}
|
||||
}
|
||||
|
||||
static PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_page_service_batch_break_reason",
|
||||
"Reason for breaking batches of get page requests",
|
||||
&["tenant_id", "shard_id", "timeline_id", "reason"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_page_service_config_max_batch_size",
|
||||
"Configured maximum batch size for the server-side batching functionality of page_service. \
|
||||
Labels expose more of the configuration parameters.",
|
||||
&["mode", "execution"]
|
||||
&["mode", "execution", "batching"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -1871,10 +1938,11 @@ pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy<IntGaugeVec> = Lazy::
|
||||
fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
|
||||
PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE.reset();
|
||||
let (label_values, value) = match conf {
|
||||
PageServicePipeliningConfig::Serial => (["serial", "-"], 1),
|
||||
PageServicePipeliningConfig::Serial => (["serial", "-", "-"], 1),
|
||||
PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
|
||||
max_batch_size,
|
||||
execution,
|
||||
batching,
|
||||
}) => {
|
||||
let mode = "pipelined";
|
||||
let execution = match execution {
|
||||
@@ -1883,7 +1951,12 @@ fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
|
||||
}
|
||||
PageServiceProtocolPipelinedExecutionStrategy::Tasks => "tasks",
|
||||
};
|
||||
([mode, execution], max_batch_size.get())
|
||||
let batching = match batching {
|
||||
PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => "uniform-lsn",
|
||||
PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => "scattered-lsn",
|
||||
};
|
||||
|
||||
([mode, execution, batching], max_batch_size.get())
|
||||
}
|
||||
};
|
||||
PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE
|
||||
@@ -1979,6 +2052,15 @@ impl SmgrQueryTimePerTimeline {
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
let global_batch_break_reason = std::array::from_fn(|i| {
|
||||
let reason = GetPageBatchBreakReason::from_usize(i);
|
||||
PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL
|
||||
.get_metric_with_label_values(&[reason.into()])
|
||||
.unwrap()
|
||||
});
|
||||
let per_timeline_batch_break_reason =
|
||||
GetPageBatchBreakReasonTimelineMetrics::new(&tenant_id, &shard_slug, &timeline_id);
|
||||
|
||||
let global_flush_in_progress_micros =
|
||||
PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone();
|
||||
let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS
|
||||
@@ -1996,6 +2078,8 @@ impl SmgrQueryTimePerTimeline {
|
||||
per_timeline_flush_in_progress_micros,
|
||||
global_batch_wait_time,
|
||||
per_timeline_batch_wait_time,
|
||||
global_batch_break_reason,
|
||||
per_timeline_batch_break_reason,
|
||||
throttling: pagestream_throttle_metrics,
|
||||
}
|
||||
}
|
||||
@@ -2024,9 +2108,16 @@ impl SmgrQueryTimePerTimeline {
|
||||
}
|
||||
|
||||
/// TODO: do something about this? seems odd, we have a similar call on SmgrOpTimer
|
||||
pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) {
|
||||
pub(crate) fn observe_getpage_batch_start(
|
||||
&self,
|
||||
batch_size: usize,
|
||||
break_reason: GetPageBatchBreakReason,
|
||||
) {
|
||||
self.global_batch_size.observe(batch_size as f64);
|
||||
self.per_timeline_batch_size.observe(batch_size as f64);
|
||||
|
||||
self.global_batch_break_reason[break_reason.into_usize()].inc();
|
||||
self.per_timeline_batch_break_reason.inc(break_reason);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3392,6 +3483,15 @@ impl TimelineMetrics {
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
|
||||
for reason in GetPageBatchBreakReason::iter() {
|
||||
let _ = PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
reason.into(),
|
||||
]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4270,6 +4370,7 @@ pub fn preinitialize_metrics(
|
||||
[
|
||||
&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT,
|
||||
&SMGR_QUERY_STARTED_GLOBAL,
|
||||
&PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL,
|
||||
]
|
||||
.into_iter()
|
||||
.for_each(|c| {
|
||||
|
||||
@@ -18,7 +18,7 @@ use itertools::Itertools;
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::config::{
|
||||
PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
|
||||
PageServiceProtocolPipelinedExecutionStrategy,
|
||||
PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
|
||||
};
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
use pageserver_api::models::{
|
||||
@@ -58,8 +58,8 @@ use crate::context::{
|
||||
DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
|
||||
};
|
||||
use crate::metrics::{
|
||||
self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, LIVE_CONNECTIONS, SmgrOpTimer,
|
||||
TimelineMetrics,
|
||||
self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
|
||||
SmgrOpTimer, TimelineMetrics,
|
||||
};
|
||||
use crate::pgdatadir_mapping::Version;
|
||||
use crate::span::{
|
||||
@@ -641,6 +641,7 @@ impl std::fmt::Display for BatchedPageStreamError {
|
||||
struct BatchedGetPageRequest {
|
||||
req: PagestreamGetPageRequest,
|
||||
timer: SmgrOpTimer,
|
||||
effective_request_lsn: Lsn,
|
||||
ctx: RequestContext,
|
||||
}
|
||||
|
||||
@@ -670,8 +671,8 @@ enum BatchedFeMessage {
|
||||
GetPage {
|
||||
span: Span,
|
||||
shard: timeline::handle::WeakHandle<TenantManagerTypes>,
|
||||
effective_request_lsn: Lsn,
|
||||
pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
|
||||
batch_break_reason: GetPageBatchBreakReason,
|
||||
},
|
||||
DbSize {
|
||||
span: Span,
|
||||
@@ -724,6 +725,119 @@ impl BatchedFeMessage {
|
||||
BatchedFeMessage::RespondError { .. } => {}
|
||||
}
|
||||
}
|
||||
|
||||
fn should_break_batch(
|
||||
&self,
|
||||
other: &BatchedFeMessage,
|
||||
max_batch_size: NonZeroUsize,
|
||||
batching_strategy: PageServiceProtocolPipelinedBatchingStrategy,
|
||||
) -> Option<GetPageBatchBreakReason> {
|
||||
match (self, other) {
|
||||
(
|
||||
BatchedFeMessage::GetPage {
|
||||
shard: accum_shard,
|
||||
pages: accum_pages,
|
||||
..
|
||||
},
|
||||
BatchedFeMessage::GetPage {
|
||||
shard: this_shard,
|
||||
pages: this_pages,
|
||||
..
|
||||
},
|
||||
) => {
|
||||
assert_eq!(this_pages.len(), 1);
|
||||
if accum_pages.len() >= max_batch_size.get() {
|
||||
trace!(%max_batch_size, "stopping batching because of batch size");
|
||||
assert_eq!(accum_pages.len(), max_batch_size.get());
|
||||
|
||||
return Some(GetPageBatchBreakReason::BatchFull);
|
||||
}
|
||||
if !accum_shard.is_same_handle_as(this_shard) {
|
||||
trace!("stopping batching because timeline object mismatch");
|
||||
// TODO: we _could_ batch & execute each shard seperately (and in parallel).
|
||||
// But the current logic for keeping responses in order does not support that.
|
||||
|
||||
return Some(GetPageBatchBreakReason::NonUniformTimeline);
|
||||
}
|
||||
|
||||
match batching_strategy {
|
||||
PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => {
|
||||
if let Some(last_in_batch) = accum_pages.last() {
|
||||
if last_in_batch.effective_request_lsn
|
||||
!= this_pages[0].effective_request_lsn
|
||||
{
|
||||
trace!(
|
||||
accum_lsn = %last_in_batch.effective_request_lsn,
|
||||
this_lsn = %this_pages[0].effective_request_lsn,
|
||||
"stopping batching because LSN changed"
|
||||
);
|
||||
|
||||
return Some(GetPageBatchBreakReason::NonUniformLsn);
|
||||
}
|
||||
}
|
||||
}
|
||||
PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => {
|
||||
// The read path doesn't curently support serving the same page at different LSNs.
|
||||
// While technically possible, it's uncertain if the complexity is worth it.
|
||||
// Break the batch if such a case is encountered.
|
||||
let same_page_different_lsn = accum_pages.iter().any(|batched| {
|
||||
batched.req.rel == this_pages[0].req.rel
|
||||
&& batched.req.blkno == this_pages[0].req.blkno
|
||||
&& batched.effective_request_lsn
|
||||
!= this_pages[0].effective_request_lsn
|
||||
});
|
||||
|
||||
if same_page_different_lsn {
|
||||
trace!(
|
||||
rel=%this_pages[0].req.rel,
|
||||
blkno=%this_pages[0].req.blkno,
|
||||
lsn=%this_pages[0].effective_request_lsn,
|
||||
"stopping batching because same page was requested at different LSNs"
|
||||
);
|
||||
|
||||
return Some(GetPageBatchBreakReason::SamePageAtDifferentLsn);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
(
|
||||
BatchedFeMessage::Test {
|
||||
shard: accum_shard,
|
||||
requests: accum_requests,
|
||||
..
|
||||
},
|
||||
BatchedFeMessage::Test {
|
||||
shard: this_shard,
|
||||
requests: this_requests,
|
||||
..
|
||||
},
|
||||
) => {
|
||||
assert!(this_requests.len() == 1);
|
||||
if accum_requests.len() >= max_batch_size.get() {
|
||||
trace!(%max_batch_size, "stopping batching because of batch size");
|
||||
assert_eq!(accum_requests.len(), max_batch_size.get());
|
||||
return Some(GetPageBatchBreakReason::BatchFull);
|
||||
}
|
||||
if !accum_shard.is_same_handle_as(this_shard) {
|
||||
trace!("stopping batching because timeline object mismatch");
|
||||
// TODO: we _could_ batch & execute each shard seperately (and in parallel).
|
||||
// But the current logic for keeping responses in order does not support that.
|
||||
return Some(GetPageBatchBreakReason::NonUniformTimeline);
|
||||
}
|
||||
let this_batch_key = this_requests[0].req.batch_key;
|
||||
let accum_batch_key = accum_requests[0].req.batch_key;
|
||||
if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
|
||||
trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
|
||||
return Some(GetPageBatchBreakReason::NonUniformKey);
|
||||
}
|
||||
None
|
||||
}
|
||||
(_, _) => Some(GetPageBatchBreakReason::NonBatchableRequest),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PageServerHandler {
|
||||
@@ -1025,34 +1139,32 @@ impl PageServerHandler {
|
||||
.await?;
|
||||
|
||||
// We're holding the Handle
|
||||
// TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
|
||||
let res = Self::wait_or_get_last_lsn(
|
||||
let effective_request_lsn = match Self::effective_request_lsn(
|
||||
&shard,
|
||||
shard.get_last_record_lsn(),
|
||||
req.hdr.request_lsn,
|
||||
req.hdr.not_modified_since,
|
||||
&shard.get_applied_gc_cutoff_lsn(),
|
||||
&ctx,
|
||||
)
|
||||
.maybe_perf_instrument(&ctx, |current_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
parent: current_perf_span,
|
||||
"WAIT_LSN",
|
||||
)
|
||||
})
|
||||
.await;
|
||||
|
||||
let effective_request_lsn = match res {
|
||||
) {
|
||||
Ok(lsn) => lsn,
|
||||
Err(e) => {
|
||||
return respond_error!(span, e);
|
||||
}
|
||||
};
|
||||
|
||||
BatchedFeMessage::GetPage {
|
||||
span,
|
||||
shard: shard.downgrade(),
|
||||
effective_request_lsn,
|
||||
pages: smallvec::smallvec![BatchedGetPageRequest { req, timer, ctx }],
|
||||
pages: smallvec::smallvec![BatchedGetPageRequest {
|
||||
req,
|
||||
timer,
|
||||
effective_request_lsn,
|
||||
ctx,
|
||||
}],
|
||||
// The executor grabs the batch when it becomes idle.
|
||||
// Hence, [`GetPageBatchBreakReason::ExecutorSteal`] is the
|
||||
// default reason for breaking the batch.
|
||||
batch_break_reason: GetPageBatchBreakReason::ExecutorSteal,
|
||||
}
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
@@ -1078,6 +1190,7 @@ impl PageServerHandler {
|
||||
#[instrument(skip_all, level = tracing::Level::TRACE)]
|
||||
#[allow(clippy::boxed_local)]
|
||||
fn pagestream_do_batch(
|
||||
batching_strategy: PageServiceProtocolPipelinedBatchingStrategy,
|
||||
max_batch_size: NonZeroUsize,
|
||||
batch: &mut Result<BatchedFeMessage, QueryError>,
|
||||
this_msg: Result<BatchedFeMessage, QueryError>,
|
||||
@@ -1089,90 +1202,59 @@ impl PageServerHandler {
|
||||
Err(e) => return Err(Err(e)),
|
||||
};
|
||||
|
||||
match (&mut *batch, this_msg) {
|
||||
// something batched already, let's see if we can add this message to the batch
|
||||
(
|
||||
Ok(BatchedFeMessage::GetPage {
|
||||
span: _,
|
||||
shard: accum_shard,
|
||||
pages: accum_pages,
|
||||
effective_request_lsn: accum_lsn,
|
||||
}),
|
||||
BatchedFeMessage::GetPage {
|
||||
span: _,
|
||||
shard: this_shard,
|
||||
pages: this_pages,
|
||||
effective_request_lsn: this_lsn,
|
||||
},
|
||||
) if (|| {
|
||||
assert_eq!(this_pages.len(), 1);
|
||||
if accum_pages.len() >= max_batch_size.get() {
|
||||
trace!(%accum_lsn, %this_lsn, %max_batch_size, "stopping batching because of batch size");
|
||||
assert_eq!(accum_pages.len(), max_batch_size.get());
|
||||
return false;
|
||||
}
|
||||
if !accum_shard.is_same_handle_as(&this_shard) {
|
||||
trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch");
|
||||
// TODO: we _could_ batch & execute each shard seperately (and in parallel).
|
||||
// But the current logic for keeping responses in order does not support that.
|
||||
return false;
|
||||
}
|
||||
// the vectored get currently only supports a single LSN, so, bounce as soon
|
||||
// as the effective request_lsn changes
|
||||
if *accum_lsn != this_lsn {
|
||||
trace!(%accum_lsn, %this_lsn, "stopping batching because LSN changed");
|
||||
return false;
|
||||
}
|
||||
true
|
||||
})() =>
|
||||
{
|
||||
// ok to batch
|
||||
accum_pages.extend(this_pages);
|
||||
Ok(())
|
||||
let eligible_batch = match batch {
|
||||
Ok(b) => b,
|
||||
Err(_) => {
|
||||
return Err(Ok(this_msg));
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
(
|
||||
Ok(BatchedFeMessage::Test {
|
||||
shard: accum_shard,
|
||||
requests: accum_requests,
|
||||
..
|
||||
}),
|
||||
BatchedFeMessage::Test {
|
||||
shard: this_shard,
|
||||
requests: this_requests,
|
||||
..
|
||||
},
|
||||
) if (|| {
|
||||
assert!(this_requests.len() == 1);
|
||||
if accum_requests.len() >= max_batch_size.get() {
|
||||
trace!(%max_batch_size, "stopping batching because of batch size");
|
||||
assert_eq!(accum_requests.len(), max_batch_size.get());
|
||||
return false;
|
||||
};
|
||||
|
||||
let batch_break =
|
||||
eligible_batch.should_break_batch(&this_msg, max_batch_size, batching_strategy);
|
||||
|
||||
match batch_break {
|
||||
Some(reason) => {
|
||||
if let BatchedFeMessage::GetPage {
|
||||
batch_break_reason, ..
|
||||
} = eligible_batch
|
||||
{
|
||||
*batch_break_reason = reason;
|
||||
}
|
||||
if !accum_shard.is_same_handle_as(&this_shard) {
|
||||
trace!("stopping batching because timeline object mismatch");
|
||||
// TODO: we _could_ batch & execute each shard seperately (and in parallel).
|
||||
// But the current logic for keeping responses in order does not support that.
|
||||
return false;
|
||||
}
|
||||
let this_batch_key = this_requests[0].req.batch_key;
|
||||
let accum_batch_key = accum_requests[0].req.batch_key;
|
||||
if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
|
||||
trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
|
||||
return false;
|
||||
}
|
||||
true
|
||||
})() =>
|
||||
{
|
||||
// ok to batch
|
||||
accum_requests.extend(this_requests);
|
||||
Ok(())
|
||||
}
|
||||
// something batched already but this message is unbatchable
|
||||
(_, this_msg) => {
|
||||
// by default, don't continue batching
|
||||
|
||||
Err(Ok(this_msg))
|
||||
}
|
||||
None => {
|
||||
// ok to batch
|
||||
match (eligible_batch, this_msg) {
|
||||
(
|
||||
BatchedFeMessage::GetPage {
|
||||
pages: accum_pages, ..
|
||||
},
|
||||
BatchedFeMessage::GetPage {
|
||||
pages: this_pages, ..
|
||||
},
|
||||
) => {
|
||||
accum_pages.extend(this_pages);
|
||||
Ok(())
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
(
|
||||
BatchedFeMessage::Test {
|
||||
requests: accum_requests,
|
||||
..
|
||||
},
|
||||
BatchedFeMessage::Test {
|
||||
requests: this_requests,
|
||||
..
|
||||
},
|
||||
) => {
|
||||
accum_requests.extend(this_requests);
|
||||
Ok(())
|
||||
}
|
||||
// Shape guaranteed by [`BatchedFeMessage::should_break_batch`]
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1393,8 +1475,8 @@ impl PageServerHandler {
|
||||
BatchedFeMessage::GetPage {
|
||||
span,
|
||||
shard,
|
||||
effective_request_lsn,
|
||||
pages,
|
||||
batch_break_reason,
|
||||
} => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::getpage");
|
||||
let (shard, ctx) = upgrade_handle_and_set_context!(shard);
|
||||
@@ -1405,9 +1487,9 @@ impl PageServerHandler {
|
||||
let res = self
|
||||
.handle_get_page_at_lsn_request_batched(
|
||||
&shard,
|
||||
effective_request_lsn,
|
||||
pages,
|
||||
io_concurrency,
|
||||
batch_break_reason,
|
||||
&ctx,
|
||||
)
|
||||
.instrument(span.clone())
|
||||
@@ -1724,6 +1806,7 @@ impl PageServerHandler {
|
||||
let PageServicePipeliningConfigPipelined {
|
||||
max_batch_size,
|
||||
execution,
|
||||
batching: batching_strategy,
|
||||
} = pipelining_config;
|
||||
|
||||
// Macro to _define_ a pipeline stage.
|
||||
@@ -1775,7 +1858,7 @@ impl PageServerHandler {
|
||||
exit |= read_res.is_err();
|
||||
let could_send = batch_tx
|
||||
.send(read_res, |batch, res| {
|
||||
Self::pagestream_do_batch(max_batch_size, batch, res)
|
||||
Self::pagestream_do_batch(batching_strategy, max_batch_size, batch, res)
|
||||
})
|
||||
.await;
|
||||
exit |= could_send.is_err();
|
||||
@@ -1871,7 +1954,39 @@ impl PageServerHandler {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Lsn, PageStreamError> {
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
let effective_request_lsn = Self::effective_request_lsn(
|
||||
timeline,
|
||||
last_record_lsn,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
latest_gc_cutoff_lsn,
|
||||
)?;
|
||||
|
||||
if effective_request_lsn > last_record_lsn {
|
||||
timeline
|
||||
.wait_lsn(
|
||||
not_modified_since,
|
||||
crate::tenant::timeline::WaitLsnWaiter::PageService,
|
||||
timeline::WaitLsnTimeout::Default,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Since we waited for 'effective_request_lsn' to arrive, that is now the last
|
||||
// record LSN. (Or close enough for our purposes; the last-record LSN can
|
||||
// advance immediately after we return anyway)
|
||||
}
|
||||
|
||||
Ok(effective_request_lsn)
|
||||
}
|
||||
|
||||
fn effective_request_lsn(
|
||||
timeline: &Timeline,
|
||||
last_record_lsn: Lsn,
|
||||
request_lsn: Lsn,
|
||||
not_modified_since: Lsn,
|
||||
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
|
||||
) -> Result<Lsn, PageStreamError> {
|
||||
// Sanity check the request
|
||||
if request_lsn < not_modified_since {
|
||||
return Err(PageStreamError::BadRequest(
|
||||
@@ -1906,19 +2021,7 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for WAL up to 'not_modified_since' to arrive, if necessary
|
||||
if not_modified_since > last_record_lsn {
|
||||
timeline
|
||||
.wait_lsn(
|
||||
not_modified_since,
|
||||
crate::tenant::timeline::WaitLsnWaiter::PageService,
|
||||
timeline::WaitLsnTimeout::Default,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
// Since we waited for 'not_modified_since' to arrive, that is now the last
|
||||
// record LSN. (Or close enough for our purposes; the last-record LSN can
|
||||
// advance immediately after we return anyway)
|
||||
Ok(not_modified_since)
|
||||
} else {
|
||||
// It might be better to use max(not_modified_since, latest_gc_cutoff_lsn)
|
||||
@@ -2073,16 +2176,16 @@ impl PageServerHandler {
|
||||
async fn handle_get_page_at_lsn_request_batched(
|
||||
&mut self,
|
||||
timeline: &Timeline,
|
||||
effective_lsn: Lsn,
|
||||
requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
|
||||
io_concurrency: IoConcurrency,
|
||||
batch_break_reason: GetPageBatchBreakReason,
|
||||
ctx: &RequestContext,
|
||||
) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
timeline
|
||||
.query_metrics
|
||||
.observe_getpage_batch_start(requests.len());
|
||||
.observe_getpage_batch_start(requests.len(), batch_break_reason);
|
||||
|
||||
// If a page trace is running, submit an event for this request.
|
||||
if let Some(page_trace) = timeline.page_trace.load().as_ref() {
|
||||
@@ -2092,20 +2195,81 @@ impl PageServerHandler {
|
||||
// Ignore error (trace buffer may be full or tracer may have disconnected).
|
||||
_ = page_trace.try_send(PageTraceEvent {
|
||||
key,
|
||||
effective_lsn,
|
||||
effective_lsn: batch.effective_request_lsn,
|
||||
time,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// If any request in the batch needs to wait for LSN, then do so now.
|
||||
let mut perf_instrument = false;
|
||||
let max_effective_lsn = requests
|
||||
.iter()
|
||||
.map(|req| {
|
||||
if req.ctx.has_perf_span() {
|
||||
perf_instrument = true;
|
||||
}
|
||||
|
||||
req.effective_request_lsn
|
||||
})
|
||||
.max()
|
||||
.expect("batch is never empty");
|
||||
|
||||
let ctx = match perf_instrument {
|
||||
true => RequestContextBuilder::from(ctx)
|
||||
.root_perf_span(|| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
"GET_VECTORED",
|
||||
tenant_id = %timeline.tenant_shard_id.tenant_id,
|
||||
timeline_id = %timeline.timeline_id,
|
||||
shard = %timeline.tenant_shard_id.shard_slug(),
|
||||
%max_effective_lsn
|
||||
)
|
||||
})
|
||||
.attached_child(),
|
||||
false => ctx.attached_child(),
|
||||
};
|
||||
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
if max_effective_lsn > last_record_lsn {
|
||||
if let Err(e) = timeline
|
||||
.wait_lsn(
|
||||
max_effective_lsn,
|
||||
crate::tenant::timeline::WaitLsnWaiter::PageService,
|
||||
timeline::WaitLsnTimeout::Default,
|
||||
&ctx,
|
||||
)
|
||||
.maybe_perf_instrument(&ctx, |current_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
parent: current_perf_span,
|
||||
"WAIT_LSN",
|
||||
)
|
||||
})
|
||||
.await
|
||||
{
|
||||
return Vec::from_iter(requests.into_iter().map(|req| {
|
||||
Err(BatchedPageStreamError {
|
||||
err: PageStreamError::from(e.clone()),
|
||||
req: req.req.hdr,
|
||||
})
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
let results = timeline
|
||||
.get_rel_page_at_lsn_batched(
|
||||
requests
|
||||
.iter()
|
||||
.map(|p| (&p.req.rel, &p.req.blkno, p.ctx.attached_child())),
|
||||
effective_lsn,
|
||||
requests.iter().map(|p| {
|
||||
(
|
||||
&p.req.rel,
|
||||
&p.req.blkno,
|
||||
p.effective_request_lsn,
|
||||
p.ctx.attached_child(),
|
||||
)
|
||||
}),
|
||||
io_concurrency,
|
||||
ctx,
|
||||
&ctx,
|
||||
)
|
||||
.await;
|
||||
assert_eq!(results.len(), requests.len());
|
||||
|
||||
@@ -6,14 +6,14 @@
|
||||
//! walingest.rs handles a few things like implicit relation creation and extension.
|
||||
//! Clarify that)
|
||||
//!
|
||||
use std::collections::{BTreeMap, HashMap, HashSet, hash_map};
|
||||
use std::collections::{HashMap, HashSet, hash_map};
|
||||
use std::ops::{ControlFlow, Range};
|
||||
|
||||
use crate::PERF_TRACE_TARGET;
|
||||
use anyhow::{Context, ensure};
|
||||
use crate::walingest::{WalIngestError, WalIngestErrorKind};
|
||||
use crate::{PERF_TRACE_TARGET, ensure_walingest};
|
||||
use anyhow::Context;
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use enum_map::Enum;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::{
|
||||
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, CompactKey, DBDIR_KEY, Key, RelDirExists,
|
||||
TWOPHASEDIR_KEY, dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range,
|
||||
@@ -21,7 +21,7 @@ use pageserver_api::key::{
|
||||
repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
|
||||
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
|
||||
};
|
||||
use pageserver_api::keyspace::SparseKeySpace;
|
||||
use pageserver_api::keyspace::{KeySpaceRandomAccum, SparseKeySpace};
|
||||
use pageserver_api::models::RelSizeMigration;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
@@ -40,7 +40,7 @@ use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
|
||||
|
||||
use super::tenant::{PageReconstructError, Timeline};
|
||||
use crate::aux_file;
|
||||
use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder};
|
||||
use crate::context::{PerfInstrumentFutureExt, RequestContext};
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::metrics::{
|
||||
RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
|
||||
@@ -50,7 +50,7 @@ use crate::span::{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
|
||||
};
|
||||
use crate::tenant::storage_layer::IoConcurrency;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::timeline::{GetVectoredError, VersionedKeySpaceQuery};
|
||||
|
||||
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
|
||||
pub const MAX_AUX_FILE_DELTAS: usize = 1024;
|
||||
@@ -136,12 +136,8 @@ impl From<PageReconstructError> for CalculateLogicalSizeError {
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum RelationError {
|
||||
#[error("Relation Already Exists")]
|
||||
AlreadyExists,
|
||||
#[error("invalid relnode")]
|
||||
InvalidRelnode,
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
///
|
||||
@@ -210,10 +206,9 @@ impl Timeline {
|
||||
let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)];
|
||||
let res = self
|
||||
.get_rel_page_at_lsn_batched(
|
||||
pages
|
||||
.iter()
|
||||
.map(|(tag, blknum)| (tag, blknum, ctx.attached_child())),
|
||||
effective_lsn,
|
||||
pages.iter().map(|(tag, blknum)| {
|
||||
(tag, blknum, effective_lsn, ctx.attached_child())
|
||||
}),
|
||||
io_concurrency.clone(),
|
||||
ctx,
|
||||
)
|
||||
@@ -251,8 +246,7 @@ impl Timeline {
|
||||
/// The ordering of the returned vec corresponds to the ordering of `pages`.
|
||||
pub(crate) async fn get_rel_page_at_lsn_batched(
|
||||
&self,
|
||||
pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, RequestContext)>,
|
||||
effective_lsn: Lsn,
|
||||
pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, Lsn, RequestContext)>,
|
||||
io_concurrency: IoConcurrency,
|
||||
ctx: &RequestContext,
|
||||
) -> Vec<Result<Bytes, PageReconstructError>> {
|
||||
@@ -265,11 +259,13 @@ impl Timeline {
|
||||
let mut result = Vec::with_capacity(pages.len());
|
||||
let result_slots = result.spare_capacity_mut();
|
||||
|
||||
let mut keys_slots: BTreeMap<Key, smallvec::SmallVec<[(usize, RequestContext); 1]>> =
|
||||
BTreeMap::default();
|
||||
let mut keys_slots: HashMap<Key, smallvec::SmallVec<[(usize, RequestContext); 1]>> =
|
||||
HashMap::with_capacity(pages.len());
|
||||
|
||||
let mut perf_instrument = false;
|
||||
for (response_slot_idx, (tag, blknum, ctx)) in pages.enumerate() {
|
||||
let mut req_keyspaces: HashMap<Lsn, KeySpaceRandomAccum> =
|
||||
HashMap::with_capacity(pages.len());
|
||||
|
||||
for (response_slot_idx, (tag, blknum, lsn, ctx)) in pages.enumerate() {
|
||||
if tag.relnode == 0 {
|
||||
result_slots[response_slot_idx].write(Err(PageReconstructError::Other(
|
||||
RelationError::InvalidRelnode.into(),
|
||||
@@ -280,14 +276,14 @@ impl Timeline {
|
||||
}
|
||||
|
||||
let nblocks = match self
|
||||
.get_rel_size(*tag, Version::Lsn(effective_lsn), &ctx)
|
||||
.get_rel_size(*tag, Version::Lsn(lsn), &ctx)
|
||||
.maybe_perf_instrument(&ctx, |crnt_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
parent: crnt_perf_span,
|
||||
"GET_REL_SIZE",
|
||||
reltag=%tag,
|
||||
lsn=%effective_lsn,
|
||||
lsn=%lsn,
|
||||
)
|
||||
})
|
||||
.await
|
||||
@@ -303,7 +299,7 @@ impl Timeline {
|
||||
if *blknum >= nblocks {
|
||||
debug!(
|
||||
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
|
||||
tag, blknum, effective_lsn, nblocks
|
||||
tag, blknum, lsn, nblocks
|
||||
);
|
||||
result_slots[response_slot_idx].write(Ok(ZERO_PAGE.clone()));
|
||||
slots_filled += 1;
|
||||
@@ -312,46 +308,29 @@ impl Timeline {
|
||||
|
||||
let key = rel_block_to_key(*tag, *blknum);
|
||||
|
||||
if ctx.has_perf_span() {
|
||||
perf_instrument = true;
|
||||
}
|
||||
|
||||
let key_slots = keys_slots.entry(key).or_default();
|
||||
key_slots.push((response_slot_idx, ctx));
|
||||
|
||||
let acc = req_keyspaces.entry(lsn).or_default();
|
||||
acc.add_key(key);
|
||||
}
|
||||
|
||||
let keyspace = {
|
||||
// add_key requires monotonicity
|
||||
let mut acc = KeySpaceAccum::new();
|
||||
for key in keys_slots
|
||||
.keys()
|
||||
// in fact it requires strong monotonicity
|
||||
.dedup()
|
||||
{
|
||||
acc.add_key(*key);
|
||||
}
|
||||
acc.to_keyspace()
|
||||
};
|
||||
|
||||
let ctx = match perf_instrument {
|
||||
true => RequestContextBuilder::from(ctx)
|
||||
.root_perf_span(|| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
"GET_VECTORED",
|
||||
tenant_id = %self.tenant_shard_id.tenant_id,
|
||||
timeline_id = %self.timeline_id,
|
||||
lsn = %effective_lsn,
|
||||
shard = %self.tenant_shard_id.shard_slug(),
|
||||
)
|
||||
})
|
||||
.attached_child(),
|
||||
false => ctx.attached_child(),
|
||||
};
|
||||
let query: Vec<(Lsn, KeySpace)> = req_keyspaces
|
||||
.into_iter()
|
||||
.map(|(lsn, acc)| (lsn, acc.to_keyspace()))
|
||||
.collect();
|
||||
|
||||
let query = VersionedKeySpaceQuery::scattered(query);
|
||||
let res = self
|
||||
.get_vectored(keyspace, effective_lsn, io_concurrency, &ctx)
|
||||
.maybe_perf_instrument(&ctx, |current_perf_span| current_perf_span.clone())
|
||||
.get_vectored(query, io_concurrency, ctx)
|
||||
.maybe_perf_instrument(ctx, |current_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
parent: current_perf_span,
|
||||
"GET_BATCH",
|
||||
batch_size = %page_count,
|
||||
)
|
||||
})
|
||||
.await;
|
||||
|
||||
match res {
|
||||
@@ -381,12 +360,12 @@ impl Timeline {
|
||||
// There is no standardized way to express that the batched span followed from N request spans.
|
||||
// So, abuse the system and mark the request contexts as follows_from the batch span, so we get
|
||||
// some linkage in our trace viewer. It allows us to answer: which GET_VECTORED did this GET_PAGE wait for.
|
||||
req_ctx.perf_follows_from(&ctx);
|
||||
req_ctx.perf_follows_from(ctx);
|
||||
slots_filled += 1;
|
||||
}
|
||||
|
||||
result_slots[first_slot].write(res);
|
||||
first_req_ctx.perf_follows_from(&ctx);
|
||||
first_req_ctx.perf_follows_from(ctx);
|
||||
slots_filled += 1;
|
||||
}
|
||||
}
|
||||
@@ -425,7 +404,7 @@ impl Timeline {
|
||||
}
|
||||
};
|
||||
|
||||
req_ctx.perf_follows_from(&ctx);
|
||||
req_ctx.perf_follows_from(ctx);
|
||||
result_slots[*slot].write(err);
|
||||
}
|
||||
|
||||
@@ -664,8 +643,9 @@ impl Timeline {
|
||||
|
||||
let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
|
||||
for batch in batches.parts {
|
||||
let query = VersionedKeySpaceQuery::uniform(batch, lsn);
|
||||
let blocks = self
|
||||
.get_vectored(batch, lsn, io_concurrency.clone(), ctx)
|
||||
.get_vectored(query, io_concurrency.clone(), ctx)
|
||||
.await?;
|
||||
|
||||
for (_key, block) in blocks {
|
||||
@@ -902,8 +882,9 @@ impl Timeline {
|
||||
);
|
||||
|
||||
for batch in batches.parts.into_iter().rev() {
|
||||
let query = VersionedKeySpaceQuery::uniform(batch, probe_lsn);
|
||||
let blocks = self
|
||||
.get_vectored(batch, probe_lsn, io_concurrency.clone(), ctx)
|
||||
.get_vectored(query, io_concurrency.clone(), ctx)
|
||||
.await?;
|
||||
|
||||
for (_key, clog_page) in blocks.into_iter().rev() {
|
||||
@@ -1478,8 +1459,8 @@ impl DatadirModification<'_> {
|
||||
}
|
||||
|
||||
/// Set the current lsn
|
||||
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
|
||||
ensure!(
|
||||
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> Result<(), WalIngestError> {
|
||||
ensure_walingest!(
|
||||
lsn >= self.lsn,
|
||||
"setting an older lsn {} than {} is not allowed",
|
||||
lsn,
|
||||
@@ -1578,7 +1559,7 @@ impl DatadirModification<'_> {
|
||||
&mut self,
|
||||
rel: RelTag,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<u32, PageReconstructError> {
|
||||
) -> Result<u32, WalIngestError> {
|
||||
// Get current size and put rel creation if rel doesn't exist
|
||||
//
|
||||
// NOTE: we check the cache first even though get_rel_exists and get_rel_size would
|
||||
@@ -1593,14 +1574,13 @@ impl DatadirModification<'_> {
|
||||
.await?
|
||||
{
|
||||
// create it with 0 size initially, the logic below will extend it
|
||||
self.put_rel_creation(rel, 0, ctx)
|
||||
.await
|
||||
.context("Relation Error")?;
|
||||
self.put_rel_creation(rel, 0, ctx).await?;
|
||||
Ok(0)
|
||||
} else {
|
||||
self.tline
|
||||
Ok(self
|
||||
.tline
|
||||
.get_rel_size(rel, Version::Modified(self), ctx)
|
||||
.await
|
||||
.await?)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1637,11 +1617,14 @@ impl DatadirModification<'_> {
|
||||
// TODO(vlad): remove this argument and replace the shard check with is_key_local
|
||||
shard: &ShardIdentity,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let mut gaps_at_lsns = Vec::default();
|
||||
|
||||
for meta in batch.metadata.iter() {
|
||||
let (rel, blkno) = Key::from_compact(meta.key()).to_rel_block()?;
|
||||
let key = Key::from_compact(meta.key());
|
||||
let (rel, blkno) = key
|
||||
.to_rel_block()
|
||||
.map_err(|_| WalIngestErrorKind::InvalidKey(key, meta.lsn()))?;
|
||||
let new_nblocks = blkno + 1;
|
||||
|
||||
let old_nblocks = self.create_relation_if_required(rel, ctx).await?;
|
||||
@@ -1683,8 +1666,8 @@ impl DatadirModification<'_> {
|
||||
rel: RelTag,
|
||||
blknum: BlockNumber,
|
||||
rec: NeonWalRecord,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
) -> Result<(), WalIngestError> {
|
||||
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
|
||||
Ok(())
|
||||
}
|
||||
@@ -1696,7 +1679,7 @@ impl DatadirModification<'_> {
|
||||
segno: u32,
|
||||
blknum: BlockNumber,
|
||||
rec: NeonWalRecord,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
if !self.tline.tenant_shard_id.is_shard_zero() {
|
||||
return Ok(());
|
||||
}
|
||||
@@ -1714,14 +1697,11 @@ impl DatadirModification<'_> {
|
||||
rel: RelTag,
|
||||
blknum: BlockNumber,
|
||||
img: Bytes,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
) -> Result<(), WalIngestError> {
|
||||
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
let key = rel_block_to_key(rel, blknum);
|
||||
if !key.is_valid_key_on_write_path() {
|
||||
anyhow::bail!(
|
||||
"the request contains data not supported by pageserver at {}",
|
||||
key
|
||||
);
|
||||
Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
|
||||
}
|
||||
self.put(rel_block_to_key(rel, blknum), Value::Image(img));
|
||||
Ok(())
|
||||
@@ -1733,15 +1713,12 @@ impl DatadirModification<'_> {
|
||||
segno: u32,
|
||||
blknum: BlockNumber,
|
||||
img: Bytes,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
assert!(self.tline.tenant_shard_id.is_shard_zero());
|
||||
|
||||
let key = slru_block_to_key(kind, segno, blknum);
|
||||
if !key.is_valid_key_on_write_path() {
|
||||
anyhow::bail!(
|
||||
"the request contains data not supported by pageserver at {}",
|
||||
key
|
||||
);
|
||||
Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
|
||||
}
|
||||
self.put(key, Value::Image(img));
|
||||
Ok(())
|
||||
@@ -1751,15 +1728,11 @@ impl DatadirModification<'_> {
|
||||
&mut self,
|
||||
rel: RelTag,
|
||||
blknum: BlockNumber,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
) -> Result<(), WalIngestError> {
|
||||
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
let key = rel_block_to_key(rel, blknum);
|
||||
if !key.is_valid_key_on_write_path() {
|
||||
anyhow::bail!(
|
||||
"the request contains data not supported by pageserver: {} @ {}",
|
||||
key,
|
||||
self.lsn
|
||||
);
|
||||
Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
|
||||
}
|
||||
|
||||
let batch = self
|
||||
@@ -1776,15 +1749,11 @@ impl DatadirModification<'_> {
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
blknum: BlockNumber,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
assert!(self.tline.tenant_shard_id.is_shard_zero());
|
||||
let key = slru_block_to_key(kind, segno, blknum);
|
||||
if !key.is_valid_key_on_write_path() {
|
||||
anyhow::bail!(
|
||||
"the request contains data not supported by pageserver: {} @ {}",
|
||||
key,
|
||||
self.lsn
|
||||
);
|
||||
Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
|
||||
}
|
||||
|
||||
let batch = self
|
||||
@@ -1832,8 +1801,10 @@ impl DatadirModification<'_> {
|
||||
dbnode: Oid,
|
||||
img: Bytes,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let v2_enabled = self.maybe_enable_rel_size_v2()?;
|
||||
) -> Result<(), WalIngestError> {
|
||||
let v2_enabled = self
|
||||
.maybe_enable_rel_size_v2()
|
||||
.map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
|
||||
|
||||
// Add it to the directory (if it doesn't exist already)
|
||||
let buf = self.get(DBDIR_KEY, ctx).await?;
|
||||
@@ -1874,13 +1845,13 @@ impl DatadirModification<'_> {
|
||||
xid: u64,
|
||||
img: Bytes,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
// Add it to the directory entry
|
||||
let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?;
|
||||
let newdirbuf = if self.tline.pg_version >= 17 {
|
||||
let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?;
|
||||
if !dir.xids.insert(xid) {
|
||||
anyhow::bail!("twophase file for xid {} already exists", xid);
|
||||
Err(WalIngestErrorKind::FileAlreadyExists(xid))?;
|
||||
}
|
||||
self.pending_directory_entries.push((
|
||||
DirectoryKind::TwoPhase,
|
||||
@@ -1891,7 +1862,7 @@ impl DatadirModification<'_> {
|
||||
let xid = xid as u32;
|
||||
let mut dir = TwoPhaseDirectory::des(&dirbuf)?;
|
||||
if !dir.xids.insert(xid) {
|
||||
anyhow::bail!("twophase file for xid {} already exists", xid);
|
||||
Err(WalIngestErrorKind::FileAlreadyExists(xid.into()))?;
|
||||
}
|
||||
self.pending_directory_entries.push((
|
||||
DirectoryKind::TwoPhase,
|
||||
@@ -1909,22 +1880,22 @@ impl DatadirModification<'_> {
|
||||
&mut self,
|
||||
origin_id: RepOriginId,
|
||||
origin_lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let key = repl_origin_key(origin_id);
|
||||
self.put(key, Value::Image(origin_lsn.ser().unwrap().into()));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> {
|
||||
pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> Result<(), WalIngestError> {
|
||||
self.set_replorigin(origin_id, Lsn::INVALID).await
|
||||
}
|
||||
|
||||
pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
|
||||
pub fn put_control_file(&mut self, img: Bytes) -> Result<(), WalIngestError> {
|
||||
self.put(CONTROLFILE_KEY, Value::Image(img));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> {
|
||||
pub fn put_checkpoint(&mut self, img: Bytes) -> Result<(), WalIngestError> {
|
||||
self.put(CHECKPOINT_KEY, Value::Image(img));
|
||||
Ok(())
|
||||
}
|
||||
@@ -1934,7 +1905,7 @@ impl DatadirModification<'_> {
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let total_blocks = self
|
||||
.tline
|
||||
.get_db_size(spcnode, dbnode, Version::Modified(self), ctx)
|
||||
@@ -1973,20 +1944,21 @@ impl DatadirModification<'_> {
|
||||
rel: RelTag,
|
||||
nblocks: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), RelationError> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
if rel.relnode == 0 {
|
||||
return Err(RelationError::InvalidRelnode);
|
||||
Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!(
|
||||
"invalid relnode"
|
||||
)))?;
|
||||
}
|
||||
// It's possible that this is the first rel for this db in this
|
||||
// tablespace. Create the reldir entry for it if so.
|
||||
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
|
||||
.context("deserialize db")?;
|
||||
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?;
|
||||
|
||||
let dbdir_exists =
|
||||
if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) {
|
||||
// Didn't exist. Update dbdir
|
||||
e.insert(false);
|
||||
let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
|
||||
let buf = DbDirectory::ser(&dbdir)?;
|
||||
self.pending_directory_entries.push((
|
||||
DirectoryKind::Db,
|
||||
MetricsUpdate::Set(dbdir.dbdirs.len() as u64),
|
||||
@@ -2003,27 +1975,25 @@ impl DatadirModification<'_> {
|
||||
RelDirectory::default()
|
||||
} else {
|
||||
// reldir already exists, fetch it
|
||||
RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
|
||||
.context("deserialize db")?
|
||||
RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
|
||||
};
|
||||
|
||||
let v2_enabled = self.maybe_enable_rel_size_v2()?;
|
||||
let v2_enabled = self
|
||||
.maybe_enable_rel_size_v2()
|
||||
.map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
|
||||
|
||||
if v2_enabled {
|
||||
if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) {
|
||||
return Err(RelationError::AlreadyExists);
|
||||
Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
|
||||
}
|
||||
let sparse_rel_dir_key =
|
||||
rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
|
||||
// check if the rel_dir_key exists in v2
|
||||
let val = self
|
||||
.sparse_get(sparse_rel_dir_key, ctx)
|
||||
.await
|
||||
.map_err(|e| RelationError::Other(e.into()))?;
|
||||
let val = self.sparse_get(sparse_rel_dir_key, ctx).await?;
|
||||
let val = RelDirExists::decode_option(val)
|
||||
.map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
|
||||
.map_err(|_| WalIngestErrorKind::InvalidRelDirKey(sparse_rel_dir_key))?;
|
||||
if val == RelDirExists::Exists {
|
||||
return Err(RelationError::AlreadyExists);
|
||||
Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
|
||||
}
|
||||
self.put(
|
||||
sparse_rel_dir_key,
|
||||
@@ -2039,9 +2009,7 @@ impl DatadirModification<'_> {
|
||||
// will be key not found errors if we don't create an empty one for rel_size_v2.
|
||||
self.put(
|
||||
rel_dir_key,
|
||||
Value::Image(Bytes::from(
|
||||
RelDirectory::ser(&RelDirectory::default()).context("serialize")?,
|
||||
)),
|
||||
Value::Image(Bytes::from(RelDirectory::ser(&RelDirectory::default())?)),
|
||||
);
|
||||
}
|
||||
self.pending_directory_entries
|
||||
@@ -2049,7 +2017,7 @@ impl DatadirModification<'_> {
|
||||
} else {
|
||||
// Add the new relation to the rel directory entry, and write it back
|
||||
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
|
||||
return Err(RelationError::AlreadyExists);
|
||||
Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
|
||||
}
|
||||
if !dbdir_exists {
|
||||
self.pending_directory_entries
|
||||
@@ -2059,9 +2027,7 @@ impl DatadirModification<'_> {
|
||||
.push((DirectoryKind::Rel, MetricsUpdate::Add(1)));
|
||||
self.put(
|
||||
rel_dir_key,
|
||||
Value::Image(Bytes::from(
|
||||
RelDirectory::ser(&rel_dir).context("serialize")?,
|
||||
)),
|
||||
Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)),
|
||||
);
|
||||
}
|
||||
|
||||
@@ -2086,8 +2052,8 @@ impl DatadirModification<'_> {
|
||||
rel: RelTag,
|
||||
nblocks: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
) -> Result<(), WalIngestError> {
|
||||
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
if self
|
||||
.tline
|
||||
.get_rel_exists(rel, Version::Modified(self), ctx)
|
||||
@@ -2117,8 +2083,8 @@ impl DatadirModification<'_> {
|
||||
rel: RelTag,
|
||||
nblocks: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
) -> Result<(), WalIngestError> {
|
||||
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
|
||||
// Put size
|
||||
let size_key = rel_size_to_key(rel);
|
||||
@@ -2142,8 +2108,10 @@ impl DatadirModification<'_> {
|
||||
&mut self,
|
||||
drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let v2_enabled = self.maybe_enable_rel_size_v2()?;
|
||||
) -> Result<(), WalIngestError> {
|
||||
let v2_enabled = self
|
||||
.maybe_enable_rel_size_v2()
|
||||
.map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
|
||||
for ((spc_node, db_node), rel_tags) in drop_relations {
|
||||
let dir_key = rel_dir_to_key(spc_node, db_node);
|
||||
let buf = self.get(dir_key, ctx).await?;
|
||||
@@ -2163,7 +2131,7 @@ impl DatadirModification<'_> {
|
||||
let key =
|
||||
rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum);
|
||||
let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?)
|
||||
.map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
|
||||
.map_err(|_| WalIngestErrorKind::InvalidKey(key, self.lsn))?;
|
||||
if val == RelDirExists::Exists {
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::RelV2, MetricsUpdate::Sub(1)));
|
||||
@@ -2206,7 +2174,7 @@ impl DatadirModification<'_> {
|
||||
segno: u32,
|
||||
nblocks: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
assert!(self.tline.tenant_shard_id.is_shard_zero());
|
||||
|
||||
// Add it to the directory entry
|
||||
@@ -2215,7 +2183,7 @@ impl DatadirModification<'_> {
|
||||
let mut dir = SlruSegmentDirectory::des(&buf)?;
|
||||
|
||||
if !dir.segments.insert(segno) {
|
||||
anyhow::bail!("slru segment {kind:?}/{segno} already exists");
|
||||
Err(WalIngestErrorKind::SlruAlreadyExists(kind, segno))?;
|
||||
}
|
||||
self.pending_directory_entries.push((
|
||||
DirectoryKind::SlruSegment(kind),
|
||||
@@ -2242,7 +2210,7 @@ impl DatadirModification<'_> {
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
nblocks: BlockNumber,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
assert!(self.tline.tenant_shard_id.is_shard_zero());
|
||||
|
||||
// Put size
|
||||
@@ -2258,7 +2226,7 @@ impl DatadirModification<'_> {
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
// Remove it from the directory entry
|
||||
let dir_key = slru_dir_to_key(kind);
|
||||
let buf = self.get(dir_key, ctx).await?;
|
||||
@@ -2283,7 +2251,7 @@ impl DatadirModification<'_> {
|
||||
}
|
||||
|
||||
/// Drop a relmapper file (pg_filenode.map)
|
||||
pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> {
|
||||
pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<(), WalIngestError> {
|
||||
// TODO
|
||||
Ok(())
|
||||
}
|
||||
@@ -2293,7 +2261,7 @@ impl DatadirModification<'_> {
|
||||
&mut self,
|
||||
xid: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
// Remove it from the directory entry
|
||||
let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
|
||||
let newdirbuf = if self.tline.pg_version >= 17 {
|
||||
@@ -2308,7 +2276,8 @@ impl DatadirModification<'_> {
|
||||
));
|
||||
Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
|
||||
} else {
|
||||
let xid: u32 = u32::try_from(xid)?;
|
||||
let xid: u32 = u32::try_from(xid)
|
||||
.map_err(|e| WalIngestErrorKind::LogicalError(anyhow::Error::from(e)))?;
|
||||
let mut dir = TwoPhaseDirectory::des(&buf)?;
|
||||
|
||||
if !dir.xids.remove(&xid) {
|
||||
@@ -2333,7 +2302,7 @@ impl DatadirModification<'_> {
|
||||
path: &str,
|
||||
content: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let key = aux_file::encode_aux_file_key(path);
|
||||
// retrieve the key from the engine
|
||||
let old_val = match self.get(key, ctx).await {
|
||||
@@ -2342,7 +2311,7 @@ impl DatadirModification<'_> {
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
|
||||
aux_file::decode_file_value(old_val)?
|
||||
aux_file::decode_file_value(old_val).map_err(WalIngestErrorKind::EncodeAuxFileError)?
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
@@ -2387,7 +2356,8 @@ impl DatadirModification<'_> {
|
||||
}
|
||||
(None, true) => warn!("removing non-existing aux file: {}", path),
|
||||
}
|
||||
let new_val = aux_file::encode_file_value(&new_files)?;
|
||||
let new_val = aux_file::encode_file_value(&new_files)
|
||||
.map_err(WalIngestErrorKind::EncodeAuxFileError)?;
|
||||
self.put(key, Value::Image(new_val.into()));
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -5933,12 +5933,20 @@ mod tests {
|
||||
use models::CompactLsnRange;
|
||||
use pageserver_api::key::{AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
#[cfg(feature = "testing")]
|
||||
use pageserver_api::keyspace::KeySpaceRandomAccum;
|
||||
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
|
||||
#[cfg(feature = "testing")]
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::value::Value;
|
||||
use pageserver_compaction::helpers::overlaps_with;
|
||||
#[cfg(feature = "testing")]
|
||||
use rand::SeedableRng;
|
||||
#[cfg(feature = "testing")]
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, thread_rng};
|
||||
#[cfg(feature = "testing")]
|
||||
use std::ops::Range;
|
||||
use storage_layer::{IoConcurrency, PersistentLayerKey};
|
||||
use tests::storage_layer::ValuesReconstructState;
|
||||
use tests::timeline::{GetVectoredError, ShutdownMode};
|
||||
@@ -5948,7 +5956,7 @@ mod tests {
|
||||
use timeline::InMemoryLayerTestDesc;
|
||||
#[cfg(feature = "testing")]
|
||||
use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
|
||||
use timeline::{CompactOptions, DeltaLayerTestDesc};
|
||||
use timeline::{CompactOptions, DeltaLayerTestDesc, VersionedKeySpaceQuery};
|
||||
use utils::id::TenantId;
|
||||
|
||||
use super::*;
|
||||
@@ -5960,6 +5968,318 @@ mod tests {
|
||||
static TEST_KEY: Lazy<Key> =
|
||||
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
struct TestTimelineSpecification {
|
||||
start_lsn: Lsn,
|
||||
last_record_lsn: Lsn,
|
||||
|
||||
in_memory_layers_shape: Vec<(Range<Key>, Range<Lsn>)>,
|
||||
delta_layers_shape: Vec<(Range<Key>, Range<Lsn>)>,
|
||||
image_layers_shape: Vec<(Range<Key>, Lsn)>,
|
||||
|
||||
gap_chance: u8,
|
||||
will_init_chance: u8,
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
struct Storage {
|
||||
storage: HashMap<(Key, Lsn), Value>,
|
||||
start_lsn: Lsn,
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
impl Storage {
|
||||
fn get(&self, key: Key, lsn: Lsn) -> Bytes {
|
||||
use bytes::BufMut;
|
||||
|
||||
let mut crnt_lsn = lsn;
|
||||
let mut got_base = false;
|
||||
|
||||
let mut acc = Vec::new();
|
||||
|
||||
while crnt_lsn >= self.start_lsn {
|
||||
if let Some(value) = self.storage.get(&(key, crnt_lsn)) {
|
||||
acc.push(value.clone());
|
||||
|
||||
match value {
|
||||
Value::WalRecord(NeonWalRecord::Test { will_init, .. }) => {
|
||||
if *will_init {
|
||||
got_base = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
Value::Image(_) => {
|
||||
got_base = true;
|
||||
break;
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
crnt_lsn = crnt_lsn.checked_sub(1u64).unwrap();
|
||||
}
|
||||
|
||||
assert!(
|
||||
got_base,
|
||||
"Input data was incorrect. No base image for {key}@{lsn}"
|
||||
);
|
||||
|
||||
tracing::debug!("Wal redo depth for {key}@{lsn} is {}", acc.len());
|
||||
|
||||
let mut blob = BytesMut::new();
|
||||
for value in acc.into_iter().rev() {
|
||||
match value {
|
||||
Value::WalRecord(NeonWalRecord::Test { append, .. }) => {
|
||||
blob.extend_from_slice(append.as_bytes());
|
||||
}
|
||||
Value::Image(img) => {
|
||||
blob.put(img);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
blob.into()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn randomize_timeline(
|
||||
tenant: &Arc<Tenant>,
|
||||
new_timeline_id: TimelineId,
|
||||
pg_version: u32,
|
||||
spec: TestTimelineSpecification,
|
||||
random: &mut rand::rngs::StdRng,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<(Arc<Timeline>, Storage, Vec<Lsn>)> {
|
||||
let mut storage: HashMap<(Key, Lsn), Value> = HashMap::default();
|
||||
let mut interesting_lsns = vec![spec.last_record_lsn];
|
||||
|
||||
for (key_range, lsn_range) in spec.in_memory_layers_shape.iter() {
|
||||
let mut lsn = lsn_range.start;
|
||||
while lsn < lsn_range.end {
|
||||
let mut key = key_range.start;
|
||||
while key < key_range.end {
|
||||
let gap = random.gen_range(1..=100) <= spec.gap_chance;
|
||||
let will_init = random.gen_range(1..=100) <= spec.will_init_chance;
|
||||
|
||||
if gap {
|
||||
continue;
|
||||
}
|
||||
|
||||
let record = if will_init {
|
||||
Value::WalRecord(NeonWalRecord::wal_init(format!("[wil_init {key}@{lsn}]")))
|
||||
} else {
|
||||
Value::WalRecord(NeonWalRecord::wal_append(format!("[delta {key}@{lsn}]")))
|
||||
};
|
||||
|
||||
storage.insert((key, lsn), record);
|
||||
|
||||
key = key.next();
|
||||
}
|
||||
lsn = Lsn(lsn.0 + 1);
|
||||
}
|
||||
|
||||
// Stash some interesting LSN for future use
|
||||
for offset in [0, 5, 100].iter() {
|
||||
if *offset == 0 {
|
||||
interesting_lsns.push(lsn_range.start);
|
||||
} else {
|
||||
let below = lsn_range.start.checked_sub(*offset);
|
||||
match below {
|
||||
Some(v) if v >= spec.start_lsn => {
|
||||
interesting_lsns.push(v);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
let above = Lsn(lsn_range.start.0 + offset);
|
||||
interesting_lsns.push(above);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (key_range, lsn_range) in spec.delta_layers_shape.iter() {
|
||||
let mut lsn = lsn_range.start;
|
||||
while lsn < lsn_range.end {
|
||||
let mut key = key_range.start;
|
||||
while key < key_range.end {
|
||||
let gap = random.gen_range(1..=100) <= spec.gap_chance;
|
||||
let will_init = random.gen_range(1..=100) <= spec.will_init_chance;
|
||||
|
||||
if gap {
|
||||
continue;
|
||||
}
|
||||
|
||||
let record = if will_init {
|
||||
Value::WalRecord(NeonWalRecord::wal_init(format!("[wil_init {key}@{lsn}]")))
|
||||
} else {
|
||||
Value::WalRecord(NeonWalRecord::wal_append(format!("[delta {key}@{lsn}]")))
|
||||
};
|
||||
|
||||
storage.insert((key, lsn), record);
|
||||
|
||||
key = key.next();
|
||||
}
|
||||
lsn = Lsn(lsn.0 + 1);
|
||||
}
|
||||
|
||||
// Stash some interesting LSN for future use
|
||||
for offset in [0, 5, 100].iter() {
|
||||
if *offset == 0 {
|
||||
interesting_lsns.push(lsn_range.start);
|
||||
} else {
|
||||
let below = lsn_range.start.checked_sub(*offset);
|
||||
match below {
|
||||
Some(v) if v >= spec.start_lsn => {
|
||||
interesting_lsns.push(v);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
let above = Lsn(lsn_range.start.0 + offset);
|
||||
interesting_lsns.push(above);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (key_range, lsn) in spec.image_layers_shape.iter() {
|
||||
let mut key = key_range.start;
|
||||
while key < key_range.end {
|
||||
let blob = Bytes::from(format!("[image {key}@{lsn}]"));
|
||||
let record = Value::Image(blob.clone());
|
||||
storage.insert((key, *lsn), record);
|
||||
|
||||
key = key.next();
|
||||
}
|
||||
|
||||
// Stash some interesting LSN for future use
|
||||
for offset in [0, 5, 100].iter() {
|
||||
if *offset == 0 {
|
||||
interesting_lsns.push(*lsn);
|
||||
} else {
|
||||
let below = lsn.checked_sub(*offset);
|
||||
match below {
|
||||
Some(v) if v >= spec.start_lsn => {
|
||||
interesting_lsns.push(v);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
let above = Lsn(lsn.0 + offset);
|
||||
interesting_lsns.push(above);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let in_memory_test_layers = {
|
||||
let mut acc = Vec::new();
|
||||
|
||||
for (key_range, lsn_range) in spec.in_memory_layers_shape.iter() {
|
||||
let mut data = Vec::new();
|
||||
|
||||
let mut lsn = lsn_range.start;
|
||||
while lsn < lsn_range.end {
|
||||
let mut key = key_range.start;
|
||||
while key < key_range.end {
|
||||
if let Some(record) = storage.get(&(key, lsn)) {
|
||||
data.push((key, lsn, record.clone()));
|
||||
}
|
||||
|
||||
key = key.next();
|
||||
}
|
||||
lsn = Lsn(lsn.0 + 1);
|
||||
}
|
||||
|
||||
acc.push(InMemoryLayerTestDesc {
|
||||
data,
|
||||
lsn_range: lsn_range.clone(),
|
||||
is_open: false,
|
||||
})
|
||||
}
|
||||
|
||||
acc
|
||||
};
|
||||
|
||||
let delta_test_layers = {
|
||||
let mut acc = Vec::new();
|
||||
|
||||
for (key_range, lsn_range) in spec.delta_layers_shape.iter() {
|
||||
let mut data = Vec::new();
|
||||
|
||||
let mut lsn = lsn_range.start;
|
||||
while lsn < lsn_range.end {
|
||||
let mut key = key_range.start;
|
||||
while key < key_range.end {
|
||||
if let Some(record) = storage.get(&(key, lsn)) {
|
||||
data.push((key, lsn, record.clone()));
|
||||
}
|
||||
|
||||
key = key.next();
|
||||
}
|
||||
lsn = Lsn(lsn.0 + 1);
|
||||
}
|
||||
|
||||
acc.push(DeltaLayerTestDesc {
|
||||
data,
|
||||
lsn_range: lsn_range.clone(),
|
||||
key_range: key_range.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
acc
|
||||
};
|
||||
|
||||
let image_test_layers = {
|
||||
let mut acc = Vec::new();
|
||||
|
||||
for (key_range, lsn) in spec.image_layers_shape.iter() {
|
||||
let mut data = Vec::new();
|
||||
|
||||
let mut key = key_range.start;
|
||||
while key < key_range.end {
|
||||
if let Some(record) = storage.get(&(key, *lsn)) {
|
||||
let blob = match record {
|
||||
Value::Image(blob) => blob.clone(),
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
data.push((key, blob));
|
||||
}
|
||||
|
||||
key = key.next();
|
||||
}
|
||||
|
||||
acc.push((*lsn, data));
|
||||
}
|
||||
|
||||
acc
|
||||
};
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
new_timeline_id,
|
||||
spec.start_lsn,
|
||||
pg_version,
|
||||
ctx,
|
||||
in_memory_test_layers,
|
||||
delta_test_layers,
|
||||
image_test_layers,
|
||||
spec.last_record_lsn,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok((
|
||||
tline,
|
||||
Storage {
|
||||
storage,
|
||||
start_lsn: spec.start_lsn,
|
||||
},
|
||||
interesting_lsns,
|
||||
))
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_basic() -> anyhow::Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_basic").await?.load().await;
|
||||
@@ -6786,10 +7106,11 @@ mod tests {
|
||||
for read in reads {
|
||||
info!("Doing vectored read on {:?}", read);
|
||||
|
||||
let query = VersionedKeySpaceQuery::uniform(read.clone(), reads_lsn);
|
||||
|
||||
let vectored_res = tline
|
||||
.get_vectored_impl(
|
||||
read.clone(),
|
||||
reads_lsn,
|
||||
query,
|
||||
&mut ValuesReconstructState::new(io_concurrency.clone()),
|
||||
&ctx,
|
||||
)
|
||||
@@ -6868,10 +7189,11 @@ mod tests {
|
||||
};
|
||||
let read_lsn = child_timeline.get_last_record_lsn();
|
||||
|
||||
let query = VersionedKeySpaceQuery::uniform(aux_keyspace.clone(), read_lsn);
|
||||
|
||||
let vectored_res = child_timeline
|
||||
.get_vectored_impl(
|
||||
aux_keyspace.clone(),
|
||||
read_lsn,
|
||||
query,
|
||||
&mut ValuesReconstructState::new(io_concurrency.clone()),
|
||||
&ctx,
|
||||
)
|
||||
@@ -7017,10 +7339,12 @@ mod tests {
|
||||
let read = KeySpace {
|
||||
ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key],
|
||||
};
|
||||
|
||||
let query = VersionedKeySpaceQuery::uniform(read.clone(), current_lsn);
|
||||
|
||||
let results = child_timeline
|
||||
.get_vectored_impl(
|
||||
read.clone(),
|
||||
current_lsn,
|
||||
query,
|
||||
&mut ValuesReconstructState::new(io_concurrency.clone()),
|
||||
&ctx,
|
||||
)
|
||||
@@ -7151,12 +7475,16 @@ mod tests {
|
||||
}
|
||||
|
||||
for query_lsn in query_lsns {
|
||||
let query = VersionedKeySpaceQuery::uniform(
|
||||
KeySpace {
|
||||
ranges: vec![child_gap_at_key..child_gap_at_key.next()],
|
||||
},
|
||||
query_lsn,
|
||||
);
|
||||
|
||||
let results = child_timeline
|
||||
.get_vectored_impl(
|
||||
KeySpace {
|
||||
ranges: vec![child_gap_at_key..child_gap_at_key.next()],
|
||||
},
|
||||
query_lsn,
|
||||
query,
|
||||
&mut ValuesReconstructState::new(io_concurrency.clone()),
|
||||
&ctx,
|
||||
)
|
||||
@@ -7655,10 +7983,11 @@ mod tests {
|
||||
}
|
||||
|
||||
let mut cnt = 0;
|
||||
let query = VersionedKeySpaceQuery::uniform(keyspace.clone(), lsn);
|
||||
|
||||
for (key, value) in tline
|
||||
.get_vectored_impl(
|
||||
keyspace.clone(),
|
||||
lsn,
|
||||
query,
|
||||
&mut ValuesReconstructState::new(io_concurrency.clone()),
|
||||
&ctx,
|
||||
)
|
||||
@@ -7865,8 +8194,9 @@ mod tests {
|
||||
io_concurrency: IoConcurrency,
|
||||
) -> anyhow::Result<(BTreeMap<Key, Result<Bytes, PageReconstructError>>, usize)> {
|
||||
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
|
||||
let query = VersionedKeySpaceQuery::uniform(keyspace.clone(), lsn);
|
||||
let res = tline
|
||||
.get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
|
||||
.get_vectored_impl(query, &mut reconstruct_state, ctx)
|
||||
.await?;
|
||||
Ok((res, reconstruct_state.get_delta_layers_visited() as usize))
|
||||
}
|
||||
@@ -8163,13 +8493,10 @@ mod tests {
|
||||
|
||||
// test vectored scan on parent timeline
|
||||
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone());
|
||||
let query =
|
||||
VersionedKeySpaceQuery::uniform(KeySpace::single(Key::metadata_key_range()), lsn);
|
||||
let res = tline
|
||||
.get_vectored_impl(
|
||||
KeySpace::single(Key::metadata_key_range()),
|
||||
lsn,
|
||||
&mut reconstruct_state,
|
||||
&ctx,
|
||||
)
|
||||
.get_vectored_impl(query, &mut reconstruct_state, &ctx)
|
||||
.await?;
|
||||
|
||||
assert_eq!(
|
||||
@@ -8189,13 +8516,10 @@ mod tests {
|
||||
|
||||
// test vectored scan on child timeline
|
||||
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone());
|
||||
let query =
|
||||
VersionedKeySpaceQuery::uniform(KeySpace::single(Key::metadata_key_range()), lsn);
|
||||
let res = child
|
||||
.get_vectored_impl(
|
||||
KeySpace::single(Key::metadata_key_range()),
|
||||
lsn,
|
||||
&mut reconstruct_state,
|
||||
&ctx,
|
||||
)
|
||||
.get_vectored_impl(query, &mut reconstruct_state, &ctx)
|
||||
.await?;
|
||||
|
||||
assert_eq!(
|
||||
@@ -8229,13 +8553,9 @@ mod tests {
|
||||
let io_concurrency =
|
||||
IoConcurrency::spawn_from_conf(tline.conf, tline.gate.enter().unwrap());
|
||||
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
|
||||
let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn);
|
||||
let mut res = tline
|
||||
.get_vectored_impl(
|
||||
KeySpace::single(key..key.next()),
|
||||
lsn,
|
||||
&mut reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.get_vectored_impl(query, &mut reconstruct_state, ctx)
|
||||
.await?;
|
||||
Ok(res.pop_last().map(|(k, v)| {
|
||||
assert_eq!(k, key);
|
||||
@@ -9257,6 +9577,7 @@ mod tests {
|
||||
&[Lsn(0x20), Lsn(0x40), Lsn(0x50)],
|
||||
3,
|
||||
None,
|
||||
true,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -9381,7 +9702,15 @@ mod tests {
|
||||
),
|
||||
];
|
||||
let res = tline
|
||||
.generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None)
|
||||
.generate_key_retention(
|
||||
key,
|
||||
&history,
|
||||
Lsn(0x60),
|
||||
&[Lsn(0x40), Lsn(0x50)],
|
||||
3,
|
||||
None,
|
||||
true,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let expected_res = KeyHistoryRetention {
|
||||
@@ -9460,6 +9789,7 @@ mod tests {
|
||||
&[],
|
||||
3,
|
||||
Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
|
||||
true,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -9508,6 +9838,7 @@ mod tests {
|
||||
&[Lsn(0x30)],
|
||||
3,
|
||||
Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
|
||||
true,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -10358,14 +10689,13 @@ mod tests {
|
||||
)
|
||||
.await?;
|
||||
|
||||
let keyspace = KeySpace::single(get_key(0)..get_key(10));
|
||||
let query = VersionedKeySpaceQuery::uniform(
|
||||
KeySpace::single(get_key(0)..get_key(10)),
|
||||
delta_layer_end_lsn,
|
||||
);
|
||||
|
||||
let results = tline
|
||||
.get_vectored(
|
||||
keyspace,
|
||||
delta_layer_end_lsn,
|
||||
IoConcurrency::sequential(),
|
||||
&ctx,
|
||||
)
|
||||
.get_vectored(query, IoConcurrency::sequential(), &ctx)
|
||||
.await
|
||||
.expect("No vectored errors");
|
||||
for (key, res) in results {
|
||||
@@ -10513,9 +10843,13 @@ mod tests {
|
||||
)
|
||||
.await?;
|
||||
|
||||
let keyspace = KeySpace::single(get_key(0)..get_key(10));
|
||||
let query = VersionedKeySpaceQuery::uniform(
|
||||
KeySpace::single(get_key(0)..get_key(10)),
|
||||
last_record_lsn,
|
||||
);
|
||||
|
||||
let results = tline
|
||||
.get_vectored(keyspace, last_record_lsn, IoConcurrency::sequential(), &ctx)
|
||||
.get_vectored(query, IoConcurrency::sequential(), &ctx)
|
||||
.await
|
||||
.expect("No vectored errors");
|
||||
for (key, res) in results {
|
||||
@@ -10529,6 +10863,214 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// A randomized read path test. Generates a layer map according to a deterministic
|
||||
// specification. Fills the (key, LSN) space in random manner and then performs
|
||||
// random scattered queries validating the results against in-memory storage.
|
||||
//
|
||||
// See this internal Notion page for a diagram of the layer map:
|
||||
// https://www.notion.so/neondatabase/Read-Path-Unit-Testing-Fuzzing-1d1f189e0047806c8e5cd37781b0a350?pvs=4
|
||||
//
|
||||
// A fuzzing mode is also supported. In this mode, the test will use a random
|
||||
// seed instead of a hardcoded one. Use it in conjunction with `cargo stress`
|
||||
// to run multiple instances in parallel:
|
||||
//
|
||||
// $ RUST_BACKTRACE=1 RUST_LOG=INFO \
|
||||
// cargo stress --package=pageserver --features=testing,fuzz-read-path --release -- test_read_path
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_read_path() -> anyhow::Result<()> {
|
||||
use rand::seq::SliceRandom;
|
||||
|
||||
let seed = if cfg!(feature = "fuzz-read-path") {
|
||||
let seed: u64 = thread_rng().r#gen();
|
||||
seed
|
||||
} else {
|
||||
// Use a hard-coded seed when not in fuzzing mode.
|
||||
// Note that with the current approach results are not reproducible
|
||||
// accross platforms and Rust releases.
|
||||
const SEED: u64 = 0;
|
||||
SEED
|
||||
};
|
||||
|
||||
let mut random = StdRng::seed_from_u64(seed);
|
||||
|
||||
let (queries, will_init_chance, gap_chance) = if cfg!(feature = "fuzz-read-path") {
|
||||
const QUERIES: u64 = 5000;
|
||||
let will_init_chance: u8 = random.gen_range(0..=10);
|
||||
let gap_chance: u8 = random.gen_range(0..=50);
|
||||
|
||||
(QUERIES, will_init_chance, gap_chance)
|
||||
} else {
|
||||
const QUERIES: u64 = 1000;
|
||||
const WILL_INIT_CHANCE: u8 = 1;
|
||||
const GAP_CHANCE: u8 = 5;
|
||||
|
||||
(QUERIES, WILL_INIT_CHANCE, GAP_CHANCE)
|
||||
};
|
||||
|
||||
let harness = TenantHarness::create("test_read_path").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
tracing::info!("Using random seed: {seed}");
|
||||
tracing::info!(%will_init_chance, %gap_chance, "Fill params");
|
||||
|
||||
// Define the layer map shape. Note that this part is not randomized.
|
||||
|
||||
const KEY_DIMENSION_SIZE: u32 = 99;
|
||||
let start_key = Key::from_hex("110000000033333333444444445500000000").unwrap();
|
||||
let end_key = start_key.add(KEY_DIMENSION_SIZE);
|
||||
let total_key_range = start_key..end_key;
|
||||
let total_key_range_size = end_key.to_i128() - start_key.to_i128();
|
||||
let total_start_lsn = Lsn(104);
|
||||
let last_record_lsn = Lsn(504);
|
||||
|
||||
assert!(total_key_range_size % 3 == 0);
|
||||
|
||||
let in_memory_layers_shape = vec![
|
||||
(total_key_range.clone(), Lsn(304)..Lsn(400)),
|
||||
(total_key_range.clone(), Lsn(400)..last_record_lsn),
|
||||
];
|
||||
|
||||
let delta_layers_shape = vec![
|
||||
(
|
||||
start_key..(start_key.add((total_key_range_size / 3) as u32)),
|
||||
Lsn(200)..Lsn(304),
|
||||
),
|
||||
(
|
||||
(start_key.add((total_key_range_size / 3) as u32))
|
||||
..(start_key.add((total_key_range_size * 2 / 3) as u32)),
|
||||
Lsn(200)..Lsn(304),
|
||||
),
|
||||
(
|
||||
(start_key.add((total_key_range_size * 2 / 3) as u32))
|
||||
..(start_key.add(total_key_range_size as u32)),
|
||||
Lsn(200)..Lsn(304),
|
||||
),
|
||||
];
|
||||
|
||||
let image_layers_shape = vec![
|
||||
(
|
||||
start_key.add((total_key_range_size * 2 / 3 - 10) as u32)
|
||||
..start_key.add((total_key_range_size * 2 / 3 + 10) as u32),
|
||||
Lsn(456),
|
||||
),
|
||||
(
|
||||
start_key.add((total_key_range_size / 3 - 10) as u32)
|
||||
..start_key.add((total_key_range_size / 3 + 10) as u32),
|
||||
Lsn(256),
|
||||
),
|
||||
(total_key_range.clone(), total_start_lsn),
|
||||
];
|
||||
|
||||
let specification = TestTimelineSpecification {
|
||||
start_lsn: total_start_lsn,
|
||||
last_record_lsn,
|
||||
in_memory_layers_shape,
|
||||
delta_layers_shape,
|
||||
image_layers_shape,
|
||||
gap_chance,
|
||||
will_init_chance,
|
||||
};
|
||||
|
||||
// Create and randomly fill in the layers according to the specification
|
||||
let (tline, storage, interesting_lsns) = randomize_timeline(
|
||||
&tenant,
|
||||
TIMELINE_ID,
|
||||
DEFAULT_PG_VERSION,
|
||||
specification,
|
||||
&mut random,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Now generate queries based on the interesting lsns that we've collected.
|
||||
//
|
||||
// While there's still room in the query, pick and interesting LSN and a random
|
||||
// key. Then roll the dice to see if the next key should also be included in
|
||||
// the query. When the roll fails, break the "batch" and pick another point in the
|
||||
// (key, LSN) space.
|
||||
|
||||
const PICK_NEXT_CHANCE: u8 = 50;
|
||||
for _ in 0..queries {
|
||||
let query = {
|
||||
let mut keyspaces_at_lsn: HashMap<Lsn, KeySpaceRandomAccum> = HashMap::default();
|
||||
let mut used_keys: HashSet<Key> = HashSet::default();
|
||||
|
||||
while used_keys.len() < Timeline::MAX_GET_VECTORED_KEYS as usize {
|
||||
let selected_lsn = interesting_lsns.choose(&mut random).expect("not empty");
|
||||
let mut selected_key = start_key.add(random.gen_range(0..KEY_DIMENSION_SIZE));
|
||||
|
||||
while used_keys.len() < Timeline::MAX_GET_VECTORED_KEYS as usize {
|
||||
if used_keys.contains(&selected_key)
|
||||
|| selected_key >= start_key.add(KEY_DIMENSION_SIZE)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
keyspaces_at_lsn
|
||||
.entry(*selected_lsn)
|
||||
.or_default()
|
||||
.add_key(selected_key);
|
||||
used_keys.insert(selected_key);
|
||||
|
||||
let pick_next = random.gen_range(0..=100) <= PICK_NEXT_CHANCE;
|
||||
if pick_next {
|
||||
selected_key = selected_key.next();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
VersionedKeySpaceQuery::scattered(
|
||||
keyspaces_at_lsn
|
||||
.into_iter()
|
||||
.map(|(lsn, acc)| (lsn, acc.to_keyspace()))
|
||||
.collect(),
|
||||
)
|
||||
};
|
||||
|
||||
// Run the query and validate the results
|
||||
|
||||
let results = tline
|
||||
.get_vectored(query.clone(), IoConcurrency::Sequential, &ctx)
|
||||
.await;
|
||||
|
||||
let blobs = match results {
|
||||
Ok(ok) => ok,
|
||||
Err(err) => {
|
||||
panic!("seed={seed} Error returned for query {query}: {err}");
|
||||
}
|
||||
};
|
||||
|
||||
for (key, key_res) in blobs.into_iter() {
|
||||
match key_res {
|
||||
Ok(blob) => {
|
||||
let requested_at_lsn = query.map_key_to_lsn(&key);
|
||||
let expected = storage.get(key, requested_at_lsn);
|
||||
|
||||
if blob != expected {
|
||||
tracing::error!(
|
||||
"seed={seed} Mismatch for {key}@{requested_at_lsn} from query: {query}"
|
||||
);
|
||||
}
|
||||
|
||||
assert_eq!(blob, expected);
|
||||
}
|
||||
Err(err) => {
|
||||
let requested_at_lsn = query.map_key_to_lsn(&key);
|
||||
|
||||
panic!(
|
||||
"seed={seed} Error returned for {key}@{requested_at_lsn} from query {query}: {err}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering {
|
||||
(
|
||||
k1.is_delta,
|
||||
@@ -11571,6 +12113,99 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_bottom_most_compation_redo_failure() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_bottom_most_compation_redo_failure").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
// using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
|
||||
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
|
||||
let img_layer = (0..10)
|
||||
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
|
||||
.collect_vec();
|
||||
|
||||
let delta1 = vec![
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
|
||||
),
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x24),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x24")),
|
||||
),
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x28),
|
||||
// This record will fail to redo
|
||||
Value::WalRecord(NeonWalRecord::wal_append_conditional("@0x28", "???")),
|
||||
),
|
||||
];
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![], // in-memory layers
|
||||
vec![DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
Lsn(0x20)..Lsn(0x30),
|
||||
delta1,
|
||||
)], // delta layers
|
||||
vec![(Lsn(0x10), img_layer)], // image layers
|
||||
Lsn(0x50),
|
||||
)
|
||||
.await?;
|
||||
{
|
||||
tline
|
||||
.applied_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x30))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
retain_lsns: vec![],
|
||||
cutoffs: GcCutoffs {
|
||||
time: Lsn(0x30),
|
||||
space: Lsn(0x30),
|
||||
},
|
||||
leases: Default::default(),
|
||||
within_ancestor_pitr: false,
|
||||
};
|
||||
}
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
// Compaction will fail, but should not fire any critical error.
|
||||
// Gc-compaction currently cannot figure out what keys are not in the keyspace during the compaction
|
||||
// process. It will always try to redo the logs it reads and if it doesn't work, fail the entire
|
||||
// compaction job. Tracked in <https://github.com/neondatabase/neon/issues/10395>.
|
||||
let res = tline
|
||||
.compact_with_gc(
|
||||
&cancel,
|
||||
CompactOptions {
|
||||
compact_key_range: None,
|
||||
compact_lsn_range: None,
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await;
|
||||
assert!(res.is_err());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_synthetic_size_calculation_with_invisible_branches() -> anyhow::Result<()> {
|
||||
|
||||
@@ -22,6 +22,7 @@ use bytes::{BufMut, BytesMut};
|
||||
use pageserver_api::models::ImageCompressionAlgorithm;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::warn;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
@@ -169,7 +170,13 @@ pub struct BlobWriter<const BUFFERED: bool> {
|
||||
}
|
||||
|
||||
impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
pub fn new(inner: VirtualFile, start_offset: u64) -> Self {
|
||||
pub fn new(
|
||||
inner: VirtualFile,
|
||||
start_offset: u64,
|
||||
_gate: &utils::sync::gate::Gate,
|
||||
_cancel: CancellationToken,
|
||||
_ctx: &RequestContext,
|
||||
) -> Self {
|
||||
Self {
|
||||
inner,
|
||||
offset: start_offset,
|
||||
@@ -432,12 +439,14 @@ pub(crate) mod tests {
|
||||
) -> Result<(Utf8TempDir, Utf8PathBuf, Vec<u64>), Error> {
|
||||
let temp_dir = camino_tempfile::tempdir()?;
|
||||
let pathbuf = temp_dir.path().join("file");
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
// Write part (in block to drop the file)
|
||||
let mut offsets = Vec::new();
|
||||
{
|
||||
let file = VirtualFile::create(pathbuf.as_path(), ctx).await?;
|
||||
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
|
||||
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0, &gate, cancel.clone(), ctx);
|
||||
for blob in blobs.iter() {
|
||||
let (_, res) = if compression {
|
||||
let res = wtr
|
||||
|
||||
@@ -714,7 +714,7 @@ impl LayerMap {
|
||||
true
|
||||
}
|
||||
|
||||
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<PersistentLayerDesc>> {
|
||||
pub fn iter_historic_layers(&self) -> impl ExactSizeIterator<Item = Arc<PersistentLayerDesc>> {
|
||||
self.historic.iter()
|
||||
}
|
||||
|
||||
|
||||
@@ -504,7 +504,7 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
|
||||
}
|
||||
|
||||
/// Iterate all the layers
|
||||
pub fn iter(&self) -> impl '_ + Iterator<Item = Value> {
|
||||
pub fn iter(&self) -> impl ExactSizeIterator<Item = Value> {
|
||||
// NOTE we can actually perform this without rebuilding,
|
||||
// but it's not necessary for now.
|
||||
if !self.buffer.is_empty() {
|
||||
|
||||
@@ -715,13 +715,34 @@ pub(crate) enum LayerId {
|
||||
}
|
||||
|
||||
/// Uniquely identify a layer visit by the layer
|
||||
/// and LSN floor (or start LSN) of the reads.
|
||||
/// The layer itself is not enough since we may
|
||||
/// have different LSN lower bounds for delta layer reads.
|
||||
/// and LSN range of the reads. Note that the end of the range is exclusive.
|
||||
///
|
||||
/// The layer itself is not enough since we may have different LSN lower
|
||||
/// bounds for delta layer reads. Scenarios where this can happen are:
|
||||
///
|
||||
/// 1. Layer overlaps: imagine an image layer inside and in-memory layer
|
||||
/// and a query that only partially hits the image layer. Part of the query
|
||||
/// needs to read the whole in-memory layer and the other part needs to read
|
||||
/// only up to the image layer. Hence, they'll have different LSN floor values
|
||||
/// for the read.
|
||||
///
|
||||
/// 2. Scattered reads: the read path supports starting at different LSNs. Imagine
|
||||
/// The start LSN for one range is inside a layer and the start LSN for another range
|
||||
/// Is above the layer (includes all of it). Both ranges need to read the layer all the
|
||||
/// Way to the end but starting at different points. Hence, they'll have different LSN
|
||||
/// Ceil values.
|
||||
///
|
||||
/// The implication is that we might visit the same layer multiple times
|
||||
/// in order to read different LSN ranges from it. In practice, this isn't very concerning
|
||||
/// because:
|
||||
/// 1. Layer overlaps are rare and generally not intended
|
||||
/// 2. Scattered reads will stabilise after the first few layers provided their starting LSNs
|
||||
/// are grouped tightly enough (likely the case).
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
|
||||
struct LayerToVisitId {
|
||||
layer_id: LayerId,
|
||||
lsn_floor: Lsn,
|
||||
lsn_ceil: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Hash)]
|
||||
@@ -805,6 +826,7 @@ impl LayerFringe {
|
||||
let layer_to_visit_id = LayerToVisitId {
|
||||
layer_id: layer.id(),
|
||||
lsn_floor: lsn_range.start,
|
||||
lsn_ceil: lsn_range.end,
|
||||
};
|
||||
|
||||
let entry = self.visit_reads.entry(layer_to_visit_id.clone());
|
||||
|
||||
@@ -5,6 +5,7 @@ use std::sync::Arc;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::key::{KEY_SIZE, Key};
|
||||
use pageserver_api::value::Value;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::shard::TenantShardId;
|
||||
@@ -179,7 +180,7 @@ impl BatchLayerWriter {
|
||||
|
||||
/// An image writer that takes images and produces multiple image layers.
|
||||
#[must_use]
|
||||
pub struct SplitImageLayerWriter {
|
||||
pub struct SplitImageLayerWriter<'a> {
|
||||
inner: ImageLayerWriter,
|
||||
target_layer_size: u64,
|
||||
lsn: Lsn,
|
||||
@@ -188,9 +189,12 @@ pub struct SplitImageLayerWriter {
|
||||
tenant_shard_id: TenantShardId,
|
||||
batches: BatchLayerWriter,
|
||||
start_key: Key,
|
||||
gate: &'a utils::sync::gate::Gate,
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
impl SplitImageLayerWriter {
|
||||
impl<'a> SplitImageLayerWriter<'a> {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
@@ -198,6 +202,8 @@ impl SplitImageLayerWriter {
|
||||
start_key: Key,
|
||||
lsn: Lsn,
|
||||
target_layer_size: u64,
|
||||
gate: &'a utils::sync::gate::Gate,
|
||||
cancel: CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
@@ -208,6 +214,8 @@ impl SplitImageLayerWriter {
|
||||
tenant_shard_id,
|
||||
&(start_key..Key::MAX),
|
||||
lsn,
|
||||
gate,
|
||||
cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
@@ -217,6 +225,8 @@ impl SplitImageLayerWriter {
|
||||
batches: BatchLayerWriter::new(conf).await?,
|
||||
lsn,
|
||||
start_key,
|
||||
gate,
|
||||
cancel,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -239,6 +249,8 @@ impl SplitImageLayerWriter {
|
||||
self.tenant_shard_id,
|
||||
&(key..Key::MAX),
|
||||
self.lsn,
|
||||
self.gate,
|
||||
self.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -291,7 +303,7 @@ impl SplitImageLayerWriter {
|
||||
/// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm
|
||||
/// will split them into multiple files based on size.
|
||||
#[must_use]
|
||||
pub struct SplitDeltaLayerWriter {
|
||||
pub struct SplitDeltaLayerWriter<'a> {
|
||||
inner: Option<(Key, DeltaLayerWriter)>,
|
||||
target_layer_size: u64,
|
||||
conf: &'static PageServerConf,
|
||||
@@ -300,15 +312,19 @@ pub struct SplitDeltaLayerWriter {
|
||||
lsn_range: Range<Lsn>,
|
||||
last_key_written: Key,
|
||||
batches: BatchLayerWriter,
|
||||
gate: &'a utils::sync::gate::Gate,
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
impl SplitDeltaLayerWriter {
|
||||
impl<'a> SplitDeltaLayerWriter<'a> {
|
||||
pub async fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
lsn_range: Range<Lsn>,
|
||||
target_layer_size: u64,
|
||||
gate: &'a utils::sync::gate::Gate,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
target_layer_size,
|
||||
@@ -319,6 +335,8 @@ impl SplitDeltaLayerWriter {
|
||||
lsn_range,
|
||||
last_key_written: Key::MIN,
|
||||
batches: BatchLayerWriter::new(conf).await?,
|
||||
gate,
|
||||
cancel,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -344,6 +362,8 @@ impl SplitDeltaLayerWriter {
|
||||
self.tenant_shard_id,
|
||||
key,
|
||||
self.lsn_range.clone(),
|
||||
self.gate,
|
||||
self.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
@@ -362,6 +382,8 @@ impl SplitDeltaLayerWriter {
|
||||
self.tenant_shard_id,
|
||||
key,
|
||||
self.lsn_range.clone(),
|
||||
self.gate,
|
||||
self.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -469,6 +491,8 @@ mod tests {
|
||||
get_key(0),
|
||||
Lsn(0x18),
|
||||
4 * 1024 * 1024,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
@@ -480,6 +504,8 @@ mod tests {
|
||||
tenant.tenant_shard_id,
|
||||
Lsn(0x18)..Lsn(0x20),
|
||||
4 * 1024 * 1024,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -546,6 +572,8 @@ mod tests {
|
||||
get_key(0),
|
||||
Lsn(0x18),
|
||||
4 * 1024 * 1024,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
@@ -556,6 +584,8 @@ mod tests {
|
||||
tenant.tenant_shard_id,
|
||||
Lsn(0x18)..Lsn(0x20),
|
||||
4 * 1024 * 1024,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -643,6 +673,8 @@ mod tests {
|
||||
get_key(0),
|
||||
Lsn(0x18),
|
||||
4 * 1024,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
@@ -654,6 +686,8 @@ mod tests {
|
||||
tenant.tenant_shard_id,
|
||||
Lsn(0x18)..Lsn(0x20),
|
||||
4 * 1024,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -730,6 +764,8 @@ mod tests {
|
||||
tenant.tenant_shard_id,
|
||||
Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
|
||||
4 * 1024 * 1024,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
@@ -50,6 +50,7 @@ use rand::distributions::Alphanumeric;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::sync::OnceCell;
|
||||
use tokio_epoll_uring::IoBuf;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::bin_ser::BeSer;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -400,12 +401,15 @@ impl DeltaLayerWriterInner {
|
||||
///
|
||||
/// Start building a new delta layer.
|
||||
///
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
key_start: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
gate: &utils::sync::gate::Gate,
|
||||
cancel: CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
// Create the file initially with a temporary filename. We don't know
|
||||
@@ -420,7 +424,7 @@ impl DeltaLayerWriterInner {
|
||||
let mut file = VirtualFile::create(&path, ctx).await?;
|
||||
// make room for the header block
|
||||
file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
|
||||
let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
|
||||
let blob_writer = BlobWriter::new(file, PAGE_SZ as u64, gate, cancel, ctx);
|
||||
|
||||
// Initialize the b-tree index builder
|
||||
let block_buf = BlockBuf::new();
|
||||
@@ -628,12 +632,15 @@ impl DeltaLayerWriter {
|
||||
///
|
||||
/// Start building a new delta layer.
|
||||
///
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
key_start: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
gate: &utils::sync::gate::Gate,
|
||||
cancel: CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
@@ -644,6 +651,8 @@ impl DeltaLayerWriter {
|
||||
tenant_shard_id,
|
||||
key_start,
|
||||
lsn_range,
|
||||
gate,
|
||||
cancel,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
@@ -1885,6 +1894,8 @@ pub(crate) mod test {
|
||||
harness.tenant_shard_id,
|
||||
entries_meta.key_range.start,
|
||||
entries_meta.lsn_range.clone(),
|
||||
&timeline.gate,
|
||||
timeline.cancel.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -2079,6 +2090,8 @@ pub(crate) mod test {
|
||||
tenant.tenant_shard_id,
|
||||
Key::MIN,
|
||||
Lsn(0x11)..truncate_at,
|
||||
&branch.gate,
|
||||
branch.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -2213,6 +2226,8 @@ pub(crate) mod test {
|
||||
tenant.tenant_shard_id,
|
||||
*key_start,
|
||||
(*lsn_min)..lsn_end,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -48,6 +48,7 @@ use rand::distributions::Alphanumeric;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::sync::OnceCell;
|
||||
use tokio_stream::StreamExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::bin_ser::BeSer;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -748,12 +749,15 @@ impl ImageLayerWriterInner {
|
||||
///
|
||||
/// Start building a new image layer.
|
||||
///
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
gate: &utils::sync::gate::Gate,
|
||||
cancel: CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
// Create the file initially with a temporary filename.
|
||||
@@ -780,7 +784,7 @@ impl ImageLayerWriterInner {
|
||||
};
|
||||
// make room for the header block
|
||||
file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
|
||||
let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
|
||||
let blob_writer = BlobWriter::new(file, PAGE_SZ as u64, gate, cancel, ctx);
|
||||
|
||||
// Initialize the b-tree index builder
|
||||
let block_buf = BlockBuf::new();
|
||||
@@ -988,18 +992,30 @@ impl ImageLayerWriter {
|
||||
///
|
||||
/// Start building a new image layer.
|
||||
///
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
gate: &utils::sync::gate::Gate,
|
||||
cancel: CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ImageLayerWriter> {
|
||||
Ok(Self {
|
||||
inner: Some(
|
||||
ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn, ctx)
|
||||
.await?,
|
||||
ImageLayerWriterInner::new(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
key_range,
|
||||
lsn,
|
||||
gate,
|
||||
cancel,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
),
|
||||
})
|
||||
}
|
||||
@@ -1192,7 +1208,7 @@ mod test {
|
||||
|
||||
// This key range contains several 0x8000 page stripes, only one of which belongs to shard zero
|
||||
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
|
||||
let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
|
||||
let input_end = Key::from_hex("000000067f00000001000000ae0000002000").unwrap();
|
||||
let range = input_start..input_end;
|
||||
|
||||
// Build an image layer to filter
|
||||
@@ -1203,6 +1219,8 @@ mod test {
|
||||
harness.tenant_shard_id,
|
||||
&range,
|
||||
lsn,
|
||||
&timeline.gate,
|
||||
timeline.cancel.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
@@ -1235,7 +1253,7 @@ mod test {
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(shard_number),
|
||||
shard_count,
|
||||
ShardStripeSize(0x8000),
|
||||
ShardStripeSize(0x800),
|
||||
)
|
||||
.unwrap();
|
||||
let harness = TenantHarness::create_custom(
|
||||
@@ -1268,6 +1286,8 @@ mod test {
|
||||
harness.tenant_shard_id,
|
||||
&range,
|
||||
lsn,
|
||||
&timeline.gate,
|
||||
timeline.cancel.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
@@ -1287,12 +1307,12 @@ mod test {
|
||||
|
||||
// This exact size and those below will need updating as/when the layer encoding changes, but
|
||||
// should be deterministic for a given version of the format, as we used no randomness generating the input.
|
||||
assert_eq!(original_size, 1597440);
|
||||
assert_eq!(original_size, 122880);
|
||||
|
||||
match shard_number {
|
||||
0 => {
|
||||
// We should have written out just one stripe for our shard identity
|
||||
assert_eq!(wrote_keys, 0x8000);
|
||||
assert_eq!(wrote_keys, 0x800);
|
||||
let replacement = replacement.unwrap();
|
||||
|
||||
// We should have dropped some of the data
|
||||
@@ -1300,7 +1320,7 @@ mod test {
|
||||
assert!(replacement.metadata().file_size > 0);
|
||||
|
||||
// Assert that we dropped ~3/4 of the data.
|
||||
assert_eq!(replacement.metadata().file_size, 417792);
|
||||
assert_eq!(replacement.metadata().file_size, 49152);
|
||||
}
|
||||
1 => {
|
||||
// Shard 1 has no keys in our input range
|
||||
@@ -1309,19 +1329,19 @@ mod test {
|
||||
}
|
||||
2 => {
|
||||
// Shard 2 has one stripes in the input range
|
||||
assert_eq!(wrote_keys, 0x8000);
|
||||
assert_eq!(wrote_keys, 0x800);
|
||||
let replacement = replacement.unwrap();
|
||||
assert!(replacement.metadata().file_size < original_size);
|
||||
assert!(replacement.metadata().file_size > 0);
|
||||
assert_eq!(replacement.metadata().file_size, 417792);
|
||||
assert_eq!(replacement.metadata().file_size, 49152);
|
||||
}
|
||||
3 => {
|
||||
// Shard 3 has two stripes in the input range
|
||||
assert_eq!(wrote_keys, 0x10000);
|
||||
assert_eq!(wrote_keys, 0x1000);
|
||||
let replacement = replacement.unwrap();
|
||||
assert!(replacement.metadata().file_size < original_size);
|
||||
assert!(replacement.metadata().file_size > 0);
|
||||
assert_eq!(replacement.metadata().file_size, 811008);
|
||||
assert_eq!(replacement.metadata().file_size, 73728);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
@@ -1346,6 +1366,8 @@ mod test {
|
||||
tenant.tenant_shard_id,
|
||||
&key_range,
|
||||
lsn,
|
||||
&tline.gate,
|
||||
tline.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -719,6 +719,8 @@ impl InMemoryLayer {
|
||||
ctx: &RequestContext,
|
||||
key_range: Option<Range<Key>>,
|
||||
l0_flush_global_state: &l0_flush::Inner,
|
||||
gate: &utils::sync::gate::Gate,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
|
||||
// Grab the lock in read-mode. We hold it over the I/O, but because this
|
||||
// layer is not writeable anymore, no one should be trying to acquire the
|
||||
@@ -759,6 +761,8 @@ impl InMemoryLayer {
|
||||
self.tenant_shard_id,
|
||||
Key::MIN,
|
||||
self.start_lsn..end_lsn,
|
||||
gate,
|
||||
cancel,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -585,7 +585,7 @@ pub(crate) enum PageReconstructError {
|
||||
WalRedo(anyhow::Error),
|
||||
|
||||
#[error("{0}")]
|
||||
MissingKey(MissingKeyError),
|
||||
MissingKey(Box<MissingKeyError>),
|
||||
}
|
||||
|
||||
impl From<anyhow::Error> for PageReconstructError {
|
||||
@@ -690,16 +690,23 @@ impl std::fmt::Display for ReadPath {
|
||||
|
||||
#[derive(thiserror::Error)]
|
||||
pub struct MissingKeyError {
|
||||
key: Key,
|
||||
keyspace: KeySpace,
|
||||
shard: ShardNumber,
|
||||
cont_lsn: Lsn,
|
||||
request_lsn: Lsn,
|
||||
query: Option<VersionedKeySpaceQuery>,
|
||||
// This is largest request LSN from the get page request batch
|
||||
original_hwm_lsn: Lsn,
|
||||
ancestor_lsn: Option<Lsn>,
|
||||
/// Debug information about the read path if there's an error
|
||||
read_path: Option<ReadPath>,
|
||||
backtrace: Option<std::backtrace::Backtrace>,
|
||||
}
|
||||
|
||||
impl MissingKeyError {
|
||||
fn enrich(&mut self, query: VersionedKeySpaceQuery) {
|
||||
self.query = Some(query);
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for MissingKeyError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self)
|
||||
@@ -710,14 +717,18 @@ impl std::fmt::Display for MissingKeyError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"could not find data for key {} (shard {:?}) at LSN {}, request LSN {}",
|
||||
self.key, self.shard, self.cont_lsn, self.request_lsn
|
||||
"could not find data for key {} (shard {:?}), original HWM LSN {}",
|
||||
self.keyspace, self.shard, self.original_hwm_lsn
|
||||
)?;
|
||||
|
||||
if let Some(ref ancestor_lsn) = self.ancestor_lsn {
|
||||
write!(f, ", ancestor {}", ancestor_lsn)?;
|
||||
}
|
||||
|
||||
if let Some(ref query) = self.query {
|
||||
write!(f, ", query {}", query)?;
|
||||
}
|
||||
|
||||
if let Some(ref read_path) = self.read_path {
|
||||
write!(f, "\n{}", read_path)?;
|
||||
}
|
||||
@@ -817,7 +828,7 @@ pub(crate) enum GetVectoredError {
|
||||
InvalidLsn(Lsn),
|
||||
|
||||
#[error("requested key not found: {0}")]
|
||||
MissingKey(MissingKeyError),
|
||||
MissingKey(Box<MissingKeyError>),
|
||||
|
||||
#[error("ancestry walk")]
|
||||
GetReadyAncestorError(#[source] GetReadyAncestorError),
|
||||
@@ -928,7 +939,7 @@ impl std::fmt::Debug for Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
#[derive(thiserror::Error, Debug, Clone)]
|
||||
pub(crate) enum WaitLsnError {
|
||||
// Called on a timeline which is shutting down
|
||||
#[error("Shutdown")]
|
||||
@@ -1128,14 +1139,12 @@ impl Timeline {
|
||||
// page_service.
|
||||
debug_assert!(!self.shard_identity.is_key_disposable(&key));
|
||||
|
||||
let keyspace = KeySpace {
|
||||
ranges: vec![key..key.next()],
|
||||
};
|
||||
|
||||
let mut reconstruct_state = ValuesReconstructState::new(IoConcurrency::sequential());
|
||||
|
||||
let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn);
|
||||
|
||||
let vectored_res = self
|
||||
.get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
|
||||
.get_vectored_impl(query, &mut reconstruct_state, ctx)
|
||||
.await;
|
||||
|
||||
let key_value = vectored_res?.pop_first();
|
||||
@@ -1153,15 +1162,17 @@ impl Timeline {
|
||||
value
|
||||
}
|
||||
}
|
||||
None => Err(PageReconstructError::MissingKey(MissingKeyError {
|
||||
key,
|
||||
shard: self.shard_identity.get_shard_number(&key),
|
||||
cont_lsn: Lsn(0),
|
||||
request_lsn: lsn,
|
||||
ancestor_lsn: None,
|
||||
backtrace: None,
|
||||
read_path: None,
|
||||
})),
|
||||
None => Err(PageReconstructError::MissingKey(Box::new(
|
||||
MissingKeyError {
|
||||
keyspace: KeySpace::single(key..key.next()),
|
||||
shard: self.shard_identity.get_shard_number(&key),
|
||||
original_hwm_lsn: lsn,
|
||||
ancestor_lsn: None,
|
||||
backtrace: None,
|
||||
read_path: None,
|
||||
query: None,
|
||||
},
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1174,21 +1185,18 @@ impl Timeline {
|
||||
/// which actually vectorizes the read path.
|
||||
pub(crate) async fn get_vectored(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
lsn: Lsn,
|
||||
query: VersionedKeySpaceQuery,
|
||||
io_concurrency: super::storage_layer::IoConcurrency,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
|
||||
if !lsn.is_valid() {
|
||||
return Err(GetVectoredError::InvalidLsn(lsn));
|
||||
}
|
||||
let total_keyspace = query.total_keyspace();
|
||||
|
||||
let key_count = keyspace.total_raw_size().try_into().unwrap();
|
||||
let key_count = total_keyspace.total_raw_size().try_into().unwrap();
|
||||
if key_count > Timeline::MAX_GET_VECTORED_KEYS {
|
||||
return Err(GetVectoredError::Oversized(key_count));
|
||||
}
|
||||
|
||||
for range in &keyspace.ranges {
|
||||
for range in &total_keyspace.ranges {
|
||||
let mut key = range.start;
|
||||
while key != range.end {
|
||||
assert!(!self.shard_identity.is_key_disposable(&key));
|
||||
@@ -1197,9 +1205,8 @@ impl Timeline {
|
||||
}
|
||||
|
||||
trace!(
|
||||
"get vectored request for {:?}@{} from task kind {:?}",
|
||||
keyspace,
|
||||
lsn,
|
||||
"get vectored query {} from task kind {:?}",
|
||||
query,
|
||||
ctx.task_kind(),
|
||||
);
|
||||
|
||||
@@ -1208,12 +1215,7 @@ impl Timeline {
|
||||
.map(|metric| (metric, Instant::now()));
|
||||
|
||||
let res = self
|
||||
.get_vectored_impl(
|
||||
keyspace.clone(),
|
||||
lsn,
|
||||
&mut ValuesReconstructState::new(io_concurrency),
|
||||
ctx,
|
||||
)
|
||||
.get_vectored_impl(query, &mut ValuesReconstructState::new(io_concurrency), ctx)
|
||||
.await;
|
||||
|
||||
if let Some((metric, start)) = start {
|
||||
@@ -1264,13 +1266,10 @@ impl Timeline {
|
||||
.for_task_kind(ctx.task_kind())
|
||||
.map(ScanLatencyOngoingRecording::start_recording);
|
||||
|
||||
let query = VersionedKeySpaceQuery::uniform(keyspace, lsn);
|
||||
|
||||
let vectored_res = self
|
||||
.get_vectored_impl(
|
||||
keyspace.clone(),
|
||||
lsn,
|
||||
&mut ValuesReconstructState::new(io_concurrency),
|
||||
ctx,
|
||||
)
|
||||
.get_vectored_impl(query, &mut ValuesReconstructState::new(io_concurrency), ctx)
|
||||
.await;
|
||||
|
||||
if let Some(recording) = start {
|
||||
@@ -1282,16 +1281,19 @@ impl Timeline {
|
||||
|
||||
pub(super) async fn get_vectored_impl(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
lsn: Lsn,
|
||||
query: VersionedKeySpaceQuery,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
|
||||
let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() {
|
||||
Some(ReadPath::new(keyspace.clone(), lsn))
|
||||
Some(ReadPath::new(
|
||||
query.total_keyspace(),
|
||||
query.high_watermark_lsn()?,
|
||||
))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
reconstruct_state.read_path = read_path;
|
||||
|
||||
let redo_attempt_type = if ctx.task_kind() == TaskKind::Compaction {
|
||||
@@ -1311,7 +1313,7 @@ impl Timeline {
|
||||
})
|
||||
.attached_child();
|
||||
|
||||
self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, &ctx)
|
||||
self.get_vectored_reconstruct_data(query.clone(), reconstruct_state, &ctx)
|
||||
.maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
|
||||
.await
|
||||
};
|
||||
@@ -1324,6 +1326,13 @@ impl Timeline {
|
||||
.map(|state| state.collect_pending_ios())
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
while collect_futs.next().await.is_some() {}
|
||||
|
||||
// Enrich the missing key error with the original query.
|
||||
if let GetVectoredError::MissingKey(mut missing_err) = err {
|
||||
missing_err.enrich(query.clone());
|
||||
return Err(GetVectoredError::MissingKey(missing_err));
|
||||
}
|
||||
|
||||
return Err(err);
|
||||
};
|
||||
|
||||
@@ -1341,6 +1350,8 @@ impl Timeline {
|
||||
|
||||
let futs = FuturesUnordered::new();
|
||||
for (key, state) in std::mem::take(&mut reconstruct_state.keys) {
|
||||
let req_lsn_for_key = query.map_key_to_lsn(&key);
|
||||
|
||||
futs.push({
|
||||
let walredo_self = self.myself.upgrade().expect("&self method holds the arc");
|
||||
let ctx = RequestContextBuilder::from(&ctx)
|
||||
@@ -1387,7 +1398,7 @@ impl Timeline {
|
||||
|
||||
let walredo_deltas = converted.num_deltas();
|
||||
let walredo_res = walredo_self
|
||||
.reconstruct_value(key, lsn, converted, redo_attempt_type)
|
||||
.reconstruct_value(key, req_lsn_for_key, converted, redo_attempt_type)
|
||||
.maybe_perf_instrument(&ctx, |crnt_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
@@ -1414,15 +1425,18 @@ impl Timeline {
|
||||
// to avoid infinite results.
|
||||
if !results.is_empty() {
|
||||
if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD {
|
||||
let total_keyspace = query.total_keyspace();
|
||||
let max_request_lsn = query.high_watermark_lsn().expect("Validated previously");
|
||||
|
||||
static LOG_PACER: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
|
||||
LOG_PACER.lock().unwrap().call(|| {
|
||||
let num_keys = keyspace.total_raw_size();
|
||||
let num_keys = total_keyspace.total_raw_size();
|
||||
let num_pages = results.len();
|
||||
tracing::info!(
|
||||
shard_id = %self.tenant_shard_id.shard_slug(),
|
||||
lsn = %lsn,
|
||||
"Vectored read for {keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.",
|
||||
lsn = %max_request_lsn,
|
||||
"Vectored read for {total_keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.",
|
||||
);
|
||||
});
|
||||
}
|
||||
@@ -2723,6 +2737,10 @@ impl Timeline {
|
||||
.tenant_conf
|
||||
.gc_compaction_enabled
|
||||
.unwrap_or(self.conf.default_tenant_conf.gc_compaction_enabled);
|
||||
let gc_compaction_verification = tenant_conf
|
||||
.tenant_conf
|
||||
.gc_compaction_verification
|
||||
.unwrap_or(self.conf.default_tenant_conf.gc_compaction_verification);
|
||||
let gc_compaction_initial_threshold_kb = tenant_conf
|
||||
.tenant_conf
|
||||
.gc_compaction_initial_threshold_kb
|
||||
@@ -2737,6 +2755,7 @@ impl Timeline {
|
||||
.unwrap_or(self.conf.default_tenant_conf.gc_compaction_ratio_percent);
|
||||
GcCompactionCombinedSettings {
|
||||
gc_compaction_enabled,
|
||||
gc_compaction_verification,
|
||||
gc_compaction_initial_threshold_kb,
|
||||
gc_compaction_ratio_percent,
|
||||
}
|
||||
@@ -3935,6 +3954,154 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
/// Type representing a query in the ([`Lsn`], [`Key`]) space.
|
||||
/// In other words, a set of segments in a 2D space.
|
||||
///
|
||||
/// This representation has the advatange of avoiding hash map
|
||||
/// allocations for uniform queries.
|
||||
pub(crate) enum VersionedKeySpaceQuery {
|
||||
/// Variant for queries at a single [`Lsn`]
|
||||
Uniform { keyspace: KeySpace, lsn: Lsn },
|
||||
/// Variant for queries at multiple [`Lsn`]s
|
||||
Scattered {
|
||||
keyspaces_at_lsn: Vec<(Lsn, KeySpace)>,
|
||||
},
|
||||
}
|
||||
|
||||
impl VersionedKeySpaceQuery {
|
||||
pub(crate) fn uniform(keyspace: KeySpace, lsn: Lsn) -> Self {
|
||||
Self::Uniform { keyspace, lsn }
|
||||
}
|
||||
|
||||
pub(crate) fn scattered(keyspaces_at_lsn: Vec<(Lsn, KeySpace)>) -> Self {
|
||||
Self::Scattered { keyspaces_at_lsn }
|
||||
}
|
||||
|
||||
/// Returns the most recent (largest) LSN included in the query.
|
||||
/// If any of the LSNs included in the query are invalid, returns
|
||||
/// an error instead.
|
||||
fn high_watermark_lsn(&self) -> Result<Lsn, GetVectoredError> {
|
||||
match self {
|
||||
Self::Uniform { lsn, .. } => {
|
||||
if !lsn.is_valid() {
|
||||
return Err(GetVectoredError::InvalidLsn(*lsn));
|
||||
}
|
||||
|
||||
Ok(*lsn)
|
||||
}
|
||||
Self::Scattered { keyspaces_at_lsn } => {
|
||||
let mut max_lsn = None;
|
||||
for (lsn, _keyspace) in keyspaces_at_lsn.iter() {
|
||||
if !lsn.is_valid() {
|
||||
return Err(GetVectoredError::InvalidLsn(*lsn));
|
||||
}
|
||||
max_lsn = std::cmp::max(max_lsn, Some(lsn));
|
||||
}
|
||||
|
||||
if let Some(computed) = max_lsn {
|
||||
Ok(*computed)
|
||||
} else {
|
||||
Err(GetVectoredError::Other(anyhow!("empty input")))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the total keyspace being queried: the result of projecting
|
||||
/// everything in the key dimensions onto the key axis.
|
||||
fn total_keyspace(&self) -> KeySpace {
|
||||
match self {
|
||||
Self::Uniform { keyspace, .. } => keyspace.clone(),
|
||||
Self::Scattered { keyspaces_at_lsn } => keyspaces_at_lsn
|
||||
.iter()
|
||||
.map(|(_lsn, keyspace)| keyspace)
|
||||
.fold(KeySpace::default(), |mut acc, v| {
|
||||
acc.merge(v);
|
||||
acc
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns LSN for a specific key.
|
||||
///
|
||||
/// Invariant: requested key must be part of [`Self::total_keyspace`]
|
||||
pub(super) fn map_key_to_lsn(&self, key: &Key) -> Lsn {
|
||||
match self {
|
||||
Self::Uniform { lsn, .. } => *lsn,
|
||||
Self::Scattered { keyspaces_at_lsn } => {
|
||||
keyspaces_at_lsn
|
||||
.iter()
|
||||
.find(|(_lsn, keyspace)| keyspace.contains(key))
|
||||
.expect("Returned key was requested")
|
||||
.0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Remove any parts of the query (segments) which overlap with the provided
|
||||
/// key space (also segments).
|
||||
fn remove_overlapping_with(&mut self, to_remove: &KeySpace) -> KeySpace {
|
||||
match self {
|
||||
Self::Uniform { keyspace, .. } => keyspace.remove_overlapping_with(to_remove),
|
||||
Self::Scattered { keyspaces_at_lsn } => {
|
||||
let mut removed_accum = KeySpaceRandomAccum::new();
|
||||
keyspaces_at_lsn.iter_mut().for_each(|(_lsn, keyspace)| {
|
||||
let removed = keyspace.remove_overlapping_with(to_remove);
|
||||
removed_accum.add_keyspace(removed);
|
||||
});
|
||||
|
||||
removed_accum.to_keyspace()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
match self {
|
||||
Self::Uniform { keyspace, .. } => keyspace.is_empty(),
|
||||
Self::Scattered { keyspaces_at_lsn } => keyspaces_at_lsn
|
||||
.iter()
|
||||
.all(|(_lsn, keyspace)| keyspace.is_empty()),
|
||||
}
|
||||
}
|
||||
|
||||
/// "Lower" the query on the LSN dimension
|
||||
fn lower(&mut self, to: Lsn) {
|
||||
match self {
|
||||
Self::Uniform { lsn, .. } => {
|
||||
// If the originally requested LSN is smaller than the starting
|
||||
// LSN of the ancestor we are descending into, we need to respect that.
|
||||
// Hence the min.
|
||||
*lsn = std::cmp::min(*lsn, to);
|
||||
}
|
||||
Self::Scattered { keyspaces_at_lsn } => {
|
||||
keyspaces_at_lsn.iter_mut().for_each(|(lsn, _keyspace)| {
|
||||
*lsn = std::cmp::min(*lsn, to);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for VersionedKeySpaceQuery {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "[")?;
|
||||
|
||||
match self {
|
||||
VersionedKeySpaceQuery::Uniform { keyspace, lsn } => {
|
||||
write!(f, "{keyspace} @ {lsn}")?;
|
||||
}
|
||||
VersionedKeySpaceQuery::Scattered { keyspaces_at_lsn } => {
|
||||
for (lsn, keyspace) in keyspaces_at_lsn.iter() {
|
||||
write!(f, "{keyspace} @ {lsn},")?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
write!(f, "]")
|
||||
}
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
#[allow(clippy::doc_lazy_continuation)]
|
||||
/// Get the data needed to reconstruct all keys in the provided keyspace
|
||||
@@ -3949,16 +4116,15 @@ impl Timeline {
|
||||
/// 2.4. If the fringe is empty, go back to 1
|
||||
async fn get_vectored_reconstruct_data(
|
||||
&self,
|
||||
mut keyspace: KeySpace,
|
||||
request_lsn: Lsn,
|
||||
mut query: VersionedKeySpaceQuery,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
let original_hwm_lsn = query.high_watermark_lsn().unwrap();
|
||||
|
||||
let mut timeline_owned: Arc<Timeline>;
|
||||
let mut timeline = self;
|
||||
|
||||
let mut cont_lsn = Lsn(request_lsn.0 + 1);
|
||||
|
||||
let missing_keyspace = loop {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(GetVectoredError::Cancelled);
|
||||
@@ -3975,15 +4141,14 @@ impl Timeline {
|
||||
parent: crnt_perf_span,
|
||||
"PLAN_IO_TIMELINE",
|
||||
timeline = %timeline.timeline_id,
|
||||
lsn = %cont_lsn,
|
||||
high_watermark_lsn = %query.high_watermark_lsn().unwrap(),
|
||||
)
|
||||
})
|
||||
.attached_child();
|
||||
|
||||
Self::get_vectored_reconstruct_data_timeline(
|
||||
timeline,
|
||||
keyspace.clone(),
|
||||
cont_lsn,
|
||||
&query,
|
||||
reconstruct_state,
|
||||
&self.cancel,
|
||||
&ctx,
|
||||
@@ -3992,23 +4157,23 @@ impl Timeline {
|
||||
.await?
|
||||
};
|
||||
|
||||
keyspace.remove_overlapping_with(&completed);
|
||||
query.remove_overlapping_with(&completed);
|
||||
|
||||
// Do not descend into the ancestor timeline for aux files.
|
||||
// We don't return a blanket [`GetVectoredError::MissingKey`] to avoid
|
||||
// stalling compaction.
|
||||
keyspace.remove_overlapping_with(&KeySpace {
|
||||
query.remove_overlapping_with(&KeySpace {
|
||||
ranges: vec![NON_INHERITED_RANGE, Key::sparse_non_inherited_keyspace()],
|
||||
});
|
||||
|
||||
// Keyspace is fully retrieved
|
||||
if keyspace.is_empty() {
|
||||
if query.is_empty() {
|
||||
break None;
|
||||
}
|
||||
|
||||
let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() else {
|
||||
// Not fully retrieved but no ancestor timeline.
|
||||
break Some(keyspace);
|
||||
break Some(query.total_keyspace());
|
||||
};
|
||||
|
||||
// Now we see if there are keys covered by the image layer but does not exist in the
|
||||
@@ -4019,7 +4184,7 @@ impl Timeline {
|
||||
// keys from `keyspace`, we expect there to be no overlap between it and the image covered key
|
||||
// space. If that's not the case, we had at least one key encounter a gap in the image layer
|
||||
// and stop the search as a result of that.
|
||||
let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
|
||||
let mut removed = query.remove_overlapping_with(&image_covered_keyspace);
|
||||
// Do not fire missing key error and end early for sparse keys. Note that we hava already removed
|
||||
// non-inherited keyspaces before, so we can safely do a full `SPARSE_RANGE` remove instead of
|
||||
// figuring out what is the inherited key range and do a fine-grained pruning.
|
||||
@@ -4029,11 +4194,11 @@ impl Timeline {
|
||||
if !removed.is_empty() {
|
||||
break Some(removed);
|
||||
}
|
||||
// If we reached this point, `remove_overlapping_with` should not have made any change to the
|
||||
// keyspace.
|
||||
|
||||
// Take the min to avoid reconstructing a page with data newer than request Lsn.
|
||||
cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
|
||||
// Each key range in the original query is at some point in the LSN space.
|
||||
// When descending into the ancestor, lower all ranges in the LSN space
|
||||
// such that new changes on the parent timeline are not visible.
|
||||
query.lower(timeline.ancestor_lsn);
|
||||
|
||||
let ctx = RequestContextBuilder::from(ctx)
|
||||
.perf_span(|crnt_perf_span| {
|
||||
@@ -4042,7 +4207,6 @@ impl Timeline {
|
||||
parent: crnt_perf_span,
|
||||
"GET_ANCESTOR",
|
||||
timeline = %timeline.timeline_id,
|
||||
lsn = %cont_lsn,
|
||||
ancestor = %ancestor_timeline.timeline_id,
|
||||
ancestor_lsn = %timeline.ancestor_lsn
|
||||
)
|
||||
@@ -4072,22 +4236,47 @@ impl Timeline {
|
||||
};
|
||||
|
||||
if let Some(missing_keyspace) = missing_keyspace {
|
||||
return Err(GetVectoredError::MissingKey(MissingKeyError {
|
||||
key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */
|
||||
shard: self
|
||||
.shard_identity
|
||||
.get_shard_number(&missing_keyspace.start().unwrap()),
|
||||
cont_lsn,
|
||||
request_lsn,
|
||||
return Err(GetVectoredError::MissingKey(Box::new(MissingKeyError {
|
||||
keyspace: missing_keyspace, /* better if we can store the full keyspace */
|
||||
shard: self.shard_identity.number,
|
||||
original_hwm_lsn,
|
||||
ancestor_lsn: Some(timeline.ancestor_lsn),
|
||||
backtrace: None,
|
||||
read_path: std::mem::take(&mut reconstruct_state.read_path),
|
||||
}));
|
||||
query: None,
|
||||
})));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn get_vectored_init_fringe(
|
||||
&self,
|
||||
query: &VersionedKeySpaceQuery,
|
||||
) -> Result<LayerFringe, GetVectoredError> {
|
||||
let mut fringe = LayerFringe::new();
|
||||
let guard = self.layers.read().await;
|
||||
|
||||
match query {
|
||||
VersionedKeySpaceQuery::Uniform { keyspace, lsn } => {
|
||||
// LSNs requested by the compute or determined by the pageserver
|
||||
// are inclusive. Queries to the layer map use exclusive LSNs.
|
||||
// Hence, bump the value before the query - same in the other
|
||||
// match arm.
|
||||
let cont_lsn = Lsn(lsn.0 + 1);
|
||||
guard.update_search_fringe(keyspace, cont_lsn, &mut fringe)?;
|
||||
}
|
||||
VersionedKeySpaceQuery::Scattered { keyspaces_at_lsn } => {
|
||||
for (lsn, keyspace) in keyspaces_at_lsn.iter() {
|
||||
let cont_lsn_for_keyspace = Lsn(lsn.0 + 1);
|
||||
guard.update_search_fringe(keyspace, cont_lsn_for_keyspace, &mut fringe)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(fringe)
|
||||
}
|
||||
|
||||
/// Collect the reconstruct data for a keyspace from the specified timeline.
|
||||
///
|
||||
/// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect
|
||||
@@ -4106,8 +4295,7 @@ impl Timeline {
|
||||
/// decides how to deal with these two keyspaces.
|
||||
async fn get_vectored_reconstruct_data_timeline(
|
||||
timeline: &Timeline,
|
||||
keyspace: KeySpace,
|
||||
mut cont_lsn: Lsn,
|
||||
query: &VersionedKeySpaceQuery,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
@@ -4123,14 +4311,7 @@ impl Timeline {
|
||||
let _guard = timeline.gc_compaction_layer_update_lock.read().await;
|
||||
|
||||
// Initialize the fringe
|
||||
let mut fringe = {
|
||||
let mut fringe = LayerFringe::new();
|
||||
|
||||
let guard = timeline.layers.read().await;
|
||||
guard.update_search_fringe(&keyspace, cont_lsn, &mut fringe)?;
|
||||
|
||||
fringe
|
||||
};
|
||||
let mut fringe = timeline.get_vectored_init_fringe(query).await?;
|
||||
|
||||
let mut completed_keyspace = KeySpace::default();
|
||||
let mut image_covered_keyspace = KeySpaceRandomAccum::new();
|
||||
@@ -4156,7 +4337,7 @@ impl Timeline {
|
||||
.await?;
|
||||
|
||||
let mut unmapped_keyspace = keyspace_to_read;
|
||||
cont_lsn = next_cont_lsn;
|
||||
let cont_lsn = next_cont_lsn;
|
||||
|
||||
reconstruct_state.on_layer_visited(&layer_to_read);
|
||||
|
||||
@@ -4805,7 +4986,13 @@ impl Timeline {
|
||||
let ctx = ctx.attached_child();
|
||||
let work = async move {
|
||||
let Some((desc, path)) = frozen_layer
|
||||
.write_to_disk(&ctx, key_range, self_clone.l0_flush_global_state.inner())
|
||||
.write_to_disk(
|
||||
&ctx,
|
||||
key_range,
|
||||
self_clone.l0_flush_global_state.inner(),
|
||||
&self_clone.gate,
|
||||
self_clone.cancel.clone(),
|
||||
)
|
||||
.await?
|
||||
else {
|
||||
return Ok(None);
|
||||
@@ -4991,13 +5178,11 @@ impl Timeline {
|
||||
if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
|
||||
|| (last_key_in_range && key_request_accum.raw_size() > 0)
|
||||
{
|
||||
let query =
|
||||
VersionedKeySpaceQuery::uniform(key_request_accum.consume_keyspace(), lsn);
|
||||
|
||||
let results = self
|
||||
.get_vectored(
|
||||
key_request_accum.consume_keyspace(),
|
||||
lsn,
|
||||
io_concurrency.clone(),
|
||||
ctx,
|
||||
)
|
||||
.get_vectored(query, io_concurrency.clone(), ctx)
|
||||
.await?;
|
||||
|
||||
if self.cancel.is_cancelled() {
|
||||
@@ -5086,7 +5271,11 @@ impl Timeline {
|
||||
// Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should
|
||||
// not contain too many keys, otherwise this takes a lot of memory.
|
||||
let data = self
|
||||
.get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
|
||||
.get_vectored_impl(
|
||||
VersionedKeySpaceQuery::uniform(partition.clone(), lsn),
|
||||
&mut reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let (data, total_kb_retrieved, total_keys_retrieved) = {
|
||||
let mut new_data = BTreeMap::new();
|
||||
@@ -5343,6 +5532,8 @@ impl Timeline {
|
||||
self.tenant_shard_id,
|
||||
&img_range,
|
||||
lsn,
|
||||
&self.gate,
|
||||
self.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -6707,6 +6898,8 @@ impl Timeline {
|
||||
self.tenant_shard_id,
|
||||
&(min_key..end_key),
|
||||
lsn,
|
||||
&self.gate,
|
||||
self.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -6768,6 +6961,8 @@ impl Timeline {
|
||||
self.tenant_shard_id,
|
||||
deltas.key_range.start,
|
||||
deltas.lsn_range,
|
||||
&self.gate,
|
||||
self.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -80,6 +80,7 @@ impl std::fmt::Display for GcCompactionJobId {
|
||||
|
||||
pub struct GcCompactionCombinedSettings {
|
||||
pub gc_compaction_enabled: bool,
|
||||
pub gc_compaction_verification: bool,
|
||||
pub gc_compaction_initial_threshold_kb: u64,
|
||||
pub gc_compaction_ratio_percent: u64,
|
||||
}
|
||||
@@ -225,6 +226,7 @@ impl GcCompactionQueue {
|
||||
gc_compaction_enabled,
|
||||
gc_compaction_initial_threshold_kb,
|
||||
gc_compaction_ratio_percent,
|
||||
..
|
||||
} = timeline.get_gc_compaction_settings();
|
||||
if !gc_compaction_enabled {
|
||||
return Ok(());
|
||||
@@ -747,8 +749,8 @@ impl KeyHistoryRetention {
|
||||
async fn pipe_to(
|
||||
self,
|
||||
key: Key,
|
||||
delta_writer: &mut SplitDeltaLayerWriter,
|
||||
mut image_writer: Option<&mut SplitImageLayerWriter>,
|
||||
delta_writer: &mut SplitDeltaLayerWriter<'_>,
|
||||
mut image_writer: Option<&mut SplitImageLayerWriter<'_>>,
|
||||
stat: &mut CompactionStatistics,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
@@ -788,6 +790,114 @@ impl KeyHistoryRetention {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Verify if every key in the retention is readable by replaying the logs.
|
||||
async fn verify(
|
||||
&self,
|
||||
key: Key,
|
||||
base_img_from_ancestor: &Option<(Key, Lsn, Bytes)>,
|
||||
full_history: &[(Key, Lsn, Value)],
|
||||
tline: &Arc<Timeline>,
|
||||
) -> anyhow::Result<()> {
|
||||
// Usually the min_lsn should be the first record but we do a full iteration to be safe.
|
||||
let Some(min_lsn) = full_history.iter().map(|(_, lsn, _)| *lsn).min() else {
|
||||
// This should never happen b/c if we don't have any history of a key, we won't even do `generate_key_retention`.
|
||||
return Ok(());
|
||||
};
|
||||
let Some(max_lsn) = full_history.iter().map(|(_, lsn, _)| *lsn).max() else {
|
||||
// This should never happen b/c if we don't have any history of a key, we won't even do `generate_key_retention`.
|
||||
return Ok(());
|
||||
};
|
||||
let mut base_img = base_img_from_ancestor
|
||||
.as_ref()
|
||||
.map(|(_, lsn, img)| (*lsn, img));
|
||||
let mut history = Vec::new();
|
||||
|
||||
async fn collect_and_verify(
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: &Option<(Lsn, &Bytes)>,
|
||||
history: &[(Lsn, &NeonWalRecord)],
|
||||
tline: &Arc<Timeline>,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut records = history
|
||||
.iter()
|
||||
.map(|(lsn, val)| (*lsn, (*val).clone()))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// WAL redo requires records in the reverse LSN order
|
||||
records.reverse();
|
||||
let data = ValueReconstructState {
|
||||
img: base_img.as_ref().map(|(lsn, img)| (*lsn, (*img).clone())),
|
||||
records,
|
||||
};
|
||||
|
||||
tline
|
||||
.reconstruct_value(key, lsn, data, RedoAttemptType::GcCompaction)
|
||||
.await
|
||||
.with_context(|| format!("verification failed for key {} at lsn {}", key, lsn))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
for (retain_lsn, KeyLogAtLsn(logs)) in &self.below_horizon {
|
||||
for (lsn, val) in logs {
|
||||
match val {
|
||||
Value::Image(img) => {
|
||||
base_img = Some((*lsn, img));
|
||||
history.clear();
|
||||
}
|
||||
Value::WalRecord(rec) if val.will_init() => {
|
||||
base_img = None;
|
||||
history.clear();
|
||||
history.push((*lsn, rec));
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
history.push((*lsn, rec));
|
||||
}
|
||||
}
|
||||
}
|
||||
if *retain_lsn >= min_lsn {
|
||||
// Only verify after the key appears in the full history for the first time.
|
||||
|
||||
if base_img.is_none() && history.is_empty() {
|
||||
anyhow::bail!(
|
||||
"verificatoin failed: key {} has no history at {}",
|
||||
key,
|
||||
retain_lsn
|
||||
);
|
||||
};
|
||||
// We don't modify history: in theory, we could replace the history with a single
|
||||
// image as in `generate_key_retention` to make redos at later LSNs faster. But we
|
||||
// want to verify everything as if they are read from the real layer map.
|
||||
collect_and_verify(key, *retain_lsn, &base_img, &history, tline).await?;
|
||||
}
|
||||
}
|
||||
|
||||
for (lsn, val) in &self.above_horizon.0 {
|
||||
match val {
|
||||
Value::Image(img) => {
|
||||
// Above the GC horizon, we verify every time we see an image.
|
||||
collect_and_verify(key, *lsn, &base_img, &history, tline).await?;
|
||||
base_img = Some((*lsn, img));
|
||||
history.clear();
|
||||
}
|
||||
Value::WalRecord(rec) if val.will_init() => {
|
||||
// Above the GC horizon, we verify every time we see an init record.
|
||||
collect_and_verify(key, *lsn, &base_img, &history, tline).await?;
|
||||
base_img = None;
|
||||
history.clear();
|
||||
history.push((*lsn, rec));
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
history.push((*lsn, rec));
|
||||
}
|
||||
}
|
||||
}
|
||||
// Ensure the latest record is readable.
|
||||
collect_and_verify(key, max_lsn, &base_img, &history, tline).await?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Default)]
|
||||
@@ -1119,7 +1229,17 @@ impl Timeline {
|
||||
// being potentially much longer.
|
||||
let rewrite_max = partition_count;
|
||||
|
||||
self.compact_shard_ancestors(rewrite_max, ctx).await?;
|
||||
let outcome = self
|
||||
.compact_shard_ancestors(
|
||||
rewrite_max,
|
||||
options.flags.contains(CompactFlags::YieldForL0),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
match outcome {
|
||||
CompactionOutcome::Pending | CompactionOutcome::YieldForL0 => return Ok(outcome),
|
||||
CompactionOutcome::Done | CompactionOutcome::Skipped => {}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(CompactionOutcome::Done)
|
||||
@@ -1136,8 +1256,10 @@ impl Timeline {
|
||||
async fn compact_shard_ancestors(
|
||||
self: &Arc<Self>,
|
||||
rewrite_max: usize,
|
||||
yield_for_l0: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), CompactionError> {
|
||||
) -> Result<CompactionOutcome, CompactionError> {
|
||||
let mut outcome = CompactionOutcome::Done;
|
||||
let mut drop_layers = Vec::new();
|
||||
let mut layers_to_rewrite: Vec<Layer> = Vec::new();
|
||||
|
||||
@@ -1148,15 +1270,13 @@ impl Timeline {
|
||||
// Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
|
||||
// are rewriting layers.
|
||||
let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn();
|
||||
|
||||
tracing::info!(
|
||||
"starting shard ancestor compaction, latest_gc_cutoff: {}, pitr cutoff {}",
|
||||
*latest_gc_cutoff,
|
||||
self.gc_info.read().unwrap().cutoffs.time
|
||||
);
|
||||
let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.time;
|
||||
|
||||
let layers = self.layers.read().await;
|
||||
for layer_desc in layers.layer_map()?.iter_historic_layers() {
|
||||
let layers_iter = layers.layer_map()?.iter_historic_layers();
|
||||
let (layers_total, mut layers_checked) = (layers_iter.len(), 0);
|
||||
for layer_desc in layers_iter {
|
||||
layers_checked += 1;
|
||||
let layer = layers.get_from_desc(&layer_desc);
|
||||
if layer.metadata().shard.shard_count == self.shard_identity.count {
|
||||
// This layer does not belong to a historic ancestor, no need to re-image it.
|
||||
@@ -1171,8 +1291,8 @@ impl Timeline {
|
||||
// This ancestral layer only covers keys that belong to other shards.
|
||||
// We include the full metadata in the log: if we had some critical bug that caused
|
||||
// us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers.
|
||||
info!(%layer, old_metadata=?layer.metadata(),
|
||||
"dropping layer after shard split, contains no keys for this shard.",
|
||||
debug!(%layer, old_metadata=?layer.metadata(),
|
||||
"dropping layer after shard split, contains no keys for this shard",
|
||||
);
|
||||
|
||||
if cfg!(debug_assertions) {
|
||||
@@ -1234,19 +1354,36 @@ impl Timeline {
|
||||
}
|
||||
|
||||
if layers_to_rewrite.len() >= rewrite_max {
|
||||
tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
|
||||
debug!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
|
||||
layers_to_rewrite.len()
|
||||
);
|
||||
continue;
|
||||
outcome = CompactionOutcome::Pending;
|
||||
break;
|
||||
}
|
||||
|
||||
// Fall through: all our conditions for doing a rewrite passed.
|
||||
layers_to_rewrite.push(layer);
|
||||
}
|
||||
|
||||
// Drop read lock on layer map before we start doing time-consuming I/O
|
||||
// Drop read lock on layer map before we start doing time-consuming I/O.
|
||||
drop(layers);
|
||||
|
||||
// Drop out early if there's nothing to do.
|
||||
if layers_to_rewrite.is_empty() && drop_layers.is_empty() {
|
||||
return Ok(CompactionOutcome::Done);
|
||||
}
|
||||
|
||||
info!(
|
||||
"starting shard ancestor compaction, rewriting {} layers and dropping {} layers, \
|
||||
checked {layers_checked}/{layers_total} layers \
|
||||
(latest_gc_cutoff={} pitr_cutoff={})",
|
||||
layers_to_rewrite.len(),
|
||||
drop_layers.len(),
|
||||
*latest_gc_cutoff,
|
||||
pitr_cutoff,
|
||||
);
|
||||
let started = Instant::now();
|
||||
|
||||
let mut replace_image_layers = Vec::new();
|
||||
|
||||
for layer in layers_to_rewrite {
|
||||
@@ -1254,13 +1391,15 @@ impl Timeline {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
}
|
||||
|
||||
tracing::info!(layer=%layer, "Rewriting layer after shard split...");
|
||||
info!(layer=%layer, "rewriting layer after shard split");
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
&layer.layer_desc().key_range,
|
||||
layer.layer_desc().image_layer_lsn(),
|
||||
&self.gate,
|
||||
self.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -1292,7 +1431,7 @@ impl Timeline {
|
||||
.map_err(CompactionError::Other)?;
|
||||
let new_layer = Layer::finish_creating(self.conf, self, desc, &path)
|
||||
.map_err(CompactionError::Other)?;
|
||||
tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
|
||||
info!(layer=%new_layer, "rewrote layer, {} -> {} bytes",
|
||||
layer.metadata().file_size,
|
||||
new_layer.metadata().file_size);
|
||||
|
||||
@@ -1302,6 +1441,26 @@ impl Timeline {
|
||||
// the layer has no data for us with the ShardedRange check above, but
|
||||
drop_layers.push(layer);
|
||||
}
|
||||
|
||||
// Yield for L0 compaction if necessary, but make sure we update the layer map below
|
||||
// with the work we've already done.
|
||||
if yield_for_l0
|
||||
&& self
|
||||
.l0_compaction_trigger
|
||||
.notified()
|
||||
.now_or_never()
|
||||
.is_some()
|
||||
{
|
||||
info!("shard ancestor compaction yielding for L0 compaction");
|
||||
outcome = CompactionOutcome::YieldForL0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for layer in &drop_layers {
|
||||
info!(%layer, old_metadata=?layer.metadata(),
|
||||
"dropping layer after shard split (no keys for this shard)",
|
||||
);
|
||||
}
|
||||
|
||||
// At this point, we have replaced local layer files with their rewritten form, but not yet uploaded
|
||||
@@ -1319,17 +1478,36 @@ impl Timeline {
|
||||
// necessary for correctness, but it simplifies testing, and avoids proceeding with another
|
||||
// Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
|
||||
// load.
|
||||
match self.remote_client.wait_completion().await {
|
||||
Ok(()) => (),
|
||||
Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
|
||||
Err(WaitCompletionError::UploadQueueShutDownOrStopped) => {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
if outcome != CompactionOutcome::YieldForL0 {
|
||||
info!("shard ancestor compaction waiting for uploads");
|
||||
tokio::select! {
|
||||
result = self.remote_client.wait_completion() => match result {
|
||||
Ok(()) => {},
|
||||
Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
|
||||
Err(WaitCompletionError::UploadQueueShutDownOrStopped) => {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
}
|
||||
},
|
||||
// Don't wait if there's L0 compaction to do. We don't need to update the outcome
|
||||
// here, because we've already done the actual work.
|
||||
_ = self.l0_compaction_trigger.notified(), if yield_for_l0 => {},
|
||||
}
|
||||
}
|
||||
|
||||
info!(
|
||||
"shard ancestor compaction done in {:.3}s{}",
|
||||
started.elapsed().as_secs_f64(),
|
||||
match outcome {
|
||||
CompactionOutcome::Pending =>
|
||||
format!(", with pending work (rewrite_max={rewrite_max})"),
|
||||
CompactionOutcome::YieldForL0 => String::from(", yielding for L0 compaction"),
|
||||
CompactionOutcome::Skipped | CompactionOutcome::Done => String::new(),
|
||||
}
|
||||
);
|
||||
|
||||
fail::fail_point!("compact-shard-ancestors-persistent");
|
||||
|
||||
Ok(())
|
||||
Ok(outcome)
|
||||
}
|
||||
|
||||
/// Update the LayerVisibilityHint of layers covered by image layers, based on whether there is
|
||||
@@ -1861,6 +2039,8 @@ impl Timeline {
|
||||
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
|
||||
lsn_range.clone()
|
||||
},
|
||||
&self.gate,
|
||||
self.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -2148,6 +2328,7 @@ impl Timeline {
|
||||
/// ```
|
||||
///
|
||||
/// Note that `accumulated_values` must be sorted by LSN and should belong to a single key.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) async fn generate_key_retention(
|
||||
self: &Arc<Timeline>,
|
||||
key: Key,
|
||||
@@ -2156,6 +2337,7 @@ impl Timeline {
|
||||
retain_lsn_below_horizon: &[Lsn],
|
||||
delta_threshold_cnt: usize,
|
||||
base_img_from_ancestor: Option<(Key, Lsn, Bytes)>,
|
||||
verification: bool,
|
||||
) -> anyhow::Result<KeyHistoryRetention> {
|
||||
// Pre-checks for the invariants
|
||||
|
||||
@@ -2242,8 +2424,8 @@ impl Timeline {
|
||||
"should have at least below + above horizon batches"
|
||||
);
|
||||
let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new();
|
||||
if let Some((key, lsn, img)) = base_img_from_ancestor {
|
||||
replay_history.push((key, lsn, Value::Image(img)));
|
||||
if let Some((key, lsn, ref img)) = base_img_from_ancestor {
|
||||
replay_history.push((key, lsn, Value::Image(img.clone())));
|
||||
}
|
||||
|
||||
/// Generate debug information for the replay history
|
||||
@@ -2357,22 +2539,15 @@ impl Timeline {
|
||||
// Whether to reconstruct the image. In debug mode, we will generate an image
|
||||
// at every retain_lsn to ensure data is not corrupted, but we won't put the
|
||||
// image into the final layer.
|
||||
let generate_image = produce_image || debug_mode;
|
||||
if produce_image {
|
||||
let img_and_lsn = if produce_image {
|
||||
records_since_last_image = 0;
|
||||
}
|
||||
let img_and_lsn = if generate_image {
|
||||
let replay_history_for_debug = if debug_mode {
|
||||
Some(replay_history.clone())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let replay_history_for_debug_ref = replay_history_for_debug.as_deref();
|
||||
let history = if produce_image {
|
||||
std::mem::take(&mut replay_history)
|
||||
} else {
|
||||
replay_history.clone()
|
||||
};
|
||||
let history = std::mem::take(&mut replay_history);
|
||||
let mut img = None;
|
||||
let mut records = Vec::with_capacity(history.len());
|
||||
if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() {
|
||||
@@ -2407,6 +2582,7 @@ impl Timeline {
|
||||
records.push((lsn, rec));
|
||||
}
|
||||
}
|
||||
// WAL redo requires records in the reverse LSN order
|
||||
records.reverse();
|
||||
let state = ValueReconstructState { img, records };
|
||||
// last batch does not generate image so i is always in range, unless we force generate
|
||||
@@ -2439,10 +2615,16 @@ impl Timeline {
|
||||
assert_eq!(retention.len(), lsn_split_points.len() + 1);
|
||||
for (idx, logs) in retention.into_iter().enumerate() {
|
||||
if idx == lsn_split_points.len() {
|
||||
return Ok(KeyHistoryRetention {
|
||||
let retention = KeyHistoryRetention {
|
||||
below_horizon: result,
|
||||
above_horizon: KeyLogAtLsn(logs),
|
||||
});
|
||||
};
|
||||
if verification {
|
||||
retention
|
||||
.verify(key, &base_img_from_ancestor, full_history, self)
|
||||
.await?;
|
||||
}
|
||||
return Ok(retention);
|
||||
} else {
|
||||
result.push((lsn_split_points[idx], KeyLogAtLsn(logs)));
|
||||
}
|
||||
@@ -2909,6 +3091,9 @@ impl Timeline {
|
||||
}
|
||||
(false, res)
|
||||
};
|
||||
|
||||
let verification = self.get_gc_compaction_settings().gc_compaction_verification;
|
||||
|
||||
info!(
|
||||
"picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} min_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}, has_data_below={}",
|
||||
job_desc.selected_layers.len(),
|
||||
@@ -3055,6 +3240,8 @@ impl Timeline {
|
||||
job_desc.compaction_key_range.start,
|
||||
lowest_retain_lsn,
|
||||
self.get_compaction_target_size(),
|
||||
&self.gate,
|
||||
self.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -3071,6 +3258,8 @@ impl Timeline {
|
||||
self.tenant_shard_id,
|
||||
lowest_retain_lsn..end_lsn,
|
||||
self.get_compaction_target_size(),
|
||||
&self.gate,
|
||||
self.cancel.clone(),
|
||||
)
|
||||
.await
|
||||
.context("failed to create delta layer writer")
|
||||
@@ -3167,6 +3356,8 @@ impl Timeline {
|
||||
self.tenant_shard_id,
|
||||
desc.key_range.start,
|
||||
desc.lsn_range.clone(),
|
||||
&self.gate,
|
||||
self.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -3184,6 +3375,8 @@ impl Timeline {
|
||||
self.tenant_shard_id,
|
||||
job_desc.compaction_key_range.end,
|
||||
desc.lsn_range.clone(),
|
||||
&self.gate,
|
||||
self.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -3225,6 +3418,7 @@ impl Timeline {
|
||||
.await
|
||||
.context("failed to get ancestor image")
|
||||
.map_err(CompactionError::Other)?,
|
||||
verification,
|
||||
)
|
||||
.await
|
||||
.context("failed to generate key retention")
|
||||
@@ -3265,6 +3459,7 @@ impl Timeline {
|
||||
.await
|
||||
.context("failed to get ancestor image")
|
||||
.map_err(CompactionError::Other)?,
|
||||
verification,
|
||||
)
|
||||
.await
|
||||
.context("failed to generate key retention")
|
||||
@@ -3753,6 +3948,8 @@ impl CompactionJobExecutor for TimelineAdaptor {
|
||||
self.timeline.tenant_shard_id,
|
||||
key_range.start,
|
||||
lsn_range.clone(),
|
||||
&self.timeline.gate,
|
||||
self.timeline.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -3828,6 +4025,8 @@ impl TimelineAdaptor {
|
||||
self.timeline.tenant_shard_id,
|
||||
key_range,
|
||||
lsn,
|
||||
&self.timeline.gate,
|
||||
self.timeline.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -30,6 +30,7 @@ use crate::tenant::storage_layer::{
|
||||
AsLayerDesc as _, DeltaLayerWriter, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer,
|
||||
ValuesReconstructState,
|
||||
};
|
||||
use crate::tenant::timeline::VersionedKeySpaceQuery;
|
||||
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -212,13 +213,9 @@ async fn generate_tombstone_image_layer(
|
||||
}
|
||||
}
|
||||
|
||||
let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key_range.clone()), image_lsn);
|
||||
let data = ancestor
|
||||
.get_vectored_impl(
|
||||
KeySpace::single(key_range.clone()),
|
||||
image_lsn,
|
||||
&mut reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.get_vectored_impl(query, &mut reconstruct_state, ctx)
|
||||
.await
|
||||
.context("failed to retrieve aux keys")
|
||||
.map_err(|e| Error::launder(e, Error::Prepare))?;
|
||||
@@ -231,6 +228,8 @@ async fn generate_tombstone_image_layer(
|
||||
detached.tenant_shard_id,
|
||||
&key_range,
|
||||
image_lsn,
|
||||
&detached.gate,
|
||||
detached.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -779,6 +778,8 @@ async fn copy_lsn_prefix(
|
||||
target_timeline.tenant_shard_id,
|
||||
layer.layer_desc().key_range.start,
|
||||
layer.layer_desc().lsn_range.start..end_lsn,
|
||||
&target_timeline.gate,
|
||||
target_timeline.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -738,6 +738,8 @@ impl ChunkProcessingJob {
|
||||
self.timeline.tenant_shard_id,
|
||||
&self.range,
|
||||
self.pgdata_lsn,
|
||||
&self.timeline.gate,
|
||||
self.timeline.cancel.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -580,6 +580,7 @@ impl ConnectionManagerState {
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
WalReceiverError::Cancelled => Ok(()),
|
||||
WalReceiverError::Other(e) => {
|
||||
// give out an error to have task_mgr give it a really verbose logging
|
||||
if cancellation.is_cancelled() {
|
||||
|
||||
@@ -73,6 +73,7 @@ pub(super) enum WalReceiverError {
|
||||
/// Generic error
|
||||
Other(anyhow::Error),
|
||||
ClosedGate,
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
impl From<tokio_postgres::Error> for WalReceiverError {
|
||||
@@ -200,6 +201,9 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
// with a similar error.
|
||||
},
|
||||
WalReceiverError::SuccessfulCompletion(_) => {}
|
||||
WalReceiverError::Cancelled => {
|
||||
debug!("Connection cancelled")
|
||||
}
|
||||
WalReceiverError::ClosedGate => {
|
||||
// doesn't happen at runtime
|
||||
}
|
||||
@@ -273,7 +277,12 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);
|
||||
|
||||
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?;
|
||||
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx)
|
||||
.await
|
||||
.map_err(|e| match e.kind {
|
||||
crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
|
||||
_ => WalReceiverError::Other(e.into()),
|
||||
})?;
|
||||
|
||||
let shard = vec![*timeline.get_shard_identity()];
|
||||
|
||||
|
||||
@@ -21,13 +21,13 @@
|
||||
//! redo Postgres process, but some records it can handle directly with
|
||||
//! bespoken Rust code.
|
||||
|
||||
use std::backtrace::Backtrace;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{Arc, OnceLock};
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
|
||||
use anyhow::{Result, bail};
|
||||
use bytes::{Buf, Bytes};
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
use pageserver_api::key::{Key, rel_block_to_key};
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
@@ -38,7 +38,7 @@ use postgres_ffi::{
|
||||
fsm_logical_to_physical, pg_constants,
|
||||
};
|
||||
use tracing::*;
|
||||
use utils::bin_ser::SerializeError;
|
||||
use utils::bin_ser::{DeserializeError, SerializeError};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::rate_limit::RateLimit;
|
||||
use utils::{critical, failpoint_support};
|
||||
@@ -104,12 +104,101 @@ struct WarnIngestLag {
|
||||
timestamp_invalid_msg_ratelimit: RateLimit,
|
||||
}
|
||||
|
||||
pub struct WalIngestError {
|
||||
pub backtrace: std::backtrace::Backtrace,
|
||||
pub kind: WalIngestErrorKind,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum WalIngestErrorKind {
|
||||
#[error(transparent)]
|
||||
#[allow(private_interfaces)]
|
||||
PageReconstructError(#[from] PageReconstructError),
|
||||
#[error(transparent)]
|
||||
DeserializationFailure(#[from] DeserializeError),
|
||||
#[error(transparent)]
|
||||
SerializationFailure(#[from] SerializeError),
|
||||
#[error("the request contains data not supported by pageserver: {0} @ {1}")]
|
||||
InvalidKey(Key, Lsn),
|
||||
#[error("twophase file for xid {0} already exists")]
|
||||
FileAlreadyExists(u64),
|
||||
#[error("slru segment {0:?}/{1} already exists")]
|
||||
SlruAlreadyExists(SlruKind, u32),
|
||||
#[error("relation already exists")]
|
||||
RelationAlreadyExists(RelTag),
|
||||
#[error("invalid reldir key {0}")]
|
||||
InvalidRelDirKey(Key),
|
||||
|
||||
#[error(transparent)]
|
||||
LogicalError(anyhow::Error),
|
||||
#[error(transparent)]
|
||||
EncodeAuxFileError(anyhow::Error),
|
||||
#[error(transparent)]
|
||||
MaybeRelSizeV2Error(anyhow::Error),
|
||||
|
||||
#[error("timeline shutting down")]
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
impl<T> From<T> for WalIngestError
|
||||
where
|
||||
WalIngestErrorKind: From<T>,
|
||||
{
|
||||
fn from(value: T) -> Self {
|
||||
WalIngestError {
|
||||
backtrace: Backtrace::capture(),
|
||||
kind: WalIngestErrorKind::from(value),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for WalIngestError {
|
||||
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
|
||||
self.kind.source()
|
||||
}
|
||||
}
|
||||
|
||||
impl core::fmt::Display for WalIngestError {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
||||
self.kind.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
impl core::fmt::Debug for WalIngestError {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
||||
if f.alternate() {
|
||||
f.debug_map()
|
||||
.key(&"backtrace")
|
||||
.value(&self.backtrace)
|
||||
.key(&"kind")
|
||||
.value(&self.kind)
|
||||
.finish()
|
||||
} else {
|
||||
writeln!(f, "Error: {:?}", self.kind)?;
|
||||
if self.backtrace.status() == std::backtrace::BacktraceStatus::Captured {
|
||||
writeln!(f, "Stack backtrace: {:?}", self.backtrace)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[macro_export]
|
||||
macro_rules! ensure_walingest {
|
||||
($($t:tt)*) => {
|
||||
_ = || -> Result<(), anyhow::Error> {
|
||||
anyhow::ensure!($($t)*);
|
||||
Ok(())
|
||||
}().map_err(WalIngestErrorKind::LogicalError)?;
|
||||
};
|
||||
}
|
||||
|
||||
impl WalIngest {
|
||||
pub async fn new(
|
||||
timeline: &Timeline,
|
||||
startpoint: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<WalIngest> {
|
||||
) -> Result<WalIngest, WalIngestError> {
|
||||
// Fetch the latest checkpoint into memory, so that we can compare with it
|
||||
// quickly in `ingest_record` and update it when it changes.
|
||||
let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?;
|
||||
@@ -145,7 +234,7 @@ impl WalIngest {
|
||||
interpreted: InterpretedWalRecord,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<bool> {
|
||||
) -> Result<bool, WalIngestError> {
|
||||
WAL_INGEST.records_received.inc();
|
||||
let prev_len = modification.len();
|
||||
|
||||
@@ -288,7 +377,7 @@ impl WalIngest {
|
||||
}
|
||||
|
||||
/// This is the same as AdjustToFullTransactionId(xid) in PostgreSQL
|
||||
fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result<u64> {
|
||||
fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result<u64, WalIngestError> {
|
||||
let next_full_xid =
|
||||
enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, cp, { cp.nextXid.value });
|
||||
|
||||
@@ -298,9 +387,9 @@ impl WalIngest {
|
||||
if xid > next_xid {
|
||||
// Wraparound occurred, must be from a prev epoch.
|
||||
if epoch == 0 {
|
||||
bail!(
|
||||
Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!(
|
||||
"apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}"
|
||||
);
|
||||
)))?;
|
||||
}
|
||||
epoch -= 1;
|
||||
}
|
||||
@@ -313,7 +402,7 @@ impl WalIngest {
|
||||
clear_vm_bits: ClearVmBits,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let ClearVmBits {
|
||||
new_heap_blkno,
|
||||
old_heap_blkno,
|
||||
@@ -402,7 +491,7 @@ impl WalIngest {
|
||||
create: DbaseCreate,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let DbaseCreate {
|
||||
db_id,
|
||||
tablespace_id,
|
||||
@@ -505,7 +594,7 @@ impl WalIngest {
|
||||
dbase_drop: DbaseDrop,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let DbaseDrop {
|
||||
db_id,
|
||||
tablespace_ids,
|
||||
@@ -523,7 +612,7 @@ impl WalIngest {
|
||||
create: SmgrCreate,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let SmgrCreate { rel } = create;
|
||||
self.put_rel_creation(modification, rel, ctx).await?;
|
||||
Ok(())
|
||||
@@ -537,7 +626,7 @@ impl WalIngest {
|
||||
truncate: XlSmgrTruncate,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let XlSmgrTruncate {
|
||||
blkno,
|
||||
rnode,
|
||||
@@ -689,7 +778,7 @@ impl WalIngest {
|
||||
record: XactRecord,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let (xact_common, is_commit, is_prepared) = match record {
|
||||
XactRecord::Prepare(XactPrepare { xl_xid, data }) => {
|
||||
let xid: u64 = if modification.tline.pg_version >= 17 {
|
||||
@@ -813,7 +902,7 @@ impl WalIngest {
|
||||
truncate: ClogTruncate,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let ClogTruncate {
|
||||
pageno,
|
||||
oldest_xid,
|
||||
@@ -889,7 +978,7 @@ impl WalIngest {
|
||||
zero_page: ClogZeroPage,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let ClogZeroPage { segno, rpageno } = zero_page;
|
||||
|
||||
self.put_slru_page_image(
|
||||
@@ -907,7 +996,7 @@ impl WalIngest {
|
||||
&mut self,
|
||||
modification: &mut DatadirModification,
|
||||
xlrec: &XlMultiXactCreate,
|
||||
) -> Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
// Create WAL record for updating the multixact-offsets page
|
||||
let pageno = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
@@ -1010,7 +1099,7 @@ impl WalIngest {
|
||||
modification: &mut DatadirModification<'_>,
|
||||
xlrec: &XlMultiXactTruncate,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let (maxsegment, startsegment, endsegment) =
|
||||
enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
|
||||
cp.oldestMulti = xlrec.end_trunc_off;
|
||||
@@ -1058,7 +1147,7 @@ impl WalIngest {
|
||||
zero_page: MultiXactZeroPage,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let MultiXactZeroPage {
|
||||
slru_kind,
|
||||
segno,
|
||||
@@ -1080,7 +1169,7 @@ impl WalIngest {
|
||||
update: RelmapUpdate,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let RelmapUpdate { update, buf } = update;
|
||||
|
||||
modification
|
||||
@@ -1093,7 +1182,7 @@ impl WalIngest {
|
||||
raw_record: RawXlogRecord,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let RawXlogRecord { info, lsn, mut buf } = raw_record;
|
||||
let pg_version = modification.tline.pg_version;
|
||||
|
||||
@@ -1235,12 +1324,12 @@ impl WalIngest {
|
||||
put: PutLogicalMessage,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let PutLogicalMessage { path, buf } = put;
|
||||
modification.put_file(path.as_str(), &buf, ctx).await
|
||||
}
|
||||
|
||||
fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<()> {
|
||||
fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<(), WalIngestError> {
|
||||
match record {
|
||||
StandbyRecord::RunningXacts(running_xacts) => {
|
||||
enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
|
||||
@@ -1258,7 +1347,7 @@ impl WalIngest {
|
||||
&mut self,
|
||||
record: ReploriginRecord,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
) -> Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
match record {
|
||||
ReploriginRecord::Set(set) => {
|
||||
modification
|
||||
@@ -1278,7 +1367,7 @@ impl WalIngest {
|
||||
modification: &mut DatadirModification<'_>,
|
||||
rel: RelTag,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
modification.put_rel_creation(rel, 0, ctx).await?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -1291,7 +1380,7 @@ impl WalIngest {
|
||||
blknum: BlockNumber,
|
||||
img: Bytes,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), PageReconstructError> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
self.handle_rel_extend(modification, rel, blknum, ctx)
|
||||
.await?;
|
||||
modification.put_rel_page_image(rel, blknum, img)?;
|
||||
@@ -1305,7 +1394,7 @@ impl WalIngest {
|
||||
blknum: BlockNumber,
|
||||
rec: NeonWalRecord,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
self.handle_rel_extend(modification, rel, blknum, ctx)
|
||||
.await?;
|
||||
modification.put_rel_wal_record(rel, blknum, rec)?;
|
||||
@@ -1318,7 +1407,7 @@ impl WalIngest {
|
||||
rel: RelTag,
|
||||
nblocks: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
modification.put_rel_truncation(rel, nblocks, ctx).await?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -1329,7 +1418,7 @@ impl WalIngest {
|
||||
rel: RelTag,
|
||||
blknum: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), PageReconstructError> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let new_nblocks = blknum + 1;
|
||||
// Check if the relation exists. We implicitly create relations on first
|
||||
// record.
|
||||
@@ -1423,7 +1512,7 @@ impl WalIngest {
|
||||
blknum: BlockNumber,
|
||||
img: Bytes,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
if !self.shard.is_shard_zero() {
|
||||
return Ok(());
|
||||
}
|
||||
@@ -1441,7 +1530,7 @@ impl WalIngest {
|
||||
segno: u32,
|
||||
blknum: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
// we don't use a cache for this like we do for relations. SLRUS are explcitly
|
||||
// extended with ZEROPAGE records, not with commit records, so it happens
|
||||
// a lot less frequently.
|
||||
@@ -1509,6 +1598,7 @@ async fn get_relsize(
|
||||
#[allow(clippy::bool_assert_comparison)]
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use anyhow::Result;
|
||||
use postgres_ffi::RELSEG_SIZE;
|
||||
|
||||
use super::*;
|
||||
@@ -1530,7 +1620,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_zeroed_checkpoint_decodes_correctly() -> Result<()> {
|
||||
async fn test_zeroed_checkpoint_decodes_correctly() -> Result<(), anyhow::Error> {
|
||||
for i in 14..=16 {
|
||||
dispatch_pgversion!(i, {
|
||||
pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?;
|
||||
|
||||
@@ -2118,9 +2118,6 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
|
||||
*/
|
||||
if (wp->config->syncSafekeepers)
|
||||
{
|
||||
int n_synced;
|
||||
|
||||
n_synced = 0;
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
Safekeeper *sk = &wp->safekeeper[i];
|
||||
@@ -2129,8 +2126,6 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
|
||||
/* alive safekeeper which is not synced yet; wait for it */
|
||||
if (sk->state != SS_OFFLINE && !synced)
|
||||
return;
|
||||
if (synced)
|
||||
n_synced++;
|
||||
}
|
||||
|
||||
if (newCommitLsn >= wp->propTermStartLsn)
|
||||
|
||||
@@ -509,7 +509,14 @@ pub async fn run() -> anyhow::Result<()> {
|
||||
if let Some(mut redis_kv_client) = redis_kv_client {
|
||||
maintenance_tasks.spawn(async move {
|
||||
redis_kv_client.try_connect().await?;
|
||||
handle_cancel_messages(&mut redis_kv_client, rx_cancel).await
|
||||
handle_cancel_messages(&mut redis_kv_client, rx_cancel).await?;
|
||||
|
||||
drop(redis_kv_client);
|
||||
|
||||
// `handle_cancel_messages` was terminated due to the tx_cancel
|
||||
// being dropped. this is not worthy of an error, and this task can only return `Err`,
|
||||
// so let's wait forever instead.
|
||||
std::future::pending().await
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -1,16 +1,17 @@
|
||||
use std::convert::Infallible;
|
||||
use std::net::{IpAddr, SocketAddr};
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{Context, anyhow};
|
||||
use ipnet::{IpNet, Ipv4Net, Ipv6Net};
|
||||
use postgres_client::CancelToken;
|
||||
use postgres_client::tls::MakeTlsConnect;
|
||||
use pq_proto::CancelKeyData;
|
||||
use redis::{FromRedisValue, Pipeline, Value, pipe};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use thiserror::Error;
|
||||
use tokio::net::TcpStream;
|
||||
use tokio::sync::{mpsc, oneshot};
|
||||
use tracing::{debug, info};
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
use crate::auth::backend::ComputeUserInfo;
|
||||
use crate::auth::{AuthError, check_peer_addr_is_in_list};
|
||||
@@ -30,6 +31,7 @@ type IpSubnetKey = IpNet;
|
||||
|
||||
const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time
|
||||
const REDIS_SEND_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(10);
|
||||
const BATCH_SIZE: usize = 8;
|
||||
|
||||
// Message types for sending through mpsc channel
|
||||
pub enum CancelKeyOp {
|
||||
@@ -54,78 +56,168 @@ pub enum CancelKeyOp {
|
||||
},
|
||||
}
|
||||
|
||||
impl CancelKeyOp {
|
||||
fn register(self, pipe: &mut Pipeline) -> Option<CancelReplyOp> {
|
||||
#[allow(clippy::used_underscore_binding)]
|
||||
match self {
|
||||
CancelKeyOp::StoreCancelKey {
|
||||
key,
|
||||
field,
|
||||
value,
|
||||
resp_tx,
|
||||
_guard,
|
||||
expire,
|
||||
} => {
|
||||
pipe.hset(&key, field, value);
|
||||
pipe.expire(key, expire);
|
||||
let resp_tx = resp_tx?;
|
||||
Some(CancelReplyOp::StoreCancelKey { resp_tx, _guard })
|
||||
}
|
||||
CancelKeyOp::GetCancelData {
|
||||
key,
|
||||
resp_tx,
|
||||
_guard,
|
||||
} => {
|
||||
pipe.hgetall(key);
|
||||
Some(CancelReplyOp::GetCancelData { resp_tx, _guard })
|
||||
}
|
||||
CancelKeyOp::RemoveCancelKey {
|
||||
key,
|
||||
field,
|
||||
resp_tx,
|
||||
_guard,
|
||||
} => {
|
||||
pipe.hdel(key, field);
|
||||
let resp_tx = resp_tx?;
|
||||
Some(CancelReplyOp::RemoveCancelKey { resp_tx, _guard })
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Message types for sending through mpsc channel
|
||||
pub enum CancelReplyOp {
|
||||
StoreCancelKey {
|
||||
resp_tx: oneshot::Sender<anyhow::Result<()>>,
|
||||
_guard: CancelChannelSizeGuard<'static>,
|
||||
},
|
||||
GetCancelData {
|
||||
resp_tx: oneshot::Sender<anyhow::Result<Vec<(String, String)>>>,
|
||||
_guard: CancelChannelSizeGuard<'static>,
|
||||
},
|
||||
RemoveCancelKey {
|
||||
resp_tx: oneshot::Sender<anyhow::Result<()>>,
|
||||
_guard: CancelChannelSizeGuard<'static>,
|
||||
},
|
||||
}
|
||||
|
||||
impl CancelReplyOp {
|
||||
fn send_err(self, e: anyhow::Error) {
|
||||
match self {
|
||||
CancelReplyOp::StoreCancelKey { resp_tx, _guard } => {
|
||||
resp_tx
|
||||
.send(Err(e))
|
||||
.inspect_err(|_| tracing::debug!("could not send reply"))
|
||||
.ok();
|
||||
}
|
||||
CancelReplyOp::GetCancelData { resp_tx, _guard } => {
|
||||
resp_tx
|
||||
.send(Err(e))
|
||||
.inspect_err(|_| tracing::debug!("could not send reply"))
|
||||
.ok();
|
||||
}
|
||||
CancelReplyOp::RemoveCancelKey { resp_tx, _guard } => {
|
||||
resp_tx
|
||||
.send(Err(e))
|
||||
.inspect_err(|_| tracing::debug!("could not send reply"))
|
||||
.ok();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn send_value(self, v: redis::Value) {
|
||||
match self {
|
||||
CancelReplyOp::StoreCancelKey { resp_tx, _guard } => {
|
||||
let send =
|
||||
FromRedisValue::from_owned_redis_value(v).context("could not parse value");
|
||||
resp_tx
|
||||
.send(send)
|
||||
.inspect_err(|_| tracing::debug!("could not send reply"))
|
||||
.ok();
|
||||
}
|
||||
CancelReplyOp::GetCancelData { resp_tx, _guard } => {
|
||||
let send =
|
||||
FromRedisValue::from_owned_redis_value(v).context("could not parse value");
|
||||
resp_tx
|
||||
.send(send)
|
||||
.inspect_err(|_| tracing::debug!("could not send reply"))
|
||||
.ok();
|
||||
}
|
||||
CancelReplyOp::RemoveCancelKey { resp_tx, _guard } => {
|
||||
let send =
|
||||
FromRedisValue::from_owned_redis_value(v).context("could not parse value");
|
||||
resp_tx
|
||||
.send(send)
|
||||
.inspect_err(|_| tracing::debug!("could not send reply"))
|
||||
.ok();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Running as a separate task to accept messages through the rx channel
|
||||
// In case of problems with RTT: switch to recv_many() + redis pipeline
|
||||
pub async fn handle_cancel_messages(
|
||||
client: &mut RedisKVClient,
|
||||
mut rx: mpsc::Receiver<CancelKeyOp>,
|
||||
) -> anyhow::Result<Infallible> {
|
||||
) -> anyhow::Result<()> {
|
||||
let mut batch = Vec::new();
|
||||
let mut replies = vec![];
|
||||
|
||||
loop {
|
||||
if let Some(msg) = rx.recv().await {
|
||||
match msg {
|
||||
CancelKeyOp::StoreCancelKey {
|
||||
key,
|
||||
field,
|
||||
value,
|
||||
resp_tx,
|
||||
_guard,
|
||||
expire,
|
||||
} => {
|
||||
let res = client.hset(&key, field, value).await;
|
||||
if let Some(resp_tx) = resp_tx {
|
||||
if res.is_ok() {
|
||||
resp_tx
|
||||
.send(client.expire(key, expire).await)
|
||||
.inspect_err(|e| {
|
||||
tracing::debug!(
|
||||
"failed to send StoreCancelKey response: {:?}",
|
||||
e
|
||||
);
|
||||
})
|
||||
.ok();
|
||||
} else {
|
||||
resp_tx
|
||||
.send(res)
|
||||
.inspect_err(|e| {
|
||||
tracing::debug!(
|
||||
"failed to send StoreCancelKey response: {:?}",
|
||||
e
|
||||
);
|
||||
})
|
||||
.ok();
|
||||
}
|
||||
} else if res.is_ok() {
|
||||
drop(client.expire(key, expire).await);
|
||||
} else {
|
||||
tracing::warn!("failed to store cancel key: {:?}", res);
|
||||
}
|
||||
if rx.recv_many(&mut batch, BATCH_SIZE).await == 0 {
|
||||
warn!("shutting down cancellation queue");
|
||||
break Ok(());
|
||||
}
|
||||
|
||||
let batch_size = batch.len();
|
||||
debug!(batch_size, "running cancellation jobs");
|
||||
|
||||
let mut pipe = pipe();
|
||||
for msg in batch.drain(..) {
|
||||
if let Some(reply) = msg.register(&mut pipe) {
|
||||
replies.push(reply);
|
||||
} else {
|
||||
pipe.ignore();
|
||||
}
|
||||
}
|
||||
|
||||
let responses = replies.len();
|
||||
|
||||
match client.query(pipe).await {
|
||||
// for each reply, we expect that many values.
|
||||
Ok(Value::Array(values)) if values.len() == responses => {
|
||||
debug!(
|
||||
batch_size,
|
||||
responses, "successfully completed cancellation jobs",
|
||||
);
|
||||
for (value, reply) in std::iter::zip(values, replies.drain(..)) {
|
||||
reply.send_value(value);
|
||||
}
|
||||
CancelKeyOp::GetCancelData {
|
||||
key,
|
||||
resp_tx,
|
||||
_guard,
|
||||
} => {
|
||||
drop(resp_tx.send(client.hget_all(key).await));
|
||||
}
|
||||
Ok(value) => {
|
||||
debug!(?value, "unexpected redis return value");
|
||||
for reply in replies.drain(..) {
|
||||
reply.send_err(anyhow!("incorrect response type from redis"));
|
||||
}
|
||||
CancelKeyOp::RemoveCancelKey {
|
||||
key,
|
||||
field,
|
||||
resp_tx,
|
||||
_guard,
|
||||
} => {
|
||||
if let Some(resp_tx) = resp_tx {
|
||||
resp_tx
|
||||
.send(client.hdel(key, field).await)
|
||||
.inspect_err(|e| {
|
||||
tracing::debug!("failed to send StoreCancelKey response: {:?}", e);
|
||||
})
|
||||
.ok();
|
||||
} else {
|
||||
drop(client.hdel(key, field).await);
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
for reply in replies.drain(..) {
|
||||
reply.send_err(anyhow!("could not send cmd to redis: {err}"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
replies.clear();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use redis::{AsyncCommands, ToRedisArgs};
|
||||
use redis::aio::ConnectionLike;
|
||||
use redis::{Cmd, FromRedisValue, Pipeline, RedisResult};
|
||||
|
||||
use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
|
||||
use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo};
|
||||
@@ -8,6 +9,23 @@ pub struct RedisKVClient {
|
||||
limiter: GlobalRateLimiter,
|
||||
}
|
||||
|
||||
#[allow(async_fn_in_trait)]
|
||||
pub trait Queryable {
|
||||
async fn query<T: FromRedisValue>(&self, conn: &mut impl ConnectionLike) -> RedisResult<T>;
|
||||
}
|
||||
|
||||
impl Queryable for Pipeline {
|
||||
async fn query<T: FromRedisValue>(&self, conn: &mut impl ConnectionLike) -> RedisResult<T> {
|
||||
self.query_async(conn).await
|
||||
}
|
||||
}
|
||||
|
||||
impl Queryable for Cmd {
|
||||
async fn query<T: FromRedisValue>(&self, conn: &mut impl ConnectionLike) -> RedisResult<T> {
|
||||
self.query_async(conn).await
|
||||
}
|
||||
}
|
||||
|
||||
impl RedisKVClient {
|
||||
pub fn new(client: ConnectionWithCredentialsProvider, info: &'static [RateBucketInfo]) -> Self {
|
||||
Self {
|
||||
@@ -27,158 +45,24 @@ impl RedisKVClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn hset<K, F, V>(&mut self, key: K, field: F, value: V) -> anyhow::Result<()>
|
||||
where
|
||||
K: ToRedisArgs + Send + Sync,
|
||||
F: ToRedisArgs + Send + Sync,
|
||||
V: ToRedisArgs + Send + Sync,
|
||||
{
|
||||
if !self.limiter.check() {
|
||||
tracing::info!("Rate limit exceeded. Skipping hset");
|
||||
return Err(anyhow::anyhow!("Rate limit exceeded"));
|
||||
}
|
||||
|
||||
match self.client.hset(&key, &field, &value).await {
|
||||
Ok(()) => return Ok(()),
|
||||
Err(e) => {
|
||||
tracing::error!("failed to set a key-value pair: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!("Redis client is disconnected. Reconnectiong...");
|
||||
self.try_connect().await?;
|
||||
self.client
|
||||
.hset(key, field, value)
|
||||
.await
|
||||
.map_err(anyhow::Error::new)
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn hset_multiple<K, V>(
|
||||
pub(crate) async fn query<T: FromRedisValue>(
|
||||
&mut self,
|
||||
key: &str,
|
||||
items: &[(K, V)],
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
K: ToRedisArgs + Send + Sync,
|
||||
V: ToRedisArgs + Send + Sync,
|
||||
{
|
||||
q: impl Queryable,
|
||||
) -> anyhow::Result<T> {
|
||||
if !self.limiter.check() {
|
||||
tracing::info!("Rate limit exceeded. Skipping hset_multiple");
|
||||
tracing::info!("Rate limit exceeded. Skipping query");
|
||||
return Err(anyhow::anyhow!("Rate limit exceeded"));
|
||||
}
|
||||
|
||||
match self.client.hset_multiple(key, items).await {
|
||||
Ok(()) => return Ok(()),
|
||||
match q.query(&mut self.client).await {
|
||||
Ok(t) => return Ok(t),
|
||||
Err(e) => {
|
||||
tracing::error!("failed to set a key-value pair: {e}");
|
||||
tracing::error!("failed to run query: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!("Redis client is disconnected. Reconnectiong...");
|
||||
tracing::info!("Redis client is disconnected. Reconnecting...");
|
||||
self.try_connect().await?;
|
||||
self.client
|
||||
.hset_multiple(key, items)
|
||||
.await
|
||||
.map_err(anyhow::Error::new)
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn expire<K>(&mut self, key: K, seconds: i64) -> anyhow::Result<()>
|
||||
where
|
||||
K: ToRedisArgs + Send + Sync,
|
||||
{
|
||||
if !self.limiter.check() {
|
||||
tracing::info!("Rate limit exceeded. Skipping expire");
|
||||
return Err(anyhow::anyhow!("Rate limit exceeded"));
|
||||
}
|
||||
|
||||
match self.client.expire(&key, seconds).await {
|
||||
Ok(()) => return Ok(()),
|
||||
Err(e) => {
|
||||
tracing::error!("failed to set a key-value pair: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!("Redis client is disconnected. Reconnectiong...");
|
||||
self.try_connect().await?;
|
||||
self.client
|
||||
.expire(key, seconds)
|
||||
.await
|
||||
.map_err(anyhow::Error::new)
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn hget<K, F, V>(&mut self, key: K, field: F) -> anyhow::Result<V>
|
||||
where
|
||||
K: ToRedisArgs + Send + Sync,
|
||||
F: ToRedisArgs + Send + Sync,
|
||||
V: redis::FromRedisValue,
|
||||
{
|
||||
if !self.limiter.check() {
|
||||
tracing::info!("Rate limit exceeded. Skipping hget");
|
||||
return Err(anyhow::anyhow!("Rate limit exceeded"));
|
||||
}
|
||||
|
||||
match self.client.hget(&key, &field).await {
|
||||
Ok(value) => return Ok(value),
|
||||
Err(e) => {
|
||||
tracing::error!("failed to get a value: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!("Redis client is disconnected. Reconnectiong...");
|
||||
self.try_connect().await?;
|
||||
self.client
|
||||
.hget(key, field)
|
||||
.await
|
||||
.map_err(anyhow::Error::new)
|
||||
}
|
||||
|
||||
pub(crate) async fn hget_all<K, V>(&mut self, key: K) -> anyhow::Result<V>
|
||||
where
|
||||
K: ToRedisArgs + Send + Sync,
|
||||
V: redis::FromRedisValue,
|
||||
{
|
||||
if !self.limiter.check() {
|
||||
tracing::info!("Rate limit exceeded. Skipping hgetall");
|
||||
return Err(anyhow::anyhow!("Rate limit exceeded"));
|
||||
}
|
||||
|
||||
match self.client.hgetall(&key).await {
|
||||
Ok(value) => return Ok(value),
|
||||
Err(e) => {
|
||||
tracing::error!("failed to get a value: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!("Redis client is disconnected. Reconnectiong...");
|
||||
self.try_connect().await?;
|
||||
self.client.hgetall(key).await.map_err(anyhow::Error::new)
|
||||
}
|
||||
|
||||
pub(crate) async fn hdel<K, F>(&mut self, key: K, field: F) -> anyhow::Result<()>
|
||||
where
|
||||
K: ToRedisArgs + Send + Sync,
|
||||
F: ToRedisArgs + Send + Sync,
|
||||
{
|
||||
if !self.limiter.check() {
|
||||
tracing::info!("Rate limit exceeded. Skipping hdel");
|
||||
return Err(anyhow::anyhow!("Rate limit exceeded"));
|
||||
}
|
||||
|
||||
match self.client.hdel(&key, &field).await {
|
||||
Ok(()) => return Ok(()),
|
||||
Err(e) => {
|
||||
tracing::error!("failed to delete a key-value pair: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!("Redis client is disconnected. Reconnectiong...");
|
||||
self.try_connect().await?;
|
||||
self.client
|
||||
.hdel(key, field)
|
||||
.await
|
||||
.map_err(anyhow::Error::new)
|
||||
Ok(q.query(&mut self.client).await?)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,6 +31,7 @@ pub async fn task_main_https(
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
) -> anyhow::Result<()> {
|
||||
let cert_resolver = ReloadingCertificateResolver::new(
|
||||
"main",
|
||||
&conf.ssl_key_file,
|
||||
&conf.ssl_cert_file,
|
||||
conf.ssl_cert_reload_period,
|
||||
|
||||
@@ -629,15 +629,13 @@ impl ComputeHook {
|
||||
};
|
||||
|
||||
let result = if !self.config.use_local_compute_notifications {
|
||||
let compute_hook_url = if let Some(control_plane_url) = &self.config.control_plane_url {
|
||||
Some(if control_plane_url.ends_with('/') {
|
||||
format!("{control_plane_url}notify-attach")
|
||||
} else {
|
||||
format!("{control_plane_url}/notify-attach")
|
||||
})
|
||||
} else {
|
||||
self.config.compute_hook_url.clone()
|
||||
};
|
||||
let compute_hook_url =
|
||||
self.config
|
||||
.control_plane_url
|
||||
.as_ref()
|
||||
.map(|control_plane_url| {
|
||||
format!("{}/notify-attach", control_plane_url.trim_end_matches('/'))
|
||||
});
|
||||
|
||||
// We validate this at startup
|
||||
let notify_url = compute_hook_url.as_ref().unwrap();
|
||||
|
||||
@@ -22,6 +22,7 @@ use pageserver_api::controller_api::{
|
||||
MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
|
||||
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, SafekeeperSchedulingPolicyRequest,
|
||||
ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest,
|
||||
TimelineImportRequest,
|
||||
};
|
||||
use pageserver_api::models::{
|
||||
DetachBehavior, LsnLeaseRequest, TenantConfigPatchRequest, TenantConfigRequest,
|
||||
@@ -1235,8 +1236,18 @@ async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError
|
||||
ForwardOutcome::NotForwarded(req) => req,
|
||||
};
|
||||
|
||||
let state = get_state(&req);
|
||||
json_response(StatusCode::OK, state.service.step_down().await)
|
||||
// Spawn a background task: once we start stepping down, we must finish: if the client drops
|
||||
// their request we should avoid stopping in some part-stepped-down state.
|
||||
let handle = tokio::spawn(async move {
|
||||
let state = get_state(&req);
|
||||
state.service.step_down().await
|
||||
});
|
||||
|
||||
let result = handle
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
json_response(StatusCode::OK, result)
|
||||
}
|
||||
|
||||
async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
@@ -1276,6 +1287,37 @@ async fn handle_tenant_import(req: Request<Body>) -> Result<Response<Body>, ApiE
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_timeline_import(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
maybe_rate_limit(&req, tenant_id).await;
|
||||
|
||||
let mut req = match maybe_forward(req).await {
|
||||
ForwardOutcome::Forwarded(res) => {
|
||||
return res;
|
||||
}
|
||||
ForwardOutcome::NotForwarded(req) => req,
|
||||
};
|
||||
|
||||
let import_req = json_request::<TimelineImportRequest>(&mut req).await?;
|
||||
|
||||
let state = get_state(&req);
|
||||
|
||||
if import_req.tenant_id != tenant_id || import_req.timeline_id != timeline_id {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"tenant id or timeline id mismatch: url={tenant_id}/{timeline_id}, body={}/{}",
|
||||
import_req.tenant_id,
|
||||
import_req.timeline_id
|
||||
)));
|
||||
}
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state.service.timeline_import(import_req).await?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
@@ -1949,6 +1991,16 @@ pub fn make_router(
|
||||
RequestName("debug_v1_tenant_locate"),
|
||||
)
|
||||
})
|
||||
.post(
|
||||
"/debug/v1/tenant/:tenant_id/timeline/:timeline_id/import",
|
||||
|r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_timeline_import,
|
||||
RequestName("debug_v1_timeline_import"),
|
||||
)
|
||||
},
|
||||
)
|
||||
.get("/debug/v1/scheduler", |r| {
|
||||
named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler"))
|
||||
})
|
||||
|
||||
@@ -86,10 +86,6 @@ struct Cli {
|
||||
#[arg(long)]
|
||||
peer_jwt_token: Option<String>,
|
||||
|
||||
/// URL to control plane compute notification endpoint
|
||||
#[arg(long)]
|
||||
compute_hook_url: Option<String>,
|
||||
|
||||
/// URL to control plane storage API prefix
|
||||
#[arg(long)]
|
||||
control_plane_url: Option<String>,
|
||||
@@ -360,13 +356,11 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
"Insecure config! One or more secrets is not set. This is only permitted in `--dev` mode"
|
||||
);
|
||||
}
|
||||
StrictMode::Strict
|
||||
if args.compute_hook_url.is_none() && args.control_plane_url.is_none() =>
|
||||
{
|
||||
StrictMode::Strict if args.control_plane_url.is_none() => {
|
||||
// Production systems should always have a control plane URL set, to prevent falling
|
||||
// back to trying to use neon_local.
|
||||
anyhow::bail!(
|
||||
"neither `--compute-hook-url` nor `--control-plane-url` are set: this is only permitted in `--dev` mode"
|
||||
"`--control-plane-url` is not set: this is only permitted in `--dev` mode"
|
||||
);
|
||||
}
|
||||
StrictMode::Strict if args.use_local_compute_notifications => {
|
||||
@@ -394,7 +388,6 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
safekeeper_jwt_token: secrets.safekeeper_jwt_token,
|
||||
control_plane_jwt_token: secrets.control_plane_jwt_token,
|
||||
peer_jwt_token: secrets.peer_jwt_token,
|
||||
compute_hook_url: args.compute_hook_url,
|
||||
control_plane_url: args.control_plane_url,
|
||||
max_offline_interval: args
|
||||
.max_offline_interval
|
||||
@@ -472,6 +465,7 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
let https_listener = tcp_listener::bind(https_addr)?;
|
||||
|
||||
let resolver = ReloadingCertificateResolver::new(
|
||||
"main",
|
||||
&args.ssl_key_file,
|
||||
&args.ssl_cert_file,
|
||||
*args.ssl_cert_reload_period,
|
||||
|
||||
@@ -61,7 +61,7 @@ use utils::completion::Barrier;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::sync::gate::Gate;
|
||||
use utils::sync::gate::{Gate, GateGuard};
|
||||
use utils::{failpoint_support, pausable_failpoint};
|
||||
|
||||
use crate::background_node_operations::{
|
||||
@@ -357,18 +357,10 @@ pub struct Config {
|
||||
// This JWT token will be used to authenticate with other storage controller instances
|
||||
pub peer_jwt_token: Option<String>,
|
||||
|
||||
/// Where the compute hook should send notifications of pageserver attachment locations
|
||||
/// (this URL points to the control plane in prod). If this is None, the compute hook will
|
||||
/// assume it is running in a test environment and try to update neon_local.
|
||||
pub compute_hook_url: Option<String>,
|
||||
|
||||
/// Prefix for storage API endpoints of the control plane. We use this prefix to compute
|
||||
/// URLs that we use to send pageserver and safekeeper attachment locations.
|
||||
/// If this is None, the compute hook will assume it is running in a test environment
|
||||
/// and try to invoke neon_local instead.
|
||||
///
|
||||
/// For now, there is also `compute_hook_url` which allows configuration of the pageserver
|
||||
/// specific endpoint, but it is in the process of being phased out.
|
||||
pub control_plane_url: Option<String>,
|
||||
|
||||
/// Grace period within which a pageserver does not respond to heartbeats, but is still
|
||||
@@ -594,6 +586,8 @@ struct TenantShardSplitAbort {
|
||||
new_stripe_size: Option<ShardStripeSize>,
|
||||
/// Until this abort op is complete, no other operations may be done on the tenant
|
||||
_tenant_lock: TracingExclusiveGuard<TenantOperations>,
|
||||
/// The reconciler gate for the duration of the split operation, and any included abort.
|
||||
_gate: GateGuard,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
@@ -1460,7 +1454,7 @@ impl Service {
|
||||
// Retry until shutdown: we must keep this request object alive until it is properly
|
||||
// processed, as it holds a lock guard that prevents other operations trying to do things
|
||||
// to the tenant while it is in a weird part-split state.
|
||||
while !self.cancel.is_cancelled() {
|
||||
while !self.reconcilers_cancel.is_cancelled() {
|
||||
match self.abort_tenant_shard_split(&op).await {
|
||||
Ok(_) => break,
|
||||
Err(e) => {
|
||||
@@ -1473,9 +1467,12 @@ impl Service {
|
||||
// when we retry, so that the abort op will succeed. If the abort op is failing
|
||||
// for some other reason, we will keep retrying forever, or until a human notices
|
||||
// and does something about it (either fixing a pageserver or restarting the controller).
|
||||
tokio::time::timeout(Duration::from_secs(5), self.cancel.cancelled())
|
||||
.await
|
||||
.ok();
|
||||
tokio::time::timeout(
|
||||
Duration::from_secs(5),
|
||||
self.reconcilers_cancel.cancelled(),
|
||||
)
|
||||
.await
|
||||
.ok();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1847,6 +1844,7 @@ impl Service {
|
||||
};
|
||||
|
||||
if insert {
|
||||
let config = attach_req.config.clone().unwrap_or_default();
|
||||
let tsp = TenantShardPersistence {
|
||||
tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(),
|
||||
shard_number: attach_req.tenant_shard_id.shard_number.0 as i32,
|
||||
@@ -1855,7 +1853,7 @@ impl Service {
|
||||
generation: attach_req.generation_override.or(Some(0)),
|
||||
generation_pageserver: None,
|
||||
placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(),
|
||||
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
|
||||
config: serde_json::to_string(&config).unwrap(),
|
||||
splitting: SplitState::default(),
|
||||
scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
|
||||
.unwrap(),
|
||||
@@ -1878,16 +1876,16 @@ impl Service {
|
||||
Ok(()) => {
|
||||
tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id);
|
||||
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
locked.tenants.insert(
|
||||
let mut shard = TenantShard::new(
|
||||
attach_req.tenant_shard_id,
|
||||
TenantShard::new(
|
||||
attach_req.tenant_shard_id,
|
||||
ShardIdentity::unsharded(),
|
||||
PlacementPolicy::Attached(0),
|
||||
None,
|
||||
),
|
||||
ShardIdentity::unsharded(),
|
||||
PlacementPolicy::Attached(0),
|
||||
None,
|
||||
);
|
||||
shard.config = config;
|
||||
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
locked.tenants.insert(attach_req.tenant_shard_id, shard);
|
||||
tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
|
||||
}
|
||||
}
|
||||
@@ -1972,11 +1970,12 @@ impl Service {
|
||||
.set_attached(scheduler, attach_req.node_id);
|
||||
|
||||
tracing::info!(
|
||||
"attach_hook: tenant {} set generation {:?}, pageserver {}",
|
||||
"attach_hook: tenant {} set generation {:?}, pageserver {}, config {:?}",
|
||||
attach_req.tenant_shard_id,
|
||||
tenant_shard.generation,
|
||||
// TODO: this is an odd number of 0xf's
|
||||
attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
|
||||
attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)),
|
||||
attach_req.config,
|
||||
);
|
||||
|
||||
// Trick the reconciler into not doing anything for this tenant: this helps
|
||||
@@ -4910,7 +4909,7 @@ impl Service {
|
||||
1,
|
||||
10,
|
||||
Duration::from_secs(5),
|
||||
&self.cancel,
|
||||
&self.reconcilers_cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
@@ -5161,6 +5160,11 @@ impl Service {
|
||||
)
|
||||
.await;
|
||||
|
||||
let _gate = self
|
||||
.reconcilers_gate
|
||||
.enter()
|
||||
.map_err(|_| ApiError::ShuttingDown)?;
|
||||
|
||||
let new_shard_count = ShardCount::new(split_req.new_shard_count);
|
||||
let new_stripe_size = split_req.new_stripe_size;
|
||||
|
||||
@@ -5188,6 +5192,7 @@ impl Service {
|
||||
new_shard_count,
|
||||
new_stripe_size,
|
||||
_tenant_lock,
|
||||
_gate,
|
||||
})
|
||||
// Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it.
|
||||
.ok();
|
||||
@@ -5527,7 +5532,10 @@ impl Service {
|
||||
"failpoint".to_string()
|
||||
)));
|
||||
|
||||
failpoint_support::sleep_millis_async!("shard-split-post-remote-sleep", &self.cancel);
|
||||
failpoint_support::sleep_millis_async!(
|
||||
"shard-split-post-remote-sleep",
|
||||
&self.reconcilers_cancel
|
||||
);
|
||||
|
||||
tracing::info!(
|
||||
"Split {} into {}",
|
||||
@@ -5585,7 +5593,7 @@ impl Service {
|
||||
stripe_size,
|
||||
preferred_az: preferred_az_id.as_ref().map(Cow::Borrowed),
|
||||
},
|
||||
&self.cancel,
|
||||
&self.reconcilers_cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
@@ -8670,9 +8678,24 @@ impl Service {
|
||||
failpoint_support::sleep_millis_async!("sleep-on-step-down-handling");
|
||||
|
||||
self.inner.write().unwrap().step_down();
|
||||
// TODO: would it make sense to have a time-out for this?
|
||||
self.stop_reconciliations(StopReconciliationsReason::SteppingDown)
|
||||
.await;
|
||||
|
||||
// Wait for reconciliations to stop, or terminate this process if they
|
||||
// fail to stop in time (this indicates a bug in shutdown)
|
||||
tokio::select! {
|
||||
_ = self.stop_reconciliations(StopReconciliationsReason::SteppingDown) => {
|
||||
tracing::info!("Reconciliations stopped, proceeding with step down");
|
||||
}
|
||||
_ = async {
|
||||
failpoint_support::sleep_millis_async!("step-down-delay-timeout");
|
||||
tokio::time::sleep(Duration::from_secs(10)).await
|
||||
} => {
|
||||
tracing::warn!("Step down timed out while waiting for reconciliation gate, terminating process");
|
||||
|
||||
// The caller may proceed to act as leader when it sees this request fail: reduce the chance
|
||||
// of a split-brain situation by terminating this controller instead of leaving it up in a partially-shut-down state.
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
let mut global_observed = GlobalObservedState::default();
|
||||
let locked = self.inner.read().unwrap();
|
||||
|
||||
@@ -12,13 +12,16 @@ use crate::persistence::{
|
||||
use crate::safekeeper::Safekeeper;
|
||||
use anyhow::Context;
|
||||
use http_utils::error::ApiError;
|
||||
use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy};
|
||||
use pageserver_api::controller_api::{
|
||||
SafekeeperDescribeResponse, SkSchedulingPolicy, TimelineImportRequest,
|
||||
};
|
||||
use pageserver_api::models::{self, SafekeeperInfo, SafekeepersInfo, TimelineInfo};
|
||||
use safekeeper_api::membership::{MemberSet, SafekeeperId};
|
||||
use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::logging::SecretString;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use super::Service;
|
||||
|
||||
@@ -298,6 +301,31 @@ impl Service {
|
||||
timeline_id,
|
||||
})
|
||||
}
|
||||
|
||||
/// Directly insert the timeline into the database without reconciling it with safekeepers.
|
||||
///
|
||||
/// Useful if the timeline already exists on the specified safekeepers,
|
||||
/// but we want to make it storage controller managed.
|
||||
pub(crate) async fn timeline_import(&self, req: TimelineImportRequest) -> Result<(), ApiError> {
|
||||
let persistence = TimelinePersistence {
|
||||
tenant_id: req.tenant_id.to_string(),
|
||||
timeline_id: req.timeline_id.to_string(),
|
||||
start_lsn: Lsn::INVALID.into(),
|
||||
generation: 1,
|
||||
sk_set: req.sk_set.iter().map(|sk_id| sk_id.0 as i64).collect(),
|
||||
new_sk_set: None,
|
||||
cplane_notified_generation: 1,
|
||||
deleted_at: None,
|
||||
};
|
||||
let inserted = self.persistence.insert_timeline(persistence).await?;
|
||||
if inserted {
|
||||
tracing::info!("imported timeline into db");
|
||||
} else {
|
||||
tracing::info!("didn't import timeline into db, as it is already present in db");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Perform timeline deletion on safekeepers. Will return success: we persist the deletion into the reconciler.
|
||||
pub(super) async fn tenant_timeline_delete_safekeepers(
|
||||
self: &Arc<Self>,
|
||||
|
||||
@@ -3,19 +3,35 @@
|
||||
* Create a Neon project on staging.
|
||||
* Grant the superuser privileges to the DB user.
|
||||
* (Optional) create a branch for testing
|
||||
* Configure the endpoint by updating the control-plane database with the following settings:
|
||||
* Add the following settings to the `pg_settings` section of the default endpoint configuration for the project using the admin interface:
|
||||
* `Timeone`: `America/Los_Angeles`
|
||||
* `DateStyle`: `Postgres,MDY`
|
||||
* `compute_query_id`: `off`
|
||||
* Add the following section to the project configuration:
|
||||
```json
|
||||
"preload_libraries": {
|
||||
"use_defaults": false,
|
||||
"enabled_libraries": []
|
||||
}
|
||||
```
|
||||
* Checkout the actual `Neon` sources
|
||||
* Patch the sql and expected files for the specific PostgreSQL version, e.g. for v17:
|
||||
```bash
|
||||
$ cd vendor/postgres-v17
|
||||
$ patch -p1 <../../compute/patches/cloud_regress_pg17.patch
|
||||
```
|
||||
* Set the environment variables (please modify according your configuration):
|
||||
```bash
|
||||
$ export DEFAULT_PG_VERSION=17
|
||||
$ export BUILD_TYPE=release
|
||||
```
|
||||
* Build the Neon binaries see [README.md](../../README.md)
|
||||
* Set the environment variable `BENCHMARK_CONNSTR` to the connection URI of your project.
|
||||
* Set the environment variable `PG_VERSION` to the version of your project.
|
||||
* Update poetry, run
|
||||
```bash
|
||||
$ scripts/pysync
|
||||
```
|
||||
* Run
|
||||
```bash
|
||||
$ pytest -m remote_cluster -k cloud_regress
|
||||
$ scripts/pytest -m remote_cluster -k cloud_regress
|
||||
```
|
||||
@@ -194,6 +194,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
|
||||
counter("pageserver_wait_lsn_started_count"),
|
||||
counter("pageserver_wait_lsn_finished_count"),
|
||||
counter("pageserver_wait_ondemand_download_seconds_sum"),
|
||||
counter("pageserver_page_service_batch_break_reason"),
|
||||
*histogram("pageserver_page_service_batch_size"),
|
||||
*histogram("pageserver_page_service_pagestream_batch_wait_time_seconds"),
|
||||
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
|
||||
|
||||
@@ -14,6 +14,7 @@ import threading
|
||||
import time
|
||||
import uuid
|
||||
from collections import defaultdict
|
||||
from collections.abc import Mapping
|
||||
from contextlib import closing, contextmanager
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
@@ -946,6 +947,8 @@ class NeonEnvBuilder:
|
||||
continue
|
||||
if SMALL_DB_FILE_NAME_REGEX.fullmatch(test_file.name):
|
||||
continue
|
||||
if FINAL_METRICS_FILE_NAME == test_file.name:
|
||||
continue
|
||||
log.debug(f"Removing large database {test_file} file")
|
||||
test_file.unlink()
|
||||
elif test_entry.is_dir():
|
||||
@@ -1254,6 +1257,7 @@ class NeonEnv:
|
||||
"mode": "pipelined",
|
||||
"execution": "concurrent-futures",
|
||||
"max_batch_size": 32,
|
||||
"batching": "scattered-lsn",
|
||||
}
|
||||
|
||||
get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io
|
||||
@@ -1455,6 +1459,12 @@ class NeonEnv:
|
||||
except Exception as e:
|
||||
metric_errors.append(e)
|
||||
log.error(f"metric validation failed on {pageserver.id}: {e}")
|
||||
|
||||
try:
|
||||
pageserver.snapshot_final_metrics()
|
||||
except Exception as e:
|
||||
log.error(f"metric snapshot failed on {pageserver.id}: {e}")
|
||||
|
||||
try:
|
||||
pageserver.stop(immediate=immediate)
|
||||
except RuntimeError:
|
||||
@@ -1985,10 +1995,13 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
tenant_shard_id: TenantId | TenantShardId,
|
||||
pageserver_id: int,
|
||||
generation_override: int | None = None,
|
||||
config: None | dict[str, Any] = None,
|
||||
) -> int:
|
||||
body = {"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id}
|
||||
if generation_override is not None:
|
||||
body["generation_override"] = generation_override
|
||||
if config is not None:
|
||||
body["config"] = config
|
||||
|
||||
response = self.request(
|
||||
"POST",
|
||||
@@ -2883,13 +2896,14 @@ class NeonPageserver(PgProtocol, LogUtils):
|
||||
self,
|
||||
immediate: bool = False,
|
||||
timeout_in_seconds: int | None = None,
|
||||
extra_env_vars: dict[str, str] | None = None,
|
||||
):
|
||||
"""
|
||||
High level wrapper for restart: restarts the process, and waits for
|
||||
tenant state to stabilize.
|
||||
"""
|
||||
self.stop(immediate=immediate)
|
||||
self.start(timeout_in_seconds=timeout_in_seconds)
|
||||
self.start(timeout_in_seconds=timeout_in_seconds, extra_env_vars=extra_env_vars)
|
||||
self.quiesce_tenants()
|
||||
|
||||
def quiesce_tenants(self):
|
||||
@@ -2966,6 +2980,20 @@ class NeonPageserver(PgProtocol, LogUtils):
|
||||
value = self.http_client().get_metric_value(metric)
|
||||
assert value == 0, f"Nonzero {metric} == {value}"
|
||||
|
||||
def snapshot_final_metrics(self):
|
||||
"""
|
||||
Take a snapshot of this pageserver's metrics and stash in its work directory.
|
||||
"""
|
||||
if not self.running:
|
||||
log.info(f"Skipping metrics snapshot on pageserver {self.id}, it is not running")
|
||||
return
|
||||
|
||||
metrics = self.http_client().get_metrics_str()
|
||||
metrics_snapshot_path = self.workdir / FINAL_METRICS_FILE_NAME
|
||||
|
||||
with open(metrics_snapshot_path, "w") as f:
|
||||
f.write(metrics)
|
||||
|
||||
def tenant_attach(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
@@ -2978,11 +3006,12 @@ class NeonPageserver(PgProtocol, LogUtils):
|
||||
to call into the pageserver HTTP client.
|
||||
"""
|
||||
client = self.http_client()
|
||||
if generation is None:
|
||||
generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
|
||||
elif override_storage_controller_generation:
|
||||
if generation is None or override_storage_controller_generation:
|
||||
generation = self.env.storage_controller.attach_hook_issue(
|
||||
tenant_id, self.id, generation
|
||||
tenant_id,
|
||||
self.id,
|
||||
generation_override=generation if override_storage_controller_generation else None,
|
||||
config=config,
|
||||
)
|
||||
return client.tenant_attach(
|
||||
tenant_id,
|
||||
@@ -4297,31 +4326,32 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
def respec_deep(self, **kwargs: Any) -> None:
|
||||
"""
|
||||
Update the endpoint.json file taking into account nested keys.
|
||||
It does one level deep update. Should enough for most cases.
|
||||
Distinct method from respec() to do not break existing functionality.
|
||||
NOTE: This method also updates the spec.json file, not endpoint.json.
|
||||
We need it because neon_local also writes to spec.json, so intended
|
||||
NOTE: This method also updates the config.json file, not endpoint.json.
|
||||
We need it because neon_local also writes to config.json, so intended
|
||||
use-case is i) start endpoint with some config, ii) respec_deep(),
|
||||
iii) call reconfigure() to apply the changes.
|
||||
"""
|
||||
config_path = os.path.join(self.endpoint_path(), "spec.json")
|
||||
with open(config_path) as f:
|
||||
data_dict: dict[str, Any] = json.load(f)
|
||||
|
||||
log.debug("Current compute spec: %s", json.dumps(data_dict, indent=4))
|
||||
|
||||
for key, value in kwargs.items():
|
||||
if isinstance(value, dict):
|
||||
if key not in data_dict:
|
||||
data_dict[key] = value
|
||||
def update(curr, patch):
|
||||
for k, v in patch.items():
|
||||
if isinstance(v, Mapping):
|
||||
curr[k] = update(curr.get(k, {}), v)
|
||||
else:
|
||||
data_dict[key] = {**data_dict[key], **value}
|
||||
else:
|
||||
data_dict[key] = value
|
||||
curr[k] = v
|
||||
return curr
|
||||
|
||||
config_path = os.path.join(self.endpoint_path(), "config.json")
|
||||
with open(config_path) as f:
|
||||
config: dict[str, Any] = json.load(f)
|
||||
|
||||
log.debug("Current compute config: %s", json.dumps(config, indent=4))
|
||||
|
||||
update(config, kwargs)
|
||||
|
||||
with open(config_path, "w") as file:
|
||||
log.debug("Updating compute spec to: %s", json.dumps(data_dict, indent=4))
|
||||
json.dump(data_dict, file, indent=4)
|
||||
log.debug("Updating compute config to: %s", json.dumps(config, indent=4))
|
||||
json.dump(config, file, indent=4)
|
||||
|
||||
def wait_for_migrations(self, wait_for: int = NUM_COMPUTE_MIGRATIONS) -> None:
|
||||
"""
|
||||
@@ -4338,7 +4368,7 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
wait_until(check_migrations_done)
|
||||
|
||||
# Mock the extension part of spec passed from control plane for local testing
|
||||
# endpooint.rs adds content of this file as a part of the spec.json
|
||||
# endpooint.rs adds content of this file as a part of the config.json
|
||||
def create_remote_extension_spec(self, spec: dict[str, Any]):
|
||||
"""Create a remote extension spec file for the endpoint."""
|
||||
remote_extensions_spec_path = os.path.join(
|
||||
@@ -5126,6 +5156,8 @@ SMALL_DB_FILE_NAME_REGEX: re.Pattern[str] = re.compile(
|
||||
r"config-v1|heatmap-v1|tenant-manifest|metadata|.+\.(?:toml|pid|json|sql|conf)"
|
||||
)
|
||||
|
||||
FINAL_METRICS_FILE_NAME: str = "final_metrics.txt"
|
||||
|
||||
|
||||
SKIP_DIRS = frozenset(
|
||||
(
|
||||
|
||||
@@ -65,13 +65,11 @@ def single_timeline(
|
||||
assert ps_http.tenant_list() == []
|
||||
|
||||
def attach(tenant):
|
||||
# NB: create the new tenant in the storage controller with the correct tenant config. This
|
||||
# will pick up the existing tenant data from remote storage. If we just attach it to the
|
||||
# Pageserver, the storage controller will reset the tenant config to the default.
|
||||
env.create_tenant(
|
||||
tenant_id=tenant,
|
||||
timeline_id=template_timeline,
|
||||
conf=template_config,
|
||||
env.pageserver.tenant_attach(
|
||||
tenant,
|
||||
config=template_config,
|
||||
generation=100,
|
||||
override_storage_controller_generation=True,
|
||||
)
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor:
|
||||
|
||||
@@ -199,7 +199,7 @@ def wait_for_last_record_lsn(
|
||||
"""waits for pageserver to catch up to a certain lsn, returns the last observed lsn."""
|
||||
|
||||
current_lsn = Lsn(0)
|
||||
for i in range(1000):
|
||||
for i in range(2000):
|
||||
current_lsn = last_record_lsn(pageserver_http, tenant, timeline)
|
||||
if current_lsn >= lsn:
|
||||
return current_lsn
|
||||
|
||||
@@ -66,11 +66,11 @@ def test_basebackup_with_high_slru_count(
|
||||
|
||||
n_txns = 500000
|
||||
|
||||
def setup_wrapper(env: NeonEnv):
|
||||
return setup_tenant_template(env, n_txns)
|
||||
|
||||
env = setup_pageserver_with_tenants(
|
||||
neon_env_builder, f"large_slru_count-{n_tenants}-{n_txns}", n_tenants, setup_wrapper
|
||||
neon_env_builder,
|
||||
f"large_slru_count-{n_tenants}-{n_txns}",
|
||||
n_tenants,
|
||||
lambda env: setup_tenant_template(env, n_txns),
|
||||
)
|
||||
run_benchmark(env, pg_bin, record, duration)
|
||||
|
||||
@@ -80,10 +80,6 @@ def setup_tenant_template(env: NeonEnv, n_txns: int):
|
||||
"gc_period": "0s", # disable periodic gc
|
||||
"checkpoint_timeout": "10 years",
|
||||
"compaction_period": "0s", # disable periodic compaction
|
||||
"compaction_threshold": 10,
|
||||
"compaction_target_size": 134217728,
|
||||
"checkpoint_distance": 268435456,
|
||||
"image_creation_threshold": 3,
|
||||
}
|
||||
|
||||
template_tenant, template_timeline = env.create_tenant(set_default=True)
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import concurrent.futures
|
||||
import dataclasses
|
||||
import json
|
||||
import threading
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
@@ -28,38 +30,33 @@ class PageServicePipeliningConfigSerial(PageServicePipeliningConfig):
|
||||
class PageServicePipeliningConfigPipelined(PageServicePipeliningConfig):
|
||||
max_batch_size: int
|
||||
execution: str
|
||||
batching: str
|
||||
mode: str = "pipelined"
|
||||
|
||||
|
||||
EXECUTION = ["concurrent-futures", "tasks"]
|
||||
EXECUTION = ["concurrent-futures"]
|
||||
BATCHING = ["uniform-lsn", "scattered-lsn"]
|
||||
|
||||
NON_BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
|
||||
for max_batch_size in [1, 32]:
|
||||
for execution in EXECUTION:
|
||||
NON_BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))
|
||||
for batching in BATCHING:
|
||||
NON_BATCHABLE.append(
|
||||
PageServicePipeliningConfigPipelined(max_batch_size, execution, batching)
|
||||
)
|
||||
|
||||
BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
|
||||
for max_batch_size in [1, 2, 4, 8, 16, 32]:
|
||||
BATCHABLE: list[PageServicePipeliningConfig] = []
|
||||
for max_batch_size in [32]:
|
||||
for execution in EXECUTION:
|
||||
BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))
|
||||
for batching in BATCHING:
|
||||
BATCHABLE.append(
|
||||
PageServicePipeliningConfigPipelined(max_batch_size, execution, batching)
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"tablesize_mib, pipelining_config, target_runtime, effective_io_concurrency, readhead_buffer_size, name",
|
||||
[
|
||||
# non-batchable workloads
|
||||
# (A separate benchmark will consider latency).
|
||||
*[
|
||||
(
|
||||
50,
|
||||
config,
|
||||
TARGET_RUNTIME,
|
||||
1,
|
||||
128,
|
||||
f"not batchable {dataclasses.asdict(config)}",
|
||||
)
|
||||
for config in NON_BATCHABLE
|
||||
],
|
||||
# batchable workloads should show throughput and CPU efficiency improvements
|
||||
*[
|
||||
(
|
||||
@@ -137,7 +134,14 @@ def test_throughput(
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
ps_http = env.pageserver.http_client()
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
endpoint = env.endpoints.create_start(
|
||||
"main",
|
||||
config_lines=[
|
||||
# minimal lfc & small shared buffers to force requests to pageserver
|
||||
"neon.max_file_cache_size=1MB",
|
||||
"shared_buffers=10MB",
|
||||
],
|
||||
)
|
||||
conn = endpoint.connect()
|
||||
cur = conn.cursor()
|
||||
|
||||
@@ -155,7 +159,6 @@ def test_throughput(
|
||||
tablesize = tablesize_mib * 1024 * 1024
|
||||
npages = tablesize // (8 * 1024)
|
||||
cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,))
|
||||
# TODO: can we force postgres to do sequential scans?
|
||||
|
||||
#
|
||||
# Run the workload, collect `Metrics` before and after, calculate difference, normalize.
|
||||
@@ -166,6 +169,7 @@ def test_throughput(
|
||||
time: float
|
||||
pageserver_batch_size_histo_sum: float
|
||||
pageserver_batch_size_histo_count: float
|
||||
pageserver_batch_breaks_reason_count: dict[str, int]
|
||||
compute_getpage_count: float
|
||||
pageserver_cpu_seconds_total: float
|
||||
|
||||
@@ -179,6 +183,10 @@ def test_throughput(
|
||||
compute_getpage_count=self.compute_getpage_count - other.compute_getpage_count,
|
||||
pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total
|
||||
- other.pageserver_cpu_seconds_total,
|
||||
pageserver_batch_breaks_reason_count={
|
||||
reason: count - other.pageserver_batch_breaks_reason_count.get(reason, 0)
|
||||
for reason, count in self.pageserver_batch_breaks_reason_count.items()
|
||||
},
|
||||
)
|
||||
|
||||
def normalize(self, by) -> "Metrics":
|
||||
@@ -188,6 +196,10 @@ def test_throughput(
|
||||
pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count / by,
|
||||
compute_getpage_count=self.compute_getpage_count / by,
|
||||
pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total / by,
|
||||
pageserver_batch_breaks_reason_count={
|
||||
reason: count / by
|
||||
for reason, count in self.pageserver_batch_breaks_reason_count.items()
|
||||
},
|
||||
)
|
||||
|
||||
def get_metrics() -> Metrics:
|
||||
@@ -197,6 +209,20 @@ def test_throughput(
|
||||
)
|
||||
compute_getpage_count = cur.fetchall()[0][0]
|
||||
pageserver_metrics = ps_http.get_metrics()
|
||||
for name, samples in pageserver_metrics.metrics.items():
|
||||
for sample in samples:
|
||||
log.info(f"{name=} labels={sample.labels} {sample.value}")
|
||||
|
||||
raw_batch_break_reason_count = pageserver_metrics.query_all(
|
||||
"pageserver_page_service_batch_break_reason_total",
|
||||
filter={"timeline_id": str(env.initial_timeline)},
|
||||
)
|
||||
|
||||
batch_break_reason_count = {
|
||||
sample.labels["reason"]: int(sample.value)
|
||||
for sample in raw_batch_break_reason_count
|
||||
}
|
||||
|
||||
return Metrics(
|
||||
time=time.time(),
|
||||
pageserver_batch_size_histo_sum=pageserver_metrics.query_one(
|
||||
@@ -205,34 +231,58 @@ def test_throughput(
|
||||
pageserver_batch_size_histo_count=pageserver_metrics.query_one(
|
||||
"pageserver_page_service_batch_size_count"
|
||||
).value,
|
||||
pageserver_batch_breaks_reason_count=batch_break_reason_count,
|
||||
compute_getpage_count=compute_getpage_count,
|
||||
pageserver_cpu_seconds_total=pageserver_metrics.query_one(
|
||||
"libmetrics_process_cpu_seconds_highres"
|
||||
).value,
|
||||
)
|
||||
|
||||
def workload() -> Metrics:
|
||||
def workload(disruptor_started: threading.Event) -> Metrics:
|
||||
disruptor_started.wait()
|
||||
start = time.time()
|
||||
iters = 0
|
||||
while time.time() - start < target_runtime or iters < 2:
|
||||
log.info("Seqscan %d", iters)
|
||||
if iters == 1:
|
||||
# round zero for warming up
|
||||
before = get_metrics()
|
||||
cur.execute(
|
||||
"select clear_buffer_cache()"
|
||||
) # TODO: what about LFC? doesn't matter right now because LFC isn't enabled by default in tests
|
||||
cur.execute("select sum(data::bigint) from t")
|
||||
assert cur.fetchall()[0][0] == npages * (npages + 1) // 2
|
||||
iters += 1
|
||||
after = get_metrics()
|
||||
return (after - before).normalize(iters - 1)
|
||||
|
||||
def disruptor(disruptor_started: threading.Event, stop_disruptor: threading.Event):
|
||||
conn = endpoint.connect()
|
||||
cur = conn.cursor()
|
||||
iters = 0
|
||||
while True:
|
||||
cur.execute("SELECT pg_logical_emit_message(true, 'test', 'advancelsn')")
|
||||
if stop_disruptor.is_set():
|
||||
break
|
||||
disruptor_started.set()
|
||||
iters += 1
|
||||
time.sleep(0.001)
|
||||
return iters
|
||||
|
||||
env.pageserver.patch_config_toml_nonrecursive(
|
||||
{"page_service_pipelining": dataclasses.asdict(pipelining_config)}
|
||||
)
|
||||
env.pageserver.restart()
|
||||
metrics = workload()
|
||||
|
||||
# set trace for log analysis below
|
||||
env.pageserver.restart(extra_env_vars={"RUST_LOG": "info,pageserver::page_service=trace"})
|
||||
|
||||
log.info("Starting workload")
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
disruptor_started = threading.Event()
|
||||
stop_disruptor = threading.Event()
|
||||
disruptor_fut = executor.submit(disruptor, disruptor_started, stop_disruptor)
|
||||
workload_fut = executor.submit(workload, disruptor_started)
|
||||
metrics = workload_fut.result()
|
||||
stop_disruptor.set()
|
||||
ndisruptions = disruptor_fut.result()
|
||||
log.info("Disruptor issued %d disrupting requests", ndisruptions)
|
||||
|
||||
log.info("Results: %s", metrics)
|
||||
|
||||
@@ -249,7 +299,16 @@ def test_throughput(
|
||||
#
|
||||
|
||||
for metric, value in dataclasses.asdict(metrics).items():
|
||||
zenbenchmark.record(f"counters.{metric}", value, unit="", report=MetricReport.TEST_PARAM)
|
||||
if metric == "pageserver_batch_breaks_reason_count":
|
||||
assert isinstance(value, dict)
|
||||
for reason, count in value.items():
|
||||
zenbenchmark.record(
|
||||
f"counters.{metric}_{reason}", count, unit="", report=MetricReport.TEST_PARAM
|
||||
)
|
||||
else:
|
||||
zenbenchmark.record(
|
||||
f"counters.{metric}", value, unit="", report=MetricReport.TEST_PARAM
|
||||
)
|
||||
|
||||
zenbenchmark.record(
|
||||
"perfmetric.batching_factor",
|
||||
@@ -262,7 +321,10 @@ def test_throughput(
|
||||
PRECISION_CONFIGS: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
|
||||
for max_batch_size in [1, 32]:
|
||||
for execution in EXECUTION:
|
||||
PRECISION_CONFIGS.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))
|
||||
for batching in BATCHING:
|
||||
PRECISION_CONFIGS.append(
|
||||
PageServicePipeliningConfigPipelined(max_batch_size, execution, batching)
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
||||
@@ -97,6 +97,7 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int)
|
||||
_record_branch_creation_durations(neon_compare, branch_creation_durations)
|
||||
|
||||
|
||||
@pytest.mark.timeout(1000)
|
||||
@pytest.mark.parametrize("n_branches", [500, 1024])
|
||||
@pytest.mark.parametrize("shape", ["one_ancestor", "random"])
|
||||
def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: str):
|
||||
@@ -205,7 +206,7 @@ def wait_and_record_startup_metrics(
|
||||
assert len(matching) == len(expected_labels)
|
||||
return matching
|
||||
|
||||
samples = wait_until(metrics_are_filled)
|
||||
samples = wait_until(metrics_are_filled, timeout=60)
|
||||
|
||||
for sample in samples:
|
||||
phase = sample.labels["phase"]
|
||||
|
||||
@@ -52,6 +52,8 @@ def test_ingest_insert_bulk(
|
||||
# would compete with Pageserver for bandwidth.
|
||||
# neon_env_builder.enable_safekeeper_remote_storage(s3_storage())
|
||||
|
||||
neon_env_builder.pageserver_config_override = "wait_lsn_timeout='600 s'"
|
||||
|
||||
neon_env_builder.disable_scrub_on_exit() # immediate shutdown may leave stray layers
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
@@ -92,7 +94,18 @@ def test_ingest_insert_bulk(
|
||||
worker_rows = rows / CONCURRENCY
|
||||
pool.submit(insert_rows, endpoint, f"table{i}", worker_rows, value)
|
||||
|
||||
end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
|
||||
for attempt in range(5):
|
||||
try:
|
||||
end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
|
||||
break
|
||||
except Exception as e:
|
||||
# if we disable backpressure, postgres can become unresponsive for longer than a minute
|
||||
# and new connection attempts time out in postgres after 1 minute
|
||||
# so if this happens we retry new connection
|
||||
log.error(f"Attempt {attempt + 1}/5: Failed to select current wal lsn: {e}")
|
||||
if attempt == 4:
|
||||
log.error("Exceeded maximum retry attempts for selecting current wal lsn")
|
||||
raise
|
||||
|
||||
# Wait for pageserver to ingest the WAL.
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
@@ -64,8 +64,8 @@ def test_ro_replica_lag(
|
||||
|
||||
project = neon_api.create_project(pg_version)
|
||||
project_id = project["project"]["id"]
|
||||
log.info("Project ID: {}", project_id)
|
||||
log.info("Primary endpoint ID: {}", project["project"]["endpoints"][0]["id"])
|
||||
log.info("Project ID: %s", project_id)
|
||||
log.info("Primary endpoint ID: %s", project["project"]["endpoints"][0]["id"])
|
||||
neon_api.wait_for_operation_to_finish(project_id)
|
||||
error_occurred = False
|
||||
try:
|
||||
@@ -81,7 +81,7 @@ def test_ro_replica_lag(
|
||||
endpoint_type="read_only",
|
||||
settings={"pg_settings": {"hot_standby_feedback": "on"}},
|
||||
)
|
||||
log.info("Replica endpoint ID: {}", replica["endpoint"]["id"])
|
||||
log.info("Replica endpoint ID: %s", replica["endpoint"]["id"])
|
||||
replica_env = master_env.copy()
|
||||
replica_env["PGHOST"] = replica["endpoint"]["host"]
|
||||
neon_api.wait_for_operation_to_finish(project_id)
|
||||
@@ -197,8 +197,8 @@ def test_replication_start_stop(
|
||||
|
||||
project = neon_api.create_project(pg_version)
|
||||
project_id = project["project"]["id"]
|
||||
log.info("Project ID: {}", project_id)
|
||||
log.info("Primary endpoint ID: {}", project["project"]["endpoints"][0]["id"])
|
||||
log.info("Project ID: %s", project_id)
|
||||
log.info("Primary endpoint ID: %s", project["project"]["endpoints"][0]["id"])
|
||||
neon_api.wait_for_operation_to_finish(project_id)
|
||||
try:
|
||||
branch_id = project["branch"]["id"]
|
||||
@@ -215,7 +215,7 @@ def test_replication_start_stop(
|
||||
endpoint_type="read_only",
|
||||
settings={"pg_settings": {"hot_standby_feedback": "on"}},
|
||||
)
|
||||
log.info("Replica {} endpoint ID: {}", i + 1, replica["endpoint"]["id"])
|
||||
log.info("Replica %d endpoint ID: %s", i + 1, replica["endpoint"]["id"])
|
||||
replicas.append(replica)
|
||||
neon_api.wait_for_operation_to_finish(project_id)
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ from fixtures.neon_fixtures import (
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.timeout(600)
|
||||
@pytest.mark.timeout(1200)
|
||||
@pytest.mark.parametrize("shard_count", [1, 8, 32])
|
||||
@pytest.mark.parametrize(
|
||||
"wal_receiver_protocol",
|
||||
|
||||
@@ -187,6 +187,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
|
||||
},
|
||||
"rel_size_v2_enabled": False, # test suite enables it by default as of https://github.com/neondatabase/neon/issues/11081, so, custom config means disabling it
|
||||
"gc_compaction_enabled": True,
|
||||
"gc_compaction_verification": False,
|
||||
"gc_compaction_initial_threshold_kb": 1024000,
|
||||
"gc_compaction_ratio_percent": 200,
|
||||
"image_creation_preempt_threshold": 5,
|
||||
|
||||
@@ -162,6 +162,8 @@ def test_pageserver_compaction_preempt(
|
||||
conf = PREEMPT_COMPACTION_TENANT_CONF.copy()
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=conf)
|
||||
|
||||
env.pageserver.allowed_errors.append(".*The timeline or pageserver is shutting down.*")
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
|
||||
@@ -90,10 +90,12 @@ def test_compute_catalog(neon_simple_env: NeonEnv):
|
||||
# and reconfigure the endpoint to create some test databases.
|
||||
endpoint.respec_deep(
|
||||
**{
|
||||
"skip_pg_catalog_updates": False,
|
||||
"cluster": {
|
||||
"roles": TEST_ROLE_NAMES,
|
||||
"databases": TEST_DB_NAMES,
|
||||
"spec": {
|
||||
"skip_pg_catalog_updates": False,
|
||||
"cluster": {
|
||||
"roles": TEST_ROLE_NAMES,
|
||||
"databases": TEST_DB_NAMES,
|
||||
},
|
||||
},
|
||||
}
|
||||
)
|
||||
@@ -155,10 +157,12 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
|
||||
# and reconfigure the endpoint to apply the changes.
|
||||
endpoint.respec_deep(
|
||||
**{
|
||||
"skip_pg_catalog_updates": False,
|
||||
"cluster": {
|
||||
"roles": TEST_ROLE_NAMES,
|
||||
"databases": TEST_DB_NAMES,
|
||||
"spec": {
|
||||
"skip_pg_catalog_updates": False,
|
||||
"cluster": {
|
||||
"roles": TEST_ROLE_NAMES,
|
||||
"databases": TEST_DB_NAMES,
|
||||
},
|
||||
},
|
||||
}
|
||||
)
|
||||
@@ -196,12 +200,14 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
|
||||
|
||||
endpoint.respec_deep(
|
||||
**{
|
||||
"skip_pg_catalog_updates": False,
|
||||
"cluster": {
|
||||
"roles": [],
|
||||
"databases": [],
|
||||
"spec": {
|
||||
"skip_pg_catalog_updates": False,
|
||||
"cluster": {
|
||||
"roles": [],
|
||||
"databases": [],
|
||||
},
|
||||
"delta_operations": delta_operations,
|
||||
},
|
||||
"delta_operations": delta_operations,
|
||||
}
|
||||
)
|
||||
endpoint.reconfigure()
|
||||
@@ -250,9 +256,11 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv):
|
||||
# and reconfigure the endpoint to apply the changes.
|
||||
endpoint.respec_deep(
|
||||
**{
|
||||
"skip_pg_catalog_updates": False,
|
||||
"cluster": {
|
||||
"databases": TEST_DB_NAMES,
|
||||
"spec": {
|
||||
"skip_pg_catalog_updates": False,
|
||||
"cluster": {
|
||||
"databases": TEST_DB_NAMES,
|
||||
},
|
||||
},
|
||||
}
|
||||
)
|
||||
@@ -306,17 +314,19 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv):
|
||||
# and reconfigure the endpoint to apply the changes.
|
||||
endpoint.respec_deep(
|
||||
**{
|
||||
"skip_pg_catalog_updates": False,
|
||||
"cluster": {
|
||||
"databases": TEST_DB_NAMES_NEW,
|
||||
"spec": {
|
||||
"skip_pg_catalog_updates": False,
|
||||
"cluster": {
|
||||
"databases": TEST_DB_NAMES_NEW,
|
||||
},
|
||||
"delta_operations": [
|
||||
{"action": "delete_db", "name": SUB_DB_NAME},
|
||||
# also test the case when we try to delete a non-existent database
|
||||
# shouldn't happen in normal operation,
|
||||
# but can occur when failed operations are retried
|
||||
{"action": "delete_db", "name": "nonexistent_db"},
|
||||
],
|
||||
},
|
||||
"delta_operations": [
|
||||
{"action": "delete_db", "name": SUB_DB_NAME},
|
||||
# also test the case when we try to delete a non-existent database
|
||||
# shouldn't happen in normal operation,
|
||||
# but can occur when failed operations are retried
|
||||
{"action": "delete_db", "name": "nonexistent_db"},
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
@@ -354,25 +364,27 @@ def test_drop_role_with_table_privileges_from_neon_superuser(neon_simple_env: Ne
|
||||
|
||||
endpoint.respec_deep(
|
||||
**{
|
||||
"skip_pg_catalog_updates": False,
|
||||
"cluster": {
|
||||
"roles": [
|
||||
{
|
||||
# We need to create role via compute_ctl, because in this case it will receive
|
||||
# additional grants equivalent to our real environment, so we can repro some
|
||||
# issues.
|
||||
"name": "neon",
|
||||
# Some autocomplete-suggested hash, no specific meaning.
|
||||
"encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=",
|
||||
"options": [],
|
||||
},
|
||||
],
|
||||
"databases": [
|
||||
{
|
||||
"name": TEST_DB_NAME,
|
||||
"owner": "neon",
|
||||
},
|
||||
],
|
||||
"spec": {
|
||||
"skip_pg_catalog_updates": False,
|
||||
"cluster": {
|
||||
"roles": [
|
||||
{
|
||||
# We need to create role via compute_ctl, because in this case it will receive
|
||||
# additional grants equivalent to our real environment, so we can repro some
|
||||
# issues.
|
||||
"name": "neon",
|
||||
# Some autocomplete-suggested hash, no specific meaning.
|
||||
"encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=",
|
||||
"options": [],
|
||||
},
|
||||
],
|
||||
"databases": [
|
||||
{
|
||||
"name": TEST_DB_NAME,
|
||||
"owner": "neon",
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
}
|
||||
)
|
||||
@@ -415,13 +427,15 @@ def test_drop_role_with_table_privileges_from_neon_superuser(neon_simple_env: Ne
|
||||
# Drop role via compute_ctl
|
||||
endpoint.respec_deep(
|
||||
**{
|
||||
"skip_pg_catalog_updates": False,
|
||||
"delta_operations": [
|
||||
{
|
||||
"action": "delete_role",
|
||||
"name": TEST_GRANTEE,
|
||||
},
|
||||
],
|
||||
"spec": {
|
||||
"skip_pg_catalog_updates": False,
|
||||
"delta_operations": [
|
||||
{
|
||||
"action": "delete_role",
|
||||
"name": TEST_GRANTEE,
|
||||
},
|
||||
],
|
||||
},
|
||||
}
|
||||
)
|
||||
endpoint.reconfigure()
|
||||
@@ -444,13 +458,15 @@ def test_drop_role_with_table_privileges_from_neon_superuser(neon_simple_env: Ne
|
||||
|
||||
endpoint.respec_deep(
|
||||
**{
|
||||
"skip_pg_catalog_updates": False,
|
||||
"delta_operations": [
|
||||
{
|
||||
"action": "delete_role",
|
||||
"name": "readonly2",
|
||||
},
|
||||
],
|
||||
"spec": {
|
||||
"skip_pg_catalog_updates": False,
|
||||
"delta_operations": [
|
||||
{
|
||||
"action": "delete_role",
|
||||
"name": "readonly2",
|
||||
},
|
||||
],
|
||||
},
|
||||
}
|
||||
)
|
||||
endpoint.reconfigure()
|
||||
@@ -475,25 +491,27 @@ def test_drop_role_with_table_privileges_from_non_neon_superuser(neon_simple_env
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
endpoint.respec_deep(
|
||||
**{
|
||||
"skip_pg_catalog_updates": False,
|
||||
"cluster": {
|
||||
"roles": [
|
||||
{
|
||||
# We need to create role via compute_ctl, because in this case it will receive
|
||||
# additional grants equivalent to our real environment, so we can repro some
|
||||
# issues.
|
||||
"name": TEST_GRANTOR,
|
||||
# Some autocomplete-suggested hash, no specific meaning.
|
||||
"encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=",
|
||||
"options": [],
|
||||
},
|
||||
],
|
||||
"databases": [
|
||||
{
|
||||
"name": TEST_DB_NAME,
|
||||
"owner": TEST_GRANTOR,
|
||||
},
|
||||
],
|
||||
"spec": {
|
||||
"skip_pg_catalog_updates": False,
|
||||
"cluster": {
|
||||
"roles": [
|
||||
{
|
||||
# We need to create role via compute_ctl, because in this case it will receive
|
||||
# additional grants equivalent to our real environment, so we can repro some
|
||||
# issues.
|
||||
"name": TEST_GRANTOR,
|
||||
# Some autocomplete-suggested hash, no specific meaning.
|
||||
"encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=",
|
||||
"options": [],
|
||||
},
|
||||
],
|
||||
"databases": [
|
||||
{
|
||||
"name": TEST_DB_NAME,
|
||||
"owner": TEST_GRANTOR,
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
}
|
||||
)
|
||||
@@ -507,13 +525,15 @@ def test_drop_role_with_table_privileges_from_non_neon_superuser(neon_simple_env
|
||||
|
||||
endpoint.respec_deep(
|
||||
**{
|
||||
"skip_pg_catalog_updates": False,
|
||||
"delta_operations": [
|
||||
{
|
||||
"action": "delete_role",
|
||||
"name": TEST_GRANTEE,
|
||||
},
|
||||
],
|
||||
"spec": {
|
||||
"skip_pg_catalog_updates": False,
|
||||
"delta_operations": [
|
||||
{
|
||||
"action": "delete_role",
|
||||
"name": TEST_GRANTEE,
|
||||
},
|
||||
],
|
||||
},
|
||||
}
|
||||
)
|
||||
endpoint.reconfigure()
|
||||
|
||||
@@ -31,15 +31,17 @@ def test_compute_reconfigure(neon_simple_env: NeonEnv):
|
||||
|
||||
endpoint.respec_deep(
|
||||
**{
|
||||
"skip_pg_catalog_updates": True,
|
||||
"cluster": {
|
||||
"settings": [
|
||||
{
|
||||
"name": "log_line_prefix",
|
||||
"vartype": "string",
|
||||
"value": TEST_LOG_LINE_PREFIX,
|
||||
}
|
||||
]
|
||||
"spec": {
|
||||
"skip_pg_catalog_updates": True,
|
||||
"cluster": {
|
||||
"settings": [
|
||||
{
|
||||
"name": "log_line_prefix",
|
||||
"vartype": "string",
|
||||
"value": TEST_LOG_LINE_PREFIX,
|
||||
}
|
||||
]
|
||||
},
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
@@ -16,6 +16,7 @@ def test_slow_flush(neon_env_builder: NeonEnvBuilder, neon_binpath: Path, kind:
|
||||
"mode": "pipelined",
|
||||
"max_batch_size": 32,
|
||||
"execution": "concurrent-futures",
|
||||
"batching": "uniform-lsn",
|
||||
}
|
||||
|
||||
neon_env_builder.pageserver_config_override = patch_pageserver_toml
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import copy
|
||||
import json
|
||||
import uuid
|
||||
from typing import TYPE_CHECKING
|
||||
@@ -16,7 +15,6 @@ if TYPE_CHECKING:
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
|
||||
|
||||
|
||||
@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/11395")
|
||||
def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
@@ -44,7 +42,6 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
|
||||
"refill_interval": "100ms",
|
||||
"refill_amount": int(rate_limit_rps / 10),
|
||||
"max": int(rate_limit_rps / 10),
|
||||
"fair": True,
|
||||
},
|
||||
},
|
||||
)
|
||||
@@ -98,17 +95,12 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
|
||||
_, marker_offset = wait_until(lambda: env.pageserver.assert_log_contains(marker, offset=None))
|
||||
|
||||
log.info("run pagebench")
|
||||
duration_secs = 10
|
||||
duration_secs = 20
|
||||
actual_ncompleted = run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs)
|
||||
|
||||
log.info("validate the client is capped at the configured rps limit")
|
||||
expect_ncompleted = duration_secs * rate_limit_rps
|
||||
delta_abs = abs(expect_ncompleted - actual_ncompleted)
|
||||
threshold = 0.05 * expect_ncompleted
|
||||
assert threshold / rate_limit_rps < 0.1 * duration_secs, (
|
||||
"test self-test: unrealistic expecations regarding precision in this test"
|
||||
)
|
||||
assert delta_abs < 0.05 * expect_ncompleted, (
|
||||
assert pytest.approx(expect_ncompleted, 0.05) == actual_ncompleted, (
|
||||
"the throttling deviates more than 5percent from the expectation"
|
||||
)
|
||||
|
||||
@@ -122,6 +114,7 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
|
||||
timeout=compaction_period,
|
||||
)
|
||||
|
||||
log.info("validate the metrics")
|
||||
smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query)
|
||||
assert smgr_query_seconds_post is not None
|
||||
throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query)
|
||||
@@ -130,72 +123,13 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
|
||||
actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre
|
||||
actual_throttled_secs = actual_throttled_usecs / 1_000_000
|
||||
|
||||
log.info("validate that the metric doesn't include throttle wait time")
|
||||
assert duration_secs >= 10 * actual_smgr_query_seconds, (
|
||||
"smgr metrics should not include throttle wait time"
|
||||
)
|
||||
|
||||
log.info("validate that the throttling wait time metrics is correct")
|
||||
assert pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs, (
|
||||
"most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates"
|
||||
"throttling and processing latency = total request time; this assert validates thi holds on average"
|
||||
)
|
||||
|
||||
|
||||
throttle_config_with_field_fair_set = {
|
||||
"task_kinds": ["PageRequestHandler"],
|
||||
"fair": True,
|
||||
"initial": 27,
|
||||
"refill_interval": "43s",
|
||||
"refill_amount": 23,
|
||||
"max": 42,
|
||||
}
|
||||
|
||||
|
||||
def assert_throttle_config_with_field_fair_set(conf):
|
||||
"""
|
||||
Field `fair` is ignored, so, responses don't contain it
|
||||
"""
|
||||
without_fair = copy.deepcopy(throttle_config_with_field_fair_set)
|
||||
without_fair.pop("fair")
|
||||
|
||||
assert conf == without_fair
|
||||
|
||||
|
||||
def test_throttle_fair_config_is_settable_but_ignored_in_mgmt_api(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
|
||||
"""
|
||||
env = neon_env_builder.init_start()
|
||||
vps_http = env.storage_controller.pageserver_api()
|
||||
# with_fair config should still be settable
|
||||
vps_http.set_tenant_config(
|
||||
env.initial_tenant,
|
||||
{"timeline_get_throttle": throttle_config_with_field_fair_set},
|
||||
)
|
||||
conf = vps_http.tenant_config(env.initial_tenant)
|
||||
assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"])
|
||||
assert_throttle_config_with_field_fair_set(
|
||||
conf.tenant_specific_overrides["timeline_get_throttle"]
|
||||
)
|
||||
|
||||
|
||||
def test_throttle_fair_config_is_settable_but_ignored_in_config_toml(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
):
|
||||
"""
|
||||
To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
|
||||
"""
|
||||
|
||||
def set_tenant_config(ps_cfg):
|
||||
tenant_config = ps_cfg.setdefault("tenant_config", {})
|
||||
tenant_config["timeline_get_throttle"] = throttle_config_with_field_fair_set
|
||||
|
||||
neon_env_builder.pageserver_config_override = set_tenant_config
|
||||
env = neon_env_builder.init_start()
|
||||
ps_http = env.pageserver.http_client()
|
||||
conf = ps_http.tenant_config(env.initial_tenant)
|
||||
assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"])
|
||||
|
||||
env.pageserver.allowed_errors.append(
|
||||
r'.*ignoring unknown configuration item path="tenant_config\.timeline_get_throttle\.fair"*'
|
||||
# without this assertion, the test would pass even if the throttling was completely broken
|
||||
# but the request processing is so slow that it makes up for the latency that a correct throttling
|
||||
# implementation would add
|
||||
assert actual_smgr_query_seconds < 0.66 * duration_secs, (
|
||||
"test self-test: request processing is consuming most of the wall clock time; this risks that we're not actually testing throttling"
|
||||
)
|
||||
|
||||
@@ -239,6 +239,8 @@ def test_isolation(
|
||||
"neon.regress_test_mode = true",
|
||||
# Stack size should be increased for tests to pass with asan.
|
||||
"max_stack_depth = 4MB",
|
||||
# Neon extensiosn starts 2 BGW so decreasing number of parallel workers which can affect deadlock-parallel test if it hits max_worker_processes.
|
||||
"max_worker_processes = 16",
|
||||
],
|
||||
)
|
||||
endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import os
|
||||
import ssl
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
@@ -151,3 +152,63 @@ def test_certificate_rotation(neon_env_builder: NeonEnvBuilder):
|
||||
requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status()
|
||||
cur_cert = ssl.get_server_certificate(("localhost", port))
|
||||
assert cur_cert == sk_cert
|
||||
|
||||
|
||||
def test_server_and_cert_metrics(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Test metrics exported from http/https server and tls cert reloader.
|
||||
"""
|
||||
neon_env_builder.use_https_pageserver_api = True
|
||||
neon_env_builder.pageserver_config_override = "ssl_cert_reload_period='100 ms'"
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
env.pageserver.allowed_errors.append(".*Error reloading certificate.*")
|
||||
|
||||
ps_client = env.pageserver.http_client()
|
||||
|
||||
# 1. Test connection started metric.
|
||||
filter_https = {"scheme": "https"}
|
||||
old_https_conn_count = (
|
||||
ps_client.get_metric_value("http_server_connection_started_total", filter_https) or 0
|
||||
)
|
||||
|
||||
addr = f"https://localhost:{env.pageserver.service_port.https}/v1/status"
|
||||
requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status()
|
||||
|
||||
new_https_conn_count = (
|
||||
ps_client.get_metric_value("http_server_connection_started_total", filter_https) or 0
|
||||
)
|
||||
# The counter should increase after the request,
|
||||
# but it may increase by more than one because of storcon requests.
|
||||
assert new_https_conn_count > old_https_conn_count
|
||||
|
||||
# 2. Test tls connection error.
|
||||
# Request without specified CA cert file should fail.
|
||||
with pytest.raises(requests.exceptions.SSLError):
|
||||
requests.get(addr)
|
||||
|
||||
tls_error_cnt = (
|
||||
ps_client.get_metric_value("http_server_connection_errors_total", {"type": "tls"}) or 0
|
||||
)
|
||||
assert tls_error_cnt == 1
|
||||
|
||||
# 3. Test expiration time metric.
|
||||
expiration_time = datetime.fromtimestamp(
|
||||
ps_client.get_metric_value("tls_certs_expiration_time_seconds") or 0
|
||||
)
|
||||
now = datetime.now()
|
||||
# neon_local generates certs valid for 100 years.
|
||||
# Compare with +-1 year to not care about leap years.
|
||||
assert now + timedelta(days=365 * 99) < expiration_time < now + timedelta(days=365 * 101)
|
||||
|
||||
# 4. Test cert reload failed metric.
|
||||
reload_error_cnt = ps_client.get_metric_value("tls_certs_reload_failed_total")
|
||||
assert reload_error_cnt == 0
|
||||
|
||||
os.remove(env.pageserver.workdir / "server.crt")
|
||||
|
||||
def reload_failed():
|
||||
reload_error_cnt = ps_client.get_metric_value("tls_certs_reload_failed_total") or 0
|
||||
assert reload_error_cnt > 0
|
||||
|
||||
wait_until(reload_failed)
|
||||
|
||||
@@ -2892,10 +2892,12 @@ def test_storage_controller_leadership_transfer(
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("step_down_times_out", [False, True])
|
||||
def test_storage_controller_leadership_transfer_during_split(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
storage_controller_proxy: StorageControllerProxy,
|
||||
port_distributor: PortDistributor,
|
||||
step_down_times_out: bool,
|
||||
):
|
||||
"""
|
||||
Exercise a race between shard splitting and graceful leadership transfer. This is
|
||||
@@ -2936,6 +2938,18 @@ def test_storage_controller_leadership_transfer_during_split(
|
||||
)
|
||||
env.storage_controller.reconcile_until_idle()
|
||||
|
||||
# We are testing scenarios where the step down API does not complete: either because it is stuck
|
||||
# doing a shard split, or because it totally times out on some other failpoint.
|
||||
env.storage_controller.allowed_errors.extend(
|
||||
[
|
||||
".*step_down.*request was dropped before completing.*",
|
||||
".*step_down.*operation timed out.*",
|
||||
".*Send step down request failed, will retry.*",
|
||||
".*Send step down request still failed after.*retries.*",
|
||||
".*Leader .+ did not respond to step-down request.*",
|
||||
]
|
||||
)
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
|
||||
# Start a shard split
|
||||
env.storage_controller.allowed_errors.extend(
|
||||
@@ -2943,6 +2957,14 @@ def test_storage_controller_leadership_transfer_during_split(
|
||||
)
|
||||
pause_failpoint = "shard-split-pre-complete"
|
||||
env.storage_controller.configure_failpoints((pause_failpoint, "pause"))
|
||||
|
||||
if not step_down_times_out:
|
||||
# Prevent the timeout self-terminate code from executing: we will block step down on the
|
||||
# shard split itself
|
||||
env.storage_controller.configure_failpoints(
|
||||
("step-down-delay-timeout", "return(3600000)")
|
||||
)
|
||||
|
||||
split_fut = executor.submit(
|
||||
env.storage_controller.tenant_shard_split, list(tenants)[0], shard_count * 2
|
||||
)
|
||||
@@ -2961,12 +2983,20 @@ def test_storage_controller_leadership_transfer_during_split(
|
||||
timeout_in_seconds=30, instance_id=2, base_port=storage_controller_2_port
|
||||
)
|
||||
|
||||
if step_down_times_out:
|
||||
# Step down will time out, original controller will terminate itself
|
||||
env.storage_controller.allowed_errors.extend([".*terminating process.*"])
|
||||
else:
|
||||
# Step down does not time out: original controller hits its shard split completion
|
||||
# code path and realises that it must not purge the parent shards from the database.
|
||||
env.storage_controller.allowed_errors.extend([".*Enqueuing background abort.*"])
|
||||
|
||||
def passed_split_abort():
|
||||
try:
|
||||
log.info("Checking log for pattern...")
|
||||
assert env.storage_controller.log_contains(
|
||||
".*Using observed state received from leader.*"
|
||||
)
|
||||
# This log is indicative of entering startup_reconcile, which happens
|
||||
# after the point we would abort shard splits
|
||||
assert env.storage_controller.log_contains(".*Populating tenant shards.*")
|
||||
except Exception:
|
||||
log.exception("Failed to find pattern in log")
|
||||
raise
|
||||
@@ -2975,34 +3005,42 @@ def test_storage_controller_leadership_transfer_during_split(
|
||||
wait_until(passed_split_abort, interval=0.1, status_interval=1.0)
|
||||
assert env.storage_controller.log_contains(".*Aborting shard split.*")
|
||||
|
||||
# Proxy is still talking to original controller here: disable its pause failpoint so
|
||||
# that its shard split can run to completion.
|
||||
log.info("Disabling failpoint")
|
||||
# Bypass the proxy: the python test HTTPServer is single threaded and still blocked
|
||||
# on handling the shard split request.
|
||||
env.storage_controller.request(
|
||||
"PUT",
|
||||
f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints",
|
||||
json=[{"name": "shard-split-pre-complete", "actions": "off"}],
|
||||
headers=env.storage_controller.headers(TokenScope.ADMIN),
|
||||
)
|
||||
if step_down_times_out:
|
||||
# We will let the old controller hit a timeout path where it terminates itself, rather than
|
||||
# completing step_down and trying to complete a shard split
|
||||
def old_controller_terminated():
|
||||
assert env.storage_controller.log_contains(".*terminating process.*")
|
||||
|
||||
def previous_stepped_down():
|
||||
assert (
|
||||
env.storage_controller.get_leadership_status()
|
||||
== StorageControllerLeadershipStatus.STEPPED_DOWN
|
||||
wait_until(old_controller_terminated)
|
||||
else:
|
||||
# Proxy is still talking to original controller here: disable its pause failpoint so
|
||||
# that its shard split can run to completion.
|
||||
log.info("Disabling failpoint")
|
||||
# Bypass the proxy: the python test HTTPServer is single threaded and still blocked
|
||||
# on handling the shard split request.
|
||||
env.storage_controller.request(
|
||||
"PUT",
|
||||
f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints",
|
||||
json=[{"name": "shard-split-pre-complete", "actions": "off"}],
|
||||
headers=env.storage_controller.headers(TokenScope.ADMIN),
|
||||
)
|
||||
|
||||
log.info("Awaiting step down")
|
||||
wait_until(previous_stepped_down)
|
||||
def previous_stepped_down():
|
||||
assert (
|
||||
env.storage_controller.get_leadership_status()
|
||||
== StorageControllerLeadershipStatus.STEPPED_DOWN
|
||||
)
|
||||
|
||||
# Let the shard split complete: this may happen _after_ the replacement has come up
|
||||
# and tried to clean up the databases
|
||||
log.info("Unblocking & awaiting shard split")
|
||||
with pytest.raises(Exception, match="Unexpected child shard count"):
|
||||
# This split fails when it tries to persist results, because it encounters
|
||||
# changes already made by the new controller's abort-on-startup
|
||||
split_fut.result()
|
||||
log.info("Awaiting step down")
|
||||
wait_until(previous_stepped_down)
|
||||
|
||||
# Let the shard split complete: this may happen _after_ the replacement has come up
|
||||
# and tried to clean up the databases
|
||||
log.info("Unblocking & awaiting shard split")
|
||||
with pytest.raises(Exception, match="Unexpected child shard count"):
|
||||
# This split fails when it tries to persist results, because it encounters
|
||||
# changes already made by the new controller's abort-on-startup
|
||||
split_fut.result()
|
||||
|
||||
log.info("Routing to new leader")
|
||||
storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}")
|
||||
@@ -3020,13 +3058,14 @@ def test_storage_controller_leadership_transfer_during_split(
|
||||
env.storage_controller.wait_until_ready()
|
||||
env.storage_controller.consistency_check()
|
||||
|
||||
# Check that the stepped down instance forwards requests
|
||||
# to the new leader while it's still running.
|
||||
storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
|
||||
env.storage_controller.tenant_shard_dump()
|
||||
env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"})
|
||||
status = env.storage_controller.node_status(env.pageservers[0].id)
|
||||
assert status["scheduling"] == "Pause"
|
||||
if not step_down_times_out:
|
||||
# Check that the stepped down instance forwards requests
|
||||
# to the new leader while it's still running.
|
||||
storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
|
||||
env.storage_controller.tenant_shard_dump()
|
||||
env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"})
|
||||
status = env.storage_controller.node_status(env.pageservers[0].id)
|
||||
assert status["scheduling"] == "Pause"
|
||||
|
||||
|
||||
def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
@@ -251,7 +251,7 @@ def test_multiple_subscription_branching(neon_simple_env: NeonEnv):
|
||||
NUMBER_OF_DBS = 5
|
||||
|
||||
# Create and start endpoint so that neon_local put all the generated
|
||||
# stuff into the spec.json file.
|
||||
# stuff into the config.json file.
|
||||
endpoint = env.endpoints.create_start(
|
||||
"main",
|
||||
config_lines=[
|
||||
@@ -280,13 +280,15 @@ def test_multiple_subscription_branching(neon_simple_env: NeonEnv):
|
||||
}
|
||||
)
|
||||
|
||||
# Update the spec.json file to create the databases
|
||||
# Update the config.json file to create the databases
|
||||
# and reconfigure the endpoint to apply the changes.
|
||||
endpoint.respec_deep(
|
||||
**{
|
||||
"skip_pg_catalog_updates": False,
|
||||
"cluster": {
|
||||
"databases": TEST_DB_NAMES,
|
||||
"spec": {
|
||||
"skip_pg_catalog_updates": False,
|
||||
"cluster": {
|
||||
"databases": TEST_DB_NAMES,
|
||||
},
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
@@ -390,6 +390,7 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder):
|
||||
# Tenant creation requests which arrive out of order will generate complaints about
|
||||
# generation nubmers out of order.
|
||||
env.pageserver.allowed_errors.append(".*Generation .+ is less than existing .+")
|
||||
env.pageserver.allowed_errors.append(".*due to stale generation.+")
|
||||
|
||||
# Timeline::flush_and_shutdown cannot tell if it is hitting a failure because of
|
||||
# an incomplete attach, or some other problem. In the field this should be rare,
|
||||
|
||||
Reference in New Issue
Block a user