Merge pull request #10107 from neondatabase/rc/release-proxy/2024-12-12

Proxy release 2024-12-12
This commit is contained in:
Conrad Ludgate
2024-12-12 10:21:30 +00:00
committed by GitHub
121 changed files with 8027 additions and 1071 deletions

View File

@@ -21,3 +21,5 @@ config-variables:
- SLACK_UPCOMING_RELEASE_CHANNEL_ID
- DEV_AWS_OIDC_ROLE_ARN
- BENCHMARK_INGEST_TARGET_PROJECTID
- PGREGRESS_PG16_PROJECT_ID
- PGREGRESS_PG17_PROJECT_ID

View File

@@ -283,7 +283,7 @@ jobs:
submodules: true
- name: Pytest regression tests
continue-on-error: ${{ matrix.lfc_state == 'with-lfc' }}
continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }}
uses: ./.github/actions/run-python-test-set
timeout-minutes: 60
with:

View File

@@ -255,15 +255,17 @@ jobs:
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
build-tag: ${{ needs.tag.outputs.build-tag }}
build-type: ${{ matrix.build-type }}
# Run tests on all Postgres versions in release builds and only on the latest version in debug builds
# run without LFC on v17 release only
# Run tests on all Postgres versions in release builds and only on the latest version in debug builds.
# Run without LFC on v17 release and debug builds only. For all the other cases LFC is enabled. Failure on the
# debug build with LFC enabled doesn't block merging.
test-cfg: |
${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "without-lfc"},
{"pg_version":"v15", "lfc_state": "without-lfc"},
{"pg_version":"v16", "lfc_state": "without-lfc"},
{"pg_version":"v17", "lfc_state": "without-lfc"},
{"pg_version":"v17", "lfc_state": "with-lfc"}]'
|| '[{"pg_version":"v17", "lfc_state": "without-lfc"}]' }}
${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "with-lfc"},
{"pg_version":"v15", "lfc_state": "with-lfc"},
{"pg_version":"v16", "lfc_state": "with-lfc"},
{"pg_version":"v17", "lfc_state": "with-lfc"},
{"pg_version":"v17", "lfc_state": "without-lfc"}]'
|| '[{"pg_version":"v17", "lfc_state": "without-lfc"},
{"pg_version":"v17", "lfc_state": "with-lfc" }]' }}
secrets: inherit
# Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
@@ -1066,6 +1068,70 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Create git tag and GitHub release
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
uses: actions/github-script@v7
with:
retries: 5
script: |
const tag = "${{ needs.tag.outputs.build-tag }}";
try {
const existingRef = await github.rest.git.getRef({
owner: context.repo.owner,
repo: context.repo.repo,
ref: `tags/${tag}`,
});
if (existingRef.data.object.sha !== context.sha) {
throw new Error(`Tag ${tag} already exists but points to a different commit (expected: ${context.sha}, actual: ${existingRef.data.object.sha}).`);
}
console.log(`Tag ${tag} already exists and points to ${context.sha} as expected.`);
} catch (error) {
if (error.status !== 404) {
throw error;
}
console.log(`Tag ${tag} does not exist. Creating it...`);
await github.rest.git.createRef({
owner: context.repo.owner,
repo: context.repo.repo,
ref: `refs/tags/${tag}`,
sha: context.sha,
});
console.log(`Tag ${tag} created successfully.`);
}
# TODO: check how GitHub releases looks for proxy/compute releases and enable them if they're ok
if (context.ref !== 'refs/heads/release') {
console.log(`GitHub release skipped for ${context.ref}.`);
return;
}
try {
const existingRelease = await github.rest.repos.getReleaseByTag({
owner: context.repo.owner,
repo: context.repo.repo,
tag: tag,
});
console.log(`Release for tag ${tag} already exists (ID: ${existingRelease.data.id}).`);
} catch (error) {
if (error.status !== 404) {
throw error;
}
console.log(`Release for tag ${tag} does not exist. Creating it...`);
await github.rest.repos.createRelease({
owner: context.repo.owner,
repo: context.repo.repo,
tag_name: tag,
generate_release_notes: true,
});
console.log(`Release for tag ${tag} created successfully.`);
}
- name: Trigger deploy workflow
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
@@ -1115,35 +1181,6 @@ jobs:
exit 1
fi
- name: Create git tag
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
uses: actions/github-script@v7
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
script: |
await github.rest.git.createRef({
owner: context.repo.owner,
repo: context.repo.repo,
ref: "refs/tags/${{ needs.tag.outputs.build-tag }}",
sha: context.sha,
})
# TODO: check how GitHub releases looks for proxy releases and enable it if it's ok
- name: Create GitHub release
if: github.ref_name == 'release'
uses: actions/github-script@v7
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
script: |
await github.rest.repos.createRelease({
owner: context.repo.owner,
repo: context.repo.repo,
tag_name: "${{ needs.tag.outputs.build-tag }}",
generate_release_notes: true,
})
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
promote-compatibility-data:
needs: [ deploy ]

View File

@@ -23,11 +23,14 @@ jobs:
regress:
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 16
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
strategy:
fail-fast: false
matrix:
pg-version: [16, 17]
runs-on: us-east-2
container:
@@ -40,9 +43,11 @@ jobs:
submodules: true
- name: Patch the test
env:
PG_VERSION: ${{matrix.pg-version}}
run: |
cd "vendor/postgres-v${DEFAULT_PG_VERSION}"
patch -p1 < "../../compute/patches/cloud_regress_pg${DEFAULT_PG_VERSION}.patch"
cd "vendor/postgres-v${PG_VERSION}"
patch -p1 < "../../compute/patches/cloud_regress_pg${PG_VERSION}.patch"
- name: Generate a random password
id: pwgen
@@ -55,8 +60,9 @@ jobs:
- name: Change tests according to the generated password
env:
DBPASS: ${{ steps.pwgen.outputs.DBPASS }}
PG_VERSION: ${{matrix.pg-version}}
run: |
cd vendor/postgres-v"${DEFAULT_PG_VERSION}"/src/test/regress
cd vendor/postgres-v"${PG_VERSION}"/src/test/regress
for fname in sql/*.sql expected/*.out; do
sed -i.bak s/NEON_PASSWORD_PLACEHOLDER/"'${DBPASS}'"/ "${fname}"
done
@@ -73,15 +79,29 @@ jobs:
path: /tmp/neon/
prefix: latest
- name: Create a new branch
id: create-branch
uses: ./.github/actions/neon-branch-create
with:
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
project_id: ${{ vars[format('PGREGRESS_PG{0}_PROJECT_ID', matrix.pg-version)] }}
- name: Run the regression tests
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: cloud_regress
pg_version: ${{ env.DEFAULT_PG_VERSION }}
pg_version: ${{matrix.pg-version}}
extra_params: -m remote_cluster
env:
BENCHMARK_CONNSTR: ${{ secrets.PG_REGRESS_CONNSTR }}
BENCHMARK_CONNSTR: ${{steps.create-branch.outputs.dsn}}
- name: Delete branch
uses: ./.github/actions/neon-branch-delete
with:
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
project_id: ${{ vars[format('PGREGRESS_PG{0}_PROJECT_ID', matrix.pg-version)] }}
branch_id: ${{steps.create-branch.outputs.branch_id}}
- name: Create Allure report
id: create-allure-report

View File

@@ -1,16 +1,29 @@
/.github/ @neondatabase/developer-productivity
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
/libs/pageserver_api/ @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage
/libs/proxy/ @neondatabase/proxy
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/storage
# Autoscaling
/libs/vm_monitor/ @neondatabase/autoscaling
/pageserver/ @neondatabase/storage
# DevProd
/.github/ @neondatabase/developer-productivity
# Compute
/pgxn/ @neondatabase/compute
/pgxn/neon/ @neondatabase/compute @neondatabase/storage
/vendor/ @neondatabase/compute
/compute/ @neondatabase/compute
/compute_tools/ @neondatabase/compute
# Proxy
/libs/proxy/ @neondatabase/proxy
/proxy/ @neondatabase/proxy
# Storage
/pageserver/ @neondatabase/storage
/safekeeper/ @neondatabase/storage
/storage_controller @neondatabase/storage
/storage_scrubber @neondatabase/storage
/vendor/ @neondatabase/compute
/libs/pageserver_api/ @neondatabase/storage
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/storage
# Shared
/pgxn/neon/ @neondatabase/compute @neondatabase/storage
/libs/compute_api/ @neondatabase/compute @neondatabase/control-plane
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage

519
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -51,10 +51,6 @@ anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
atomic-take = "1.1.0"
azure_core = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] }
azure_identity = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
azure_storage = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
azure_storage_blobs = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
flate2 = "1.0.26"
async-stream = "0.3"
async-trait = "0.1"
@@ -216,6 +212,12 @@ postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git",
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
## Azure SDK crates
azure_core = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] }
azure_identity = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] }
azure_storage = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] }
azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] }
## Local libraries
compute_api = { version = "0.1", path = "./libs/compute_api/" }
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }

View File

@@ -115,7 +115,7 @@ RUN set -e \
# Keep the version the same as in compute/compute-node.Dockerfile and
# test_runner/regress/test_compute_metrics.py.
ENV SQL_EXPORTER_VERSION=0.13.1
ENV SQL_EXPORTER_VERSION=0.16.0
RUN curl -fsSL \
"https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \
--output sql_exporter.tar.gz \

View File

@@ -1324,7 +1324,7 @@ FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
# Keep the version the same as in build-tools.Dockerfile and
# test_runner/regress/test_compute_metrics.py.
FROM burningalchemist/sql_exporter:0.13.1 AS sql-exporter
FROM burningalchemist/sql_exporter:0.16.0 AS sql-exporter
#########################################################################################
#

View File

@@ -6,6 +6,7 @@
import 'sql_exporter/compute_backpressure_throttling_seconds.libsonnet',
import 'sql_exporter/compute_current_lsn.libsonnet',
import 'sql_exporter/compute_logical_snapshot_files.libsonnet',
import 'sql_exporter/compute_logical_snapshots_bytes.libsonnet',
import 'sql_exporter/compute_max_connections.libsonnet',
import 'sql_exporter/compute_receive_lsn.libsonnet',
import 'sql_exporter/compute_subscriptions_count.libsonnet',

View File

@@ -0,0 +1,7 @@
SELECT
(SELECT current_setting('neon.timeline_id')) AS timeline_id,
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
-- These temporary snapshot files are renamed to the actual snapshot files
-- after they are completely built. We only WAL-log the completely built
-- snapshot files
(SELECT COALESCE(sum(size), 0) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes;

View File

@@ -0,0 +1,17 @@
local neon = import 'neon.libsonnet';
local pg_ls_logicalsnapdir = importstr 'sql_exporter/compute_logical_snapshots_bytes.15.sql';
local pg_ls_dir = importstr 'sql_exporter/compute_logical_snapshots_bytes.sql';
{
metric_name: 'compute_logical_snapshots_bytes',
type: 'gauge',
help: 'Size of the pg_logical/snapshots directory, not including temporary files',
key_labels: [
'timeline_id',
],
values: [
'logical_snapshots_bytes',
],
query: if neon.PG_MAJORVERSION_NUM < 15 then pg_ls_dir else pg_ls_logicalsnapdir,
}

View File

@@ -0,0 +1,9 @@
SELECT
(SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
-- These temporary snapshot files are renamed to the actual snapshot files
-- after they are completely built. We only WAL-log the completely built
-- snapshot files
(SELECT COALESCE(sum((pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0)
FROM (SELECT * FROM pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name
) AS logical_snapshots_bytes;

File diff suppressed because it is too large Load Diff

View File

@@ -246,47 +246,48 @@ fn try_spec_from_cli(
let compute_id = matches.get_one::<String>("compute-id");
let control_plane_uri = matches.get_one::<String>("control-plane-uri");
let spec;
let mut live_config_allowed = false;
match spec_json {
// First, try to get cluster spec from the cli argument
Some(json) => {
info!("got spec from cli argument {}", json);
spec = Some(serde_json::from_str(json)?);
}
None => {
// Second, try to read it from the file if path is provided
if let Some(sp) = spec_path {
let path = Path::new(sp);
let file = File::open(path)?;
spec = Some(serde_json::from_reader(file)?);
live_config_allowed = true;
} else if let Some(id) = compute_id {
if let Some(cp_base) = control_plane_uri {
live_config_allowed = true;
spec = match get_spec_from_control_plane(cp_base, id) {
Ok(s) => s,
Err(e) => {
error!("cannot get response from control plane: {}", e);
panic!("neither spec nor confirmation that compute is in the Empty state was received");
}
};
} else {
panic!("must specify both --control-plane-uri and --compute-id or none");
}
} else {
panic!(
"compute spec should be provided by one of the following ways: \
--spec OR --spec-path OR --control-plane-uri and --compute-id"
);
}
}
// First, try to get cluster spec from the cli argument
if let Some(spec_json) = spec_json {
info!("got spec from cli argument {}", spec_json);
return Ok(CliSpecParams {
spec: Some(serde_json::from_str(spec_json)?),
live_config_allowed: false,
});
}
// Second, try to read it from the file if path is provided
if let Some(spec_path) = spec_path {
let file = File::open(Path::new(spec_path))?;
return Ok(CliSpecParams {
spec: Some(serde_json::from_reader(file)?),
live_config_allowed: true,
});
}
let Some(compute_id) = compute_id else {
panic!(
"compute spec should be provided by one of the following ways: \
--spec OR --spec-path OR --control-plane-uri and --compute-id"
);
};
let Some(control_plane_uri) = control_plane_uri else {
panic!("must specify both --control-plane-uri and --compute-id or none");
};
Ok(CliSpecParams {
spec,
live_config_allowed,
})
match get_spec_from_control_plane(control_plane_uri, compute_id) {
Ok(spec) => Ok(CliSpecParams {
spec,
live_config_allowed: true,
}),
Err(e) => {
error!(
"cannot get response from control plane: {}\n\
neither spec nor confirmation that compute is in the Empty state was received",
e
);
Err(e)
}
}
}
struct CliSpecParams {

View File

@@ -1243,12 +1243,7 @@ impl ComputeNode {
let postgresql_conf_path = pgdata_path.join("postgresql.conf");
config::write_postgres_conf(&postgresql_conf_path, &spec, self.http_port)?;
// TODO(ololobus): We need a concurrency during reconfiguration as well,
// but DB is already running and used by user. We can easily get out of
// `max_connections` limit, and the current code won't handle that.
// let compute_state = self.state.lock().unwrap().clone();
// let max_concurrent_connections = self.max_service_connections(&compute_state, &spec);
let max_concurrent_connections = 1;
let max_concurrent_connections = spec.reconfigure_concurrency;
// Temporarily reset max_cluster_size in config
// to avoid the possibility of hitting the limit, while we are reconfiguring:

View File

@@ -537,12 +537,14 @@ components:
properties:
extname:
type: string
versions:
type: array
version:
type: string
items:
type: string
n_databases:
type: integer
owned_by_superuser:
type: integer
SetRoleGrantsRequest:
type: object

View File

@@ -1,7 +1,6 @@
use compute_api::responses::{InstalledExtension, InstalledExtensions};
use metrics::proto::MetricFamily;
use std::collections::HashMap;
use std::collections::HashSet;
use anyhow::Result;
use postgres::{Client, NoTls};
@@ -38,61 +37,77 @@ fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
/// Connect to every database (see list_dbs above) and get the list of installed extensions.
///
/// Same extension can be installed in multiple databases with different versions,
/// we only keep the highest and lowest version across all databases.
/// so we report a separate metric (number of databases where it is installed)
/// for each extension version.
pub fn get_installed_extensions(mut conf: postgres::config::Config) -> Result<InstalledExtensions> {
conf.application_name("compute_ctl:get_installed_extensions");
let mut client = conf.connect(NoTls)?;
let databases: Vec<String> = list_dbs(&mut client)?;
let mut extensions_map: HashMap<String, InstalledExtension> = HashMap::new();
let mut extensions_map: HashMap<(String, String, String), InstalledExtension> = HashMap::new();
for db in databases.iter() {
conf.dbname(db);
let mut db_client = conf.connect(NoTls)?;
let extensions: Vec<(String, String)> = db_client
let extensions: Vec<(String, String, i32)> = db_client
.query(
"SELECT extname, extversion FROM pg_catalog.pg_extension;",
"SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension",
&[],
)?
.iter()
.map(|row| (row.get("extname"), row.get("extversion")))
.map(|row| {
(
row.get("extname"),
row.get("extversion"),
row.get("extowner"),
)
})
.collect();
for (extname, v) in extensions.iter() {
for (extname, v, extowner) in extensions.iter() {
let version = v.to_string();
// increment the number of databases where the version of extension is installed
INSTALLED_EXTENSIONS
.with_label_values(&[extname, &version])
.inc();
// check if the extension is owned by superuser
// 10 is the oid of superuser
let owned_by_superuser = if *extowner == 10 { "1" } else { "0" };
extensions_map
.entry(extname.to_string())
.entry((
extname.to_string(),
version.clone(),
owned_by_superuser.to_string(),
))
.and_modify(|e| {
e.versions.insert(version.clone());
// count the number of databases where the extension is installed
e.n_databases += 1;
})
.or_insert(InstalledExtension {
extname: extname.to_string(),
versions: HashSet::from([version.clone()]),
version: version.clone(),
n_databases: 1,
owned_by_superuser: owned_by_superuser.to_string(),
});
}
}
let res = InstalledExtensions {
extensions: extensions_map.into_values().collect(),
};
for (key, ext) in extensions_map.iter() {
let (extname, version, owned_by_superuser) = key;
let n_databases = ext.n_databases as u64;
Ok(res)
INSTALLED_EXTENSIONS
.with_label_values(&[extname, version, owned_by_superuser])
.set(n_databases);
}
Ok(InstalledExtensions {
extensions: extensions_map.into_values().collect(),
})
}
static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"compute_installed_extensions",
"Number of databases where the version of extension is installed",
&["extension_name", "version"]
&["extension_name", "version", "owned_by_superuser"]
)
.expect("failed to define a metric")
});

View File

@@ -53,6 +53,7 @@ use compute_api::spec::Role;
use nix::sys::signal::kill;
use nix::sys::signal::Signal;
use pageserver_api::shard::ShardStripeSize;
use reqwest::header::CONTENT_TYPE;
use serde::{Deserialize, Serialize};
use url::Host;
use utils::id::{NodeId, TenantId, TimelineId};
@@ -618,6 +619,7 @@ impl Endpoint {
pgbouncer_settings: None,
shard_stripe_size: Some(shard_stripe_size),
local_proxy_config: None,
reconfigure_concurrency: 1,
};
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -808,7 +810,7 @@ impl Endpoint {
}
let client = reqwest::Client::builder()
.timeout(Duration::from_secs(30))
.timeout(Duration::from_secs(120))
.build()
.unwrap();
let response = client
@@ -817,6 +819,7 @@ impl Endpoint {
self.http_address.ip(),
self.http_address.port()
))
.header(CONTENT_TYPE.as_str(), "application/json")
.body(format!(
"{{\"spec\":{}}}",
serde_json::to_string_pretty(&spec)?

View File

@@ -435,7 +435,7 @@ impl PageServerNode {
) -> anyhow::Result<()> {
let config = Self::parse_config(settings)?;
self.http_client
.tenant_config(&models::TenantConfigRequest { tenant_id, config })
.set_tenant_config(&models::TenantConfigRequest { tenant_id, config })
.await?;
Ok(())

View File

@@ -9,8 +9,8 @@ use pageserver_api::{
},
models::{
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
ShardParameters, TenantConfig, TenantConfigRequest, TenantShardSplitRequest,
TenantShardSplitResponse,
ShardParameters, TenantConfig, TenantConfigPatchRequest, TenantConfigRequest,
TenantShardSplitRequest, TenantShardSplitResponse,
},
shard::{ShardStripeSize, TenantShardId},
};
@@ -116,9 +116,19 @@ enum Command {
#[arg(long)]
tenant_shard_id: TenantShardId,
},
/// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
/// Set the pageserver tenant configuration of a tenant: this is the configuration structure
/// that is passed through to pageservers, and does not affect storage controller behavior.
TenantConfig {
/// Any previous tenant configs are overwritten.
SetTenantConfig {
#[arg(long)]
tenant_id: TenantId,
#[arg(long)]
config: String,
},
/// Patch the pageserver tenant configuration of a tenant. Any fields with null values in the
/// provided JSON are unset from the tenant config and all fields with non-null values are set.
/// Unspecified fields are not changed.
PatchTenantConfig {
#[arg(long)]
tenant_id: TenantId,
#[arg(long)]
@@ -549,11 +559,21 @@ async fn main() -> anyhow::Result<()> {
)
.await?;
}
Command::TenantConfig { tenant_id, config } => {
Command::SetTenantConfig { tenant_id, config } => {
let tenant_conf = serde_json::from_str(&config)?;
vps_client
.tenant_config(&TenantConfigRequest {
.set_tenant_config(&TenantConfigRequest {
tenant_id,
config: tenant_conf,
})
.await?;
}
Command::PatchTenantConfig { tenant_id, config } => {
let tenant_conf = serde_json::from_str(&config)?;
vps_client
.patch_tenant_config(&TenantConfigPatchRequest {
tenant_id,
config: tenant_conf,
})
@@ -736,7 +756,7 @@ async fn main() -> anyhow::Result<()> {
threshold,
} => {
vps_client
.tenant_config(&TenantConfigRequest {
.set_tenant_config(&TenantConfigRequest {
tenant_id,
config: TenantConfig {
eviction_policy: Some(EvictionPolicy::LayerAccessThreshold(

View File

@@ -42,6 +42,7 @@ allow = [
"MPL-2.0",
"OpenSSL",
"Unicode-DFS-2016",
"Unicode-3.0",
]
confidence-threshold = 0.8
exceptions = [

View File

@@ -1,6 +1,5 @@
//! Structs representing the JSON formats used in the compute_ctl's HTTP API.
use std::collections::HashSet;
use std::fmt::Display;
use chrono::{DateTime, Utc};
@@ -163,8 +162,9 @@ pub enum ControlPlaneComputeStatus {
#[derive(Clone, Debug, Default, Serialize)]
pub struct InstalledExtension {
pub extname: String,
pub versions: HashSet<String>,
pub version: String,
pub n_databases: u32, // Number of databases using this extension
pub owned_by_superuser: String,
}
#[derive(Clone, Debug, Default, Serialize)]

View File

@@ -19,6 +19,10 @@ pub type PgIdent = String;
/// String type alias representing Postgres extension version
pub type ExtVersion = String;
fn default_reconfigure_concurrency() -> usize {
1
}
/// Cluster spec or configuration represented as an optional number of
/// delta operations + final cluster state description.
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
@@ -67,7 +71,7 @@ pub struct ComputeSpec {
pub cluster: Cluster,
pub delta_operations: Option<Vec<DeltaOp>>,
/// An optinal hint that can be passed to speed up startup time if we know
/// An optional hint that can be passed to speed up startup time if we know
/// that no pg catalog mutations (like role creation, database creation,
/// extension creation) need to be done on the actual database to start.
#[serde(default)] // Default false
@@ -86,9 +90,7 @@ pub struct ComputeSpec {
// etc. GUCs in cluster.settings. TODO: Once the control plane has been
// updated to fill these fields, we can make these non optional.
pub tenant_id: Option<TenantId>,
pub timeline_id: Option<TimelineId>,
pub pageserver_connstring: Option<String>,
#[serde(default)]
@@ -113,6 +115,20 @@ pub struct ComputeSpec {
/// Local Proxy configuration used for JWT authentication
#[serde(default)]
pub local_proxy_config: Option<LocalProxySpec>,
/// Number of concurrent connections during the parallel RunInEachDatabase
/// phase of the apply config process.
///
/// We need a higher concurrency during reconfiguration in case of many DBs,
/// but instance is already running and used by client. We can easily get out of
/// `max_connections` limit, and the current code won't handle that.
///
/// Default is 1, but also allow control plane to override this value for specific
/// projects. It's also recommended to bump `superuser_reserved_connections` +=
/// `reconfigure_concurrency` for such projects to ensure that we always have
/// enough spare connections for reconfiguration process to succeed.
#[serde(default = "default_reconfigure_concurrency")]
pub reconfigure_concurrency: usize,
}
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -315,6 +331,9 @@ mod tests {
// Features list defaults to empty vector.
assert!(spec.features.is_empty());
// Reconfigure concurrency defaults to 1.
assert_eq!(spec.reconfigure_concurrency, 1);
}
#[test]

View File

@@ -245,6 +245,17 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
}
}
/// Scheduling policy enables us to selectively disable some automatic actions that the
/// controller performs on a tenant shard. This is only set to a non-default value by
/// human intervention, and it is reset to the default value (Active) when the tenant's
/// placement policy is modified away from Attached.
///
/// The typical use of a non-Active scheduling policy is one of:
/// - Pinnning a shard to a node (i.e. migrating it there & setting a non-Active scheduling policy)
/// - Working around a bug (e.g. if something is flapping and we need to stop it until the bug is fixed)
///
/// If you're not sure which policy to use to pin a shard to its current location, you probably
/// want Pause.
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
pub enum ShardSchedulingPolicy {
// Normal mode: the tenant's scheduled locations may be updated at will, including

View File

@@ -24,7 +24,7 @@ pub struct Key {
/// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as
/// a struct of fields.
#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug)]
pub struct CompactKey(i128);
/// The storage key size.

View File

@@ -17,7 +17,7 @@ use std::{
use byteorder::{BigEndian, ReadBytesExt};
use postgres_ffi::BLCKSZ;
use serde::{Deserialize, Serialize};
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use serde_with::serde_as;
use utils::{
completion,
@@ -325,6 +325,115 @@ impl Default for ShardParameters {
}
}
#[derive(Debug, Default, Clone, Eq, PartialEq)]
pub enum FieldPatch<T> {
Upsert(T),
Remove,
#[default]
Noop,
}
impl<T> FieldPatch<T> {
fn is_noop(&self) -> bool {
matches!(self, FieldPatch::Noop)
}
pub fn apply(self, target: &mut Option<T>) {
match self {
Self::Upsert(v) => *target = Some(v),
Self::Remove => *target = None,
Self::Noop => {}
}
}
pub fn map<U, E, F: FnOnce(T) -> Result<U, E>>(self, map: F) -> Result<FieldPatch<U>, E> {
match self {
Self::Upsert(v) => Ok(FieldPatch::<U>::Upsert(map(v)?)),
Self::Remove => Ok(FieldPatch::<U>::Remove),
Self::Noop => Ok(FieldPatch::<U>::Noop),
}
}
}
impl<'de, T: Deserialize<'de>> Deserialize<'de> for FieldPatch<T> {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
Option::deserialize(deserializer).map(|opt| match opt {
None => FieldPatch::Remove,
Some(val) => FieldPatch::Upsert(val),
})
}
}
impl<T: Serialize> Serialize for FieldPatch<T> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
match self {
FieldPatch::Upsert(val) => serializer.serialize_some(val),
FieldPatch::Remove => serializer.serialize_none(),
FieldPatch::Noop => unreachable!(),
}
}
}
#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
#[serde(default)]
pub struct TenantConfigPatch {
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub checkpoint_distance: FieldPatch<u64>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub checkpoint_timeout: FieldPatch<String>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_target_size: FieldPatch<u64>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_period: FieldPatch<String>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_threshold: FieldPatch<usize>,
// defer parsing compaction_algorithm, like eviction_policy
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_algorithm: FieldPatch<CompactionAlgorithmSettings>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub gc_horizon: FieldPatch<u64>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub gc_period: FieldPatch<String>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub image_creation_threshold: FieldPatch<usize>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub pitr_interval: FieldPatch<String>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub walreceiver_connect_timeout: FieldPatch<String>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub lagging_wal_timeout: FieldPatch<String>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub max_lsn_wal_lag: FieldPatch<NonZeroU64>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub eviction_policy: FieldPatch<EvictionPolicy>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub min_resident_size_override: FieldPatch<u64>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub evictions_low_residence_duration_metric_threshold: FieldPatch<String>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub heatmap_period: FieldPatch<String>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub lazy_slru_download: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub timeline_get_throttle: FieldPatch<ThrottleConfig>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub image_layer_creation_check_threshold: FieldPatch<u8>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub lsn_lease_length: FieldPatch<String>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub lsn_lease_length_for_ts: FieldPatch<String>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub timeline_offloading: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub wal_receiver_protocol_override: FieldPatch<PostgresClientProtocol>,
}
/// An alternative representation of `pageserver::tenant::TenantConf` with
/// simpler types.
#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
@@ -356,6 +465,107 @@ pub struct TenantConfig {
pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
}
impl TenantConfig {
pub fn apply_patch(self, patch: TenantConfigPatch) -> TenantConfig {
let Self {
mut checkpoint_distance,
mut checkpoint_timeout,
mut compaction_target_size,
mut compaction_period,
mut compaction_threshold,
mut compaction_algorithm,
mut gc_horizon,
mut gc_period,
mut image_creation_threshold,
mut pitr_interval,
mut walreceiver_connect_timeout,
mut lagging_wal_timeout,
mut max_lsn_wal_lag,
mut eviction_policy,
mut min_resident_size_override,
mut evictions_low_residence_duration_metric_threshold,
mut heatmap_period,
mut lazy_slru_download,
mut timeline_get_throttle,
mut image_layer_creation_check_threshold,
mut lsn_lease_length,
mut lsn_lease_length_for_ts,
mut timeline_offloading,
mut wal_receiver_protocol_override,
} = self;
patch.checkpoint_distance.apply(&mut checkpoint_distance);
patch.checkpoint_timeout.apply(&mut checkpoint_timeout);
patch
.compaction_target_size
.apply(&mut compaction_target_size);
patch.compaction_period.apply(&mut compaction_period);
patch.compaction_threshold.apply(&mut compaction_threshold);
patch.compaction_algorithm.apply(&mut compaction_algorithm);
patch.gc_horizon.apply(&mut gc_horizon);
patch.gc_period.apply(&mut gc_period);
patch
.image_creation_threshold
.apply(&mut image_creation_threshold);
patch.pitr_interval.apply(&mut pitr_interval);
patch
.walreceiver_connect_timeout
.apply(&mut walreceiver_connect_timeout);
patch.lagging_wal_timeout.apply(&mut lagging_wal_timeout);
patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag);
patch.eviction_policy.apply(&mut eviction_policy);
patch
.min_resident_size_override
.apply(&mut min_resident_size_override);
patch
.evictions_low_residence_duration_metric_threshold
.apply(&mut evictions_low_residence_duration_metric_threshold);
patch.heatmap_period.apply(&mut heatmap_period);
patch.lazy_slru_download.apply(&mut lazy_slru_download);
patch
.timeline_get_throttle
.apply(&mut timeline_get_throttle);
patch
.image_layer_creation_check_threshold
.apply(&mut image_layer_creation_check_threshold);
patch.lsn_lease_length.apply(&mut lsn_lease_length);
patch
.lsn_lease_length_for_ts
.apply(&mut lsn_lease_length_for_ts);
patch.timeline_offloading.apply(&mut timeline_offloading);
patch
.wal_receiver_protocol_override
.apply(&mut wal_receiver_protocol_override);
Self {
checkpoint_distance,
checkpoint_timeout,
compaction_target_size,
compaction_period,
compaction_threshold,
compaction_algorithm,
gc_horizon,
gc_period,
image_creation_threshold,
pitr_interval,
walreceiver_connect_timeout,
lagging_wal_timeout,
max_lsn_wal_lag,
eviction_policy,
min_resident_size_override,
evictions_low_residence_duration_metric_threshold,
heatmap_period,
lazy_slru_download,
timeline_get_throttle,
image_layer_creation_check_threshold,
lsn_lease_length,
lsn_lease_length_for_ts,
timeline_offloading,
wal_receiver_protocol_override,
}
}
}
/// The policy for the aux file storage.
///
/// It can be switched through `switch_aux_file_policy` tenant config.
@@ -686,6 +896,14 @@ impl TenantConfigRequest {
}
}
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantConfigPatchRequest {
pub tenant_id: TenantId,
#[serde(flatten)]
pub config: TenantConfigPatch, // as we have a flattened field, we should reject all unknown fields in it
}
/// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
#[derive(Serialize, Deserialize, Clone)]
#[serde(tag = "slug", content = "data", rename_all = "snake_case")]
@@ -1699,4 +1917,45 @@ mod tests {
);
}
}
#[test]
fn test_tenant_config_patch_request_serde() {
let patch_request = TenantConfigPatchRequest {
tenant_id: TenantId::from_str("17c6d121946a61e5ab0fe5a2fd4d8215").unwrap(),
config: TenantConfigPatch {
checkpoint_distance: FieldPatch::Upsert(42),
gc_horizon: FieldPatch::Remove,
compaction_threshold: FieldPatch::Noop,
..TenantConfigPatch::default()
},
};
let json = serde_json::to_string(&patch_request).unwrap();
let expected = r#"{"tenant_id":"17c6d121946a61e5ab0fe5a2fd4d8215","checkpoint_distance":42,"gc_horizon":null}"#;
assert_eq!(json, expected);
let decoded: TenantConfigPatchRequest = serde_json::from_str(&json).unwrap();
assert_eq!(decoded.tenant_id, patch_request.tenant_id);
assert_eq!(decoded.config, patch_request.config);
// Now apply the patch to a config to demonstrate semantics
let base = TenantConfig {
checkpoint_distance: Some(28),
gc_horizon: Some(100),
compaction_target_size: Some(1024),
..Default::default()
};
let expected = TenantConfig {
checkpoint_distance: Some(42),
gc_horizon: None,
..base.clone()
};
let patched = base.apply_patch(decoded.config);
assert_eq!(patched, expected);
}
}

View File

@@ -158,7 +158,8 @@ impl ShardIdentity {
key_to_shard_number(self.count, self.stripe_size, key)
}
/// Return true if the key should be ingested by this shard
/// Return true if the key is stored only on this shard. This does not include
/// global keys, see is_key_global().
///
/// Shards must ingest _at least_ keys which return true from this check.
pub fn is_key_local(&self, key: &Key) -> bool {
@@ -171,7 +172,7 @@ impl ShardIdentity {
}
/// Return true if the key should be stored on all shards, not just one.
fn is_key_global(&self, key: &Key) -> bool {
pub fn is_key_global(&self, key: &Key) -> bool {
if key.is_slru_block_key() || key.is_slru_segment_size_key() || key.is_aux_file_key() {
// Special keys that are only stored on shard 0
false

View File

@@ -8,15 +8,14 @@ use std::io;
use std::num::NonZeroU32;
use std::pin::Pin;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use std::time::SystemTime;
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
use anyhow::Context;
use anyhow::Result;
use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range};
use azure_core::{Continuable, RetryOptions};
use azure_identity::DefaultAzureCredential;
use azure_storage::StorageCredentials;
use azure_storage_blobs::blob::CopyStatus;
use azure_storage_blobs::prelude::ClientBuilder;
@@ -76,8 +75,9 @@ impl AzureBlobStorage {
let credentials = if let Ok(access_key) = env::var("AZURE_STORAGE_ACCESS_KEY") {
StorageCredentials::access_key(account.clone(), access_key)
} else {
let token_credential = DefaultAzureCredential::default();
StorageCredentials::token_credential(Arc::new(token_credential))
let token_credential = azure_identity::create_default_credential()
.context("trying to obtain Azure default credentials")?;
StorageCredentials::token_credential(token_credential)
};
// we have an outer retry
@@ -624,6 +624,10 @@ impl RemoteStorage for AzureBlobStorage {
res
}
fn max_keys_per_delete(&self) -> usize {
super::MAX_KEYS_PER_DELETE_AZURE
}
async fn copy(
&self,
from: &RemotePath,

View File

@@ -70,7 +70,14 @@ pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100;
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
/// As defined in S3 docs
pub const MAX_KEYS_PER_DELETE: usize = 1000;
///
/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html>
pub const MAX_KEYS_PER_DELETE_S3: usize = 1000;
/// As defined in Azure docs
///
/// <https://learn.microsoft.com/en-us/rest/api/storageservices/blob-batch>
pub const MAX_KEYS_PER_DELETE_AZURE: usize = 256;
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
@@ -340,6 +347,14 @@ pub trait RemoteStorage: Send + Sync + 'static {
cancel: &CancellationToken,
) -> anyhow::Result<()>;
/// Returns the maximum number of keys that a call to [`Self::delete_objects`] can delete without chunking
///
/// The value returned is only an optimization hint, One can pass larger number of objects to
/// `delete_objects` as well.
///
/// The value is guaranteed to be >= 1.
fn max_keys_per_delete(&self) -> usize;
/// Deletes all objects matching the given prefix.
///
/// NB: this uses NoDelimiter and will match partial prefixes. For example, the prefix /a/b will
@@ -533,6 +548,16 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
}
}
/// [`RemoteStorage::max_keys_per_delete`]
pub fn max_keys_per_delete(&self) -> usize {
match self {
Self::LocalFs(s) => s.max_keys_per_delete(),
Self::AwsS3(s) => s.max_keys_per_delete(),
Self::AzureBlob(s) => s.max_keys_per_delete(),
Self::Unreliable(s) => s.max_keys_per_delete(),
}
}
/// See [`RemoteStorage::delete_prefix`]
pub async fn delete_prefix(
&self,

View File

@@ -573,6 +573,10 @@ impl RemoteStorage for LocalFs {
Ok(())
}
fn max_keys_per_delete(&self) -> usize {
super::MAX_KEYS_PER_DELETE_S3
}
async fn copy(
&self,
from: &RemotePath,

View File

@@ -48,7 +48,7 @@ use crate::{
metrics::{start_counting_cancelled_wait, start_measuring_requests},
support::PermitCarrying,
ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject,
RemotePath, RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE,
RemotePath, RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE_S3,
REMOTE_STORAGE_PREFIX_SEPARATOR,
};
@@ -355,7 +355,7 @@ impl S3Bucket {
let kind = RequestKind::Delete;
let mut cancel = std::pin::pin!(cancel.cancelled());
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE_S3) {
let started_at = start_measuring_requests(kind);
let req = self
@@ -832,6 +832,10 @@ impl RemoteStorage for S3Bucket {
self.delete_oids(&permit, &delete_objects, cancel).await
}
fn max_keys_per_delete(&self) -> usize {
MAX_KEYS_PER_DELETE_S3
}
async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
let paths = std::array::from_ref(path);
self.delete_objects(paths, cancel).await

View File

@@ -203,6 +203,10 @@ impl RemoteStorage for UnreliableWrapper {
Ok(())
}
fn max_keys_per_delete(&self) -> usize {
self.inner.max_keys_per_delete()
}
async fn copy(
&self,
from: &RemotePath,

View File

@@ -94,6 +94,8 @@ pub mod toml_edit_ext;
pub mod circuit_breaker;
pub mod try_rcu;
// Re-export used in macro. Avoids adding git-version as dep in target crates.
#[doc(hidden)]
pub use git_version;

View File

@@ -164,6 +164,12 @@ impl TenantShardId {
}
}
impl std::fmt::Display for ShardNumber {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.0.fmt(f)
}
}
impl std::fmt::Display for ShardSlug<'_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(

77
libs/utils/src/try_rcu.rs Normal file
View File

@@ -0,0 +1,77 @@
//! Try RCU extension lifted from <https://github.com/vorner/arc-swap/issues/94#issuecomment-1987154023>
pub trait ArcSwapExt<T> {
/// [`ArcSwap::rcu`](arc_swap::ArcSwap::rcu), but with Result that short-circuits on error.
fn try_rcu<R, F, E>(&self, f: F) -> Result<T, E>
where
F: FnMut(&T) -> Result<R, E>,
R: Into<T>;
}
impl<T, S> ArcSwapExt<T> for arc_swap::ArcSwapAny<T, S>
where
T: arc_swap::RefCnt,
S: arc_swap::strategy::CaS<T>,
{
fn try_rcu<R, F, E>(&self, mut f: F) -> Result<T, E>
where
F: FnMut(&T) -> Result<R, E>,
R: Into<T>,
{
fn ptr_eq<Base, A, B>(a: A, b: B) -> bool
where
A: arc_swap::AsRaw<Base>,
B: arc_swap::AsRaw<Base>,
{
let a = a.as_raw();
let b = b.as_raw();
std::ptr::eq(a, b)
}
let mut cur = self.load();
loop {
let new = f(&cur)?.into();
let prev = self.compare_and_swap(&*cur, new);
let swapped = ptr_eq(&*cur, &*prev);
if swapped {
return Ok(arc_swap::Guard::into_inner(prev));
} else {
cur = prev;
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use arc_swap::ArcSwap;
use std::sync::Arc;
#[test]
fn test_try_rcu_success() {
let swap = ArcSwap::from(Arc::new(42));
let result = swap.try_rcu(|value| -> Result<_, String> { Ok(**value + 1) });
assert!(result.is_ok());
assert_eq!(**swap.load(), 43);
}
#[test]
fn test_try_rcu_error() {
let swap = ArcSwap::from(Arc::new(42));
let result = swap.try_rcu(|value| -> Result<i32, _> {
if **value == 42 {
Err("err")
} else {
Ok(**value + 1)
}
});
assert!(result.is_err());
assert_eq!(result.unwrap_err(), "err");
assert_eq!(**swap.load(), 42);
}
}

View File

@@ -37,7 +37,7 @@ message ValueMeta {
}
message CompactKey {
int64 high = 1;
int64 low = 2;
uint64 high = 1;
uint64 low = 2;
}

View File

@@ -236,8 +236,8 @@ impl From<ValueMeta> for proto::ValueMeta {
impl From<CompactKey> for proto::CompactKey {
fn from(value: CompactKey) -> Self {
proto::CompactKey {
high: (value.raw() >> 64) as i64,
low: value.raw() as i64,
high: (value.raw() >> 64) as u64,
low: value.raw() as u64,
}
}
}
@@ -354,3 +354,64 @@ impl From<proto::CompactKey> for CompactKey {
(((value.high as i128) << 64) | (value.low as i128)).into()
}
}
#[test]
fn test_compact_key_with_large_relnode() {
use pageserver_api::key::Key;
let inputs = vec![
Key {
field1: 0,
field2: 0x100,
field3: 0x200,
field4: 0,
field5: 0x10,
field6: 0x5,
},
Key {
field1: 0,
field2: 0x100,
field3: 0x200,
field4: 0x007FFFFF,
field5: 0x10,
field6: 0x5,
},
Key {
field1: 0,
field2: 0x100,
field3: 0x200,
field4: 0x00800000,
field5: 0x10,
field6: 0x5,
},
Key {
field1: 0,
field2: 0x100,
field3: 0x200,
field4: 0x00800001,
field5: 0x10,
field6: 0x5,
},
Key {
field1: 0,
field2: 0xFFFFFFFF,
field3: 0xFFFFFFFF,
field4: 0xFFFFFFFF,
field5: 0x0,
field6: 0x0,
},
];
for input in inputs {
assert!(input.is_valid_key_on_write_path());
let compact = input.to_compact();
let proto: proto::CompactKey = compact.into();
let from_proto: CompactKey = proto.into();
assert_eq!(
compact, from_proto,
"Round trip failed for key with relnode={:#x}",
input.field4
);
}
}

View File

@@ -30,9 +30,9 @@ fn main() -> anyhow::Result<()> {
let pgxn_neon = std::fs::canonicalize(pgxn_neon)?;
let pgxn_neon = pgxn_neon.to_str().ok_or(anyhow!("Bad non-UTF path"))?;
println!("cargo:rustc-link-lib=static=walproposer");
println!("cargo:rustc-link-lib=static=pgport");
println!("cargo:rustc-link-lib=static=pgcommon");
println!("cargo:rustc-link-lib=static=walproposer");
println!("cargo:rustc-link-search={walproposer_lib_search_str}");
// Rebuild crate when libwalproposer.a changes

View File

@@ -270,12 +270,18 @@ impl Client {
Ok(body)
}
pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
pub async fn set_tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
self.request(Method::PUT, &uri, req).await?;
Ok(())
}
pub async fn patch_tenant_config(&self, req: &TenantConfigPatchRequest) -> Result<()> {
let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
self.request(Method::PATCH, &uri, req).await?;
Ok(())
}
pub async fn tenant_secondary_download(
&self,
tenant_id: TenantShardId,

View File

@@ -64,7 +64,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
println!("operating on timeline {}", timeline);
mgmt_api_client
.tenant_config(&TenantConfigRequest {
.set_tenant_config(&TenantConfigRequest {
tenant_id: timeline.tenant_id,
config: TenantConfig::default(),
})

View File

@@ -9,7 +9,6 @@
use remote_storage::GenericRemoteStorage;
use remote_storage::RemotePath;
use remote_storage::TimeoutOrCancel;
use remote_storage::MAX_KEYS_PER_DELETE;
use std::time::Duration;
use tokio_util::sync::CancellationToken;
use tracing::info;
@@ -131,7 +130,8 @@ impl Deleter {
}
pub(super) async fn background(&mut self) -> Result<(), DeletionQueueError> {
self.accumulator.reserve(MAX_KEYS_PER_DELETE);
let max_keys_per_delete = self.remote_storage.max_keys_per_delete();
self.accumulator.reserve(max_keys_per_delete);
loop {
if self.cancel.is_cancelled() {
@@ -156,14 +156,14 @@ impl Deleter {
match msg {
DeleterMessage::Delete(mut list) => {
while !list.is_empty() || self.accumulator.len() == MAX_KEYS_PER_DELETE {
if self.accumulator.len() == MAX_KEYS_PER_DELETE {
while !list.is_empty() || self.accumulator.len() == max_keys_per_delete {
if self.accumulator.len() == max_keys_per_delete {
self.flush().await?;
// If we have received this number of keys, proceed with attempting to execute
assert_eq!(self.accumulator.len(), 0);
}
let available_slots = MAX_KEYS_PER_DELETE - self.accumulator.len();
let available_slots = max_keys_per_delete - self.accumulator.len();
let take_count = std::cmp::min(available_slots, list.len());
for path in list.drain(list.len() - take_count..) {
self.accumulator.push(path);

View File

@@ -767,7 +767,27 @@ paths:
/v1/tenant/config:
put:
description: |
Update tenant's config.
Update tenant's config by setting it to the provided value
Invalid fields in the tenant config will cause the request to be rejected with status 400.
requestBody:
content:
application/json:
schema:
$ref: "#/components/schemas/TenantConfigRequest"
responses:
"200":
description: OK
content:
application/json:
schema:
type: array
items:
$ref: "#/components/schemas/TenantInfo"
patch:
description: |
Update tenant's config additively by patching the updated fields provided.
Null values unset the field and non-null values upsert it.
Invalid fields in the tenant config will cause the request to be rejected with status 400.
requestBody:

View File

@@ -28,6 +28,7 @@ use pageserver_api::models::LsnLease;
use pageserver_api::models::LsnLeaseRequest;
use pageserver_api::models::OffloadedTimelineInfo;
use pageserver_api::models::ShardParameters;
use pageserver_api::models::TenantConfigPatchRequest;
use pageserver_api::models::TenantDetails;
use pageserver_api::models::TenantLocationConfigRequest;
use pageserver_api::models::TenantLocationConfigResponse;
@@ -87,7 +88,7 @@ use crate::tenant::timeline::offload::offload_timeline;
use crate::tenant::timeline::offload::OffloadError;
use crate::tenant::timeline::CompactFlags;
use crate::tenant::timeline::CompactOptions;
use crate::tenant::timeline::CompactRange;
use crate::tenant::timeline::CompactRequest;
use crate::tenant::timeline::CompactionError;
use crate::tenant::timeline::Timeline;
use crate::tenant::GetTimelineError;
@@ -1695,7 +1696,47 @@ async fn update_tenant_config_handler(
crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
.await
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
tenant.set_new_tenant_config(new_tenant_conf);
let _ = tenant
.update_tenant_config(|_crnt| Ok(new_tenant_conf.clone()))
.expect("Closure returns Ok()");
json_response(StatusCode::OK, ())
}
async fn patch_tenant_config_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let request_data: TenantConfigPatchRequest = json_request(&mut request).await?;
let tenant_id = request_data.tenant_id;
check_permission(&request, Some(tenant_id))?;
let state = get_state(&request);
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let updated = tenant
.update_tenant_config(|crnt| crnt.apply_patch(request_data.config.clone()))
.map_err(ApiError::BadRequest)?;
// This is a legacy API that only operates on attached tenants: the preferred
// API to use is the location_config/ endpoint, which lets the caller provide
// the full LocationConf.
let location_conf = LocationConf::attached_single(
updated,
tenant.get_generation(),
&ShardParameters::default(),
);
crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
.await
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
json_response(StatusCode::OK, ())
}
@@ -1978,6 +2019,26 @@ async fn timeline_gc_handler(
json_response(StatusCode::OK, gc_result)
}
// Cancel scheduled compaction tasks
async fn timeline_cancel_compact_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
async {
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.cancel_scheduled_compaction(timeline_id);
json_response(StatusCode::OK, ())
}
.instrument(info_span!("timeline_cancel_compact", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
.await
}
// Run compaction immediately on given timeline.
async fn timeline_compact_handler(
mut request: Request<Body>,
@@ -1987,7 +2048,7 @@ async fn timeline_compact_handler(
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let compact_range = json_request_maybe::<Option<CompactRange>>(&mut request).await?;
let compact_request = json_request_maybe::<Option<CompactRequest>>(&mut request).await?;
let state = get_state(&request);
@@ -2012,22 +2073,50 @@ async fn timeline_compact_handler(
let wait_until_uploaded =
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
let wait_until_scheduled_compaction_done =
parse_query_param::<_, bool>(&request, "wait_until_scheduled_compaction_done")?
.unwrap_or(false);
let sub_compaction = compact_request
.as_ref()
.map(|r| r.sub_compaction)
.unwrap_or(false);
let options = CompactOptions {
compact_range,
compact_range: compact_request
.as_ref()
.and_then(|r| r.compact_range.clone()),
compact_below_lsn: compact_request.as_ref().and_then(|r| r.compact_below_lsn),
flags,
sub_compaction,
};
let scheduled = compact_request
.as_ref()
.map(|r| r.scheduled)
.unwrap_or(false);
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
timeline
.compact_with_options(&cancel, options, &ctx)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
if wait_until_uploaded {
timeline.remote_client.wait_completion().await
// XXX map to correct ApiError for the cases where it's due to shutdown
.context("wait completion").map_err(ApiError::InternalServerError)?;
if scheduled {
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
let rx = tenant.schedule_compaction(timeline_id, options).await.map_err(ApiError::InternalServerError)?;
if wait_until_scheduled_compaction_done {
// It is possible that this will take a long time, dropping the HTTP request will not cancel the compaction.
rx.await.ok();
}
} else {
timeline
.compact_with_options(&cancel, options, &ctx)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
if wait_until_uploaded {
timeline.remote_client.wait_completion().await
// XXX map to correct ApiError for the cases where it's due to shutdown
.context("wait completion").map_err(ApiError::InternalServerError)?;
}
}
json_response(StatusCode::OK, ())
}
@@ -2108,16 +2197,20 @@ async fn timeline_checkpoint_handler(
// By default, checkpoints come with a compaction, but this may be optionally disabled by tests that just want to flush + upload.
let compact = parse_query_param::<_, bool>(&request, "compact")?.unwrap_or(true);
let wait_until_flushed: bool =
parse_query_param(&request, "wait_until_flushed")?.unwrap_or(true);
let wait_until_uploaded =
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
timeline
.freeze_and_flush()
.await
.map_err(|e| {
if wait_until_flushed {
timeline.freeze_and_flush().await
} else {
timeline.freeze().await.and(Ok(()))
}.map_err(|e| {
match e {
tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
other => ApiError::InternalServerError(other.into()),
@@ -3236,6 +3329,9 @@ pub fn make_router(
.get("/v1/tenant/:tenant_shard_id/synthetic_size", |r| {
api_handler(r, tenant_size_handler)
})
.patch("/v1/tenant/config", |r| {
api_handler(r, patch_tenant_config_handler)
})
.put("/v1/tenant/config", |r| {
api_handler(r, update_tenant_config_handler)
})
@@ -3301,6 +3397,10 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
|r| api_handler(r, timeline_compact_handler),
)
.delete(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
|r| api_handler(r, timeline_cancel_compact_handler),
)
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/offload",
|r| testing_api_handler("attempt timeline offload", r, timeline_offload_handler),

View File

@@ -16,7 +16,6 @@ use postgres_backend::{is_expected_io_error, QueryError};
use pq_proto::framed::ConnectionError;
use strum::{EnumCount, VariantNames};
use strum_macros::{IntoStaticStr, VariantNames};
use tracing::warn;
use utils::id::TimelineId;
/// Prometheus histogram buckets (in seconds) for operations in the critical
@@ -464,6 +463,24 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
static DISK_CONSISTENT_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_disk_consistent_lsn",
"Disk consistent LSN grouped by timeline",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
pub(crate) static PROJECTED_REMOTE_CONSISTENT_LSN: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_projected_remote_consistent_lsn",
"Projected remote consistent LSN grouped by timeline",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_pitr_history_size",
@@ -1205,54 +1222,163 @@ pub(crate) mod virtual_file_io_engine {
});
}
pub(crate) struct SmgrOpTimer {
global_latency_histo: Histogram,
pub(crate) struct SmgrOpTimer(Option<SmgrOpTimerInner>);
pub(crate) struct SmgrOpTimerInner {
global_execution_latency_histo: Histogram,
per_timeline_execution_latency_histo: Option<Histogram>,
// Optional because not all op types are tracked per-timeline
per_timeline_latency_histo: Option<Histogram>,
global_batch_wait_time: Histogram,
per_timeline_batch_wait_time: Histogram,
start: Instant,
throttled: Duration,
op: SmgrQueryType,
global_flush_in_progress_micros: IntCounter,
per_timeline_flush_in_progress_micros: IntCounter,
timings: SmgrOpTimerState,
}
#[derive(Debug)]
enum SmgrOpTimerState {
Received {
received_at: Instant,
},
ThrottleDoneExecutionStarting {
received_at: Instant,
throttle_started_at: Instant,
started_execution_at: Instant,
},
}
pub(crate) struct SmgrOpFlushInProgress {
flush_started_at: Instant,
global_micros: IntCounter,
per_timeline_micros: IntCounter,
}
impl SmgrOpTimer {
pub(crate) fn deduct_throttle(&mut self, throttle: &Option<Duration>) {
let Some(throttle) = throttle else {
return;
};
self.throttled += *throttle;
pub(crate) fn observe_throttle_done_execution_starting(&mut self, throttle: &ThrottleResult) {
let inner = self.0.as_mut().expect("other public methods consume self");
match (&mut inner.timings, throttle) {
(SmgrOpTimerState::Received { received_at }, throttle) => match throttle {
ThrottleResult::NotThrottled { start } => {
inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting {
received_at: *received_at,
throttle_started_at: *start,
started_execution_at: *start,
};
}
ThrottleResult::Throttled { start, end } => {
inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting {
received_at: *start,
throttle_started_at: *start,
started_execution_at: *end,
};
}
},
(x, _) => panic!("called in unexpected state: {x:?}"),
}
}
pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> SmgrOpFlushInProgress {
let (flush_start, inner) = self
.smgr_op_end()
.expect("this method consume self, and the only other caller is drop handler");
let SmgrOpTimerInner {
global_flush_in_progress_micros,
per_timeline_flush_in_progress_micros,
..
} = inner;
SmgrOpFlushInProgress {
flush_started_at: flush_start,
global_micros: global_flush_in_progress_micros,
per_timeline_micros: per_timeline_flush_in_progress_micros,
}
}
/// Returns `None`` if this method has already been called, `Some` otherwise.
fn smgr_op_end(&mut self) -> Option<(Instant, SmgrOpTimerInner)> {
let inner = self.0.take()?;
let now = Instant::now();
let batch;
let execution;
let throttle;
match inner.timings {
SmgrOpTimerState::Received { received_at } => {
batch = (now - received_at).as_secs_f64();
// TODO: use label for dropped requests.
// This is quite rare in practice, only during tenant/pageservers shutdown.
throttle = Duration::ZERO;
execution = Duration::ZERO.as_secs_f64();
}
SmgrOpTimerState::ThrottleDoneExecutionStarting {
received_at,
throttle_started_at,
started_execution_at,
} => {
batch = (throttle_started_at - received_at).as_secs_f64();
throttle = started_execution_at - throttle_started_at;
execution = (now - started_execution_at).as_secs_f64();
}
}
// update time spent in batching
inner.global_batch_wait_time.observe(batch);
inner.per_timeline_batch_wait_time.observe(batch);
// time spent in throttle metric is updated by throttle impl
let _ = throttle;
// update metrics for execution latency
inner.global_execution_latency_histo.observe(execution);
if let Some(per_timeline_execution_latency_histo) =
&inner.per_timeline_execution_latency_histo
{
per_timeline_execution_latency_histo.observe(execution);
}
Some((now, inner))
}
}
impl Drop for SmgrOpTimer {
fn drop(&mut self) {
let elapsed = self.start.elapsed();
self.smgr_op_end();
}
}
let elapsed = match elapsed.checked_sub(self.throttled) {
Some(elapsed) => elapsed,
None => {
use utils::rate_limit::RateLimit;
static LOGGED: Lazy<Mutex<enum_map::EnumMap<SmgrQueryType, RateLimit>>> =
Lazy::new(|| {
Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| {
RateLimit::new(Duration::from_secs(10))
})))
});
let mut guard = LOGGED.lock().unwrap();
let rate_limit = &mut guard[self.op];
rate_limit.call(|| {
warn!(op=?self.op, ?elapsed, ?self.throttled, "implementation error: time spent throttled exceeds total request wall clock time");
});
elapsed // un-throttled time, more info than just saturating to 0
impl SmgrOpFlushInProgress {
pub(crate) async fn measure<Fut, O>(mut self, mut fut: Fut) -> O
where
Fut: std::future::Future<Output = O>,
{
let mut fut = std::pin::pin!(fut);
let now = Instant::now();
// Whenever observe_guard gets called, or dropped,
// it adds the time elapsed since its last call to metrics.
// Last call is tracked in `now`.
let mut observe_guard = scopeguard::guard(
|| {
let elapsed = now - self.flush_started_at;
self.global_micros
.inc_by(u64::try_from(elapsed.as_micros()).unwrap());
self.per_timeline_micros
.inc_by(u64::try_from(elapsed.as_micros()).unwrap());
self.flush_started_at = now;
},
|mut observe| {
observe();
},
);
loop {
match tokio::time::timeout(Duration::from_secs(10), &mut fut).await {
Ok(v) => return v,
Err(_timeout) => {
(*observe_guard)();
}
}
};
let elapsed = elapsed.as_secs_f64();
self.global_latency_histo.observe(elapsed);
if let Some(per_timeline_getpage_histo) = &self.per_timeline_latency_histo {
per_timeline_getpage_histo.observe(elapsed);
}
}
}
@@ -1284,6 +1410,10 @@ pub(crate) struct SmgrQueryTimePerTimeline {
per_timeline_getpage_latency: Histogram,
global_batch_size: Histogram,
per_timeline_batch_size: Histogram,
global_flush_in_progress_micros: IntCounter,
per_timeline_flush_in_progress_micros: IntCounter,
global_batch_wait_time: Histogram,
per_timeline_batch_wait_time: Histogram,
}
static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
@@ -1306,12 +1436,15 @@ static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|
.expect("failed to define a metric")
});
// Alias so all histograms recording per-timeline smgr timings use the same buckets.
static SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS: &[f64] = CRITICAL_OP_BUCKETS;
static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_smgr_query_seconds",
"Time spent on smgr query handling, aggegated by query type and tenant/timeline.",
"Time spent _executing_ smgr query handling, excluding batch and throttle delays.",
&["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
CRITICAL_OP_BUCKETS.into(),
SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(),
)
.expect("failed to define a metric")
});
@@ -1369,7 +1502,7 @@ static SMGR_QUERY_TIME_GLOBAL_BUCKETS: Lazy<Vec<f64>> = Lazy::new(|| {
static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_smgr_query_seconds_global",
"Time spent on smgr query handling, aggregated by query type.",
"Like pageserver_smgr_query_seconds, but aggregated to instance level.",
&["smgr_query_type"],
SMGR_QUERY_TIME_GLOBAL_BUCKETS.clone(),
)
@@ -1446,6 +1579,45 @@ fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
.set(value.try_into().unwrap());
}
static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_page_service_pagestream_flush_in_progress_micros",
"Counter that sums up the microseconds that a pagestream response was being flushed into the TCP connection. \
If the flush is particularly slow, this counter will be updated periodically to make slow flushes \
easily discoverable in monitoring. \
Hence, this is NOT a completion latency historgram.",
&["tenant_id", "shard_id", "timeline_id"],
)
.expect("failed to define a metric")
});
static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_page_service_pagestream_flush_in_progress_micros_global",
"Like pageserver_page_service_pagestream_flush_in_progress_seconds, but instance-wide.",
)
.expect("failed to define a metric")
});
static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_page_service_pagestream_batch_wait_time_seconds",
"Time a request spent waiting in its batch until the batch moved to throttle&execution.",
&["tenant_id", "shard_id", "timeline_id"],
SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(),
)
.expect("failed to define a metric")
});
static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_page_service_pagestream_batch_wait_time_seconds_global",
"Like pageserver_page_service_pagestream_batch_wait_time_seconds, but aggregated to instance level.",
SMGR_QUERY_TIME_GLOBAL_BUCKETS.to_vec(),
)
.expect("failed to define a metric")
});
impl SmgrQueryTimePerTimeline {
pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
let tenant_id = tenant_shard_id.tenant_id.to_string();
@@ -1486,6 +1658,17 @@ impl SmgrQueryTimePerTimeline {
.get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
.unwrap();
let global_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL.clone();
let per_timeline_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME
.get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
.unwrap();
let global_flush_in_progress_micros =
PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone();
let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS
.get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
.unwrap();
Self {
global_started,
global_latency,
@@ -1493,9 +1676,13 @@ impl SmgrQueryTimePerTimeline {
per_timeline_getpage_started,
global_batch_size,
per_timeline_batch_size,
global_flush_in_progress_micros,
per_timeline_flush_in_progress_micros,
global_batch_wait_time,
per_timeline_batch_wait_time,
}
}
pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, started_at: Instant) -> SmgrOpTimer {
pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, received_at: Instant) -> SmgrOpTimer {
self.global_started[op as usize].inc();
let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) {
@@ -1505,13 +1692,17 @@ impl SmgrQueryTimePerTimeline {
None
};
SmgrOpTimer {
global_latency_histo: self.global_latency[op as usize].clone(),
per_timeline_latency_histo,
start: started_at,
op,
throttled: Duration::ZERO,
}
SmgrOpTimer(Some(SmgrOpTimerInner {
global_execution_latency_histo: self.global_latency[op as usize].clone(),
per_timeline_execution_latency_histo: per_timeline_latency_histo,
timings: SmgrOpTimerState::Received { received_at },
global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(),
per_timeline_flush_in_progress_micros: self
.per_timeline_flush_in_progress_micros
.clone(),
global_batch_wait_time: self.global_batch_wait_time.clone(),
per_timeline_batch_wait_time: self.per_timeline_batch_wait_time.clone(),
}))
}
pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) {
@@ -2186,6 +2377,15 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
.expect("failed to define a metric"),
});
pub(crate) static PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_timeline_wal_records_received",
"Number of WAL records received per shard",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_seconds",
@@ -2394,7 +2594,8 @@ pub(crate) struct TimelineMetrics {
pub load_layer_map_histo: StorageTimeMetrics,
pub garbage_collect_histo: StorageTimeMetrics,
pub find_gc_cutoffs_histo: StorageTimeMetrics,
pub last_record_gauge: IntGauge,
pub last_record_lsn_gauge: IntGauge,
pub disk_consistent_lsn_gauge: IntGauge,
pub pitr_history_size: UIntGauge,
pub archival_size: UIntGauge,
pub(crate) layer_size_image: UIntGauge,
@@ -2412,6 +2613,7 @@ pub(crate) struct TimelineMetrics {
pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
/// Number of valid LSN leases.
pub valid_lsn_lease_count_gauge: UIntGauge,
pub wal_records_received: IntCounter,
shutdown: std::sync::atomic::AtomicBool,
}
@@ -2475,7 +2677,11 @@ impl TimelineMetrics {
&shard_id,
&timeline_id,
);
let last_record_gauge = LAST_RECORD_LSN
let last_record_lsn_gauge = LAST_RECORD_LSN
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let disk_consistent_lsn_gauge = DISK_CONSISTENT_LSN
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
@@ -2565,6 +2771,10 @@ impl TimelineMetrics {
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let wal_records_received = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
TimelineMetrics {
tenant_id,
shard_id,
@@ -2578,7 +2788,8 @@ impl TimelineMetrics {
garbage_collect_histo,
find_gc_cutoffs_histo,
load_layer_map_histo,
last_record_gauge,
last_record_lsn_gauge,
disk_consistent_lsn_gauge,
pitr_history_size,
archival_size,
layer_size_image,
@@ -2596,6 +2807,7 @@ impl TimelineMetrics {
evictions_with_low_residence_duration,
),
valid_lsn_lease_count_gauge,
wal_records_received,
shutdown: std::sync::atomic::AtomicBool::default(),
}
}
@@ -2642,6 +2854,7 @@ impl TimelineMetrics {
let timeline_id = &self.timeline_id;
let shard_id = &self.shard_id;
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
{
@@ -2732,6 +2945,21 @@ impl TimelineMetrics {
shard_id,
timeline_id,
]);
let _ = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
]);
let _ = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
]);
let _ = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
]);
}
}
@@ -2762,6 +2990,7 @@ use crate::context::{PageContentKind, RequestContext};
use crate::task_mgr::TaskKind;
use crate::tenant::mgr::TenantSlot;
use crate::tenant::tasks::BackgroundLoopKind;
use crate::tenant::throttle::ThrottleResult;
use crate::tenant::Timeline;
/// Maintain a per timeline gauge in addition to the global gauge.
@@ -2805,6 +3034,7 @@ pub(crate) struct RemoteTimelineClientMetrics {
calls: Mutex<HashMap<(&'static str, &'static str), IntCounterPair>>,
bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
pub(crate) projected_remote_consistent_lsn_gauge: UIntGauge,
}
impl RemoteTimelineClientMetrics {
@@ -2819,6 +3049,10 @@ impl RemoteTimelineClientMetrics {
.unwrap(),
);
let projected_remote_consistent_lsn_gauge = PROJECTED_REMOTE_CONSISTENT_LSN
.get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str])
.unwrap();
RemoteTimelineClientMetrics {
tenant_id: tenant_id_str,
shard_id: shard_id_str,
@@ -2827,6 +3061,7 @@ impl RemoteTimelineClientMetrics {
bytes_started_counter: Mutex::new(HashMap::default()),
bytes_finished_counter: Mutex::new(HashMap::default()),
remote_physical_size_gauge,
projected_remote_consistent_lsn_gauge,
}
}
@@ -3040,6 +3275,7 @@ impl Drop for RemoteTimelineClientMetrics {
calls,
bytes_started_counter,
bytes_finished_counter,
projected_remote_consistent_lsn_gauge,
} = self;
for ((a, b), _) in calls.get_mut().unwrap().drain() {
let mut res = [Ok(()), Ok(())];
@@ -3069,6 +3305,14 @@ impl Drop for RemoteTimelineClientMetrics {
let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above
let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
}
{
let _ = projected_remote_consistent_lsn_gauge;
let _ = PROJECTED_REMOTE_CONSISTENT_LSN.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
]);
}
}
}
@@ -3601,6 +3845,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
&REMOTE_ONDEMAND_DOWNLOADED_BYTES,
&CIRCUIT_BREAKERS_BROKEN,
&CIRCUIT_BREAKERS_UNBROKEN,
&PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL,
]
.into_iter()
.for_each(|c| {
@@ -3648,6 +3893,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
&WAL_REDO_BYTES_HISTOGRAM,
&WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
&PAGE_SERVICE_BATCH_SIZE_GLOBAL,
&PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL,
]
.into_iter()
.for_each(|h| {

View File

@@ -575,7 +575,10 @@ enum BatchedFeMessage {
}
impl BatchedFeMessage {
async fn throttle(&mut self, cancel: &CancellationToken) -> Result<(), QueryError> {
async fn throttle_and_record_start_processing(
&mut self,
cancel: &CancellationToken,
) -> Result<(), QueryError> {
let (shard, tokens, timers) = match self {
BatchedFeMessage::Exists { shard, timer, .. }
| BatchedFeMessage::Nblocks { shard, timer, .. }
@@ -603,7 +606,7 @@ impl BatchedFeMessage {
}
};
for timer in timers {
timer.deduct_throttle(&throttled);
timer.observe_throttle_done_execution_starting(&throttled);
}
Ok(())
}
@@ -1017,10 +1020,8 @@ impl PageServerHandler {
// Map handler result to protocol behavior.
// Some handler errors cause exit from pagestream protocol.
// Other handler errors are sent back as an error message and we stay in pagestream protocol.
let mut timers: smallvec::SmallVec<[_; 1]> =
smallvec::SmallVec::with_capacity(handler_results.len());
for handler_result in handler_results {
let response_msg = match handler_result {
let (response_msg, timer) = match handler_result {
Err(e) => match &e {
PageStreamError::Shutdown => {
// If we fail to fulfil a request during shutdown, which may be _because_ of
@@ -1044,34 +1045,66 @@ impl PageServerHandler {
span.in_scope(|| {
error!("error reading relation or page version: {full:#}")
});
PagestreamBeMessage::Error(PagestreamErrorResponse {
message: e.to_string(),
})
(
PagestreamBeMessage::Error(PagestreamErrorResponse {
message: e.to_string(),
}),
None, // TODO: measure errors
)
}
},
Ok((response_msg, timer)) => {
// Extending the lifetime of the timers so observations on drop
// include the flush time.
timers.push(timer);
response_msg
}
Ok((response_msg, timer)) => (response_msg, Some(timer)),
};
//
// marshal & transmit response message
//
pgb_writer.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
}
tokio::select! {
biased;
_ = cancel.cancelled() => {
// We were requested to shut down.
info!("shutdown request received in page handler");
return Err(QueryError::Shutdown)
}
res = pgb_writer.flush() => {
res?;
// We purposefully don't count flush time into the timer.
//
// The reason is that current compute client will not perform protocol processing
// if the postgres backend process is doing things other than `->smgr_read()`.
// This is especially the case for prefetch.
//
// If the compute doesn't read from the connection, eventually TCP will backpressure
// all the way into our flush call below.
//
// The timer's underlying metric is used for a storage-internal latency SLO and
// we don't want to include latency in it that we can't control.
// And as pointed out above, in this case, we don't control the time that flush will take.
let flushing_timer =
timer.map(|timer| timer.observe_smgr_op_completion_and_start_flushing());
// what we want to do
let flush_fut = pgb_writer.flush();
// metric for how long flushing takes
let flush_fut = match flushing_timer {
Some(flushing_timer) => {
futures::future::Either::Left(flushing_timer.measure(flush_fut))
}
None => futures::future::Either::Right(flush_fut),
};
// do it while respecting cancellation
let _: () = async move {
tokio::select! {
biased;
_ = cancel.cancelled() => {
// We were requested to shut down.
info!("shutdown request received in page handler");
return Err(QueryError::Shutdown)
}
res = flush_fut => {
res?;
}
}
Ok(())
}
// and log the info! line inside the request span
.instrument(span.clone())
.await?;
}
drop(timers);
Ok(())
}
@@ -1200,7 +1233,7 @@ impl PageServerHandler {
}
};
if let Err(cancelled) = msg.throttle(&self.cancel).await {
if let Err(cancelled) = msg.throttle_and_record_start_processing(&self.cancel).await {
break cancelled;
}
@@ -1367,7 +1400,9 @@ impl PageServerHandler {
return Err(e);
}
};
batch.throttle(&self.cancel).await?;
batch
.throttle_and_record_start_processing(&self.cancel)
.await?;
self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx)
.await?;
}

View File

@@ -37,14 +37,19 @@ use remote_timeline_client::manifest::{
};
use remote_timeline_client::UploadQueueNotReadyError;
use std::collections::BTreeMap;
use std::collections::VecDeque;
use std::fmt;
use std::future::Future;
use std::sync::atomic::AtomicBool;
use std::sync::Weak;
use std::time::SystemTime;
use storage_broker::BrokerClientChannel;
use timeline::compaction::ScheduledCompactionTask;
use timeline::import_pgdata;
use timeline::offload::offload_timeline;
use timeline::CompactFlags;
use timeline::CompactOptions;
use timeline::CompactionError;
use timeline::ShutdownMode;
use tokio::io::BufReader;
use tokio::sync::watch;
@@ -63,6 +68,7 @@ use utils::sync::gate::Gate;
use utils::sync::gate::GateGuard;
use utils::timeout::timeout_cancellable;
use utils::timeout::TimeoutCancellableError;
use utils::try_rcu::ArcSwapExt;
use utils::zstd::create_zst_tarball;
use utils::zstd::extract_zst_tarball;
@@ -339,6 +345,11 @@ pub struct Tenant {
/// Overhead of mutex is acceptable because compaction is done with a multi-second period.
compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,
/// Scheduled compaction tasks. Currently, this can only be populated by triggering
/// a manual gc-compaction from the manual compaction API.
scheduled_compaction_tasks:
std::sync::Mutex<HashMap<TimelineId, VecDeque<ScheduledCompactionTask>>>,
/// If the tenant is in Activating state, notify this to encourage it
/// to proceed to Active as soon as possible, rather than waiting for lazy
/// background warmup.
@@ -2953,27 +2964,109 @@ impl Tenant {
for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload
{
// pending_task_left == None: cannot compact, maybe still pending tasks
// pending_task_left == Some(true): compaction task left
// pending_task_left == Some(false): no compaction task left
let pending_task_left = if *can_compact {
Some(
timeline
.compact(cancel, EnumSet::empty(), ctx)
.instrument(info_span!("compact_timeline", %timeline_id))
.await
.inspect_err(|e| match e {
timeline::CompactionError::ShuttingDown => (),
timeline::CompactionError::Offload(_) => {
// Failures to offload timelines do not trip the circuit breaker, because
// they do not do lots of writes the way compaction itself does: it is cheap
// to retry, and it would be bad to stop all compaction because of an issue with offloading.
let has_pending_l0_compaction_task = timeline
.compact(cancel, EnumSet::empty(), ctx)
.instrument(info_span!("compact_timeline", %timeline_id))
.await
.inspect_err(|e| match e {
timeline::CompactionError::ShuttingDown => (),
timeline::CompactionError::Offload(_) => {
// Failures to offload timelines do not trip the circuit breaker, because
// they do not do lots of writes the way compaction itself does: it is cheap
// to retry, and it would be bad to stop all compaction because of an issue with offloading.
}
timeline::CompactionError::Other(e) => {
self.compaction_circuit_breaker
.lock()
.unwrap()
.fail(&CIRCUIT_BREAKERS_BROKEN, e);
}
})?;
if has_pending_l0_compaction_task {
Some(true)
} else {
let mut has_pending_scheduled_compaction_task;
let next_scheduled_compaction_task = {
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
if let Some(tline_pending_tasks) = guard.get_mut(timeline_id) {
if !tline_pending_tasks.is_empty() {
info!(
"{} tasks left in the compaction schedule queue",
tline_pending_tasks.len()
);
}
timeline::CompactionError::Other(e) => {
self.compaction_circuit_breaker
.lock()
.unwrap()
.fail(&CIRCUIT_BREAKERS_BROKEN, e);
let next_task = tline_pending_tasks.pop_front();
has_pending_scheduled_compaction_task = !tline_pending_tasks.is_empty();
next_task
} else {
has_pending_scheduled_compaction_task = false;
None
}
};
if let Some(mut next_scheduled_compaction_task) = next_scheduled_compaction_task
{
if !next_scheduled_compaction_task
.options
.flags
.contains(CompactFlags::EnhancedGcBottomMostCompaction)
{
warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options);
} else if next_scheduled_compaction_task.options.sub_compaction {
info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
let jobs = timeline
.gc_compaction_split_jobs(next_scheduled_compaction_task.options)
.await
.map_err(CompactionError::Other)?;
if jobs.is_empty() {
info!("no jobs to run, skipping scheduled compaction task");
} else {
has_pending_scheduled_compaction_task = true;
let jobs_len = jobs.len();
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
let tline_pending_tasks = guard.entry(*timeline_id).or_default();
for (idx, job) in jobs.into_iter().enumerate() {
tline_pending_tasks.push_back(if idx == jobs_len - 1 {
ScheduledCompactionTask {
options: job,
// The last job in the queue sends the signal and releases the gc guard
result_tx: next_scheduled_compaction_task
.result_tx
.take(),
gc_block: next_scheduled_compaction_task
.gc_block
.take(),
}
} else {
ScheduledCompactionTask {
options: job,
result_tx: None,
gc_block: None,
}
});
}
info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len);
}
})?,
)
} else {
let _ = timeline
.compact_with_options(
cancel,
next_scheduled_compaction_task.options,
ctx,
)
.instrument(info_span!("scheduled_compact_timeline", %timeline_id))
.await?;
if let Some(tx) = next_scheduled_compaction_task.result_tx.take() {
// TODO: we can send compaction statistics in the future
tx.send(()).ok();
}
}
}
Some(has_pending_scheduled_compaction_task)
}
} else {
None
};
@@ -2993,6 +3086,43 @@ impl Tenant {
Ok(has_pending_task)
}
/// Cancel scheduled compaction tasks
pub(crate) fn cancel_scheduled_compaction(
&self,
timeline_id: TimelineId,
) -> Vec<ScheduledCompactionTask> {
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
if let Some(tline_pending_tasks) = guard.get_mut(&timeline_id) {
let current_tline_pending_tasks = std::mem::take(tline_pending_tasks);
current_tline_pending_tasks.into_iter().collect()
} else {
Vec::new()
}
}
/// Schedule a compaction task for a timeline.
pub(crate) async fn schedule_compaction(
&self,
timeline_id: TimelineId,
options: CompactOptions,
) -> anyhow::Result<tokio::sync::oneshot::Receiver<()>> {
let gc_guard = match self.gc_block.start().await {
Ok(guard) => guard,
Err(e) => {
bail!("cannot run gc-compaction because gc is blocked: {}", e);
}
};
let (tx, rx) = tokio::sync::oneshot::channel();
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
let tline_pending_tasks = guard.entry(timeline_id).or_default();
tline_pending_tasks.push_back(ScheduledCompactionTask {
options,
result_tx: Some(tx),
gc_block: Some(gc_guard),
});
Ok(rx)
}
// Call through to all timelines to freeze ephemeral layers if needed. Usually
// this happens during ingest: this background housekeeping is for freezing layers
// that are open but haven't been written to for some time.
@@ -3792,25 +3922,28 @@ impl Tenant {
}
}
pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
pub fn update_tenant_config<F: Fn(TenantConfOpt) -> anyhow::Result<TenantConfOpt>>(
&self,
update: F,
) -> anyhow::Result<TenantConfOpt> {
// Use read-copy-update in order to avoid overwriting the location config
// state if this races with [`Tenant::set_new_location_config`]. Note that
// this race is not possible if both request types come from the storage
// controller (as they should!) because an exclusive op lock is required
// on the storage controller side.
self.tenant_conf.rcu(|inner| {
Arc::new(AttachedTenantConf {
tenant_conf: new_tenant_conf.clone(),
location: inner.location,
// Attached location is not changed, no need to update lsn lease deadline.
lsn_lease_deadline: inner.lsn_lease_deadline,
})
});
self.tenant_conf
.try_rcu(|attached_conf| -> Result<_, anyhow::Error> {
Ok(Arc::new(AttachedTenantConf {
tenant_conf: update(attached_conf.tenant_conf.clone())?,
location: attached_conf.location,
lsn_lease_deadline: attached_conf.lsn_lease_deadline,
}))
})?;
let updated = self.tenant_conf.load().clone();
let updated = self.tenant_conf.load();
self.tenant_conf_updated(&new_tenant_conf);
self.tenant_conf_updated(&updated.tenant_conf);
// Don't hold self.timelines.lock() during the notifies.
// There's no risk of deadlock right now, but there could be if we consolidate
// mutexes in struct Timeline in the future.
@@ -3818,6 +3951,8 @@ impl Tenant {
for timeline in timelines {
timeline.tenant_conf_updated(&updated);
}
Ok(updated.tenant_conf.clone())
}
pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
@@ -4005,6 +4140,7 @@ impl Tenant {
// use an extremely long backoff.
Some(Duration::from_secs(3600 * 24)),
)),
scheduled_compaction_tasks: Mutex::new(Default::default()),
activate_now_sem: tokio::sync::Semaphore::new(0),
attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()),
cancel: CancellationToken::default(),
@@ -4376,7 +4512,12 @@ impl Tenant {
// - this timeline was created while we were finding cutoffs
// - lsn for timestamp search fails for this timeline repeatedly
if let Some(cutoffs) = gc_cutoffs.get(&timeline.timeline_id) {
target.cutoffs = cutoffs.clone();
let original_cutoffs = target.cutoffs.clone();
// GC cutoffs should never go back
target.cutoffs = GcCutoffs {
space: Lsn(cutoffs.space.0.max(original_cutoffs.space.0)),
time: Lsn(cutoffs.time.0.max(original_cutoffs.time.0)),
}
}
}
@@ -8036,6 +8177,12 @@ mod tests {
)
.await?;
{
tline
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()
.await;
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.time = Lsn(0x30);
@@ -8138,6 +8285,12 @@ mod tests {
// increase GC horizon and compact again
{
tline
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x40))
.wait()
.await;
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.time = Lsn(0x40);
@@ -8518,6 +8671,12 @@ mod tests {
.await?
};
{
tline
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()
.await;
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
*guard = GcInfo {
@@ -8599,6 +8758,12 @@ mod tests {
// increase GC horizon and compact again
{
tline
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x40))
.wait()
.await;
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.time = Lsn(0x40);
@@ -9046,6 +9211,12 @@ mod tests {
)
.await?;
{
tline
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()
.await;
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
*guard = GcInfo {
@@ -9163,6 +9334,7 @@ mod tests {
CompactOptions {
flags: dryrun_flags,
compact_range: None,
..Default::default()
},
&ctx,
)
@@ -9187,6 +9359,12 @@ mod tests {
// increase GC horizon and compact again
{
tline
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x38))
.wait()
.await;
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.time = Lsn(0x38);
@@ -9282,6 +9460,12 @@ mod tests {
)
.await?;
{
tline
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()
.await;
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
*guard = GcInfo {
@@ -9399,6 +9583,7 @@ mod tests {
CompactOptions {
flags: dryrun_flags,
compact_range: None,
..Default::default()
},
&ctx,
)
@@ -9525,6 +9710,12 @@ mod tests {
branch_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
{
parent_tline
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x10))
.wait()
.await;
// Update GC info
let mut guard = parent_tline.gc_info.write().unwrap();
*guard = GcInfo {
@@ -9539,6 +9730,12 @@ mod tests {
}
{
branch_tline
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x50))
.wait()
.await;
// Update GC info
let mut guard = branch_tline.gc_info.write().unwrap();
*guard = GcInfo {
@@ -9868,6 +10065,12 @@ mod tests {
.await?;
{
tline
.latest_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()
.await;
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
*guard = GcInfo {
@@ -9885,7 +10088,15 @@ mod tests {
// Do a partial compaction on key range 0..2
tline
.partial_compact_with_gc(get_key(0)..get_key(2), &cancel, EnumSet::new(), &ctx)
.compact_with_gc(
&cancel,
CompactOptions {
flags: EnumSet::new(),
compact_range: Some((get_key(0)..get_key(2)).into()),
..Default::default()
},
&ctx,
)
.await
.unwrap();
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
@@ -9924,7 +10135,15 @@ mod tests {
// Do a partial compaction on key range 2..4
tline
.partial_compact_with_gc(get_key(2)..get_key(4), &cancel, EnumSet::new(), &ctx)
.compact_with_gc(
&cancel,
CompactOptions {
flags: EnumSet::new(),
compact_range: Some((get_key(2)..get_key(4)).into()),
..Default::default()
},
&ctx,
)
.await
.unwrap();
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
@@ -9968,7 +10187,15 @@ mod tests {
// Do a partial compaction on key range 4..9
tline
.partial_compact_with_gc(get_key(4)..get_key(9), &cancel, EnumSet::new(), &ctx)
.compact_with_gc(
&cancel,
CompactOptions {
flags: EnumSet::new(),
compact_range: Some((get_key(4)..get_key(9)).into()),
..Default::default()
},
&ctx,
)
.await
.unwrap();
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
@@ -10011,7 +10238,15 @@ mod tests {
// Do a partial compaction on key range 9..10
tline
.partial_compact_with_gc(get_key(9)..get_key(10), &cancel, EnumSet::new(), &ctx)
.compact_with_gc(
&cancel,
CompactOptions {
flags: EnumSet::new(),
compact_range: Some((get_key(9)..get_key(10)).into()),
..Default::default()
},
&ctx,
)
.await
.unwrap();
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
@@ -10059,7 +10294,15 @@ mod tests {
// Do a partial compaction on key range 0..10, all image layers below LSN 20 can be replaced with new ones.
tline
.partial_compact_with_gc(get_key(0)..get_key(10), &cancel, EnumSet::new(), &ctx)
.compact_with_gc(
&cancel,
CompactOptions {
flags: EnumSet::new(),
compact_range: Some((get_key(0)..get_key(10)).into()),
..Default::default()
},
&ctx,
)
.await
.unwrap();
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;

View File

@@ -11,7 +11,7 @@
pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
use pageserver_api::models::CompactionAlgorithmSettings;
use pageserver_api::models::EvictionPolicy;
use pageserver_api::models::{self, ThrottleConfig};
use pageserver_api::models::{self, TenantConfigPatch, ThrottleConfig};
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
use serde::de::IntoDeserializer;
use serde::{Deserialize, Serialize};
@@ -427,6 +427,129 @@ impl TenantConfOpt {
.or(global_conf.wal_receiver_protocol_override),
}
}
pub fn apply_patch(self, patch: TenantConfigPatch) -> anyhow::Result<TenantConfOpt> {
let Self {
mut checkpoint_distance,
mut checkpoint_timeout,
mut compaction_target_size,
mut compaction_period,
mut compaction_threshold,
mut compaction_algorithm,
mut gc_horizon,
mut gc_period,
mut image_creation_threshold,
mut pitr_interval,
mut walreceiver_connect_timeout,
mut lagging_wal_timeout,
mut max_lsn_wal_lag,
mut eviction_policy,
mut min_resident_size_override,
mut evictions_low_residence_duration_metric_threshold,
mut heatmap_period,
mut lazy_slru_download,
mut timeline_get_throttle,
mut image_layer_creation_check_threshold,
mut lsn_lease_length,
mut lsn_lease_length_for_ts,
mut timeline_offloading,
mut wal_receiver_protocol_override,
} = self;
patch.checkpoint_distance.apply(&mut checkpoint_distance);
patch
.checkpoint_timeout
.map(|v| humantime::parse_duration(&v))?
.apply(&mut checkpoint_timeout);
patch
.compaction_target_size
.apply(&mut compaction_target_size);
patch
.compaction_period
.map(|v| humantime::parse_duration(&v))?
.apply(&mut compaction_period);
patch.compaction_threshold.apply(&mut compaction_threshold);
patch.compaction_algorithm.apply(&mut compaction_algorithm);
patch.gc_horizon.apply(&mut gc_horizon);
patch
.gc_period
.map(|v| humantime::parse_duration(&v))?
.apply(&mut gc_period);
patch
.image_creation_threshold
.apply(&mut image_creation_threshold);
patch
.pitr_interval
.map(|v| humantime::parse_duration(&v))?
.apply(&mut pitr_interval);
patch
.walreceiver_connect_timeout
.map(|v| humantime::parse_duration(&v))?
.apply(&mut walreceiver_connect_timeout);
patch
.lagging_wal_timeout
.map(|v| humantime::parse_duration(&v))?
.apply(&mut lagging_wal_timeout);
patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag);
patch.eviction_policy.apply(&mut eviction_policy);
patch
.min_resident_size_override
.apply(&mut min_resident_size_override);
patch
.evictions_low_residence_duration_metric_threshold
.map(|v| humantime::parse_duration(&v))?
.apply(&mut evictions_low_residence_duration_metric_threshold);
patch
.heatmap_period
.map(|v| humantime::parse_duration(&v))?
.apply(&mut heatmap_period);
patch.lazy_slru_download.apply(&mut lazy_slru_download);
patch
.timeline_get_throttle
.apply(&mut timeline_get_throttle);
patch
.image_layer_creation_check_threshold
.apply(&mut image_layer_creation_check_threshold);
patch
.lsn_lease_length
.map(|v| humantime::parse_duration(&v))?
.apply(&mut lsn_lease_length);
patch
.lsn_lease_length_for_ts
.map(|v| humantime::parse_duration(&v))?
.apply(&mut lsn_lease_length_for_ts);
patch.timeline_offloading.apply(&mut timeline_offloading);
patch
.wal_receiver_protocol_override
.apply(&mut wal_receiver_protocol_override);
Ok(Self {
checkpoint_distance,
checkpoint_timeout,
compaction_target_size,
compaction_period,
compaction_threshold,
compaction_algorithm,
gc_horizon,
gc_period,
image_creation_threshold,
pitr_interval,
walreceiver_connect_timeout,
lagging_wal_timeout,
max_lsn_wal_lag,
eviction_policy,
min_resident_size_override,
evictions_low_residence_duration_metric_threshold,
heatmap_period,
lazy_slru_download,
timeline_get_throttle,
image_layer_creation_check_threshold,
lsn_lease_length,
lsn_lease_length_for_ts,
timeline_offloading,
wal_receiver_protocol_override,
})
}
}
impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {

View File

@@ -1,4 +1,4 @@
use std::collections::HashMap;
use std::{collections::HashMap, sync::Arc};
use utils::id::TimelineId;
@@ -20,7 +20,7 @@ pub(crate) struct GcBlock {
/// Do not add any more features taking and forbidding taking this lock. It should be
/// `tokio::sync::Notify`, but that is rarely used. On the other side, [`GcBlock::insert`]
/// synchronizes with gc attempts by locking and unlocking this mutex.
blocking: tokio::sync::Mutex<()>,
blocking: Arc<tokio::sync::Mutex<()>>,
}
impl GcBlock {
@@ -30,7 +30,7 @@ impl GcBlock {
/// it's ending, or if not currently possible, a value describing the reasons why not.
///
/// Cancellation safe.
pub(super) async fn start(&self) -> Result<Guard<'_>, BlockingReasons> {
pub(super) async fn start(&self) -> Result<Guard, BlockingReasons> {
let reasons = {
let g = self.reasons.lock().unwrap();
@@ -44,7 +44,7 @@ impl GcBlock {
Err(reasons)
} else {
Ok(Guard {
_inner: self.blocking.lock().await,
_inner: self.blocking.clone().lock_owned().await,
})
}
}
@@ -170,8 +170,8 @@ impl GcBlock {
}
}
pub(super) struct Guard<'a> {
_inner: tokio::sync::MutexGuard<'a, ()>,
pub(crate) struct Guard {
_inner: tokio::sync::OwnedMutexGuard<()>,
}
#[derive(Debug)]

View File

@@ -2192,6 +2192,9 @@ impl RemoteTimelineClient {
upload_queue.clean.1 = Some(task.task_id);
let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn();
self.metrics
.projected_remote_consistent_lsn_gauge
.set(lsn.0);
if self.generation.is_none() {
// Legacy mode: skip validating generation

View File

@@ -58,6 +58,11 @@ pub struct Stats {
pub sum_throttled_usecs: u64,
}
pub enum ThrottleResult {
NotThrottled { start: Instant },
Throttled { start: Instant, end: Instant },
}
impl<M> Throttle<M>
where
M: Metric,
@@ -122,15 +127,15 @@ where
self.inner.load().rate_limiter.steady_rps()
}
pub async fn throttle(&self, key_count: usize) -> Option<Duration> {
pub async fn throttle(&self, key_count: usize) -> ThrottleResult {
let inner = self.inner.load_full(); // clones the `Inner` Arc
if !inner.enabled {
return None;
}
let start = std::time::Instant::now();
if !inner.enabled {
return ThrottleResult::NotThrottled { start };
}
self.metric.accounting_start();
self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
let did_throttle = inner.rate_limiter.acquire(key_count).await;
@@ -145,9 +150,9 @@ where
.fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
let observation = Observation { wait_time };
self.metric.observe_throttling(&observation);
Some(wait_time)
ThrottleResult::Throttled { start, end: now }
} else {
None
ThrottleResult::NotThrottled { start }
}
}
}

View File

@@ -53,7 +53,7 @@ use utils::{
postgres_client::PostgresClientProtocol,
sync::gate::{Gate, GateGuard},
};
use wal_decoder::serialized_batch::SerializedValueBatch;
use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
use std::sync::atomic::Ordering as AtomicOrdering;
use std::sync::{Arc, Mutex, RwLock, Weak};
@@ -768,7 +768,7 @@ pub enum GetLogicalSizePriority {
Background,
}
#[derive(enumset::EnumSetType)]
#[derive(Debug, enumset::EnumSetType)]
pub(crate) enum CompactFlags {
ForceRepartition,
ForceImageLayerCreation,
@@ -777,6 +777,19 @@ pub(crate) enum CompactFlags {
DryRun,
}
#[serde_with::serde_as]
#[derive(Debug, Clone, serde::Deserialize)]
pub(crate) struct CompactRequest {
pub compact_range: Option<CompactRange>,
pub compact_below_lsn: Option<Lsn>,
/// Whether the compaction job should be scheduled.
#[serde(default)]
pub scheduled: bool,
/// Whether the compaction job should be split across key ranges.
#[serde(default)]
pub sub_compaction: bool,
}
#[serde_with::serde_as]
#[derive(Debug, Clone, serde::Deserialize)]
pub(crate) struct CompactRange {
@@ -786,10 +799,27 @@ pub(crate) struct CompactRange {
pub end: Key,
}
#[derive(Clone, Default)]
impl From<Range<Key>> for CompactRange {
fn from(range: Range<Key>) -> Self {
CompactRange {
start: range.start,
end: range.end,
}
}
}
#[derive(Debug, Clone, Default)]
pub(crate) struct CompactOptions {
pub flags: EnumSet<CompactFlags>,
/// If set, the compaction will only compact the key range specified by this option.
/// This option is only used by GC compaction.
pub compact_range: Option<CompactRange>,
/// If set, the compaction will only compact the LSN below this value.
/// This option is only used by GC compaction.
pub compact_below_lsn: Option<Lsn>,
/// Enable sub-compaction (split compaction job across key ranges).
/// This option is only used by GC compaction.
pub sub_compaction: bool,
}
impl std::fmt::Debug for Timeline {
@@ -1433,23 +1463,31 @@ impl Timeline {
Ok(lease)
}
/// Flush to disk all data that was written with the put_* functions
/// Freeze the current open in-memory layer. It will be written to disk on next iteration.
/// Returns the flush request ID which can be awaited with wait_flush_completion().
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
pub(crate) async fn freeze(&self) -> Result<u64, FlushLayerError> {
self.freeze0().await
}
/// Freeze and flush the open in-memory layer, waiting for it to be written to disk.
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
pub(crate) async fn freeze_and_flush(&self) -> Result<(), FlushLayerError> {
self.freeze_and_flush0().await
}
/// Freeze the current open in-memory layer. It will be written to disk on next iteration.
/// Returns the flush request ID which can be awaited with wait_flush_completion().
pub(crate) async fn freeze0(&self) -> Result<u64, FlushLayerError> {
let mut g = self.write_lock.lock().await;
let to_lsn = self.get_last_record_lsn();
self.freeze_inmem_layer_at(to_lsn, &mut g).await
}
// This exists to provide a non-span creating version of `freeze_and_flush` we can call without
// polluting the span hierarchy.
pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> {
let token = {
// Freeze the current open in-memory layer. It will be written to disk on next
// iteration.
let mut g = self.write_lock.lock().await;
let to_lsn = self.get_last_record_lsn();
self.freeze_inmem_layer_at(to_lsn, &mut g).await?
};
let token = self.freeze0().await?;
self.wait_flush_completion(token).await
}
@@ -1604,6 +1642,8 @@ impl Timeline {
CompactOptions {
flags,
compact_range: None,
compact_below_lsn: None,
sub_compaction: false,
},
ctx,
)
@@ -2359,7 +2399,7 @@ impl Timeline {
result
.metrics
.last_record_gauge
.last_record_lsn_gauge
.set(disk_consistent_lsn.0 as i64);
result
})
@@ -3481,7 +3521,7 @@ impl Timeline {
pub(crate) fn finish_write(&self, new_lsn: Lsn) {
assert!(new_lsn.is_aligned());
self.metrics.last_record_gauge.set(new_lsn.0 as i64);
self.metrics.last_record_lsn_gauge.set(new_lsn.0 as i64);
self.last_record_lsn.advance(new_lsn);
}
@@ -3849,6 +3889,10 @@ impl Timeline {
fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool {
let old_value = self.disk_consistent_lsn.fetch_max(new_value);
assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}");
self.metrics
.disk_consistent_lsn_gauge
.set(new_value.0 as i64);
new_value != old_value
}
@@ -5887,6 +5931,23 @@ impl<'a> TimelineWriter<'a> {
return Ok(());
}
// In debug builds, assert that we don't write any keys that don't belong to this shard.
// We don't assert this in release builds, since key ownership policies may change over
// time. Stray keys will be removed during compaction.
if cfg!(debug_assertions) {
for metadata in &batch.metadata {
if let ValueMeta::Serialized(metadata) = metadata {
let key = Key::from_compact(metadata.key);
assert!(
self.shard_identity.is_key_local(&key)
|| self.shard_identity.is_key_global(&key),
"key {key} does not belong on shard {}",
self.shard_identity.shard_index()
);
}
}
}
let batch_max_lsn = batch.max_lsn;
let buf_size: u64 = batch.buffer_size() as u64;

View File

@@ -10,13 +10,12 @@ use std::sync::Arc;
use super::layer_manager::LayerManager;
use super::{
CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
RecordedDuration, Timeline,
CompactFlags, CompactOptions, CompactRange, CreateImageLayersError, DurationRecorder,
ImageLayerCreationMode, RecordedDuration, Timeline,
};
use anyhow::{anyhow, bail, Context};
use bytes::Bytes;
use enumset::EnumSet;
use fail::fail_point;
use itertools::Itertools;
use pageserver_api::key::KEY_SIZE;
@@ -30,7 +29,6 @@ use utils::id::TimelineId;
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
use crate::page_cache;
use crate::statvfs::Statvfs;
use crate::tenant::checks::check_valid_layermap;
use crate::tenant::remote_timeline_client::WaitCompletionError;
use crate::tenant::storage_layer::batch_split_writer::{
BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,
@@ -43,7 +41,7 @@ use crate::tenant::storage_layer::{
use crate::tenant::timeline::ImageLayerCreationOutcome;
use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
use crate::tenant::timeline::{Layer, ResidentLayer};
use crate::tenant::{DeltaLayer, MaybeOffloaded};
use crate::tenant::{gc_block, DeltaLayer, MaybeOffloaded};
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
use pageserver_api::config::tenant_conf_defaults::{
DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD,
@@ -64,6 +62,15 @@ use super::CompactionError;
/// Maximum number of deltas before generating an image layer in bottom-most compaction.
const COMPACTION_DELTA_THRESHOLD: usize = 5;
/// A scheduled compaction task.
pub(crate) struct ScheduledCompactionTask {
pub options: CompactOptions,
/// The channel to send the compaction result. If this is a subcompaction, the last compaction job holds the sender.
pub result_tx: Option<tokio::sync::oneshot::Sender<()>>,
/// Hold the GC block. If this is a subcompaction, the last compaction job holds the gc block guard.
pub gc_block: Option<gc_block::Guard>,
}
pub struct GcCompactionJobDescription {
/// All layers to read in the compaction job
selected_layers: Vec<Layer>,
@@ -1174,11 +1181,12 @@ impl Timeline {
.await
.map_err(CompactionError::Other)?;
} else {
debug!(
"Dropping key {} during compaction (it belongs on shard {:?})",
key,
self.shard_identity.get_shard_number(&key)
);
let shard = self.shard_identity.shard_index();
let owner = self.shard_identity.get_shard_number(&key);
if cfg!(debug_assertions) {
panic!("key {key} does not belong on shard {shard}, owned by {owner}");
}
debug!("dropping key {key} during compaction (it belongs on shard {owner})");
}
if !new_layers.is_empty() {
@@ -1746,22 +1754,113 @@ impl Timeline {
Ok(())
}
pub(crate) async fn compact_with_gc(
/// Split a gc-compaction job into multiple compaction jobs. Optimally, this function should return a vector of
/// `GcCompactionJobDesc`. But we want to keep it simple on the tenant scheduling side without exposing too much
/// ad-hoc information about gc compaction itself.
pub(crate) async fn gc_compaction_split_jobs(
self: &Arc<Self>,
cancel: &CancellationToken,
options: CompactOptions,
ctx: &RequestContext,
) -> anyhow::Result<()> {
self.partial_compact_with_gc(
options
.compact_range
.map(|range| range.start..range.end)
.unwrap_or_else(|| Key::MIN..Key::MAX),
cancel,
options.flags,
ctx,
)
.await
) -> anyhow::Result<Vec<CompactOptions>> {
if !options.sub_compaction {
return Ok(vec![options]);
}
let compact_range = options.compact_range.clone().unwrap_or(CompactRange {
start: Key::MIN,
end: Key::MAX,
});
let compact_below_lsn = if let Some(compact_below_lsn) = options.compact_below_lsn {
compact_below_lsn
} else {
*self.get_latest_gc_cutoff_lsn() // use the real gc cutoff
};
let mut compact_jobs = Vec::new();
// For now, we simply use the key partitioning information; we should do a more fine-grained partitioning
// by estimating the amount of files read for a compaction job. We should also partition on LSN.
let Ok(partition) = self.partitioning.try_lock() else {
bail!("failed to acquire partition lock");
};
let ((dense_ks, sparse_ks), _) = &*partition;
// Truncate the key range to be within user specified compaction range.
fn truncate_to(
source_start: &Key,
source_end: &Key,
target_start: &Key,
target_end: &Key,
) -> Option<(Key, Key)> {
let start = source_start.max(target_start);
let end = source_end.min(target_end);
if start < end {
Some((*start, *end))
} else {
None
}
}
let mut split_key_ranges = Vec::new();
let ranges = dense_ks
.parts
.iter()
.map(|partition| partition.ranges.iter())
.chain(sparse_ks.parts.iter().map(|x| x.0.ranges.iter()))
.flatten()
.cloned()
.collect_vec();
for range in ranges.iter() {
let Some((start, end)) = truncate_to(
&range.start,
&range.end,
&compact_range.start,
&compact_range.end,
) else {
continue;
};
split_key_ranges.push((start, end));
}
split_key_ranges.sort();
let guard = self.layers.read().await;
let layer_map = guard.layer_map()?;
let mut current_start = None;
// Split compaction job to about 2GB each
const GC_COMPACT_MAX_SIZE_MB: u64 = 4 * 1024; // 4GB, TODO: should be configuration in the future
let ranges_num = split_key_ranges.len();
for (idx, (start, end)) in split_key_ranges.into_iter().enumerate() {
if current_start.is_none() {
current_start = Some(start);
}
let start = current_start.unwrap();
if start >= end {
// We have already processed this partition.
continue;
}
let res = layer_map.range_search(start..end, compact_below_lsn);
let total_size = res.found.keys().map(|x| x.layer.file_size()).sum::<u64>();
if total_size > GC_COMPACT_MAX_SIZE_MB * 1024 * 1024 || ranges_num == idx + 1 {
let mut compact_options = options.clone();
// Try to extend the compaction range so that we include at least one full layer file.
let extended_end = res
.found
.keys()
.map(|layer| layer.layer.key_range.end)
.min();
// It is possible that the search range does not contain any layer files when we reach the end of the loop.
// In this case, we simply use the specified key range end.
let end = if let Some(extended_end) = extended_end {
extended_end.max(end)
} else {
end
};
info!(
"splitting compaction job: {}..{}, estimated_size={}",
start, end, total_size
);
compact_options.compact_range = Some(CompactRange { start, end });
compact_options.compact_below_lsn = Some(compact_below_lsn);
compact_options.sub_compaction = false;
compact_jobs.push(compact_options);
current_start = Some(end);
}
}
drop(guard);
Ok(compact_jobs)
}
/// An experimental compaction building block that combines compaction with garbage collection.
@@ -1771,19 +1870,51 @@ impl Timeline {
/// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
/// and create delta layers with all deltas >= gc horizon.
///
/// If `key_range` is provided, it will only compact the keys within the range, aka partial compaction.
/// If `options.compact_range` is provided, it will only compact the keys within the range, aka partial compaction.
/// Partial compaction will read and process all layers overlapping with the key range, even if it might
/// contain extra keys. After the gc-compaction phase completes, delta layers that are not fully contained
/// within the key range will be rewritten to ensure they do not overlap with the delta layers. Providing
/// Key::MIN..Key..MAX to the function indicates a full compaction, though technically, `Key::MAX` is not
/// part of the range.
pub(crate) async fn partial_compact_with_gc(
///
/// If `options.compact_below_lsn` is provided, the compaction will only compact layers below or intersect with
/// the LSN. Otherwise, it will use the gc cutoff by default.
pub(crate) async fn compact_with_gc(
self: &Arc<Self>,
compaction_key_range: Range<Key>,
cancel: &CancellationToken,
flags: EnumSet<CompactFlags>,
options: CompactOptions,
ctx: &RequestContext,
) -> anyhow::Result<()> {
if options.sub_compaction {
info!("running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
let jobs = self.gc_compaction_split_jobs(options).await?;
let jobs_len = jobs.len();
for (idx, job) in jobs.into_iter().enumerate() {
info!(
"running enhanced gc bottom-most compaction, sub-compaction {}/{}",
idx + 1,
jobs_len
);
self.compact_with_gc_inner(cancel, job, ctx).await?;
}
if jobs_len == 0 {
info!("no jobs to run, skipping gc bottom-most compaction");
}
return Ok(());
}
self.compact_with_gc_inner(cancel, options, ctx).await
}
async fn compact_with_gc_inner(
self: &Arc<Self>,
cancel: &CancellationToken,
options: CompactOptions,
ctx: &RequestContext,
) -> anyhow::Result<()> {
assert!(
!options.sub_compaction,
"sub-compaction should be handled by the outer function"
);
// Block other compaction/GC tasks from running for now. GC-compaction could run along
// with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
// Note that we already acquired the compaction lock when the outer `compact` function gets called.
@@ -1803,6 +1934,12 @@ impl Timeline {
)
.await?;
let flags = options.flags;
let compaction_key_range = options
.compact_range
.map(|range| range.start..range.end)
.unwrap_or_else(|| Key::MIN..Key::MAX);
let dry_run = flags.contains(CompactFlags::DryRun);
if compaction_key_range == (Key::MIN..Key::MAX) {
@@ -1826,7 +1963,22 @@ impl Timeline {
let layers = guard.layer_map()?;
let gc_info = self.gc_info.read().unwrap();
let mut retain_lsns_below_horizon = Vec::new();
let gc_cutoff = gc_info.cutoffs.select_min();
let gc_cutoff = {
// Currently, gc-compaction only kicks in after the legacy gc has updated the gc_cutoff.
// Therefore, it can only clean up data that cannot be cleaned up with legacy gc, instead of
// cleaning everything that theoritically it could. In the future, it should use `self.gc_info`
// to get the truth data.
let real_gc_cutoff = *self.get_latest_gc_cutoff_lsn();
// The compaction algorithm will keep all keys above the gc_cutoff while keeping only necessary keys below the gc_cutoff for
// each of the retain_lsn. Therefore, if the user-provided `compact_below_lsn` is larger than the real gc cutoff, we will use
// the real cutoff.
let mut gc_cutoff = options.compact_below_lsn.unwrap_or(real_gc_cutoff);
if gc_cutoff > real_gc_cutoff {
warn!("provided compact_below_lsn={} is larger than the real_gc_cutoff={}, using the real gc cutoff", gc_cutoff, real_gc_cutoff);
gc_cutoff = real_gc_cutoff;
}
gc_cutoff
};
for (lsn, _timeline_id, _is_offloaded) in &gc_info.retain_lsns {
if lsn < &gc_cutoff {
retain_lsns_below_horizon.push(*lsn);
@@ -1846,7 +1998,7 @@ impl Timeline {
.map(|desc| desc.get_lsn_range().end)
.max()
else {
info!("no layers to compact with gc");
info!("no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", gc_cutoff);
return Ok(());
};
// Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
@@ -1869,7 +2021,7 @@ impl Timeline {
}
}
if selected_layers.is_empty() {
info!("no layers to compact with gc");
info!("no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compaction_key_range.start, compaction_key_range.end);
return Ok(());
}
retain_lsns_below_horizon.sort();
@@ -1936,14 +2088,15 @@ impl Timeline {
// Step 1: construct a k-merge iterator over all layers.
// Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
let layer_names = job_desc
.selected_layers
.iter()
.map(|layer| layer.layer_desc().layer_name())
.collect_vec();
if let Some(err) = check_valid_layermap(&layer_names) {
warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err);
}
// disable the check for now because we need to adjust the check for partial compactions, will enable later.
// let layer_names = job_desc
// .selected_layers
// .iter()
// .map(|layer| layer.layer_desc().layer_name())
// .collect_vec();
// if let Some(err) = check_valid_layermap(&layer_names) {
// warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err);
// }
// The maximum LSN we are processing in this compaction loop
let end_lsn = job_desc
.selected_layers
@@ -2048,6 +2201,11 @@ impl Timeline {
// This is not handled in the filter iterator because shard is determined by hash.
// Therefore, it does not give us any performance benefit to do things like skip
// a whole layer file as handling key spaces (ranges).
if cfg!(debug_assertions) {
let shard = self.shard_identity.shard_index();
let owner = self.shard_identity.get_shard_number(&key);
panic!("key {key} does not belong on shard {shard}, owned by {owner}");
}
continue;
}
if !job_desc.compaction_key_range.contains(&key) {

View File

@@ -369,6 +369,13 @@ pub(super) async fn handle_walreceiver_connection(
// advances it to its end LSN. 0 is just an initialization placeholder.
let mut modification = timeline.begin_modification(Lsn(0));
if !records.is_empty() {
timeline
.metrics
.wal_records_received
.inc_by(records.len() as u64);
}
for interpreted in records {
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
&& uncommitted_records > 0
@@ -510,6 +517,7 @@ pub(super) async fn handle_walreceiver_connection(
}
// Ingest the records without immediately committing them.
timeline.metrics.wal_records_received.inc();
let ingested = walingest
.ingest_record(interpreted, &mut modification, &ctx)
.await

View File

@@ -582,18 +582,21 @@ impl WalIngest {
forknum: FSM_FORKNUM,
};
// Zero out the last remaining FSM page, if this shard owns it. We are not precise here,
// and instead of digging in the FSM bitmap format we just clear the whole page.
let fsm_logical_page_no = blkno / pg_constants::SLOTS_PER_FSM_PAGE;
let mut fsm_physical_page_no = fsm_logical_to_physical(fsm_logical_page_no);
if blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 {
// Tail of last remaining FSM page has to be zeroed.
// We are not precise here and instead of digging in FSM bitmap format just clear the whole page.
if blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0
&& self
.shard
.is_key_local(&rel_block_to_key(rel, fsm_physical_page_no))
{
modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?;
fsm_physical_page_no += 1;
}
// TODO: re-examine the None case here wrt. sharding; should we error?
// Truncate this shard's view of the FSM relation size, if it even has one.
let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
if nblocks > fsm_physical_page_no {
// check if something to do: FSM is larger than truncate position
self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx)
.await?;
}
@@ -617,7 +620,7 @@ impl WalIngest {
// tail bits in the last remaining map page, representing truncated heap
// blocks, need to be cleared. This is not only tidy, but also necessary
// because we don't get a chance to clear the bits if the heap is extended
// again.
// again. Only do this on the shard that owns the page.
if (trunc_byte != 0 || trunc_offs != 0)
&& self.shard.is_key_local(&rel_block_to_key(rel, vm_page_no))
{
@@ -631,10 +634,9 @@ impl WalIngest {
)?;
vm_page_no += 1;
}
// TODO: re-examine the None case here wrt. sharding; should we error?
// Truncate this shard's view of the VM relation size, if it even has one.
let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
if nblocks > vm_page_no {
// check if something to do: VM is larger than truncate position
self.put_rel_truncation(modification, rel, vm_page_no, ctx)
.await?;
}
@@ -875,22 +877,24 @@ impl WalIngest {
// will block waiting for the last valid LSN to advance up to
// it. So we use the previous record's LSN in the get calls
// instead.
for segno in modification
.tline
.list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx)
.await?
{
let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
if modification.tline.get_shard_identity().is_shard_zero() {
for segno in modification
.tline
.list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx)
.await?
{
let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
let may_delete = dispatch_pgversion!(modification.tline.pg_version, {
pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, pageno)
});
let may_delete = dispatch_pgversion!(modification.tline.pg_version, {
pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, pageno)
});
if may_delete {
modification
.drop_slru_segment(SlruKind::Clog, segno, ctx)
.await?;
trace!("Drop CLOG segment {:>04X}", segno);
if may_delete {
modification
.drop_slru_segment(SlruKind::Clog, segno, ctx)
.await?;
trace!("Drop CLOG segment {:>04X}", segno);
}
}
}
@@ -1045,16 +1049,18 @@ impl WalIngest {
// Delete all the segments except the last one. The last segment can still
// contain, possibly partially, valid data.
while segment != endsegment {
modification
.drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx)
.await?;
if modification.tline.get_shard_identity().is_shard_zero() {
while segment != endsegment {
modification
.drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx)
.await?;
/* move to next segment, handling wraparound correctly */
if segment == maxsegment {
segment = 0;
} else {
segment += 1;
/* move to next segment, handling wraparound correctly */
if segment == maxsegment {
segment = 0;
} else {
segment += 1;
}
}
}

View File

@@ -22,6 +22,7 @@
#include "libpq/pqformat.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "portability/instr_time.h"
#include "postmaster/interrupt.h"
#include "storage/buf_internals.h"
#include "storage/ipc.h"
@@ -118,6 +119,11 @@ typedef struct
*/
PSConnectionState state;
PGconn *conn;
/* request / response counters for debugging */
uint64 nrequests_sent;
uint64 nresponses_received;
/*---
* WaitEventSet containing:
* - WL_SOCKET_READABLE on 'conn'
@@ -628,6 +634,8 @@ pageserver_connect(shardno_t shard_no, int elevel)
}
shard->state = PS_Connected;
shard->nrequests_sent = 0;
shard->nresponses_received = 0;
}
/* FALLTHROUGH */
case PS_Connected:
@@ -656,6 +664,27 @@ call_PQgetCopyData(shardno_t shard_no, char **buffer)
int ret;
PageServer *shard = &page_servers[shard_no];
PGconn *pageserver_conn = shard->conn;
instr_time now,
start_ts,
since_start,
last_log_ts,
since_last_log;
bool logged = false;
/*
* As a debugging aid, if we don't get a response for a long time, print a
* log message.
*
* 10 s is a very generous threshold, normally we expect a response in a
* few milliseconds. We have metrics to track latencies in normal ranges,
* but in the cases that take exceptionally long, it's useful to log the
* exact timestamps.
*/
#define LOG_INTERVAL_US UINT64CONST(10 * 1000000)
INSTR_TIME_SET_CURRENT(now);
start_ts = last_log_ts = now;
INSTR_TIME_SET_ZERO(since_last_log);
retry:
ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
@@ -663,9 +692,12 @@ retry:
if (ret == 0)
{
WaitEvent event;
long timeout;
timeout = Min(0, LOG_INTERVAL_US - INSTR_TIME_GET_MICROSEC(since_last_log));
/* Sleep until there's something to do */
(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1,
(void) WaitEventSetWait(shard->wes_read, timeout, &event, 1,
WAIT_EVENT_NEON_PS_READ);
ResetLatch(MyLatch);
@@ -684,9 +716,40 @@ retry:
}
}
/*
* Print a message to the log if a long time has passed with no
* response.
*/
INSTR_TIME_SET_CURRENT(now);
since_last_log = now;
INSTR_TIME_SUBTRACT(since_last_log, last_log_ts);
if (INSTR_TIME_GET_MICROSEC(since_last_log) >= LOG_INTERVAL_US)
{
since_start = now;
INSTR_TIME_SUBTRACT(since_start, start_ts);
neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses)",
INSTR_TIME_GET_DOUBLE(since_start),
shard->nrequests_sent, shard->nresponses_received);
last_log_ts = now;
logged = true;
}
goto retry;
}
/*
* If we logged earlier that the response is taking a long time, log
* another message when the response is finally received.
*/
if (logged)
{
INSTR_TIME_SET_CURRENT(now);
since_start = now;
INSTR_TIME_SUBTRACT(since_start, start_ts);
neon_shard_log(shard_no, LOG, "received response from pageserver after %0.3f s",
INSTR_TIME_GET_DOUBLE(since_start));
}
return ret;
}
@@ -786,6 +849,7 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
* PGRES_POLLING_WRITING state. It's kinda dirty to disconnect at this
* point, but on the grand scheme of things it's only a small issue.
*/
shard->nrequests_sent++;
if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
@@ -878,6 +942,7 @@ pageserver_receive(shardno_t shard_no)
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
}
shard->nresponses_received++;
return (NeonResponse *) resp;
}

View File

@@ -423,7 +423,11 @@ readahead_buffer_resize(int newsize, void *extra)
* ensuring we have received all but the last n requests (n = newsize).
*/
if (MyPState->n_requests_inflight > newsize)
prefetch_wait_for(MyPState->ring_unused - newsize);
{
Assert(MyPState->ring_unused >= MyPState->n_requests_inflight - newsize);
prefetch_wait_for(MyPState->ring_unused - (MyPState->n_requests_inflight - newsize));
Assert(MyPState->n_requests_inflight <= newsize);
}
/* construct the new PrefetchState, and copy over the memory contexts */
newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size);
@@ -438,7 +442,6 @@ readahead_buffer_resize(int newsize, void *extra)
newPState->ring_last = newsize;
newPState->ring_unused = newsize;
newPState->ring_receive = newsize;
newPState->ring_flush = newsize;
newPState->max_shard_no = MyPState->max_shard_no;
memcpy(newPState->shard_bitmap, MyPState->shard_bitmap, sizeof(MyPState->shard_bitmap));
@@ -489,6 +492,7 @@ readahead_buffer_resize(int newsize, void *extra)
}
newPState->n_unused -= 1;
}
newPState->ring_flush = newPState->ring_receive;
MyNeonCounters->getpage_prefetches_buffered =
MyPState->n_responses_buffered;
@@ -498,6 +502,7 @@ readahead_buffer_resize(int newsize, void *extra)
for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1)
{
PrefetchRequest *slot = GetPrfSlot(end);
Assert(slot->status != PRFS_REQUESTED);
if (slot->status == PRFS_RECEIVED)
{
pfree(slot->response);
@@ -610,6 +615,9 @@ prefetch_read(PrefetchRequest *slot)
{
NeonResponse *response;
MemoryContext old;
BufferTag buftag;
shardno_t shard_no;
uint64 my_ring_index;
Assert(slot->status == PRFS_REQUESTED);
Assert(slot->response == NULL);
@@ -623,11 +631,29 @@ prefetch_read(PrefetchRequest *slot)
slot->status, slot->response,
(long)slot->my_ring_index, (long)MyPState->ring_receive);
/*
* Copy the request info so that if an error happens and the prefetch
* queue is flushed during the receive call, we can print the original
* values in the error message
*/
buftag = slot->buftag;
shard_no = slot->shard_no;
my_ring_index = slot->my_ring_index;
old = MemoryContextSwitchTo(MyPState->errctx);
response = (NeonResponse *) page_server->receive(slot->shard_no);
response = (NeonResponse *) page_server->receive(shard_no);
MemoryContextSwitchTo(old);
if (response)
{
/* The slot should still be valid */
if (slot->status != PRFS_REQUESTED ||
slot->response != NULL ||
slot->my_ring_index != MyPState->ring_receive)
neon_shard_log(shard_no, ERROR,
"Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
slot->status, slot->response,
(long) slot->my_ring_index, (long) MyPState->ring_receive);
/* update prefetch state */
MyPState->n_responses_buffered += 1;
MyPState->n_requests_inflight -= 1;
@@ -642,11 +668,15 @@ prefetch_read(PrefetchRequest *slot)
}
else
{
neon_shard_log(slot->shard_no, LOG,
/*
* Note: The slot might no longer be valid, if the connection was lost
* and the prefetch queue was flushed during the receive call
*/
neon_shard_log(shard_no, LOG,
"No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
(long)slot->my_ring_index,
RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)),
slot->buftag.forkNum, slot->buftag.blockNum);
(long) my_ring_index,
RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)),
buftag.forkNum, buftag.blockNum);
return false;
}
}

View File

@@ -115,7 +115,8 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()),
};
if !self.limiter.lock().unwrap().check(subnet_key, 1) {
tracing::debug!("Rate limit exceeded. Skipping cancellation message");
// log only the subnet part of the IP address to know which subnet is rate limited
tracing::warn!("Rate limit exceeded. Skipping cancellation message, {subnet_key}");
Metrics::get()
.proxy
.cancellation_requests_total

View File

@@ -163,32 +163,36 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
let do_handshake = handshake(ctx, stream, tls, record_handshake_error);
let (mut stream, params) =
match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
HandshakeData::Startup(stream, params) => (stream, params),
HandshakeData::Cancel(cancel_key_data) => {
// spawn a task to cancel the session, but don't wait for it
cancellations.spawn({
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
let session_id = ctx.session_id();
let peer_ip = ctx.peer_addr();
async move {
drop(
cancellation_handler_clone
.cancel_session(
cancel_key_data,
session_id,
peer_ip,
config.authentication_config.ip_allowlist_check_enabled,
)
.await,
);
}
});
let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake)
.await??
{
HandshakeData::Startup(stream, params) => (stream, params),
HandshakeData::Cancel(cancel_key_data) => {
// spawn a task to cancel the session, but don't wait for it
cancellations.spawn({
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
let session_id = ctx.session_id();
let peer_ip = ctx.peer_addr();
let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?session_id);
cancel_span.follows_from(tracing::Span::current());
async move {
drop(
cancellation_handler_clone
.cancel_session(
cancel_key_data,
session_id,
peer_ip,
config.authentication_config.ip_allowlist_check_enabled,
)
.instrument(cancel_span)
.await,
);
}
});
return Ok(None);
}
};
return Ok(None);
}
};
drop(pause);
ctx.set_db_options(params.clone());

View File

@@ -272,32 +272,36 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
let do_handshake = handshake(ctx, stream, mode.handshake_tls(tls), record_handshake_error);
let (mut stream, params) =
match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
HandshakeData::Startup(stream, params) => (stream, params),
HandshakeData::Cancel(cancel_key_data) => {
// spawn a task to cancel the session, but don't wait for it
cancellations.spawn({
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
let session_id = ctx.session_id();
let peer_ip = ctx.peer_addr();
async move {
drop(
cancellation_handler_clone
.cancel_session(
cancel_key_data,
session_id,
peer_ip,
config.authentication_config.ip_allowlist_check_enabled,
)
.await,
);
}
});
let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake)
.await??
{
HandshakeData::Startup(stream, params) => (stream, params),
HandshakeData::Cancel(cancel_key_data) => {
// spawn a task to cancel the session, but don't wait for it
cancellations.spawn({
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
let session_id = ctx.session_id();
let peer_ip = ctx.peer_addr();
let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?session_id);
cancel_span.follows_from(tracing::Span::current());
async move {
drop(
cancellation_handler_clone
.cancel_session(
cancel_key_data,
session_id,
peer_ip,
config.authentication_config.ip_allowlist_check_enabled,
)
.instrument(cancel_span)
.await,
);
}
});
return Ok(None);
}
};
return Ok(None);
}
};
drop(pause);
ctx.set_db_options(params.clone());

View File

@@ -13,6 +13,7 @@ use crate::cache::project_info::ProjectInfoCache;
use crate::cancellation::{CancelMap, CancellationHandler};
use crate::intern::{ProjectIdInt, RoleNameInt};
use crate::metrics::{Metrics, RedisErrors, RedisEventsCount};
use tracing::Instrument;
const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates";
@@ -143,6 +144,8 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
let peer_addr = cancel_session
.peer_addr
.unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::UNSPECIFIED));
let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?cancel_session.session_id);
cancel_span.follows_from(tracing::Span::current());
// This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message.
match self
.cancellation_handler
@@ -152,6 +155,7 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
peer_addr,
cancel_session.peer_addr.is_some(),
)
.instrument(cancel_span)
.await
{
Ok(()) => {}

View File

@@ -83,14 +83,20 @@ impl Env {
node_id: NodeId,
ttid: TenantTimelineId,
) -> anyhow::Result<Arc<Timeline>> {
let conf = self.make_conf(node_id);
let conf = Arc::new(self.make_conf(node_id));
let timeline_dir = get_timeline_dir(&conf, &ttid);
let remote_path = remote_timeline_path(&ttid)?;
let safekeeper = self.make_safekeeper(node_id, ttid).await?;
let shared_state = SharedState::new(StateSK::Loaded(safekeeper));
let timeline = Timeline::new(ttid, &timeline_dir, &remote_path, shared_state);
let timeline = Timeline::new(
ttid,
&timeline_dir,
&remote_path,
shared_state,
conf.clone(),
);
timeline.bootstrap(
&mut timeline.write_shared_state().await,
&conf,

View File

@@ -338,7 +338,7 @@ async fn main() -> anyhow::Result<()> {
}
};
let conf = SafeKeeperConf {
let conf = Arc::new(SafeKeeperConf {
workdir,
my_id: id,
listen_pg_addr: args.listen_pg,
@@ -368,7 +368,7 @@ async fn main() -> anyhow::Result<()> {
control_file_save_interval: args.control_file_save_interval,
partial_backup_concurrency: args.partial_backup_concurrency,
eviction_min_resident: args.eviction_min_resident,
};
});
// initialize sentry if SENTRY_DSN is provided
let _sentry_guard = init_sentry(
@@ -382,7 +382,7 @@ async fn main() -> anyhow::Result<()> {
/// complete, e.g. panicked, inner is error produced by task itself.
type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;
async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
// fsync the datadir to make sure we have a consistent state on disk.
if !conf.no_sync {
let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?;
@@ -428,9 +428,11 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
e
})?;
let global_timelines = Arc::new(GlobalTimelines::new(conf.clone()));
// Register metrics collector for active timelines. It's important to do this
// after daemonizing, otherwise process collector will be upset.
let timeline_collector = safekeeper::metrics::TimelineCollector::new();
let timeline_collector = safekeeper::metrics::TimelineCollector::new(global_timelines.clone());
metrics::register_internal(Box::new(timeline_collector))?;
wal_backup::init_remote_storage(&conf).await;
@@ -447,9 +449,8 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
.then(|| Handle::try_current().expect("no runtime in main"));
// Load all timelines from disk to memory.
GlobalTimelines::init(conf.clone()).await?;
global_timelines.init().await?;
let conf_ = conf.clone();
// Run everything in current thread rt, if asked.
if conf.current_thread_runtime {
info!("running in current thread runtime");
@@ -459,14 +460,16 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
.as_ref()
.unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
.spawn(wal_service::task_main(
conf_,
conf.clone(),
pg_listener,
Scope::SafekeeperData,
global_timelines.clone(),
))
// wrap with task name for error reporting
.map(|res| ("WAL service main".to_owned(), res));
tasks_handles.push(Box::pin(wal_service_handle));
let global_timelines_ = global_timelines.clone();
let timeline_housekeeping_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
@@ -474,40 +477,45 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
const TOMBSTONE_TTL: Duration = Duration::from_secs(3600 * 24);
loop {
tokio::time::sleep(TOMBSTONE_TTL).await;
GlobalTimelines::housekeeping(&TOMBSTONE_TTL);
global_timelines_.housekeeping(&TOMBSTONE_TTL);
}
})
.map(|res| ("Timeline map housekeeping".to_owned(), res));
tasks_handles.push(Box::pin(timeline_housekeeping_handle));
if let Some(pg_listener_tenant_only) = pg_listener_tenant_only {
let conf_ = conf.clone();
let wal_service_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
.spawn(wal_service::task_main(
conf_,
conf.clone(),
pg_listener_tenant_only,
Scope::Tenant,
global_timelines.clone(),
))
// wrap with task name for error reporting
.map(|res| ("WAL service tenant only main".to_owned(), res));
tasks_handles.push(Box::pin(wal_service_handle));
}
let conf_ = conf.clone();
let http_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| HTTP_RUNTIME.handle())
.spawn(http::task_main(conf_, http_listener))
.spawn(http::task_main(
conf.clone(),
http_listener,
global_timelines.clone(),
))
.map(|res| ("HTTP service main".to_owned(), res));
tasks_handles.push(Box::pin(http_handle));
let conf_ = conf.clone();
let broker_task_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| BROKER_RUNTIME.handle())
.spawn(broker::task_main(conf_).instrument(info_span!("broker")))
.spawn(
broker::task_main(conf.clone(), global_timelines.clone())
.instrument(info_span!("broker")),
)
.map(|res| ("broker main".to_owned(), res));
tasks_handles.push(Box::pin(broker_task_handle));

View File

@@ -39,14 +39,17 @@ const RETRY_INTERVAL_MSEC: u64 = 1000;
const PUSH_INTERVAL_MSEC: u64 = 1000;
/// Push once in a while data about all active timelines to the broker.
async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
async fn push_loop(
conf: Arc<SafeKeeperConf>,
global_timelines: Arc<GlobalTimelines>,
) -> anyhow::Result<()> {
if conf.disable_periodic_broker_push {
info!("broker push_loop is disabled, doing nothing...");
futures::future::pending::<()>().await; // sleep forever
return Ok(());
}
let active_timelines_set = GlobalTimelines::get_global_broker_active_set();
let active_timelines_set = global_timelines.get_global_broker_active_set();
let mut client =
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
@@ -87,8 +90,13 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
/// Subscribe and fetch all the interesting data from the broker.
#[instrument(name = "broker_pull", skip_all)]
async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?;
async fn pull_loop(
conf: Arc<SafeKeeperConf>,
global_timelines: Arc<GlobalTimelines>,
stats: Arc<BrokerStats>,
) -> Result<()> {
let mut client =
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
// TODO: subscribe only to local timelines instead of all
let request = SubscribeSafekeeperInfoRequest {
@@ -113,7 +121,7 @@ async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()>
.as_ref()
.ok_or_else(|| anyhow!("missing tenant_timeline_id"))?;
let ttid = parse_proto_ttid(proto_ttid)?;
if let Ok(tli) = GlobalTimelines::get(ttid) {
if let Ok(tli) = global_timelines.get(ttid) {
// Note that we also receive *our own* info. That's
// important, as it is used as an indication of live
// connection to the broker.
@@ -135,7 +143,11 @@ async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()>
/// Process incoming discover requests. This is done in a separate task to avoid
/// interfering with the normal pull/push loops.
async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
async fn discover_loop(
conf: Arc<SafeKeeperConf>,
global_timelines: Arc<GlobalTimelines>,
stats: Arc<BrokerStats>,
) -> Result<()> {
let mut client =
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
@@ -171,7 +183,7 @@ async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<
.as_ref()
.ok_or_else(|| anyhow!("missing tenant_timeline_id"))?;
let ttid = parse_proto_ttid(proto_ttid)?;
if let Ok(tli) = GlobalTimelines::get(ttid) {
if let Ok(tli) = global_timelines.get(ttid) {
// we received a discovery request for a timeline we know about
discover_counter.inc();
@@ -210,7 +222,10 @@ async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<
bail!("end of stream");
}
pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
pub async fn task_main(
conf: Arc<SafeKeeperConf>,
global_timelines: Arc<GlobalTimelines>,
) -> anyhow::Result<()> {
info!("started, broker endpoint {:?}", conf.broker_endpoint);
let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC));
@@ -261,13 +276,13 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
},
_ = ticker.tick() => {
if push_handle.is_none() {
push_handle = Some(tokio::spawn(push_loop(conf.clone())));
push_handle = Some(tokio::spawn(push_loop(conf.clone(), global_timelines.clone())));
}
if pull_handle.is_none() {
pull_handle = Some(tokio::spawn(pull_loop(conf.clone(), stats.clone())));
pull_handle = Some(tokio::spawn(pull_loop(conf.clone(), global_timelines.clone(), stats.clone())));
}
if discover_handle.is_none() {
discover_handle = Some(tokio::spawn(discover_loop(conf.clone(), stats.clone())));
discover_handle = Some(tokio::spawn(discover_loop(conf.clone(), global_timelines.clone(), stats.clone())));
}
},
_ = &mut stats_task => {}

View File

@@ -1,9 +1,7 @@
use std::sync::Arc;
use anyhow::{bail, Result};
use camino::Utf8PathBuf;
use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE};
use std::sync::Arc;
use tokio::{
fs::OpenOptions,
io::{AsyncSeekExt, AsyncWriteExt},
@@ -14,7 +12,7 @@ use utils::{id::TenantTimelineId, lsn::Lsn};
use crate::{
control_file::FileStorage,
state::TimelinePersistentState,
timeline::{Timeline, TimelineError, WalResidentTimeline},
timeline::{TimelineError, WalResidentTimeline},
timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline},
wal_backup::copy_s3_segments,
wal_storage::{wal_file_paths, WalReader},
@@ -25,16 +23,19 @@ use crate::{
const MAX_BACKUP_LAG: u64 = 10 * WAL_SEGMENT_SIZE as u64;
pub struct Request {
pub source: Arc<Timeline>,
pub source_ttid: TenantTimelineId,
pub until_lsn: Lsn,
pub destination_ttid: TenantTimelineId,
}
pub async fn handle_request(request: Request) -> Result<()> {
pub async fn handle_request(
request: Request,
global_timelines: Arc<GlobalTimelines>,
) -> Result<()> {
// TODO: request.until_lsn MUST be a valid LSN, and we cannot check it :(
// if LSN will point to the middle of a WAL record, timeline will be in "broken" state
match GlobalTimelines::get(request.destination_ttid) {
match global_timelines.get(request.destination_ttid) {
// timeline already exists. would be good to check that this timeline is the copy
// of the source timeline, but it isn't obvious how to do that
Ok(_) => return Ok(()),
@@ -46,9 +47,10 @@ pub async fn handle_request(request: Request) -> Result<()> {
}
}
let source_tli = request.source.wal_residence_guard().await?;
let source = global_timelines.get(request.source_ttid)?;
let source_tli = source.wal_residence_guard().await?;
let conf = &GlobalTimelines::get_global_config();
let conf = &global_timelines.get_global_config();
let ttid = request.destination_ttid;
let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?;
@@ -127,7 +129,7 @@ pub async fn handle_request(request: Request) -> Result<()> {
copy_s3_segments(
wal_seg_size,
&request.source.ttid,
&request.source_ttid,
&request.destination_ttid,
first_segment,
first_ondisk_segment,
@@ -158,7 +160,9 @@ pub async fn handle_request(request: Request) -> Result<()> {
// now we have a ready timeline in a temp directory
validate_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?;
GlobalTimelines::load_temp_timeline(request.destination_ttid, &tli_dir_path, true).await?;
global_timelines
.load_temp_timeline(request.destination_ttid, &tli_dir_path, true)
.await?;
Ok(())
}

View File

@@ -207,23 +207,23 @@ pub struct FileInfo {
}
/// Build debug dump response, using the provided [`Args`] filters.
pub async fn build(args: Args) -> Result<Response> {
pub async fn build(args: Args, global_timelines: Arc<GlobalTimelines>) -> Result<Response> {
let start_time = Utc::now();
let timelines_count = GlobalTimelines::timelines_count();
let config = GlobalTimelines::get_global_config();
let timelines_count = global_timelines.timelines_count();
let config = global_timelines.get_global_config();
let ptrs_snapshot = if args.tenant_id.is_some() && args.timeline_id.is_some() {
// If both tenant_id and timeline_id are specified, we can just get the
// timeline directly, without taking a snapshot of the whole list.
let ttid = TenantTimelineId::new(args.tenant_id.unwrap(), args.timeline_id.unwrap());
if let Ok(tli) = GlobalTimelines::get(ttid) {
if let Ok(tli) = global_timelines.get(ttid) {
vec![tli]
} else {
vec![]
}
} else {
// Otherwise, take a snapshot of the whole list.
GlobalTimelines::get_all()
global_timelines.get_all()
};
let mut timelines = Vec::new();
@@ -344,12 +344,12 @@ fn get_wal_last_modified(path: &Utf8Path) -> Result<Option<DateTime<Utc>>> {
/// Converts SafeKeeperConf to Config, filtering out the fields that are not
/// supposed to be exposed.
fn build_config(config: SafeKeeperConf) -> Config {
fn build_config(config: Arc<SafeKeeperConf>) -> Config {
Config {
id: config.my_id,
workdir: config.workdir.into(),
listen_pg_addr: config.listen_pg_addr,
listen_http_addr: config.listen_http_addr,
workdir: config.workdir.clone().into(),
listen_pg_addr: config.listen_pg_addr.clone(),
listen_http_addr: config.listen_http_addr.clone(),
no_sync: config.no_sync,
max_offloader_lag_bytes: config.max_offloader_lag_bytes,
wal_backup_enabled: config.wal_backup_enabled,

View File

@@ -33,7 +33,7 @@ use utils::{
/// Safekeeper handler of postgres commands
pub struct SafekeeperPostgresHandler {
pub conf: SafeKeeperConf,
pub conf: Arc<SafeKeeperConf>,
/// assigned application name
pub appname: Option<String>,
pub tenant_id: Option<TenantId>,
@@ -43,6 +43,7 @@ pub struct SafekeeperPostgresHandler {
pub protocol: Option<PostgresClientProtocol>,
/// Unique connection id is logged in spans for observability.
pub conn_id: ConnectionId,
pub global_timelines: Arc<GlobalTimelines>,
/// Auth scope allowed on the connections and public key used to check auth tokens. None if auth is not configured.
auth: Option<(Scope, Arc<JwtAuth>)>,
claims: Option<Claims>,
@@ -314,10 +315,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
impl SafekeeperPostgresHandler {
pub fn new(
conf: SafeKeeperConf,
conf: Arc<SafeKeeperConf>,
conn_id: u32,
io_metrics: Option<TrafficMetrics>,
auth: Option<(Scope, Arc<JwtAuth>)>,
global_timelines: Arc<GlobalTimelines>,
) -> Self {
SafekeeperPostgresHandler {
conf,
@@ -331,6 +333,7 @@ impl SafekeeperPostgresHandler {
claims: None,
auth,
io_metrics,
global_timelines,
}
}
@@ -360,7 +363,7 @@ impl SafekeeperPostgresHandler {
pgb: &mut PostgresBackend<IO>,
) -> Result<(), QueryError> {
// Get timeline, handling "not found" error
let tli = match GlobalTimelines::get(self.ttid) {
let tli = match self.global_timelines.get(self.ttid) {
Ok(tli) => Ok(Some(tli)),
Err(TimelineError::NotFound(_)) => Ok(None),
Err(e) => Err(QueryError::Other(e.into())),
@@ -394,7 +397,10 @@ impl SafekeeperPostgresHandler {
&mut self,
pgb: &mut PostgresBackend<IO>,
) -> Result<(), QueryError> {
let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?;
let tli = self
.global_timelines
.get(self.ttid)
.map_err(|e| QueryError::Other(e.into()))?;
let lsn = if self.is_walproposer_recovery() {
// walproposer should get all local WAL until flush_lsn

View File

@@ -3,14 +3,16 @@ pub mod routes;
pub use routes::make_router;
pub use safekeeper_api::models;
use std::sync::Arc;
use crate::SafeKeeperConf;
use crate::{GlobalTimelines, SafeKeeperConf};
pub async fn task_main(
conf: SafeKeeperConf,
conf: Arc<SafeKeeperConf>,
http_listener: std::net::TcpListener,
global_timelines: Arc<GlobalTimelines>,
) -> anyhow::Result<()> {
let router = make_router(conf)
let router = make_router(conf, global_timelines)
.build()
.map_err(|err| anyhow::anyhow!(err))?;
let service = utils::http::RouterService::new(router).unwrap();

View File

@@ -66,6 +66,13 @@ fn get_conf(request: &Request<Body>) -> &SafeKeeperConf {
.as_ref()
}
fn get_global_timelines(request: &Request<Body>) -> Arc<GlobalTimelines> {
request
.data::<Arc<GlobalTimelines>>()
.expect("unknown state type")
.clone()
}
/// Same as TermLsn, but serializes LSN using display serializer
/// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
@@ -123,9 +130,11 @@ async fn tenant_delete_handler(mut request: Request<Body>) -> Result<Response<Bo
let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false);
check_permission(&request, Some(tenant_id))?;
ensure_no_body(&mut request).await?;
let global_timelines = get_global_timelines(&request);
// FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons;
// Using an `InternalServerError` should be fixed when the types support it
let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id, only_local)
let delete_info = global_timelines
.delete_force_all_for_tenant(&tenant_id, only_local)
.await
.map_err(ApiError::InternalServerError)?;
json_response(
@@ -156,7 +165,9 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
.commit_lsn
.segment_lsn(server_info.wal_seg_size as usize)
});
GlobalTimelines::create(ttid, server_info, request_data.commit_lsn, local_start_lsn)
let global_timelines = get_global_timelines(&request);
global_timelines
.create(ttid, server_info, request_data.commit_lsn, local_start_lsn)
.await
.map_err(ApiError::InternalServerError)?;
@@ -167,7 +178,9 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
/// Note: it is possible to do the same with debug_dump.
async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
let res: Vec<TenantTimelineId> = GlobalTimelines::get_all()
let global_timelines = get_global_timelines(&request);
let res: Vec<TenantTimelineId> = global_timelines
.get_all()
.iter()
.map(|tli| tli.ttid)
.collect();
@@ -182,7 +195,8 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
);
check_permission(&request, Some(ttid.tenant_id))?;
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
let global_timelines = get_global_timelines(&request);
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
let (inmem, state) = tli.get_state().await;
let flush_lsn = tli.get_flush_lsn().await;
@@ -233,9 +247,11 @@ async fn timeline_delete_handler(mut request: Request<Body>) -> Result<Response<
let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false);
check_permission(&request, Some(ttid.tenant_id))?;
ensure_no_body(&mut request).await?;
let global_timelines = get_global_timelines(&request);
// FIXME: `delete_force` can fail from both internal errors and bad requests. Add better
// error handling here when we're able to.
let resp = GlobalTimelines::delete(&ttid, only_local)
let resp = global_timelines
.delete(&ttid, only_local)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, resp)
@@ -247,8 +263,9 @@ async fn timeline_pull_handler(mut request: Request<Body>) -> Result<Response<Bo
let data: pull_timeline::Request = json_request(&mut request).await?;
let conf = get_conf(&request);
let global_timelines = get_global_timelines(&request);
let resp = pull_timeline::handle_request(data, conf.sk_auth_token.clone())
let resp = pull_timeline::handle_request(data, conf.sk_auth_token.clone(), global_timelines)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, resp)
@@ -263,7 +280,8 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Bo
);
check_permission(&request, Some(ttid.tenant_id))?;
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
let global_timelines = get_global_timelines(&request);
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
// To stream the body use wrap_stream which wants Stream of Result<Bytes>,
// so create the chan and write to it in another task.
@@ -293,19 +311,19 @@ async fn timeline_copy_handler(mut request: Request<Body>) -> Result<Response<Bo
check_permission(&request, None)?;
let request_data: TimelineCopyRequest = json_request(&mut request).await?;
let ttid = TenantTimelineId::new(
let source_ttid = TenantTimelineId::new(
parse_request_param(&request, "tenant_id")?,
parse_request_param(&request, "source_timeline_id")?,
);
let source = GlobalTimelines::get(ttid)?;
let global_timelines = get_global_timelines(&request);
copy_timeline::handle_request(copy_timeline::Request{
source,
source_ttid,
until_lsn: request_data.until_lsn,
destination_ttid: TenantTimelineId::new(ttid.tenant_id, request_data.target_timeline_id),
})
.instrument(info_span!("copy_timeline", from=%ttid, to=%request_data.target_timeline_id, until_lsn=%request_data.until_lsn))
destination_ttid: TenantTimelineId::new(source_ttid.tenant_id, request_data.target_timeline_id),
}, global_timelines)
.instrument(info_span!("copy_timeline", from=%source_ttid, to=%request_data.target_timeline_id, until_lsn=%request_data.until_lsn))
.await
.map_err(ApiError::InternalServerError)?;
@@ -322,7 +340,8 @@ async fn patch_control_file_handler(
parse_request_param(&request, "timeline_id")?,
);
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
let global_timelines = get_global_timelines(&request);
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
let patch_request: patch_control_file::Request = json_request(&mut request).await?;
let response = patch_control_file::handle_request(tli, patch_request)
@@ -341,7 +360,8 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
parse_request_param(&request, "timeline_id")?,
);
let tli = GlobalTimelines::get(ttid)?;
let global_timelines = get_global_timelines(&request);
let tli = global_timelines.get(ttid)?;
tli.write_shared_state()
.await
.sk
@@ -359,6 +379,7 @@ async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body
);
check_permission(&request, Some(ttid.tenant_id))?;
let global_timelines = get_global_timelines(&request);
let from_lsn: Option<Lsn> = parse_query_param(&request, "from_lsn")?;
let until_lsn: Option<Lsn> = parse_query_param(&request, "until_lsn")?;
@@ -371,7 +392,7 @@ async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body
)))?,
};
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
let tli = tli
.wal_residence_guard()
.await
@@ -393,7 +414,8 @@ async fn timeline_backup_partial_reset(request: Request<Body>) -> Result<Respons
);
check_permission(&request, Some(ttid.tenant_id))?;
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
let global_timelines = get_global_timelines(&request);
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
let response = tli
.backup_partial_reset()
@@ -415,7 +437,8 @@ async fn timeline_term_bump_handler(
let request_data: TimelineTermBumpRequest = json_request(&mut request).await?;
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
let global_timelines = get_global_timelines(&request);
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
let response = tli
.term_bump(request_data.term)
.await
@@ -452,7 +475,8 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
standby_horizon: sk_info.standby_horizon.0,
};
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
let global_timelines = get_global_timelines(&request);
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
tli.record_safekeeper_info(proto_sk_info)
.await
.map_err(ApiError::InternalServerError)?;
@@ -506,6 +530,8 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
let dump_term_history = dump_term_history.unwrap_or(true);
let dump_wal_last_modified = dump_wal_last_modified.unwrap_or(dump_all);
let global_timelines = get_global_timelines(&request);
let args = debug_dump::Args {
dump_all,
dump_control_file,
@@ -517,7 +543,7 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
timeline_id,
};
let resp = debug_dump::build(args)
let resp = debug_dump::build(args, global_timelines)
.await
.map_err(ApiError::InternalServerError)?;
@@ -570,7 +596,10 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
}
/// Safekeeper http router.
pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
pub fn make_router(
conf: Arc<SafeKeeperConf>,
global_timelines: Arc<GlobalTimelines>,
) -> RouterBuilder<hyper::Body, ApiError> {
let mut router = endpoint::make_router();
if conf.http_auth.is_some() {
router = router.middleware(auth_middleware(|request| {
@@ -592,7 +621,8 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
// located nearby (/safekeeper/src/http/openapi_spec.yaml).
let auth = conf.http_auth.clone();
router
.data(Arc::new(conf))
.data(conf)
.data(global_timelines)
.data(auth)
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
.get("/profile/cpu", |r| request_span(r, profile_cpu_handler))

View File

@@ -11,7 +11,6 @@ use postgres_backend::QueryError;
use serde::{Deserialize, Serialize};
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::*;
use utils::id::TenantTimelineId;
use crate::handler::SafekeeperPostgresHandler;
use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo};
@@ -21,7 +20,6 @@ use crate::safekeeper::{
use crate::safekeeper::{Term, TermHistory, TermLsn};
use crate::state::TimelinePersistentState;
use crate::timeline::WalResidentTimeline;
use crate::GlobalTimelines;
use postgres_backend::PostgresBackend;
use postgres_ffi::encode_logical_message;
use postgres_ffi::WAL_SEGMENT_SIZE;
@@ -70,7 +68,7 @@ pub async fn handle_json_ctrl<IO: AsyncRead + AsyncWrite + Unpin>(
info!("JSON_CTRL request: {append_request:?}");
// need to init safekeeper state before AppendRequest
let tli = prepare_safekeeper(spg.ttid, append_request.pg_version).await?;
let tli = prepare_safekeeper(spg, append_request.pg_version).await?;
// if send_proposer_elected is true, we need to update local history
if append_request.send_proposer_elected {
@@ -99,20 +97,22 @@ pub async fn handle_json_ctrl<IO: AsyncRead + AsyncWrite + Unpin>(
/// Prepare safekeeper to process append requests without crashes,
/// by sending ProposerGreeting with default server.wal_seg_size.
async fn prepare_safekeeper(
ttid: TenantTimelineId,
spg: &SafekeeperPostgresHandler,
pg_version: u32,
) -> anyhow::Result<WalResidentTimeline> {
let tli = GlobalTimelines::create(
ttid,
ServerInfo {
pg_version,
wal_seg_size: WAL_SEGMENT_SIZE as u32,
system_id: 0,
},
Lsn::INVALID,
Lsn::INVALID,
)
.await?;
let tli = spg
.global_timelines
.create(
spg.ttid,
ServerInfo {
pg_version,
wal_seg_size: WAL_SEGMENT_SIZE as u32,
system_id: 0,
},
Lsn::INVALID,
Lsn::INVALID,
)
.await?;
tli.wal_residence_guard().await
}

View File

@@ -455,6 +455,7 @@ pub struct FullTimelineInfo {
/// Collects metrics for all active timelines.
pub struct TimelineCollector {
global_timelines: Arc<GlobalTimelines>,
descs: Vec<Desc>,
commit_lsn: GenericGaugeVec<AtomicU64>,
backup_lsn: GenericGaugeVec<AtomicU64>,
@@ -478,14 +479,8 @@ pub struct TimelineCollector {
active_timelines_count: IntGauge,
}
impl Default for TimelineCollector {
fn default() -> Self {
Self::new()
}
}
impl TimelineCollector {
pub fn new() -> TimelineCollector {
pub fn new(global_timelines: Arc<GlobalTimelines>) -> TimelineCollector {
let mut descs = Vec::new();
let commit_lsn = GenericGaugeVec::new(
@@ -676,6 +671,7 @@ impl TimelineCollector {
descs.extend(active_timelines_count.desc().into_iter().cloned());
TimelineCollector {
global_timelines,
descs,
commit_lsn,
backup_lsn,
@@ -728,17 +724,18 @@ impl Collector for TimelineCollector {
self.written_wal_seconds.reset();
self.flushed_wal_seconds.reset();
let timelines_count = GlobalTimelines::get_all().len();
let timelines_count = self.global_timelines.get_all().len();
let mut active_timelines_count = 0;
// Prometheus Collector is sync, and data is stored under async lock. To
// bridge the gap with a crutch, collect data in spawned thread with
// local tokio runtime.
let global_timelines = self.global_timelines.clone();
let infos = std::thread::spawn(|| {
let rt = tokio::runtime::Builder::new_current_thread()
.build()
.expect("failed to create rt");
rt.block_on(collect_timeline_metrics())
rt.block_on(collect_timeline_metrics(global_timelines))
})
.join()
.expect("collect_timeline_metrics thread panicked");
@@ -857,9 +854,9 @@ impl Collector for TimelineCollector {
}
}
async fn collect_timeline_metrics() -> Vec<FullTimelineInfo> {
async fn collect_timeline_metrics(global_timelines: Arc<GlobalTimelines>) -> Vec<FullTimelineInfo> {
let mut res = vec![];
let active_timelines = GlobalTimelines::get_global_broker_active_set().get_all();
let active_timelines = global_timelines.get_global_broker_active_set().get_all();
for tli in active_timelines {
if let Some(info) = tli.info_for_metrics().await {

View File

@@ -409,8 +409,9 @@ pub struct DebugDumpResponse {
pub async fn handle_request(
request: Request,
sk_auth_token: Option<SecretString>,
global_timelines: Arc<GlobalTimelines>,
) -> Result<Response> {
let existing_tli = GlobalTimelines::get(TenantTimelineId::new(
let existing_tli = global_timelines.get(TenantTimelineId::new(
request.tenant_id,
request.timeline_id,
));
@@ -453,13 +454,14 @@ pub async fn handle_request(
assert!(status.tenant_id == request.tenant_id);
assert!(status.timeline_id == request.timeline_id);
pull_timeline(status, safekeeper_host, sk_auth_token).await
pull_timeline(status, safekeeper_host, sk_auth_token, global_timelines).await
}
async fn pull_timeline(
status: TimelineStatus,
host: String,
sk_auth_token: Option<SecretString>,
global_timelines: Arc<GlobalTimelines>,
) -> Result<Response> {
let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id);
info!(
@@ -472,7 +474,7 @@ async fn pull_timeline(
status.acceptor_state.epoch
);
let conf = &GlobalTimelines::get_global_config();
let conf = &global_timelines.get_global_config();
let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?;
@@ -531,7 +533,9 @@ async fn pull_timeline(
assert!(status.commit_lsn <= status.flush_lsn);
// Finally, load the timeline.
let _tli = GlobalTimelines::load_temp_timeline(ttid, &tli_dir_path, false).await?;
let _tli = global_timelines
.load_temp_timeline(ttid, &tli_dir_path, false)
.await?;
Ok(Response {
safekeeper_host: host,

View File

@@ -267,6 +267,7 @@ impl SafekeeperPostgresHandler {
pgb_reader: &mut pgb_reader,
peer_addr,
acceptor_handle: &mut acceptor_handle,
global_timelines: self.global_timelines.clone(),
};
// Read first message and create timeline if needed.
@@ -331,6 +332,7 @@ struct NetworkReader<'a, IO> {
// WalAcceptor is spawned when we learn server info from walproposer and
// create timeline; handle is put here.
acceptor_handle: &'a mut Option<JoinHandle<anyhow::Result<()>>>,
global_timelines: Arc<GlobalTimelines>,
}
impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
@@ -350,10 +352,11 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
system_id: greeting.system_id,
wal_seg_size: greeting.wal_seg_size,
};
let tli =
GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID)
.await
.context("create timeline")?;
let tli = self
.global_timelines
.create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID)
.await
.context("create timeline")?;
tli.wal_residence_guard().await?
}
_ => {

View File

@@ -10,7 +10,6 @@ use crate::timeline::WalResidentTimeline;
use crate::wal_reader_stream::WalReaderStreamBuilder;
use crate::wal_service::ConnectionId;
use crate::wal_storage::WalReader;
use crate::GlobalTimelines;
use anyhow::{bail, Context as AnyhowContext};
use bytes::Bytes;
use futures::future::Either;
@@ -400,7 +399,10 @@ impl SafekeeperPostgresHandler {
start_pos: Lsn,
term: Option<Term>,
) -> Result<(), QueryError> {
let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?;
let tli = self
.global_timelines
.get(self.ttid)
.map_err(|e| QueryError::Other(e.into()))?;
let residence_guard = tli.wal_residence_guard().await?;
if let Err(end) = self

View File

@@ -44,8 +44,8 @@ use crate::wal_backup_partial::PartialRemoteSegment;
use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS};
use crate::wal_storage::{Storage as wal_storage_iface, WalReader};
use crate::SafeKeeperConf;
use crate::{debug_dump, timeline_manager, wal_storage};
use crate::{GlobalTimelines, SafeKeeperConf};
/// Things safekeeper should know about timeline state on peers.
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -467,6 +467,7 @@ pub struct Timeline {
walreceivers: Arc<WalReceivers>,
timeline_dir: Utf8PathBuf,
manager_ctl: ManagerCtl,
conf: Arc<SafeKeeperConf>,
/// Hold this gate from code that depends on the Timeline's non-shut-down state. While holding
/// this gate, you must respect [`Timeline::cancel`]
@@ -489,6 +490,7 @@ impl Timeline {
timeline_dir: &Utf8Path,
remote_path: &RemotePath,
shared_state: SharedState,
conf: Arc<SafeKeeperConf>,
) -> Arc<Self> {
let (commit_lsn_watch_tx, commit_lsn_watch_rx) =
watch::channel(shared_state.sk.state().commit_lsn);
@@ -516,6 +518,7 @@ impl Timeline {
gate: Default::default(),
cancel: CancellationToken::default(),
manager_ctl: ManagerCtl::new(),
conf,
broker_active: AtomicBool::new(false),
wal_backup_active: AtomicBool::new(false),
last_removed_segno: AtomicU64::new(0),
@@ -524,11 +527,14 @@ impl Timeline {
}
/// Load existing timeline from disk.
pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
pub fn load_timeline(
conf: Arc<SafeKeeperConf>,
ttid: TenantTimelineId,
) -> Result<Arc<Timeline>> {
let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();
let shared_state = SharedState::restore(conf, &ttid)?;
let timeline_dir = get_timeline_dir(conf, &ttid);
let shared_state = SharedState::restore(conf.as_ref(), &ttid)?;
let timeline_dir = get_timeline_dir(conf.as_ref(), &ttid);
let remote_path = remote_timeline_path(&ttid)?;
Ok(Timeline::new(
@@ -536,6 +542,7 @@ impl Timeline {
&timeline_dir,
&remote_path,
shared_state,
conf,
))
}
@@ -604,8 +611,7 @@ impl Timeline {
// it is cancelled, so WAL storage won't be opened again.
shared_state.sk.close_wal_store();
let conf = GlobalTimelines::get_global_config();
if !only_local && conf.is_wal_backup_enabled() {
if !only_local && self.conf.is_wal_backup_enabled() {
// Note: we concurrently delete remote storage data from multiple
// safekeepers. That's ok, s3 replies 200 if object doesn't exist and we
// do some retries anyway.
@@ -951,7 +957,7 @@ impl WalResidentTimeline {
pub async fn get_walreader(&self, start_lsn: Lsn) -> Result<WalReader> {
let (_, persisted_state) = self.get_state().await;
let enable_remote_read = GlobalTimelines::get_global_config().is_wal_backup_enabled();
let enable_remote_read = self.conf.is_wal_backup_enabled();
WalReader::new(
&self.ttid,
@@ -1061,7 +1067,6 @@ impl ManagerTimeline {
/// Try to switch state Offloaded->Present.
pub(crate) async fn switch_to_present(&self) -> anyhow::Result<()> {
let conf = GlobalTimelines::get_global_config();
let mut shared = self.write_shared_state().await;
// trying to restore WAL storage
@@ -1069,7 +1074,7 @@ impl ManagerTimeline {
&self.ttid,
&self.timeline_dir,
shared.sk.state(),
conf.no_sync,
self.conf.no_sync,
)?;
// updating control file
@@ -1096,7 +1101,7 @@ impl ManagerTimeline {
// now we can switch shared.sk to Present, shouldn't fail
let prev_sk = std::mem::replace(&mut shared.sk, StateSK::Empty);
let cfile_state = prev_sk.take_state();
shared.sk = StateSK::Loaded(SafeKeeper::new(cfile_state, wal_store, conf.my_id)?);
shared.sk = StateSK::Loaded(SafeKeeper::new(cfile_state, wal_store, self.conf.my_id)?);
Ok(())
}

View File

@@ -13,7 +13,6 @@ use crate::{control_file, wal_storage, SafeKeeperConf};
use anyhow::{bail, Context, Result};
use camino::Utf8PathBuf;
use camino_tempfile::Utf8TempDir;
use once_cell::sync::Lazy;
use serde::Serialize;
use std::collections::HashMap;
use std::str::FromStr;
@@ -42,23 +41,16 @@ struct GlobalTimelinesState {
// this map is dropped on restart.
tombstones: HashMap<TenantTimelineId, Instant>,
conf: Option<SafeKeeperConf>,
conf: Arc<SafeKeeperConf>,
broker_active_set: Arc<TimelinesSet>,
global_rate_limiter: RateLimiter,
}
impl GlobalTimelinesState {
/// Get configuration, which must be set once during init.
fn get_conf(&self) -> &SafeKeeperConf {
self.conf
.as_ref()
.expect("GlobalTimelinesState conf is not initialized")
}
/// Get dependencies for a timeline constructor.
fn get_dependencies(&self) -> (SafeKeeperConf, Arc<TimelinesSet>, RateLimiter) {
fn get_dependencies(&self) -> (Arc<SafeKeeperConf>, Arc<TimelinesSet>, RateLimiter) {
(
self.get_conf().clone(),
self.conf.clone(),
self.broker_active_set.clone(),
self.global_rate_limiter.clone(),
)
@@ -82,35 +74,39 @@ impl GlobalTimelinesState {
}
}
static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
Mutex::new(GlobalTimelinesState {
timelines: HashMap::new(),
tombstones: HashMap::new(),
conf: None,
broker_active_set: Arc::new(TimelinesSet::default()),
global_rate_limiter: RateLimiter::new(1, 1),
})
});
/// A zero-sized struct used to manage access to the global timelines map.
pub struct GlobalTimelines;
/// A struct used to manage access to the global timelines map.
pub struct GlobalTimelines {
state: Mutex<GlobalTimelinesState>,
}
impl GlobalTimelines {
/// Create a new instance of the global timelines map.
pub fn new(conf: Arc<SafeKeeperConf>) -> Self {
Self {
state: Mutex::new(GlobalTimelinesState {
timelines: HashMap::new(),
tombstones: HashMap::new(),
conf,
broker_active_set: Arc::new(TimelinesSet::default()),
global_rate_limiter: RateLimiter::new(1, 1),
}),
}
}
/// Inject dependencies needed for the timeline constructors and load all timelines to memory.
pub async fn init(conf: SafeKeeperConf) -> Result<()> {
pub async fn init(&self) -> Result<()> {
// clippy isn't smart enough to understand that drop(state) releases the
// lock, so use explicit block
let tenants_dir = {
let mut state = TIMELINES_STATE.lock().unwrap();
let mut state = self.state.lock().unwrap();
state.global_rate_limiter = RateLimiter::new(
conf.partial_backup_concurrency,
state.conf.partial_backup_concurrency,
DEFAULT_EVICTION_CONCURRENCY,
);
state.conf = Some(conf);
// Iterate through all directories and load tenants for all directories
// named as a valid tenant_id.
state.get_conf().workdir.clone()
state.conf.workdir.clone()
};
let mut tenant_count = 0;
for tenants_dir_entry in std::fs::read_dir(&tenants_dir)
@@ -122,7 +118,7 @@ impl GlobalTimelines {
TenantId::from_str(tenants_dir_entry.file_name().to_str().unwrap_or(""))
{
tenant_count += 1;
GlobalTimelines::load_tenant_timelines(tenant_id).await?;
self.load_tenant_timelines(tenant_id).await?;
}
}
Err(e) => error!(
@@ -135,7 +131,7 @@ impl GlobalTimelines {
info!(
"found {} tenants directories, successfully loaded {} timelines",
tenant_count,
TIMELINES_STATE.lock().unwrap().timelines.len()
self.state.lock().unwrap().timelines.len()
);
Ok(())
}
@@ -143,13 +139,13 @@ impl GlobalTimelines {
/// Loads all timelines for the given tenant to memory. Returns fs::read_dir
/// errors if any.
///
/// It is async, but TIMELINES_STATE lock is sync and there is no important
/// It is async, but self.state lock is sync and there is no important
/// reason to make it async (it is always held for a short while), so we
/// just lock and unlock it for each timeline -- this function is called
/// during init when nothing else is running, so this is fine.
async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> {
async fn load_tenant_timelines(&self, tenant_id: TenantId) -> Result<()> {
let (conf, broker_active_set, partial_backup_rate_limiter) = {
let state = TIMELINES_STATE.lock().unwrap();
let state = self.state.lock().unwrap();
state.get_dependencies()
};
@@ -163,10 +159,10 @@ impl GlobalTimelines {
TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or(""))
{
let ttid = TenantTimelineId::new(tenant_id, timeline_id);
match Timeline::load_timeline(&conf, ttid) {
match Timeline::load_timeline(conf.clone(), ttid) {
Ok(tli) => {
let mut shared_state = tli.write_shared_state().await;
TIMELINES_STATE
self.state
.lock()
.unwrap()
.timelines
@@ -200,29 +196,30 @@ impl GlobalTimelines {
}
/// Get the number of timelines in the map.
pub fn timelines_count() -> usize {
TIMELINES_STATE.lock().unwrap().timelines.len()
pub fn timelines_count(&self) -> usize {
self.state.lock().unwrap().timelines.len()
}
/// Get the global safekeeper config.
pub fn get_global_config() -> SafeKeeperConf {
TIMELINES_STATE.lock().unwrap().get_conf().clone()
pub fn get_global_config(&self) -> Arc<SafeKeeperConf> {
self.state.lock().unwrap().conf.clone()
}
pub fn get_global_broker_active_set() -> Arc<TimelinesSet> {
TIMELINES_STATE.lock().unwrap().broker_active_set.clone()
pub fn get_global_broker_active_set(&self) -> Arc<TimelinesSet> {
self.state.lock().unwrap().broker_active_set.clone()
}
/// Create a new timeline with the given id. If the timeline already exists, returns
/// an existing timeline.
pub(crate) async fn create(
&self,
ttid: TenantTimelineId,
server_info: ServerInfo,
commit_lsn: Lsn,
local_start_lsn: Lsn,
) -> Result<Arc<Timeline>> {
let (conf, _, _) = {
let state = TIMELINES_STATE.lock().unwrap();
let state = self.state.lock().unwrap();
if let Ok(timeline) = state.get(&ttid) {
// Timeline already exists, return it.
return Ok(timeline);
@@ -245,7 +242,7 @@ impl GlobalTimelines {
let state =
TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?;
control_file::FileStorage::create_new(&tmp_dir_path, state, conf.no_sync).await?;
let timeline = GlobalTimelines::load_temp_timeline(ttid, &tmp_dir_path, true).await?;
let timeline = self.load_temp_timeline(ttid, &tmp_dir_path, true).await?;
Ok(timeline)
}
@@ -261,13 +258,14 @@ impl GlobalTimelines {
/// 2) move the directory and load the timeline
/// 3) take lock again and insert the timeline into the global map.
pub async fn load_temp_timeline(
&self,
ttid: TenantTimelineId,
tmp_path: &Utf8PathBuf,
check_tombstone: bool,
) -> Result<Arc<Timeline>> {
// Check for existence and mark that we're creating it.
let (conf, broker_active_set, partial_backup_rate_limiter) = {
let mut state = TIMELINES_STATE.lock().unwrap();
let mut state = self.state.lock().unwrap();
match state.timelines.get(&ttid) {
Some(GlobalMapTimeline::CreationInProgress) => {
bail!(TimelineError::CreationInProgress(ttid));
@@ -295,10 +293,10 @@ impl GlobalTimelines {
};
// Do the actual move and reflect the result in the map.
match GlobalTimelines::install_temp_timeline(ttid, tmp_path, &conf).await {
match GlobalTimelines::install_temp_timeline(ttid, tmp_path, conf.clone()).await {
Ok(timeline) => {
let mut timeline_shared_state = timeline.write_shared_state().await;
let mut state = TIMELINES_STATE.lock().unwrap();
let mut state = self.state.lock().unwrap();
assert!(matches!(
state.timelines.get(&ttid),
Some(GlobalMapTimeline::CreationInProgress)
@@ -319,7 +317,7 @@ impl GlobalTimelines {
}
Err(e) => {
// Init failed, remove the marker from the map
let mut state = TIMELINES_STATE.lock().unwrap();
let mut state = self.state.lock().unwrap();
assert!(matches!(
state.timelines.get(&ttid),
Some(GlobalMapTimeline::CreationInProgress)
@@ -334,10 +332,10 @@ impl GlobalTimelines {
async fn install_temp_timeline(
ttid: TenantTimelineId,
tmp_path: &Utf8PathBuf,
conf: &SafeKeeperConf,
conf: Arc<SafeKeeperConf>,
) -> Result<Arc<Timeline>> {
let tenant_path = get_tenant_dir(conf, &ttid.tenant_id);
let timeline_path = get_timeline_dir(conf, &ttid);
let tenant_path = get_tenant_dir(conf.as_ref(), &ttid.tenant_id);
let timeline_path = get_timeline_dir(conf.as_ref(), &ttid);
// We must have already checked that timeline doesn't exist in the map,
// but there might be existing datadir: if timeline is corrupted it is
@@ -382,9 +380,9 @@ impl GlobalTimelines {
/// Get a timeline from the global map. If it's not present, it doesn't exist on disk,
/// or was corrupted and couldn't be loaded on startup. Returned timeline is always valid,
/// i.e. loaded in memory and not cancelled.
pub(crate) fn get(ttid: TenantTimelineId) -> Result<Arc<Timeline>, TimelineError> {
pub(crate) fn get(&self, ttid: TenantTimelineId) -> Result<Arc<Timeline>, TimelineError> {
let tli_res = {
let state = TIMELINES_STATE.lock().unwrap();
let state = self.state.lock().unwrap();
state.get(&ttid)
};
match tli_res {
@@ -399,8 +397,8 @@ impl GlobalTimelines {
}
/// Returns all timelines. This is used for background timeline processes.
pub fn get_all() -> Vec<Arc<Timeline>> {
let global_lock = TIMELINES_STATE.lock().unwrap();
pub fn get_all(&self) -> Vec<Arc<Timeline>> {
let global_lock = self.state.lock().unwrap();
global_lock
.timelines
.values()
@@ -419,8 +417,8 @@ impl GlobalTimelines {
/// Returns all timelines belonging to a given tenant. Used for deleting all timelines of a tenant,
/// and that's why it can return cancelled timelines, to retry deleting them.
fn get_all_for_tenant(tenant_id: TenantId) -> Vec<Arc<Timeline>> {
let global_lock = TIMELINES_STATE.lock().unwrap();
fn get_all_for_tenant(&self, tenant_id: TenantId) -> Vec<Arc<Timeline>> {
let global_lock = self.state.lock().unwrap();
global_lock
.timelines
.values()
@@ -435,11 +433,12 @@ impl GlobalTimelines {
/// Cancels timeline, then deletes the corresponding data directory.
/// If only_local, doesn't remove WAL segments in remote storage.
pub(crate) async fn delete(
&self,
ttid: &TenantTimelineId,
only_local: bool,
) -> Result<TimelineDeleteForceResult> {
let tli_res = {
let state = TIMELINES_STATE.lock().unwrap();
let state = self.state.lock().unwrap();
if state.tombstones.contains_key(ttid) {
// Presence of a tombstone guarantees that a previous deletion has completed and there is no work to do.
@@ -472,7 +471,7 @@ impl GlobalTimelines {
}
Err(_) => {
// Timeline is not memory, but it may still exist on disk in broken state.
let dir_path = get_timeline_dir(TIMELINES_STATE.lock().unwrap().get_conf(), ttid);
let dir_path = get_timeline_dir(self.state.lock().unwrap().conf.as_ref(), ttid);
let dir_existed = delete_dir(dir_path)?;
Ok(TimelineDeleteForceResult {
@@ -485,7 +484,7 @@ impl GlobalTimelines {
// Finalize deletion, by dropping Timeline objects and storing smaller tombstones. The tombstones
// are used to prevent still-running computes from re-creating the same timeline when they send data,
// and to speed up repeated deletion calls by avoiding re-listing objects.
TIMELINES_STATE.lock().unwrap().delete(*ttid);
self.state.lock().unwrap().delete(*ttid);
result
}
@@ -497,17 +496,18 @@ impl GlobalTimelines {
///
/// If only_local, doesn't remove WAL segments in remote storage.
pub async fn delete_force_all_for_tenant(
&self,
tenant_id: &TenantId,
only_local: bool,
) -> Result<HashMap<TenantTimelineId, TimelineDeleteForceResult>> {
info!("deleting all timelines for tenant {}", tenant_id);
let to_delete = Self::get_all_for_tenant(*tenant_id);
let to_delete = self.get_all_for_tenant(*tenant_id);
let mut err = None;
let mut deleted = HashMap::new();
for tli in &to_delete {
match Self::delete(&tli.ttid, only_local).await {
match self.delete(&tli.ttid, only_local).await {
Ok(result) => {
deleted.insert(tli.ttid, result);
}
@@ -529,15 +529,15 @@ impl GlobalTimelines {
// so the directory may be not empty. In this case timelines will have bad state
// and timeline background jobs can panic.
delete_dir(get_tenant_dir(
TIMELINES_STATE.lock().unwrap().get_conf(),
self.state.lock().unwrap().conf.as_ref(),
tenant_id,
))?;
Ok(deleted)
}
pub fn housekeeping(tombstone_ttl: &Duration) {
let mut state = TIMELINES_STATE.lock().unwrap();
pub fn housekeeping(&self, tombstone_ttl: &Duration) {
let mut state = self.state.lock().unwrap();
// We keep tombstones long enough to have a good chance of preventing rogue computes from re-creating deleted
// timelines. If a compute kept running for longer than this TTL (or across a safekeeper restart) then they

View File

@@ -4,6 +4,7 @@
//!
use anyhow::{Context, Result};
use postgres_backend::QueryError;
use std::sync::Arc;
use std::time::Duration;
use tokio::net::TcpStream;
use tokio_io_timeout::TimeoutReader;
@@ -11,9 +12,9 @@ use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::{auth::Scope, measured_stream::MeasuredStream};
use crate::handler::SafekeeperPostgresHandler;
use crate::metrics::TrafficMetrics;
use crate::SafeKeeperConf;
use crate::{handler::SafekeeperPostgresHandler, GlobalTimelines};
use postgres_backend::{AuthType, PostgresBackend};
/// Accept incoming TCP connections and spawn them into a background thread.
@@ -22,9 +23,10 @@ use postgres_backend::{AuthType, PostgresBackend};
/// to any tenant are allowed) or Tenant (only tokens giving access to specific
/// tenant are allowed). Doesn't matter if auth is disabled in conf.
pub async fn task_main(
conf: SafeKeeperConf,
conf: Arc<SafeKeeperConf>,
pg_listener: std::net::TcpListener,
allowed_auth_scope: Scope,
global_timelines: Arc<GlobalTimelines>,
) -> anyhow::Result<()> {
// Tokio's from_std won't do this for us, per its comment.
pg_listener.set_nonblocking(true)?;
@@ -37,10 +39,10 @@ pub async fn task_main(
debug!("accepted connection from {}", peer_addr);
let conf = conf.clone();
let conn_id = issue_connection_id(&mut connection_count);
let global_timelines = global_timelines.clone();
tokio::spawn(
async move {
if let Err(err) = handle_socket(socket, conf, conn_id, allowed_auth_scope).await {
if let Err(err) = handle_socket(socket, conf, conn_id, allowed_auth_scope, global_timelines).await {
error!("connection handler exited: {}", err);
}
}
@@ -53,9 +55,10 @@ pub async fn task_main(
///
async fn handle_socket(
socket: TcpStream,
conf: SafeKeeperConf,
conf: Arc<SafeKeeperConf>,
conn_id: ConnectionId,
allowed_auth_scope: Scope,
global_timelines: Arc<GlobalTimelines>,
) -> Result<(), QueryError> {
socket.set_nodelay(true)?;
let peer_addr = socket.peer_addr()?;
@@ -96,8 +99,13 @@ async fn handle_socket(
Some(_) => AuthType::NeonJWT,
};
let auth_pair = auth_key.map(|key| (allowed_auth_scope, key));
let mut conn_handler =
SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone()), auth_pair);
let mut conn_handler = SafekeeperPostgresHandler::new(
conf,
conn_id,
Some(traffic_metrics.clone()),
auth_pair,
global_timelines,
);
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
// libpq protocol between safekeeper and walproposer / pageserver
// We don't use shutdown.

View File

@@ -3,7 +3,7 @@ use std::{borrow::Cow, fmt::Debug, fmt::Display};
use tokio_util::sync::CancellationToken;
use utils::id::NodeId;
pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 32;
pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 64;
#[derive(Copy, Clone)]
pub(crate) struct Drain {

View File

@@ -18,8 +18,9 @@ use pageserver_api::controller_api::{
ShardsPreferredAzsRequest, TenantCreateRequest,
};
use pageserver_api::models::{
TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest,
TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest,
TenantShardSplitRequest, TenantTimeTravelRequest, TimelineArchivalConfigRequest,
TimelineCreateRequest,
};
use pageserver_api::shard::TenantShardId;
use pageserver_client::{mgmt_api, BlockUnblock};
@@ -208,6 +209,27 @@ async fn handle_tenant_location_config(
)
}
async fn handle_tenant_config_patch(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::PageServerApi)?;
let mut req = match maybe_forward(req).await {
ForwardOutcome::Forwarded(res) => {
return res;
}
ForwardOutcome::NotForwarded(req) => req,
};
let config_req = json_request::<TenantConfigPatchRequest>(&mut req).await?;
json_response(
StatusCode::OK,
service.tenant_config_patch(config_req).await?,
)
}
async fn handle_tenant_config_set(
service: Arc<Service>,
req: Request<Body>,
@@ -857,6 +879,21 @@ async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, A
json_response(StatusCode::ACCEPTED, ())
}
async fn handle_safekeeper_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Infra)?;
let req = match maybe_forward(req).await {
ForwardOutcome::Forwarded(res) => {
return res;
}
ForwardOutcome::NotForwarded(req) => req,
};
let state = get_state(&req);
let safekeepers = state.service.safekeepers_list().await?;
json_response(StatusCode::OK, safekeepers)
}
async fn handle_metadata_health_update(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Scrubber)?;
@@ -1181,7 +1218,7 @@ impl From<ReconcileError> for ApiError {
///
/// Not used by anything except manual testing.
async fn handle_get_safekeeper(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
check_permissions(&req, Scope::Infra)?;
let id = parse_request_param::<i64>(&req, "id")?;
@@ -1199,7 +1236,7 @@ async fn handle_get_safekeeper(req: Request<Body>) -> Result<Response<Body>, Api
match res {
Ok(b) => json_response(StatusCode::OK, b),
Err(crate::persistence::DatabaseError::Query(diesel::result::Error::NotFound)) => {
Err(ApiError::NotFound("unknown instance_id".into()))
Err(ApiError::NotFound("unknown instance id".into()))
}
Err(other) => Err(other.into()),
}
@@ -1795,6 +1832,21 @@ pub fn make_router(
RequestName("control_v1_metadata_health_list_outdated"),
)
})
// Safekeepers
.get("/control/v1/safekeeper", |r| {
named_request_span(
r,
handle_safekeeper_list,
RequestName("control_v1_safekeeper_list"),
)
})
.get("/control/v1/safekeeper/:id", |r| {
named_request_span(r, handle_get_safekeeper, RequestName("v1_safekeeper"))
})
.post("/control/v1/safekeeper/:id", |r| {
// id is in the body
named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper"))
})
// Tenant Shard operations
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
tenant_service_handler(
@@ -1847,13 +1899,6 @@ pub fn make_router(
.put("/control/v1/step_down", |r| {
named_request_span(r, handle_step_down, RequestName("control_v1_step_down"))
})
.get("/control/v1/safekeeper/:id", |r| {
named_request_span(r, handle_get_safekeeper, RequestName("v1_safekeeper"))
})
.post("/control/v1/safekeeper/:id", |r| {
// id is in the body
named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper"))
})
// Tenant operations
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
@@ -1863,6 +1908,13 @@ pub fn make_router(
.delete("/v1/tenant/:tenant_id", |r| {
tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant"))
})
.patch("/v1/tenant/config", |r| {
tenant_service_handler(
r,
handle_tenant_config_patch,
RequestName("v1_tenant_config"),
)
})
.put("/v1/tenant/config", |r| {
tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config"))
})

View File

@@ -104,6 +104,7 @@ pub(crate) enum DatabaseOperation {
ListMetadataHealth,
ListMetadataHealthUnhealthy,
ListMetadataHealthOutdated,
ListSafekeepers,
GetLeader,
UpdateLeader,
SetPreferredAzs,
@@ -636,6 +637,13 @@ impl Persistence {
.into_boxed(),
};
// Clear generation_pageserver if we are moving into a state where we won't have
// any attached pageservers.
let input_generation_pageserver = match input_placement_policy {
None | Some(PlacementPolicy::Attached(_)) => None,
Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None),
};
#[derive(AsChangeset)]
#[diesel(table_name = crate::schema::tenant_shards)]
struct ShardUpdate {
@@ -643,6 +651,7 @@ impl Persistence {
placement_policy: Option<String>,
config: Option<String>,
scheduling_policy: Option<String>,
generation_pageserver: Option<Option<i64>>,
}
let update = ShardUpdate {
@@ -655,6 +664,7 @@ impl Persistence {
.map(|c| serde_json::to_string(&c).unwrap()),
scheduling_policy: input_scheduling_policy
.map(|p| serde_json::to_string(&p).unwrap()),
generation_pageserver: input_generation_pageserver,
};
query.set(update).execute(conn)?;
@@ -1002,6 +1012,22 @@ impl Persistence {
Ok(())
}
/// At startup, populate the list of nodes which our shards may be placed on
pub(crate) async fn list_safekeepers(&self) -> DatabaseResult<Vec<SafekeeperPersistence>> {
let safekeepers: Vec<SafekeeperPersistence> = self
.with_measured_conn(
DatabaseOperation::ListNodes,
move |conn| -> DatabaseResult<_> {
Ok(crate::schema::safekeepers::table.load::<SafekeeperPersistence>(conn)?)
},
)
.await?;
tracing::info!("list_safekeepers: loaded {} nodes", safekeepers.len());
Ok(safekeepers)
}
pub(crate) async fn safekeeper_get(
&self,
id: i64,

View File

@@ -29,6 +29,19 @@ diesel::table! {
}
}
diesel::table! {
safekeepers (id) {
id -> Int8,
region_id -> Text,
version -> Int8,
host -> Text,
port -> Int4,
active -> Bool,
http_port -> Int4,
availability_zone_id -> Text,
}
}
diesel::table! {
tenant_shards (tenant_id, shard_number, shard_count) {
tenant_id -> Varchar,
@@ -45,18 +58,10 @@ diesel::table! {
}
}
diesel::allow_tables_to_appear_in_same_query!(controllers, metadata_health, nodes, tenant_shards,);
diesel::table! {
safekeepers {
id -> Int8,
region_id -> Text,
version -> Int8,
instance_id -> Text,
host -> Text,
port -> Int4,
active -> Bool,
http_port -> Int4,
availability_zone_id -> Text,
}
}
diesel::allow_tables_to_appear_in_same_query!(
controllers,
metadata_health,
nodes,
safekeepers,
tenant_shards,
);

View File

@@ -52,8 +52,8 @@ use pageserver_api::{
TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse,
},
models::{
SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest,
TopTenantShardsRequest,
SecondaryProgress, TenantConfigPatchRequest, TenantConfigRequest,
TimelineArchivalConfigRequest, TopTenantShardsRequest,
},
};
use reqwest::StatusCode;
@@ -100,6 +100,8 @@ use crate::{
use context_iterator::TenantShardContextIterator;
const WAITER_FILL_DRAIN_POLL_TIMEOUT: Duration = Duration::from_millis(500);
// For operations that should be quick, like attaching a new tenant
const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
@@ -139,6 +141,7 @@ enum TenantOperations {
Create,
LocationConfig,
ConfigSet,
ConfigPatch,
TimeTravelRemoteStorage,
Delete,
UpdatePolicy,
@@ -513,6 +516,9 @@ struct ShardUpdate {
/// If this is None, generation is not updated.
generation: Option<Generation>,
/// If this is None, scheduling policy is not updated.
scheduling_policy: Option<ShardSchedulingPolicy>,
}
enum StopReconciliationsReason {
@@ -789,7 +795,7 @@ impl Service {
node_list_futs.push({
async move {
tracing::info!("Scanning shards on node {node}...");
let timeout = Duration::from_secs(1);
let timeout = Duration::from_secs(5);
let response = node
.with_client_retries(
|client| async move { client.list_location_config().await },
@@ -2376,6 +2382,23 @@ impl Service {
}
};
// Ordinarily we do not update scheduling policy, but when making major changes
// like detaching or demoting to secondary-only, we need to force the scheduling
// mode to Active, or the caller's expected outcome (detach it) will not happen.
let scheduling_policy = match req.config.mode {
LocationConfigMode::Detached | LocationConfigMode::Secondary => {
// Special case: when making major changes like detaching or demoting to secondary-only,
// we need to force the scheduling mode to Active, or nothing will happen.
Some(ShardSchedulingPolicy::Active)
}
LocationConfigMode::AttachedMulti
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
// While attached, continue to respect whatever the existing scheduling mode is.
None
}
};
let mut create = true;
for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
// Saw an existing shard: this is not a creation
@@ -2401,6 +2424,7 @@ impl Service {
placement_policy: placement_policy.clone(),
tenant_config: req.config.tenant_conf.clone(),
generation: set_generation,
scheduling_policy,
});
}
@@ -2497,6 +2521,7 @@ impl Service {
placement_policy,
tenant_config,
generation,
scheduling_policy,
} in &updates
{
self.persistence
@@ -2505,7 +2530,7 @@ impl Service {
Some(placement_policy.clone()),
Some(tenant_config.clone()),
*generation,
None,
*scheduling_policy,
)
.await?;
}
@@ -2521,6 +2546,7 @@ impl Service {
placement_policy,
tenant_config,
generation: update_generation,
scheduling_policy,
} in updates
{
let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
@@ -2539,6 +2565,10 @@ impl Service {
shard.generation = Some(generation);
}
if let Some(scheduling_policy) = scheduling_policy {
shard.set_scheduling_policy(scheduling_policy);
}
shard.schedule(scheduler, &mut schedule_context)?;
let maybe_waiter = self.maybe_reconcile_shard(shard, nodes);
@@ -2575,6 +2605,55 @@ impl Service {
Ok(result)
}
pub(crate) async fn tenant_config_patch(
&self,
req: TenantConfigPatchRequest,
) -> Result<(), ApiError> {
let _tenant_lock = trace_exclusive_lock(
&self.tenant_op_locks,
req.tenant_id,
TenantOperations::ConfigPatch,
)
.await;
let tenant_id = req.tenant_id;
let patch = req.config;
let base = {
let locked = self.inner.read().unwrap();
let shards = locked
.tenants
.range(TenantShardId::tenant_range(req.tenant_id));
let mut configs = shards.map(|(_sid, shard)| &shard.config).peekable();
let first = match configs.peek() {
Some(first) => (*first).clone(),
None => {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant {} not found", req.tenant_id).into(),
));
}
};
if !configs.all_equal() {
tracing::error!("Tenant configs for {} are mismatched. ", req.tenant_id);
// This can't happen because we atomically update the database records
// of all shards to the new value in [`Self::set_tenant_config_and_reconcile`].
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"Tenant configs for {} are mismatched",
req.tenant_id
)));
}
first
};
let updated_config = base.apply_patch(patch);
self.set_tenant_config_and_reconcile(tenant_id, updated_config)
.await
}
pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> {
// We require an exclusive lock, because we are updating persistent and in-memory state
let _tenant_lock = trace_exclusive_lock(
@@ -2584,12 +2663,32 @@ impl Service {
)
.await;
let tenant_id = req.tenant_id;
let config = req.config;
let tenant_exists = {
let locked = self.inner.read().unwrap();
let mut r = locked
.tenants
.range(TenantShardId::tenant_range(req.tenant_id));
r.next().is_some()
};
if !tenant_exists {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant {} not found", req.tenant_id).into(),
));
}
self.set_tenant_config_and_reconcile(req.tenant_id, req.config)
.await
}
async fn set_tenant_config_and_reconcile(
&self,
tenant_id: TenantId,
config: TenantConfig,
) -> Result<(), ApiError> {
self.persistence
.update_tenant_shard(
TenantFilter::Tenant(req.tenant_id),
TenantFilter::Tenant(tenant_id),
None,
Some(config.clone()),
None,
@@ -2992,9 +3091,17 @@ impl Service {
let TenantPolicyRequest {
placement,
scheduling,
mut scheduling,
} = req;
if let Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) = placement {
// When someone configures a tenant to detach, we force the scheduling policy to enable
// this to take effect.
if scheduling.is_none() {
scheduling = Some(ShardSchedulingPolicy::Active);
}
}
self.persistence
.update_tenant_shard(
TenantFilter::Tenant(tenant_id),
@@ -6693,7 +6800,7 @@ impl Service {
}
waiters = self
.await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
.await_waiters_remainder(waiters, WAITER_FILL_DRAIN_POLL_TIMEOUT)
.await;
failpoint_support::sleep_millis_async!("sleepy-drain-loop", &cancel);
@@ -6946,7 +7053,7 @@ impl Service {
}
waiters = self
.await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
.await_waiters_remainder(waiters, WAITER_FILL_DRAIN_POLL_TIMEOUT)
.await;
}
@@ -7078,6 +7185,12 @@ impl Service {
global_observed
}
pub(crate) async fn safekeepers_list(
&self,
) -> Result<Vec<crate::persistence::SafekeeperPersistence>, DatabaseError> {
self.persistence.list_safekeepers().await
}
pub(crate) async fn get_safekeeper(
&self,
id: i64,

View File

@@ -533,8 +533,9 @@ async fn list_timeline_blobs_impl(
}
pub(crate) struct RemoteTenantManifestInfo {
pub(crate) latest_generation: Option<Generation>,
pub(crate) manifests: Vec<(Generation, ListingObject)>,
pub(crate) generation: Generation,
pub(crate) manifest: TenantManifest,
pub(crate) listing_object: ListingObject,
}
pub(crate) enum ListTenantManifestResult {
@@ -543,7 +544,10 @@ pub(crate) enum ListTenantManifestResult {
#[allow(dead_code)]
unknown_keys: Vec<ListingObject>,
},
NoErrors(RemoteTenantManifestInfo),
NoErrors {
latest_generation: Option<RemoteTenantManifestInfo>,
manifests: Vec<(Generation, ListingObject)>,
},
}
/// Lists the tenant manifests in remote storage and parses the latest one, returning a [`ListTenantManifestResult`] object.
@@ -592,14 +596,6 @@ pub(crate) async fn list_tenant_manifests(
unknown_keys.push(obj);
}
if manifests.is_empty() {
tracing::debug!("No manifest for timeline.");
return Ok(ListTenantManifestResult::WithErrors {
errors,
unknown_keys,
});
}
if !unknown_keys.is_empty() {
errors.push(((*prefix_str).to_owned(), "unknown keys listed".to_string()));
@@ -609,6 +605,15 @@ pub(crate) async fn list_tenant_manifests(
});
}
if manifests.is_empty() {
tracing::debug!("No manifest for timeline.");
return Ok(ListTenantManifestResult::NoErrors {
latest_generation: None,
manifests,
});
}
// Find the manifest with the highest generation
let (latest_generation, latest_listing_object) = manifests
.iter()
@@ -616,6 +621,8 @@ pub(crate) async fn list_tenant_manifests(
.map(|(g, obj)| (*g, obj.clone()))
.unwrap();
manifests.retain(|(gen, _obj)| gen != &latest_generation);
let manifest_bytes =
match download_object_with_retries(remote_client, &latest_listing_object.key).await {
Ok(bytes) => bytes,
@@ -634,13 +641,15 @@ pub(crate) async fn list_tenant_manifests(
};
match TenantManifest::from_json_bytes(&manifest_bytes) {
Ok(_manifest) => {
return Ok(ListTenantManifestResult::NoErrors(
RemoteTenantManifestInfo {
latest_generation: Some(latest_generation),
manifests,
},
));
Ok(manifest) => {
return Ok(ListTenantManifestResult::NoErrors {
latest_generation: Some(RemoteTenantManifestInfo {
generation: latest_generation,
manifest,
listing_object: latest_listing_object,
}),
manifests,
});
}
Err(parse_error) => errors.push((
latest_listing_object.key.get_path().as_str().to_owned(),

View File

@@ -459,12 +459,10 @@ pub async fn get_timeline_objects(
Ok(list.keys)
}
const MAX_KEYS_PER_DELETE: usize = 1000;
/// Drain a buffer of keys into DeleteObjects requests
///
/// If `drain` is true, drains keys completely; otherwise stops when <
/// MAX_KEYS_PER_DELETE keys are left.
/// `max_keys_per_delete`` keys are left.
/// `num_deleted` returns number of deleted keys.
async fn do_delete(
remote_client: &GenericRemoteStorage,
@@ -474,9 +472,10 @@ async fn do_delete(
progress_tracker: &mut DeletionProgressTracker,
) -> anyhow::Result<()> {
let cancel = CancellationToken::new();
while (!keys.is_empty() && drain) || (keys.len() >= MAX_KEYS_PER_DELETE) {
let max_keys_per_delete = remote_client.max_keys_per_delete();
while (!keys.is_empty() && drain) || (keys.len() >= max_keys_per_delete) {
let request_keys =
keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len())));
keys.split_off(keys.len() - (std::cmp::min(max_keys_per_delete, keys.len())));
let request_keys: Vec<RemotePath> = request_keys.into_iter().map(|o| o.key).collect();
@@ -617,7 +616,7 @@ pub async fn purge_garbage(
}
objects_to_delete.append(&mut object_list);
if objects_to_delete.len() >= MAX_KEYS_PER_DELETE {
if objects_to_delete.len() >= remote_client.max_keys_per_delete() {
do_delete(
&remote_client,
&mut objects_to_delete,

View File

@@ -268,7 +268,7 @@ impl BucketConfig {
config.bucket_name, config.bucket_region
),
RemoteStorageKind::AzureContainer(config) => format!(
"bucket {}, storage account {:?}, region {}",
"container {}, storage account {:?}, region {}",
config.container_name, config.storage_account, config.container_region
),
}

View File

@@ -86,6 +86,8 @@ enum Command {
/// For safekeeper node_kind only, json list of timelines and their lsn info
#[arg(long, default_value = None)]
timeline_lsns: Option<String>,
#[arg(long, default_value_t = false)]
verbose: bool,
},
TenantSnapshot {
#[arg(long = "tenant-id")]
@@ -166,6 +168,7 @@ async fn main() -> anyhow::Result<()> {
dump_db_connstr,
dump_db_table,
timeline_lsns,
verbose,
} => {
if let NodeKind::Safekeeper = node_kind {
let db_or_list = match (timeline_lsns, dump_db_connstr) {
@@ -203,6 +206,7 @@ async fn main() -> anyhow::Result<()> {
tenant_ids,
json,
post_to_storcon,
verbose,
cli.exit_code,
)
.await
@@ -313,6 +317,7 @@ pub async fn run_cron_job(
Vec::new(),
true,
post_to_storcon,
false, // default to non-verbose mode
exit_code,
)
.await?;
@@ -362,12 +367,13 @@ pub async fn scan_pageserver_metadata_cmd(
tenant_shard_ids: Vec<TenantShardId>,
json: bool,
post_to_storcon: bool,
verbose: bool,
exit_code: bool,
) -> anyhow::Result<()> {
if controller_client.is_none() && post_to_storcon {
return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
}
match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids).await {
match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids, verbose).await {
Err(e) => {
tracing::error!("Failed: {e}");
Err(e)

View File

@@ -4,11 +4,13 @@ use std::time::Duration;
use crate::checks::{
list_tenant_manifests, list_timeline_blobs, BlobDataParseResult, ListTenantManifestResult,
RemoteTenantManifestInfo,
};
use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, MAX_RETRIES};
use futures_util::{StreamExt, TryStreamExt};
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
use pageserver::tenant::remote_timeline_client::manifest::OffloadedTimelineManifest;
use pageserver::tenant::remote_timeline_client::{
parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path,
};
@@ -527,7 +529,7 @@ async fn gc_tenant_manifests(
target: &RootTarget,
mode: GcMode,
tenant_shard_id: TenantShardId,
) -> anyhow::Result<GcSummary> {
) -> anyhow::Result<(GcSummary, Option<RemoteTenantManifestInfo>)> {
let mut gc_summary = GcSummary::default();
match list_tenant_manifests(remote_client, tenant_shard_id, target).await? {
ListTenantManifestResult::WithErrors {
@@ -537,33 +539,35 @@ async fn gc_tenant_manifests(
for (_key, error) in errors {
tracing::warn!(%tenant_shard_id, "list_tenant_manifests: {error}");
}
Ok((gc_summary, None))
}
ListTenantManifestResult::NoErrors(mut manifest_info) => {
let Some(latest_gen) = manifest_info.latest_generation else {
return Ok(gc_summary);
ListTenantManifestResult::NoErrors {
latest_generation,
mut manifests,
} => {
let Some(latest_generation) = latest_generation else {
return Ok((gc_summary, None));
};
manifest_info
.manifests
.sort_by_key(|(generation, _obj)| *generation);
manifests.sort_by_key(|(generation, _obj)| *generation);
// skip the two latest generations (they don't neccessarily have to be 1 apart from each other)
let candidates = manifest_info.manifests.iter().rev().skip(2);
let candidates = manifests.iter().rev().skip(2);
for (_generation, key) in candidates {
maybe_delete_tenant_manifest(
remote_client,
&min_age,
latest_gen,
latest_generation.generation,
key,
mode,
&mut gc_summary,
)
.instrument(
info_span!("maybe_delete_tenant_manifest", %tenant_shard_id, ?latest_gen, %key.key),
info_span!("maybe_delete_tenant_manifest", %tenant_shard_id, ?latest_generation.generation, %key.key),
)
.await;
}
Ok((gc_summary, Some(latest_generation)))
}
}
Ok(gc_summary)
}
async fn gc_timeline(
@@ -573,6 +577,7 @@ async fn gc_timeline(
mode: GcMode,
ttid: TenantShardTimelineId,
accumulator: &Arc<std::sync::Mutex<TenantRefAccumulator>>,
tenant_manifest_info: Arc<Option<RemoteTenantManifestInfo>>,
) -> anyhow::Result<GcSummary> {
let mut summary = GcSummary::default();
let data = list_timeline_blobs(remote_client, ttid, target).await?;
@@ -597,6 +602,60 @@ async fn gc_timeline(
}
};
if let Some(tenant_manifest_info) = &*tenant_manifest_info {
// TODO: this is O(n^2) in the number of offloaded timelines. Do a hashmap lookup instead.
let maybe_offloaded = tenant_manifest_info
.manifest
.offloaded_timelines
.iter()
.find(|offloaded_timeline| offloaded_timeline.timeline_id == ttid.timeline_id);
if let Some(offloaded) = maybe_offloaded {
let warnings = validate_index_part_with_offloaded(index_part, offloaded);
let warn = if warnings.is_empty() {
false
} else {
// Verify that the manifest hasn't changed. If it has, a potential racing change could have been cause for our troubles.
match list_tenant_manifests(remote_client, ttid.tenant_shard_id, target).await? {
ListTenantManifestResult::WithErrors {
errors,
unknown_keys: _,
} => {
for (_key, error) in errors {
tracing::warn!(%ttid, "list_tenant_manifests in gc_timeline: {error}");
}
true
}
ListTenantManifestResult::NoErrors {
latest_generation,
manifests: _,
} => {
if let Some(new_latest_gen) = latest_generation {
let manifest_changed = (
new_latest_gen.generation,
new_latest_gen.listing_object.last_modified,
) == (
tenant_manifest_info.generation,
tenant_manifest_info.listing_object.last_modified,
);
if manifest_changed {
tracing::debug!(%ttid, "tenant manifest changed since it was loaded, suppressing {} warnings", warnings.len());
}
manifest_changed
} else {
// The latest generation is gone. This timeline is in the progress of being deleted?
false
}
}
}
};
if warn {
for warning in warnings {
tracing::warn!(%ttid, "{}", warning);
}
}
}
}
accumulator.lock().unwrap().update(ttid, index_part);
for key in candidates {
@@ -608,6 +667,35 @@ async fn gc_timeline(
Ok(summary)
}
fn validate_index_part_with_offloaded(
index_part: &IndexPart,
offloaded: &OffloadedTimelineManifest,
) -> Vec<String> {
let mut warnings = Vec::new();
if let Some(archived_at_index_part) = index_part.archived_at {
if archived_at_index_part
.signed_duration_since(offloaded.archived_at)
.num_seconds()
!= 0
{
warnings.push(format!(
"index-part archived_at={} differs from manifest archived_at={}",
archived_at_index_part, offloaded.archived_at
));
}
} else {
warnings.push("Timeline offloaded in manifest but not archived in index-part".to_string());
}
if index_part.metadata.ancestor_timeline() != offloaded.ancestor_timeline_id {
warnings.push(format!(
"index-part anestor={:?} differs from manifest ancestor={:?}",
index_part.metadata.ancestor_timeline(),
offloaded.ancestor_timeline_id
));
}
warnings
}
/// Physical garbage collection: removing unused S3 objects.
///
/// This is distinct from the garbage collection done inside the pageserver, which operates at a higher level
@@ -650,29 +738,38 @@ pub async fn pageserver_physical_gc(
let target_ref = &target;
let remote_client_ref = &remote_client;
async move {
let summaries_from_manifests = match gc_tenant_manifests(
let gc_manifest_result = gc_tenant_manifests(
remote_client_ref,
min_age,
target_ref,
mode,
tenant_shard_id,
)
.await
{
Ok(gc_summary) => vec![Ok(GcSummaryOrContent::<TenantShardTimelineId>::GcSummary(
gc_summary,
))],
.await;
let (summary_from_manifest, tenant_manifest_opt) = match gc_manifest_result {
Ok((gc_summary, tenant_manifest)) => (gc_summary, tenant_manifest),
Err(e) => {
tracing::warn!(%tenant_shard_id, "Error in gc_tenant_manifests: {e}");
Vec::new()
(GcSummary::default(), None)
}
};
let tenant_manifest_arc = Arc::new(tenant_manifest_opt);
let summary_from_manifest = Ok(GcSummaryOrContent::<(_, _)>::GcSummary(
summary_from_manifest,
));
stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id)
.await
.map(|stream| {
stream
.map_ok(GcSummaryOrContent::Content)
.chain(futures::stream::iter(summaries_from_manifests.into_iter()))
.zip(futures::stream::iter(std::iter::repeat(
tenant_manifest_arc,
)))
.map(|(ttid_res, tenant_manifest_arc)| {
ttid_res.map(move |ttid| {
GcSummaryOrContent::Content((ttid, tenant_manifest_arc))
})
})
.chain(futures::stream::iter([summary_from_manifest].into_iter()))
})
}
});
@@ -684,14 +781,17 @@ pub async fn pageserver_physical_gc(
// Drain futures for per-shard GC, populating accumulator as a side effect
{
let timelines = timelines.map_ok(|summary_or_ttid| match summary_or_ttid {
GcSummaryOrContent::Content(ttid) => futures::future::Either::Left(gc_timeline(
&remote_client,
&min_age,
&target,
mode,
ttid,
&accumulator,
)),
GcSummaryOrContent::Content((ttid, tenant_manifest_arc)) => {
futures::future::Either::Left(gc_timeline(
&remote_client,
&min_age,
&target,
mode,
ttid,
&accumulator,
tenant_manifest_arc,
))
}
GcSummaryOrContent::GcSummary(gc_summary) => {
futures::future::Either::Right(futures::future::ok(gc_summary))
}

View File

@@ -21,8 +21,12 @@ pub struct MetadataSummary {
tenant_count: usize,
timeline_count: usize,
timeline_shard_count: usize,
with_errors: HashSet<TenantShardTimelineId>,
with_warnings: HashSet<TenantShardTimelineId>,
/// Tenant-shard timeline (key) mapping to errors. The key has to be a string because it will be serialized to a JSON.
/// The key is generated using `TenantShardTimelineId::to_string()`.
with_errors: HashMap<String, Vec<String>>,
/// Tenant-shard timeline (key) mapping to warnings. The key has to be a string because it will be serialized to a JSON.
/// The key is generated using `TenantShardTimelineId::to_string()`.
with_warnings: HashMap<String, Vec<String>>,
with_orphans: HashSet<TenantShardTimelineId>,
indices_by_version: HashMap<usize, usize>,
@@ -52,7 +56,12 @@ impl MetadataSummary {
}
}
fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) {
fn update_analysis(
&mut self,
id: &TenantShardTimelineId,
analysis: &TimelineAnalysis,
verbose: bool,
) {
if analysis.is_healthy() {
self.healthy_tenant_shards.insert(id.tenant_shard_id);
} else {
@@ -61,11 +70,17 @@ impl MetadataSummary {
}
if !analysis.errors.is_empty() {
self.with_errors.insert(*id);
let entry = self.with_errors.entry(id.to_string()).or_default();
if verbose {
entry.extend(analysis.errors.iter().cloned());
}
}
if !analysis.warnings.is_empty() {
self.with_warnings.insert(*id);
let entry = self.with_warnings.entry(id.to_string()).or_default();
if verbose {
entry.extend(analysis.warnings.iter().cloned());
}
}
}
@@ -120,6 +135,7 @@ Index versions: {version_summary}
pub async fn scan_pageserver_metadata(
bucket_config: BucketConfig,
tenant_ids: Vec<TenantShardId>,
verbose: bool,
) -> anyhow::Result<MetadataSummary> {
let (remote_client, target) = init_remote(bucket_config, NodeKind::Pageserver).await?;
@@ -164,6 +180,7 @@ pub async fn scan_pageserver_metadata(
mut tenant_objects: TenantObjectListing,
timelines: Vec<(TenantShardTimelineId, RemoteTimelineBlobData)>,
highest_shard_count: ShardCount,
verbose: bool,
) {
summary.tenant_count += 1;
@@ -203,7 +220,7 @@ pub async fn scan_pageserver_metadata(
Some(data),
)
.await;
summary.update_analysis(&ttid, &analysis);
summary.update_analysis(&ttid, &analysis, verbose);
timeline_ids.insert(ttid.timeline_id);
} else {
@@ -271,10 +288,6 @@ pub async fn scan_pageserver_metadata(
summary.update_data(&data);
match tenant_id {
None => {
tenant_id = Some(ttid.tenant_shard_id.tenant_id);
highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
}
Some(prev_tenant_id) => {
if prev_tenant_id != ttid.tenant_shard_id.tenant_id {
// New tenant: analyze this tenant's timelines, clear accumulated tenant_timeline_results
@@ -287,6 +300,7 @@ pub async fn scan_pageserver_metadata(
tenant_objects,
timelines,
highest_shard_count,
verbose,
)
.instrument(info_span!("analyze-tenant", tenant = %prev_tenant_id))
.await;
@@ -296,6 +310,10 @@ pub async fn scan_pageserver_metadata(
highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
}
}
None => {
tenant_id = Some(ttid.tenant_shard_id.tenant_id);
highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
}
}
match &data.blob_data {
@@ -326,6 +344,7 @@ pub async fn scan_pageserver_metadata(
tenant_objects,
tenant_timeline_results,
highest_shard_count,
verbose,
)
.instrument(info_span!("analyze-tenant", tenant = %tenant_id))
.await;

View File

@@ -0,0 +1,21 @@
# How to run the `pg_regress` tests on a cloud Neon instance.
* Create a Neon project on staging.
* Grant the superuser privileges to the DB user.
* (Optional) create a branch for testing
* Configure the endpoint by updating the control-plane database with the following settings:
* `Timeone`: `America/Los_Angeles`
* `DateStyle`: `Postgres,MDY`
* `compute_query_id`: `off`
* Checkout the actual `Neon` sources
* Patch the sql and expected files for the specific PostgreSQL version, e.g. for v17:
```bash
$ cd vendor/postgres-v17
$ patch -p1 <../../compute/patches/cloud_regress_pg17.patch
```
* Set the environment variable `BENCHMARK_CONNSTR` to the connection URI of your project.
* Set the environment variable `PG_VERSION` to the version of your project.
* Run
```bash
$ pytest -m remote_cluster -k cloud_regress
```

View File

@@ -5,68 +5,15 @@ Run the regression tests on the cloud instance of Neon
from __future__ import annotations
from pathlib import Path
from typing import Any
import psycopg2
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import RemotePostgres
from fixtures.pg_version import PgVersion
@pytest.fixture
def setup(remote_pg: RemotePostgres):
"""
Setup and teardown of the tests
"""
with psycopg2.connect(remote_pg.connstr()) as conn:
with conn.cursor() as cur:
log.info("Creating the extension")
cur.execute("CREATE EXTENSION IF NOT EXISTS regress_so")
conn.commit()
# TODO: Migrate to branches and remove this code
log.info("Looking for subscriptions in the regress database")
cur.execute(
"SELECT subname FROM pg_catalog.pg_subscription WHERE "
"subdbid = (SELECT oid FROM pg_catalog.pg_database WHERE datname='regression');"
)
if cur.rowcount > 0:
with psycopg2.connect(
dbname="regression",
host=remote_pg.default_options["host"],
user=remote_pg.default_options["user"],
password=remote_pg.default_options["password"],
) as regress_conn:
with regress_conn.cursor() as regress_cur:
for sub in cur:
regress_cur.execute(f"ALTER SUBSCRIPTION {sub[0]} DISABLE")
regress_cur.execute(
f"ALTER SUBSCRIPTION {sub[0]} SET (slot_name = NONE)"
)
regress_cur.execute(f"DROP SUBSCRIPTION {sub[0]}")
regress_conn.commit()
yield
# TODO: Migrate to branches and remove this code
log.info("Looking for extra roles...")
with psycopg2.connect(remote_pg.connstr()) as conn:
with conn.cursor() as cur:
cur.execute(
"SELECT rolname FROM pg_catalog.pg_roles WHERE oid > 16384 AND rolname <> 'neondb_owner'"
)
roles: list[Any] = []
for role in cur:
log.info("Role found: %s", role[0])
roles.append(role[0])
for role in roles:
cur.execute(f"DROP ROLE {role}")
conn.commit()
@pytest.mark.timeout(7200)
@pytest.mark.remote_cluster
def test_cloud_regress(
setup,
remote_pg: RemotePostgres,
pg_version: PgVersion,
pg_distrib_dir: Path,

View File

@@ -7,24 +7,25 @@ from pytest_httpserver import HTTPServer
if TYPE_CHECKING:
from collections.abc import Iterator
from ssl import SSLContext
from fixtures.port_distributor import PortDistributor
# TODO: mypy fails with:
# Module "fixtures.neon_fixtures" does not explicitly export attribute "PortDistributor" [attr-defined]
# from fixtures.neon_fixtures import PortDistributor
ListenAddress = tuple[str, int]
# compared to the fixtures from pytest_httpserver with same names, these are
# always function scoped, so you can check and stop the server in tests.
@pytest.fixture(scope="function")
def httpserver_ssl_context():
return None
def httpserver_ssl_context() -> Iterator[SSLContext | None]:
yield None
@pytest.fixture(scope="function")
def make_httpserver(httpserver_listen_address, httpserver_ssl_context) -> Iterator[HTTPServer]:
def make_httpserver(
httpserver_listen_address: ListenAddress, httpserver_ssl_context: SSLContext | None
) -> Iterator[HTTPServer]:
host, port = httpserver_listen_address
if not host:
host = HTTPServer.DEFAULT_LISTEN_HOST
@@ -47,6 +48,6 @@ def httpserver(make_httpserver: HTTPServer) -> Iterator[HTTPServer]:
@pytest.fixture(scope="function")
def httpserver_listen_address(port_distributor: PortDistributor) -> tuple[str, int]:
def httpserver_listen_address(port_distributor: PortDistributor) -> ListenAddress:
port = port_distributor.get_port()
return ("localhost", port)

View File

@@ -152,6 +152,8 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
"pageserver_resident_physical_size",
"pageserver_io_operations_bytes_total",
"pageserver_last_record_lsn",
"pageserver_disk_consistent_lsn",
"pageserver_projected_remote_consistent_lsn",
"pageserver_standby_horizon",
"pageserver_smgr_query_seconds_bucket",
"pageserver_smgr_query_seconds_count",
@@ -173,7 +175,10 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
counter("pageserver_tenant_throttling_count_accounted_finish"),
counter("pageserver_tenant_throttling_wait_usecs_sum"),
counter("pageserver_tenant_throttling_count"),
counter("pageserver_timeline_wal_records_received"),
counter("pageserver_page_service_pagestream_flush_in_progress_micros"),
*histogram("pageserver_page_service_batch_size"),
*histogram("pageserver_page_service_pagestream_batch_wait_time_seconds"),
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
# "pageserver_directory_entries_count", -- only used if above a certain threshold
# "pageserver_broken_tenants_count" -- used only for broken

View File

@@ -2329,6 +2329,16 @@ class NeonStorageController(MetricsGetter, LogUtils):
return None
raise e
def get_safekeepers(self) -> list[dict[str, Any]]:
response = self.request(
"GET",
f"{self.api}/control/v1/safekeeper",
headers=self.headers(TokenScope.ADMIN),
)
json = response.json()
assert isinstance(json, list)
return json
def set_preferred_azs(self, preferred_azs: dict[TenantShardId, str]) -> list[TenantShardId]:
response = self.request(
"PUT",
@@ -4556,6 +4566,7 @@ class StorageScrubber:
def __init__(self, env: NeonEnv, log_dir: Path):
self.env = env
self.log_dir = log_dir
self.allowed_errors: list[str] = []
def scrubber_cli(
self, args: list[str], timeout, extra_env: dict[str, str] | None = None
@@ -4633,19 +4644,70 @@ class StorageScrubber:
if timeline_lsns is not None:
args.append("--timeline-lsns")
args.append(json.dumps(timeline_lsns))
if node_kind == NodeKind.PAGESERVER:
args.append("--verbose")
stdout = self.scrubber_cli(args, timeout=30, extra_env=extra_env)
try:
summary = json.loads(stdout)
# summary does not contain "with_warnings" if node_kind is the safekeeper
no_warnings = "with_warnings" not in summary or not summary["with_warnings"]
healthy = not summary["with_errors"] and no_warnings
healthy = self._check_run_healthy(summary)
return healthy, summary
except:
log.error("Failed to decode JSON output from `scan-metadata`. Dumping stdout:")
log.error(stdout)
raise
def _check_line_allowed(self, line: str) -> bool:
for a in self.allowed_errors:
try:
if re.match(a, line):
return True
except re.error:
log.error(f"Invalid regex: '{a}'")
raise
return False
def _check_line_list_allowed(self, lines: list[str]) -> bool:
for line in lines:
if not self._check_line_allowed(line):
return False
return True
def _check_run_healthy(self, summary: dict[str, Any]) -> bool:
# summary does not contain "with_warnings" if node_kind is the safekeeper
healthy = True
with_warnings = summary.get("with_warnings", None)
if with_warnings is not None:
if isinstance(with_warnings, list):
if len(with_warnings) > 0:
# safekeeper scan_metadata output is a list of tenants
healthy = False
else:
for _, warnings in with_warnings.items():
assert (
len(warnings) > 0
), "with_warnings value should not be empty, running without verbose mode?"
if not self._check_line_list_allowed(warnings):
healthy = False
break
if not healthy:
return healthy
with_errors = summary.get("with_errors", None)
if with_errors is not None:
if isinstance(with_errors, list):
if len(with_errors) > 0:
# safekeeper scan_metadata output is a list of tenants
healthy = False
else:
for _, errors in with_errors.items():
assert (
len(errors) > 0
), "with_errors value should not be empty, running without verbose mode?"
if not self._check_line_list_allowed(errors):
healthy = False
break
return healthy
def tenant_snapshot(self, tenant_id: TenantId, output_path: Path):
stdout = self.scrubber_cli(
["tenant-snapshot", "--tenant-id", str(tenant_id), "--output-path", str(output_path)],

View File

@@ -488,7 +488,20 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
)
self.verbose_error(res)
def patch_tenant_config_client_side(
def patch_tenant_config(self, tenant_id: TenantId | TenantShardId, updates: dict[str, Any]):
"""
Only use this via storage_controller.pageserver_api().
See `set_tenant_config` for more information.
"""
assert "tenant_id" not in updates.keys()
res = self.patch(
f"http://localhost:{self.port}/v1/tenant/config",
json={**updates, "tenant_id": str(tenant_id)},
)
self.verbose_error(res)
def update_tenant_config(
self,
tenant_id: TenantId,
inserts: dict[str, Any] | None = None,
@@ -499,13 +512,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
See `set_tenant_config` for more information.
"""
current = self.tenant_config(tenant_id).tenant_specific_overrides
if inserts is not None:
current.update(inserts)
if removes is not None:
for key in removes:
del current[key]
self.set_tenant_config(tenant_id, current)
if inserts is None:
inserts = {}
if removes is None:
removes = []
patch = inserts | {remove: None for remove in removes}
self.patch_tenant_config(tenant_id, patch)
def tenant_size(self, tenant_id: TenantId | TenantShardId) -> int:
return self.tenant_size_and_modelinputs(tenant_id)[0]
@@ -850,6 +863,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
force_repartition=False,
force_image_layer_creation=False,
force_l0_compaction=False,
wait_until_flushed=True,
wait_until_uploaded=False,
compact: bool | None = None,
**kwargs,
@@ -862,6 +876,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
query["force_image_layer_creation"] = "true"
if force_l0_compaction:
query["force_l0_compaction"] = "true"
if not wait_until_flushed:
query["wait_until_flushed"] = "false"
if wait_until_uploaded:
query["wait_until_uploaded"] = "true"
@@ -869,7 +885,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
query["compact"] = "true" if compact else "false"
log.info(
f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}, wait_until_uploaded={wait_until_uploaded}"
f"Requesting checkpoint: tenant={tenant_id} timeline={timeline_id} wait_until_flushed={wait_until_flushed} wait_until_uploaded={wait_until_uploaded} compact={compact}"
)
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint",

View File

@@ -54,23 +54,15 @@ def wait_for_upload(
tenant: TenantId | TenantShardId,
timeline: TimelineId,
lsn: Lsn,
timeout=20,
):
"""waits for local timeline upload up to specified lsn"""
"""Waits for local timeline upload up to specified LSN"""
current_lsn = Lsn(0)
for i in range(20):
current_lsn = remote_consistent_lsn(pageserver_http, tenant, timeline)
if current_lsn >= lsn:
log.info("wait finished")
return
lr_lsn = last_record_lsn(pageserver_http, tenant, timeline)
log.info(
f"waiting for remote_consistent_lsn to reach {lsn}, now {current_lsn}, last_record_lsn={lr_lsn}, iteration {i + 1}"
)
time.sleep(1)
raise Exception(
f"timed out while waiting for {tenant}/{timeline} remote_consistent_lsn to reach {lsn}, was {current_lsn}"
)
def is_uploaded():
remote_lsn = remote_consistent_lsn(pageserver_http, tenant, timeline)
assert remote_lsn >= lsn, f"remote_consistent_lsn at {remote_lsn}"
wait_until(is_uploaded, name=f"upload to {lsn}", timeout=timeout)
def _tenant_in_expected_state(tenant_info: dict[str, Any], expected_state: str):

View File

@@ -0,0 +1,142 @@
from __future__ import annotations
import random
from concurrent.futures import ThreadPoolExecutor
import pytest
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
from fixtures.common_types import Lsn
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.utils import (
wait_for_last_record_lsn,
wait_for_upload,
wait_for_upload_queue_empty,
)
from fixtures.remote_storage import s3_storage
@pytest.mark.timeout(900)
@pytest.mark.parametrize("size", [8, 1024, 8192])
@pytest.mark.parametrize("s3", [True, False], ids=["s3", "local"])
@pytest.mark.parametrize("backpressure", [True, False], ids=["backpressure", "nobackpressure"])
@pytest.mark.parametrize("fsync", [True, False], ids=["fsync", "nofsync"])
def test_ingest_insert_bulk(
request: pytest.FixtureRequest,
neon_env_builder: NeonEnvBuilder,
zenbenchmark: NeonBenchmarker,
fsync: bool,
backpressure: bool,
s3: bool,
size: int,
):
"""
Benchmarks ingestion of 5 GB of sequential insert WAL. Measures ingestion and S3 upload
separately. Also does a Safekeeper→Pageserver re-ingestion to measure Pageserver ingestion in
isolation.
"""
CONCURRENCY = 1 # 1 is optimal without fsync or backpressure
VOLUME = 5 * 1024**3
rows = VOLUME // (size + 64) # +64 roughly accounts for per-row WAL overhead
neon_env_builder.safekeepers_enable_fsync = fsync
if s3:
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
# NB: don't use S3 for Safekeeper. It doesn't affect throughput (no backpressure), but it
# would compete with Pageserver for bandwidth.
# neon_env_builder.enable_safekeeper_remote_storage(s3_storage())
neon_env_builder.disable_scrub_on_exit() # immediate shutdown may leave stray layers
env = neon_env_builder.init_start()
endpoint = env.endpoints.create_start(
"main",
config_lines=[
f"fsync = {fsync}",
"max_replication_apply_lag = 0",
f"max_replication_flush_lag = {'10GB' if backpressure else '0'}",
# NB: neon_local defaults to 15MB, which is too slow -- production uses 500MB.
f"max_replication_write_lag = {'500MB' if backpressure else '0'}",
],
)
endpoint.safe_psql("create extension neon")
# Wait for the timeline to be propagated to the pageserver.
wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
# Ingest rows.
log.info("Ingesting data")
start_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
def insert_rows(endpoint, table, count, value):
with endpoint.connect().cursor() as cur:
cur.execute("set statement_timeout = 0")
cur.execute(f"create table {table} (id int, data bytea)")
cur.execute(f"insert into {table} values (generate_series(1, {count}), %s)", (value,))
with zenbenchmark.record_duration("upload"):
with zenbenchmark.record_duration("ingest"):
with ThreadPoolExecutor(max_workers=CONCURRENCY) as pool:
for i in range(CONCURRENCY):
# Write a random value for all rows. This is sufficient to prevent compression,
# e.g. in TOAST. Randomly generating every row is too slow.
value = random.randbytes(size)
worker_rows = rows / CONCURRENCY
pool.submit(insert_rows, endpoint, f"table{i}", worker_rows, value)
end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
# Wait for pageserver to ingest the WAL.
client = env.pageserver.http_client()
wait_for_last_record_lsn(client, env.initial_tenant, env.initial_timeline, end_lsn)
# Wait for pageserver S3 upload. Checkpoint to flush the last in-memory layer.
client.timeline_checkpoint(
env.initial_tenant,
env.initial_timeline,
compact=False,
wait_until_flushed=False,
)
wait_for_upload(client, env.initial_tenant, env.initial_timeline, end_lsn, timeout=600)
# Empty out upload queue for next benchmark.
wait_for_upload_queue_empty(client, env.initial_tenant, env.initial_timeline)
backpressure_time = endpoint.safe_psql("select backpressure_throttling_time()")[0][0]
# Now that all data is ingested, delete and recreate the tenant in the pageserver. This will
# reingest all the WAL directly from the safekeeper. This gives us a baseline of how fast the
# pageserver can ingest this WAL in isolation.
status = env.storage_controller.inspect(tenant_shard_id=env.initial_tenant)
assert status is not None
endpoint.stop() # avoid spurious getpage errors
client.tenant_delete(env.initial_tenant)
env.pageserver.tenant_create(tenant_id=env.initial_tenant, generation=status[0])
with zenbenchmark.record_duration("recover"):
log.info("Recovering WAL into pageserver")
client.timeline_create(env.pg_version, env.initial_tenant, env.initial_timeline)
wait_for_last_record_lsn(client, env.initial_tenant, env.initial_timeline, end_lsn)
# Emit metrics.
wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024))
zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM)
zenbenchmark.record("row_count", rows, "rows", MetricReport.TEST_PARAM)
zenbenchmark.record("concurrency", CONCURRENCY, "clients", MetricReport.TEST_PARAM)
zenbenchmark.record(
"backpressure_time", backpressure_time // 1000, "ms", MetricReport.LOWER_IS_BETTER
)
props = {p["name"]: p["value"] for _, p in request.node.user_properties}
for name in ("ingest", "upload", "recover"):
throughput = int(wal_written_mb / props[name])
zenbenchmark.record(f"{name}_throughput", throughput, "MB/s", MetricReport.HIGHER_IS_BETTER)
# Pageserver shutdown will likely get stuck on the upload queue, just shut it down immediately.
env.stop(immediate=True)

View File

@@ -15,7 +15,7 @@ from fixtures.pageserver.http import PageserverApiException
from fixtures.utils import skip_in_debug_build, wait_until
from fixtures.workload import Workload
AGGRESIVE_COMPACTION_TENANT_CONF = {
AGGRESSIVE_COMPACTION_TENANT_CONF = {
# Disable gc and compaction. The test runs compaction manually.
"gc_period": "0s",
"compaction_period": "0s",
@@ -24,6 +24,7 @@ AGGRESIVE_COMPACTION_TENANT_CONF = {
# Compact small layers
"compaction_target_size": 1024**2,
"image_creation_threshold": 2,
# "lsn_lease_length": "0s", -- TODO: would cause branch creation errors, should fix later
}
@@ -51,7 +52,7 @@ def test_pageserver_compaction_smoke(
page_cache_size=10
"""
env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF)
env = neon_env_builder.init_start(initial_tenant_conf=AGGRESSIVE_COMPACTION_TENANT_CONF)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
@@ -120,14 +121,25 @@ page_cache_size=10
assert vectored_average < 8
@skip_in_debug_build("only run with release build")
def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF)
SMOKE_CONF = {
# Run both gc and gc-compaction.
"gc_period": "5s",
"compaction_period": "5s",
# No PiTR interval and small GC horizon
"pitr_interval": "0s",
"gc_horizon": f"{1024 ** 2}",
"lsn_lease_length": "0s",
}
env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
row_count = 1000
churn_rounds = 10
row_count = 10000
churn_rounds = 50
ps_http = env.pageserver.http_client()
@@ -141,23 +153,35 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
if i % 10 == 0:
log.info(f"Running churn round {i}/{churn_rounds} ...")
# Run gc-compaction every 10 rounds to ensure the test doesn't take too long time.
ps_http.timeline_compact(
tenant_id,
timeline_id,
enhanced_gc_bottom_most_compaction=True,
body={
"scheduled": True,
"sub_compaction": True,
"compact_range": {
"start": "000000000000000000000000000000000000",
"end": "030000000000000000000000000000000000",
},
},
)
workload.churn_rows(row_count, env.pageserver.id)
# Force L0 compaction to ensure the number of layers is within bounds, so that gc-compaction can run.
ps_http.timeline_compact(tenant_id, timeline_id, force_l0_compaction=True)
assert ps_http.perf_info(tenant_id, timeline_id)[0]["num_of_l0"] <= 1
ps_http.timeline_compact(
tenant_id,
timeline_id,
enhanced_gc_bottom_most_compaction=True,
body={
"start": "000000000000000000000000000000000000",
"end": "030000000000000000000000000000000000",
},
)
# ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked)
env.pageserver.assert_log_contains(
"scheduled_compact_timeline.*picked .* layers for compaction"
)
log.info("Validating at workload end ...")
workload.validate(env.pageserver.id)
# Run a legacy compaction+gc to ensure gc-compaction can coexist with legacy compaction.
ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)
ps_http.timeline_gc(tenant_id, timeline_id, None)
# Stripe sizes in number of pages.
TINY_STRIPES = 16

View File

@@ -215,7 +215,7 @@ if SQL_EXPORTER is None:
#
# The "host" network mode allows sql_exporter to talk to the
# endpoint which is running on the host.
super().__init__("docker.io/burningalchemist/sql_exporter:0.13.1", network_mode="host")
super().__init__("docker.io/burningalchemist/sql_exporter:0.16.0", network_mode="host")
self.__logs_dir = logs_dir
self.__port = port

View File

@@ -15,6 +15,8 @@ from werkzeug.wrappers.response import Response
if TYPE_CHECKING:
from typing import Any, Self
from fixtures.httpserver import ListenAddress
def handle_db(dbs, roles, operation):
if operation["op"] == "set":
@@ -120,7 +122,7 @@ class DdlForwardingContext:
@pytest.fixture(scope="function")
def ddl(
httpserver: HTTPServer, vanilla_pg: VanillaPostgres, httpserver_listen_address: tuple[str, int]
httpserver: HTTPServer, vanilla_pg: VanillaPostgres, httpserver_listen_address: ListenAddress
):
(host, port) = httpserver_listen_address
with DdlForwardingContext(httpserver, vanilla_pg, host, port) as ddl:

View File

@@ -460,10 +460,10 @@ def test_pageserver_respects_overridden_resident_size(
assert (
du_by_timeline[large_tenant] > min_resident_size
), "ensure the larger tenant will get a haircut"
env.neon_env.storage_controller.pageserver_api().patch_tenant_config_client_side(
env.neon_env.storage_controller.pageserver_api().update_tenant_config(
small_tenant[0], {"min_resident_size_override": min_resident_size}
)
env.neon_env.storage_controller.pageserver_api().patch_tenant_config_client_side(
env.neon_env.storage_controller.pageserver_api().update_tenant_config(
large_tenant[0], {"min_resident_size_override": min_resident_size}
)

Some files were not shown because too many files have changed in this diff Show More