mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-25 00:50:36 +00:00
Compare commits
2 Commits
allow_even
...
sk-bump-te
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
48e4a93cc5 | ||
|
|
2c4720d0da |
@@ -23,30 +23,10 @@ platforms = [
|
||||
]
|
||||
|
||||
[final-excludes]
|
||||
workspace-members = [
|
||||
# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
|
||||
# it is built primarily in a separate repo, neondatabase/autoscaling, and thus is excluded
|
||||
# from depending on workspace-hack because most of the dependencies are not used.
|
||||
"vm_monitor",
|
||||
# All of these exist in libs and are not usually built independently.
|
||||
# Putting workspace hack there adds a bottleneck for cargo builds.
|
||||
"compute_api",
|
||||
"consumption_metrics",
|
||||
"desim",
|
||||
"metrics",
|
||||
"pageserver_api",
|
||||
"postgres_backend",
|
||||
"postgres_connection",
|
||||
"postgres_ffi",
|
||||
"pq_proto",
|
||||
"remote_storage",
|
||||
"safekeeper_api",
|
||||
"tenant_size_model",
|
||||
"tracing-utils",
|
||||
"utils",
|
||||
"wal_craft",
|
||||
"walproposer",
|
||||
]
|
||||
# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
|
||||
# it is built primarily in a separate repo, neondatabase/autoscaling, and thus is excluded
|
||||
# from depending on workspace-hack because most of the dependencies are not used.
|
||||
workspace-members = ["vm_monitor"]
|
||||
|
||||
# Write out exact versions rather than a semver range. (Defaults to false.)
|
||||
# exact-versions = true
|
||||
|
||||
@@ -43,7 +43,7 @@ inputs:
|
||||
pg_version:
|
||||
description: 'Postgres version to use for tests'
|
||||
required: false
|
||||
default: 'v16'
|
||||
default: 'v14'
|
||||
benchmark_durations:
|
||||
description: 'benchmark durations JSON'
|
||||
required: false
|
||||
@@ -169,8 +169,10 @@ runs:
|
||||
EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
|
||||
fi
|
||||
|
||||
if [[ $BUILD_TYPE == "debug" && $RUNNER_ARCH == 'X64' ]]; then
|
||||
if [[ "${{ inputs.build_type }}" == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
||||
elif [[ "${{ inputs.build_type }}" == "release" ]]; then
|
||||
cov_prefix=()
|
||||
else
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
@@ -48,8 +48,6 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
|
||||
15
.github/workflows/_build-and-test-locally.yml
vendored
15
.github/workflows/_build-and-test-locally.yml
vendored
@@ -94,16 +94,11 @@ jobs:
|
||||
# We run tests with additional features that are turned off by default (e.g. in release builds), see
|
||||
# corresponding Cargo.toml files for their descriptions.
|
||||
- name: Set env variables
|
||||
env:
|
||||
ARCH: ${{ inputs.arch }}
|
||||
run: |
|
||||
CARGO_FEATURES="--features testing"
|
||||
if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
|
||||
CARGO_FLAGS="--locked"
|
||||
elif [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=""
|
||||
CARGO_FLAGS="--locked"
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=""
|
||||
CARGO_FLAGS="--locked --release"
|
||||
@@ -163,8 +158,6 @@ jobs:
|
||||
# Do install *before* running rust tests because they might recompile the
|
||||
# binaries with different features/flags.
|
||||
- name: Install rust binaries
|
||||
env:
|
||||
ARCH: ${{ inputs.arch }}
|
||||
run: |
|
||||
# Install target binaries
|
||||
mkdir -p /tmp/neon/bin/
|
||||
@@ -179,7 +172,7 @@ jobs:
|
||||
done
|
||||
|
||||
# Install test executables and write list of all binaries (for code coverage)
|
||||
if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
# Keep bloated coverage data files away from the rest of the artifact
|
||||
mkdir -p /tmp/coverage/
|
||||
|
||||
@@ -250,8 +243,8 @@ jobs:
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
regress-tests:
|
||||
# Don't run regression tests on debug arm64 builds
|
||||
if: inputs.build-type != 'debug' || inputs.arch != 'arm64'
|
||||
# Run test on x64 only
|
||||
if: inputs.arch == 'x64'
|
||||
needs: [ build-neon ]
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
container:
|
||||
|
||||
13
.github/workflows/build_and_test.yml
vendored
13
.github/workflows/build_and_test.yml
vendored
@@ -198,7 +198,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
arch: [ x64, arm64 ]
|
||||
arch: [ x64 ]
|
||||
# Do not build or run tests in debug for release branches
|
||||
build-type: ${{ fromJson((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }}
|
||||
include:
|
||||
@@ -280,7 +280,6 @@ jobs:
|
||||
save_perf_report: ${{ github.ref_name == 'main' }}
|
||||
extra_params: --splits 5 --group ${{ matrix.pytest_split_group }}
|
||||
benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }}
|
||||
pg_version: v16
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
@@ -986,10 +985,10 @@ jobs:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
|
||||
gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
|
||||
-f deployPgSniRouter=false \
|
||||
-f deployProxy=false \
|
||||
-f deployStorage=true \
|
||||
@@ -999,14 +998,14 @@ jobs:
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
||||
-f deployPreprodRegion=true
|
||||
|
||||
gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \
|
||||
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
|
||||
-f deployStorage=true \
|
||||
-f deployStorageBroker=true \
|
||||
-f deployStorageController=true \
|
||||
-f branch=main \
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
|
||||
-f deployPgSniRouter=true \
|
||||
-f deployProxy=true \
|
||||
-f deployStorage=false \
|
||||
@@ -1016,7 +1015,7 @@ jobs:
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
||||
-f deployPreprodRegion=true
|
||||
|
||||
gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \
|
||||
gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
|
||||
-f deployPgSniRouter=true \
|
||||
-f deployProxy=true \
|
||||
-f branch=main \
|
||||
|
||||
27
Cargo.lock
generated
27
Cargo.lock
generated
@@ -1208,6 +1208,7 @@ dependencies = [
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1320,6 +1321,7 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_with",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1668,13 +1670,14 @@ dependencies = [
|
||||
"smallvec",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "diesel"
|
||||
version = "2.2.3"
|
||||
version = "2.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "65e13bab2796f412722112327f3e575601a3e9cdcbe426f0d30dbf43f3f5dc71"
|
||||
checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
|
||||
dependencies = [
|
||||
"bitflags 2.4.1",
|
||||
"byteorder",
|
||||
@@ -3144,6 +3147,7 @@ dependencies = [
|
||||
"rand 0.8.5",
|
||||
"rand_distr",
|
||||
"twox-hash",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3787,6 +3791,7 @@ dependencies = [
|
||||
"strum_macros",
|
||||
"thiserror",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4188,6 +4193,7 @@ dependencies = [
|
||||
"tokio-rustls 0.25.0",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4200,6 +4206,7 @@ dependencies = [
|
||||
"postgres",
|
||||
"tokio-postgres",
|
||||
"url",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4222,6 +4229,7 @@ dependencies = [
|
||||
"serde",
|
||||
"thiserror",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4259,6 +4267,7 @@ dependencies = [
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4823,6 +4832,7 @@ dependencies = [
|
||||
"toml_edit 0.19.10",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5347,6 +5357,7 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_with",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5590,12 +5601,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "serde_json"
|
||||
version = "1.0.125"
|
||||
version = "1.0.96"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed"
|
||||
checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1"
|
||||
dependencies = [
|
||||
"itoa",
|
||||
"memchr",
|
||||
"ryu",
|
||||
"serde",
|
||||
]
|
||||
@@ -6183,6 +6193,7 @@ dependencies = [
|
||||
"anyhow",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6783,6 +6794,7 @@ dependencies = [
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7000,6 +7012,7 @@ dependencies = [
|
||||
"url",
|
||||
"uuid",
|
||||
"walkdir",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7078,6 +7091,7 @@ dependencies = [
|
||||
"postgres_ffi",
|
||||
"regex",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7098,6 +7112,7 @@ dependencies = [
|
||||
"bindgen",
|
||||
"postgres_ffi",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7654,6 +7669,8 @@ dependencies = [
|
||||
"tokio",
|
||||
"tokio-rustls 0.24.0",
|
||||
"tokio-util",
|
||||
"toml_datetime",
|
||||
"toml_edit 0.19.10",
|
||||
"tonic",
|
||||
"tower",
|
||||
"tracing",
|
||||
|
||||
@@ -126,7 +126,7 @@ make -j`sysctl -n hw.logicalcpu` -s
|
||||
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.
|
||||
|
||||
To run the integration tests or Python scripts (not required to use the code), install
|
||||
Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory.
|
||||
Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.
|
||||
|
||||
|
||||
#### Running neon database
|
||||
@@ -262,7 +262,7 @@ By default, this runs both debug and release modes, and all supported postgres v
|
||||
testing locally, it is convenient to run just one set of permutations, like this:
|
||||
|
||||
```sh
|
||||
DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest
|
||||
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
|
||||
```
|
||||
|
||||
## Flamegraphs
|
||||
|
||||
@@ -54,7 +54,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
|
||||
const DEFAULT_BRANCH_NAME: &str = "main";
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
const DEFAULT_PG_VERSION: &str = "16";
|
||||
const DEFAULT_PG_VERSION: &str = "15";
|
||||
|
||||
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
|
||||
|
||||
|
||||
@@ -27,7 +27,7 @@ use crate::pageserver::PageServerNode;
|
||||
use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
|
||||
use crate::safekeeper::SafekeeperNode;
|
||||
|
||||
pub const DEFAULT_PG_VERSION: u32 = 16;
|
||||
pub const DEFAULT_PG_VERSION: u32 = 15;
|
||||
|
||||
//
|
||||
// This data structure represents the neon_local CLI config
|
||||
|
||||
@@ -217,7 +217,7 @@ impl StorageController {
|
||||
Ok(exitcode.success())
|
||||
}
|
||||
|
||||
/// Create our database if it doesn't exist
|
||||
/// Create our database if it doesn't exist, and run migrations.
|
||||
///
|
||||
/// This function is equivalent to the `diesel setup` command in the diesel CLI. We implement
|
||||
/// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
|
||||
@@ -382,6 +382,7 @@ impl StorageController {
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Run migrations on every startup, in case something changed.
|
||||
self.setup_database(postgres_port).await?;
|
||||
}
|
||||
|
||||
@@ -453,11 +454,6 @@ impl StorageController {
|
||||
let jwt_token =
|
||||
encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
|
||||
args.push(format!("--jwt-token={jwt_token}"));
|
||||
|
||||
let peer_claims = Claims::new(None, Scope::Admin);
|
||||
let peer_jwt_token = encode_from_key_file(&peer_claims, private_key)
|
||||
.expect("failed to generate jwt token");
|
||||
args.push(format!("--peer-jwt-token={peer_jwt_token}"));
|
||||
}
|
||||
|
||||
if let Some(public_key) = &self.public_key {
|
||||
|
||||
@@ -147,9 +147,9 @@ enum Command {
|
||||
#[arg(long)]
|
||||
threshold: humantime::Duration,
|
||||
},
|
||||
// Migrate away from a set of specified pageservers by moving the primary attachments to pageservers
|
||||
// Drain a set of specified pageservers by moving the primary attachments to pageservers
|
||||
// outside of the specified set.
|
||||
BulkMigrate {
|
||||
Drain {
|
||||
// Set of pageserver node ids to drain.
|
||||
#[arg(long)]
|
||||
nodes: Vec<NodeId>,
|
||||
@@ -163,34 +163,6 @@ enum Command {
|
||||
#[arg(long)]
|
||||
dry_run: Option<bool>,
|
||||
},
|
||||
/// Start draining the specified pageserver.
|
||||
/// The drain is complete when the scheduling policy returns to active.
|
||||
StartDrain {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
},
|
||||
/// Cancel draining the specified pageserver and wait for `timeout`
|
||||
/// for the operation to be canceled. May be retried.
|
||||
CancelDrain {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
#[arg(long)]
|
||||
timeout: humantime::Duration,
|
||||
},
|
||||
/// Start filling the specified pageserver.
|
||||
/// The fill is complete when the scheduling policy returns to active.
|
||||
StartFill {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
},
|
||||
/// Cancel filling the specified pageserver and wait for `timeout`
|
||||
/// for the operation to be canceled. May be retried.
|
||||
CancelFill {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
#[arg(long)]
|
||||
timeout: humantime::Duration,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
@@ -277,34 +249,6 @@ impl FromStr for NodeAvailabilityArg {
|
||||
}
|
||||
}
|
||||
|
||||
async fn wait_for_scheduling_policy<F>(
|
||||
client: Client,
|
||||
node_id: NodeId,
|
||||
timeout: Duration,
|
||||
f: F,
|
||||
) -> anyhow::Result<NodeSchedulingPolicy>
|
||||
where
|
||||
F: Fn(NodeSchedulingPolicy) -> bool,
|
||||
{
|
||||
let waiter = tokio::time::timeout(timeout, async move {
|
||||
loop {
|
||||
let node = client
|
||||
.dispatch::<(), NodeDescribeResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/node/{node_id}"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
if f(node.scheduling) {
|
||||
return Ok::<NodeSchedulingPolicy, mgmt_api::Error>(node.scheduling);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
Ok(waiter.await??)
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let cli = Cli::parse();
|
||||
@@ -684,7 +628,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
Command::BulkMigrate {
|
||||
Command::Drain {
|
||||
nodes,
|
||||
concurrency,
|
||||
max_shards,
|
||||
@@ -713,7 +657,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
|
||||
if nodes.len() != node_to_drain_descs.len() {
|
||||
anyhow::bail!("Bulk migration requested away from node which doesn't exist.")
|
||||
anyhow::bail!("Drain requested for node which doesn't exist.")
|
||||
}
|
||||
|
||||
node_to_fill_descs.retain(|desc| {
|
||||
@@ -725,7 +669,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
});
|
||||
|
||||
if node_to_fill_descs.is_empty() {
|
||||
anyhow::bail!("There are no nodes to migrate to")
|
||||
anyhow::bail!("There are no nodes to drain to")
|
||||
}
|
||||
|
||||
// Set the node scheduling policy to draining for the nodes which
|
||||
@@ -746,7 +690,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
.await?;
|
||||
}
|
||||
|
||||
// Perform the migration: move each tenant shard scheduled on a node to
|
||||
// Perform the drain: move each tenant shard scheduled on a node to
|
||||
// be drained to a node which is being filled. A simple round robin
|
||||
// strategy is used to pick the new node.
|
||||
let tenants = storcon_client
|
||||
@@ -759,13 +703,13 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let mut selected_node_idx = 0;
|
||||
|
||||
struct MigrationMove {
|
||||
struct DrainMove {
|
||||
tenant_shard_id: TenantShardId,
|
||||
from: NodeId,
|
||||
to: NodeId,
|
||||
}
|
||||
|
||||
let mut moves: Vec<MigrationMove> = Vec::new();
|
||||
let mut moves: Vec<DrainMove> = Vec::new();
|
||||
|
||||
let shards = tenants
|
||||
.into_iter()
|
||||
@@ -795,7 +739,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
continue;
|
||||
}
|
||||
|
||||
moves.push(MigrationMove {
|
||||
moves.push(DrainMove {
|
||||
tenant_shard_id: shard.tenant_shard_id,
|
||||
from: shard
|
||||
.node_attached
|
||||
@@ -872,67 +816,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
failure
|
||||
);
|
||||
}
|
||||
Command::StartDrain { node_id } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
Method::PUT,
|
||||
format!("control/v1/node/{node_id}/drain"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
println!("Drain started for {node_id}");
|
||||
}
|
||||
Command::CancelDrain { node_id, timeout } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
Method::DELETE,
|
||||
format!("control/v1/node/{node_id}/drain"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
|
||||
|
||||
let final_policy =
|
||||
wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
|
||||
use NodeSchedulingPolicy::*;
|
||||
matches!(sched, Active | PauseForRestart)
|
||||
})
|
||||
.await?;
|
||||
|
||||
println!(
|
||||
"Drain was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
|
||||
);
|
||||
}
|
||||
Command::StartFill { node_id } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(Method::PUT, format!("control/v1/node/{node_id}/fill"), None)
|
||||
.await?;
|
||||
|
||||
println!("Fill started for {node_id}");
|
||||
}
|
||||
Command::CancelFill { node_id, timeout } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
Method::DELETE,
|
||||
format!("control/v1/node/{node_id}/fill"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
|
||||
|
||||
let final_policy =
|
||||
wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
|
||||
use NodeSchedulingPolicy::*;
|
||||
matches!(sched, Active)
|
||||
})
|
||||
.await?;
|
||||
|
||||
println!(
|
||||
"Fill was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -14,7 +14,7 @@ picked tenant (which requested on-demand activation) for around 30 seconds
|
||||
during the restart at 2024-04-03 16:37 UTC.
|
||||
|
||||
Note that lots of shutdowns on loaded pageservers do not finish within the
|
||||
[10 second systemd enforced timeout](https://github.com/neondatabase/infra/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
|
||||
[10 second systemd enforced timeout](https://github.com/neondatabase/aws/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
|
||||
and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse.
|
||||
|
||||
This problem is not yet very acutely felt in storage controller managed pageservers since
|
||||
|
||||
@@ -21,21 +21,30 @@ _Example: 15.4 is the new minor version to upgrade to from 15.3._
|
||||
1. Create a new branch based on the stable branch you are updating.
|
||||
|
||||
```shell
|
||||
git checkout -b my-branch-15 REL_15_STABLE_neon
|
||||
git checkout -b my-branch REL_15_STABLE_neon
|
||||
```
|
||||
|
||||
1. Find the upstream release tags you're looking for. They are of the form `REL_X_Y`.
|
||||
1. Tag the last commit on the stable branch you are updating.
|
||||
|
||||
1. Merge the upstream tag into the branch you created on the tag and resolve any conflicts.
|
||||
```shell
|
||||
git tag REL_15_3_neon
|
||||
```
|
||||
|
||||
1. Push the new tag to the Neon Postgres repository.
|
||||
|
||||
```shell
|
||||
git push origin REL_15_3_neon
|
||||
```
|
||||
|
||||
1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
|
||||
|
||||
1. Rebase the branch you created on the tag and resolve any conflicts.
|
||||
|
||||
```shell
|
||||
git fetch upstream REL_15_4
|
||||
git merge REL_15_4
|
||||
git rebase REL_15_4
|
||||
```
|
||||
|
||||
In the commit message of the merge commit, mention if there were
|
||||
any non-trivial conflicts or other issues.
|
||||
|
||||
1. Run the Postgres test suite to make sure our commits have not affected
|
||||
Postgres in a negative way.
|
||||
|
||||
@@ -48,7 +57,7 @@ Postgres in a negative way.
|
||||
1. Push your branch to the Neon Postgres repository.
|
||||
|
||||
```shell
|
||||
git push origin my-branch-15
|
||||
git push origin my-branch
|
||||
```
|
||||
|
||||
1. Clone the Neon repository if you have not done so already.
|
||||
@@ -65,7 +74,7 @@ branch.
|
||||
1. Update the Git submodule.
|
||||
|
||||
```shell
|
||||
git submodule set-branch --branch my-branch-15 vendor/postgres-v15
|
||||
git submodule set-branch --branch my-branch vendor/postgres-v15
|
||||
git submodule update --remote vendor/postgres-v15
|
||||
```
|
||||
|
||||
@@ -80,12 +89,14 @@ minor Postgres release.
|
||||
|
||||
1. Create a pull request, and wait for CI to go green.
|
||||
|
||||
1. Push the Postgres branches with the merge commits into the Neon Postgres repository.
|
||||
1. Force push the rebased Postgres branches into the Neon Postgres repository.
|
||||
|
||||
```shell
|
||||
git push origin my-branch-15:REL_15_STABLE_neon
|
||||
git push --force origin my-branch:REL_15_STABLE_neon
|
||||
```
|
||||
|
||||
It may require disabling various branch protections.
|
||||
|
||||
1. Update your Neon PR to point at the branches.
|
||||
|
||||
```shell
|
||||
|
||||
@@ -14,3 +14,5 @@ regex.workspace = true
|
||||
|
||||
utils = { path = "../utils" }
|
||||
remote_storage = { version = "0.1", path = "../remote_storage/" }
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -6,8 +6,10 @@ license = "Apache-2.0"
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
chrono = { workspace = true, features = ["serde"] }
|
||||
chrono.workspace = true
|
||||
rand.workspace = true
|
||||
serde.workspace = true
|
||||
serde_with.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -14,3 +14,5 @@ parking_lot.workspace = true
|
||||
hex.workspace = true
|
||||
scopeguard.workspace = true
|
||||
smallvec = { workspace = true, features = ["write"] }
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -12,6 +12,8 @@ chrono.workspace = true
|
||||
twox-hash.workspace = true
|
||||
measured.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[target.'cfg(target_os = "linux")'.dependencies]
|
||||
procfs.workspace = true
|
||||
measured-process.workspace = true
|
||||
|
||||
@@ -21,9 +21,11 @@ hex.workspace = true
|
||||
humantime.workspace = true
|
||||
thiserror.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
chrono = { workspace = true, features = ["serde"] }
|
||||
chrono.workspace = true
|
||||
itertools.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
bincode.workspace = true
|
||||
rand.workspace = true
|
||||
|
||||
@@ -8,7 +8,6 @@ use std::time::{Duration, Instant};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
use crate::models::PageserverUtilization;
|
||||
use crate::{
|
||||
models::{ShardParameters, TenantConfig},
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
@@ -141,11 +140,23 @@ pub struct TenantShardMigrateRequest {
|
||||
pub node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Clone, Debug)]
|
||||
/// Utilisation score indicating how good a candidate a pageserver
|
||||
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
|
||||
/// Lower values are better.
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
|
||||
pub struct UtilizationScore(pub u64);
|
||||
|
||||
impl UtilizationScore {
|
||||
pub fn worst() -> Self {
|
||||
UtilizationScore(u64::MAX)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Clone, Copy, Debug)]
|
||||
#[serde(into = "NodeAvailabilityWrapper")]
|
||||
pub enum NodeAvailability {
|
||||
// Normal, happy state
|
||||
Active(PageserverUtilization),
|
||||
Active(UtilizationScore),
|
||||
// Node is warming up, but we expect it to become available soon. Covers
|
||||
// the time span between the re-attach response being composed on the storage controller
|
||||
// and the first successful heartbeat after the processing of the re-attach response
|
||||
@@ -184,9 +195,7 @@ impl From<NodeAvailabilityWrapper> for NodeAvailability {
|
||||
match val {
|
||||
// Assume the worst utilisation score to begin with. It will later be updated by
|
||||
// the heartbeats.
|
||||
NodeAvailabilityWrapper::Active => {
|
||||
NodeAvailability::Active(PageserverUtilization::full())
|
||||
}
|
||||
NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
|
||||
NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()),
|
||||
NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
|
||||
}
|
||||
|
||||
@@ -348,7 +348,7 @@ impl AuxFilePolicy {
|
||||
|
||||
/// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
|
||||
pub fn default_tenant_config() -> Self {
|
||||
Self::V2
|
||||
Self::V1
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -38,7 +38,7 @@ pub struct PageserverUtilization {
|
||||
pub max_shard_count: u32,
|
||||
|
||||
/// Cached result of [`Self::score`]
|
||||
pub utilization_score: Option<u64>,
|
||||
pub utilization_score: u64,
|
||||
|
||||
/// When was this snapshot captured, pageserver local time.
|
||||
///
|
||||
@@ -50,8 +50,6 @@ fn unity_percent() -> Percent {
|
||||
Percent::new(0).unwrap()
|
||||
}
|
||||
|
||||
pub type RawScore = u64;
|
||||
|
||||
impl PageserverUtilization {
|
||||
const UTILIZATION_FULL: u64 = 1000000;
|
||||
|
||||
@@ -64,7 +62,7 @@ impl PageserverUtilization {
|
||||
/// - Negative values are forbidden
|
||||
/// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to
|
||||
/// layer eviction.
|
||||
pub fn score(&self) -> RawScore {
|
||||
pub fn score(&self) -> u64 {
|
||||
let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes)
|
||||
* self.disk_usable_pct.get() as u64)
|
||||
/ 100;
|
||||
@@ -76,30 +74,8 @@ impl PageserverUtilization {
|
||||
std::cmp::max(disk_utilization_score, shard_utilization_score)
|
||||
}
|
||||
|
||||
pub fn cached_score(&mut self) -> RawScore {
|
||||
match self.utilization_score {
|
||||
None => {
|
||||
let s = self.score();
|
||||
self.utilization_score = Some(s);
|
||||
s
|
||||
}
|
||||
Some(s) => s,
|
||||
}
|
||||
}
|
||||
|
||||
/// If a node is currently hosting more work than it can comfortably handle. This does not indicate that
|
||||
/// it will fail, but it is a strong signal that more work should not be added unless there is no alternative.
|
||||
pub fn is_overloaded(score: RawScore) -> bool {
|
||||
score >= Self::UTILIZATION_FULL
|
||||
}
|
||||
|
||||
pub fn adjust_shard_count_max(&mut self, shard_count: u32) {
|
||||
if self.shard_count < shard_count {
|
||||
self.shard_count = shard_count;
|
||||
|
||||
// Dirty cache: this will be calculated next time someone retrieves the score
|
||||
self.utilization_score = None;
|
||||
}
|
||||
pub fn refresh_score(&mut self) {
|
||||
self.utilization_score = self.score();
|
||||
}
|
||||
|
||||
/// A utilization structure that has a full utilization score: use this as a placeholder when
|
||||
@@ -112,38 +88,7 @@ impl PageserverUtilization {
|
||||
disk_usable_pct: Percent::new(100).unwrap(),
|
||||
shard_count: 1,
|
||||
max_shard_count: 1,
|
||||
utilization_score: Some(Self::UTILIZATION_FULL),
|
||||
captured_at: serde_system_time::SystemTime(SystemTime::now()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Test helper
|
||||
pub mod test_utilization {
|
||||
use super::PageserverUtilization;
|
||||
use std::time::SystemTime;
|
||||
use utils::{
|
||||
serde_percent::Percent,
|
||||
serde_system_time::{self},
|
||||
};
|
||||
|
||||
// Parameters of the imaginary node used for test utilization instances
|
||||
const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024;
|
||||
const TEST_SHARDS_MAX: u32 = 1000;
|
||||
|
||||
/// Unit test helper. Unconditionally compiled because cfg(test) doesn't carry across crates. Do
|
||||
/// not abuse this function from non-test code.
|
||||
///
|
||||
/// Emulates a node with a 1000 shard limit and a 1TB disk.
|
||||
pub fn simple(shard_count: u32, disk_wanted_bytes: u64) -> PageserverUtilization {
|
||||
PageserverUtilization {
|
||||
disk_usage_bytes: disk_wanted_bytes,
|
||||
free_space_bytes: TEST_DISK_SIZE - std::cmp::min(disk_wanted_bytes, TEST_DISK_SIZE),
|
||||
disk_wanted_bytes,
|
||||
disk_usable_pct: Percent::new(100).unwrap(),
|
||||
shard_count,
|
||||
max_shard_count: TEST_SHARDS_MAX,
|
||||
utilization_score: None,
|
||||
utilization_score: Self::UTILIZATION_FULL,
|
||||
captured_at: serde_system_time::SystemTime(SystemTime::now()),
|
||||
}
|
||||
}
|
||||
@@ -175,7 +120,7 @@ mod tests {
|
||||
disk_usage_bytes: u64::MAX,
|
||||
free_space_bytes: 0,
|
||||
disk_wanted_bytes: u64::MAX,
|
||||
utilization_score: Some(13),
|
||||
utilization_score: 13,
|
||||
disk_usable_pct: Percent::new(90).unwrap(),
|
||||
shard_count: 100,
|
||||
max_shard_count: 200,
|
||||
|
||||
@@ -18,6 +18,7 @@ tokio-rustls.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
pq_proto.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
once_cell.workspace = true
|
||||
|
||||
@@ -11,5 +11,7 @@ postgres.workspace = true
|
||||
tokio-postgres.workspace = true
|
||||
url.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
once_cell.workspace = true
|
||||
|
||||
@@ -19,6 +19,8 @@ thiserror.workspace = true
|
||||
serde.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
env_logger.workspace = true
|
||||
postgres.workspace = true
|
||||
|
||||
@@ -14,6 +14,8 @@ postgres.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
camino-tempfile.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
regex.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
@@ -11,7 +11,9 @@ itertools.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
rand.workspace = true
|
||||
tokio = { workspace = true, features = ["io-util"] }
|
||||
tokio.workspace = true
|
||||
tracing.workspace = true
|
||||
thiserror.workspace = true
|
||||
serde.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -32,7 +32,7 @@ scopeguard.workspace = true
|
||||
metrics.workspace = true
|
||||
utils.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
azure_core.workspace = true
|
||||
azure_identity.workspace = true
|
||||
azure_storage.workspace = true
|
||||
@@ -46,4 +46,3 @@ sync_wrapper = { workspace = true, features = ["futures"] }
|
||||
camino-tempfile.workspace = true
|
||||
test-context.workspace = true
|
||||
rand.workspace = true
|
||||
tokio = { workspace = true, features = ["test-util"] }
|
||||
|
||||
@@ -383,48 +383,6 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
}
|
||||
}
|
||||
|
||||
async fn head_object(
|
||||
&self,
|
||||
key: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<ListingObject, DownloadError> {
|
||||
let kind = RequestKind::Head;
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(key));
|
||||
let properties_future = blob_client.get_properties().into_future();
|
||||
|
||||
let properties_future = tokio::time::timeout(self.timeout, properties_future);
|
||||
|
||||
let res = tokio::select! {
|
||||
res = properties_future => res,
|
||||
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
|
||||
};
|
||||
|
||||
if let Ok(inner) = &res {
|
||||
// do not incl. timeouts as errors in metrics but cancellations
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, inner, started_at);
|
||||
}
|
||||
|
||||
let data = match res {
|
||||
Ok(Ok(data)) => Ok(data),
|
||||
Ok(Err(sdk)) => Err(to_download_error(sdk)),
|
||||
Err(_timeout) => Err(DownloadError::Timeout),
|
||||
}?;
|
||||
|
||||
let properties = data.blob.properties;
|
||||
Ok(ListingObject {
|
||||
key: key.to_owned(),
|
||||
last_modified: SystemTime::from(properties.last_modified),
|
||||
size: properties.content_length,
|
||||
})
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
|
||||
@@ -150,7 +150,7 @@ pub enum ListingMode {
|
||||
NoDelimiter,
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, Debug, Clone)]
|
||||
#[derive(PartialEq, Eq, Debug)]
|
||||
pub struct ListingObject {
|
||||
pub key: RemotePath,
|
||||
pub last_modified: SystemTime,
|
||||
@@ -215,13 +215,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
Ok(combined)
|
||||
}
|
||||
|
||||
/// Obtain metadata information about an object.
|
||||
async fn head_object(
|
||||
&self,
|
||||
key: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<ListingObject, DownloadError>;
|
||||
|
||||
/// Streams the local file contents into the remote storage entry.
|
||||
///
|
||||
/// If the operation fails because of timeout or cancellation, the root cause of the error will be
|
||||
@@ -370,20 +363,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
}
|
||||
}
|
||||
|
||||
// See [`RemoteStorage::head_object`].
|
||||
pub async fn head_object(
|
||||
&self,
|
||||
key: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<ListingObject, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.head_object(key, cancel).await,
|
||||
Self::AwsS3(s) => s.head_object(key, cancel).await,
|
||||
Self::AzureBlob(s) => s.head_object(key, cancel).await,
|
||||
Self::Unreliable(s) => s.head_object(key, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`RemoteStorage::upload`]
|
||||
pub async fn upload(
|
||||
&self,
|
||||
@@ -619,7 +598,6 @@ impl ConcurrencyLimiter {
|
||||
RequestKind::Delete => &self.write,
|
||||
RequestKind::Copy => &self.write,
|
||||
RequestKind::TimeTravel => &self.write,
|
||||
RequestKind::Head => &self.read,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -445,20 +445,6 @@ impl RemoteStorage for LocalFs {
|
||||
}
|
||||
}
|
||||
|
||||
async fn head_object(
|
||||
&self,
|
||||
key: &RemotePath,
|
||||
_cancel: &CancellationToken,
|
||||
) -> Result<ListingObject, DownloadError> {
|
||||
let target_file_path = key.with_base(&self.storage_root);
|
||||
let metadata = file_metadata(&target_file_path).await?;
|
||||
Ok(ListingObject {
|
||||
key: key.clone(),
|
||||
last_modified: metadata.modified()?,
|
||||
size: metadata.len(),
|
||||
})
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
|
||||
|
||||
@@ -13,7 +13,6 @@ pub(crate) enum RequestKind {
|
||||
List = 3,
|
||||
Copy = 4,
|
||||
TimeTravel = 5,
|
||||
Head = 6,
|
||||
}
|
||||
|
||||
use scopeguard::ScopeGuard;
|
||||
@@ -28,7 +27,6 @@ impl RequestKind {
|
||||
List => "list_objects",
|
||||
Copy => "copy_object",
|
||||
TimeTravel => "time_travel_recover",
|
||||
Head => "head_object",
|
||||
}
|
||||
}
|
||||
const fn as_index(&self) -> usize {
|
||||
@@ -36,8 +34,7 @@ impl RequestKind {
|
||||
}
|
||||
}
|
||||
|
||||
const REQUEST_KIND_COUNT: usize = 7;
|
||||
pub(crate) struct RequestTyped<C>([C; REQUEST_KIND_COUNT]);
|
||||
pub(crate) struct RequestTyped<C>([C; 6]);
|
||||
|
||||
impl<C> RequestTyped<C> {
|
||||
pub(crate) fn get(&self, kind: RequestKind) -> &C {
|
||||
@@ -46,8 +43,8 @@ impl<C> RequestTyped<C> {
|
||||
|
||||
fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
|
||||
use RequestKind::*;
|
||||
let mut it = [Get, Put, Delete, List, Copy, TimeTravel, Head].into_iter();
|
||||
let arr = std::array::from_fn::<C, REQUEST_KIND_COUNT, _>(|index| {
|
||||
let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter();
|
||||
let arr = std::array::from_fn::<C, 6, _>(|index| {
|
||||
let next = it.next().unwrap();
|
||||
assert_eq!(index, next.as_index());
|
||||
f(next)
|
||||
|
||||
@@ -23,7 +23,7 @@ use aws_config::{
|
||||
use aws_sdk_s3::{
|
||||
config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
|
||||
error::SdkError,
|
||||
operation::{get_object::GetObjectError, head_object::HeadObjectError},
|
||||
operation::get_object::GetObjectError,
|
||||
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
|
||||
Client,
|
||||
};
|
||||
@@ -604,78 +604,6 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
}
|
||||
|
||||
async fn head_object(
|
||||
&self,
|
||||
key: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<ListingObject, DownloadError> {
|
||||
let kind = RequestKind::Head;
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let head_future = self
|
||||
.client
|
||||
.head_object()
|
||||
.bucket(self.bucket_name())
|
||||
.key(self.relative_path_to_s3_object(key))
|
||||
.send();
|
||||
|
||||
let head_future = tokio::time::timeout(self.timeout, head_future);
|
||||
|
||||
let res = tokio::select! {
|
||||
res = head_future => res,
|
||||
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
|
||||
};
|
||||
|
||||
let res = res.map_err(|_e| DownloadError::Timeout)?;
|
||||
|
||||
// do not incl. timeouts as errors in metrics but cancellations
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &res, started_at);
|
||||
|
||||
let data = match res {
|
||||
Ok(object_output) => object_output,
|
||||
Err(SdkError::ServiceError(e)) if matches!(e.err(), HeadObjectError::NotFound(_)) => {
|
||||
// Count this in the AttemptOutcome::Ok bucket, because 404 is not
|
||||
// an error: we expect to sometimes fetch an object and find it missing,
|
||||
// e.g. when probing for timeline indices.
|
||||
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Ok,
|
||||
started_at,
|
||||
);
|
||||
return Err(DownloadError::NotFound);
|
||||
}
|
||||
Err(e) => {
|
||||
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Err,
|
||||
started_at,
|
||||
);
|
||||
|
||||
return Err(DownloadError::Other(
|
||||
anyhow::Error::new(e).context("s3 head object"),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
let (Some(last_modified), Some(size)) = (data.last_modified, data.content_length) else {
|
||||
return Err(DownloadError::Other(anyhow!(
|
||||
"head_object doesn't contain last_modified or content_length"
|
||||
)))?;
|
||||
};
|
||||
Ok(ListingObject {
|
||||
key: key.to_owned(),
|
||||
last_modified: SystemTime::try_from(last_modified).map_err(|e| {
|
||||
DownloadError::Other(anyhow!("can't convert time '{last_modified}': {e}"))
|
||||
})?,
|
||||
size: size as u64,
|
||||
})
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
|
||||
@@ -30,7 +30,6 @@ pub struct UnreliableWrapper {
|
||||
#[derive(Debug, Hash, Eq, PartialEq)]
|
||||
enum RemoteOp {
|
||||
ListPrefixes(Option<RemotePath>),
|
||||
HeadObject(RemotePath),
|
||||
Upload(RemotePath),
|
||||
Download(RemotePath),
|
||||
Delete(RemotePath),
|
||||
@@ -138,16 +137,6 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
self.inner.list(prefix, mode, max_keys, cancel).await
|
||||
}
|
||||
|
||||
async fn head_object(
|
||||
&self,
|
||||
key: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<crate::ListingObject, DownloadError> {
|
||||
self.attempt(RemoteOp::HeadObject(key.clone()))
|
||||
.map_err(DownloadError::Other)?;
|
||||
self.inner.head_object(key, cancel).await
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
|
||||
@@ -9,3 +9,5 @@ serde.workspace = true
|
||||
serde_with.workspace = true
|
||||
const_format.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -60,3 +60,16 @@ pub struct TimelineCopyRequest {
|
||||
pub target_timeline_id: TimelineId,
|
||||
pub until_lsn: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct TimelineTermBumpRequest {
|
||||
/// bump to
|
||||
pub term: Option<u64>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct TimelineTermBumpResponse {
|
||||
// before the request
|
||||
pub previous_term: u64,
|
||||
pub current_term: u64,
|
||||
}
|
||||
|
||||
@@ -9,3 +9,5 @@ license.workspace = true
|
||||
anyhow.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -14,3 +14,5 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||
tracing.workspace = true
|
||||
tracing-opentelemetry.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -39,7 +39,7 @@ thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-tar.workspace = true
|
||||
tokio-util.workspace = true
|
||||
toml_edit = { workspace = true, features = ["serde"] }
|
||||
toml_edit.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-error.workspace = true
|
||||
tracing-subscriber = { workspace = true, features = ["json", "registry"] }
|
||||
@@ -54,6 +54,7 @@ walkdir.workspace = true
|
||||
pq_proto.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
metrics.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
const_format.workspace = true
|
||||
|
||||
@@ -70,7 +71,6 @@ criterion.workspace = true
|
||||
hex-literal.workspace = true
|
||||
camino-tempfile.workspace = true
|
||||
serde_assert.workspace = true
|
||||
tokio = { workspace = true, features = ["test-util"] }
|
||||
|
||||
[[bench]]
|
||||
name = "benchmarks"
|
||||
|
||||
@@ -9,6 +9,8 @@ anyhow.workspace = true
|
||||
utils.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[build-dependencies]
|
||||
anyhow.workspace = true
|
||||
bindgen.workspace = true
|
||||
|
||||
@@ -95,7 +95,6 @@ fn main() -> anyhow::Result<()> {
|
||||
.allowlist_var("ERROR")
|
||||
.allowlist_var("FATAL")
|
||||
.allowlist_var("PANIC")
|
||||
.allowlist_var("PG_VERSION_NUM")
|
||||
.allowlist_var("WPEVENT")
|
||||
.allowlist_var("WL_LATCH_SET")
|
||||
.allowlist_var("WL_SOCKET_READABLE")
|
||||
|
||||
@@ -282,11 +282,7 @@ mod tests {
|
||||
use std::cell::UnsafeCell;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use crate::{
|
||||
api_bindings::Level,
|
||||
bindings::{NeonWALReadResult, PG_VERSION_NUM},
|
||||
walproposer::Wrapper,
|
||||
};
|
||||
use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
|
||||
|
||||
use super::ApiImpl;
|
||||
|
||||
@@ -493,79 +489,41 @@ mod tests {
|
||||
|
||||
let (sender, receiver) = sync_channel(1);
|
||||
|
||||
// Messages definitions are at walproposer.h
|
||||
// xxx: it would be better to extract them from safekeeper crate and
|
||||
// use serialization/deserialization here.
|
||||
let greeting_tag = (b'g' as u64).to_ne_bytes();
|
||||
let proto_version = 2_u32.to_ne_bytes();
|
||||
let pg_version: [u8; 4] = PG_VERSION_NUM.to_ne_bytes();
|
||||
let proposer_id = [0; 16];
|
||||
let system_id = 0_u64.to_ne_bytes();
|
||||
let tenant_id = ttid.tenant_id.as_arr();
|
||||
let timeline_id = ttid.timeline_id.as_arr();
|
||||
let pg_tli = 1_u32.to_ne_bytes();
|
||||
let wal_seg_size = 16777216_u32.to_ne_bytes();
|
||||
let proposer_greeting = [
|
||||
greeting_tag.as_slice(),
|
||||
proto_version.as_slice(),
|
||||
pg_version.as_slice(),
|
||||
proposer_id.as_slice(),
|
||||
system_id.as_slice(),
|
||||
tenant_id.as_slice(),
|
||||
timeline_id.as_slice(),
|
||||
pg_tli.as_slice(),
|
||||
wal_seg_size.as_slice(),
|
||||
]
|
||||
.concat();
|
||||
|
||||
let voting_tag = (b'v' as u64).to_ne_bytes();
|
||||
let vote_request_term = 3_u64.to_ne_bytes();
|
||||
let proposer_id = [0; 16];
|
||||
let vote_request = [
|
||||
voting_tag.as_slice(),
|
||||
vote_request_term.as_slice(),
|
||||
proposer_id.as_slice(),
|
||||
]
|
||||
.concat();
|
||||
|
||||
let acceptor_greeting_term = 2_u64.to_ne_bytes();
|
||||
let acceptor_greeting_node_id = 1_u64.to_ne_bytes();
|
||||
let acceptor_greeting = [
|
||||
greeting_tag.as_slice(),
|
||||
acceptor_greeting_term.as_slice(),
|
||||
acceptor_greeting_node_id.as_slice(),
|
||||
]
|
||||
.concat();
|
||||
|
||||
let vote_response_term = 3_u64.to_ne_bytes();
|
||||
let vote_given = 1_u64.to_ne_bytes();
|
||||
let flush_lsn = 0x539_u64.to_ne_bytes();
|
||||
let truncate_lsn = 0x539_u64.to_ne_bytes();
|
||||
let th_len = 1_u32.to_ne_bytes();
|
||||
let th_term = 2_u64.to_ne_bytes();
|
||||
let th_lsn = 0x539_u64.to_ne_bytes();
|
||||
let timeline_start_lsn = 0x539_u64.to_ne_bytes();
|
||||
let vote_response = [
|
||||
voting_tag.as_slice(),
|
||||
vote_response_term.as_slice(),
|
||||
vote_given.as_slice(),
|
||||
flush_lsn.as_slice(),
|
||||
truncate_lsn.as_slice(),
|
||||
th_len.as_slice(),
|
||||
th_term.as_slice(),
|
||||
th_lsn.as_slice(),
|
||||
timeline_start_lsn.as_slice(),
|
||||
]
|
||||
.concat();
|
||||
|
||||
let my_impl: Box<dyn ApiImpl> = Box::new(MockImpl {
|
||||
wait_events: Cell::new(WaitEventsData {
|
||||
sk: std::ptr::null_mut(),
|
||||
event_mask: 0,
|
||||
}),
|
||||
expected_messages: vec![proposer_greeting, vote_request],
|
||||
expected_messages: vec![
|
||||
// TODO: When updating Postgres versions, this test will cause
|
||||
// problems. Postgres version in message needs updating.
|
||||
//
|
||||
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
|
||||
vec![
|
||||
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
|
||||
147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
|
||||
188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
|
||||
],
|
||||
// VoteRequest(VoteRequest { term: 3 })
|
||||
vec![
|
||||
118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0,
|
||||
],
|
||||
],
|
||||
expected_ptr: AtomicUsize::new(0),
|
||||
safekeeper_replies: vec![acceptor_greeting, vote_response],
|
||||
safekeeper_replies: vec![
|
||||
// Greeting(AcceptorGreeting { term: 2, node_id: NodeId(1) })
|
||||
vec![
|
||||
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
|
||||
],
|
||||
// VoteResponse(VoteResponse { term: 3, vote_given: 1, flush_lsn: 0/539, truncate_lsn: 0/539, term_history: [(2, 0/539)], timeline_start_lsn: 0/539 })
|
||||
vec![
|
||||
118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 57,
|
||||
5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
|
||||
0, 57, 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0,
|
||||
],
|
||||
],
|
||||
replies_ptr: AtomicUsize::new(0),
|
||||
sync_channel: sender,
|
||||
shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()),
|
||||
|
||||
@@ -10,7 +10,6 @@ use pageserver::{
|
||||
page_cache,
|
||||
repository::Value,
|
||||
task_mgr::TaskKind,
|
||||
tenant::storage_layer::inmemory_layer::SerializedBatch,
|
||||
tenant::storage_layer::InMemoryLayer,
|
||||
virtual_file,
|
||||
};
|
||||
@@ -68,16 +67,12 @@ async fn ingest(
|
||||
let layer =
|
||||
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
|
||||
|
||||
let data = Value::Image(Bytes::from(vec![0u8; put_size]));
|
||||
let data_ser_size = data.serialized_size().unwrap() as usize;
|
||||
let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
|
||||
let ctx = RequestContext::new(
|
||||
pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
|
||||
pageserver::context::DownloadBehavior::Download,
|
||||
);
|
||||
|
||||
const BATCH_SIZE: usize = 16;
|
||||
let mut batch = Vec::new();
|
||||
|
||||
for i in 0..put_count {
|
||||
lsn += put_size as u64;
|
||||
|
||||
@@ -100,17 +95,7 @@ async fn ingest(
|
||||
}
|
||||
}
|
||||
|
||||
batch.push((key.to_compact(), lsn, data_ser_size, data.clone()));
|
||||
if batch.len() >= BATCH_SIZE {
|
||||
let this_batch = std::mem::take(&mut batch);
|
||||
let serialized = SerializedBatch::from_values(this_batch);
|
||||
layer.put_batch(serialized, &ctx).await?;
|
||||
}
|
||||
}
|
||||
if !batch.is_empty() {
|
||||
let this_batch = std::mem::take(&mut batch);
|
||||
let serialized = SerializedBatch::from_values(this_batch);
|
||||
layer.put_batch(serialized, &ctx).await?;
|
||||
layer.put_value(key.to_compact(), lsn, &data, &ctx).await?;
|
||||
}
|
||||
layer.freeze(lsn + 1).await;
|
||||
|
||||
|
||||
@@ -1706,6 +1706,11 @@ async fn timeline_compact_handler(
|
||||
flags |= CompactFlags::ForceImageLayerCreation;
|
||||
}
|
||||
if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? {
|
||||
if !cfg!(feature = "testing") {
|
||||
return Err(ApiError::InternalServerError(anyhow!(
|
||||
"enhanced_gc_bottom_most_compaction is only available in testing mode"
|
||||
)));
|
||||
}
|
||||
flags |= CompactFlags::EnhancedGcBottomMostCompaction;
|
||||
}
|
||||
let wait_until_uploaded =
|
||||
@@ -2937,7 +2942,7 @@ pub fn make_router(
|
||||
)
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
|
||||
|r| api_handler(r, timeline_compact_handler),
|
||||
|r| testing_api_handler("run timeline compaction", r, timeline_compact_handler),
|
||||
)
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
|
||||
|
||||
@@ -1,10 +1,15 @@
|
||||
use std::{num::NonZeroUsize, sync::Arc};
|
||||
|
||||
use crate::tenant::ephemeral_file;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
|
||||
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
|
||||
pub enum L0FlushConfig {
|
||||
PageCached,
|
||||
#[serde(rename_all = "snake_case")]
|
||||
Direct { max_concurrency: NonZeroUsize },
|
||||
Direct {
|
||||
max_concurrency: NonZeroUsize,
|
||||
},
|
||||
}
|
||||
|
||||
impl Default for L0FlushConfig {
|
||||
@@ -20,12 +25,14 @@ impl Default for L0FlushConfig {
|
||||
pub struct L0FlushGlobalState(Arc<Inner>);
|
||||
|
||||
pub enum Inner {
|
||||
PageCached,
|
||||
Direct { semaphore: tokio::sync::Semaphore },
|
||||
}
|
||||
|
||||
impl L0FlushGlobalState {
|
||||
pub fn new(config: L0FlushConfig) -> Self {
|
||||
match config {
|
||||
L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)),
|
||||
L0FlushConfig::Direct { max_concurrency } => {
|
||||
let semaphore = tokio::sync::Semaphore::new(max_concurrency.get());
|
||||
Self(Arc::new(Inner::Direct { semaphore }))
|
||||
@@ -37,3 +44,13 @@ impl L0FlushGlobalState {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl L0FlushConfig {
|
||||
pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite {
|
||||
use L0FlushConfig::*;
|
||||
match self {
|
||||
PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes,
|
||||
Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -49,7 +49,7 @@ use tracing::{info, info_span};
|
||||
/// backwards-compatible changes to the metadata format.
|
||||
pub const STORAGE_FORMAT_VERSION: u16 = 3;
|
||||
|
||||
pub const DEFAULT_PG_VERSION: u32 = 16;
|
||||
pub const DEFAULT_PG_VERSION: u32 = 15;
|
||||
|
||||
// Magic constants used to identify different kinds of files
|
||||
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
|
||||
@@ -88,8 +88,6 @@ pub async fn shutdown_pageserver(
|
||||
) {
|
||||
use std::time::Duration;
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
// If the orderly shutdown below takes too long, we still want to make
|
||||
// sure that all walredo processes are killed and wait()ed on by us, not systemd.
|
||||
//
|
||||
@@ -243,10 +241,7 @@ pub async fn shutdown_pageserver(
|
||||
walredo_extraordinary_shutdown_thread.join().unwrap();
|
||||
info!("walredo_extraordinary_shutdown_thread done");
|
||||
|
||||
info!(
|
||||
elapsed_ms = started_at.elapsed().as_millis(),
|
||||
"Shut down successfully completed"
|
||||
);
|
||||
info!("Shut down successfully completed");
|
||||
std::process::exit(exit_code);
|
||||
}
|
||||
|
||||
|
||||
@@ -1803,23 +1803,6 @@ pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::n
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static NODE_UTILIZATION_SCORE: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
register_uint_gauge!(
|
||||
"pageserver_utilization_score",
|
||||
"The utilization score we report to the storage controller for scheduling, where 0 is empty, 1000000 is full, and anything above is considered overloaded",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_secondary_heatmap_total_size",
|
||||
"The total size in bytes of all layers in the most recently downloaded heatmap.",
|
||||
&["tenant_id", "shard_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum RemoteOpKind {
|
||||
Upload,
|
||||
@@ -1870,64 +1853,16 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("Failed to register tenant_task_events metric")
|
||||
});
|
||||
|
||||
pub struct BackgroundLoopSemaphoreMetrics {
|
||||
counters: EnumMap<BackgroundLoopKind, IntCounterPair>,
|
||||
durations: EnumMap<BackgroundLoopKind, Counter>,
|
||||
}
|
||||
|
||||
pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics> = Lazy::new(
|
||||
|| {
|
||||
let counters = register_int_counter_pair_vec!(
|
||||
"pageserver_background_loop_semaphore_wait_start_count",
|
||||
"Counter for background loop concurrency-limiting semaphore acquire calls started",
|
||||
"pageserver_background_loop_semaphore_wait_finish_count",
|
||||
"Counter for background loop concurrency-limiting semaphore acquire calls finished",
|
||||
&["task"],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let durations = register_counter_vec!(
|
||||
"pageserver_background_loop_semaphore_wait_duration_seconds",
|
||||
"Sum of wall clock time spent waiting on the background loop concurrency-limiting semaphore acquire calls",
|
||||
&["task"],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
BackgroundLoopSemaphoreMetrics {
|
||||
counters: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
|
||||
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
|
||||
counters.with_label_values(&[kind.into()])
|
||||
})),
|
||||
durations: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
|
||||
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
|
||||
durations.with_label_values(&[kind.into()])
|
||||
})),
|
||||
}
|
||||
},
|
||||
);
|
||||
|
||||
impl BackgroundLoopSemaphoreMetrics {
|
||||
pub(crate) fn measure_acquisition(&self, task: BackgroundLoopKind) -> impl Drop + '_ {
|
||||
struct Record<'a> {
|
||||
metrics: &'a BackgroundLoopSemaphoreMetrics,
|
||||
task: BackgroundLoopKind,
|
||||
_counter_guard: metrics::IntCounterPairGuard,
|
||||
start: Instant,
|
||||
}
|
||||
impl Drop for Record<'_> {
|
||||
fn drop(&mut self) {
|
||||
let elapsed = self.start.elapsed().as_secs_f64();
|
||||
self.metrics.durations[self.task].inc_by(elapsed);
|
||||
}
|
||||
}
|
||||
Record {
|
||||
metrics: self,
|
||||
task,
|
||||
_counter_guard: self.counters[task].guard(),
|
||||
start: Instant::now(),
|
||||
}
|
||||
}
|
||||
}
|
||||
pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
|
||||
register_int_counter_pair_vec!(
|
||||
"pageserver_background_loop_semaphore_wait_start_count",
|
||||
"Counter for background loop concurrency-limiting semaphore acquire calls started",
|
||||
"pageserver_background_loop_semaphore_wait_finish_count",
|
||||
"Counter for background loop concurrency-limiting semaphore acquire calls finished",
|
||||
&["task"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
@@ -2609,7 +2544,6 @@ use std::time::{Duration, Instant};
|
||||
use crate::context::{PageContentKind, RequestContext};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::mgr::TenantSlot;
|
||||
use crate::tenant::tasks::BackgroundLoopKind;
|
||||
|
||||
/// Maintain a per timeline gauge in addition to the global gauge.
|
||||
pub(crate) struct PerTimelineRemotePhysicalSizeGauge {
|
||||
|
||||
@@ -15,11 +15,12 @@ use crate::{aux_file, repository::*};
|
||||
use anyhow::{ensure, Context};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use enum_map::Enum;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::{
|
||||
dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
|
||||
relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
|
||||
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
|
||||
CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
|
||||
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
|
||||
};
|
||||
use pageserver_api::keyspace::SparseKeySpace;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
@@ -36,6 +37,7 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, info, trace, warn};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
use utils::pausable_failpoint;
|
||||
use utils::vec_map::{VecMap, VecMapOrdering};
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
|
||||
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
|
||||
@@ -172,7 +174,6 @@ impl Timeline {
|
||||
pending_deletions: Vec::new(),
|
||||
pending_nblocks: 0,
|
||||
pending_directory_entries: Vec::new(),
|
||||
pending_bytes: 0,
|
||||
lsn,
|
||||
}
|
||||
}
|
||||
@@ -726,17 +727,7 @@ impl Timeline {
|
||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||
let current_policy = self.last_aux_file_policy.load();
|
||||
match current_policy {
|
||||
Some(AuxFilePolicy::V1) => {
|
||||
warn!("this timeline is using deprecated aux file policy V1 (policy=V1)");
|
||||
self.list_aux_files_v1(lsn, ctx).await
|
||||
}
|
||||
None => {
|
||||
let res = self.list_aux_files_v1(lsn, ctx).await?;
|
||||
if !res.is_empty() {
|
||||
warn!("this timeline is using deprecated aux file policy V1 (policy=None)");
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await,
|
||||
Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
|
||||
Some(AuxFilePolicy::CrossValidation) => {
|
||||
let v1_result = self.list_aux_files_v1(lsn, ctx).await;
|
||||
@@ -1031,33 +1022,21 @@ pub struct DatadirModification<'a> {
|
||||
// The put-functions add the modifications here, and they are flushed to the
|
||||
// underlying key-value store by the 'finish' function.
|
||||
pending_lsns: Vec<Lsn>,
|
||||
pending_updates: HashMap<Key, Vec<(Lsn, usize, Value)>>,
|
||||
pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
|
||||
pending_deletions: Vec<(Range<Key>, Lsn)>,
|
||||
pending_nblocks: i64,
|
||||
|
||||
/// For special "directory" keys that store key-value maps, track the size of the map
|
||||
/// if it was updated in this modification.
|
||||
pending_directory_entries: Vec<(DirectoryKind, usize)>,
|
||||
|
||||
/// An **approximation** of how large our EphemeralFile write will be when committed.
|
||||
pending_bytes: usize,
|
||||
}
|
||||
|
||||
impl<'a> DatadirModification<'a> {
|
||||
// When a DatadirModification is committed, we do a monolithic serialization of all its contents. WAL records can
|
||||
// contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we
|
||||
// additionally specify a limit on how much payload a DatadirModification may contain before it should be committed.
|
||||
pub(crate) const MAX_PENDING_BYTES: usize = 8 * 1024 * 1024;
|
||||
|
||||
/// Get the current lsn
///
/// Returns the value of the `lsn` field by copy; nothing is modified.
pub(crate) fn get_lsn(&self) -> Lsn {
    self.lsn
}
|
||||
|
||||
/// Current value of the `pending_bytes` counter, i.e. the running approximation of how
/// large the write for this modification will be once it is committed.
pub(crate) fn approx_pending_bytes(&self) -> usize {
    self.pending_bytes
}
|
||||
|
||||
/// Set the current lsn
|
||||
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
|
||||
ensure!(
|
||||
@@ -1597,7 +1576,6 @@ impl<'a> DatadirModification<'a> {
|
||||
if aux_files_key_v1.is_empty() {
|
||||
None
|
||||
} else {
|
||||
warn!("this timeline is using deprecated aux file policy V1");
|
||||
self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
|
||||
Some(AuxFilePolicy::V1)
|
||||
}
|
||||
@@ -1791,25 +1769,21 @@ impl<'a> DatadirModification<'a> {
|
||||
// Flush relation and SLRU data blocks, keep metadata.
|
||||
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
|
||||
for (key, values) in self.pending_updates.drain() {
|
||||
let mut write_batch = Vec::new();
|
||||
for (lsn, value_ser_size, value) in values {
|
||||
for (lsn, value) in values {
|
||||
if key.is_rel_block_key() || key.is_slru_block_key() {
|
||||
// This bails out on first error without modifying pending_updates.
|
||||
// That's Ok, cf this function's doc comment.
|
||||
write_batch.push((key.to_compact(), lsn, value_ser_size, value));
|
||||
writer.put(key, lsn, &value, ctx).await?;
|
||||
} else {
|
||||
retained_pending_updates.entry(key).or_default().push((
|
||||
lsn,
|
||||
value_ser_size,
|
||||
value,
|
||||
));
|
||||
retained_pending_updates
|
||||
.entry(key)
|
||||
.or_default()
|
||||
.push((lsn, value));
|
||||
}
|
||||
}
|
||||
writer.put_batch(write_batch, ctx).await?;
|
||||
}
|
||||
|
||||
self.pending_updates = retained_pending_updates;
|
||||
self.pending_bytes = 0;
|
||||
|
||||
if pending_nblocks != 0 {
|
||||
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
|
||||
@@ -1835,20 +1809,17 @@ impl<'a> DatadirModification<'a> {
|
||||
self.pending_nblocks = 0;
|
||||
|
||||
if !self.pending_updates.is_empty() {
|
||||
// Ordering: the items in this batch do not need to be in any global order, but values for
|
||||
// a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on
|
||||
// this to do efficient updates to its index.
|
||||
let batch: Vec<(CompactKey, Lsn, usize, Value)> = self
|
||||
.pending_updates
|
||||
.drain()
|
||||
.flat_map(|(key, values)| {
|
||||
values.into_iter().map(move |(lsn, val_ser_size, value)| {
|
||||
(key.to_compact(), lsn, val_ser_size, value)
|
||||
})
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
// The put_batch call below expects the inputs to be sorted by Lsn,
|
||||
// so we do that first.
|
||||
let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
|
||||
self.pending_updates
|
||||
.drain()
|
||||
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
|
||||
.kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
|
||||
VecMapOrdering::GreaterOrEqual,
|
||||
);
|
||||
|
||||
writer.put_batch(batch, ctx).await?;
|
||||
writer.put_batch(lsn_ordered_batch, ctx).await?;
|
||||
}
|
||||
|
||||
if !self.pending_deletions.is_empty() {
|
||||
@@ -1873,8 +1844,6 @@ impl<'a> DatadirModification<'a> {
|
||||
writer.update_directory_entries_count(kind, count as u64);
|
||||
}
|
||||
|
||||
self.pending_bytes = 0;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1891,7 +1860,7 @@ impl<'a> DatadirModification<'a> {
|
||||
// Note: we don't check pending_deletions. It is an error to request a
|
||||
// value that has been removed, deletion only avoids leaking storage.
|
||||
if let Some(values) = self.pending_updates.get(&key) {
|
||||
if let Some((_, _, value)) = values.last() {
|
||||
if let Some((_, value)) = values.last() {
|
||||
return if let Value::Image(img) = value {
|
||||
Ok(img.clone())
|
||||
} else {
|
||||
@@ -1919,17 +1888,13 @@ impl<'a> DatadirModification<'a> {
|
||||
fn put(&mut self, key: Key, val: Value) {
|
||||
let values = self.pending_updates.entry(key).or_default();
|
||||
// Replace the previous value if it exists at the same lsn
|
||||
if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() {
|
||||
if let Some((last_lsn, last_value)) = values.last_mut() {
|
||||
if *last_lsn == self.lsn {
|
||||
*last_value_ser_size = val.serialized_size().unwrap() as usize;
|
||||
*last_value = val;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
let val_serialized_size = val.serialized_size().unwrap() as usize;
|
||||
self.pending_bytes += val_serialized_size;
|
||||
values.push((self.lsn, val_serialized_size, val));
|
||||
values.push((self.lsn, val));
|
||||
}
|
||||
|
||||
fn delete(&mut self, key_range: Range<Key>) {
|
||||
@@ -2059,7 +2024,7 @@ mod tests {
|
||||
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
let tline = tline.raw_timeline().unwrap();
|
||||
|
||||
|
||||
@@ -393,7 +393,7 @@ struct PageServerTask {
|
||||
|
||||
/// Tasks may optionally be launched for a particular tenant/timeline, enabling
|
||||
/// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
|
||||
tenant_shard_id: TenantShardId,
|
||||
tenant_shard_id: Option<TenantShardId>,
|
||||
timeline_id: Option<TimelineId>,
|
||||
|
||||
mutable: Mutex<MutableTaskState>,
|
||||
@@ -405,7 +405,7 @@ struct PageServerTask {
|
||||
pub fn spawn<F>(
|
||||
runtime: &tokio::runtime::Handle,
|
||||
kind: TaskKind,
|
||||
tenant_shard_id: TenantShardId,
|
||||
tenant_shard_id: Option<TenantShardId>,
|
||||
timeline_id: Option<TimelineId>,
|
||||
name: &str,
|
||||
future: F,
|
||||
@@ -550,7 +550,7 @@ pub async fn shutdown_tasks(
|
||||
let tasks = TASKS.lock().unwrap();
|
||||
for task in tasks.values() {
|
||||
if (kind.is_none() || Some(task.kind) == kind)
|
||||
&& (tenant_shard_id.is_none() || Some(task.tenant_shard_id) == tenant_shard_id)
|
||||
&& (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id)
|
||||
&& (timeline_id.is_none() || task.timeline_id == timeline_id)
|
||||
{
|
||||
task.cancel.cancel();
|
||||
@@ -573,8 +573,13 @@ pub async fn shutdown_tasks(
|
||||
};
|
||||
if let Some(mut join_handle) = join_handle {
|
||||
if log_all {
|
||||
// warn to catch these in tests; there shouldn't be any
|
||||
warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
|
||||
if tenant_shard_id.is_none() {
|
||||
// there are quite few of these
|
||||
info!(name = task.name, kind = ?task_kind, "stopping global task");
|
||||
} else {
|
||||
// warn to catch these in tests; there shouldn't be any
|
||||
warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
|
||||
}
|
||||
}
|
||||
if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
|
||||
.await
|
||||
|
||||
@@ -798,7 +798,7 @@ impl Tenant {
|
||||
task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
TaskKind::Attach,
|
||||
tenant_shard_id,
|
||||
Some(tenant_shard_id),
|
||||
None,
|
||||
"attach tenant",
|
||||
async move {
|
||||
@@ -3741,21 +3741,13 @@ impl Tenant {
|
||||
/// less than this (via eviction and on-demand downloads), but this function enables
|
||||
/// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
|
||||
/// by keeping important things on local disk.
|
||||
///
|
||||
/// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less
|
||||
/// than they report here, due to layer eviction. Tenants with many active branches may
|
||||
/// actually use more than they report here.
|
||||
pub(crate) fn local_storage_wanted(&self) -> u64 {
|
||||
let mut wanted = 0;
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
|
||||
// Heuristic: we use the max() of the timelines' visible sizes, rather than the sum. This
|
||||
// reflects the observation that on tenants with multiple large branches, typically only one
|
||||
// of them is used actively enough to occupy space on disk.
|
||||
timelines
|
||||
.values()
|
||||
.map(|t| t.metrics.visible_physical_size_gauge.get())
|
||||
.max()
|
||||
.unwrap_or(0)
|
||||
for timeline in timelines.values() {
|
||||
wanted += timeline.metrics.visible_physical_size_gauge.get();
|
||||
}
|
||||
wanted
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5940,10 +5932,10 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// the default aux file policy to switch is v2 if not set by the admins
|
||||
// the default aux file policy to switch is v1 if not set by the admins
|
||||
assert_eq!(
|
||||
harness.tenant_conf.switch_aux_file_policy,
|
||||
AuxFilePolicy::default_tenant_config()
|
||||
AuxFilePolicy::V1
|
||||
);
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
@@ -5987,8 +5979,8 @@ mod tests {
|
||||
);
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
Some(AuxFilePolicy::V2),
|
||||
"aux file is written with switch_aux_file_policy unset (which is v2), so we should use v2 there"
|
||||
Some(AuxFilePolicy::V1),
|
||||
"aux file is written with switch_aux_file_policy unset (which is v1), so we should keep v1"
|
||||
);
|
||||
|
||||
// we can read everything from the storage
|
||||
@@ -6010,8 +6002,8 @@ mod tests {
|
||||
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
Some(AuxFilePolicy::V2),
|
||||
"keep v2 storage format when new files are written"
|
||||
Some(AuxFilePolicy::V1),
|
||||
"keep v1 storage format when new files are written"
|
||||
);
|
||||
|
||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
@@ -6027,7 +6019,7 @@ mod tests {
|
||||
|
||||
// child copies the last flag even if that is not on remote storage yet
|
||||
assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2);
|
||||
assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V2));
|
||||
assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V1));
|
||||
|
||||
let files = child.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
assert_eq!(files.get("pg_logical/mappings/test1"), None);
|
||||
|
||||
@@ -21,6 +21,7 @@ pub struct EphemeralFile {
|
||||
}
|
||||
|
||||
mod page_caching;
|
||||
pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite;
|
||||
mod zero_padded_read_write;
|
||||
|
||||
impl EphemeralFile {
|
||||
@@ -51,10 +52,12 @@ impl EphemeralFile {
|
||||
)
|
||||
.await?;
|
||||
|
||||
let prewarm = conf.l0_flush.prewarm_on_write();
|
||||
|
||||
Ok(EphemeralFile {
|
||||
_tenant_shard_id: tenant_shard_id,
|
||||
_timeline_id: timeline_id,
|
||||
rw: page_caching::RW::new(file, gate_guard),
|
||||
rw: page_caching::RW::new(file, prewarm, gate_guard),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -79,8 +82,6 @@ impl EphemeralFile {
|
||||
self.rw.read_blk(blknum, ctx).await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
// This is a test helper: outside of tests, we are always written to via a pre-serialized batch.
|
||||
pub(crate) async fn write_blob(
|
||||
&mut self,
|
||||
srcbuf: &[u8],
|
||||
@@ -88,30 +89,17 @@ impl EphemeralFile {
|
||||
) -> Result<u64, io::Error> {
|
||||
let pos = self.rw.bytes_written();
|
||||
|
||||
let mut len_bytes = std::io::Cursor::new(Vec::new());
|
||||
crate::tenant::storage_layer::inmemory_layer::SerializedBatch::write_blob_length(
|
||||
srcbuf.len(),
|
||||
&mut len_bytes,
|
||||
);
|
||||
let len_bytes = len_bytes.into_inner();
|
||||
|
||||
// Write the length field
|
||||
self.rw.write_all_borrowed(&len_bytes, ctx).await?;
|
||||
if srcbuf.len() < 0x80 {
|
||||
// short one-byte length header
|
||||
let len_buf = [srcbuf.len() as u8];
|
||||
|
||||
// Write the payload
|
||||
self.rw.write_all_borrowed(srcbuf, ctx).await?;
|
||||
|
||||
Ok(pos)
|
||||
}
|
||||
|
||||
/// Returns the offset at which the first byte of the input was written, for use
|
||||
/// in constructing indices over the written value.
|
||||
pub(crate) async fn write_raw(
|
||||
&mut self,
|
||||
srcbuf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> Result<u64, io::Error> {
|
||||
let pos = self.rw.bytes_written();
|
||||
self.rw.write_all_borrowed(&len_buf, ctx).await?;
|
||||
} else {
|
||||
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
|
||||
len_buf[0] |= 0x80;
|
||||
self.rw.write_all_borrowed(&len_buf, ctx).await?;
|
||||
}
|
||||
|
||||
// Write the payload
|
||||
self.rw.write_all_borrowed(srcbuf, ctx).await?;
|
||||
|
||||
@@ -1,15 +1,15 @@
|
||||
//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the
|
||||
//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`].
|
||||
//!
|
||||
//! Subject to removal in <https://github.com/neondatabase/neon/pull/8537>
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::{self, PAGE_SZ};
|
||||
use crate::tenant::block_io::BlockLease;
|
||||
use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
|
||||
use std::io::{self};
|
||||
use once_cell::sync::Lazy;
|
||||
use std::io::{self, ErrorKind};
|
||||
use std::ops::{Deref, Range};
|
||||
use tokio_epoll_uring::BoundedBuf;
|
||||
use tracing::*;
|
||||
|
||||
@@ -18,17 +18,33 @@ use super::zero_padded_read_write;
|
||||
/// See module-level comment.
|
||||
pub struct RW {
|
||||
page_cache_file_id: page_cache::FileId,
|
||||
rw: super::zero_padded_read_write::RW<size_tracking_writer::Writer<VirtualFile>>,
|
||||
rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
|
||||
/// Gate guard is held on as long as we need to do operations in the path (delete on drop).
|
||||
_gate_guard: utils::sync::gate::GateGuard,
|
||||
}
|
||||
|
||||
/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
/// should we pre-warm the [`crate::page_cache`] with the contents?
#[derive(Clone, Copy)]
pub enum PrewarmOnWrite {
    /// Copy each block that was just written into the page cache as well.
    Yes,
    /// Write to the file only; the page cache is not touched on the write path.
    No,
}
|
||||
|
||||
impl RW {
|
||||
pub fn new(file: VirtualFile, _gate_guard: utils::sync::gate::GateGuard) -> Self {
|
||||
pub fn new(
|
||||
file: VirtualFile,
|
||||
prewarm_on_write: PrewarmOnWrite,
|
||||
_gate_guard: utils::sync::gate::GateGuard,
|
||||
) -> Self {
|
||||
let page_cache_file_id = page_cache::next_file_id();
|
||||
Self {
|
||||
page_cache_file_id,
|
||||
rw: super::zero_padded_read_write::RW::new(size_tracking_writer::Writer::new(file)),
|
||||
rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new(
|
||||
page_cache_file_id,
|
||||
file,
|
||||
prewarm_on_write,
|
||||
)),
|
||||
_gate_guard,
|
||||
}
|
||||
}
|
||||
@@ -68,10 +84,10 @@ impl RW {
|
||||
let vec = Vec::with_capacity(size);
|
||||
|
||||
// read from disk what we've already flushed
|
||||
let file_size_tracking_writer = self.rw.as_writer();
|
||||
let flushed_range = 0..usize::try_from(file_size_tracking_writer.bytes_written()).unwrap();
|
||||
let mut vec = file_size_tracking_writer
|
||||
.as_inner()
|
||||
let writer = self.rw.as_writer();
|
||||
let flushed_range = writer.written_range();
|
||||
let mut vec = writer
|
||||
.file
|
||||
.read_exact_at(
|
||||
vec.slice(0..(flushed_range.end - flushed_range.start)),
|
||||
u64::try_from(flushed_range.start).unwrap(),
|
||||
@@ -106,7 +122,7 @@ impl RW {
|
||||
format!(
|
||||
"ephemeral file: read immutable page #{}: {}: {:#}",
|
||||
blknum,
|
||||
self.rw.as_writer().as_inner().path,
|
||||
self.rw.as_writer().file.path,
|
||||
e,
|
||||
),
|
||||
)
|
||||
@@ -116,7 +132,7 @@ impl RW {
|
||||
}
|
||||
page_cache::ReadBufResult::NotFound(write_guard) => {
|
||||
let write_guard = writer
|
||||
.as_inner()
|
||||
.file
|
||||
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx)
|
||||
.await?;
|
||||
let read_guard = write_guard.mark_valid();
|
||||
@@ -138,16 +154,137 @@ impl Drop for RW {
|
||||
|
||||
// unlink the file
|
||||
// we are clear to do this, because we have entered a gate
|
||||
let path = &self.rw.as_writer().as_inner().path;
|
||||
let res = std::fs::remove_file(path);
|
||||
let res = std::fs::remove_file(&self.rw.as_writer().file.path);
|
||||
if let Err(e) = res {
|
||||
if e.kind() != std::io::ErrorKind::NotFound {
|
||||
// just never log the not found errors, we cannot do anything for them; on detach
|
||||
// the tenant directory is already gone.
|
||||
//
|
||||
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
|
||||
error!("could not remove ephemeral file '{path}': {e}");
|
||||
error!(
|
||||
"could not remove ephemeral file '{}': {}",
|
||||
self.rw.as_writer().file.path,
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Writer that forwards whole-page writes to a [`VirtualFile`], counts the blocks written,
/// and, depending on `prewarm_on_write`, also copies them into the page cache.
struct PreWarmingWriter {
    /// Whether freshly written blocks are additionally inserted into the page cache.
    prewarm_on_write: PrewarmOnWrite,
    /// Number of `PAGE_SZ`-sized blocks written successfully so far.
    nwritten_blocks: u32,
    /// Page cache identity under which this file's blocks are looked up.
    page_cache_file_id: page_cache::FileId,
    /// Destination of all writes.
    file: VirtualFile,
}
|
||||
|
||||
impl PreWarmingWriter {
|
||||
fn new(
|
||||
page_cache_file_id: page_cache::FileId,
|
||||
file: VirtualFile,
|
||||
prewarm_on_write: PrewarmOnWrite,
|
||||
) -> Self {
|
||||
Self {
|
||||
prewarm_on_write,
|
||||
nwritten_blocks: 0,
|
||||
page_cache_file_id,
|
||||
file,
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the byte range within `file` that has been written though `write_all`.
|
||||
///
|
||||
/// The returned range would be invalidated by another `write_all`. To prevent that, we capture `&_`.
|
||||
fn written_range(&self) -> (impl Deref<Target = Range<usize>> + '_) {
|
||||
let nwritten_blocks = usize::try_from(self.nwritten_blocks).unwrap();
|
||||
struct Wrapper(Range<usize>);
|
||||
impl Deref for Wrapper {
|
||||
type Target = Range<usize>;
|
||||
fn deref(&self) -> &Range<usize> {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
Wrapper(0..nwritten_blocks * PAGE_SZ)
|
||||
}
|
||||
}
|
||||
|
||||
/// Write path of [`PreWarmingWriter`]: writes whole pages to the file, optionally copies
/// them into the page cache, and advances the written-block counter.
impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter {
    /// Write all of `buf` to the file and hand the buffer back together with its length.
    ///
    /// # Errors
    ///
    /// A failed file write is returned as an [`std::io::Error`] of kind `Other`; in that case
    /// `nwritten_blocks` is left unchanged and the page cache is not touched.
    ///
    /// # Panics
    ///
    /// Panics if `buf.len()` is not a multiple of `PAGE_SZ`, if the file reports a different
    /// number of bytes written than requested, or if the block count does not fit in `u32`.
    async fn write_all<Buf: tokio_epoll_uring::IoBuf + Send>(
        &mut self,
        buf: FullSlice<Buf>,
        ctx: &RequestContext,
    ) -> std::io::Result<(usize, FullSlice<Buf>)> {
        let buflen = buf.len();
        assert_eq!(
            buflen % PAGE_SZ,
            0,
            "{buflen} ; we know TAIL_SZ is a PAGE_SZ multiple, and write_buffered_borrowed is used"
        );

        // Do the IO.
        let buf = match self.file.write_all(buf, ctx).await {
            (buf, Ok(nwritten)) => {
                assert_eq!(nwritten, buflen);
                buf
            }
            (_, Err(e)) => {
                return Err(std::io::Error::new(
                    ErrorKind::Other,
                    // order error before path because path is long and error is short
                    format!(
                        "ephemeral_file: write_blob: write-back tail self.nwritten_blocks={}, buflen={}, {:#}: {}",
                        self.nwritten_blocks, buflen, e, self.file.path,
                    ),
                ));
            }
        };

        let nblocks = buflen / PAGE_SZ;
        // Checked conversion: the per-block casts in the loop below rely on this having succeeded.
        let nblocks32 = u32::try_from(nblocks).unwrap();

        if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) {
            // Pre-warm page cache with the contents.
            // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
            // benefits the code that writes InMemoryLayer=>L0 layers.

            let cache = page_cache::get();
            // One lazily created context, reused for every pre-warm lookup.
            static CTX: Lazy<RequestContext> = Lazy::new(|| {
                RequestContext::new(
                    crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
                    crate::context::DownloadBehavior::Error,
                )
            });
            for blknum_in_buffer in 0..nblocks {
                let blk_in_buffer =
                    &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
                // Block number within the file: blocks written by earlier calls plus the
                // position inside this buffer. The cast cannot truncate because
                // `blknum_in_buffer < nblocks` and `nblocks` was shown to fit in `u32` above.
                let blknum = self
                    .nwritten_blocks
                    .checked_add(blknum_in_buffer as u32)
                    .unwrap();
                match cache
                    .read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
                    .await
                {
                    Err(e) => {
                        error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
                        // fail gracefully, it's not the end of the world if we can't pre-warm the cache here
                    }
                    Ok(v) => match v {
                        page_cache::ReadBufResult::Found(_guard) => {
                            // This function takes &mut self, so, it shouldn't be possible to reach this point.
                            unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
                                          and this function takes &mut self, so, no concurrent read_blk is possible");
                        }
                        page_cache::ReadBufResult::NotFound(mut write_guard) => {
                            // Fill the cache slot with the block and publish it; the value
                            // returned by `mark_valid` is not needed here and is dropped at once.
                            write_guard.copy_from_slice(blk_in_buffer);
                            let _ = write_guard.mark_valid();
                        }
                    },
                }
            }
        }

        // Only advance the counter once the write (and any pre-warming) is done.
        self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap();
        Ok((buflen, buf))
    }
}
|
||||
|
||||
@@ -464,7 +464,7 @@ impl LayerMap {
|
||||
pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
|
||||
// TODO: See #3869, resulting #4088, attempted fix and repro #4094
|
||||
|
||||
if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
|
||||
if Self::is_l0(&layer_desc.key_range) {
|
||||
self.l0_delta_layers.push(layer_desc.clone().into());
|
||||
}
|
||||
|
||||
@@ -483,7 +483,7 @@ impl LayerMap {
|
||||
self.historic
|
||||
.remove(historic_layer_coverage::LayerKey::from(layer_desc));
|
||||
let layer_key = layer_desc.key();
|
||||
if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
|
||||
if Self::is_l0(&layer_desc.key_range) {
|
||||
let len_before = self.l0_delta_layers.len();
|
||||
let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
|
||||
l0_delta_layers.retain(|other| other.key() != layer_key);
|
||||
@@ -600,8 +600,8 @@ impl LayerMap {
|
||||
}
|
||||
|
||||
/// Check if the key range resembles that of an L0 layer.
|
||||
pub fn is_l0(key_range: &Range<Key>, is_delta_layer: bool) -> bool {
|
||||
is_delta_layer && key_range == &(Key::MIN..Key::MAX)
|
||||
pub fn is_l0(key_range: &Range<Key>) -> bool {
|
||||
key_range == &(Key::MIN..Key::MAX)
|
||||
}
|
||||
|
||||
/// This function determines which layers are counted in `count_deltas`:
|
||||
@@ -628,7 +628,7 @@ impl LayerMap {
|
||||
/// than just the current partition_range.
|
||||
pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range<Key>) -> bool {
|
||||
// Case 1
|
||||
if !Self::is_l0(&layer.key_range, layer.is_delta) {
|
||||
if !Self::is_l0(&layer.key_range) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -565,7 +565,7 @@ mod tests {
|
||||
);
|
||||
let expected_bytes = vec![
|
||||
/* TimelineMetadataHeader */
|
||||
74, 104, 158, 105, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
|
||||
4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
|
||||
/* TimelineMetadataBodyV2 */
|
||||
0, 0, 0, 0, 0, 0, 2, 0, // disk_consistent_lsn (8 bytes)
|
||||
1, 0, 0, 0, 0, 0, 0, 1, 0, // prev_record_lsn (9 bytes)
|
||||
@@ -574,7 +574,7 @@ mod tests {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, // ancestor_lsn (8 bytes)
|
||||
0, 0, 0, 0, 0, 0, 0, 0, // latest_gc_cutoff_lsn (8 bytes)
|
||||
0, 0, 0, 0, 0, 0, 0, 0, // initdb_lsn (8 bytes)
|
||||
0, 0, 0, 16, // pg_version (4 bytes)
|
||||
0, 0, 0, 15, // pg_version (4 bytes)
|
||||
/* padding bytes */
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
@@ -1728,7 +1728,7 @@ impl RemoteTimelineClient {
|
||||
task_mgr::spawn(
|
||||
&self.runtime,
|
||||
TaskKind::RemoteUploadTask,
|
||||
self.tenant_shard_id,
|
||||
Some(self.tenant_shard_id),
|
||||
Some(self.timeline_id),
|
||||
"remote upload",
|
||||
async move {
|
||||
|
||||
@@ -8,7 +8,6 @@ use std::{sync::Arc, time::SystemTime};
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
disk_usage_eviction_task::DiskUsageEvictionInfo,
|
||||
metrics::SECONDARY_HEATMAP_TOTAL_SIZE,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
};
|
||||
|
||||
@@ -106,9 +105,6 @@ pub(crate) struct SecondaryTenant {
|
||||
|
||||
// Sum of layer sizes on local disk
|
||||
pub(super) resident_size_metric: UIntGauge,
|
||||
|
||||
// Sum of layer sizes in the most recently downloaded heatmap
|
||||
pub(super) heatmap_total_size_metric: UIntGauge,
|
||||
}
|
||||
|
||||
impl Drop for SecondaryTenant {
|
||||
@@ -116,7 +112,6 @@ impl Drop for SecondaryTenant {
|
||||
let tenant_id = self.tenant_shard_id.tenant_id.to_string();
|
||||
let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
|
||||
let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
|
||||
let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -133,10 +128,6 @@ impl SecondaryTenant {
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id])
|
||||
.unwrap();
|
||||
|
||||
let heatmap_total_size_metric = SECONDARY_HEATMAP_TOTAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id])
|
||||
.unwrap();
|
||||
|
||||
Arc::new(Self {
|
||||
tenant_shard_id,
|
||||
// todo: shall we make this a descendent of the
|
||||
@@ -154,7 +145,6 @@ impl SecondaryTenant {
|
||||
progress: std::sync::Mutex::default(),
|
||||
|
||||
resident_size_metric,
|
||||
heatmap_total_size_metric,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -829,12 +829,6 @@ impl<'a> TenantDownloader<'a> {
|
||||
layers_downloaded: 0,
|
||||
bytes_downloaded: 0,
|
||||
};
|
||||
|
||||
// Also expose heatmap bytes_total as a metric
|
||||
self.secondary_state
|
||||
.heatmap_total_size_metric
|
||||
.set(heatmap_stats.bytes);
|
||||
|
||||
// Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock
|
||||
let mut delete_layers = Vec::new();
|
||||
let mut delete_timelines = Vec::new();
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
pub mod delta_layer;
|
||||
pub mod image_layer;
|
||||
pub mod inmemory_layer;
|
||||
pub(crate) mod inmemory_layer;
|
||||
pub(crate) mod layer;
|
||||
mod layer_desc;
|
||||
mod layer_name;
|
||||
|
||||
@@ -232,18 +232,6 @@ pub struct DeltaLayerInner {
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
}
|
||||
|
||||
impl DeltaLayerInner {
|
||||
pub(crate) fn layer_dbg_info(&self) -> String {
|
||||
format!(
|
||||
"delta {}..{} {}..{}",
|
||||
self.key_range().start,
|
||||
self.key_range().end,
|
||||
self.lsn_range().start,
|
||||
self.lsn_range().end
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for DeltaLayerInner {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("DeltaLayerInner")
|
||||
@@ -1539,10 +1527,6 @@ pub struct DeltaLayerIterator<'a> {
|
||||
}
|
||||
|
||||
impl<'a> DeltaLayerIterator<'a> {
|
||||
/// Debug description of the layer behind this iterator; simply forwards to
/// `layer_dbg_info` of the `delta_layer` field.
pub(crate) fn layer_dbg_info(&self) -> String {
    self.delta_layer.layer_dbg_info()
}
|
||||
|
||||
/// Retrieve a batch of key-value pairs into the iterator buffer.
|
||||
async fn next_batch(&mut self) -> anyhow::Result<()> {
|
||||
assert!(self.key_values_batch.is_empty());
|
||||
|
||||
@@ -167,17 +167,6 @@ pub struct ImageLayerInner {
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
}
|
||||
|
||||
impl ImageLayerInner {
|
||||
pub(crate) fn layer_dbg_info(&self) -> String {
|
||||
format!(
|
||||
"image {}..{} {}",
|
||||
self.key_range().start,
|
||||
self.key_range().end,
|
||||
self.lsn()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ImageLayerInner {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("ImageLayerInner")
|
||||
@@ -1035,10 +1024,6 @@ pub struct ImageLayerIterator<'a> {
|
||||
}
|
||||
|
||||
impl<'a> ImageLayerIterator<'a> {
|
||||
/// Debug description of the layer behind this iterator; simply forwards to
/// `layer_dbg_info` of the `image_layer` field.
pub(crate) fn layer_dbg_info(&self) -> String {
    self.image_layer.layer_dbg_info()
}
|
||||
|
||||
/// Retrieve a batch of key-value pairs into the iterator buffer.
|
||||
async fn next_batch(&mut self) -> anyhow::Result<()> {
|
||||
assert!(self.key_values_batch.is_empty());
|
||||
|
||||
@@ -13,7 +13,7 @@ use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
|
||||
use crate::{l0_flush, page_cache};
|
||||
use crate::{l0_flush, page_cache, walrecord};
|
||||
use anyhow::{anyhow, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::key::CompactKey;
|
||||
@@ -33,7 +33,7 @@ use std::fmt::Write;
|
||||
use std::ops::Range;
|
||||
use std::sync::atomic::Ordering as AtomicOrdering;
|
||||
use std::sync::atomic::{AtomicU64, AtomicUsize};
|
||||
use tokio::sync::RwLock;
|
||||
use tokio::sync::{RwLock, RwLockWriteGuard};
|
||||
|
||||
use super::{
|
||||
DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
|
||||
@@ -249,7 +249,9 @@ impl InMemoryLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
///
|
||||
/// this is likely completely unused
|
||||
pub async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
let inner = self.inner.read().await;
|
||||
|
||||
let end_str = self.end_lsn_or_max();
|
||||
|
||||
println!(
|
||||
@@ -257,6 +259,39 @@ impl InMemoryLayer {
|
||||
self.timeline_id, self.start_lsn, end_str,
|
||||
);
|
||||
|
||||
if !verbose {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let cursor = inner.file.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (key, vec_map) in inner.index.iter() {
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
let mut desc = String::new();
|
||||
cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
|
||||
let val = Value::des(&buf);
|
||||
match val {
|
||||
Ok(Value::Image(img)) => {
|
||||
write!(&mut desc, " img {} bytes", img.len())?;
|
||||
}
|
||||
Ok(Value::WalRecord(rec)) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec).unwrap();
|
||||
write!(
|
||||
&mut desc,
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)?;
|
||||
}
|
||||
Err(err) => {
|
||||
write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
|
||||
}
|
||||
}
|
||||
println!(" key {} at {}: {}", key, lsn, desc);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -320,82 +355,6 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Offset of a particular Value within a serialized batch.
struct SerializedBatchOffset {
    /// Key of the value that starts at `offset`.
    key: CompactKey,
    /// LSN of the value that starts at `offset`.
    lsn: Lsn,
    /// offset in bytes from the start of the batch's buffer to the Value's serialized size header.
    offset: u64,
}
|
||||
|
||||
/// A batch of values serialized back to back into one buffer, plus the bookkeeping needed
/// to locate each value inside that buffer.
pub struct SerializedBatch {
    /// Blobs serialized in EphemeralFile's native format, ready for passing to [`EphemeralFile::write_raw`].
    pub(crate) raw: Vec<u8>,

    /// Index of values in [`Self::raw`], using offsets relative to the start of the buffer.
    offsets: Vec<SerializedBatchOffset>,

    /// The highest LSN of any value in the batch
    pub(crate) max_lsn: Lsn,
}
|
||||
|
||||
impl SerializedBatch {
|
||||
/// Write a blob length in the internal format of the EphemeralFile
|
||||
pub(crate) fn write_blob_length(len: usize, cursor: &mut std::io::Cursor<Vec<u8>>) {
|
||||
use std::io::Write;
|
||||
|
||||
if len < 0x80 {
|
||||
// short one-byte length header
|
||||
let len_buf = [len as u8];
|
||||
|
||||
cursor
|
||||
.write_all(&len_buf)
|
||||
.expect("Writing to Vec is infallible");
|
||||
} else {
|
||||
let mut len_buf = u32::to_be_bytes(len as u32);
|
||||
len_buf[0] |= 0x80;
|
||||
cursor
|
||||
.write_all(&len_buf)
|
||||
.expect("Writing to Vec is infallible");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> Self {
|
||||
// Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by
|
||||
// [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`]
|
||||
let buffer_size = batch.iter().map(|i| i.2).sum::<usize>() + 4 * batch.len();
|
||||
let mut cursor = std::io::Cursor::new(Vec::<u8>::with_capacity(buffer_size));
|
||||
|
||||
let mut offsets: Vec<SerializedBatchOffset> = Vec::with_capacity(batch.len());
|
||||
let mut max_lsn: Lsn = Lsn(0);
|
||||
for (key, lsn, val_ser_size, val) in batch {
|
||||
let relative_off = cursor.position();
|
||||
|
||||
Self::write_blob_length(val_ser_size, &mut cursor);
|
||||
val.ser_into(&mut cursor)
|
||||
.expect("Writing into in-memory buffer is infallible");
|
||||
|
||||
offsets.push(SerializedBatchOffset {
|
||||
key,
|
||||
lsn,
|
||||
offset: relative_off,
|
||||
});
|
||||
max_lsn = std::cmp::max(max_lsn, lsn);
|
||||
}
|
||||
|
||||
let buffer = cursor.into_inner();
|
||||
|
||||
// Assert that we didn't do any extra allocations while building buffer.
|
||||
debug_assert!(buffer.len() <= buffer_size);
|
||||
|
||||
Self {
|
||||
raw: buffer,
|
||||
offsets,
|
||||
max_lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
|
||||
write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
|
||||
}
|
||||
@@ -456,20 +415,37 @@ impl InMemoryLayer {
|
||||
})
|
||||
}
|
||||
|
||||
// Write path.
|
||||
pub async fn put_batch(
|
||||
// Write operations
|
||||
|
||||
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
|
||||
/// Adds the page version to the in-memory tree
|
||||
pub async fn put_value(
|
||||
&self,
|
||||
serialized_batch: SerializedBatch,
|
||||
key: CompactKey,
|
||||
lsn: Lsn,
|
||||
buf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
let mut inner = self.inner.write().await;
|
||||
self.assert_writable();
|
||||
self.put_value_locked(&mut inner, key, lsn, buf, ctx).await
|
||||
}
|
||||
|
||||
let base_off = {
|
||||
inner
|
||||
async fn put_value_locked(
|
||||
&self,
|
||||
locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
|
||||
key: CompactKey,
|
||||
lsn: Lsn,
|
||||
buf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
|
||||
|
||||
let off = {
|
||||
locked_inner
|
||||
.file
|
||||
.write_raw(
|
||||
&serialized_batch.raw,
|
||||
.write_blob(
|
||||
buf,
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
.build(),
|
||||
@@ -477,23 +453,15 @@ impl InMemoryLayer {
|
||||
.await?
|
||||
};
|
||||
|
||||
for SerializedBatchOffset {
|
||||
key,
|
||||
lsn,
|
||||
offset: relative_off,
|
||||
} in serialized_batch.offsets
|
||||
{
|
||||
let off = base_off + relative_off;
|
||||
let vec_map = inner.index.entry(key).or_default();
|
||||
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
warn!("Key {} at {} already exists", key, lsn);
|
||||
}
|
||||
let vec_map = locked_inner.index.entry(key).or_default();
|
||||
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
warn!("Key {} at {} already exists", key, lsn);
|
||||
}
|
||||
|
||||
let size = inner.file.len();
|
||||
inner.resource_units.maybe_publish_size(size);
|
||||
let size = locked_inner.file.len();
|
||||
locked_inner.resource_units.maybe_publish_size(size);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -568,6 +536,7 @@ impl InMemoryLayer {
|
||||
|
||||
use l0_flush::Inner;
|
||||
let _concurrency_permit = match l0_flush_global_state {
|
||||
Inner::PageCached => None,
|
||||
Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
|
||||
};
|
||||
|
||||
@@ -599,6 +568,34 @@ impl InMemoryLayer {
|
||||
.await?;
|
||||
|
||||
match l0_flush_global_state {
|
||||
l0_flush::Inner::PageCached => {
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
.build();
|
||||
|
||||
let mut buf = Vec::new();
|
||||
|
||||
let cursor = inner.file.block_cursor();
|
||||
|
||||
for (key, vec_map) in inner.index.iter() {
|
||||
// Write all page versions
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
|
||||
let will_init = Value::des(&buf)?.will_init();
|
||||
let (tmp, res) = delta_layer_writer
|
||||
.put_value_bytes(
|
||||
Key::from_compact(*key),
|
||||
*lsn,
|
||||
buf.slice_len(),
|
||||
will_init,
|
||||
&ctx,
|
||||
)
|
||||
.await;
|
||||
res?;
|
||||
buf = tmp.into_raw_slice().into_inner();
|
||||
}
|
||||
}
|
||||
}
|
||||
l0_flush::Inner::Direct { .. } => {
|
||||
let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
|
||||
assert_eq!(
|
||||
|
||||
@@ -1296,10 +1296,7 @@ impl LayerInner {
|
||||
lsn_end: lsn_range.end,
|
||||
remote: !resident,
|
||||
access_stats,
|
||||
l0: crate::tenant::layer_map::LayerMap::is_l0(
|
||||
&self.layer_desc().key_range,
|
||||
self.layer_desc().is_delta,
|
||||
),
|
||||
l0: crate::tenant::layer_map::LayerMap::is_l0(&self.layer_desc().key_range),
|
||||
}
|
||||
} else {
|
||||
let lsn = self.desc.image_layer_lsn();
|
||||
|
||||
@@ -256,10 +256,6 @@ impl LayerName {
|
||||
LayerName::Delta(layer) => &layer.key_range,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_delta(&self) -> bool {
|
||||
matches!(self, LayerName::Delta(_))
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for LayerName {
|
||||
|
||||
@@ -3,7 +3,6 @@ use std::{
|
||||
collections::{binary_heap, BinaryHeap},
|
||||
};
|
||||
|
||||
use anyhow::bail;
|
||||
use pageserver_api::key::Key;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -27,13 +26,6 @@ impl<'a> LayerRef<'a> {
|
||||
Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)),
|
||||
}
|
||||
}
|
||||
|
||||
fn layer_dbg_info(&self) -> String {
|
||||
match self {
|
||||
Self::Image(x) => x.layer_dbg_info(),
|
||||
Self::Delta(x) => x.layer_dbg_info(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
enum LayerIterRef<'a> {
|
||||
@@ -48,13 +40,6 @@ impl LayerIterRef<'_> {
|
||||
Self::Image(x) => x.next().await,
|
||||
}
|
||||
}
|
||||
|
||||
fn layer_dbg_info(&self) -> String {
|
||||
match self {
|
||||
Self::Image(x) => x.layer_dbg_info(),
|
||||
Self::Delta(x) => x.layer_dbg_info(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This type plays several roles at once
|
||||
@@ -90,11 +75,6 @@ impl<'a> PeekableLayerIterRef<'a> {
|
||||
async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
|
||||
let result = self.peeked.take();
|
||||
self.peeked = self.iter.next().await?;
|
||||
if let (Some((k1, l1, _)), Some((k2, l2, _))) = (&self.peeked, &result) {
|
||||
if (k1, l1) < (k2, l2) {
|
||||
bail!("iterator is not ordered: {}", self.iter.layer_dbg_info());
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
@@ -198,12 +178,7 @@ impl<'a> IteratorWrapper<'a> {
|
||||
let iter = PeekableLayerIterRef::create(iter).await?;
|
||||
if let Some((k1, l1, _)) = iter.peek() {
|
||||
let (k2, l2) = first_key_lower_bound;
|
||||
if (k1, l1) < (k2, l2) {
|
||||
bail!(
|
||||
"layer key range did not include the first key in the layer: {}",
|
||||
layer.layer_dbg_info()
|
||||
);
|
||||
}
|
||||
debug_assert!((k1, l1) >= (k2, l2));
|
||||
}
|
||||
*self = Self::Loaded { iter };
|
||||
Ok(())
|
||||
|
||||
@@ -61,12 +61,21 @@ impl BackgroundLoopKind {
|
||||
}
|
||||
}
|
||||
|
||||
static PERMIT_GAUGES: once_cell::sync::Lazy<
|
||||
enum_map::EnumMap<BackgroundLoopKind, metrics::IntCounterPair>,
|
||||
> = once_cell::sync::Lazy::new(|| {
|
||||
enum_map::EnumMap::from_array(std::array::from_fn(|i| {
|
||||
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
|
||||
crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE.with_label_values(&[kind.into()])
|
||||
}))
|
||||
});
|
||||
|
||||
/// Cancellation safe.
|
||||
pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
|
||||
loop_kind: BackgroundLoopKind,
|
||||
_ctx: &RequestContext,
|
||||
) -> tokio::sync::SemaphorePermit<'static> {
|
||||
let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(loop_kind);
|
||||
let _guard = PERMIT_GAUGES[loop_kind].guard();
|
||||
|
||||
pausable_failpoint!(
|
||||
"initial-size-calculation-permit-pause",
|
||||
@@ -89,7 +98,7 @@ pub fn start_background_loops(
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::Compaction,
|
||||
tenant_shard_id,
|
||||
Some(tenant_shard_id),
|
||||
None,
|
||||
&format!("compactor for tenant {tenant_shard_id}"),
|
||||
{
|
||||
@@ -112,7 +121,7 @@ pub fn start_background_loops(
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::GarbageCollector,
|
||||
tenant_shard_id,
|
||||
Some(tenant_shard_id),
|
||||
None,
|
||||
&format!("garbage collector for tenant {tenant_shard_id}"),
|
||||
{
|
||||
@@ -135,7 +144,7 @@ pub fn start_background_loops(
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::IngestHousekeeping,
|
||||
tenant_shard_id,
|
||||
Some(tenant_shard_id),
|
||||
None,
|
||||
&format!("ingest housekeeping for tenant {tenant_shard_id}"),
|
||||
{
|
||||
|
||||
@@ -22,8 +22,8 @@ use handle::ShardTimelineId;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::{
|
||||
key::{
|
||||
CompactKey, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
|
||||
NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
|
||||
KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
|
||||
NON_INHERITED_SPARSE_RANGE,
|
||||
},
|
||||
keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
|
||||
models::{
|
||||
@@ -44,8 +44,10 @@ use tokio::{
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
fs_ext, pausable_failpoint,
|
||||
sync::gate::{Gate, GateGuard},
|
||||
vec_map::VecMap,
|
||||
};
|
||||
|
||||
use std::pin::pin;
|
||||
@@ -135,10 +137,7 @@ use self::layer_manager::LayerManager;
|
||||
use self::logical_size::LogicalSize;
|
||||
use self::walreceiver::{WalReceiver, WalReceiverConf};
|
||||
|
||||
use super::{
|
||||
config::TenantConf, storage_layer::inmemory_layer, storage_layer::LayerVisibilityHint,
|
||||
upload_queue::NotInitialized,
|
||||
};
|
||||
use super::{config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized};
|
||||
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
|
||||
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
|
||||
use super::{
|
||||
@@ -2234,11 +2233,6 @@ impl Timeline {
|
||||
|
||||
handles: Default::default(),
|
||||
};
|
||||
|
||||
if aux_file_policy == Some(AuxFilePolicy::V1) {
|
||||
warn!("this timeline is using deprecated aux file policy V1");
|
||||
}
|
||||
|
||||
result.repartition_threshold =
|
||||
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
||||
|
||||
@@ -2287,7 +2281,7 @@ impl Timeline {
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
task_mgr::TaskKind::LayerFlushTask,
|
||||
self.tenant_shard_id,
|
||||
Some(self.tenant_shard_id),
|
||||
Some(self.timeline_id),
|
||||
"layer flush task",
|
||||
async move {
|
||||
@@ -2641,7 +2635,7 @@ impl Timeline {
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
task_mgr::TaskKind::InitialLogicalSizeCalculation,
|
||||
self.tenant_shard_id,
|
||||
Some(self.tenant_shard_id),
|
||||
Some(self.timeline_id),
|
||||
"initial size calculation",
|
||||
// NB: don't log errors here, task_mgr will do that.
|
||||
@@ -2809,7 +2803,7 @@ impl Timeline {
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
task_mgr::TaskKind::OndemandLogicalSizeCalculation,
|
||||
self.tenant_shard_id,
|
||||
Some(self.tenant_shard_id),
|
||||
Some(self.timeline_id),
|
||||
"ondemand logical size calculation",
|
||||
async move {
|
||||
@@ -3002,10 +2996,7 @@ impl Timeline {
|
||||
// - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner
|
||||
// the layer is likely to be covered by an image layer during compaction.
|
||||
layers.sort_by_key(|(desc, _meta, _atime)| {
|
||||
std::cmp::Reverse((
|
||||
!LayerMap::is_l0(&desc.key_range, desc.is_delta),
|
||||
desc.lsn_range.end,
|
||||
))
|
||||
std::cmp::Reverse((!LayerMap::is_l0(&desc.key_range), desc.lsn_range.end))
|
||||
});
|
||||
|
||||
let layers = layers
|
||||
@@ -3598,6 +3589,34 @@ impl Timeline {
|
||||
return Err(FlushLayerError::Cancelled);
|
||||
}
|
||||
|
||||
// FIXME(auxfilesv2): support multiple metadata key partitions might need initdb support as well?
|
||||
// This code path will not be hit during regression tests. After #7099 we have a single partition
|
||||
// with two key ranges. If someone wants to fix initdb optimization in the future, this might need
|
||||
// to be fixed.
|
||||
|
||||
// For metadata, always create delta layers.
|
||||
let delta_layer = if !metadata_partition.parts.is_empty() {
|
||||
assert_eq!(
|
||||
metadata_partition.parts.len(),
|
||||
1,
|
||||
"currently sparse keyspace should only contain a single metadata keyspace"
|
||||
);
|
||||
let metadata_keyspace = &metadata_partition.parts[0];
|
||||
self.create_delta_layer(
|
||||
&frozen_layer,
|
||||
Some(
|
||||
metadata_keyspace.0.ranges.first().unwrap().start
|
||||
..metadata_keyspace.0.ranges.last().unwrap().end,
|
||||
),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| FlushLayerError::from_anyhow(self, e))?
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// For image layers, we add them immediately into the layer map.
|
||||
let mut layers_to_upload = Vec::new();
|
||||
layers_to_upload.extend(
|
||||
self.create_image_layers(
|
||||
@@ -3608,27 +3627,13 @@ impl Timeline {
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
if !metadata_partition.parts.is_empty() {
|
||||
assert_eq!(
|
||||
metadata_partition.parts.len(),
|
||||
1,
|
||||
"currently sparse keyspace should only contain a single metadata keyspace"
|
||||
);
|
||||
layers_to_upload.extend(
|
||||
self.create_image_layers(
|
||||
// Safety: create_image_layers treat sparse keyspaces differently that it does not scan
|
||||
// every single key within the keyspace, and therefore, it's safe to force converting it
|
||||
// into a dense keyspace before calling this function.
|
||||
&metadata_partition.into_dense(),
|
||||
self.initdb_lsn,
|
||||
ImageLayerCreationMode::Initial,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
}
|
||||
|
||||
(layers_to_upload, None)
|
||||
if let Some(delta_layer) = delta_layer {
|
||||
layers_to_upload.push(delta_layer.clone());
|
||||
(layers_to_upload, Some(delta_layer))
|
||||
} else {
|
||||
(layers_to_upload, None)
|
||||
}
|
||||
} else {
|
||||
// Normal case, write out a L0 delta layer file.
|
||||
// `create_delta_layer` will not modify the layer map.
|
||||
@@ -4038,6 +4043,8 @@ impl Timeline {
|
||||
mode: ImageLayerCreationMode,
|
||||
start: Key,
|
||||
) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
|
||||
assert!(!matches!(mode, ImageLayerCreationMode::Initial));
|
||||
|
||||
// Metadata keys image layer creation.
|
||||
let mut reconstruct_state = ValuesReconstructState::default();
|
||||
let data = self
|
||||
@@ -4203,13 +4210,15 @@ impl Timeline {
|
||||
"metadata keys must be partitioned separately"
|
||||
);
|
||||
}
|
||||
if mode == ImageLayerCreationMode::Initial {
|
||||
return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
|
||||
}
|
||||
if mode == ImageLayerCreationMode::Try && !check_for_image_layers {
|
||||
// Skip compaction if there are not enough updates. Metadata compaction will do a scan and
|
||||
// might mess up with evictions.
|
||||
start = img_range.end;
|
||||
continue;
|
||||
}
|
||||
// For initial and force modes, we always generate image layers for metadata keys.
|
||||
} else if let ImageLayerCreationMode::Try = mode {
|
||||
// check_for_image_layers = false -> skip
|
||||
// check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
|
||||
@@ -4217,8 +4226,7 @@ impl Timeline {
|
||||
start = img_range.end;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if let ImageLayerCreationMode::Force = mode {
|
||||
} else if let ImageLayerCreationMode::Force = mode {
|
||||
// When forced to create image layers, we might try and create them where they already
|
||||
// exist. This mode is only used in tests/debug.
|
||||
let layers = self.layers.read().await;
|
||||
@@ -4232,7 +4240,6 @@ impl Timeline {
|
||||
img_range.start,
|
||||
img_range.end
|
||||
);
|
||||
start = img_range.end;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@@ -4588,7 +4595,7 @@ impl Timeline {
|
||||
// for compact_level0_phase1 creating an L0, which does not happen in practice
|
||||
// because we have not implemented L0 => L0 compaction.
|
||||
duplicated_layers.insert(l.layer_desc().key());
|
||||
} else if LayerMap::is_l0(&l.layer_desc().key_range, l.layer_desc().is_delta) {
|
||||
} else if LayerMap::is_l0(&l.layer_desc().key_range) {
|
||||
return Err(CompactionError::Other(anyhow::anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction.")));
|
||||
} else {
|
||||
insert_layers.push(l.clone());
|
||||
@@ -5155,7 +5162,7 @@ impl Timeline {
|
||||
let task_id = task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
task_mgr::TaskKind::DownloadAllRemoteLayers,
|
||||
self.tenant_shard_id,
|
||||
Some(self.tenant_shard_id),
|
||||
Some(self.timeline_id),
|
||||
"download all remote layers task",
|
||||
async move {
|
||||
@@ -5583,6 +5590,44 @@ enum OpenLayerAction {
|
||||
}
|
||||
|
||||
impl<'a> TimelineWriter<'a> {
|
||||
/// Put a new page version that can be constructed from a WAL record
|
||||
///
|
||||
/// This will implicitly extend the relation, if the page is beyond the
|
||||
/// current end-of-file.
|
||||
pub(crate) async fn put(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
value: &Value,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Avoid doing allocations for "small" values.
|
||||
// In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
|
||||
// https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
|
||||
let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
|
||||
value.ser_into(&mut buf)?;
|
||||
let buf_size: u64 = buf.len().try_into().expect("oversized value buf");
|
||||
|
||||
let action = self.get_open_layer_action(lsn, buf_size);
|
||||
let layer = self.handle_open_layer_action(lsn, action, ctx).await?;
|
||||
let res = layer.put_value(key.to_compact(), lsn, &buf, ctx).await;
|
||||
|
||||
if res.is_ok() {
|
||||
// Update the current size only when the entire write was ok.
|
||||
// In case of failures, we may have had partial writes which
|
||||
// render the size tracking out of sync. That's ok because
|
||||
// the checkpoint distance should be significantly smaller
|
||||
// than the S3 single shot upload limit of 5GiB.
|
||||
let state = self.write_guard.as_mut().unwrap();
|
||||
|
||||
state.current_size += buf_size;
|
||||
state.prev_lsn = Some(lsn);
|
||||
state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn));
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
async fn handle_open_layer_action(
|
||||
&mut self,
|
||||
at: Lsn,
|
||||
@@ -5688,58 +5733,18 @@ impl<'a> TimelineWriter<'a> {
|
||||
}
|
||||
|
||||
/// Put a batch of keys at the specified Lsns.
|
||||
///
|
||||
/// The batch is sorted by Lsn (enforced by usage of [`utils::vec_map::VecMap`].
|
||||
pub(crate) async fn put_batch(
|
||||
&mut self,
|
||||
batch: Vec<(CompactKey, Lsn, usize, Value)>,
|
||||
batch: VecMap<Lsn, (Key, Value)>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
if batch.is_empty() {
|
||||
return Ok(());
|
||||
for (lsn, (key, val)) in batch {
|
||||
self.put(key, lsn, &val, ctx).await?
|
||||
}
|
||||
|
||||
let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch);
|
||||
let batch_max_lsn = serialized_batch.max_lsn;
|
||||
let buf_size: u64 = serialized_batch.raw.len() as u64;
|
||||
|
||||
let action = self.get_open_layer_action(batch_max_lsn, buf_size);
|
||||
let layer = self
|
||||
.handle_open_layer_action(batch_max_lsn, action, ctx)
|
||||
.await?;
|
||||
|
||||
let res = layer.put_batch(serialized_batch, ctx).await;
|
||||
|
||||
if res.is_ok() {
|
||||
// Update the current size only when the entire write was ok.
|
||||
// In case of failures, we may have had partial writes which
|
||||
// render the size tracking out of sync. That's ok because
|
||||
// the checkpoint distance should be significantly smaller
|
||||
// than the S3 single shot upload limit of 5GiB.
|
||||
let state = self.write_guard.as_mut().unwrap();
|
||||
|
||||
state.current_size += buf_size;
|
||||
state.prev_lsn = Some(batch_max_lsn);
|
||||
state.max_lsn = std::cmp::max(state.max_lsn, Some(batch_max_lsn));
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
/// Test helper, for tests that would like to poke individual values without composing a batch
|
||||
pub(crate) async fn put(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
value: &Value,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
use utils::bin_ser::BeSer;
|
||||
let val_ser_size = value.serialized_size().unwrap() as usize;
|
||||
self.put_batch(
|
||||
vec![(key.to_compact(), lsn, val_ser_size, value.clone())],
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn delete_batch(
|
||||
@@ -5880,7 +5885,7 @@ mod tests {
|
||||
};
|
||||
|
||||
// Apart from L0s, newest Layers should come first
|
||||
if !LayerMap::is_l0(layer.name.key_range(), layer.name.is_delta()) {
|
||||
if !LayerMap::is_l0(layer.name.key_range()) {
|
||||
assert!(layer_lsn <= last_lsn);
|
||||
last_lsn = layer_lsn;
|
||||
}
|
||||
|
||||
@@ -395,7 +395,7 @@ impl DeleteTimelineFlow {
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::TimelineDeletionWorker,
|
||||
tenant_shard_id,
|
||||
Some(tenant_shard_id),
|
||||
Some(timeline_id),
|
||||
"timeline_delete",
|
||||
async move {
|
||||
|
||||
@@ -60,7 +60,7 @@ impl Timeline {
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::Eviction,
|
||||
self.tenant_shard_id,
|
||||
Some(self.tenant_shard_id),
|
||||
Some(self.timeline_id),
|
||||
&format!(
|
||||
"layer eviction for {}/{}",
|
||||
|
||||
@@ -27,8 +27,8 @@ use super::TaskStateUpdate;
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
|
||||
pgdatadir_mapping::DatadirModification,
|
||||
task_mgr::{TaskKind, WALRECEIVER_RUNTIME},
|
||||
task_mgr::TaskKind,
|
||||
task_mgr::WALRECEIVER_RUNTIME,
|
||||
tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
|
||||
walingest::WalIngest,
|
||||
walrecord::DecodedWALRecord,
|
||||
@@ -345,10 +345,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
// Commit every ingest_batch_size records. Even if we filtered out
|
||||
// all records, we still need to call commit to advance the LSN.
|
||||
uncommitted_records += 1;
|
||||
if uncommitted_records >= ingest_batch_size
|
||||
|| modification.approx_pending_bytes()
|
||||
> DatadirModification::MAX_PENDING_BYTES
|
||||
{
|
||||
if uncommitted_records >= ingest_batch_size {
|
||||
WAL_INGEST
|
||||
.records_committed
|
||||
.inc_by(uncommitted_records - filtered_records);
|
||||
|
||||
@@ -9,7 +9,7 @@ use utils::serde_percent::Percent;
|
||||
|
||||
use pageserver_api::models::PageserverUtilization;
|
||||
|
||||
use crate::{config::PageServerConf, metrics::NODE_UTILIZATION_SCORE, tenant::mgr::TenantManager};
|
||||
use crate::{config::PageServerConf, tenant::mgr::TenantManager};
|
||||
|
||||
pub(crate) fn regenerate(
|
||||
conf: &PageServerConf,
|
||||
@@ -58,13 +58,13 @@ pub(crate) fn regenerate(
|
||||
disk_usable_pct,
|
||||
shard_count,
|
||||
max_shard_count: MAX_SHARDS,
|
||||
utilization_score: None,
|
||||
utilization_score: 0,
|
||||
captured_at: utils::serde_system_time::SystemTime(captured_at),
|
||||
};
|
||||
|
||||
// Initialize `PageserverUtilization::utilization_score`
|
||||
let score = doc.cached_score();
|
||||
NODE_UTILIZATION_SCORE.set(score);
|
||||
doc.refresh_score();
|
||||
|
||||
// TODO: make utilization_score into a metric
|
||||
|
||||
Ok(doc)
|
||||
}
|
||||
|
||||
@@ -756,23 +756,11 @@ impl VirtualFile {
|
||||
})
|
||||
}
|
||||
|
||||
/// The function aborts the process if the error is fatal.
|
||||
async fn write_at<B: IoBuf + Send>(
|
||||
&self,
|
||||
buf: FullSlice<B>,
|
||||
offset: u64,
|
||||
_ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
|
||||
) -> (FullSlice<B>, Result<usize, Error>) {
|
||||
let (slice, result) = self.write_at_inner(buf, offset, _ctx).await;
|
||||
let result = result.maybe_fatal_err("write_at");
|
||||
(slice, result)
|
||||
}
|
||||
|
||||
async fn write_at_inner<B: IoBuf + Send>(
|
||||
&self,
|
||||
buf: FullSlice<B>,
|
||||
offset: u64,
|
||||
_ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
|
||||
) -> (FullSlice<B>, Result<usize, Error>) {
|
||||
let file_guard = match self.lock_file().await {
|
||||
Ok(file_guard) => file_guard,
|
||||
|
||||
@@ -110,8 +110,7 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
|
||||
|
||||
tag.rinfo = rinfo;
|
||||
tag.forknum = forknum;
|
||||
/* We need exclusive lock here because of LRU list manipulation */
|
||||
LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
|
||||
LWLockAcquire(relsize_lock, LW_SHARED);
|
||||
entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL);
|
||||
if (entry != NULL)
|
||||
{
|
||||
|
||||
@@ -1038,9 +1038,12 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp))
|
||||
{
|
||||
/*
|
||||
* However, allow to proceed if previously elected leader was me;
|
||||
* plain restart of walproposer not intervened by concurrent
|
||||
* compute (who could generate WAL) is ok.
|
||||
* However, allow to proceed if last_log_term on the node which gave
|
||||
* the highest vote (i.e. point where we are going to start writing)
|
||||
* actually had been won by me; plain restart of walproposer not
|
||||
* intervened by concurrent compute which wrote WAL is ok.
|
||||
*
|
||||
* This avoids compute crash after manual term_bump.
|
||||
*/
|
||||
if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term ==
|
||||
pg_atomic_read_u64(&walprop_shared->mineLastElectedTerm))))
|
||||
@@ -1442,12 +1445,17 @@ RecvAppendResponses(Safekeeper *sk)
|
||||
if (sk->appendResponse.term > wp->propTerm)
|
||||
{
|
||||
/*
|
||||
* Another compute with higher term is running. Panic to restart
|
||||
* PG as we likely need to retake basebackup. However, don't dump
|
||||
* core as this is kinda expected scenario.
|
||||
*
|
||||
* Term has changed to higher one, probably another compute is
|
||||
* running. If this is the case we could PANIC as well because
|
||||
* likely it inserted some data and our basebackup is unsuitable
|
||||
* anymore. However, we also bump term manually (term_bump endpoint)
|
||||
* on safekeepers for migration purposes, in this case we do want
|
||||
* compute to stay alive. So restart walproposer with FATAL instead
|
||||
* of panicking; if basebackup is spoiled next election will notice
|
||||
* this.
|
||||
*/
|
||||
disable_core_dump();
|
||||
wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us",
|
||||
wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us",
|
||||
sk->host, sk->port,
|
||||
sk->appendResponse.term, wp->propTerm);
|
||||
}
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
|
||||
import argparse
|
||||
import enum
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from typing import List
|
||||
@@ -94,7 +93,7 @@ if __name__ == "__main__":
|
||||
"--no-color",
|
||||
action="store_true",
|
||||
help="disable colored output",
|
||||
default=not sys.stdout.isatty() or os.getenv("TERM") == "dumb",
|
||||
default=not sys.stdout.isatty(),
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
@@ -1,21 +1,15 @@
|
||||
use std::{
|
||||
future::Future,
|
||||
sync::Arc,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
use std::{future::Future, sync::Arc, time::Duration};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use arc_swap::ArcSwapOption;
|
||||
use dashmap::DashMap;
|
||||
use jose_jwk::crypto::KeyInfo;
|
||||
use serde::{Deserialize, Deserializer};
|
||||
use signature::Verifier;
|
||||
use tokio::time::Instant;
|
||||
|
||||
use crate::{context::RequestMonitoring, http::parse_json_body_with_limit, EndpointId, RoleName};
|
||||
use crate::{http::parse_json_body_with_limit, intern::EndpointIdInt};
|
||||
|
||||
// TODO(conrad): make these configurable.
|
||||
const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30);
|
||||
const MIN_RENEW: Duration = Duration::from_secs(30);
|
||||
const AUTO_RENEW: Duration = Duration::from_secs(300);
|
||||
const MAX_RENEW: Duration = Duration::from_secs(3600);
|
||||
@@ -23,56 +17,30 @@ const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
|
||||
|
||||
/// How to get the JWT auth rules
|
||||
pub trait FetchAuthRules: Clone + Send + Sync + 'static {
|
||||
fn fetch_auth_rules(
|
||||
&self,
|
||||
role_name: RoleName,
|
||||
) -> impl Future<Output = anyhow::Result<Vec<AuthRule>>> + Send;
|
||||
fn fetch_auth_rules(&self) -> impl Future<Output = anyhow::Result<AuthRules>> + Send;
|
||||
}
|
||||
|
||||
pub struct AuthRule {
|
||||
pub id: String,
|
||||
pub jwks_url: url::Url,
|
||||
pub audience: Option<String>,
|
||||
#[derive(Clone)]
|
||||
struct FetchAuthRulesFromCplane {
|
||||
#[allow(dead_code)]
|
||||
endpoint: EndpointIdInt,
|
||||
}
|
||||
|
||||
impl FetchAuthRules for FetchAuthRulesFromCplane {
|
||||
async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
|
||||
Err(anyhow::anyhow!("not yet implemented"))
|
||||
}
|
||||
}
|
||||
|
||||
pub struct AuthRules {
|
||||
jwks_urls: Vec<url::Url>,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct JwkCache {
|
||||
client: reqwest::Client,
|
||||
|
||||
map: DashMap<(EndpointId, RoleName), Arc<JwkCacheEntryLock>>,
|
||||
}
|
||||
|
||||
pub struct JwkCacheEntry {
|
||||
/// Should refetch at least every hour to verify when old keys have been removed.
|
||||
/// Should refetch when new key IDs are seen only every 5 minutes or so
|
||||
last_retrieved: Instant,
|
||||
|
||||
/// cplane will return multiple JWKs urls that we need to scrape.
|
||||
key_sets: ahash::HashMap<String, KeySet>,
|
||||
}
|
||||
|
||||
impl JwkCacheEntry {
|
||||
fn find_jwk_and_audience(&self, key_id: &str) -> Option<(&jose_jwk::Jwk, Option<&str>)> {
|
||||
self.key_sets.values().find_map(|key_set| {
|
||||
key_set
|
||||
.find_key(key_id)
|
||||
.map(|jwk| (jwk, key_set.audience.as_deref()))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
struct KeySet {
|
||||
jwks: jose_jwk::JwkSet,
|
||||
audience: Option<String>,
|
||||
}
|
||||
|
||||
impl KeySet {
|
||||
fn find_key(&self, key_id: &str) -> Option<&jose_jwk::Jwk> {
|
||||
self.jwks
|
||||
.keys
|
||||
.iter()
|
||||
.find(|jwk| jwk.prm.kid.as_deref() == Some(key_id))
|
||||
}
|
||||
map: DashMap<EndpointIdInt, Arc<JwkCacheEntryLock>>,
|
||||
}
|
||||
|
||||
pub struct JwkCacheEntryLock {
|
||||
@@ -89,6 +57,15 @@ impl Default for JwkCacheEntryLock {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct JwkCacheEntry {
|
||||
/// Should refetch at least every hour to verify when old keys have been removed.
|
||||
/// Should refetch when new key IDs are seen only every 5 minutes or so
|
||||
last_retrieved: Instant,
|
||||
|
||||
/// cplane will return multiple JWKs urls that we need to scrape.
|
||||
key_sets: ahash::HashMap<url::Url, jose_jwk::JwkSet>,
|
||||
}
|
||||
|
||||
impl JwkCacheEntryLock {
|
||||
async fn acquire_permit<'a>(self: &'a Arc<Self>) -> JwkRenewalPermit<'a> {
|
||||
JwkRenewalPermit::acquire_permit(self).await
|
||||
@@ -102,7 +79,6 @@ impl JwkCacheEntryLock {
|
||||
&self,
|
||||
_permit: JwkRenewalPermit<'_>,
|
||||
client: &reqwest::Client,
|
||||
role_name: RoleName,
|
||||
auth_rules: &F,
|
||||
) -> anyhow::Result<Arc<JwkCacheEntry>> {
|
||||
// double check that no one beat us to updating the cache.
|
||||
@@ -115,19 +91,20 @@ impl JwkCacheEntryLock {
|
||||
}
|
||||
}
|
||||
|
||||
let rules = auth_rules.fetch_auth_rules(role_name).await?;
|
||||
let mut key_sets =
|
||||
ahash::HashMap::with_capacity_and_hasher(rules.len(), ahash::RandomState::new());
|
||||
let rules = auth_rules.fetch_auth_rules().await?;
|
||||
let mut key_sets = ahash::HashMap::with_capacity_and_hasher(
|
||||
rules.jwks_urls.len(),
|
||||
ahash::RandomState::new(),
|
||||
);
|
||||
// TODO(conrad): run concurrently
|
||||
// TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284)
|
||||
for rule in rules {
|
||||
let req = client.get(rule.jwks_url.clone());
|
||||
for url in rules.jwks_urls {
|
||||
let req = client.get(url.clone());
|
||||
// TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`.
|
||||
// TODO(conrad): We need to filter out URLs that point to local resources. Public internet only.
|
||||
match req.send().await.and_then(|r| r.error_for_status()) {
|
||||
// todo: should we re-insert JWKs if we want to keep this JWKs URL?
|
||||
// I expect these failures would be quite sparse.
|
||||
Err(e) => tracing::warn!(url=?rule.jwks_url, error=?e, "could not fetch JWKs"),
|
||||
Err(e) => tracing::warn!(?url, error=?e, "could not fetch JWKs"),
|
||||
Ok(r) => {
|
||||
let resp: http::Response<reqwest::Body> = r.into();
|
||||
match parse_json_body_with_limit::<jose_jwk::JwkSet>(
|
||||
@@ -136,17 +113,9 @@ impl JwkCacheEntryLock {
|
||||
)
|
||||
.await
|
||||
{
|
||||
Err(e) => {
|
||||
tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs");
|
||||
}
|
||||
Err(e) => tracing::warn!(?url, error=?e, "could not decode JWKs"),
|
||||
Ok(jwks) => {
|
||||
key_sets.insert(
|
||||
rule.id,
|
||||
KeySet {
|
||||
jwks,
|
||||
audience: rule.audience,
|
||||
},
|
||||
);
|
||||
key_sets.insert(url, jwks);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -164,9 +133,7 @@ impl JwkCacheEntryLock {
|
||||
|
||||
async fn get_or_update_jwk_cache<F: FetchAuthRules>(
|
||||
self: &Arc<Self>,
|
||||
ctx: &RequestMonitoring,
|
||||
client: &reqwest::Client,
|
||||
role_name: RoleName,
|
||||
fetch: &F,
|
||||
) -> Result<Arc<JwkCacheEntry>, anyhow::Error> {
|
||||
let now = Instant::now();
|
||||
@@ -174,20 +141,18 @@ impl JwkCacheEntryLock {
|
||||
|
||||
// if we have no cached JWKs, try and get some
|
||||
let Some(cached) = guard else {
|
||||
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
|
||||
let permit = self.acquire_permit().await;
|
||||
return self.renew_jwks(permit, client, role_name, fetch).await;
|
||||
return self.renew_jwks(permit, client, fetch).await;
|
||||
};
|
||||
|
||||
let last_update = now.duration_since(cached.last_retrieved);
|
||||
|
||||
// check if the cached JWKs need updating.
|
||||
if last_update > MAX_RENEW {
|
||||
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
|
||||
let permit = self.acquire_permit().await;
|
||||
|
||||
// it's been too long since we checked the keys. wait for them to update.
|
||||
return self.renew_jwks(permit, client, role_name, fetch).await;
|
||||
return self.renew_jwks(permit, client, fetch).await;
|
||||
}
|
||||
|
||||
// every 5 minutes we should spawn a job to eagerly update the token.
|
||||
@@ -199,7 +164,7 @@ impl JwkCacheEntryLock {
|
||||
let client = client.clone();
|
||||
let fetch = fetch.clone();
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = entry.renew_jwks(permit, &client, role_name, &fetch).await {
|
||||
if let Err(e) = entry.renew_jwks(permit, &client, &fetch).await {
|
||||
tracing::warn!(error=?e, "could not fetch JWKs in background job");
|
||||
}
|
||||
});
|
||||
@@ -213,10 +178,8 @@ impl JwkCacheEntryLock {
|
||||
|
||||
async fn check_jwt<F: FetchAuthRules>(
|
||||
self: &Arc<Self>,
|
||||
ctx: &RequestMonitoring,
|
||||
jwt: &str,
|
||||
jwt: String,
|
||||
client: &reqwest::Client,
|
||||
role_name: RoleName,
|
||||
fetch: &F,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
// JWT compact form is defined to be
|
||||
@@ -226,36 +189,36 @@ impl JwkCacheEntryLock {
|
||||
let (header_payload, signature) = jwt
|
||||
.rsplit_once(".")
|
||||
.context("Provided authentication token is not a valid JWT encoding")?;
|
||||
let (header, payload) = header_payload
|
||||
let (header, _payload) = header_payload
|
||||
.split_once(".")
|
||||
.context("Provided authentication token is not a valid JWT encoding")?;
|
||||
|
||||
let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)
|
||||
.context("Provided authentication token is not a valid JWT encoding")?;
|
||||
let header = serde_json::from_slice::<JwtHeader<'_>>(&header)
|
||||
let header = serde_json::from_slice::<JWTHeader<'_>>(&header)
|
||||
.context("Provided authentication token is not a valid JWT encoding")?;
|
||||
|
||||
let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)
|
||||
.context("Provided authentication token is not a valid JWT encoding")?;
|
||||
|
||||
ensure!(header.typ == "JWT");
|
||||
let kid = header.key_id.context("missing key id")?;
|
||||
let kid = header.kid.context("missing key id")?;
|
||||
|
||||
let mut guard = self
|
||||
.get_or_update_jwk_cache(ctx, client, role_name.clone(), fetch)
|
||||
.await?;
|
||||
let mut guard = self.get_or_update_jwk_cache(client, fetch).await?;
|
||||
|
||||
// get the key from the JWKs if possible. If not, wait for the keys to update.
|
||||
let (jwk, expected_audience) = loop {
|
||||
match guard.find_jwk_and_audience(kid) {
|
||||
let jwk = loop {
|
||||
let jwk = guard
|
||||
.key_sets
|
||||
.values()
|
||||
.flat_map(|jwks| &jwks.keys)
|
||||
.find(|jwk| jwk.prm.kid.as_deref() == Some(kid));
|
||||
|
||||
match jwk {
|
||||
Some(jwk) => break jwk,
|
||||
None if guard.last_retrieved.elapsed() > MIN_RENEW => {
|
||||
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
|
||||
|
||||
let permit = self.acquire_permit().await;
|
||||
guard = self
|
||||
.renew_jwks(permit, client, role_name.clone(), fetch)
|
||||
.await?;
|
||||
guard = self.renew_jwks(permit, client, fetch).await?;
|
||||
}
|
||||
_ => {
|
||||
bail!("jwk not found");
|
||||
@@ -264,7 +227,7 @@ impl JwkCacheEntryLock {
|
||||
};
|
||||
|
||||
ensure!(
|
||||
jwk.is_supported(&header.algorithm),
|
||||
jwk.is_supported(&header.alg),
|
||||
"signature algorithm not supported"
|
||||
);
|
||||
|
||||
@@ -278,60 +241,31 @@ impl JwkCacheEntryLock {
|
||||
key => bail!("unsupported key type {key:?}"),
|
||||
};
|
||||
|
||||
let payload = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)
|
||||
.context("Provided authentication token is not a valid JWT encoding")?;
|
||||
let payload = serde_json::from_slice::<JwtPayload<'_>>(&payload)
|
||||
.context("Provided authentication token is not a valid JWT encoding")?;
|
||||
|
||||
tracing::debug!(?payload, "JWT signature valid with claims");
|
||||
|
||||
match (expected_audience, payload.audience) {
|
||||
// check the audience matches
|
||||
(Some(aud1), Some(aud2)) => ensure!(aud1 == aud2, "invalid JWT token audience"),
|
||||
// the audience is expected but is missing
|
||||
(Some(_), None) => bail!("invalid JWT token audience"),
|
||||
// we don't care for the audience field
|
||||
(None, _) => {}
|
||||
}
|
||||
|
||||
let now = SystemTime::now();
|
||||
|
||||
if let Some(exp) = payload.expiration {
|
||||
ensure!(now < exp + CLOCK_SKEW_LEEWAY);
|
||||
}
|
||||
|
||||
if let Some(nbf) = payload.not_before {
|
||||
ensure!(nbf < now + CLOCK_SKEW_LEEWAY);
|
||||
}
|
||||
// TODO(conrad): verify iss, exp, nbf, etc...
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl JwkCache {
|
||||
pub async fn check_jwt<F: FetchAuthRules>(
|
||||
pub async fn check_jwt(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
endpoint: EndpointId,
|
||||
role_name: RoleName,
|
||||
fetch: &F,
|
||||
jwt: &str,
|
||||
endpoint: EndpointIdInt,
|
||||
jwt: String,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
// try with just a read lock first
|
||||
let key = (endpoint, role_name.clone());
|
||||
let entry = self.map.get(&key).as_deref().map(Arc::clone);
|
||||
let entry = self.map.get(&endpoint).as_deref().map(Arc::clone);
|
||||
let entry = match entry {
|
||||
Some(entry) => entry,
|
||||
None => {
|
||||
// acquire a write lock after to insert.
|
||||
let entry = self.map.entry(key).or_default();
|
||||
let entry = self.map.entry(endpoint).or_default();
|
||||
Arc::clone(&*entry)
|
||||
}
|
||||
};
|
||||
|
||||
entry
|
||||
.check_jwt(ctx, jwt, &self.client, role_name, fetch)
|
||||
.await
|
||||
let fetch = FetchAuthRulesFromCplane { endpoint };
|
||||
entry.check_jwt(jwt, &self.client, &fetch).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -381,49 +315,13 @@ fn verify_rsa_signature(
|
||||
|
||||
/// <https://datatracker.ietf.org/doc/html/rfc7515#section-4.1>
|
||||
#[derive(serde::Deserialize, serde::Serialize)]
|
||||
struct JwtHeader<'a> {
|
||||
struct JWTHeader<'a> {
|
||||
/// must be "JWT"
|
||||
#[serde(rename = "typ")]
|
||||
typ: &'a str,
|
||||
/// must be a supported alg
|
||||
#[serde(rename = "alg")]
|
||||
algorithm: jose_jwa::Algorithm,
|
||||
alg: jose_jwa::Algorithm,
|
||||
/// key id, must be provided for our usecase
|
||||
#[serde(rename = "kid")]
|
||||
key_id: Option<&'a str>,
|
||||
}
|
||||
|
||||
/// <https://datatracker.ietf.org/doc/html/rfc7519#section-4.1>
|
||||
#[derive(serde::Deserialize, serde::Serialize, Debug)]
|
||||
struct JwtPayload<'a> {
|
||||
/// Audience - Recipient for which the JWT is intended
|
||||
#[serde(rename = "aud")]
|
||||
audience: Option<&'a str>,
|
||||
/// Expiration - Time after which the JWT expires
|
||||
#[serde(deserialize_with = "numeric_date_opt", rename = "exp", default)]
|
||||
expiration: Option<SystemTime>,
|
||||
/// Not before - Time before which the JWT must not be accepted
|
||||
#[serde(deserialize_with = "numeric_date_opt", rename = "nbf", default)]
|
||||
not_before: Option<SystemTime>,
|
||||
|
||||
// the following entries are only extracted for the sake of debug logging.
|
||||
/// Issuer of the JWT
|
||||
#[serde(rename = "iss")]
|
||||
issuer: Option<&'a str>,
|
||||
/// Subject of the JWT (the user)
|
||||
#[serde(rename = "sub")]
|
||||
subject: Option<&'a str>,
|
||||
/// Unique token identifier
|
||||
#[serde(rename = "jti")]
|
||||
jwt_id: Option<&'a str>,
|
||||
/// Unique session identifier
|
||||
#[serde(rename = "sid")]
|
||||
session_id: Option<&'a str>,
|
||||
}
|
||||
|
||||
fn numeric_date_opt<'de, D: Deserializer<'de>>(d: D) -> Result<Option<SystemTime>, D::Error> {
|
||||
let d = <Option<u64>>::deserialize(d)?;
|
||||
Ok(d.map(|n| SystemTime::UNIX_EPOCH + Duration::from_secs(n)))
|
||||
kid: Option<&'a str>,
|
||||
}
|
||||
|
||||
struct JwkRenewalPermit<'a> {
|
||||
@@ -490,8 +388,6 @@ impl Drop for JwkRenewalPermit<'_> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::RoleName;
|
||||
|
||||
use super::*;
|
||||
|
||||
use std::{future::IntoFuture, net::SocketAddr, time::SystemTime};
|
||||
@@ -535,10 +431,10 @@ mod tests {
|
||||
}
|
||||
|
||||
fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String {
|
||||
let header = JwtHeader {
|
||||
let header = JWTHeader {
|
||||
typ: "JWT",
|
||||
algorithm: jose_jwa::Algorithm::Signing(sig),
|
||||
key_id: Some(&kid),
|
||||
alg: jose_jwa::Algorithm::Signing(sig),
|
||||
kid: Some(&kid),
|
||||
};
|
||||
let body = typed_json::json! {{
|
||||
"exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600,
|
||||
@@ -628,40 +524,33 @@ mod tests {
|
||||
struct Fetch(SocketAddr);
|
||||
|
||||
impl FetchAuthRules for Fetch {
|
||||
async fn fetch_auth_rules(
|
||||
&self,
|
||||
_role_name: RoleName,
|
||||
) -> anyhow::Result<Vec<AuthRule>> {
|
||||
Ok(vec![
|
||||
AuthRule {
|
||||
id: "foo".to_owned(),
|
||||
jwks_url: format!("http://{}/foo", self.0).parse().unwrap(),
|
||||
audience: None,
|
||||
},
|
||||
AuthRule {
|
||||
id: "bar".to_owned(),
|
||||
jwks_url: format!("http://{}/bar", self.0).parse().unwrap(),
|
||||
audience: None,
|
||||
},
|
||||
])
|
||||
async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
|
||||
Ok(AuthRules {
|
||||
jwks_urls: vec![
|
||||
format!("http://{}/foo", self.0).parse().unwrap(),
|
||||
format!("http://{}/bar", self.0).parse().unwrap(),
|
||||
],
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
let role_name = RoleName::from("user");
|
||||
|
||||
let jwk_cache = Arc::new(JwkCacheEntryLock::default());
|
||||
|
||||
for token in [jwt1, jwt2, jwt3, jwt4] {
|
||||
jwk_cache
|
||||
.check_jwt(
|
||||
&RequestMonitoring::test(),
|
||||
&token,
|
||||
&client,
|
||||
role_name.clone(),
|
||||
&Fetch(addr),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
jwk_cache
|
||||
.check_jwt(jwt1, &client, &Fetch(addr))
|
||||
.await
|
||||
.unwrap();
|
||||
jwk_cache
|
||||
.check_jwt(jwt2, &client, &Fetch(addr))
|
||||
.await
|
||||
.unwrap();
|
||||
jwk_cache
|
||||
.check_jwt(jwt3, &client, &Fetch(addr))
|
||||
.await
|
||||
.unwrap();
|
||||
jwk_cache
|
||||
.check_jwt(jwt4, &client, &Fetch(addr))
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -173,6 +173,9 @@ struct ProxyCliArgs {
|
||||
/// cache for `role_secret` (use `size=0` to disable)
|
||||
#[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
|
||||
role_secret_cache: String,
|
||||
/// disable ip check for http requests. If it is too time consuming, it could be turned off.
|
||||
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
disable_ip_check_for_http: bool,
|
||||
/// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections)
|
||||
#[clap(long)]
|
||||
redis_notifications: Option<String>,
|
||||
@@ -658,7 +661,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
)?;
|
||||
|
||||
let http_config = HttpConfig {
|
||||
accept_websockets: true,
|
||||
pool_options: GlobalConnPoolOptions {
|
||||
max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
|
||||
gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
|
||||
|
||||
@@ -52,7 +52,6 @@ pub struct TlsConfig {
|
||||
}
|
||||
|
||||
pub struct HttpConfig {
|
||||
pub accept_websockets: bool,
|
||||
pub pool_options: GlobalConnPoolOptions,
|
||||
pub cancel_set: CancelSet,
|
||||
pub client_conn_threshold: u64,
|
||||
|
||||
@@ -1,92 +1,4 @@
|
||||
// rustc lints/lint groups
|
||||
// https://doc.rust-lang.org/rustc/lints/groups.html
|
||||
#![deny(
|
||||
deprecated,
|
||||
future_incompatible,
|
||||
// TODO: consider let_underscore
|
||||
nonstandard_style,
|
||||
rust_2024_compatibility
|
||||
)]
|
||||
#![warn(clippy::all, clippy::pedantic, clippy::cargo)]
|
||||
// List of denied lints from the clippy::restriction group.
|
||||
// https://rust-lang.github.io/rust-clippy/master/index.html#?groups=restriction
|
||||
#![warn(
|
||||
clippy::undocumented_unsafe_blocks,
|
||||
clippy::dbg_macro,
|
||||
clippy::empty_enum_variants_with_brackets,
|
||||
clippy::exit,
|
||||
clippy::float_cmp_const,
|
||||
clippy::lossy_float_literal,
|
||||
clippy::macro_use_imports,
|
||||
clippy::manual_ok_or,
|
||||
// TODO: consider clippy::map_err_ignore
|
||||
// TODO: consider clippy::mem_forget
|
||||
clippy::rc_mutex,
|
||||
clippy::rest_pat_in_fully_bound_structs,
|
||||
clippy::string_add,
|
||||
clippy::string_to_string,
|
||||
clippy::todo,
|
||||
// TODO: consider clippy::unimplemented
|
||||
// TODO: consider clippy::unwrap_used
|
||||
)]
|
||||
// List of permanently allowed lints.
|
||||
#![allow(
|
||||
// It's ok to cast u8 to bool, etc.
|
||||
clippy::cast_lossless,
|
||||
)]
|
||||
// List of temporarily allowed lints.
|
||||
// TODO: Switch to expect() once stable with 1.81.
|
||||
// TODO: fix code and reduce list or move to permanent list above.
|
||||
#![allow(
|
||||
clippy::cargo_common_metadata,
|
||||
clippy::cast_possible_truncation,
|
||||
clippy::cast_possible_wrap,
|
||||
clippy::cast_precision_loss,
|
||||
clippy::cast_sign_loss,
|
||||
clippy::default_trait_access,
|
||||
clippy::doc_markdown,
|
||||
clippy::explicit_iter_loop,
|
||||
clippy::float_cmp,
|
||||
clippy::if_not_else,
|
||||
clippy::ignored_unit_patterns,
|
||||
clippy::implicit_hasher,
|
||||
clippy::inconsistent_struct_constructor,
|
||||
clippy::inline_always,
|
||||
clippy::items_after_statements,
|
||||
clippy::manual_assert,
|
||||
clippy::manual_let_else,
|
||||
clippy::manual_string_new,
|
||||
clippy::match_bool,
|
||||
clippy::match_same_arms,
|
||||
clippy::match_wild_err_arm,
|
||||
clippy::missing_errors_doc,
|
||||
clippy::missing_panics_doc,
|
||||
clippy::module_name_repetitions,
|
||||
clippy::multiple_crate_versions,
|
||||
clippy::must_use_candidate,
|
||||
clippy::needless_for_each,
|
||||
clippy::needless_pass_by_value,
|
||||
clippy::needless_raw_string_hashes,
|
||||
clippy::option_as_ref_cloned,
|
||||
clippy::redundant_closure_for_method_calls,
|
||||
clippy::redundant_else,
|
||||
clippy::return_self_not_must_use,
|
||||
clippy::similar_names,
|
||||
clippy::single_char_pattern,
|
||||
clippy::single_match_else,
|
||||
clippy::struct_excessive_bools,
|
||||
clippy::struct_field_names,
|
||||
clippy::too_many_lines,
|
||||
clippy::uninlined_format_args,
|
||||
clippy::unnested_or_patterns,
|
||||
clippy::unreadable_literal,
|
||||
clippy::unused_async,
|
||||
clippy::unused_self,
|
||||
clippy::used_underscore_binding,
|
||||
clippy::wildcard_imports
|
||||
)]
|
||||
// List of temporarily allowed lints to unblock beta/nightly.
|
||||
#![allow(unknown_lints, clippy::manual_inspect)]
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
use std::convert::Infallible;
|
||||
|
||||
|
||||
@@ -10,7 +10,6 @@ mod json;
|
||||
mod sql_over_http;
|
||||
mod websocket;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use atomic_take::AtomicTake;
|
||||
use bytes::Bytes;
|
||||
pub use conn_pool::GlobalConnPoolOptions;
|
||||
@@ -27,9 +26,8 @@ use rand::rngs::StdRng;
|
||||
use rand::SeedableRng;
|
||||
pub use reqwest_middleware::{ClientWithMiddleware, Error};
|
||||
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio::time::timeout;
|
||||
use tokio_rustls::TlsAcceptor;
|
||||
use tokio_rustls::{server::TlsStream, TlsAcceptor};
|
||||
use tokio_util::task::TaskTracker;
|
||||
|
||||
use crate::cancellation::CancellationHandlerMain;
|
||||
@@ -43,7 +41,7 @@ use crate::serverless::backend::PoolingBackend;
|
||||
use crate::serverless::http_util::{api_error_into_response, json_response};
|
||||
|
||||
use std::net::{IpAddr, SocketAddr};
|
||||
use std::pin::{pin, Pin};
|
||||
use std::pin::pin;
|
||||
use std::sync::Arc;
|
||||
use tokio::net::{TcpListener, TcpStream};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -88,18 +86,18 @@ pub async fn task_main(
|
||||
config,
|
||||
endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter),
|
||||
});
|
||||
let tls_acceptor: Arc<dyn MaybeTlsAcceptor> = match config.tls_config.as_ref() {
|
||||
Some(config) => {
|
||||
let mut tls_server_config = rustls::ServerConfig::clone(&config.to_server_config());
|
||||
// prefer http2, but support http/1.1
|
||||
tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()];
|
||||
Arc::new(tls_server_config) as Arc<_>
|
||||
}
|
||||
|
||||
let tls_config = match config.tls_config.as_ref() {
|
||||
Some(config) => config,
|
||||
None => {
|
||||
warn!("TLS config is missing");
|
||||
Arc::new(NoTls) as Arc<_>
|
||||
warn!("TLS config is missing, WebSocket Secure server will not be started");
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
let mut tls_server_config = rustls::ServerConfig::clone(&tls_config.to_server_config());
|
||||
// prefer http2, but support http/1.1
|
||||
tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()];
|
||||
let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into();
|
||||
|
||||
let connections = tokio_util::task::task_tracker::TaskTracker::new();
|
||||
connections.close(); // allows `connections.wait to complete`
|
||||
@@ -178,41 +176,16 @@ pub async fn task_main(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub trait AsyncReadWrite: AsyncRead + AsyncWrite + Send + 'static {}
|
||||
impl<T: AsyncRead + AsyncWrite + Send + 'static> AsyncReadWrite for T {}
|
||||
pub type AsyncRW = Pin<Box<dyn AsyncReadWrite>>;
|
||||
|
||||
#[async_trait]
|
||||
trait MaybeTlsAcceptor: Send + Sync + 'static {
|
||||
async fn accept(self: Arc<Self>, conn: ChainRW<TcpStream>) -> std::io::Result<AsyncRW>;
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl MaybeTlsAcceptor for rustls::ServerConfig {
|
||||
async fn accept(self: Arc<Self>, conn: ChainRW<TcpStream>) -> std::io::Result<AsyncRW> {
|
||||
Ok(Box::pin(TlsAcceptor::from(self).accept(conn).await?))
|
||||
}
|
||||
}
|
||||
|
||||
struct NoTls;
|
||||
|
||||
#[async_trait]
|
||||
impl MaybeTlsAcceptor for NoTls {
|
||||
async fn accept(self: Arc<Self>, conn: ChainRW<TcpStream>) -> std::io::Result<AsyncRW> {
|
||||
Ok(Box::pin(conn))
|
||||
}
|
||||
}
|
||||
|
||||
/// Handles the TCP startup lifecycle.
|
||||
/// 1. Parses PROXY protocol V2
|
||||
/// 2. Handles TLS handshake
|
||||
async fn connection_startup(
|
||||
config: &ProxyConfig,
|
||||
tls_acceptor: Arc<dyn MaybeTlsAcceptor>,
|
||||
tls_acceptor: TlsAcceptor,
|
||||
session_id: uuid::Uuid,
|
||||
conn: TcpStream,
|
||||
peer_addr: SocketAddr,
|
||||
) -> Option<(AsyncRW, IpAddr)> {
|
||||
) -> Option<(TlsStream<ChainRW<TcpStream>>, IpAddr)> {
|
||||
// handle PROXY protocol
|
||||
let (conn, peer) = match read_proxy_protocol(conn).await {
|
||||
Ok(c) => c,
|
||||
@@ -268,7 +241,7 @@ async fn connection_handler(
|
||||
cancellation_handler: Arc<CancellationHandlerMain>,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
cancellation_token: CancellationToken,
|
||||
conn: AsyncRW,
|
||||
conn: TlsStream<ChainRW<TcpStream>>,
|
||||
peer_addr: IpAddr,
|
||||
session_id: uuid::Uuid,
|
||||
) {
|
||||
@@ -353,9 +326,7 @@ async fn request_handler(
|
||||
.map(|s| s.to_string());
|
||||
|
||||
// Check if the request is a websocket upgrade request.
|
||||
if config.http_config.accept_websockets
|
||||
&& framed_websockets::upgrade::is_upgrade_request(&request)
|
||||
{
|
||||
if framed_websockets::upgrade::is_upgrade_request(&request) {
|
||||
let ctx = RequestMonitoring::new(
|
||||
session_id,
|
||||
peer_addr,
|
||||
|
||||
@@ -758,7 +758,6 @@ mod tests {
|
||||
async fn test_pool() {
|
||||
let _ = env_logger::try_init();
|
||||
let config = Box::leak(Box::new(crate::config::HttpConfig {
|
||||
accept_websockets: false,
|
||||
pool_options: GlobalConnPoolOptions {
|
||||
max_conns_per_endpoint: 2,
|
||||
gc_epoch: Duration::from_secs(1),
|
||||
|
||||
@@ -147,7 +147,7 @@ impl UserFacingError for ConnInfoError {
|
||||
fn get_conn_info(
|
||||
ctx: &RequestMonitoring,
|
||||
headers: &HeaderMap,
|
||||
tls: Option<&TlsConfig>,
|
||||
tls: &TlsConfig,
|
||||
) -> Result<ConnInfo, ConnInfoError> {
|
||||
// HTTP only uses cleartext (for now and likely always)
|
||||
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
||||
@@ -184,22 +184,12 @@ fn get_conn_info(
|
||||
.ok_or(ConnInfoError::MissingPassword)?;
|
||||
let password = urlencoding::decode_binary(password.as_bytes());
|
||||
|
||||
let endpoint = match connection_url.host() {
|
||||
Some(url::Host::Domain(hostname)) => {
|
||||
if let Some(tls) = tls {
|
||||
endpoint_sni(hostname, &tls.common_names)?
|
||||
.ok_or(ConnInfoError::MalformedEndpoint)?
|
||||
} else {
|
||||
hostname
|
||||
.split_once(".")
|
||||
.map_or(hostname, |(prefix, _)| prefix)
|
||||
.into()
|
||||
}
|
||||
}
|
||||
Some(url::Host::Ipv4(_)) | Some(url::Host::Ipv6(_)) | None => {
|
||||
return Err(ConnInfoError::MissingHostname)
|
||||
}
|
||||
};
|
||||
let hostname = connection_url
|
||||
.host_str()
|
||||
.ok_or(ConnInfoError::MissingHostname)?;
|
||||
|
||||
let endpoint =
|
||||
endpoint_sni(hostname, &tls.common_names)?.ok_or(ConnInfoError::MalformedEndpoint)?;
|
||||
ctx.set_endpoint_id(endpoint.clone());
|
||||
|
||||
let pairs = connection_url.query_pairs();
|
||||
@@ -512,7 +502,7 @@ async fn handle_inner(
|
||||
let headers = request.headers();
|
||||
|
||||
// TLS config should be there.
|
||||
let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref())?;
|
||||
let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?;
|
||||
info!(user = conn_info.user_info.user.as_str(), "credentials");
|
||||
|
||||
// Allow connection pooling only if explicitly requested
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
use utils::auth::{AuthError, Claims, Scope};
|
||||
use utils::id::TenantId;
|
||||
|
||||
/// If tenant_id is provided, allow if token (claims) is for this tenant or
|
||||
/// whole safekeeper scope (SafekeeperData). Else, allow only if token is
|
||||
/// SafekeeperData.
|
||||
pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
|
||||
match (&claims.scope, tenant_id) {
|
||||
(Scope::Tenant, None) => Err(AuthError(
|
||||
|
||||
@@ -18,8 +18,8 @@ use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWri
|
||||
use utils::http::request::parse_query_param;
|
||||
|
||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||
use safekeeper_api::models::TimelineCreateRequest;
|
||||
use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest};
|
||||
use safekeeper_api::models::{TimelineCreateRequest, TimelineTermBumpRequest};
|
||||
use utils::{
|
||||
auth::SwappableJwtAuth,
|
||||
http::{
|
||||
@@ -114,16 +114,6 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
|
||||
})
|
||||
}
|
||||
|
||||
/// List all (not deleted) timelines.
|
||||
async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
let res: Vec<TenantTimelineId> = GlobalTimelines::get_all()
|
||||
.iter()
|
||||
.map(|tli| tli.ttid)
|
||||
.collect();
|
||||
json_response(StatusCode::OK, res)
|
||||
}
|
||||
|
||||
/// Report info about timeline.
|
||||
async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let ttid = TenantTimelineId::new(
|
||||
@@ -312,12 +302,11 @@ async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body
|
||||
|
||||
/// Force persist control file.
|
||||
async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let ttid = TenantTimelineId::new(
|
||||
parse_request_param(&request, "tenant_id")?,
|
||||
parse_request_param(&request, "timeline_id")?,
|
||||
);
|
||||
check_permission(&request, Some(ttid.tenant_id))?;
|
||||
|
||||
let tli = GlobalTimelines::get(ttid)?;
|
||||
tli.write_shared_state()
|
||||
@@ -330,6 +319,28 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// Make term at least as high as one in request. If one in request is None,
|
||||
/// increment current one.
|
||||
async fn timeline_term_bump_handler(
|
||||
mut request: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let ttid = TenantTimelineId::new(
|
||||
parse_request_param(&request, "tenant_id")?,
|
||||
parse_request_param(&request, "timeline_id")?,
|
||||
);
|
||||
check_permission(&request, Some(ttid.tenant_id))?;
|
||||
|
||||
let request_data: TimelineTermBumpRequest = json_request(&mut request).await?;
|
||||
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
let response = tli
|
||||
.term_bump(request_data.term)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
|
||||
/// Deactivates the timeline and removes its data directory.
|
||||
async fn timeline_delete_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let ttid = TenantTimelineId::new(
|
||||
@@ -568,33 +579,23 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
|
||||
failpoints_handler(r, cancel).await
|
||||
})
|
||||
})
|
||||
.delete("/v1/tenant/:tenant_id", |r| {
|
||||
request_span(r, tenant_delete_handler)
|
||||
})
|
||||
// Will be used in the future instead of implicit timeline creation
|
||||
.post("/v1/tenant/timeline", |r| {
|
||||
request_span(r, timeline_create_handler)
|
||||
})
|
||||
.get("/v1/tenant/timeline", |r| {
|
||||
request_span(r, timeline_list_handler)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
||||
request_span(r, timeline_status_handler)
|
||||
})
|
||||
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
||||
request_span(r, timeline_delete_handler)
|
||||
})
|
||||
.delete("/v1/tenant/:tenant_id", |r| {
|
||||
request_span(r, tenant_delete_handler)
|
||||
})
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot/:destination_id",
|
||||
|r| request_span(r, timeline_snapshot_handler),
|
||||
)
|
||||
.post("/v1/pull_timeline", |r| {
|
||||
request_span(r, timeline_pull_handler)
|
||||
})
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy",
|
||||
|r| request_span(r, timeline_copy_handler),
|
||||
)
|
||||
.patch(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/control_file",
|
||||
|r| request_span(r, patch_control_file_handler),
|
||||
@@ -603,6 +604,17 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
|
||||
|r| request_span(r, timeline_checkpoint_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/term_bump",
|
||||
|r| request_span(r, timeline_term_bump_handler),
|
||||
)
|
||||
.post("/v1/pull_timeline", |r| {
|
||||
request_span(r, timeline_pull_handler)
|
||||
})
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy",
|
||||
|r| request_span(r, timeline_copy_handler),
|
||||
)
|
||||
// for tests
|
||||
.post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| {
|
||||
request_span(r, record_safekeeper_info)
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
//! Defines per timeline data stored persistently (SafeKeeperPersistentState)
|
||||
//! and its wrapper with in memory layer (SafekeeperState).
|
||||
|
||||
use std::ops::Deref;
|
||||
use std::{cmp::max, ops::Deref};
|
||||
|
||||
use anyhow::Result;
|
||||
use safekeeper_api::models::TimelineTermBumpResponse;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::{
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
@@ -12,7 +13,7 @@ use utils::{
|
||||
|
||||
use crate::{
|
||||
control_file,
|
||||
safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, TermHistory},
|
||||
safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, Term, TermHistory},
|
||||
wal_backup_partial::{self},
|
||||
};
|
||||
|
||||
@@ -209,6 +210,27 @@ where
|
||||
let s = self.start_change();
|
||||
self.finish_change(&s).await
|
||||
}
|
||||
|
||||
/// Make term at least as `to`. If `to` is None, increment current one. This
|
||||
/// is not in safekeeper.rs because we want to be able to do it even if
|
||||
/// timeline is offloaded.
|
||||
pub async fn term_bump(&mut self, to: Option<Term>) -> Result<TimelineTermBumpResponse> {
|
||||
let before = self.acceptor_state.term;
|
||||
let mut state = self.start_change();
|
||||
let new = match to {
|
||||
Some(to) => max(state.acceptor_state.term, to),
|
||||
None => state.acceptor_state.term + 1,
|
||||
};
|
||||
if new > state.acceptor_state.term {
|
||||
state.acceptor_state.term = new;
|
||||
self.finish_change(&state).await?;
|
||||
}
|
||||
let after = self.acceptor_state.term;
|
||||
Ok(TimelineTermBumpResponse {
|
||||
previous_term: before,
|
||||
current_term: after,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<CTRL> Deref for TimelineState<CTRL>
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use safekeeper_api::models::TimelineTermBumpResponse;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::fs::{self};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -213,6 +214,10 @@ impl StateSK {
|
||||
.get_last_log_term(self.flush_lsn())
|
||||
}
|
||||
|
||||
/// Forwards a term bump request to the state returned by `state_mut()`.
///
/// `to` is passed through unchanged; the callee decides what `None` and
/// `Some(term)` mean and builds the response.
pub async fn term_bump(&mut self, to: Option<Term>) -> Result<TimelineTermBumpResponse> {
|
||||
self.state_mut().term_bump(to).await
|
||||
}
|
||||
|
||||
/// Close open WAL files to release FDs.
|
||||
fn close_wal_store(&mut self) {
|
||||
if let StateSK::Loaded(sk) = self {
|
||||
@@ -847,6 +852,11 @@ impl Timeline {
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
/// Bumps the term of this timeline.
///
/// Obtains the guard from `write_shared_state()` and, while holding it,
/// delegates to `sk.term_bump(to)`, returning that call's result as is.
pub async fn term_bump(self: &Arc<Self>, to: Option<Term>) -> Result<TimelineTermBumpResponse> {
|
||||
// The guard is kept alive until the delegated call below has completed.
let mut state = self.write_shared_state().await;
|
||||
state.sk.term_bump(to).await
|
||||
}
|
||||
|
||||
/// Get the timeline guard for reading/writing WAL files.
|
||||
/// If WAL files are not present on disk (evicted), they will be automatically
|
||||
/// downloaded from remote storage. This is done in the manager task, which is
|
||||
|
||||
@@ -68,29 +68,16 @@ const parseReportJson = async ({ reportJsonUrl, fetch }) => {
|
||||
console.info(`Cannot get BUILD_TYPE and Postgres Version from test name: "${test.name}", defaulting to "release" and "14"`)
|
||||
|
||||
buildType = "release"
|
||||
pgVersion = "16"
|
||||
pgVersion = "14"
|
||||
}
|
||||
|
||||
pgVersions.add(pgVersion)
|
||||
|
||||
// We use `arch` as it is returned by GitHub Actions
|
||||
// (RUNNER_ARCH env var): X86, X64, ARM, or ARM64
|
||||
// Ref https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#default-environment-variables
|
||||
let arch = ""
|
||||
if (test.parameters.includes("'X64'")) {
|
||||
arch = "x86-64"
|
||||
} else if (test.parameters.includes("'ARM64'")) {
|
||||
arch = "arm64"
|
||||
} else {
|
||||
arch = "unknown"
|
||||
}
|
||||
|
||||
// Removing build type and PostgreSQL version from the test name to make it shorter
|
||||
const testName = test.name.replace(new RegExp(`${buildType}-pg${pgVersion}-?`), "").replace("[]", "")
|
||||
test.pytestName = `${parentSuite.name.replace(".", "/")}/${suite.name}.py::${testName}`
|
||||
test.pgVersion = pgVersion
|
||||
test.buildType = buildType
|
||||
test.arch = arch
|
||||
|
||||
if (test.status === "passed") {
|
||||
passedTests[pgVersion][testName].push(test)
|
||||
@@ -157,7 +144,7 @@ const reportSummary = async (params) => {
|
||||
const links = []
|
||||
for (const test of tests) {
|
||||
const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}`
|
||||
links.push(`[${test.buildType}-${test.arch}](${allureLink})`)
|
||||
links.push(`[${test.buildType}](${allureLink})`)
|
||||
}
|
||||
summary += `- \`${testName}\`: ${links.join(", ")}\n`
|
||||
}
|
||||
@@ -188,7 +175,7 @@ const reportSummary = async (params) => {
|
||||
const links = []
|
||||
for (const test of tests) {
|
||||
const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}/retries`
|
||||
links.push(`[${test.buildType}-${test.arch}](${allureLink})`)
|
||||
links.push(`[${test.buildType}](${allureLink})`)
|
||||
}
|
||||
summary += `- \`${testName}\`: ${links.join(", ")}\n`
|
||||
}
|
||||
|
||||
@@ -18,7 +18,6 @@ import psycopg2
|
||||
from psycopg2.extras import execute_values
|
||||
|
||||
CREATE_TABLE = """
|
||||
CREATE TYPE arch AS ENUM ('ARM64', 'X64', 'UNKNOWN');
|
||||
CREATE TABLE IF NOT EXISTS results (
|
||||
id BIGSERIAL PRIMARY KEY,
|
||||
parent_suite TEXT NOT NULL,
|
||||
@@ -29,7 +28,6 @@ CREATE TABLE IF NOT EXISTS results (
|
||||
stopped_at TIMESTAMPTZ NOT NULL,
|
||||
duration INT NOT NULL,
|
||||
flaky BOOLEAN NOT NULL,
|
||||
arch arch DEFAULT 'X64',
|
||||
build_type TEXT NOT NULL,
|
||||
pg_version INT NOT NULL,
|
||||
run_id BIGINT NOT NULL,
|
||||
@@ -37,7 +35,7 @@ CREATE TABLE IF NOT EXISTS results (
|
||||
reference TEXT NOT NULL,
|
||||
revision CHAR(40) NOT NULL,
|
||||
raw JSONB COMPRESSION lz4 NOT NULL,
|
||||
UNIQUE (parent_suite, suite, name, arch, build_type, pg_version, started_at, stopped_at, run_id)
|
||||
UNIQUE (parent_suite, suite, name, build_type, pg_version, started_at, stopped_at, run_id)
|
||||
);
|
||||
"""
|
||||
|
||||
@@ -52,7 +50,6 @@ class Row:
|
||||
stopped_at: datetime
|
||||
duration: int
|
||||
flaky: bool
|
||||
arch: str
|
||||
build_type: str
|
||||
pg_version: int
|
||||
run_id: int
|
||||
@@ -124,14 +121,6 @@ def ingest_test_result(
|
||||
raw.pop("labels")
|
||||
raw.pop("extra")
|
||||
|
||||
# All allure parameters are prefixed with "__", see test_runner/fixtures/parametrize.py
|
||||
parameters = {
|
||||
p["name"].removeprefix("__"): p["value"]
|
||||
for p in test["parameters"]
|
||||
if p["name"].startswith("__")
|
||||
}
|
||||
arch = parameters.get("arch", "UNKNOWN").strip("'")
|
||||
|
||||
build_type, pg_version, unparametrized_name = parse_test_name(test["name"])
|
||||
labels = {label["name"]: label["value"] for label in test["labels"]}
|
||||
row = Row(
|
||||
@@ -143,7 +132,6 @@ def ingest_test_result(
|
||||
stopped_at=datetime.fromtimestamp(test["time"]["stop"] / 1000, tz=timezone.utc),
|
||||
duration=test["time"]["duration"],
|
||||
flaky=test["flaky"] or test["retriesStatusChange"],
|
||||
arch=arch,
|
||||
build_type=build_type,
|
||||
pg_version=pg_version,
|
||||
run_id=run_id,
|
||||
|
||||
@@ -44,7 +44,7 @@ run the following commands from the top of the neon.git checkout
|
||||
|
||||
# test suite run
|
||||
export TEST_OUTPUT="$TEST_OUTPUT"
|
||||
DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest test_runner/performance/test_latency.py
|
||||
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest test_runner/performance/test_latency.py
|
||||
|
||||
# for interactive use
|
||||
export NEON_REPO_DIR="$NEON_REPO_DIR"
|
||||
|
||||
@@ -6,7 +6,10 @@ use std::{
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use pageserver_api::{controller_api::NodeAvailability, models::PageserverUtilization};
|
||||
use pageserver_api::{
|
||||
controller_api::{NodeAvailability, UtilizationScore},
|
||||
models::PageserverUtilization,
|
||||
};
|
||||
|
||||
use thiserror::Error;
|
||||
use utils::id::NodeId;
|
||||
@@ -84,12 +87,9 @@ impl Heartbeater {
|
||||
pageservers,
|
||||
reply: sender,
|
||||
})
|
||||
.map_err(|_| HeartbeaterError::Cancel)?;
|
||||
.unwrap();
|
||||
|
||||
receiver
|
||||
.await
|
||||
.map_err(|_| HeartbeaterError::Cancel)
|
||||
.and_then(|x| x)
|
||||
receiver.await.unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -144,8 +144,7 @@ impl HeartbeaterTask {
|
||||
// goes through to the pageserver even when the node is marked offline.
|
||||
// This doesn't impact the availability observed by [`crate::service::Service`].
|
||||
let mut node_clone = node.clone();
|
||||
node_clone
|
||||
.set_availability(NodeAvailability::Active(PageserverUtilization::full()));
|
||||
node_clone.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
|
||||
|
||||
async move {
|
||||
let response = node_clone
|
||||
@@ -177,7 +176,7 @@ impl HeartbeaterTask {
|
||||
node.get_availability()
|
||||
{
|
||||
PageserverState::WarmingUp {
|
||||
started_at: *last_seen_at,
|
||||
started_at: last_seen_at,
|
||||
}
|
||||
} else {
|
||||
PageserverState::Offline
|
||||
|
||||
@@ -1074,6 +1074,7 @@ pub fn make_router(
|
||||
RequestName("control_v1_metadata_health_list_outdated"),
|
||||
)
|
||||
})
|
||||
// TODO(vlad): endpoint for cancelling drain and fill
|
||||
// Tenant Shard operations
|
||||
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
|
||||
tenant_service_handler(
|
||||
|
||||
@@ -1,135 +0,0 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use hyper::Uri;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use crate::{
|
||||
peer_client::{GlobalObservedState, PeerClient},
|
||||
persistence::{ControllerPersistence, DatabaseError, DatabaseResult, Persistence},
|
||||
service::Config,
|
||||
};
|
||||
|
||||
/// Helper for storage controller leadership acquisition
///
/// Holds everything the leadership routines need: database access for the
/// persisted leader entry, the service configuration, and a cancellation
/// token for the outgoing step-down request.
|
||||
pub(crate) struct Leadership {
|
||||
// Read with `get_leader()` and written with `update_leader()`.
persistence: Arc<Persistence>,
|
||||
// Supplies `start_as_candidate`, `address_for_peers` and `peer_jwt_token`.
config: Config,
|
||||
// Passed to `PeerClient::step_down` when asking the current leader to step down.
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
/// Errors surfaced by the leadership helper.
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum Error {
|
||||
// `transparent` forwards Display/source to the wrapped error; `#[from]`
// lets `?` convert a `DatabaseError` into this variant.
#[error(transparent)]
|
||||
Database(#[from] DatabaseError),
|
||||
}
|
||||
|
||||
/// Result alias used by this module, with [`Error`] as the error type.
pub(crate) type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
impl Leadership {
|
||||
pub(crate) fn new(
|
||||
persistence: Arc<Persistence>,
|
||||
config: Config,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
Self {
|
||||
persistence,
|
||||
config,
|
||||
cancel,
|
||||
}
|
||||
}
|
||||
|
||||
/// Find the current leader in the database and request it to step down if required.
|
||||
/// Should be called early on in within the start-up sequence.
|
||||
///
|
||||
/// Returns a tuple of two optionals: the current leader and its observed state
|
||||
pub(crate) async fn step_down_current_leader(
|
||||
&self,
|
||||
) -> Result<(Option<ControllerPersistence>, Option<GlobalObservedState>)> {
|
||||
let leader = self.current_leader().await?;
|
||||
let leader_step_down_state = if let Some(ref leader) = leader {
|
||||
if self.config.start_as_candidate {
|
||||
self.request_step_down(leader).await
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
tracing::info!("No leader found to request step down from. Will build observed state.");
|
||||
None
|
||||
};
|
||||
|
||||
Ok((leader, leader_step_down_state))
|
||||
}
|
||||
|
||||
/// Mark the current storage controller instance as the leader in the database
|
||||
pub(crate) async fn become_leader(
|
||||
&self,
|
||||
current_leader: Option<ControllerPersistence>,
|
||||
) -> Result<()> {
|
||||
if let Some(address_for_peers) = &self.config.address_for_peers {
|
||||
// TODO: `address-for-peers` can become a mandatory cli arg
|
||||
// after we update the k8s setup
|
||||
let proposed_leader = ControllerPersistence {
|
||||
address: address_for_peers.to_string(),
|
||||
started_at: chrono::Utc::now(),
|
||||
};
|
||||
|
||||
self.persistence
|
||||
.update_leader(current_leader, proposed_leader)
|
||||
.await
|
||||
.map_err(Error::Database)
|
||||
} else {
|
||||
tracing::info!("No address-for-peers provided. Skipping leader persistence.");
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
async fn current_leader(&self) -> DatabaseResult<Option<ControllerPersistence>> {
|
||||
let res = self.persistence.get_leader().await;
|
||||
if let Err(DatabaseError::Query(diesel::result::Error::DatabaseError(_kind, ref err))) = res
|
||||
{
|
||||
const REL_NOT_FOUND_MSG: &str = "relation \"controllers\" does not exist";
|
||||
if err.message().trim() == REL_NOT_FOUND_MSG {
|
||||
// Special case: if this is a brand new storage controller, migrations will not
|
||||
// have run at this point yet, and, hence, the controllers table does not exist.
|
||||
// Detect this case via the error string (diesel doesn't type it) and allow it.
|
||||
tracing::info!("Detected first storage controller start-up. Allowing missing controllers table ...");
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
/// Request step down from the currently registered leader in the database
|
||||
///
|
||||
/// If such an entry is persisted, the success path returns the observed
|
||||
/// state and details of the leader. Otherwise, None is returned indicating
|
||||
/// there is no leader currently.
|
||||
async fn request_step_down(
|
||||
&self,
|
||||
leader: &ControllerPersistence,
|
||||
) -> Option<GlobalObservedState> {
|
||||
tracing::info!("Sending step down request to {leader:?}");
|
||||
|
||||
let client = PeerClient::new(
|
||||
Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"),
|
||||
self.config.peer_jwt_token.clone(),
|
||||
);
|
||||
let state = client.step_down(&self.cancel).await;
|
||||
match state {
|
||||
Ok(state) => Some(state),
|
||||
Err(err) => {
|
||||
// TODO: Make leaders periodically update a timestamp field in the
|
||||
// database and, if the leader is not reachable from the current instance,
|
||||
// but inferred as alive from the timestamp, abort start-up. This avoids
|
||||
// a potential scenario in which we have two controllers acting as leaders.
|
||||
tracing::error!(
|
||||
"Leader ({}) did not respond to step-down request: {}",
|
||||
leader.address,
|
||||
err
|
||||
);
|
||||
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -8,7 +8,6 @@ mod drain_utils;
|
||||
mod heartbeater;
|
||||
pub mod http;
|
||||
mod id_lock_map;
|
||||
mod leadership;
|
||||
pub mod metrics;
|
||||
mod node;
|
||||
mod pageserver_client;
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use anyhow::{anyhow, Context};
|
||||
use clap::Parser;
|
||||
use diesel::Connection;
|
||||
use hyper::Uri;
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use metrics::BuildInfo;
|
||||
@@ -26,6 +27,9 @@ use utils::{project_build_tag, project_git_version, tcp_listener};
|
||||
project_git_version!(GIT_VERSION);
|
||||
project_build_tag!(BUILD_TAG);
|
||||
|
||||
use diesel_migrations::{embed_migrations, EmbeddedMigrations};
|
||||
pub const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations");
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(author, version, about, long_about = None)]
|
||||
#[command(arg_required_else_help(true))]
|
||||
@@ -47,9 +51,6 @@ struct Cli {
|
||||
#[arg(long)]
|
||||
control_plane_jwt_token: Option<String>,
|
||||
|
||||
#[arg(long)]
|
||||
peer_jwt_token: Option<String>,
|
||||
|
||||
/// URL to control plane compute notification endpoint
|
||||
#[arg(long)]
|
||||
compute_hook_url: Option<String>,
|
||||
@@ -129,28 +130,28 @@ struct Secrets {
|
||||
public_key: Option<JwtAuth>,
|
||||
jwt_token: Option<String>,
|
||||
control_plane_jwt_token: Option<String>,
|
||||
peer_jwt_token: Option<String>,
|
||||
}
|
||||
|
||||
impl Secrets {
|
||||
const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
|
||||
const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
|
||||
const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
|
||||
const PEER_JWT_TOKEN_ENV: &'static str = "PEER_JWT_TOKEN";
|
||||
const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY";
|
||||
|
||||
/// Load secrets from, in order of preference:
|
||||
/// - CLI args if database URL is provided on the CLI
|
||||
/// - Environment variables if DATABASE_URL is set.
|
||||
/// - AWS Secrets Manager secrets
|
||||
async fn load(args: &Cli) -> anyhow::Result<Self> {
|
||||
let Some(database_url) = Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV)
|
||||
let Some(database_url) =
|
||||
Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV).await
|
||||
else {
|
||||
anyhow::bail!(
|
||||
"Database URL is not set (set `--database-url`, or `DATABASE_URL` environment)"
|
||||
)
|
||||
};
|
||||
|
||||
let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV) {
|
||||
let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV).await {
|
||||
Some(v) => Some(JwtAuth::from_key(v).context("Loading public key")?),
|
||||
None => None,
|
||||
};
|
||||
@@ -158,18 +159,18 @@ impl Secrets {
|
||||
let this = Self {
|
||||
database_url,
|
||||
public_key,
|
||||
jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV),
|
||||
jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV).await,
|
||||
control_plane_jwt_token: Self::load_secret(
|
||||
&args.control_plane_jwt_token,
|
||||
Self::CONTROL_PLANE_JWT_TOKEN_ENV,
|
||||
),
|
||||
peer_jwt_token: Self::load_secret(&args.peer_jwt_token, Self::PEER_JWT_TOKEN_ENV),
|
||||
)
|
||||
.await,
|
||||
};
|
||||
|
||||
Ok(this)
|
||||
}
|
||||
|
||||
fn load_secret(cli: &Option<String>, env_name: &str) -> Option<String> {
|
||||
async fn load_secret(cli: &Option<String>, env_name: &str) -> Option<String> {
|
||||
if let Some(v) = cli {
|
||||
Some(v.clone())
|
||||
} else if let Ok(v) = std::env::var(env_name) {
|
||||
@@ -180,6 +181,20 @@ impl Secrets {
|
||||
}
|
||||
}
|
||||
|
||||
/// Execute the diesel migrations that are built into this binary
|
||||
async fn migration_run(database_url: &str) -> anyhow::Result<()> {
|
||||
use diesel::PgConnection;
|
||||
use diesel_migrations::{HarnessWithOutput, MigrationHarness};
|
||||
let mut conn = PgConnection::establish(database_url)?;
|
||||
|
||||
HarnessWithOutput::write_to_stdout(&mut conn)
|
||||
.run_pending_migrations(MIGRATIONS)
|
||||
.map(|_| ())
|
||||
.map_err(|e| anyhow::anyhow!(e))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
logging::init(
|
||||
LogFormat::Plain,
|
||||
@@ -269,7 +284,6 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
let config = Config {
|
||||
jwt_token: secrets.jwt_token,
|
||||
control_plane_jwt_token: secrets.control_plane_jwt_token,
|
||||
peer_jwt_token: secrets.peer_jwt_token,
|
||||
compute_hook_url: args.compute_hook_url,
|
||||
max_offline_interval: args
|
||||
.max_offline_interval
|
||||
@@ -290,9 +304,13 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
http_service_port: args.listen.port() as i32,
|
||||
};
|
||||
|
||||
// Validate that we can connect to the database
|
||||
// After loading secrets & config, but before starting anything else, apply database migrations
|
||||
Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?;
|
||||
|
||||
migration_run(&secrets.database_url)
|
||||
.await
|
||||
.context("Running database migrations")?;
|
||||
|
||||
let persistence = Arc::new(Persistence::new(secrets.database_url));
|
||||
|
||||
let service = Service::spawn(config, persistence.clone()).await?;
|
||||
|
||||
@@ -230,7 +230,6 @@ pub(crate) enum DatabaseErrorLabel {
|
||||
Connection,
|
||||
ConnectionPool,
|
||||
Logical,
|
||||
Migration,
|
||||
}
|
||||
|
||||
impl DatabaseError {
|
||||
@@ -240,7 +239,6 @@ impl DatabaseError {
|
||||
Self::Connection(_) => DatabaseErrorLabel::Connection,
|
||||
Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool,
|
||||
Self::Logical(_) => DatabaseErrorLabel::Logical,
|
||||
Self::Migration(_) => DatabaseErrorLabel::Migration,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -92,15 +92,15 @@ impl Node {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_availability(&self) -> &NodeAvailability {
|
||||
&self.availability
|
||||
pub(crate) fn get_availability(&self) -> NodeAvailability {
|
||||
self.availability
|
||||
}
|
||||
|
||||
pub(crate) fn set_availability(&mut self, availability: NodeAvailability) {
|
||||
use AvailabilityTransition::*;
|
||||
use NodeAvailability::WarmingUp;
|
||||
|
||||
match self.get_availability_transition(&availability) {
|
||||
match self.get_availability_transition(availability) {
|
||||
ToActive => {
|
||||
// Give the node a new cancellation token, effectively resetting it to un-cancelled. Any
|
||||
// users of previously-cloned copies of the node will still see the old cancellation
|
||||
@@ -115,8 +115,8 @@ impl Node {
|
||||
Unchanged | ToWarmingUpFromOffline => {}
|
||||
}
|
||||
|
||||
if let (WarmingUp(crnt), WarmingUp(proposed)) = (&self.availability, &availability) {
|
||||
self.availability = WarmingUp(std::cmp::max(*crnt, *proposed));
|
||||
if let (WarmingUp(crnt), WarmingUp(proposed)) = (self.availability, availability) {
|
||||
self.availability = WarmingUp(std::cmp::max(crnt, proposed));
|
||||
} else {
|
||||
self.availability = availability;
|
||||
}
|
||||
@@ -126,12 +126,12 @@ impl Node {
|
||||
/// into a description of the transition.
|
||||
pub(crate) fn get_availability_transition(
|
||||
&self,
|
||||
availability: &NodeAvailability,
|
||||
availability: NodeAvailability,
|
||||
) -> AvailabilityTransition {
|
||||
use AvailabilityTransition::*;
|
||||
use NodeAvailability::*;
|
||||
|
||||
match (&self.availability, availability) {
|
||||
match (self.availability, availability) {
|
||||
(Offline, Active(_)) => ToActive,
|
||||
(Active(_), Offline) => ToOffline,
|
||||
(Active(_), WarmingUp(_)) => ToWarmingUpFromActive,
|
||||
@@ -153,15 +153,15 @@ impl Node {
|
||||
|
||||
/// Is this node eligible to have work scheduled onto it?
|
||||
pub(crate) fn may_schedule(&self) -> MaySchedule {
|
||||
let utilization = match &self.availability {
|
||||
NodeAvailability::Active(u) => u.clone(),
|
||||
let score = match self.availability {
|
||||
NodeAvailability::Active(score) => score,
|
||||
NodeAvailability::Offline | NodeAvailability::WarmingUp(_) => return MaySchedule::No,
|
||||
};
|
||||
|
||||
match self.scheduling {
|
||||
NodeSchedulingPolicy::Active => MaySchedule::Yes(utilization),
|
||||
NodeSchedulingPolicy::Active => MaySchedule::Yes(score),
|
||||
NodeSchedulingPolicy::Draining => MaySchedule::No,
|
||||
NodeSchedulingPolicy::Filling => MaySchedule::Yes(utilization),
|
||||
NodeSchedulingPolicy::Filling => MaySchedule::Yes(score),
|
||||
NodeSchedulingPolicy::Pause => MaySchedule::No,
|
||||
NodeSchedulingPolicy::PauseForRestart => MaySchedule::No,
|
||||
}
|
||||
@@ -285,7 +285,7 @@ impl Node {
|
||||
pub(crate) fn describe(&self) -> NodeDescribeResponse {
|
||||
NodeDescribeResponse {
|
||||
id: self.id,
|
||||
availability: self.availability.clone().into(),
|
||||
availability: self.availability.into(),
|
||||
scheduling: self.scheduling,
|
||||
listen_http_addr: self.listen_http_addr.clone(),
|
||||
listen_http_port: self.listen_http_port,
|
||||
|
||||
@@ -25,9 +25,6 @@ use crate::metrics::{
|
||||
};
|
||||
use crate::node::Node;
|
||||
|
||||
use diesel_migrations::{embed_migrations, EmbeddedMigrations};
|
||||
const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations");
|
||||
|
||||
/// ## What do we store?
|
||||
///
|
||||
/// The storage controller service does not store most of its state durably.
|
||||
@@ -75,8 +72,6 @@ pub(crate) enum DatabaseError {
|
||||
ConnectionPool(#[from] r2d2::Error),
|
||||
#[error("Logical error: {0}")]
|
||||
Logical(String),
|
||||
#[error("Migration error: {0}")]
|
||||
Migration(String),
|
||||
}
|
||||
|
||||
#[derive(measured::FixedCardinalityLabel, Copy, Clone)]
|
||||
@@ -172,19 +167,6 @@ impl Persistence {
|
||||
}
|
||||
}
|
||||
|
||||
/// Execute the diesel migrations that are built into this binary
|
||||
pub(crate) async fn migration_run(&self) -> DatabaseResult<()> {
|
||||
use diesel_migrations::{HarnessWithOutput, MigrationHarness};
|
||||
|
||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||
HarnessWithOutput::write_to_stdout(conn)
|
||||
.run_pending_migrations(MIGRATIONS)
|
||||
.map(|_| ())
|
||||
.map_err(|e| DatabaseError::Migration(e.to_string()))
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
/// Wraps `with_conn` in order to collect latency and error metrics
|
||||
async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
|
||||
where
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::{node::Node, tenant_shard::TenantShard};
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::models::PageserverUtilization;
|
||||
use pageserver_api::controller_api::UtilizationScore;
|
||||
use serde::Serialize;
|
||||
use std::collections::HashMap;
|
||||
use utils::{http::error::ApiError, id::NodeId};
|
||||
@@ -20,9 +20,9 @@ impl From<ScheduleError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
#[derive(Serialize, Eq, PartialEq)]
|
||||
pub enum MaySchedule {
|
||||
Yes(PageserverUtilization),
|
||||
Yes(UtilizationScore),
|
||||
No,
|
||||
}
|
||||
|
||||
@@ -282,28 +282,6 @@ impl Scheduler {
|
||||
node.shard_count -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Maybe update PageserverUtilization
|
||||
match update {
|
||||
RefCountUpdate::AddSecondary | RefCountUpdate::Attach => {
|
||||
// Referencing the node: if this takes our shard_count above the utilization structure's
|
||||
// shard count, then artificially bump it: this ensures that the scheduler immediately
|
||||
// recognizes that this node has more work on it, without waiting for the next heartbeat
|
||||
// to update the utilization.
|
||||
if let MaySchedule::Yes(utilization) = &mut node.may_schedule {
|
||||
utilization.adjust_shard_count_max(node.shard_count as u32);
|
||||
}
|
||||
}
|
||||
RefCountUpdate::PromoteSecondary
|
||||
| RefCountUpdate::Detach
|
||||
| RefCountUpdate::RemoveSecondary
|
||||
| RefCountUpdate::DemoteAttached => {
|
||||
// De-referencing the node: leave the utilization's shard_count at a stale higher
|
||||
// value until some future heartbeat after we have physically removed this shard
|
||||
// from the node: this prevents the scheduler over-optimistically trying to schedule
|
||||
// more work onto the node before earlier detaches are done.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check if the number of shards attached to a given node is lagging below
|
||||
@@ -348,18 +326,7 @@ impl Scheduler {
|
||||
use std::collections::hash_map::Entry::*;
|
||||
match self.nodes.entry(node.get_id()) {
|
||||
Occupied(mut entry) => {
|
||||
// Updates to MaySchedule are how we receive updated PageserverUtilization: adjust these values
|
||||
// to account for any shards scheduled on the controller but not yet visible to the pageserver.
|
||||
let mut may_schedule = node.may_schedule();
|
||||
match &mut may_schedule {
|
||||
MaySchedule::Yes(utilization) => {
|
||||
utilization.adjust_shard_count_max(entry.get().shard_count as u32);
|
||||
}
|
||||
MaySchedule::No => { // Nothing to tweak
|
||||
}
|
||||
}
|
||||
|
||||
entry.get_mut().may_schedule = may_schedule;
|
||||
entry.get_mut().may_schedule = node.may_schedule();
|
||||
}
|
||||
Vacant(entry) => {
|
||||
entry.insert(SchedulerNode {
|
||||
@@ -396,7 +363,7 @@ impl Scheduler {
|
||||
let may_schedule = self
|
||||
.nodes
|
||||
.get(node_id)
|
||||
.map(|n| !matches!(n.may_schedule, MaySchedule::No))
|
||||
.map(|n| n.may_schedule != MaySchedule::No)
|
||||
.unwrap_or(false);
|
||||
(*node_id, may_schedule)
|
||||
})
|
||||
@@ -416,7 +383,7 @@ impl Scheduler {
|
||||
/// the same tenant on the same node. This is a soft constraint: the context will never
|
||||
/// cause us to fail to schedule a shard.
|
||||
pub(crate) fn schedule_shard(
|
||||
&mut self,
|
||||
&self,
|
||||
hard_exclude: &[NodeId],
|
||||
context: &ScheduleContext,
|
||||
) -> Result<NodeId, ScheduleError> {
|
||||
@@ -424,41 +391,31 @@ impl Scheduler {
|
||||
return Err(ScheduleError::NoPageservers);
|
||||
}
|
||||
|
||||
let mut scores: Vec<(NodeId, AffinityScore, u64, usize)> = self
|
||||
let mut scores: Vec<(NodeId, AffinityScore, usize, usize)> = self
|
||||
.nodes
|
||||
.iter_mut()
|
||||
.filter_map(|(k, v)| match &mut v.may_schedule {
|
||||
MaySchedule::No => None,
|
||||
MaySchedule::Yes(_) if hard_exclude.contains(k) => None,
|
||||
MaySchedule::Yes(utilization) => Some((
|
||||
*k,
|
||||
context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
|
||||
utilization.cached_score(),
|
||||
v.attached_shard_count,
|
||||
)),
|
||||
.iter()
|
||||
.filter_map(|(k, v)| {
|
||||
if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No {
|
||||
None
|
||||
} else {
|
||||
Some((
|
||||
*k,
|
||||
context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
|
||||
v.shard_count,
|
||||
v.attached_shard_count,
|
||||
))
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Exclude nodes whose utilization is critically high, if there are alternatives available. This will
|
||||
// cause us to violate affinity rules if it is necessary to avoid critically overloading nodes: for example
|
||||
// we may place shards in the same tenant together on the same pageserver if all other pageservers are
|
||||
// overloaded.
|
||||
let non_overloaded_scores = scores
|
||||
.iter()
|
||||
.filter(|i| !PageserverUtilization::is_overloaded(i.2))
|
||||
.copied()
|
||||
.collect::<Vec<_>>();
|
||||
if !non_overloaded_scores.is_empty() {
|
||||
scores = non_overloaded_scores;
|
||||
}
|
||||
|
||||
// Sort by, in order of precedence:
|
||||
// 1st: Affinity score. We should never pick a higher-score node if a lower-score node is available
|
||||
// 2nd: Utilization score (this combines shard count and disk utilization)
|
||||
// 3rd: Attached shard count. When nodes have identical utilization (e.g. when populating some
|
||||
// empty nodes), this acts as an anti-affinity between attached shards.
|
||||
// 2nd: Attached shard count. Within nodes with the same affinity, we always pick the node with
|
||||
// the least number of attached shards.
|
||||
// 3rd: Total shard count. Within nodes with the same affinity and attached shard count, use nodes
|
||||
// with the lower total shard count.
|
||||
// 4th: Node ID. This is a convenience to make selection deterministic in tests and empty systems.
|
||||
scores.sort_by_key(|i| (i.1, i.2, i.3, i.0));
|
||||
scores.sort_by_key(|i| (i.1, i.3, i.2, i.0));
|
||||
|
||||
if scores.is_empty() {
|
||||
// After applying constraints, no pageservers were left.
|
||||
@@ -472,7 +429,7 @@ impl Scheduler {
|
||||
for (node_id, node) in &self.nodes {
|
||||
tracing::info!(
|
||||
"Node {node_id}: may_schedule={} shards={}",
|
||||
!matches!(node.may_schedule, MaySchedule::No),
|
||||
node.may_schedule != MaySchedule::No,
|
||||
node.shard_count
|
||||
);
|
||||
}
|
||||
@@ -512,7 +469,7 @@ impl Scheduler {
|
||||
pub(crate) mod test_utils {
|
||||
|
||||
use crate::node::Node;
|
||||
use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization};
|
||||
use pageserver_api::controller_api::{NodeAvailability, UtilizationScore};
|
||||
use std::collections::HashMap;
|
||||
use utils::id::NodeId;
|
||||
/// Test helper: synthesize the requested number of nodes, all in active state.
|
||||
@@ -529,7 +486,7 @@ pub(crate) mod test_utils {
|
||||
format!("pghost-{i}"),
|
||||
5432 + i as u16,
|
||||
);
|
||||
node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0)));
|
||||
node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
|
||||
assert!(node.is_available());
|
||||
node
|
||||
})
|
||||
@@ -540,8 +497,6 @@ pub(crate) mod test_utils {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization};
|
||||
|
||||
use super::*;
|
||||
|
||||
use crate::tenant_shard::IntentState;
|
||||
@@ -602,130 +557,4 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
/// Test the PageserverUtilization's contribution to scheduling algorithm
|
||||
fn scheduler_utilization() {
|
||||
let mut nodes = test_utils::make_test_nodes(3);
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
// Need to keep these alive because they contribute to shard counts via RAII
|
||||
let mut scheduled_intents = Vec::new();
|
||||
|
||||
let empty_context = ScheduleContext::default();
|
||||
|
||||
fn assert_scheduler_chooses(
|
||||
expect_node: NodeId,
|
||||
scheduled_intents: &mut Vec<IntentState>,
|
||||
scheduler: &mut Scheduler,
|
||||
context: &ScheduleContext,
|
||||
) {
|
||||
let scheduled = scheduler.schedule_shard(&[], context).unwrap();
|
||||
let mut intent = IntentState::new();
|
||||
intent.set_attached(scheduler, Some(scheduled));
|
||||
scheduled_intents.push(intent);
|
||||
assert_eq!(scheduled, expect_node);
|
||||
}
|
||||
|
||||
// Independent schedule calls onto empty nodes should round-robin, because each node's
|
||||
// utilization's shard count is updated inline. The order is deterministic because when all other factors are
|
||||
// equal, we order by node ID.
|
||||
assert_scheduler_chooses(
|
||||
NodeId(1),
|
||||
&mut scheduled_intents,
|
||||
&mut scheduler,
|
||||
&empty_context,
|
||||
);
|
||||
assert_scheduler_chooses(
|
||||
NodeId(2),
|
||||
&mut scheduled_intents,
|
||||
&mut scheduler,
|
||||
&empty_context,
|
||||
);
|
||||
assert_scheduler_chooses(
|
||||
NodeId(3),
|
||||
&mut scheduled_intents,
|
||||
&mut scheduler,
|
||||
&empty_context,
|
||||
);
|
||||
|
||||
// Manually setting utilization higher should cause schedule calls to round-robin the other nodes
|
||||
// which have equal utilization.
|
||||
nodes
|
||||
.get_mut(&NodeId(1))
|
||||
.unwrap()
|
||||
.set_availability(NodeAvailability::Active(test_utilization::simple(
|
||||
10,
|
||||
1024 * 1024 * 1024,
|
||||
)));
|
||||
scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
|
||||
|
||||
assert_scheduler_chooses(
|
||||
NodeId(2),
|
||||
&mut scheduled_intents,
|
||||
&mut scheduler,
|
||||
&empty_context,
|
||||
);
|
||||
assert_scheduler_chooses(
|
||||
NodeId(3),
|
||||
&mut scheduled_intents,
|
||||
&mut scheduler,
|
||||
&empty_context,
|
||||
);
|
||||
assert_scheduler_chooses(
|
||||
NodeId(2),
|
||||
&mut scheduled_intents,
|
||||
&mut scheduler,
|
||||
&empty_context,
|
||||
);
|
||||
assert_scheduler_chooses(
|
||||
NodeId(3),
|
||||
&mut scheduled_intents,
|
||||
&mut scheduler,
|
||||
&empty_context,
|
||||
);
|
||||
|
||||
// The scheduler should prefer nodes with lower affinity score,
|
||||
// even if they have higher utilization (as long as they aren't utilized at >100%)
|
||||
let mut context_prefer_node1 = ScheduleContext::default();
|
||||
context_prefer_node1.avoid(&[NodeId(2), NodeId(3)]);
|
||||
assert_scheduler_chooses(
|
||||
NodeId(1),
|
||||
&mut scheduled_intents,
|
||||
&mut scheduler,
|
||||
&context_prefer_node1,
|
||||
);
|
||||
assert_scheduler_chooses(
|
||||
NodeId(1),
|
||||
&mut scheduled_intents,
|
||||
&mut scheduler,
|
||||
&context_prefer_node1,
|
||||
);
|
||||
|
||||
// If a node is over-utilized, it will not be used even if affinity scores prefer it
|
||||
nodes
|
||||
.get_mut(&NodeId(1))
|
||||
.unwrap()
|
||||
.set_availability(NodeAvailability::Active(test_utilization::simple(
|
||||
20000,
|
||||
1024 * 1024 * 1024,
|
||||
)));
|
||||
scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
|
||||
assert_scheduler_chooses(
|
||||
NodeId(2),
|
||||
&mut scheduled_intents,
|
||||
&mut scheduler,
|
||||
&context_prefer_node1,
|
||||
);
|
||||
assert_scheduler_chooses(
|
||||
NodeId(3),
|
||||
&mut scheduled_intents,
|
||||
&mut scheduler,
|
||||
&context_prefer_node1,
|
||||
);
|
||||
|
||||
for mut intent in scheduled_intents {
|
||||
intent.clear(&mut scheduler);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,9 +17,8 @@ use crate::{
|
||||
compute_hook::NotifyError,
|
||||
drain_utils::{self, TenantShardDrain, TenantShardIterator},
|
||||
id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
|
||||
leadership::Leadership,
|
||||
metrics,
|
||||
peer_client::GlobalObservedState,
|
||||
peer_client::{GlobalObservedState, PeerClient},
|
||||
persistence::{
|
||||
AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence,
|
||||
TenantFilter,
|
||||
@@ -44,7 +43,7 @@ use pageserver_api::{
|
||||
NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest,
|
||||
TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
|
||||
TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore,
|
||||
},
|
||||
models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest},
|
||||
};
|
||||
@@ -288,9 +287,6 @@ pub struct Config {
|
||||
// This JWT token will be used to authenticate this service to the control plane.
|
||||
pub control_plane_jwt_token: Option<String>,
|
||||
|
||||
// This JWT token will be used to authenticate with other storage controller instances
|
||||
pub peer_jwt_token: Option<String>,
|
||||
|
||||
/// Where the compute hook should send notifications of pageserver attachment locations
|
||||
/// (this URL points to the control plane in prod). If this is None, the compute hook will
|
||||
/// assume it is running in a test environment and try to update neon_local.
|
||||
@@ -337,7 +333,7 @@ impl From<DatabaseError> for ApiError {
|
||||
DatabaseError::Connection(_) | DatabaseError::ConnectionPool(_) => {
|
||||
ApiError::ShuttingDown
|
||||
}
|
||||
DatabaseError::Logical(reason) | DatabaseError::Migration(reason) => {
|
||||
DatabaseError::Logical(reason) => {
|
||||
ApiError::InternalServerError(anyhow::anyhow!(reason))
|
||||
}
|
||||
}
|
||||
@@ -542,7 +538,7 @@ impl Service {
|
||||
let locked = self.inner.read().unwrap();
|
||||
locked.nodes.clone()
|
||||
};
|
||||
let mut nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await;
|
||||
let nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await;
|
||||
|
||||
// List of tenants for which we will attempt to notify compute of their location at startup
|
||||
let mut compute_notifications = Vec::new();
|
||||
@@ -556,8 +552,10 @@ impl Service {
|
||||
// Mark nodes online if they responded to us: nodes are offline by default after a restart.
|
||||
let mut new_nodes = (**nodes).clone();
|
||||
for (node_id, node) in new_nodes.iter_mut() {
|
||||
if let Some(utilization) = nodes_online.remove(node_id) {
|
||||
node.set_availability(NodeAvailability::Active(utilization));
|
||||
if let Some(utilization) = nodes_online.get(node_id) {
|
||||
node.set_availability(NodeAvailability::Active(UtilizationScore(
|
||||
utilization.utilization_score,
|
||||
)));
|
||||
scheduler.node_upsert(node);
|
||||
}
|
||||
}
|
||||
@@ -608,15 +606,22 @@ impl Service {
|
||||
|
||||
// Before making any observable changes to the cluster, persist self
|
||||
// as leader in database and memory.
|
||||
let leadership = Leadership::new(
|
||||
self.persistence.clone(),
|
||||
self.config.clone(),
|
||||
self.cancel.child_token(),
|
||||
);
|
||||
if let Some(address_for_peers) = &self.config.address_for_peers {
|
||||
// TODO: `address-for-peers` can become a mandatory cli arg
|
||||
// after we update the k8s setup
|
||||
let proposed_leader = ControllerPersistence {
|
||||
address: address_for_peers.to_string(),
|
||||
started_at: chrono::Utc::now(),
|
||||
};
|
||||
|
||||
if let Err(e) = leadership.become_leader(current_leader).await {
|
||||
tracing::error!("Failed to persist self as leader: {e}. Aborting start-up ...");
|
||||
std::process::exit(1);
|
||||
if let Err(err) = self
|
||||
.persistence
|
||||
.update_leader(current_leader, proposed_leader)
|
||||
.await
|
||||
{
|
||||
tracing::error!("Failed to persist self as leader: {err}. Aborting start-up ...");
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
self.inner.write().unwrap().become_leader();
|
||||
@@ -923,9 +928,9 @@ impl Service {
|
||||
if let Ok(deltas) = res {
|
||||
for (node_id, state) in deltas.0 {
|
||||
let new_availability = match state {
|
||||
PageserverState::Available { utilization, .. } => {
|
||||
NodeAvailability::Active(utilization)
|
||||
}
|
||||
PageserverState::Available { utilization, .. } => NodeAvailability::Active(
|
||||
UtilizationScore(utilization.utilization_score),
|
||||
),
|
||||
PageserverState::WarmingUp { started_at } => {
|
||||
NodeAvailability::WarmingUp(started_at)
|
||||
}
|
||||
@@ -934,17 +939,14 @@ impl Service {
|
||||
// while the heartbeat round was on-going. Hence, filter out
|
||||
// offline transitions for WarmingUp nodes that are still within
|
||||
// their grace period.
|
||||
if let Ok(NodeAvailability::WarmingUp(started_at)) = self
|
||||
.get_node(node_id)
|
||||
.await
|
||||
.as_ref()
|
||||
.map(|n| n.get_availability())
|
||||
if let Ok(NodeAvailability::WarmingUp(started_at)) =
|
||||
self.get_node(node_id).await.map(|n| n.get_availability())
|
||||
{
|
||||
let now = Instant::now();
|
||||
if now - *started_at >= self.config.max_warming_up_interval {
|
||||
if now - started_at >= self.config.max_warming_up_interval {
|
||||
NodeAvailability::Offline
|
||||
} else {
|
||||
NodeAvailability::WarmingUp(*started_at)
|
||||
NodeAvailability::WarmingUp(started_at)
|
||||
}
|
||||
} else {
|
||||
NodeAvailability::Offline
|
||||
@@ -1157,16 +1159,6 @@ impl Service {
|
||||
let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
|
||||
let (abort_tx, abort_rx) = tokio::sync::mpsc::unbounded_channel();
|
||||
|
||||
let leadership_cancel = CancellationToken::new();
|
||||
let leadership = Leadership::new(persistence.clone(), config.clone(), leadership_cancel);
|
||||
let (leader, leader_step_down_state) = leadership.step_down_current_leader().await?;
|
||||
|
||||
// Apply the migrations **after** the current leader has stepped down
|
||||
// (or we've given up waiting for it), but **before** reading from the
|
||||
// database. The only exception is reading the current leader before
|
||||
// migrating.
|
||||
persistence.migration_run().await?;
|
||||
|
||||
tracing::info!("Loading nodes from database...");
|
||||
let nodes = persistence
|
||||
.list_nodes()
|
||||
@@ -1384,6 +1376,32 @@ impl Service {
|
||||
return;
|
||||
};
|
||||
|
||||
let leadership_status = this.inner.read().unwrap().get_leadership_status();
|
||||
let leader = match this.get_leader().await {
|
||||
Ok(ok) => ok,
|
||||
Err(err) => {
|
||||
tracing::error!(
|
||||
"Failed to query database for current leader: {err}. Aborting start-up ..."
|
||||
);
|
||||
std::process::exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
let leader_step_down_state = match leadership_status {
|
||||
LeadershipStatus::Candidate => {
|
||||
if let Some(ref leader) = leader {
|
||||
this.request_step_down(leader).await
|
||||
} else {
|
||||
tracing::info!(
|
||||
"No leader found to request step down from. Will build observed state."
|
||||
);
|
||||
None
|
||||
}
|
||||
}
|
||||
LeadershipStatus::Leader => None,
|
||||
LeadershipStatus::SteppedDown => unreachable!(),
|
||||
};
|
||||
|
||||
this.startup_reconcile(leader, leader_step_down_state, bg_compute_notify_result_tx)
|
||||
.await;
|
||||
|
||||
@@ -1626,7 +1644,7 @@ impl Service {
|
||||
// This Node is a mutable local copy: we will set it active so that we can use its
|
||||
// API client to reconcile with the node. The Node in [`Self::nodes`] will get updated
|
||||
// later.
|
||||
node.set_availability(NodeAvailability::Active(PageserverUtilization::full()));
|
||||
node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
|
||||
|
||||
let configs = match node
|
||||
.with_client_retries(
|
||||
@@ -2474,7 +2492,7 @@ impl Service {
|
||||
.await;
|
||||
|
||||
let node = {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let locked = self.inner.read().unwrap();
|
||||
// Just a sanity check to prevent misuse: the API expects that the tenant is fully
|
||||
// detached everywhere, and nothing writes to S3 storage. Here, we verify that,
|
||||
// but only at the start of the process, so it's really just to prevent operator
|
||||
@@ -2501,7 +2519,7 @@ impl Service {
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!("We observed attached={mode:?} tenant in node_id={node_id} shard with tenant_shard_id={shard_id}")));
|
||||
}
|
||||
}
|
||||
let scheduler = &mut locked.scheduler;
|
||||
let scheduler = &locked.scheduler;
|
||||
// Right now we only perform the operation on a single node without parallelization
|
||||
// TODO fan out the operation to multiple nodes for better performance
|
||||
let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?;
|
||||
@@ -4762,7 +4780,7 @@ impl Service {
|
||||
//
|
||||
// The transition we calculate here remains valid later in the function because we hold the op lock on the node:
|
||||
// nothing else can mutate its availability while we run.
|
||||
let availability_transition = if let Some(input_availability) = availability.as_ref() {
|
||||
let availability_transition = if let Some(input_availability) = availability {
|
||||
let (activate_node, availability_transition) = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let Some(node) = locked.nodes.get(&node_id) else {
|
||||
@@ -4798,8 +4816,8 @@ impl Service {
|
||||
));
|
||||
};
|
||||
|
||||
if let Some(availability) = availability.as_ref() {
|
||||
node.set_availability(availability.clone());
|
||||
if let Some(availability) = &availability {
|
||||
node.set_availability(*availability);
|
||||
}
|
||||
|
||||
if let Some(scheduling) = scheduling {
|
||||
@@ -6359,4 +6377,42 @@ impl Service {
|
||||
|
||||
global_observed
|
||||
}
|
||||
|
||||
/// Request step down from the currently registered leader in the database
|
||||
///
|
||||
/// If such an entry is persisted, the success path returns the observed
|
||||
/// state and details of the leader. Otherwise, None is returned indicating
|
||||
/// there is no leader currently.
|
||||
///
|
||||
/// On failures to query the database or step down error responses the process is killed
|
||||
/// and we rely on k8s to retry.
|
||||
async fn request_step_down(
|
||||
&self,
|
||||
leader: &ControllerPersistence,
|
||||
) -> Option<GlobalObservedState> {
|
||||
tracing::info!("Sending step down request to {leader:?}");
|
||||
|
||||
// TODO: jwt token
|
||||
let client = PeerClient::new(
|
||||
Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"),
|
||||
self.config.jwt_token.clone(),
|
||||
);
|
||||
let state = client.step_down(&self.cancel).await;
|
||||
match state {
|
||||
Ok(state) => Some(state),
|
||||
Err(err) => {
|
||||
// TODO: Make leaders periodically update a timestamp field in the
|
||||
// database and, if the leader is not reachable from the current instance,
|
||||
// but inferred as alive from the timestamp, abort start-up. This avoids
|
||||
// a potential scenario in which we have two controllers acting as leaders.
|
||||
tracing::error!(
|
||||
"Leader ({}) did not respond to step-down request: {}",
|
||||
leader.address,
|
||||
err
|
||||
);
|
||||
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -779,7 +779,7 @@ impl TenantShard {
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
|
||||
pub(crate) fn optimize_secondary(
|
||||
&self,
|
||||
scheduler: &mut Scheduler,
|
||||
scheduler: &Scheduler,
|
||||
schedule_context: &ScheduleContext,
|
||||
) -> Option<ScheduleOptimization> {
|
||||
if self.intent.secondary.is_empty() {
|
||||
@@ -1595,7 +1595,7 @@ pub(crate) mod tests {
|
||||
schedule_context.avoid(&shard_b.intent.all_pageservers());
|
||||
schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
|
||||
|
||||
let optimization_a = shard_a.optimize_secondary(&mut scheduler, &schedule_context);
|
||||
let optimization_a = shard_a.optimize_secondary(&scheduler, &schedule_context);
|
||||
|
||||
// Since there is a node with no locations available, the node with two locations for the
|
||||
// same tenant should generate an optimization to move one away
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use anyhow::Context;
|
||||
use aws_sdk_s3::Client;
|
||||
use pageserver::tenant::layer_map::LayerMap;
|
||||
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info, warn};
|
||||
use utils::generation::Generation;
|
||||
use utils::id::TimelineId;
|
||||
@@ -16,7 +16,7 @@ use futures_util::StreamExt;
|
||||
use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path};
|
||||
use pageserver::tenant::storage_layer::LayerName;
|
||||
use pageserver::tenant::IndexPart;
|
||||
use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath};
|
||||
use remote_storage::RemotePath;
|
||||
|
||||
pub(crate) struct TimelineAnalysis {
|
||||
/// Anomalies detected
|
||||
@@ -48,12 +48,13 @@ impl TimelineAnalysis {
|
||||
}
|
||||
|
||||
pub(crate) async fn branch_cleanup_and_check_errors(
|
||||
remote_client: &GenericRemoteStorage,
|
||||
s3_client: &Client,
|
||||
target: &RootTarget,
|
||||
id: &TenantShardTimelineId,
|
||||
tenant_objects: &mut TenantObjectListing,
|
||||
s3_active_branch: Option<&BranchData>,
|
||||
console_branch: Option<BranchData>,
|
||||
s3_data: Option<RemoteTimelineBlobData>,
|
||||
s3_data: Option<S3TimelineBlobData>,
|
||||
) -> TimelineAnalysis {
|
||||
let mut result = TimelineAnalysis::new();
|
||||
|
||||
@@ -77,9 +78,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(
|
||||
|
||||
match s3_data {
|
||||
Some(s3_data) => {
|
||||
result
|
||||
.garbage_keys
|
||||
.extend(s3_data.unknown_keys.into_iter().map(|k| k.key.to_string()));
|
||||
result.garbage_keys.extend(s3_data.unknown_keys);
|
||||
|
||||
match s3_data.blob_data {
|
||||
BlobDataParseResult::Parsed {
|
||||
@@ -144,13 +143,16 @@ pub(crate) async fn branch_cleanup_and_check_errors(
|
||||
|
||||
// HEAD request used here to address a race condition when an index was uploaded concurrently
|
||||
// with our scan. We check if the object is uploaded to S3 after taking the listing snapshot.
|
||||
let response = remote_client
|
||||
.head_object(&path, &CancellationToken::new())
|
||||
let response = s3_client
|
||||
.head_object()
|
||||
.bucket(target.bucket_name())
|
||||
.key(path.get_path().as_str())
|
||||
.send()
|
||||
.await;
|
||||
|
||||
if response.is_err() {
|
||||
// Object is not present.
|
||||
let is_l0 = LayerMap::is_l0(layer.key_range(), layer.is_delta());
|
||||
let is_l0 = LayerMap::is_l0(layer.key_range());
|
||||
|
||||
let msg = format!(
|
||||
"index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})",
|
||||
@@ -282,14 +284,14 @@ impl TenantObjectListing {
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct RemoteTimelineBlobData {
|
||||
pub(crate) struct S3TimelineBlobData {
|
||||
pub(crate) blob_data: BlobDataParseResult,
|
||||
|
||||
// Index objects that were not used when loading `blob_data`, e.g. those from old generations
|
||||
pub(crate) unused_index_keys: Vec<ListingObject>,
|
||||
pub(crate) unused_index_keys: Vec<String>,
|
||||
|
||||
// Objects whose keys were not recognized at all, i.e. not layer files, not indices
|
||||
pub(crate) unknown_keys: Vec<ListingObject>,
|
||||
pub(crate) unknown_keys: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -321,37 +323,31 @@ pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generati
|
||||
}
|
||||
|
||||
pub(crate) async fn list_timeline_blobs(
|
||||
remote_client: &GenericRemoteStorage,
|
||||
s3_client: &Client,
|
||||
id: TenantShardTimelineId,
|
||||
root_target: &RootTarget,
|
||||
) -> anyhow::Result<RemoteTimelineBlobData> {
|
||||
s3_root: &RootTarget,
|
||||
) -> anyhow::Result<S3TimelineBlobData> {
|
||||
let mut s3_layers = HashSet::new();
|
||||
|
||||
let mut errors = Vec::new();
|
||||
let mut unknown_keys = Vec::new();
|
||||
|
||||
let mut timeline_dir_target = root_target.timeline_root(&id);
|
||||
let mut timeline_dir_target = s3_root.timeline_root(&id);
|
||||
timeline_dir_target.delimiter = String::new();
|
||||
|
||||
let mut index_part_keys: Vec<ListingObject> = Vec::new();
|
||||
let mut index_part_keys: Vec<String> = Vec::new();
|
||||
let mut initdb_archive: bool = false;
|
||||
|
||||
let prefix_str = &timeline_dir_target
|
||||
.prefix_in_bucket
|
||||
.strip_prefix("/")
|
||||
.unwrap_or(&timeline_dir_target.prefix_in_bucket);
|
||||
|
||||
let mut stream = std::pin::pin!(stream_listing(remote_client, &timeline_dir_target));
|
||||
let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target));
|
||||
while let Some(obj) = stream.next().await {
|
||||
let (key, Some(obj)) = obj? else {
|
||||
panic!("ListingObject not specified");
|
||||
};
|
||||
let obj = obj?;
|
||||
let key = obj.key();
|
||||
|
||||
let blob_name = key.get_path().as_str().strip_prefix(prefix_str);
|
||||
let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket);
|
||||
match blob_name {
|
||||
Some(name) if name.starts_with("index_part.json") => {
|
||||
tracing::debug!("Index key {key}");
|
||||
index_part_keys.push(obj)
|
||||
index_part_keys.push(key.to_owned())
|
||||
}
|
||||
Some("initdb.tar.zst") => {
|
||||
tracing::debug!("initdb archive {key}");
|
||||
@@ -362,7 +358,7 @@ pub(crate) async fn list_timeline_blobs(
|
||||
}
|
||||
Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) {
|
||||
Ok((new_layer, gen)) => {
|
||||
tracing::debug!("Parsed layer key: {new_layer} {gen:?}");
|
||||
tracing::debug!("Parsed layer key: {} {:?}", new_layer, gen);
|
||||
s3_layers.insert((new_layer, gen));
|
||||
}
|
||||
Err(e) => {
|
||||
@@ -370,13 +366,13 @@ pub(crate) async fn list_timeline_blobs(
|
||||
errors.push(
|
||||
format!("S3 list response got an object with key {key} that is not a layer name: {e}"),
|
||||
);
|
||||
unknown_keys.push(obj);
|
||||
unknown_keys.push(key.to_string());
|
||||
}
|
||||
},
|
||||
None => {
|
||||
tracing::warn!("Unknown key {key}");
|
||||
tracing::warn!("Unknown key {}", key);
|
||||
errors.push(format!("S3 list response got an object with odd key {key}"));
|
||||
unknown_keys.push(obj);
|
||||
unknown_keys.push(key.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -385,7 +381,7 @@ pub(crate) async fn list_timeline_blobs(
|
||||
tracing::debug!(
|
||||
"Timeline is empty apart from initdb archive: expected post-deletion state."
|
||||
);
|
||||
return Ok(RemoteTimelineBlobData {
|
||||
return Ok(S3TimelineBlobData {
|
||||
blob_data: BlobDataParseResult::Relic,
|
||||
unused_index_keys: index_part_keys,
|
||||
unknown_keys: Vec::new(),
|
||||
@@ -399,13 +395,13 @@ pub(crate) async fn list_timeline_blobs(
|
||||
// Stripping the index key to the last part, because RemotePath doesn't
|
||||
// like absolute paths, and depending on prefix_in_bucket it's possible
|
||||
// for the keys we read back to start with a slash.
|
||||
let basename = key.key.get_path().as_str().rsplit_once('/').unwrap().1;
|
||||
let basename = key.rsplit_once('/').unwrap().1;
|
||||
parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (key, g))
|
||||
})
|
||||
.max_by_key(|i| i.1)
|
||||
.map(|(k, g)| (k.clone(), g))
|
||||
{
|
||||
Some((key, gen)) => (Some::<ListingObject>(key.to_owned()), gen),
|
||||
Some((key, gen)) => (Some(key), gen),
|
||||
None => {
|
||||
// Legacy/missing case: one or zero index parts, which did not have a generation
|
||||
(index_part_keys.pop(), Generation::none())
|
||||
@@ -420,14 +416,17 @@ pub(crate) async fn list_timeline_blobs(
|
||||
}
|
||||
|
||||
if let Some(index_part_object_key) = index_part_object.as_ref() {
|
||||
let index_part_bytes =
|
||||
download_object_with_retries(remote_client, &index_part_object_key.key)
|
||||
.await
|
||||
.context("index_part.json download")?;
|
||||
let index_part_bytes = download_object_with_retries(
|
||||
s3_client,
|
||||
&timeline_dir_target.bucket_name,
|
||||
index_part_object_key,
|
||||
)
|
||||
.await
|
||||
.context("index_part.json download")?;
|
||||
|
||||
match serde_json::from_slice(&index_part_bytes) {
|
||||
Ok(index_part) => {
|
||||
return Ok(RemoteTimelineBlobData {
|
||||
return Ok(S3TimelineBlobData {
|
||||
blob_data: BlobDataParseResult::Parsed {
|
||||
index_part: Box::new(index_part),
|
||||
index_part_generation,
|
||||
@@ -449,7 +448,7 @@ pub(crate) async fn list_timeline_blobs(
|
||||
);
|
||||
}
|
||||
|
||||
Ok(RemoteTimelineBlobData {
|
||||
Ok(S3TimelineBlobData {
|
||||
blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
|
||||
unused_index_keys: index_part_keys,
|
||||
unknown_keys,
|
||||
|
||||
@@ -6,7 +6,7 @@ use remote_storage::ListingMode;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::{
|
||||
checks::parse_layer_object_name, init_remote, metadata_stream::stream_tenants,
|
||||
checks::parse_layer_object_name, init_remote_generic, metadata_stream::stream_tenants_generic,
|
||||
stream_objects_with_retries, BucketConfig, NodeKind,
|
||||
};
|
||||
|
||||
@@ -50,8 +50,9 @@ pub async fn find_large_objects(
|
||||
ignore_deltas: bool,
|
||||
concurrency: usize,
|
||||
) -> anyhow::Result<LargeObjectListing> {
|
||||
let (remote_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
|
||||
let tenants = pin!(stream_tenants(&remote_client, &target));
|
||||
let (remote_client, target) =
|
||||
init_remote_generic(bucket_config.clone(), NodeKind::Pageserver).await?;
|
||||
let tenants = pin!(stream_tenants_generic(&remote_client, &target));
|
||||
|
||||
let objects_stream = tenants.map_ok(|tenant_shard_id| {
|
||||
let mut tenant_root = target.tenant_root(&tenant_shard_id);
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user