mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-16 01:42:55 +00:00
Merge branch 'main' into amasterov/regress-arm
This commit is contained in:
@@ -23,10 +23,30 @@ platforms = [
|
||||
]
|
||||
|
||||
[final-excludes]
|
||||
# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
|
||||
# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
|
||||
# from depending on workspace-hack because most of the dependencies are not used.
|
||||
workspace-members = ["vm_monitor"]
|
||||
workspace-members = [
|
||||
# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
|
||||
# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
|
||||
# from depending on workspace-hack because most of the dependencies are not used.
|
||||
"vm_monitor",
|
||||
# All of these exist in libs and are not usually built independently.
|
||||
# Putting workspace hack there adds a bottleneck for cargo builds.
|
||||
"compute_api",
|
||||
"consumption_metrics",
|
||||
"desim",
|
||||
"metrics",
|
||||
"pageserver_api",
|
||||
"postgres_backend",
|
||||
"postgres_connection",
|
||||
"postgres_ffi",
|
||||
"pq_proto",
|
||||
"remote_storage",
|
||||
"safekeeper_api",
|
||||
"tenant_size_model",
|
||||
"tracing-utils",
|
||||
"utils",
|
||||
"wal_craft",
|
||||
"walproposer",
|
||||
]
|
||||
|
||||
# Write out exact versions rather than a semver range. (Defaults to false.)
|
||||
# exact-versions = true
|
||||
|
||||
@@ -43,7 +43,7 @@ inputs:
|
||||
pg_version:
|
||||
description: 'Postgres version to use for tests'
|
||||
required: false
|
||||
default: 'v14'
|
||||
default: 'v16'
|
||||
benchmark_durations:
|
||||
description: 'benchmark durations JSON'
|
||||
required: false
|
||||
@@ -169,10 +169,8 @@ runs:
|
||||
EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
|
||||
fi
|
||||
|
||||
if [[ "${{ inputs.build_type }}" == "debug" ]]; then
|
||||
if [[ $BUILD_TYPE == "debug" && $RUNNER_ARCH == 'X64' ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
||||
elif [[ "${{ inputs.build_type }}" == "release" ]]; then
|
||||
cov_prefix=()
|
||||
else
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
@@ -48,6 +48,8 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
|
||||
15
.github/workflows/_build-and-test-locally.yml
vendored
15
.github/workflows/_build-and-test-locally.yml
vendored
@@ -94,11 +94,16 @@ jobs:
|
||||
# We run tests with addtional features, that are turned off by default (e.g. in release builds), see
|
||||
# corresponding Cargo.toml files for their descriptions.
|
||||
- name: Set env variables
|
||||
env:
|
||||
ARCH: ${{ inputs.arch }}
|
||||
run: |
|
||||
CARGO_FEATURES="--features testing"
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
|
||||
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
|
||||
CARGO_FLAGS="--locked"
|
||||
elif [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=""
|
||||
CARGO_FLAGS="--locked"
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=""
|
||||
CARGO_FLAGS="--locked --release"
|
||||
@@ -158,6 +163,8 @@ jobs:
|
||||
# Do install *before* running rust tests because they might recompile the
|
||||
# binaries with different features/flags.
|
||||
- name: Install rust binaries
|
||||
env:
|
||||
ARCH: ${{ inputs.arch }}
|
||||
run: |
|
||||
# Install target binaries
|
||||
mkdir -p /tmp/neon/bin/
|
||||
@@ -172,7 +179,7 @@ jobs:
|
||||
done
|
||||
|
||||
# Install test executables and write list of all binaries (for code coverage)
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
|
||||
# Keep bloated coverage data files away from the rest of the artifact
|
||||
mkdir -p /tmp/coverage/
|
||||
|
||||
@@ -243,8 +250,8 @@ jobs:
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
regress-tests:
|
||||
# Run test on x64 only
|
||||
if: inputs.arch == 'x64'
|
||||
# Don't run regression tests on debug arm64 builds
|
||||
if: inputs.build-type != 'debug' || inputs.arch != 'arm64'
|
||||
needs: [ build-neon ]
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
container:
|
||||
|
||||
13
.github/workflows/build_and_test.yml
vendored
13
.github/workflows/build_and_test.yml
vendored
@@ -198,7 +198,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
arch: [ x64 ]
|
||||
arch: [ x64, arm64 ]
|
||||
# Do not build or run tests in debug for release branches
|
||||
build-type: ${{ fromJson((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }}
|
||||
include:
|
||||
@@ -280,6 +280,7 @@ jobs:
|
||||
save_perf_report: ${{ github.ref_name == 'main' }}
|
||||
extra_params: --splits 5 --group ${{ matrix.pytest_split_group }}
|
||||
benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }}
|
||||
pg_version: v16
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
@@ -985,10 +986,10 @@ jobs:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
|
||||
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
|
||||
gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
|
||||
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
|
||||
-f deployPgSniRouter=false \
|
||||
-f deployProxy=false \
|
||||
-f deployStorage=true \
|
||||
@@ -998,14 +999,14 @@ jobs:
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
||||
-f deployPreprodRegion=true
|
||||
|
||||
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
|
||||
gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \
|
||||
-f deployStorage=true \
|
||||
-f deployStorageBroker=true \
|
||||
-f deployStorageController=true \
|
||||
-f branch=main \
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
|
||||
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
|
||||
-f deployPgSniRouter=true \
|
||||
-f deployProxy=true \
|
||||
-f deployStorage=false \
|
||||
@@ -1015,7 +1016,7 @@ jobs:
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
||||
-f deployPreprodRegion=true
|
||||
|
||||
gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
|
||||
gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \
|
||||
-f deployPgSniRouter=true \
|
||||
-f deployProxy=true \
|
||||
-f branch=main \
|
||||
|
||||
27
Cargo.lock
generated
27
Cargo.lock
generated
@@ -1208,7 +1208,6 @@ dependencies = [
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1321,7 +1320,6 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_with",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1670,14 +1668,13 @@ dependencies = [
|
||||
"smallvec",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "diesel"
|
||||
version = "2.2.1"
|
||||
version = "2.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
|
||||
checksum = "65e13bab2796f412722112327f3e575601a3e9cdcbe426f0d30dbf43f3f5dc71"
|
||||
dependencies = [
|
||||
"bitflags 2.4.1",
|
||||
"byteorder",
|
||||
@@ -3147,7 +3144,6 @@ dependencies = [
|
||||
"rand 0.8.5",
|
||||
"rand_distr",
|
||||
"twox-hash",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3791,7 +3787,6 @@ dependencies = [
|
||||
"strum_macros",
|
||||
"thiserror",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4193,7 +4188,6 @@ dependencies = [
|
||||
"tokio-rustls 0.25.0",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4206,7 +4200,6 @@ dependencies = [
|
||||
"postgres",
|
||||
"tokio-postgres",
|
||||
"url",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4229,7 +4222,6 @@ dependencies = [
|
||||
"serde",
|
||||
"thiserror",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4267,7 +4259,6 @@ dependencies = [
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4832,7 +4823,6 @@ dependencies = [
|
||||
"toml_edit 0.19.10",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5357,7 +5347,6 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_with",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5601,11 +5590,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "serde_json"
|
||||
version = "1.0.96"
|
||||
version = "1.0.125"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1"
|
||||
checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed"
|
||||
dependencies = [
|
||||
"itoa",
|
||||
"memchr",
|
||||
"ryu",
|
||||
"serde",
|
||||
]
|
||||
@@ -6193,7 +6183,6 @@ dependencies = [
|
||||
"anyhow",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6794,7 +6783,6 @@ dependencies = [
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7012,7 +7000,6 @@ dependencies = [
|
||||
"url",
|
||||
"uuid",
|
||||
"walkdir",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7091,7 +7078,6 @@ dependencies = [
|
||||
"postgres_ffi",
|
||||
"regex",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7112,7 +7098,6 @@ dependencies = [
|
||||
"bindgen",
|
||||
"postgres_ffi",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7669,8 +7654,6 @@ dependencies = [
|
||||
"tokio",
|
||||
"tokio-rustls 0.24.0",
|
||||
"tokio-util",
|
||||
"toml_datetime",
|
||||
"toml_edit 0.19.10",
|
||||
"tonic",
|
||||
"tower",
|
||||
"tracing",
|
||||
|
||||
@@ -113,7 +113,7 @@ md5 = "0.7.0"
|
||||
measured = { version = "0.0.22", features=["lasso"] }
|
||||
measured-process = { version = "0.0.22" }
|
||||
memoffset = "0.8"
|
||||
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
|
||||
nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
|
||||
notify = "6.0.0"
|
||||
num_cpus = "1.15"
|
||||
num-traits = "0.2.15"
|
||||
|
||||
@@ -945,7 +945,7 @@ COPY --from=hll-pg-build /hll.tar.gz /ext-src
|
||||
COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
|
||||
#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
|
||||
COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
|
||||
COPY patches/pg_hintplan.patch /ext-src
|
||||
COPY patches/pg_hint_plan.patch /ext-src
|
||||
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
|
||||
COPY patches/pg_cron.patch /ext-src
|
||||
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
|
||||
@@ -967,7 +967,7 @@ RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
|
||||
RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
|
||||
# cmake is required for the h3 test
|
||||
RUN apt-get update && apt-get install -y cmake
|
||||
RUN patch -p1 < /ext-src/pg_hintplan.patch
|
||||
RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch
|
||||
COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
|
||||
RUN patch -p1 </ext-src/pg_anon.patch
|
||||
RUN patch -p1 </ext-src/pg_cron.patch
|
||||
|
||||
@@ -126,7 +126,7 @@ make -j`sysctl -n hw.logicalcpu` -s
|
||||
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.
|
||||
|
||||
To run the integration tests or Python scripts (not required to use the code), install
|
||||
Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.
|
||||
Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory.
|
||||
|
||||
|
||||
#### Running neon database
|
||||
@@ -262,7 +262,7 @@ By default, this runs both debug and release modes, and all supported postgres v
|
||||
testing locally, it is convenient to run just one set of permutations, like this:
|
||||
|
||||
```sh
|
||||
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
|
||||
DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest
|
||||
```
|
||||
|
||||
## Flamegraphs
|
||||
|
||||
@@ -54,7 +54,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
|
||||
const DEFAULT_BRANCH_NAME: &str = "main";
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
const DEFAULT_PG_VERSION: &str = "15";
|
||||
const DEFAULT_PG_VERSION: &str = "16";
|
||||
|
||||
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
|
||||
|
||||
|
||||
@@ -27,7 +27,7 @@ use crate::pageserver::PageServerNode;
|
||||
use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
|
||||
use crate::safekeeper::SafekeeperNode;
|
||||
|
||||
pub const DEFAULT_PG_VERSION: u32 = 15;
|
||||
pub const DEFAULT_PG_VERSION: u32 = 16;
|
||||
|
||||
//
|
||||
// This data structures represents neon_local CLI config
|
||||
|
||||
@@ -217,7 +217,7 @@ impl StorageController {
|
||||
Ok(exitcode.success())
|
||||
}
|
||||
|
||||
/// Create our database if it doesn't exist, and run migrations.
|
||||
/// Create our database if it doesn't exist
|
||||
///
|
||||
/// This function is equivalent to the `diesel setup` command in the diesel CLI. We implement
|
||||
/// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
|
||||
@@ -382,7 +382,6 @@ impl StorageController {
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Run migrations on every startup, in case something changed.
|
||||
self.setup_database(postgres_port).await?;
|
||||
}
|
||||
|
||||
@@ -454,6 +453,11 @@ impl StorageController {
|
||||
let jwt_token =
|
||||
encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
|
||||
args.push(format!("--jwt-token={jwt_token}"));
|
||||
|
||||
let peer_claims = Claims::new(None, Scope::Admin);
|
||||
let peer_jwt_token = encode_from_key_file(&peer_claims, private_key)
|
||||
.expect("failed to generate jwt token");
|
||||
args.push(format!("--peer-jwt-token={peer_jwt_token}"));
|
||||
}
|
||||
|
||||
if let Some(public_key) = &self.public_key {
|
||||
|
||||
@@ -147,9 +147,9 @@ enum Command {
|
||||
#[arg(long)]
|
||||
threshold: humantime::Duration,
|
||||
},
|
||||
// Drain a set of specified pageservers by moving the primary attachments to pageservers
|
||||
// Migrate away from a set of specified pageservers by moving the primary attachments to pageservers
|
||||
// outside of the specified set.
|
||||
Drain {
|
||||
BulkMigrate {
|
||||
// Set of pageserver node ids to drain.
|
||||
#[arg(long)]
|
||||
nodes: Vec<NodeId>,
|
||||
@@ -163,6 +163,34 @@ enum Command {
|
||||
#[arg(long)]
|
||||
dry_run: Option<bool>,
|
||||
},
|
||||
/// Start draining the specified pageserver.
|
||||
/// The drain is complete when the schedulling policy returns to active.
|
||||
StartDrain {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
},
|
||||
/// Cancel draining the specified pageserver and wait for `timeout`
|
||||
/// for the operation to be canceled. May be retried.
|
||||
CancelDrain {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
#[arg(long)]
|
||||
timeout: humantime::Duration,
|
||||
},
|
||||
/// Start filling the specified pageserver.
|
||||
/// The drain is complete when the schedulling policy returns to active.
|
||||
StartFill {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
},
|
||||
/// Cancel filling the specified pageserver and wait for `timeout`
|
||||
/// for the operation to be canceled. May be retried.
|
||||
CancelFill {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
#[arg(long)]
|
||||
timeout: humantime::Duration,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
@@ -249,6 +277,34 @@ impl FromStr for NodeAvailabilityArg {
|
||||
}
|
||||
}
|
||||
|
||||
async fn wait_for_scheduling_policy<F>(
|
||||
client: Client,
|
||||
node_id: NodeId,
|
||||
timeout: Duration,
|
||||
f: F,
|
||||
) -> anyhow::Result<NodeSchedulingPolicy>
|
||||
where
|
||||
F: Fn(NodeSchedulingPolicy) -> bool,
|
||||
{
|
||||
let waiter = tokio::time::timeout(timeout, async move {
|
||||
loop {
|
||||
let node = client
|
||||
.dispatch::<(), NodeDescribeResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/node/{node_id}"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
if f(node.scheduling) {
|
||||
return Ok::<NodeSchedulingPolicy, mgmt_api::Error>(node.scheduling);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
Ok(waiter.await??)
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let cli = Cli::parse();
|
||||
@@ -628,7 +684,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
Command::Drain {
|
||||
Command::BulkMigrate {
|
||||
nodes,
|
||||
concurrency,
|
||||
max_shards,
|
||||
@@ -657,7 +713,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
|
||||
if nodes.len() != node_to_drain_descs.len() {
|
||||
anyhow::bail!("Drain requested for node which doesn't exist.")
|
||||
anyhow::bail!("Bulk migration requested away from node which doesn't exist.")
|
||||
}
|
||||
|
||||
node_to_fill_descs.retain(|desc| {
|
||||
@@ -669,7 +725,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
});
|
||||
|
||||
if node_to_fill_descs.is_empty() {
|
||||
anyhow::bail!("There are no nodes to drain to")
|
||||
anyhow::bail!("There are no nodes to migrate to")
|
||||
}
|
||||
|
||||
// Set the node scheduling policy to draining for the nodes which
|
||||
@@ -690,7 +746,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
.await?;
|
||||
}
|
||||
|
||||
// Perform the drain: move each tenant shard scheduled on a node to
|
||||
// Perform the migration: move each tenant shard scheduled on a node to
|
||||
// be drained to a node which is being filled. A simple round robin
|
||||
// strategy is used to pick the new node.
|
||||
let tenants = storcon_client
|
||||
@@ -703,13 +759,13 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let mut selected_node_idx = 0;
|
||||
|
||||
struct DrainMove {
|
||||
struct MigrationMove {
|
||||
tenant_shard_id: TenantShardId,
|
||||
from: NodeId,
|
||||
to: NodeId,
|
||||
}
|
||||
|
||||
let mut moves: Vec<DrainMove> = Vec::new();
|
||||
let mut moves: Vec<MigrationMove> = Vec::new();
|
||||
|
||||
let shards = tenants
|
||||
.into_iter()
|
||||
@@ -739,7 +795,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
continue;
|
||||
}
|
||||
|
||||
moves.push(DrainMove {
|
||||
moves.push(MigrationMove {
|
||||
tenant_shard_id: shard.tenant_shard_id,
|
||||
from: shard
|
||||
.node_attached
|
||||
@@ -816,6 +872,67 @@ async fn main() -> anyhow::Result<()> {
|
||||
failure
|
||||
);
|
||||
}
|
||||
Command::StartDrain { node_id } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
Method::PUT,
|
||||
format!("control/v1/node/{node_id}/drain"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
println!("Drain started for {node_id}");
|
||||
}
|
||||
Command::CancelDrain { node_id, timeout } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
Method::DELETE,
|
||||
format!("control/v1/node/{node_id}/drain"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
|
||||
|
||||
let final_policy =
|
||||
wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
|
||||
use NodeSchedulingPolicy::*;
|
||||
matches!(sched, Active | PauseForRestart)
|
||||
})
|
||||
.await?;
|
||||
|
||||
println!(
|
||||
"Drain was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
|
||||
);
|
||||
}
|
||||
Command::StartFill { node_id } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(Method::PUT, format!("control/v1/node/{node_id}/fill"), None)
|
||||
.await?;
|
||||
|
||||
println!("Fill started for {node_id}");
|
||||
}
|
||||
Command::CancelFill { node_id, timeout } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
Method::DELETE,
|
||||
format!("control/v1/node/{node_id}/fill"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
|
||||
|
||||
let final_policy =
|
||||
wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
|
||||
use NodeSchedulingPolicy::*;
|
||||
matches!(sched, Active)
|
||||
})
|
||||
.await?;
|
||||
|
||||
println!(
|
||||
"Fill was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -3,7 +3,7 @@ set -x
|
||||
|
||||
cd /ext-src || exit 2
|
||||
FAILED=
|
||||
LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
|
||||
LIST=$( (echo -e "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
|
||||
for d in ${LIST}
|
||||
do
|
||||
[ -d "${d}" ] || continue
|
||||
|
||||
@@ -14,7 +14,7 @@ picked tenant (which requested on-demand activation) for around 30 seconds
|
||||
during the restart at 2024-04-03 16:37 UTC.
|
||||
|
||||
Note that lots of shutdowns on loaded pageservers do not finish within the
|
||||
[10 second systemd enforced timeout](https://github.com/neondatabase/aws/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
|
||||
[10 second systemd enforced timeout](https://github.com/neondatabase/infra/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
|
||||
and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse.
|
||||
|
||||
This problem is not yet very acutely felt in storage controller managed pageservers since
|
||||
|
||||
@@ -21,30 +21,21 @@ _Example: 15.4 is the new minor version to upgrade to from 15.3._
|
||||
1. Create a new branch based on the stable branch you are updating.
|
||||
|
||||
```shell
|
||||
git checkout -b my-branch REL_15_STABLE_neon
|
||||
git checkout -b my-branch-15 REL_15_STABLE_neon
|
||||
```
|
||||
|
||||
1. Tag the last commit on the stable branch you are updating.
|
||||
1. Find the upstream release tags you're looking for. They are of the form `REL_X_Y`.
|
||||
|
||||
```shell
|
||||
git tag REL_15_3_neon
|
||||
```
|
||||
|
||||
1. Push the new tag to the Neon Postgres repository.
|
||||
|
||||
```shell
|
||||
git push origin REL_15_3_neon
|
||||
```
|
||||
|
||||
1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
|
||||
|
||||
1. Rebase the branch you created on the tag and resolve any conflicts.
|
||||
1. Merge the upstream tag into the branch you created on the tag and resolve any conflicts.
|
||||
|
||||
```shell
|
||||
git fetch upstream REL_15_4
|
||||
git rebase REL_15_4
|
||||
git merge REL_15_4
|
||||
```
|
||||
|
||||
In the commit message of the merge commit, mention if there were
|
||||
any non-trivial conflicts or other issues.
|
||||
|
||||
1. Run the Postgres test suite to make sure our commits have not affected
|
||||
Postgres in a negative way.
|
||||
|
||||
@@ -57,7 +48,7 @@ Postgres in a negative way.
|
||||
1. Push your branch to the Neon Postgres repository.
|
||||
|
||||
```shell
|
||||
git push origin my-branch
|
||||
git push origin my-branch-15
|
||||
```
|
||||
|
||||
1. Clone the Neon repository if you have not done so already.
|
||||
@@ -74,7 +65,7 @@ branch.
|
||||
1. Update the Git submodule.
|
||||
|
||||
```shell
|
||||
git submodule set-branch --branch my-branch vendor/postgres-v15
|
||||
git submodule set-branch --branch my-branch-15 vendor/postgres-v15
|
||||
git submodule update --remote vendor/postgres-v15
|
||||
```
|
||||
|
||||
@@ -89,14 +80,12 @@ minor Postgres release.
|
||||
|
||||
1. Create a pull request, and wait for CI to go green.
|
||||
|
||||
1. Force push the rebased Postgres branches into the Neon Postgres repository.
|
||||
1. Push the Postgres branches with the merge commits into the Neon Postgres repository.
|
||||
|
||||
```shell
|
||||
git push --force origin my-branch:REL_15_STABLE_neon
|
||||
git push origin my-branch-15:REL_15_STABLE_neon
|
||||
```
|
||||
|
||||
It may require disabling various branch protections.
|
||||
|
||||
1. Update your Neon PR to point at the branches.
|
||||
|
||||
```shell
|
||||
|
||||
@@ -14,5 +14,3 @@ regex.workspace = true
|
||||
|
||||
utils = { path = "../utils" }
|
||||
remote_storage = { version = "0.1", path = "../remote_storage/" }
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -6,10 +6,8 @@ license = "Apache-2.0"
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
chrono.workspace = true
|
||||
chrono = { workspace = true, features = ["serde"] }
|
||||
rand.workspace = true
|
||||
serde.workspace = true
|
||||
serde_with.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -14,5 +14,3 @@ parking_lot.workspace = true
|
||||
hex.workspace = true
|
||||
scopeguard.workspace = true
|
||||
smallvec = { workspace = true, features = ["write"] }
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -12,8 +12,6 @@ chrono.workspace = true
|
||||
twox-hash.workspace = true
|
||||
measured.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[target.'cfg(target_os = "linux")'.dependencies]
|
||||
procfs.workspace = true
|
||||
measured-process.workspace = true
|
||||
|
||||
@@ -21,11 +21,9 @@ hex.workspace = true
|
||||
humantime.workspace = true
|
||||
thiserror.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
chrono.workspace = true
|
||||
chrono = { workspace = true, features = ["serde"] }
|
||||
itertools.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
bincode.workspace = true
|
||||
rand.workspace = true
|
||||
|
||||
@@ -8,6 +8,7 @@ use std::time::{Duration, Instant};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
use crate::models::PageserverUtilization;
|
||||
use crate::{
|
||||
models::{ShardParameters, TenantConfig},
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
@@ -140,23 +141,11 @@ pub struct TenantShardMigrateRequest {
|
||||
pub node_id: NodeId,
|
||||
}
|
||||
|
||||
/// Utilisation score indicating how good a candidate a pageserver
|
||||
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
|
||||
/// Lower values are better.
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
|
||||
pub struct UtilizationScore(pub u64);
|
||||
|
||||
impl UtilizationScore {
|
||||
pub fn worst() -> Self {
|
||||
UtilizationScore(u64::MAX)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Clone, Copy, Debug)]
|
||||
#[derive(Serialize, Clone, Debug)]
|
||||
#[serde(into = "NodeAvailabilityWrapper")]
|
||||
pub enum NodeAvailability {
|
||||
// Normal, happy state
|
||||
Active(UtilizationScore),
|
||||
Active(PageserverUtilization),
|
||||
// Node is warming up, but we expect it to become available soon. Covers
|
||||
// the time span between the re-attach response being composed on the storage controller
|
||||
// and the first successful heartbeat after the processing of the re-attach response
|
||||
@@ -195,7 +184,9 @@ impl From<NodeAvailabilityWrapper> for NodeAvailability {
|
||||
match val {
|
||||
// Assume the worst utilisation score to begin with. It will later be updated by
|
||||
// the heartbeats.
|
||||
NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
|
||||
NodeAvailabilityWrapper::Active => {
|
||||
NodeAvailability::Active(PageserverUtilization::full())
|
||||
}
|
||||
NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()),
|
||||
NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
|
||||
}
|
||||
|
||||
@@ -236,6 +236,15 @@ impl Key {
|
||||
field5: u8::MAX,
|
||||
field6: u32::MAX,
|
||||
};
|
||||
/// A key slightly smaller than [`Key::MAX`] for use in layer key ranges to avoid them to be confused with L0 layers
|
||||
pub const NON_L0_MAX: Key = Key {
|
||||
field1: u8::MAX,
|
||||
field2: u32::MAX,
|
||||
field3: u32::MAX,
|
||||
field4: u32::MAX,
|
||||
field5: u8::MAX,
|
||||
field6: u32::MAX - 1,
|
||||
};
|
||||
|
||||
pub fn from_hex(s: &str) -> Result<Self> {
|
||||
if s.len() != 36 {
|
||||
|
||||
@@ -348,7 +348,7 @@ impl AuxFilePolicy {
|
||||
|
||||
/// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
|
||||
pub fn default_tenant_config() -> Self {
|
||||
Self::V1
|
||||
Self::V2
|
||||
}
|
||||
}
|
||||
|
||||
@@ -718,6 +718,7 @@ pub struct TimelineInfo {
|
||||
pub pg_version: u32,
|
||||
|
||||
pub state: TimelineState,
|
||||
pub is_archived: bool,
|
||||
|
||||
pub walreceiver_status: String,
|
||||
|
||||
@@ -1062,7 +1063,7 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
}
|
||||
}
|
||||
|
||||
// In the V2 protocol version, a GetPage request contains two LSN values:
|
||||
// A GetPage request contains two LSN values:
|
||||
//
|
||||
// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means
|
||||
// "get the latest version present". It's used by the primary server, which knows that no one else
|
||||
@@ -1075,7 +1076,7 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
|
||||
// request without waiting for 'request_lsn' to arrive.
|
||||
//
|
||||
// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
|
||||
// The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
|
||||
// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
|
||||
// 'latest' was set to true. The V2 interface was added because there was no correct way for a
|
||||
// standby to request a page at a particular non-latest LSN, and also include the
|
||||
@@ -1083,15 +1084,11 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
// request, if the standby knows that the page hasn't been modified since, and risk getting an error
|
||||
// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
|
||||
// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
|
||||
// interface allows sending both LSNs, and let the pageserver do the right thing. There is no
|
||||
// interface allows sending both LSNs, and let the pageserver do the right thing. There was no
|
||||
// difference in the responses between V1 and V2.
|
||||
//
|
||||
// The Request structs below reflect the V2 interface. If V1 is used, the parse function
|
||||
// maps the old format requests to the new format.
|
||||
//
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum PagestreamProtocolVersion {
|
||||
V1,
|
||||
V2,
|
||||
}
|
||||
|
||||
@@ -1230,36 +1227,17 @@ impl PagestreamFeMessage {
|
||||
bytes.into()
|
||||
}
|
||||
|
||||
pub fn parse<R: std::io::Read>(
|
||||
body: &mut R,
|
||||
protocol_version: PagestreamProtocolVersion,
|
||||
) -> anyhow::Result<PagestreamFeMessage> {
|
||||
pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
|
||||
// these correspond to the NeonMessageTag enum in pagestore_client.h
|
||||
//
|
||||
// TODO: consider using protobuf or serde bincode for less error prone
|
||||
// serialization.
|
||||
let msg_tag = body.read_u8()?;
|
||||
|
||||
let (request_lsn, not_modified_since) = match protocol_version {
|
||||
PagestreamProtocolVersion::V2 => (
|
||||
Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
),
|
||||
PagestreamProtocolVersion::V1 => {
|
||||
// In the old protocol, each message starts with a boolean 'latest' flag,
|
||||
// followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and
|
||||
// 'not_modified_since', used in the new protocol version.
|
||||
let latest = body.read_u8()? != 0;
|
||||
let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
|
||||
if latest {
|
||||
(Lsn::MAX, request_lsn) // get latest version
|
||||
} else {
|
||||
(request_lsn, request_lsn) // get version at specified LSN
|
||||
}
|
||||
}
|
||||
};
|
||||
// these two fields are the same for every request type
|
||||
let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn::from(body.read_u64::<BigEndian>()?);
|
||||
|
||||
// The rest of the messages are the same between V1 and V2
|
||||
match msg_tag {
|
||||
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
||||
request_lsn,
|
||||
@@ -1467,9 +1445,7 @@ mod tests {
|
||||
];
|
||||
for msg in messages {
|
||||
let bytes = msg.serialize();
|
||||
let reconstructed =
|
||||
PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2)
|
||||
.unwrap();
|
||||
let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
|
||||
assert!(msg == reconstructed);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -38,7 +38,7 @@ pub struct PageserverUtilization {
|
||||
pub max_shard_count: u32,
|
||||
|
||||
/// Cached result of [`Self::score`]
|
||||
pub utilization_score: u64,
|
||||
pub utilization_score: Option<u64>,
|
||||
|
||||
/// When was this snapshot captured, pageserver local time.
|
||||
///
|
||||
@@ -50,6 +50,8 @@ fn unity_percent() -> Percent {
|
||||
Percent::new(0).unwrap()
|
||||
}
|
||||
|
||||
pub type RawScore = u64;
|
||||
|
||||
impl PageserverUtilization {
|
||||
const UTILIZATION_FULL: u64 = 1000000;
|
||||
|
||||
@@ -62,7 +64,7 @@ impl PageserverUtilization {
|
||||
/// - Negative values are forbidden
|
||||
/// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to
|
||||
/// layer eviction.
|
||||
pub fn score(&self) -> u64 {
|
||||
pub fn score(&self) -> RawScore {
|
||||
let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes)
|
||||
* self.disk_usable_pct.get() as u64)
|
||||
/ 100;
|
||||
@@ -74,8 +76,30 @@ impl PageserverUtilization {
|
||||
std::cmp::max(disk_utilization_score, shard_utilization_score)
|
||||
}
|
||||
|
||||
pub fn refresh_score(&mut self) {
|
||||
self.utilization_score = self.score();
|
||||
pub fn cached_score(&mut self) -> RawScore {
|
||||
match self.utilization_score {
|
||||
None => {
|
||||
let s = self.score();
|
||||
self.utilization_score = Some(s);
|
||||
s
|
||||
}
|
||||
Some(s) => s,
|
||||
}
|
||||
}
|
||||
|
||||
/// If a node is currently hosting more work than it can comfortably handle. This does not indicate that
|
||||
/// it will fail, but it is a strong signal that more work should not be added unless there is no alternative.
|
||||
pub fn is_overloaded(score: RawScore) -> bool {
|
||||
score >= Self::UTILIZATION_FULL
|
||||
}
|
||||
|
||||
pub fn adjust_shard_count_max(&mut self, shard_count: u32) {
|
||||
if self.shard_count < shard_count {
|
||||
self.shard_count = shard_count;
|
||||
|
||||
// Dirty cache: this will be calculated next time someone retrives the score
|
||||
self.utilization_score = None;
|
||||
}
|
||||
}
|
||||
|
||||
/// A utilization structure that has a full utilization score: use this as a placeholder when
|
||||
@@ -88,7 +112,38 @@ impl PageserverUtilization {
|
||||
disk_usable_pct: Percent::new(100).unwrap(),
|
||||
shard_count: 1,
|
||||
max_shard_count: 1,
|
||||
utilization_score: Self::UTILIZATION_FULL,
|
||||
utilization_score: Some(Self::UTILIZATION_FULL),
|
||||
captured_at: serde_system_time::SystemTime(SystemTime::now()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Test helper
|
||||
pub mod test_utilization {
|
||||
use super::PageserverUtilization;
|
||||
use std::time::SystemTime;
|
||||
use utils::{
|
||||
serde_percent::Percent,
|
||||
serde_system_time::{self},
|
||||
};
|
||||
|
||||
// Parameters of the imaginary node used for test utilization instances
|
||||
const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024;
|
||||
const TEST_SHARDS_MAX: u32 = 1000;
|
||||
|
||||
/// Unit test helper. Unconditionally compiled because cfg(test) doesn't carry across crates. Do
|
||||
/// not abuse this function from non-test code.
|
||||
///
|
||||
/// Emulates a node with a 1000 shard limit and a 1TB disk.
|
||||
pub fn simple(shard_count: u32, disk_wanted_bytes: u64) -> PageserverUtilization {
|
||||
PageserverUtilization {
|
||||
disk_usage_bytes: disk_wanted_bytes,
|
||||
free_space_bytes: TEST_DISK_SIZE - std::cmp::min(disk_wanted_bytes, TEST_DISK_SIZE),
|
||||
disk_wanted_bytes,
|
||||
disk_usable_pct: Percent::new(100).unwrap(),
|
||||
shard_count,
|
||||
max_shard_count: TEST_SHARDS_MAX,
|
||||
utilization_score: None,
|
||||
captured_at: serde_system_time::SystemTime(SystemTime::now()),
|
||||
}
|
||||
}
|
||||
@@ -120,7 +175,7 @@ mod tests {
|
||||
disk_usage_bytes: u64::MAX,
|
||||
free_space_bytes: 0,
|
||||
disk_wanted_bytes: u64::MAX,
|
||||
utilization_score: 13,
|
||||
utilization_score: Some(13),
|
||||
disk_usable_pct: Percent::new(90).unwrap(),
|
||||
shard_count: 100,
|
||||
max_shard_count: 200,
|
||||
|
||||
@@ -18,7 +18,6 @@ tokio-rustls.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
pq_proto.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
once_cell.workspace = true
|
||||
|
||||
@@ -11,7 +11,5 @@ postgres.workspace = true
|
||||
tokio-postgres.workspace = true
|
||||
url.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
once_cell.workspace = true
|
||||
|
||||
@@ -19,8 +19,6 @@ thiserror.workspace = true
|
||||
serde.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
env_logger.workspace = true
|
||||
postgres.workspace = true
|
||||
|
||||
@@ -14,8 +14,6 @@ postgres.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
camino-tempfile.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
regex.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
@@ -11,9 +11,7 @@ itertools.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
rand.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio = { workspace = true, features = ["io-util"] }
|
||||
tracing.workspace = true
|
||||
thiserror.workspace = true
|
||||
serde.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -32,7 +32,7 @@ scopeguard.workspace = true
|
||||
metrics.workspace = true
|
||||
utils.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
azure_core.workspace = true
|
||||
azure_identity.workspace = true
|
||||
azure_storage.workspace = true
|
||||
@@ -46,3 +46,4 @@ sync_wrapper = { workspace = true, features = ["futures"] }
|
||||
camino-tempfile.workspace = true
|
||||
test-context.workspace = true
|
||||
rand.workspace = true
|
||||
tokio = { workspace = true, features = ["test-util"] }
|
||||
|
||||
@@ -383,6 +383,48 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
}
|
||||
}
|
||||
|
||||
async fn head_object(
|
||||
&self,
|
||||
key: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<ListingObject, DownloadError> {
|
||||
let kind = RequestKind::Head;
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(key));
|
||||
let properties_future = blob_client.get_properties().into_future();
|
||||
|
||||
let properties_future = tokio::time::timeout(self.timeout, properties_future);
|
||||
|
||||
let res = tokio::select! {
|
||||
res = properties_future => res,
|
||||
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
|
||||
};
|
||||
|
||||
if let Ok(inner) = &res {
|
||||
// do not incl. timeouts as errors in metrics but cancellations
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, inner, started_at);
|
||||
}
|
||||
|
||||
let data = match res {
|
||||
Ok(Ok(data)) => Ok(data),
|
||||
Ok(Err(sdk)) => Err(to_download_error(sdk)),
|
||||
Err(_timeout) => Err(DownloadError::Timeout),
|
||||
}?;
|
||||
|
||||
let properties = data.blob.properties;
|
||||
Ok(ListingObject {
|
||||
key: key.to_owned(),
|
||||
last_modified: SystemTime::from(properties.last_modified),
|
||||
size: properties.content_length,
|
||||
})
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
|
||||
@@ -150,7 +150,7 @@ pub enum ListingMode {
|
||||
NoDelimiter,
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, Debug)]
|
||||
#[derive(PartialEq, Eq, Debug, Clone)]
|
||||
pub struct ListingObject {
|
||||
pub key: RemotePath,
|
||||
pub last_modified: SystemTime,
|
||||
@@ -215,6 +215,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
Ok(combined)
|
||||
}
|
||||
|
||||
/// Obtain metadata information about an object.
|
||||
async fn head_object(
|
||||
&self,
|
||||
key: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<ListingObject, DownloadError>;
|
||||
|
||||
/// Streams the local file contents into remote into the remote storage entry.
|
||||
///
|
||||
/// If the operation fails because of timeout or cancellation, the root cause of the error will be
|
||||
@@ -363,6 +370,20 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
}
|
||||
}
|
||||
|
||||
// See [`RemoteStorage::head_object`].
|
||||
pub async fn head_object(
|
||||
&self,
|
||||
key: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<ListingObject, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.head_object(key, cancel).await,
|
||||
Self::AwsS3(s) => s.head_object(key, cancel).await,
|
||||
Self::AzureBlob(s) => s.head_object(key, cancel).await,
|
||||
Self::Unreliable(s) => s.head_object(key, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`RemoteStorage::upload`]
|
||||
pub async fn upload(
|
||||
&self,
|
||||
@@ -598,6 +619,7 @@ impl ConcurrencyLimiter {
|
||||
RequestKind::Delete => &self.write,
|
||||
RequestKind::Copy => &self.write,
|
||||
RequestKind::TimeTravel => &self.write,
|
||||
RequestKind::Head => &self.read,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -445,6 +445,20 @@ impl RemoteStorage for LocalFs {
|
||||
}
|
||||
}
|
||||
|
||||
async fn head_object(
|
||||
&self,
|
||||
key: &RemotePath,
|
||||
_cancel: &CancellationToken,
|
||||
) -> Result<ListingObject, DownloadError> {
|
||||
let target_file_path = key.with_base(&self.storage_root);
|
||||
let metadata = file_metadata(&target_file_path).await?;
|
||||
Ok(ListingObject {
|
||||
key: key.clone(),
|
||||
last_modified: metadata.modified()?,
|
||||
size: metadata.len(),
|
||||
})
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
|
||||
|
||||
@@ -13,6 +13,7 @@ pub(crate) enum RequestKind {
|
||||
List = 3,
|
||||
Copy = 4,
|
||||
TimeTravel = 5,
|
||||
Head = 6,
|
||||
}
|
||||
|
||||
use scopeguard::ScopeGuard;
|
||||
@@ -27,6 +28,7 @@ impl RequestKind {
|
||||
List => "list_objects",
|
||||
Copy => "copy_object",
|
||||
TimeTravel => "time_travel_recover",
|
||||
Head => "head_object",
|
||||
}
|
||||
}
|
||||
const fn as_index(&self) -> usize {
|
||||
@@ -34,7 +36,8 @@ impl RequestKind {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct RequestTyped<C>([C; 6]);
|
||||
const REQUEST_KIND_COUNT: usize = 7;
|
||||
pub(crate) struct RequestTyped<C>([C; REQUEST_KIND_COUNT]);
|
||||
|
||||
impl<C> RequestTyped<C> {
|
||||
pub(crate) fn get(&self, kind: RequestKind) -> &C {
|
||||
@@ -43,8 +46,8 @@ impl<C> RequestTyped<C> {
|
||||
|
||||
fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
|
||||
use RequestKind::*;
|
||||
let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter();
|
||||
let arr = std::array::from_fn::<C, 6, _>(|index| {
|
||||
let mut it = [Get, Put, Delete, List, Copy, TimeTravel, Head].into_iter();
|
||||
let arr = std::array::from_fn::<C, REQUEST_KIND_COUNT, _>(|index| {
|
||||
let next = it.next().unwrap();
|
||||
assert_eq!(index, next.as_index());
|
||||
f(next)
|
||||
|
||||
@@ -23,7 +23,7 @@ use aws_config::{
|
||||
use aws_sdk_s3::{
|
||||
config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
|
||||
error::SdkError,
|
||||
operation::get_object::GetObjectError,
|
||||
operation::{get_object::GetObjectError, head_object::HeadObjectError},
|
||||
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
|
||||
Client,
|
||||
};
|
||||
@@ -604,6 +604,78 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
}
|
||||
|
||||
async fn head_object(
|
||||
&self,
|
||||
key: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<ListingObject, DownloadError> {
|
||||
let kind = RequestKind::Head;
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let head_future = self
|
||||
.client
|
||||
.head_object()
|
||||
.bucket(self.bucket_name())
|
||||
.key(self.relative_path_to_s3_object(key))
|
||||
.send();
|
||||
|
||||
let head_future = tokio::time::timeout(self.timeout, head_future);
|
||||
|
||||
let res = tokio::select! {
|
||||
res = head_future => res,
|
||||
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
|
||||
};
|
||||
|
||||
let res = res.map_err(|_e| DownloadError::Timeout)?;
|
||||
|
||||
// do not incl. timeouts as errors in metrics but cancellations
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &res, started_at);
|
||||
|
||||
let data = match res {
|
||||
Ok(object_output) => object_output,
|
||||
Err(SdkError::ServiceError(e)) if matches!(e.err(), HeadObjectError::NotFound(_)) => {
|
||||
// Count this in the AttemptOutcome::Ok bucket, because 404 is not
|
||||
// an error: we expect to sometimes fetch an object and find it missing,
|
||||
// e.g. when probing for timeline indices.
|
||||
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Ok,
|
||||
started_at,
|
||||
);
|
||||
return Err(DownloadError::NotFound);
|
||||
}
|
||||
Err(e) => {
|
||||
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Err,
|
||||
started_at,
|
||||
);
|
||||
|
||||
return Err(DownloadError::Other(
|
||||
anyhow::Error::new(e).context("s3 head object"),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
let (Some(last_modified), Some(size)) = (data.last_modified, data.content_length) else {
|
||||
return Err(DownloadError::Other(anyhow!(
|
||||
"head_object doesn't contain last_modified or content_length"
|
||||
)))?;
|
||||
};
|
||||
Ok(ListingObject {
|
||||
key: key.to_owned(),
|
||||
last_modified: SystemTime::try_from(last_modified).map_err(|e| {
|
||||
DownloadError::Other(anyhow!("can't convert time '{last_modified}': {e}"))
|
||||
})?,
|
||||
size: size as u64,
|
||||
})
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
|
||||
@@ -30,6 +30,7 @@ pub struct UnreliableWrapper {
|
||||
#[derive(Debug, Hash, Eq, PartialEq)]
|
||||
enum RemoteOp {
|
||||
ListPrefixes(Option<RemotePath>),
|
||||
HeadObject(RemotePath),
|
||||
Upload(RemotePath),
|
||||
Download(RemotePath),
|
||||
Delete(RemotePath),
|
||||
@@ -137,6 +138,16 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
self.inner.list(prefix, mode, max_keys, cancel).await
|
||||
}
|
||||
|
||||
async fn head_object(
|
||||
&self,
|
||||
key: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<crate::ListingObject, DownloadError> {
|
||||
self.attempt(RemoteOp::HeadObject(key.clone()))
|
||||
.map_err(DownloadError::Other)?;
|
||||
self.inner.head_object(key, cancel).await
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
|
||||
@@ -9,5 +9,3 @@ serde.workspace = true
|
||||
serde_with.workspace = true
|
||||
const_format.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -9,5 +9,3 @@ license.workspace = true
|
||||
anyhow.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -14,5 +14,3 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||
tracing.workspace = true
|
||||
tracing-opentelemetry.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -39,7 +39,7 @@ thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-tar.workspace = true
|
||||
tokio-util.workspace = true
|
||||
toml_edit.workspace = true
|
||||
toml_edit = { workspace = true, features = ["serde"] }
|
||||
tracing.workspace = true
|
||||
tracing-error.workspace = true
|
||||
tracing-subscriber = { workspace = true, features = ["json", "registry"] }
|
||||
@@ -54,7 +54,6 @@ walkdir.workspace = true
|
||||
pq_proto.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
metrics.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
const_format.workspace = true
|
||||
|
||||
@@ -71,6 +70,7 @@ criterion.workspace = true
|
||||
hex-literal.workspace = true
|
||||
camino-tempfile.workspace = true
|
||||
serde_assert.workspace = true
|
||||
tokio = { workspace = true, features = ["test-util"] }
|
||||
|
||||
[[bench]]
|
||||
name = "benchmarks"
|
||||
|
||||
@@ -9,8 +9,6 @@ anyhow.workspace = true
|
||||
utils.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[build-dependencies]
|
||||
anyhow.workspace = true
|
||||
bindgen.workspace = true
|
||||
|
||||
@@ -95,6 +95,7 @@ fn main() -> anyhow::Result<()> {
|
||||
.allowlist_var("ERROR")
|
||||
.allowlist_var("FATAL")
|
||||
.allowlist_var("PANIC")
|
||||
.allowlist_var("PG_VERSION_NUM")
|
||||
.allowlist_var("WPEVENT")
|
||||
.allowlist_var("WL_LATCH_SET")
|
||||
.allowlist_var("WL_SOCKET_READABLE")
|
||||
|
||||
@@ -282,7 +282,11 @@ mod tests {
|
||||
use std::cell::UnsafeCell;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
|
||||
use crate::{
|
||||
api_bindings::Level,
|
||||
bindings::{NeonWALReadResult, PG_VERSION_NUM},
|
||||
walproposer::Wrapper,
|
||||
};
|
||||
|
||||
use super::ApiImpl;
|
||||
|
||||
@@ -489,41 +493,79 @@ mod tests {
|
||||
|
||||
let (sender, receiver) = sync_channel(1);
|
||||
|
||||
// Messages definitions are at walproposer.h
|
||||
// xxx: it would be better to extract them from safekeeper crate and
|
||||
// use serialization/deserialization here.
|
||||
let greeting_tag = (b'g' as u64).to_ne_bytes();
|
||||
let proto_version = 2_u32.to_ne_bytes();
|
||||
let pg_version: [u8; 4] = PG_VERSION_NUM.to_ne_bytes();
|
||||
let proposer_id = [0; 16];
|
||||
let system_id = 0_u64.to_ne_bytes();
|
||||
let tenant_id = ttid.tenant_id.as_arr();
|
||||
let timeline_id = ttid.timeline_id.as_arr();
|
||||
let pg_tli = 1_u32.to_ne_bytes();
|
||||
let wal_seg_size = 16777216_u32.to_ne_bytes();
|
||||
let proposer_greeting = [
|
||||
greeting_tag.as_slice(),
|
||||
proto_version.as_slice(),
|
||||
pg_version.as_slice(),
|
||||
proposer_id.as_slice(),
|
||||
system_id.as_slice(),
|
||||
tenant_id.as_slice(),
|
||||
timeline_id.as_slice(),
|
||||
pg_tli.as_slice(),
|
||||
wal_seg_size.as_slice(),
|
||||
]
|
||||
.concat();
|
||||
|
||||
let voting_tag = (b'v' as u64).to_ne_bytes();
|
||||
let vote_request_term = 3_u64.to_ne_bytes();
|
||||
let proposer_id = [0; 16];
|
||||
let vote_request = [
|
||||
voting_tag.as_slice(),
|
||||
vote_request_term.as_slice(),
|
||||
proposer_id.as_slice(),
|
||||
]
|
||||
.concat();
|
||||
|
||||
let acceptor_greeting_term = 2_u64.to_ne_bytes();
|
||||
let acceptor_greeting_node_id = 1_u64.to_ne_bytes();
|
||||
let acceptor_greeting = [
|
||||
greeting_tag.as_slice(),
|
||||
acceptor_greeting_term.as_slice(),
|
||||
acceptor_greeting_node_id.as_slice(),
|
||||
]
|
||||
.concat();
|
||||
|
||||
let vote_response_term = 3_u64.to_ne_bytes();
|
||||
let vote_given = 1_u64.to_ne_bytes();
|
||||
let flush_lsn = 0x539_u64.to_ne_bytes();
|
||||
let truncate_lsn = 0x539_u64.to_ne_bytes();
|
||||
let th_len = 1_u32.to_ne_bytes();
|
||||
let th_term = 2_u64.to_ne_bytes();
|
||||
let th_lsn = 0x539_u64.to_ne_bytes();
|
||||
let timeline_start_lsn = 0x539_u64.to_ne_bytes();
|
||||
let vote_response = [
|
||||
voting_tag.as_slice(),
|
||||
vote_response_term.as_slice(),
|
||||
vote_given.as_slice(),
|
||||
flush_lsn.as_slice(),
|
||||
truncate_lsn.as_slice(),
|
||||
th_len.as_slice(),
|
||||
th_term.as_slice(),
|
||||
th_lsn.as_slice(),
|
||||
timeline_start_lsn.as_slice(),
|
||||
]
|
||||
.concat();
|
||||
|
||||
let my_impl: Box<dyn ApiImpl> = Box::new(MockImpl {
|
||||
wait_events: Cell::new(WaitEventsData {
|
||||
sk: std::ptr::null_mut(),
|
||||
event_mask: 0,
|
||||
}),
|
||||
expected_messages: vec![
|
||||
// TODO: When updating Postgres versions, this test will cause
|
||||
// problems. Postgres version in message needs updating.
|
||||
//
|
||||
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
|
||||
vec![
|
||||
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
|
||||
147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
|
||||
188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
|
||||
],
|
||||
// VoteRequest(VoteRequest { term: 3 })
|
||||
vec![
|
||||
118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0,
|
||||
],
|
||||
],
|
||||
expected_messages: vec![proposer_greeting, vote_request],
|
||||
expected_ptr: AtomicUsize::new(0),
|
||||
safekeeper_replies: vec![
|
||||
// Greeting(AcceptorGreeting { term: 2, node_id: NodeId(1) })
|
||||
vec![
|
||||
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
|
||||
],
|
||||
// VoteResponse(VoteResponse { term: 3, vote_given: 1, flush_lsn: 0/539, truncate_lsn: 0/539, term_history: [(2, 0/539)], timeline_start_lsn: 0/539 })
|
||||
vec![
|
||||
118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 57,
|
||||
5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
|
||||
0, 57, 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0,
|
||||
],
|
||||
],
|
||||
safekeeper_replies: vec![acceptor_greeting, vote_response],
|
||||
replies_ptr: AtomicUsize::new(0),
|
||||
sync_channel: sender,
|
||||
shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()),
|
||||
|
||||
@@ -10,6 +10,7 @@ use pageserver::{
|
||||
page_cache,
|
||||
repository::Value,
|
||||
task_mgr::TaskKind,
|
||||
tenant::storage_layer::inmemory_layer::SerializedBatch,
|
||||
tenant::storage_layer::InMemoryLayer,
|
||||
virtual_file,
|
||||
};
|
||||
@@ -67,12 +68,16 @@ async fn ingest(
|
||||
let layer =
|
||||
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
|
||||
|
||||
let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
|
||||
let data = Value::Image(Bytes::from(vec![0u8; put_size]));
|
||||
let data_ser_size = data.serialized_size().unwrap() as usize;
|
||||
let ctx = RequestContext::new(
|
||||
pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
|
||||
pageserver::context::DownloadBehavior::Download,
|
||||
);
|
||||
|
||||
const BATCH_SIZE: usize = 16;
|
||||
let mut batch = Vec::new();
|
||||
|
||||
for i in 0..put_count {
|
||||
lsn += put_size as u64;
|
||||
|
||||
@@ -95,7 +100,17 @@ async fn ingest(
|
||||
}
|
||||
}
|
||||
|
||||
layer.put_value(key.to_compact(), lsn, &data, &ctx).await?;
|
||||
batch.push((key.to_compact(), lsn, data_ser_size, data.clone()));
|
||||
if batch.len() >= BATCH_SIZE {
|
||||
let this_batch = std::mem::take(&mut batch);
|
||||
let serialized = SerializedBatch::from_values(this_batch);
|
||||
layer.put_batch(serialized, &ctx).await?;
|
||||
}
|
||||
}
|
||||
if !batch.is_empty() {
|
||||
let this_batch = std::mem::take(&mut batch);
|
||||
let serialized = SerializedBatch::from_values(this_batch);
|
||||
layer.put_batch(serialized, &ctx).await?;
|
||||
}
|
||||
layer.freeze(lsn + 1).await;
|
||||
|
||||
|
||||
@@ -126,10 +126,56 @@ fn main() -> anyhow::Result<()> {
|
||||
info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
|
||||
info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
|
||||
|
||||
// The tenants directory contains all the pageserver local disk state.
|
||||
// Create if not exists and make sure all the contents are durable before proceeding.
|
||||
// Ensuring durability eliminates a whole bug class where we come up after an unclean shutdown.
|
||||
// After unclea shutdown, we don't know if all the filesystem content we can read via syscalls is actually durable or not.
|
||||
// Examples for that: OOM kill, systemd killing us during shutdown, self abort due to unrecoverable IO error.
|
||||
let tenants_path = conf.tenants_path();
|
||||
if !tenants_path.exists() {
|
||||
utils::crashsafe::create_dir_all(conf.tenants_path())
|
||||
.with_context(|| format!("Failed to create tenants root dir at '{tenants_path}'"))?;
|
||||
{
|
||||
let open = || {
|
||||
nix::dir::Dir::open(
|
||||
tenants_path.as_std_path(),
|
||||
nix::fcntl::OFlag::O_DIRECTORY | nix::fcntl::OFlag::O_RDONLY,
|
||||
nix::sys::stat::Mode::empty(),
|
||||
)
|
||||
};
|
||||
let dirfd = match open() {
|
||||
Ok(dirfd) => dirfd,
|
||||
Err(e) => match e {
|
||||
nix::errno::Errno::ENOENT => {
|
||||
utils::crashsafe::create_dir_all(&tenants_path).with_context(|| {
|
||||
format!("Failed to create tenants root dir at '{tenants_path}'")
|
||||
})?;
|
||||
open().context("open tenants dir after creating it")?
|
||||
}
|
||||
e => anyhow::bail!(e),
|
||||
},
|
||||
};
|
||||
|
||||
let started = Instant::now();
|
||||
// Linux guarantees durability for syncfs.
|
||||
// POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
|
||||
#[cfg(target_os = "linux")]
|
||||
{
|
||||
use std::os::fd::AsRawFd;
|
||||
nix::unistd::syncfs(dirfd.as_raw_fd()).context("syncfs")?;
|
||||
}
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
// macOS is not a production platform for Neon, don't even bother.
|
||||
drop(dirfd);
|
||||
}
|
||||
#[cfg(not(any(target_os = "linux", target_os = "macos")))]
|
||||
{
|
||||
compile_error!("Unsupported OS");
|
||||
}
|
||||
|
||||
let elapsed = started.elapsed();
|
||||
info!(
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
"made tenant directory contents durable"
|
||||
);
|
||||
}
|
||||
|
||||
// Initialize up failpoints support
|
||||
|
||||
@@ -50,7 +50,6 @@ pub mod defaults {
|
||||
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
|
||||
DEFAULT_PG_LISTEN_PORT,
|
||||
};
|
||||
use pageserver_api::models::ImageCompressionAlgorithm;
|
||||
pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
|
||||
|
||||
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
|
||||
@@ -90,8 +89,7 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
|
||||
|
||||
pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
|
||||
ImageCompressionAlgorithm::Disabled;
|
||||
pub const DEFAULT_IMAGE_COMPRESSION: &str = "zstd(1)";
|
||||
|
||||
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
|
||||
|
||||
@@ -478,7 +476,7 @@ impl PageServerConfigBuilder {
|
||||
max_vectored_read_bytes: Set(MaxVectoredReadBytes(
|
||||
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
|
||||
)),
|
||||
image_compression: Set(DEFAULT_IMAGE_COMPRESSION),
|
||||
image_compression: Set(DEFAULT_IMAGE_COMPRESSION.parse().unwrap()),
|
||||
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||
l0_flush: Set(L0FlushConfig::default()),
|
||||
compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
|
||||
@@ -1065,7 +1063,7 @@ impl PageServerConf {
|
||||
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
||||
.expect("Invalid default constant"),
|
||||
),
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
@@ -1305,7 +1303,7 @@ background_task_maximum_delay = '334 s'
|
||||
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
@@ -1378,7 +1376,7 @@ background_task_maximum_delay = '334 s'
|
||||
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
|
||||
@@ -318,6 +318,24 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::tenant::TimelineArchivalError> for ApiError {
|
||||
fn from(value: crate::tenant::TimelineArchivalError) -> Self {
|
||||
use crate::tenant::TimelineArchivalError::*;
|
||||
match value {
|
||||
NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
|
||||
Timeout => ApiError::Timeout("hit pageserver internal timeout".into()),
|
||||
HasUnarchivedChildren(children) => ApiError::PreconditionFailed(
|
||||
format!(
|
||||
"Cannot archive timeline which has non-archived child timelines: {children:?}"
|
||||
)
|
||||
.into_boxed_str(),
|
||||
),
|
||||
a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
|
||||
Other(e) => ApiError::InternalServerError(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
|
||||
fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self {
|
||||
use crate::tenant::mgr::DeleteTimelineError::*;
|
||||
@@ -405,6 +423,8 @@ async fn build_timeline_info_common(
|
||||
let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx);
|
||||
let current_physical_size = Some(timeline.layer_size_sum().await);
|
||||
let state = timeline.current_state();
|
||||
// Report is_archived = false if the timeline is still loading
|
||||
let is_archived = timeline.is_archived().unwrap_or(false);
|
||||
let remote_consistent_lsn_projected = timeline
|
||||
.get_remote_consistent_lsn_projected()
|
||||
.unwrap_or(Lsn(0));
|
||||
@@ -445,6 +465,7 @@ async fn build_timeline_info_common(
|
||||
pg_version: timeline.pg_version,
|
||||
|
||||
state,
|
||||
is_archived,
|
||||
|
||||
walreceiver_status,
|
||||
|
||||
@@ -686,9 +707,7 @@ async fn timeline_archival_config_handler(
|
||||
|
||||
tenant
|
||||
.apply_timeline_archival_config(timeline_id, request_data.state)
|
||||
.await
|
||||
.context("applying archival config")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
.await?;
|
||||
Ok::<_, ApiError>(())
|
||||
}
|
||||
.instrument(info_span!("timeline_archival_config",
|
||||
@@ -1706,11 +1725,6 @@ async fn timeline_compact_handler(
|
||||
flags |= CompactFlags::ForceImageLayerCreation;
|
||||
}
|
||||
if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? {
|
||||
if !cfg!(feature = "testing") {
|
||||
return Err(ApiError::InternalServerError(anyhow!(
|
||||
"enhanced_gc_bottom_most_compaction is only available in testing mode"
|
||||
)));
|
||||
}
|
||||
flags |= CompactFlags::EnhancedGcBottomMostCompaction;
|
||||
}
|
||||
let wait_until_uploaded =
|
||||
@@ -2942,7 +2956,7 @@ pub fn make_router(
|
||||
)
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
|
||||
|r| testing_api_handler("run timeline compaction", r, timeline_compact_handler),
|
||||
|r| api_handler(r, timeline_compact_handler),
|
||||
)
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
|
||||
|
||||
@@ -1,15 +1,10 @@
|
||||
use std::{num::NonZeroUsize, sync::Arc};
|
||||
|
||||
use crate::tenant::ephemeral_file;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
|
||||
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
|
||||
pub enum L0FlushConfig {
|
||||
PageCached,
|
||||
#[serde(rename_all = "snake_case")]
|
||||
Direct {
|
||||
max_concurrency: NonZeroUsize,
|
||||
},
|
||||
Direct { max_concurrency: NonZeroUsize },
|
||||
}
|
||||
|
||||
impl Default for L0FlushConfig {
|
||||
@@ -25,14 +20,12 @@ impl Default for L0FlushConfig {
|
||||
pub struct L0FlushGlobalState(Arc<Inner>);
|
||||
|
||||
pub enum Inner {
|
||||
PageCached,
|
||||
Direct { semaphore: tokio::sync::Semaphore },
|
||||
}
|
||||
|
||||
impl L0FlushGlobalState {
|
||||
pub fn new(config: L0FlushConfig) -> Self {
|
||||
match config {
|
||||
L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)),
|
||||
L0FlushConfig::Direct { max_concurrency } => {
|
||||
let semaphore = tokio::sync::Semaphore::new(max_concurrency.get());
|
||||
Self(Arc::new(Inner::Direct { semaphore }))
|
||||
@@ -44,13 +37,3 @@ impl L0FlushGlobalState {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl L0FlushConfig {
|
||||
pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite {
|
||||
use L0FlushConfig::*;
|
||||
match self {
|
||||
PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes,
|
||||
Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -49,7 +49,7 @@ use tracing::{info, info_span};
|
||||
/// backwards-compatible changes to the metadata format.
|
||||
pub const STORAGE_FORMAT_VERSION: u16 = 3;
|
||||
|
||||
pub const DEFAULT_PG_VERSION: u32 = 15;
|
||||
pub const DEFAULT_PG_VERSION: u32 = 16;
|
||||
|
||||
// Magic constants used to identify different kinds of files
|
||||
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
|
||||
@@ -88,6 +88,8 @@ pub async fn shutdown_pageserver(
|
||||
) {
|
||||
use std::time::Duration;
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
// If the orderly shutdown below takes too long, we still want to make
|
||||
// sure that all walredo processes are killed and wait()ed on by us, not systemd.
|
||||
//
|
||||
@@ -241,7 +243,10 @@ pub async fn shutdown_pageserver(
|
||||
walredo_extraordinary_shutdown_thread.join().unwrap();
|
||||
info!("walredo_extraordinary_shutdown_thread done");
|
||||
|
||||
info!("Shut down successfully completed");
|
||||
info!(
|
||||
elapsed_ms = started_at.elapsed().as_millis(),
|
||||
"Shut down successfully completed"
|
||||
);
|
||||
std::process::exit(exit_code);
|
||||
}
|
||||
|
||||
|
||||
@@ -1552,7 +1552,6 @@ pub(crate) static LIVE_CONNECTIONS: Lazy<IntCounterPairVec> = Lazy::new(|| {
|
||||
#[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)]
|
||||
pub(crate) enum ComputeCommandKind {
|
||||
PageStreamV2,
|
||||
PageStream,
|
||||
Basebackup,
|
||||
Fullbackup,
|
||||
LeaseLsn,
|
||||
@@ -1803,6 +1802,23 @@ pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::n
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static NODE_UTILIZATION_SCORE: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
register_uint_gauge!(
|
||||
"pageserver_utilization_score",
|
||||
"The utilization score we report to the storage controller for scheduling, where 0 is empty, 1000000 is full, and anything above is considered overloaded",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_secondary_heatmap_total_size",
|
||||
"The total size in bytes of all layers in the most recently downloaded heatmap.",
|
||||
&["tenant_id", "shard_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum RemoteOpKind {
|
||||
Upload,
|
||||
@@ -1853,16 +1869,64 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("Failed to register tenant_task_events metric")
|
||||
});
|
||||
|
||||
pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
|
||||
register_int_counter_pair_vec!(
|
||||
"pageserver_background_loop_semaphore_wait_start_count",
|
||||
"Counter for background loop concurrency-limiting semaphore acquire calls started",
|
||||
"pageserver_background_loop_semaphore_wait_finish_count",
|
||||
"Counter for background loop concurrency-limiting semaphore acquire calls finished",
|
||||
&["task"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
pub struct BackgroundLoopSemaphoreMetrics {
|
||||
counters: EnumMap<BackgroundLoopKind, IntCounterPair>,
|
||||
durations: EnumMap<BackgroundLoopKind, Counter>,
|
||||
}
|
||||
|
||||
pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics> = Lazy::new(
|
||||
|| {
|
||||
let counters = register_int_counter_pair_vec!(
|
||||
"pageserver_background_loop_semaphore_wait_start_count",
|
||||
"Counter for background loop concurrency-limiting semaphore acquire calls started",
|
||||
"pageserver_background_loop_semaphore_wait_finish_count",
|
||||
"Counter for background loop concurrency-limiting semaphore acquire calls finished",
|
||||
&["task"],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let durations = register_counter_vec!(
|
||||
"pageserver_background_loop_semaphore_wait_duration_seconds",
|
||||
"Sum of wall clock time spent waiting on the background loop concurrency-limiting semaphore acquire calls",
|
||||
&["task"],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
BackgroundLoopSemaphoreMetrics {
|
||||
counters: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
|
||||
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
|
||||
counters.with_label_values(&[kind.into()])
|
||||
})),
|
||||
durations: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
|
||||
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
|
||||
durations.with_label_values(&[kind.into()])
|
||||
})),
|
||||
}
|
||||
},
|
||||
);
|
||||
|
||||
impl BackgroundLoopSemaphoreMetrics {
|
||||
pub(crate) fn measure_acquisition(&self, task: BackgroundLoopKind) -> impl Drop + '_ {
|
||||
struct Record<'a> {
|
||||
metrics: &'a BackgroundLoopSemaphoreMetrics,
|
||||
task: BackgroundLoopKind,
|
||||
_counter_guard: metrics::IntCounterPairGuard,
|
||||
start: Instant,
|
||||
}
|
||||
impl Drop for Record<'_> {
|
||||
fn drop(&mut self) {
|
||||
let elapsed = self.start.elapsed().as_secs_f64();
|
||||
self.metrics.durations[self.task].inc_by(elapsed);
|
||||
}
|
||||
}
|
||||
Record {
|
||||
metrics: self,
|
||||
task,
|
||||
_counter_guard: self.counters[task].guard(),
|
||||
start: Instant::now(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
@@ -2544,6 +2608,7 @@ use std::time::{Duration, Instant};
|
||||
use crate::context::{PageContentKind, RequestContext};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::mgr::TenantSlot;
|
||||
use crate::tenant::tasks::BackgroundLoopKind;
|
||||
|
||||
/// Maintain a per timeline gauge in addition to the global gauge.
|
||||
pub(crate) struct PerTimelineRemotePhysicalSizeGauge {
|
||||
|
||||
@@ -557,7 +557,7 @@ impl PageServerHandler {
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
protocol_version: PagestreamProtocolVersion,
|
||||
_protocol_version: PagestreamProtocolVersion,
|
||||
ctx: RequestContext,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
@@ -601,8 +601,7 @@ impl PageServerHandler {
|
||||
fail::fail_point!("ps::handle-pagerequest-message");
|
||||
|
||||
// parse request
|
||||
let neon_fe_msg =
|
||||
PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;
|
||||
let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
|
||||
|
||||
// invoke handler function
|
||||
let (handler_result, span) = match neon_fe_msg {
|
||||
@@ -1275,35 +1274,6 @@ where
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
} else if let Some(params) = parts.strip_prefix(&["pagestream"]) {
|
||||
if params.len() != 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for pagestream command"
|
||||
)));
|
||||
}
|
||||
let tenant_id = TenantId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::PageStream)
|
||||
.inc();
|
||||
|
||||
self.handle_pagerequests(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
PagestreamProtocolVersion::V1,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
} else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
|
||||
if params.len() < 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
|
||||
@@ -15,12 +15,11 @@ use crate::{aux_file, repository::*};
|
||||
use anyhow::{ensure, Context};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use enum_map::Enum;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::{
|
||||
dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
|
||||
relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
|
||||
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
|
||||
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
|
||||
CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
|
||||
};
|
||||
use pageserver_api::keyspace::SparseKeySpace;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
@@ -37,7 +36,6 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, info, trace, warn};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
use utils::pausable_failpoint;
|
||||
use utils::vec_map::{VecMap, VecMapOrdering};
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
|
||||
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
|
||||
@@ -174,6 +172,7 @@ impl Timeline {
|
||||
pending_deletions: Vec::new(),
|
||||
pending_nblocks: 0,
|
||||
pending_directory_entries: Vec::new(),
|
||||
pending_bytes: 0,
|
||||
lsn,
|
||||
}
|
||||
}
|
||||
@@ -727,7 +726,17 @@ impl Timeline {
|
||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||
let current_policy = self.last_aux_file_policy.load();
|
||||
match current_policy {
|
||||
Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await,
|
||||
Some(AuxFilePolicy::V1) => {
|
||||
warn!("this timeline is using deprecated aux file policy V1 (policy=V1)");
|
||||
self.list_aux_files_v1(lsn, ctx).await
|
||||
}
|
||||
None => {
|
||||
let res = self.list_aux_files_v1(lsn, ctx).await?;
|
||||
if !res.is_empty() {
|
||||
warn!("this timeline is using deprecated aux file policy V1 (policy=None)");
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
|
||||
Some(AuxFilePolicy::CrossValidation) => {
|
||||
let v1_result = self.list_aux_files_v1(lsn, ctx).await;
|
||||
@@ -1022,21 +1031,33 @@ pub struct DatadirModification<'a> {
|
||||
// The put-functions add the modifications here, and they are flushed to the
|
||||
// underlying key-value store by the 'finish' function.
|
||||
pending_lsns: Vec<Lsn>,
|
||||
pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
|
||||
pending_updates: HashMap<Key, Vec<(Lsn, usize, Value)>>,
|
||||
pending_deletions: Vec<(Range<Key>, Lsn)>,
|
||||
pending_nblocks: i64,
|
||||
|
||||
/// For special "directory" keys that store key-value maps, track the size of the map
|
||||
/// if it was updated in this modification.
|
||||
pending_directory_entries: Vec<(DirectoryKind, usize)>,
|
||||
|
||||
/// An **approximation** of how large our EphemeralFile write will be when committed.
|
||||
pending_bytes: usize,
|
||||
}
|
||||
|
||||
impl<'a> DatadirModification<'a> {
|
||||
// When a DatadirModification is committed, we do a monolithic serialization of all its contents. WAL records can
|
||||
// contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we
|
||||
// additionally specify a limit on how much payload a DatadirModification may contain before it should be committed.
|
||||
pub(crate) const MAX_PENDING_BYTES: usize = 8 * 1024 * 1024;
|
||||
|
||||
/// Get the current lsn
|
||||
pub(crate) fn get_lsn(&self) -> Lsn {
|
||||
self.lsn
|
||||
}
|
||||
|
||||
pub(crate) fn approx_pending_bytes(&self) -> usize {
|
||||
self.pending_bytes
|
||||
}
|
||||
|
||||
/// Set the current lsn
|
||||
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
|
||||
ensure!(
|
||||
@@ -1576,6 +1597,7 @@ impl<'a> DatadirModification<'a> {
|
||||
if aux_files_key_v1.is_empty() {
|
||||
None
|
||||
} else {
|
||||
warn!("this timeline is using deprecated aux file policy V1");
|
||||
self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
|
||||
Some(AuxFilePolicy::V1)
|
||||
}
|
||||
@@ -1769,21 +1791,25 @@ impl<'a> DatadirModification<'a> {
|
||||
// Flush relation and SLRU data blocks, keep metadata.
|
||||
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
|
||||
for (key, values) in self.pending_updates.drain() {
|
||||
for (lsn, value) in values {
|
||||
let mut write_batch = Vec::new();
|
||||
for (lsn, value_ser_size, value) in values {
|
||||
if key.is_rel_block_key() || key.is_slru_block_key() {
|
||||
// This bails out on first error without modifying pending_updates.
|
||||
// That's Ok, cf this function's doc comment.
|
||||
writer.put(key, lsn, &value, ctx).await?;
|
||||
write_batch.push((key.to_compact(), lsn, value_ser_size, value));
|
||||
} else {
|
||||
retained_pending_updates
|
||||
.entry(key)
|
||||
.or_default()
|
||||
.push((lsn, value));
|
||||
retained_pending_updates.entry(key).or_default().push((
|
||||
lsn,
|
||||
value_ser_size,
|
||||
value,
|
||||
));
|
||||
}
|
||||
}
|
||||
writer.put_batch(write_batch, ctx).await?;
|
||||
}
|
||||
|
||||
self.pending_updates = retained_pending_updates;
|
||||
self.pending_bytes = 0;
|
||||
|
||||
if pending_nblocks != 0 {
|
||||
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
|
||||
@@ -1809,17 +1835,20 @@ impl<'a> DatadirModification<'a> {
|
||||
self.pending_nblocks = 0;
|
||||
|
||||
if !self.pending_updates.is_empty() {
|
||||
// The put_batch call below expects expects the inputs to be sorted by Lsn,
|
||||
// so we do that first.
|
||||
let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
|
||||
self.pending_updates
|
||||
.drain()
|
||||
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
|
||||
.kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
|
||||
VecMapOrdering::GreaterOrEqual,
|
||||
);
|
||||
// Ordering: the items in this batch do not need to be in any global order, but values for
|
||||
// a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on
|
||||
// this to do efficient updates to its index.
|
||||
let batch: Vec<(CompactKey, Lsn, usize, Value)> = self
|
||||
.pending_updates
|
||||
.drain()
|
||||
.flat_map(|(key, values)| {
|
||||
values.into_iter().map(move |(lsn, val_ser_size, value)| {
|
||||
(key.to_compact(), lsn, val_ser_size, value)
|
||||
})
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
writer.put_batch(lsn_ordered_batch, ctx).await?;
|
||||
writer.put_batch(batch, ctx).await?;
|
||||
}
|
||||
|
||||
if !self.pending_deletions.is_empty() {
|
||||
@@ -1844,6 +1873,8 @@ impl<'a> DatadirModification<'a> {
|
||||
writer.update_directory_entries_count(kind, count as u64);
|
||||
}
|
||||
|
||||
self.pending_bytes = 0;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1860,7 +1891,7 @@ impl<'a> DatadirModification<'a> {
|
||||
// Note: we don't check pending_deletions. It is an error to request a
|
||||
// value that has been removed, deletion only avoids leaking storage.
|
||||
if let Some(values) = self.pending_updates.get(&key) {
|
||||
if let Some((_, value)) = values.last() {
|
||||
if let Some((_, _, value)) = values.last() {
|
||||
return if let Value::Image(img) = value {
|
||||
Ok(img.clone())
|
||||
} else {
|
||||
@@ -1888,13 +1919,17 @@ impl<'a> DatadirModification<'a> {
|
||||
fn put(&mut self, key: Key, val: Value) {
|
||||
let values = self.pending_updates.entry(key).or_default();
|
||||
// Replace the previous value if it exists at the same lsn
|
||||
if let Some((last_lsn, last_value)) = values.last_mut() {
|
||||
if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() {
|
||||
if *last_lsn == self.lsn {
|
||||
*last_value_ser_size = val.serialized_size().unwrap() as usize;
|
||||
*last_value = val;
|
||||
return;
|
||||
}
|
||||
}
|
||||
values.push((self.lsn, val));
|
||||
|
||||
let val_serialized_size = val.serialized_size().unwrap() as usize;
|
||||
self.pending_bytes += val_serialized_size;
|
||||
values.push((self.lsn, val_serialized_size, val));
|
||||
}
|
||||
|
||||
fn delete(&mut self, key_range: Range<Key>) {
|
||||
@@ -2024,7 +2059,7 @@ mod tests {
|
||||
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
let tline = tline.raw_timeline().unwrap();
|
||||
|
||||
|
||||
@@ -393,7 +393,7 @@ struct PageServerTask {
|
||||
|
||||
/// Tasks may optionally be launched for a particular tenant/timeline, enabling
|
||||
/// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
|
||||
tenant_shard_id: Option<TenantShardId>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: Option<TimelineId>,
|
||||
|
||||
mutable: Mutex<MutableTaskState>,
|
||||
@@ -405,7 +405,7 @@ struct PageServerTask {
|
||||
pub fn spawn<F>(
|
||||
runtime: &tokio::runtime::Handle,
|
||||
kind: TaskKind,
|
||||
tenant_shard_id: Option<TenantShardId>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: Option<TimelineId>,
|
||||
name: &str,
|
||||
future: F,
|
||||
@@ -550,7 +550,7 @@ pub async fn shutdown_tasks(
|
||||
let tasks = TASKS.lock().unwrap();
|
||||
for task in tasks.values() {
|
||||
if (kind.is_none() || Some(task.kind) == kind)
|
||||
&& (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id)
|
||||
&& (tenant_shard_id.is_none() || Some(task.tenant_shard_id) == tenant_shard_id)
|
||||
&& (timeline_id.is_none() || task.timeline_id == timeline_id)
|
||||
{
|
||||
task.cancel.cancel();
|
||||
@@ -573,13 +573,8 @@ pub async fn shutdown_tasks(
|
||||
};
|
||||
if let Some(mut join_handle) = join_handle {
|
||||
if log_all {
|
||||
if tenant_shard_id.is_none() {
|
||||
// there are quite few of these
|
||||
info!(name = task.name, kind = ?task_kind, "stopping global task");
|
||||
} else {
|
||||
// warn to catch these in tests; there shouldn't be any
|
||||
warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
|
||||
}
|
||||
// warn to catch these in tests; there shouldn't be any
|
||||
warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
|
||||
}
|
||||
if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
|
||||
.await
|
||||
|
||||
@@ -501,6 +501,38 @@ impl Debug for DeleteTimelineError {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error)]
|
||||
pub enum TimelineArchivalError {
|
||||
#[error("NotFound")]
|
||||
NotFound,
|
||||
|
||||
#[error("Timeout")]
|
||||
Timeout,
|
||||
|
||||
#[error("HasUnarchivedChildren")]
|
||||
HasUnarchivedChildren(Vec<TimelineId>),
|
||||
|
||||
#[error("Timeline archival is already in progress")]
|
||||
AlreadyInProgress,
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
impl Debug for TimelineArchivalError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::NotFound => write!(f, "NotFound"),
|
||||
Self::Timeout => write!(f, "Timeout"),
|
||||
Self::HasUnarchivedChildren(c) => {
|
||||
f.debug_tuple("HasUnarchivedChildren").field(c).finish()
|
||||
}
|
||||
Self::AlreadyInProgress => f.debug_tuple("AlreadyInProgress").finish(),
|
||||
Self::Other(e) => f.debug_tuple("Other").field(e).finish(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub enum SetStoppingError {
|
||||
AlreadyStopping(completion::Barrier),
|
||||
Broken,
|
||||
@@ -798,7 +830,7 @@ impl Tenant {
|
||||
task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
TaskKind::Attach,
|
||||
Some(tenant_shard_id),
|
||||
tenant_shard_id,
|
||||
None,
|
||||
"attach tenant",
|
||||
async move {
|
||||
@@ -1326,24 +1358,50 @@ impl Tenant {
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
state: TimelineArchivalState,
|
||||
) -> anyhow::Result<()> {
|
||||
let timeline = self
|
||||
.get_timeline(timeline_id, false)
|
||||
.context("Cannot apply timeline archival config to inexistent timeline")?;
|
||||
) -> Result<(), TimelineArchivalError> {
|
||||
info!("setting timeline archival config");
|
||||
let timeline = {
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
|
||||
let timeline = match timelines.get(&timeline_id) {
|
||||
Some(t) => t,
|
||||
None => return Err(TimelineArchivalError::NotFound),
|
||||
};
|
||||
|
||||
// Ensure that there are no non-archived child timelines
|
||||
let children: Vec<TimelineId> = timelines
|
||||
.iter()
|
||||
.filter_map(|(id, entry)| {
|
||||
if entry.get_ancestor_timeline_id() != Some(timeline_id) {
|
||||
return None;
|
||||
}
|
||||
if entry.is_archived() == Some(true) {
|
||||
return None;
|
||||
}
|
||||
Some(*id)
|
||||
})
|
||||
.collect();
|
||||
|
||||
if !children.is_empty() && state == TimelineArchivalState::Archived {
|
||||
return Err(TimelineArchivalError::HasUnarchivedChildren(children));
|
||||
}
|
||||
Arc::clone(timeline)
|
||||
};
|
||||
|
||||
let upload_needed = timeline
|
||||
.remote_client
|
||||
.schedule_index_upload_for_timeline_archival_state(state)?;
|
||||
|
||||
if upload_needed {
|
||||
info!("Uploading new state");
|
||||
const MAX_WAIT: Duration = Duration::from_secs(10);
|
||||
let Ok(v) =
|
||||
tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await
|
||||
else {
|
||||
tracing::warn!("reached timeout for waiting on upload queue");
|
||||
bail!("reached timeout for upload queue flush");
|
||||
return Err(TimelineArchivalError::Timeout);
|
||||
};
|
||||
v?;
|
||||
v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -3741,13 +3799,21 @@ impl Tenant {
|
||||
/// less than this (via eviction and on-demand downloads), but this function enables
|
||||
/// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
|
||||
/// by keeping important things on local disk.
|
||||
///
|
||||
/// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less
|
||||
/// than they report here, due to layer eviction. Tenants with many active branches may
|
||||
/// actually use more than they report here.
|
||||
pub(crate) fn local_storage_wanted(&self) -> u64 {
|
||||
let mut wanted = 0;
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
for timeline in timelines.values() {
|
||||
wanted += timeline.metrics.visible_physical_size_gauge.get();
|
||||
}
|
||||
wanted
|
||||
|
||||
// Heuristic: we use the max() of the timelines' visible sizes, rather than the sum. This
|
||||
// reflects the observation that on tenants with multiple large branches, typically only one
|
||||
// of them is used actively enough to occupy space on disk.
|
||||
timelines
|
||||
.values()
|
||||
.map(|t| t.metrics.visible_physical_size_gauge.get())
|
||||
.max()
|
||||
.unwrap_or(0)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5932,10 +5998,10 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// the default aux file policy to switch is v1 if not set by the admins
|
||||
// the default aux file policy to switch is v2 if not set by the admins
|
||||
assert_eq!(
|
||||
harness.tenant_conf.switch_aux_file_policy,
|
||||
AuxFilePolicy::V1
|
||||
AuxFilePolicy::default_tenant_config()
|
||||
);
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
@@ -5979,8 +6045,8 @@ mod tests {
|
||||
);
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
Some(AuxFilePolicy::V1),
|
||||
"aux file is written with switch_aux_file_policy unset (which is v1), so we should keep v1"
|
||||
Some(AuxFilePolicy::V2),
|
||||
"aux file is written with switch_aux_file_policy unset (which is v2), so we should use v2 there"
|
||||
);
|
||||
|
||||
// we can read everything from the storage
|
||||
@@ -6002,8 +6068,8 @@ mod tests {
|
||||
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
Some(AuxFilePolicy::V1),
|
||||
"keep v1 storage format when new files are written"
|
||||
Some(AuxFilePolicy::V2),
|
||||
"keep v2 storage format when new files are written"
|
||||
);
|
||||
|
||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
@@ -6019,7 +6085,7 @@ mod tests {
|
||||
|
||||
// child copies the last flag even if that is not on remote storage yet
|
||||
assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2);
|
||||
assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V1));
|
||||
assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V2));
|
||||
|
||||
let files = child.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
assert_eq!(files.get("pg_logical/mappings/test1"), None);
|
||||
@@ -7005,18 +7071,14 @@ mod tests {
|
||||
vec![
|
||||
// Image layer at GC horizon
|
||||
PersistentLayerKey {
|
||||
key_range: {
|
||||
let mut key = Key::MAX;
|
||||
key.field6 -= 1;
|
||||
Key::MIN..key
|
||||
},
|
||||
key_range: Key::MIN..Key::NON_L0_MAX,
|
||||
lsn_range: Lsn(0x30)..Lsn(0x31),
|
||||
is_delta: false
|
||||
},
|
||||
// The delta layer that is cut in the middle
|
||||
// The delta layer covers the full range (with the layer key hack to avoid being recognized as L0)
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(3)..get_key(4),
|
||||
lsn_range: Lsn(0x30)..Lsn(0x41),
|
||||
key_range: Key::MIN..Key::NON_L0_MAX,
|
||||
lsn_range: Lsn(0x30)..Lsn(0x48),
|
||||
is_delta: true
|
||||
},
|
||||
// The delta3 layer that should not be picked for the compaction
|
||||
@@ -7996,6 +8058,214 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_with_retain_lsns_single_key() -> anyhow::Result<()>
|
||||
{
|
||||
let harness =
|
||||
TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns_single_key")
|
||||
.await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
// using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
|
||||
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
|
||||
let img_layer = (0..10)
|
||||
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
|
||||
.collect_vec();
|
||||
|
||||
let delta1 = vec![
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
|
||||
),
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x28),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
|
||||
),
|
||||
];
|
||||
let delta2 = vec![
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x30),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
|
||||
),
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x38),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x38")),
|
||||
),
|
||||
];
|
||||
let delta3 = vec![
|
||||
(
|
||||
get_key(8),
|
||||
Lsn(0x48),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
|
||||
),
|
||||
(
|
||||
get_key(9),
|
||||
Lsn(0x48),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
|
||||
),
|
||||
];
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![
|
||||
// delta1 and delta 2 only contain a single key but multiple updates
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x50), delta3),
|
||||
], // delta layers
|
||||
vec![(Lsn(0x10), img_layer)], // image layers
|
||||
Lsn(0x50),
|
||||
)
|
||||
.await?;
|
||||
{
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
retain_lsns: vec![
|
||||
(Lsn(0x10), tline.timeline_id),
|
||||
(Lsn(0x20), tline.timeline_id),
|
||||
],
|
||||
cutoffs: GcCutoffs {
|
||||
time: Lsn(0x30),
|
||||
space: Lsn(0x30),
|
||||
},
|
||||
leases: Default::default(),
|
||||
within_ancestor_pitr: false,
|
||||
};
|
||||
}
|
||||
|
||||
let expected_result = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"),
|
||||
Bytes::from_static(b"value 2@0x10"),
|
||||
Bytes::from_static(b"value 3@0x10"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10"),
|
||||
Bytes::from_static(b"value 6@0x10"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10@0x48"),
|
||||
Bytes::from_static(b"value 9@0x10@0x48"),
|
||||
];
|
||||
|
||||
let expected_result_at_gc_horizon = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"),
|
||||
Bytes::from_static(b"value 2@0x10"),
|
||||
Bytes::from_static(b"value 3@0x10"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10"),
|
||||
Bytes::from_static(b"value 6@0x10"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10"),
|
||||
Bytes::from_static(b"value 9@0x10"),
|
||||
];
|
||||
|
||||
let expected_result_at_lsn_20 = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10@0x20"),
|
||||
Bytes::from_static(b"value 2@0x10"),
|
||||
Bytes::from_static(b"value 3@0x10"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10"),
|
||||
Bytes::from_static(b"value 6@0x10"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10"),
|
||||
Bytes::from_static(b"value 9@0x10"),
|
||||
];
|
||||
|
||||
let expected_result_at_lsn_10 = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10"),
|
||||
Bytes::from_static(b"value 2@0x10"),
|
||||
Bytes::from_static(b"value 3@0x10"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10"),
|
||||
Bytes::from_static(b"value 6@0x10"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10"),
|
||||
Bytes::from_static(b"value 9@0x10"),
|
||||
];
|
||||
|
||||
let verify_result = || async {
|
||||
let gc_horizon = {
|
||||
let gc_info = tline.gc_info.read().unwrap();
|
||||
gc_info.cutoffs.time
|
||||
};
|
||||
for idx in 0..10 {
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x50), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result[idx]
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), gc_horizon, &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_gc_horizon[idx]
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x20), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_lsn_20[idx]
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x10), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_lsn_10[idx]
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
verify_result().await;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
let mut dryrun_flags = EnumSet::new();
|
||||
dryrun_flags.insert(CompactFlags::DryRun);
|
||||
|
||||
tline
|
||||
.compact_with_gc(&cancel, dryrun_flags, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
// We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs
|
||||
// cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests.
|
||||
verify_result().await;
|
||||
|
||||
tline
|
||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
verify_result().await;
|
||||
|
||||
// compact again
|
||||
tline
|
||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
verify_result().await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
|
||||
|
||||
@@ -21,7 +21,6 @@ pub struct EphemeralFile {
|
||||
}
|
||||
|
||||
mod page_caching;
|
||||
pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite;
|
||||
mod zero_padded_read_write;
|
||||
|
||||
impl EphemeralFile {
|
||||
@@ -52,12 +51,10 @@ impl EphemeralFile {
|
||||
)
|
||||
.await?;
|
||||
|
||||
let prewarm = conf.l0_flush.prewarm_on_write();
|
||||
|
||||
Ok(EphemeralFile {
|
||||
_tenant_shard_id: tenant_shard_id,
|
||||
_timeline_id: timeline_id,
|
||||
rw: page_caching::RW::new(file, prewarm, gate_guard),
|
||||
rw: page_caching::RW::new(file, gate_guard),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -82,6 +79,8 @@ impl EphemeralFile {
|
||||
self.rw.read_blk(blknum, ctx).await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
// This is a test helper: outside of tests, we are always written to via a pre-serialized batch.
|
||||
pub(crate) async fn write_blob(
|
||||
&mut self,
|
||||
srcbuf: &[u8],
|
||||
@@ -89,17 +88,30 @@ impl EphemeralFile {
|
||||
) -> Result<u64, io::Error> {
|
||||
let pos = self.rw.bytes_written();
|
||||
|
||||
// Write the length field
|
||||
if srcbuf.len() < 0x80 {
|
||||
// short one-byte length header
|
||||
let len_buf = [srcbuf.len() as u8];
|
||||
let mut len_bytes = std::io::Cursor::new(Vec::new());
|
||||
crate::tenant::storage_layer::inmemory_layer::SerializedBatch::write_blob_length(
|
||||
srcbuf.len(),
|
||||
&mut len_bytes,
|
||||
);
|
||||
let len_bytes = len_bytes.into_inner();
|
||||
|
||||
self.rw.write_all_borrowed(&len_buf, ctx).await?;
|
||||
} else {
|
||||
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
|
||||
len_buf[0] |= 0x80;
|
||||
self.rw.write_all_borrowed(&len_buf, ctx).await?;
|
||||
}
|
||||
// Write the length field
|
||||
self.rw.write_all_borrowed(&len_bytes, ctx).await?;
|
||||
|
||||
// Write the payload
|
||||
self.rw.write_all_borrowed(srcbuf, ctx).await?;
|
||||
|
||||
Ok(pos)
|
||||
}
|
||||
|
||||
/// Returns the offset at which the first byte of the input was written, for use
|
||||
/// in constructing indices over the written value.
|
||||
pub(crate) async fn write_raw(
|
||||
&mut self,
|
||||
srcbuf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> Result<u64, io::Error> {
|
||||
let pos = self.rw.bytes_written();
|
||||
|
||||
// Write the payload
|
||||
self.rw.write_all_borrowed(srcbuf, ctx).await?;
|
||||
|
||||
@@ -1,15 +1,15 @@
|
||||
//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the
|
||||
//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`].
|
||||
//!
|
||||
//! Subject to removal in <https://github.com/neondatabase/neon/pull/8537>
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::{self, PAGE_SZ};
|
||||
use crate::tenant::block_io::BlockLease;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice;
|
||||
use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use std::io::{self, ErrorKind};
|
||||
use std::ops::{Deref, Range};
|
||||
use std::io::{self};
|
||||
use tokio_epoll_uring::BoundedBuf;
|
||||
use tracing::*;
|
||||
|
||||
@@ -18,33 +18,17 @@ use super::zero_padded_read_write;
|
||||
/// See module-level comment.
|
||||
pub struct RW {
|
||||
page_cache_file_id: page_cache::FileId,
|
||||
rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
|
||||
rw: super::zero_padded_read_write::RW<size_tracking_writer::Writer<VirtualFile>>,
|
||||
/// Gate guard is held on as long as we need to do operations in the path (delete on drop).
|
||||
_gate_guard: utils::sync::gate::GateGuard,
|
||||
}
|
||||
|
||||
/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
|
||||
/// should we pre-warm the [`crate::page_cache`] with the contents?
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum PrewarmOnWrite {
|
||||
Yes,
|
||||
No,
|
||||
}
|
||||
|
||||
impl RW {
|
||||
pub fn new(
|
||||
file: VirtualFile,
|
||||
prewarm_on_write: PrewarmOnWrite,
|
||||
_gate_guard: utils::sync::gate::GateGuard,
|
||||
) -> Self {
|
||||
pub fn new(file: VirtualFile, _gate_guard: utils::sync::gate::GateGuard) -> Self {
|
||||
let page_cache_file_id = page_cache::next_file_id();
|
||||
Self {
|
||||
page_cache_file_id,
|
||||
rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new(
|
||||
page_cache_file_id,
|
||||
file,
|
||||
prewarm_on_write,
|
||||
)),
|
||||
rw: super::zero_padded_read_write::RW::new(size_tracking_writer::Writer::new(file)),
|
||||
_gate_guard,
|
||||
}
|
||||
}
|
||||
@@ -84,10 +68,10 @@ impl RW {
|
||||
let vec = Vec::with_capacity(size);
|
||||
|
||||
// read from disk what we've already flushed
|
||||
let writer = self.rw.as_writer();
|
||||
let flushed_range = writer.written_range();
|
||||
let mut vec = writer
|
||||
.file
|
||||
let file_size_tracking_writer = self.rw.as_writer();
|
||||
let flushed_range = 0..usize::try_from(file_size_tracking_writer.bytes_written()).unwrap();
|
||||
let mut vec = file_size_tracking_writer
|
||||
.as_inner()
|
||||
.read_exact_at(
|
||||
vec.slice(0..(flushed_range.end - flushed_range.start)),
|
||||
u64::try_from(flushed_range.start).unwrap(),
|
||||
@@ -122,7 +106,7 @@ impl RW {
|
||||
format!(
|
||||
"ephemeral file: read immutable page #{}: {}: {:#}",
|
||||
blknum,
|
||||
self.rw.as_writer().file.path,
|
||||
self.rw.as_writer().as_inner().path,
|
||||
e,
|
||||
),
|
||||
)
|
||||
@@ -132,7 +116,7 @@ impl RW {
|
||||
}
|
||||
page_cache::ReadBufResult::NotFound(write_guard) => {
|
||||
let write_guard = writer
|
||||
.file
|
||||
.as_inner()
|
||||
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx)
|
||||
.await?;
|
||||
let read_guard = write_guard.mark_valid();
|
||||
@@ -154,137 +138,16 @@ impl Drop for RW {
|
||||
|
||||
// unlink the file
|
||||
// we are clear to do this, because we have entered a gate
|
||||
let res = std::fs::remove_file(&self.rw.as_writer().file.path);
|
||||
let path = &self.rw.as_writer().as_inner().path;
|
||||
let res = std::fs::remove_file(path);
|
||||
if let Err(e) = res {
|
||||
if e.kind() != std::io::ErrorKind::NotFound {
|
||||
// just never log the not found errors, we cannot do anything for them; on detach
|
||||
// the tenant directory is already gone.
|
||||
//
|
||||
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
|
||||
error!(
|
||||
"could not remove ephemeral file '{}': {}",
|
||||
self.rw.as_writer().file.path,
|
||||
e
|
||||
);
|
||||
error!("could not remove ephemeral file '{path}': {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct PreWarmingWriter {
|
||||
prewarm_on_write: PrewarmOnWrite,
|
||||
nwritten_blocks: u32,
|
||||
page_cache_file_id: page_cache::FileId,
|
||||
file: VirtualFile,
|
||||
}
|
||||
|
||||
impl PreWarmingWriter {
|
||||
fn new(
|
||||
page_cache_file_id: page_cache::FileId,
|
||||
file: VirtualFile,
|
||||
prewarm_on_write: PrewarmOnWrite,
|
||||
) -> Self {
|
||||
Self {
|
||||
prewarm_on_write,
|
||||
nwritten_blocks: 0,
|
||||
page_cache_file_id,
|
||||
file,
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the byte range within `file` that has been written though `write_all`.
|
||||
///
|
||||
/// The returned range would be invalidated by another `write_all`. To prevent that, we capture `&_`.
|
||||
fn written_range(&self) -> (impl Deref<Target = Range<usize>> + '_) {
|
||||
let nwritten_blocks = usize::try_from(self.nwritten_blocks).unwrap();
|
||||
struct Wrapper(Range<usize>);
|
||||
impl Deref for Wrapper {
|
||||
type Target = Range<usize>;
|
||||
fn deref(&self) -> &Range<usize> {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
Wrapper(0..nwritten_blocks * PAGE_SZ)
|
||||
}
|
||||
}
|
||||
|
||||
impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter {
|
||||
async fn write_all<Buf: tokio_epoll_uring::IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: FullSlice<Buf>,
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<(usize, FullSlice<Buf>)> {
|
||||
let buflen = buf.len();
|
||||
assert_eq!(
|
||||
buflen % PAGE_SZ,
|
||||
0,
|
||||
"{buflen} ; we know TAIL_SZ is a PAGE_SZ multiple, and write_buffered_borrowed is used"
|
||||
);
|
||||
|
||||
// Do the IO.
|
||||
let buf = match self.file.write_all(buf, ctx).await {
|
||||
(buf, Ok(nwritten)) => {
|
||||
assert_eq!(nwritten, buflen);
|
||||
buf
|
||||
}
|
||||
(_, Err(e)) => {
|
||||
return Err(std::io::Error::new(
|
||||
ErrorKind::Other,
|
||||
// order error before path because path is long and error is short
|
||||
format!(
|
||||
"ephemeral_file: write_blob: write-back tail self.nwritten_blocks={}, buflen={}, {:#}: {}",
|
||||
self.nwritten_blocks, buflen, e, self.file.path,
|
||||
),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
let nblocks = buflen / PAGE_SZ;
|
||||
let nblocks32 = u32::try_from(nblocks).unwrap();
|
||||
|
||||
if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) {
|
||||
// Pre-warm page cache with the contents.
|
||||
// At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
|
||||
// benefits the code that writes InMemoryLayer=>L0 layers.
|
||||
|
||||
let cache = page_cache::get();
|
||||
static CTX: Lazy<RequestContext> = Lazy::new(|| {
|
||||
RequestContext::new(
|
||||
crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
|
||||
crate::context::DownloadBehavior::Error,
|
||||
)
|
||||
});
|
||||
for blknum_in_buffer in 0..nblocks {
|
||||
let blk_in_buffer =
|
||||
&buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
|
||||
let blknum = self
|
||||
.nwritten_blocks
|
||||
.checked_add(blknum_in_buffer as u32)
|
||||
.unwrap();
|
||||
match cache
|
||||
.read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
|
||||
.await
|
||||
{
|
||||
Err(e) => {
|
||||
error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
|
||||
// fail gracefully, it's not the end of the world if we can't pre-warm the cache here
|
||||
}
|
||||
Ok(v) => match v {
|
||||
page_cache::ReadBufResult::Found(_guard) => {
|
||||
// This function takes &mut self, so, it shouldn't be possible to reach this point.
|
||||
unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
|
||||
and this function takes &mut self, so, no concurrent read_blk is possible");
|
||||
}
|
||||
page_cache::ReadBufResult::NotFound(mut write_guard) => {
|
||||
write_guard.copy_from_slice(blk_in_buffer);
|
||||
let _ = write_guard.mark_valid();
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap();
|
||||
Ok((buflen, buf))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -464,7 +464,7 @@ impl LayerMap {
|
||||
pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
|
||||
// TODO: See #3869, resulting #4088, attempted fix and repro #4094
|
||||
|
||||
if Self::is_l0(&layer_desc.key_range) {
|
||||
if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
|
||||
self.l0_delta_layers.push(layer_desc.clone().into());
|
||||
}
|
||||
|
||||
@@ -483,7 +483,7 @@ impl LayerMap {
|
||||
self.historic
|
||||
.remove(historic_layer_coverage::LayerKey::from(layer_desc));
|
||||
let layer_key = layer_desc.key();
|
||||
if Self::is_l0(&layer_desc.key_range) {
|
||||
if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
|
||||
let len_before = self.l0_delta_layers.len();
|
||||
let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
|
||||
l0_delta_layers.retain(|other| other.key() != layer_key);
|
||||
@@ -600,8 +600,8 @@ impl LayerMap {
|
||||
}
|
||||
|
||||
/// Check if the key range resembles that of an L0 layer.
|
||||
pub fn is_l0(key_range: &Range<Key>) -> bool {
|
||||
key_range == &(Key::MIN..Key::MAX)
|
||||
pub fn is_l0(key_range: &Range<Key>, is_delta_layer: bool) -> bool {
|
||||
is_delta_layer && key_range == &(Key::MIN..Key::MAX)
|
||||
}
|
||||
|
||||
/// This function determines which layers are counted in `count_deltas`:
|
||||
@@ -628,7 +628,7 @@ impl LayerMap {
|
||||
/// than just the current partition_range.
|
||||
pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range<Key>) -> bool {
|
||||
// Case 1
|
||||
if !Self::is_l0(&layer.key_range) {
|
||||
if !Self::is_l0(&layer.key_range, layer.is_delta) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -565,7 +565,7 @@ mod tests {
|
||||
);
|
||||
let expected_bytes = vec![
|
||||
/* TimelineMetadataHeader */
|
||||
4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
|
||||
74, 104, 158, 105, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
|
||||
/* TimelineMetadataBodyV2 */
|
||||
0, 0, 0, 0, 0, 0, 2, 0, // disk_consistent_lsn (8 bytes)
|
||||
1, 0, 0, 0, 0, 0, 0, 1, 0, // prev_record_lsn (9 bytes)
|
||||
@@ -574,7 +574,7 @@ mod tests {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, // ancestor_lsn (8 bytes)
|
||||
0, 0, 0, 0, 0, 0, 0, 0, // latest_gc_cutoff_lsn (8 bytes)
|
||||
0, 0, 0, 0, 0, 0, 0, 0, // initdb_lsn (8 bytes)
|
||||
0, 0, 0, 15, // pg_version (4 bytes)
|
||||
0, 0, 0, 16, // pg_version (4 bytes)
|
||||
/* padding bytes */
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
@@ -1728,7 +1728,7 @@ impl RemoteTimelineClient {
|
||||
task_mgr::spawn(
|
||||
&self.runtime,
|
||||
TaskKind::RemoteUploadTask,
|
||||
Some(self.tenant_shard_id),
|
||||
self.tenant_shard_id,
|
||||
Some(self.timeline_id),
|
||||
"remote upload",
|
||||
async move {
|
||||
|
||||
@@ -8,6 +8,7 @@ use std::{sync::Arc, time::SystemTime};
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
disk_usage_eviction_task::DiskUsageEvictionInfo,
|
||||
metrics::SECONDARY_HEATMAP_TOTAL_SIZE,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
};
|
||||
|
||||
@@ -105,6 +106,9 @@ pub(crate) struct SecondaryTenant {
|
||||
|
||||
// Sum of layer sizes on local disk
|
||||
pub(super) resident_size_metric: UIntGauge,
|
||||
|
||||
// Sum of layer sizes in the most recently downloaded heatmap
|
||||
pub(super) heatmap_total_size_metric: UIntGauge,
|
||||
}
|
||||
|
||||
impl Drop for SecondaryTenant {
|
||||
@@ -112,6 +116,7 @@ impl Drop for SecondaryTenant {
|
||||
let tenant_id = self.tenant_shard_id.tenant_id.to_string();
|
||||
let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
|
||||
let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
|
||||
let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -128,6 +133,10 @@ impl SecondaryTenant {
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id])
|
||||
.unwrap();
|
||||
|
||||
let heatmap_total_size_metric = SECONDARY_HEATMAP_TOTAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id])
|
||||
.unwrap();
|
||||
|
||||
Arc::new(Self {
|
||||
tenant_shard_id,
|
||||
// todo: shall we make this a descendent of the
|
||||
@@ -145,6 +154,7 @@ impl SecondaryTenant {
|
||||
progress: std::sync::Mutex::default(),
|
||||
|
||||
resident_size_metric,
|
||||
heatmap_total_size_metric,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -829,6 +829,12 @@ impl<'a> TenantDownloader<'a> {
|
||||
layers_downloaded: 0,
|
||||
bytes_downloaded: 0,
|
||||
};
|
||||
|
||||
// Also expose heatmap bytes_total as a metric
|
||||
self.secondary_state
|
||||
.heatmap_total_size_metric
|
||||
.set(heatmap_stats.bytes);
|
||||
|
||||
// Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock
|
||||
let mut delete_layers = Vec::new();
|
||||
let mut delete_timelines = Vec::new();
|
||||
|
||||
@@ -2,13 +2,12 @@
|
||||
|
||||
pub mod delta_layer;
|
||||
pub mod image_layer;
|
||||
pub(crate) mod inmemory_layer;
|
||||
pub mod inmemory_layer;
|
||||
pub(crate) mod layer;
|
||||
mod layer_desc;
|
||||
mod layer_name;
|
||||
pub mod merge_iterator;
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod split_writer;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
|
||||
@@ -36,6 +36,7 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
|
||||
use crate::tenant::disk_btree::{
|
||||
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
|
||||
};
|
||||
use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
@@ -232,6 +233,18 @@ pub struct DeltaLayerInner {
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
}
|
||||
|
||||
impl DeltaLayerInner {
|
||||
pub(crate) fn layer_dbg_info(&self) -> String {
|
||||
format!(
|
||||
"delta {}..{} {}..{}",
|
||||
self.key_range().start,
|
||||
self.key_range().end,
|
||||
self.lsn_range().start,
|
||||
self.lsn_range().end
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for DeltaLayerInner {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("DeltaLayerInner")
|
||||
@@ -556,7 +569,6 @@ impl DeltaLayerWriterInner {
|
||||
// 5GB limit for objects without multipart upload (which we don't want to use)
|
||||
// Make it a little bit below to account for differing GB units
|
||||
// https://docs.aws.amazon.com/AmazonS3/latest/userguide/upload-objects.html
|
||||
const S3_UPLOAD_LIMIT: u64 = 4_500_000_000;
|
||||
ensure!(
|
||||
metadata.len() <= S3_UPLOAD_LIMIT,
|
||||
"Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!",
|
||||
@@ -690,12 +702,10 @@ impl DeltaLayerWriter {
|
||||
self.inner.take().unwrap().finish(key_end, ctx).await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn num_keys(&self) -> usize {
|
||||
self.inner.as_ref().unwrap().num_keys
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn estimated_size(&self) -> u64 {
|
||||
let inner = self.inner.as_ref().unwrap();
|
||||
inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
|
||||
@@ -1527,6 +1537,10 @@ pub struct DeltaLayerIterator<'a> {
|
||||
}
|
||||
|
||||
impl<'a> DeltaLayerIterator<'a> {
|
||||
pub(crate) fn layer_dbg_info(&self) -> String {
|
||||
self.delta_layer.layer_dbg_info()
|
||||
}
|
||||
|
||||
/// Retrieve a batch of key-value pairs into the iterator buffer.
|
||||
async fn next_batch(&mut self) -> anyhow::Result<()> {
|
||||
assert!(self.key_values_batch.is_empty());
|
||||
|
||||
@@ -167,6 +167,17 @@ pub struct ImageLayerInner {
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
}
|
||||
|
||||
impl ImageLayerInner {
|
||||
pub(crate) fn layer_dbg_info(&self) -> String {
|
||||
format!(
|
||||
"image {}..{} {}",
|
||||
self.key_range().start,
|
||||
self.key_range().end,
|
||||
self.lsn()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ImageLayerInner {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("ImageLayerInner")
|
||||
@@ -705,10 +716,6 @@ struct ImageLayerWriterInner {
|
||||
}
|
||||
|
||||
impl ImageLayerWriterInner {
|
||||
fn size(&self) -> u64 {
|
||||
self.tree.borrow_writer().size() + self.blob_writer.size()
|
||||
}
|
||||
|
||||
///
|
||||
/// Start building a new image layer.
|
||||
///
|
||||
@@ -843,13 +850,19 @@ impl ImageLayerWriterInner {
|
||||
res?;
|
||||
}
|
||||
|
||||
let final_key_range = if let Some(end_key) = end_key {
|
||||
self.key_range.start..end_key
|
||||
} else {
|
||||
self.key_range.clone()
|
||||
};
|
||||
|
||||
// Fill in the summary on blk 0
|
||||
let summary = Summary {
|
||||
magic: IMAGE_FILE_MAGIC,
|
||||
format_version: STORAGE_FORMAT_VERSION,
|
||||
tenant_id: self.tenant_shard_id.tenant_id,
|
||||
timeline_id: self.timeline_id,
|
||||
key_range: self.key_range.clone(),
|
||||
key_range: final_key_range.clone(),
|
||||
lsn: self.lsn,
|
||||
index_start_blk,
|
||||
index_root_blk,
|
||||
@@ -870,11 +883,7 @@ impl ImageLayerWriterInner {
|
||||
let desc = PersistentLayerDesc::new_img(
|
||||
self.tenant_shard_id,
|
||||
self.timeline_id,
|
||||
if let Some(end_key) = end_key {
|
||||
self.key_range.start..end_key
|
||||
} else {
|
||||
self.key_range.clone()
|
||||
},
|
||||
final_key_range,
|
||||
self.lsn,
|
||||
metadata.len(),
|
||||
);
|
||||
@@ -963,14 +972,12 @@ impl ImageLayerWriter {
|
||||
self.inner.as_mut().unwrap().put_image(key, img, ctx).await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
/// Estimated size of the image layer.
|
||||
pub(crate) fn estimated_size(&self) -> u64 {
|
||||
let inner = self.inner.as_ref().unwrap();
|
||||
inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn num_keys(&self) -> usize {
|
||||
self.inner.as_ref().unwrap().num_keys
|
||||
}
|
||||
@@ -986,7 +993,6 @@ impl ImageLayerWriter {
|
||||
self.inner.take().unwrap().finish(timeline, ctx, None).await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
/// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
|
||||
pub(super) async fn finish_with_end_key(
|
||||
mut self,
|
||||
@@ -1000,10 +1006,6 @@ impl ImageLayerWriter {
|
||||
.finish(timeline, ctx, Some(end_key))
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) fn size(&self) -> u64 {
|
||||
self.inner.as_ref().unwrap().size()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ImageLayerWriter {
|
||||
@@ -1024,6 +1026,10 @@ pub struct ImageLayerIterator<'a> {
|
||||
}
|
||||
|
||||
impl<'a> ImageLayerIterator<'a> {
|
||||
pub(crate) fn layer_dbg_info(&self) -> String {
|
||||
self.image_layer.layer_dbg_info()
|
||||
}
|
||||
|
||||
/// Retrieve a batch of key-value pairs into the iterator buffer.
|
||||
async fn next_batch(&mut self) -> anyhow::Result<()> {
|
||||
assert!(self.key_values_batch.is_empty());
|
||||
|
||||
@@ -13,7 +13,7 @@ use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
|
||||
use crate::{l0_flush, page_cache, walrecord};
|
||||
use crate::{l0_flush, page_cache};
|
||||
use anyhow::{anyhow, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::key::CompactKey;
|
||||
@@ -33,7 +33,7 @@ use std::fmt::Write;
|
||||
use std::ops::Range;
|
||||
use std::sync::atomic::Ordering as AtomicOrdering;
|
||||
use std::sync::atomic::{AtomicU64, AtomicUsize};
|
||||
use tokio::sync::{RwLock, RwLockWriteGuard};
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
use super::{
|
||||
DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
|
||||
@@ -249,9 +249,7 @@ impl InMemoryLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
///
|
||||
/// this is likely completly unused
|
||||
pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
let inner = self.inner.read().await;
|
||||
|
||||
pub async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
let end_str = self.end_lsn_or_max();
|
||||
|
||||
println!(
|
||||
@@ -259,39 +257,6 @@ impl InMemoryLayer {
|
||||
self.timeline_id, self.start_lsn, end_str,
|
||||
);
|
||||
|
||||
if !verbose {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let cursor = inner.file.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (key, vec_map) in inner.index.iter() {
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
let mut desc = String::new();
|
||||
cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
|
||||
let val = Value::des(&buf);
|
||||
match val {
|
||||
Ok(Value::Image(img)) => {
|
||||
write!(&mut desc, " img {} bytes", img.len())?;
|
||||
}
|
||||
Ok(Value::WalRecord(rec)) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec).unwrap();
|
||||
write!(
|
||||
&mut desc,
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)?;
|
||||
}
|
||||
Err(err) => {
|
||||
write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
|
||||
}
|
||||
}
|
||||
println!(" key {} at {}: {}", key, lsn, desc);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -355,6 +320,82 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Offset of a particular Value within a serialized batch.
|
||||
struct SerializedBatchOffset {
|
||||
key: CompactKey,
|
||||
lsn: Lsn,
|
||||
/// offset in bytes from the start of the batch's buffer to the Value's serialized size header.
|
||||
offset: u64,
|
||||
}
|
||||
|
||||
pub struct SerializedBatch {
|
||||
/// Blobs serialized in EphemeralFile's native format, ready for passing to [`EphemeralFile::write_raw`].
|
||||
pub(crate) raw: Vec<u8>,
|
||||
|
||||
/// Index of values in [`Self::raw`], using offsets relative to the start of the buffer.
|
||||
offsets: Vec<SerializedBatchOffset>,
|
||||
|
||||
/// The highest LSN of any value in the batch
|
||||
pub(crate) max_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl SerializedBatch {
|
||||
/// Write a blob length in the internal format of the EphemeralFile
|
||||
pub(crate) fn write_blob_length(len: usize, cursor: &mut std::io::Cursor<Vec<u8>>) {
|
||||
use std::io::Write;
|
||||
|
||||
if len < 0x80 {
|
||||
// short one-byte length header
|
||||
let len_buf = [len as u8];
|
||||
|
||||
cursor
|
||||
.write_all(&len_buf)
|
||||
.expect("Writing to Vec is infallible");
|
||||
} else {
|
||||
let mut len_buf = u32::to_be_bytes(len as u32);
|
||||
len_buf[0] |= 0x80;
|
||||
cursor
|
||||
.write_all(&len_buf)
|
||||
.expect("Writing to Vec is infallible");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> Self {
|
||||
// Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by
|
||||
// [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`]
|
||||
let buffer_size = batch.iter().map(|i| i.2).sum::<usize>() + 4 * batch.len();
|
||||
let mut cursor = std::io::Cursor::new(Vec::<u8>::with_capacity(buffer_size));
|
||||
|
||||
let mut offsets: Vec<SerializedBatchOffset> = Vec::with_capacity(batch.len());
|
||||
let mut max_lsn: Lsn = Lsn(0);
|
||||
for (key, lsn, val_ser_size, val) in batch {
|
||||
let relative_off = cursor.position();
|
||||
|
||||
Self::write_blob_length(val_ser_size, &mut cursor);
|
||||
val.ser_into(&mut cursor)
|
||||
.expect("Writing into in-memory buffer is infallible");
|
||||
|
||||
offsets.push(SerializedBatchOffset {
|
||||
key,
|
||||
lsn,
|
||||
offset: relative_off,
|
||||
});
|
||||
max_lsn = std::cmp::max(max_lsn, lsn);
|
||||
}
|
||||
|
||||
let buffer = cursor.into_inner();
|
||||
|
||||
// Assert that we didn't do any extra allocations while building buffer.
|
||||
debug_assert!(buffer.len() <= buffer_size);
|
||||
|
||||
Self {
|
||||
raw: buffer,
|
||||
offsets,
|
||||
max_lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
|
||||
write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
|
||||
}
|
||||
@@ -415,37 +456,20 @@ impl InMemoryLayer {
|
||||
})
|
||||
}
|
||||
|
||||
// Write operations
|
||||
|
||||
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
|
||||
/// Adds the page version to the in-memory tree
|
||||
pub async fn put_value(
|
||||
// Write path.
|
||||
pub async fn put_batch(
|
||||
&self,
|
||||
key: CompactKey,
|
||||
lsn: Lsn,
|
||||
buf: &[u8],
|
||||
serialized_batch: SerializedBatch,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
let mut inner = self.inner.write().await;
|
||||
self.assert_writable();
|
||||
self.put_value_locked(&mut inner, key, lsn, buf, ctx).await
|
||||
}
|
||||
|
||||
async fn put_value_locked(
|
||||
&self,
|
||||
locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
|
||||
key: CompactKey,
|
||||
lsn: Lsn,
|
||||
buf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
|
||||
|
||||
let off = {
|
||||
locked_inner
|
||||
let base_off = {
|
||||
inner
|
||||
.file
|
||||
.write_blob(
|
||||
buf,
|
||||
.write_raw(
|
||||
&serialized_batch.raw,
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
.build(),
|
||||
@@ -453,15 +477,23 @@ impl InMemoryLayer {
|
||||
.await?
|
||||
};
|
||||
|
||||
let vec_map = locked_inner.index.entry(key).or_default();
|
||||
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
warn!("Key {} at {} already exists", key, lsn);
|
||||
for SerializedBatchOffset {
|
||||
key,
|
||||
lsn,
|
||||
offset: relative_off,
|
||||
} in serialized_batch.offsets
|
||||
{
|
||||
let off = base_off + relative_off;
|
||||
let vec_map = inner.index.entry(key).or_default();
|
||||
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
warn!("Key {} at {} already exists", key, lsn);
|
||||
}
|
||||
}
|
||||
|
||||
let size = locked_inner.file.len();
|
||||
locked_inner.resource_units.maybe_publish_size(size);
|
||||
let size = inner.file.len();
|
||||
inner.resource_units.maybe_publish_size(size);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -536,7 +568,6 @@ impl InMemoryLayer {
|
||||
|
||||
use l0_flush::Inner;
|
||||
let _concurrency_permit = match l0_flush_global_state {
|
||||
Inner::PageCached => None,
|
||||
Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
|
||||
};
|
||||
|
||||
@@ -568,34 +599,6 @@ impl InMemoryLayer {
|
||||
.await?;
|
||||
|
||||
match l0_flush_global_state {
|
||||
l0_flush::Inner::PageCached => {
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
.build();
|
||||
|
||||
let mut buf = Vec::new();
|
||||
|
||||
let cursor = inner.file.block_cursor();
|
||||
|
||||
for (key, vec_map) in inner.index.iter() {
|
||||
// Write all page versions
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
|
||||
let will_init = Value::des(&buf)?.will_init();
|
||||
let (tmp, res) = delta_layer_writer
|
||||
.put_value_bytes(
|
||||
Key::from_compact(*key),
|
||||
*lsn,
|
||||
buf.slice_len(),
|
||||
will_init,
|
||||
&ctx,
|
||||
)
|
||||
.await;
|
||||
res?;
|
||||
buf = tmp.into_raw_slice().into_inner();
|
||||
}
|
||||
}
|
||||
}
|
||||
l0_flush::Inner::Direct { .. } => {
|
||||
let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
|
||||
assert_eq!(
|
||||
|
||||
@@ -35,6 +35,8 @@ mod tests;
|
||||
#[cfg(test)]
|
||||
mod failpoints;
|
||||
|
||||
pub const S3_UPLOAD_LIMIT: u64 = 4_500_000_000;
|
||||
|
||||
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
|
||||
/// range of LSNs.
|
||||
///
|
||||
@@ -1296,7 +1298,10 @@ impl LayerInner {
|
||||
lsn_end: lsn_range.end,
|
||||
remote: !resident,
|
||||
access_stats,
|
||||
l0: crate::tenant::layer_map::LayerMap::is_l0(&self.layer_desc().key_range),
|
||||
l0: crate::tenant::layer_map::LayerMap::is_l0(
|
||||
&self.layer_desc().key_range,
|
||||
self.layer_desc().is_delta,
|
||||
),
|
||||
}
|
||||
} else {
|
||||
let lsn = self.desc.image_layer_lsn();
|
||||
|
||||
@@ -256,6 +256,10 @@ impl LayerName {
|
||||
LayerName::Delta(layer) => &layer.key_range,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_delta(&self) -> bool {
|
||||
matches!(self, LayerName::Delta(_))
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for LayerName {
|
||||
|
||||
@@ -3,6 +3,7 @@ use std::{
|
||||
collections::{binary_heap, BinaryHeap},
|
||||
};
|
||||
|
||||
use anyhow::bail;
|
||||
use pageserver_api::key::Key;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -26,6 +27,13 @@ impl<'a> LayerRef<'a> {
|
||||
Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)),
|
||||
}
|
||||
}
|
||||
|
||||
fn layer_dbg_info(&self) -> String {
|
||||
match self {
|
||||
Self::Image(x) => x.layer_dbg_info(),
|
||||
Self::Delta(x) => x.layer_dbg_info(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
enum LayerIterRef<'a> {
|
||||
@@ -40,6 +48,13 @@ impl LayerIterRef<'_> {
|
||||
Self::Image(x) => x.next().await,
|
||||
}
|
||||
}
|
||||
|
||||
fn layer_dbg_info(&self) -> String {
|
||||
match self {
|
||||
Self::Image(x) => x.layer_dbg_info(),
|
||||
Self::Delta(x) => x.layer_dbg_info(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This type plays several roles at once
|
||||
@@ -75,6 +90,11 @@ impl<'a> PeekableLayerIterRef<'a> {
|
||||
async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
|
||||
let result = self.peeked.take();
|
||||
self.peeked = self.iter.next().await?;
|
||||
if let (Some((k1, l1, _)), Some((k2, l2, _))) = (&self.peeked, &result) {
|
||||
if (k1, l1) < (k2, l2) {
|
||||
bail!("iterator is not ordered: {}", self.iter.layer_dbg_info());
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
@@ -178,7 +198,12 @@ impl<'a> IteratorWrapper<'a> {
|
||||
let iter = PeekableLayerIterRef::create(iter).await?;
|
||||
if let Some((k1, l1, _)) = iter.peek() {
|
||||
let (k2, l2) = first_key_lower_bound;
|
||||
debug_assert!((k1, l1) >= (k2, l2));
|
||||
if (k1, l1) < (k2, l2) {
|
||||
bail!(
|
||||
"layer key range did not include the first key in the layer: {}",
|
||||
layer.layer_dbg_info()
|
||||
);
|
||||
}
|
||||
}
|
||||
*self = Self::Loaded { iter };
|
||||
Ok(())
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::{ops::Range, sync::Arc};
|
||||
use std::{future::Future, ops::Range, sync::Arc};
|
||||
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::key::{Key, KEY_SIZE};
|
||||
@@ -7,7 +7,32 @@ use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};
|
||||
use crate::tenant::storage_layer::Layer;
|
||||
use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline};
|
||||
|
||||
use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer};
|
||||
use super::layer::S3_UPLOAD_LIMIT;
|
||||
use super::{
|
||||
DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer,
|
||||
};
|
||||
|
||||
pub(crate) enum SplitWriterResult {
|
||||
Produced(ResidentLayer),
|
||||
Discarded(PersistentLayerKey),
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl SplitWriterResult {
|
||||
fn into_resident_layer(self) -> ResidentLayer {
|
||||
match self {
|
||||
SplitWriterResult::Produced(layer) => layer,
|
||||
SplitWriterResult::Discarded(_) => panic!("unexpected discarded layer"),
|
||||
}
|
||||
}
|
||||
|
||||
fn into_discarded_layer(self) -> PersistentLayerKey {
|
||||
match self {
|
||||
SplitWriterResult::Produced(_) => panic!("unexpected produced layer"),
|
||||
SplitWriterResult::Discarded(layer) => layer,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An image writer that takes images and produces multiple image layers. The interface does not
|
||||
/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
|
||||
@@ -16,11 +41,12 @@ use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer};
|
||||
pub struct SplitImageLayerWriter {
|
||||
inner: ImageLayerWriter,
|
||||
target_layer_size: u64,
|
||||
generated_layers: Vec<ResidentLayer>,
|
||||
generated_layers: Vec<SplitWriterResult>,
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
lsn: Lsn,
|
||||
start_key: Key,
|
||||
}
|
||||
|
||||
impl SplitImageLayerWriter {
|
||||
@@ -49,16 +75,22 @@ impl SplitImageLayerWriter {
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
lsn,
|
||||
start_key,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn put_image(
|
||||
pub async fn put_image_with_discard_fn<D, F>(
|
||||
&mut self,
|
||||
key: Key,
|
||||
img: Bytes,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
discard: D,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
D: FnOnce(&PersistentLayerKey) -> F,
|
||||
F: Future<Output = bool>,
|
||||
{
|
||||
// The current estimation is an upper bound of the space that the key/image could take
|
||||
// because we did not consider compression in this estimation. The resulting image layer
|
||||
// could be smaller than the target size.
|
||||
@@ -76,33 +108,87 @@ impl SplitImageLayerWriter {
|
||||
)
|
||||
.await?;
|
||||
let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
|
||||
self.generated_layers.push(
|
||||
prev_image_writer
|
||||
.finish_with_end_key(tline, key, ctx)
|
||||
.await?,
|
||||
);
|
||||
let layer_key = PersistentLayerKey {
|
||||
key_range: self.start_key..key,
|
||||
lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn),
|
||||
is_delta: false,
|
||||
};
|
||||
self.start_key = key;
|
||||
|
||||
if discard(&layer_key).await {
|
||||
drop(prev_image_writer);
|
||||
self.generated_layers
|
||||
.push(SplitWriterResult::Discarded(layer_key));
|
||||
} else {
|
||||
self.generated_layers.push(SplitWriterResult::Produced(
|
||||
prev_image_writer
|
||||
.finish_with_end_key(tline, key, ctx)
|
||||
.await?,
|
||||
));
|
||||
}
|
||||
}
|
||||
self.inner.put_image(key, img, ctx).await
|
||||
}
|
||||
|
||||
pub(crate) async fn finish(
|
||||
#[cfg(test)]
|
||||
pub async fn put_image(
|
||||
&mut self,
|
||||
key: Key,
|
||||
img: Bytes,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
self.put_image_with_discard_fn(key, img, tline, ctx, |_| async { false })
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn finish_with_discard_fn<D, F>(
|
||||
self,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
end_key: Key,
|
||||
) -> anyhow::Result<Vec<ResidentLayer>> {
|
||||
discard: D,
|
||||
) -> anyhow::Result<Vec<SplitWriterResult>>
|
||||
where
|
||||
D: FnOnce(&PersistentLayerKey) -> F,
|
||||
F: Future<Output = bool>,
|
||||
{
|
||||
let Self {
|
||||
mut generated_layers,
|
||||
inner,
|
||||
..
|
||||
} = self;
|
||||
generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?);
|
||||
if inner.num_keys() == 0 {
|
||||
return Ok(generated_layers);
|
||||
}
|
||||
let layer_key = PersistentLayerKey {
|
||||
key_range: self.start_key..end_key,
|
||||
lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn),
|
||||
is_delta: false,
|
||||
};
|
||||
if discard(&layer_key).await {
|
||||
generated_layers.push(SplitWriterResult::Discarded(layer_key));
|
||||
} else {
|
||||
generated_layers.push(SplitWriterResult::Produced(
|
||||
inner.finish_with_end_key(tline, end_key, ctx).await?,
|
||||
));
|
||||
}
|
||||
Ok(generated_layers)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) async fn finish(
|
||||
self,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
end_key: Key,
|
||||
) -> anyhow::Result<Vec<SplitWriterResult>> {
|
||||
self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false })
|
||||
.await
|
||||
}
|
||||
|
||||
/// When split writer fails, the caller should call this function and handle partially generated layers.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, ImageLayerWriter)> {
|
||||
pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, ImageLayerWriter)> {
|
||||
Ok((self.generated_layers, self.inner))
|
||||
}
|
||||
}
|
||||
@@ -110,15 +196,21 @@ impl SplitImageLayerWriter {
|
||||
/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not
|
||||
/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files
|
||||
/// to be cleaned up).
|
||||
///
|
||||
/// Note that if updates of a single key exceed the target size limit, all of the updates will be batched
|
||||
/// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm
|
||||
/// will split them into multiple files based on size.
|
||||
#[must_use]
|
||||
pub struct SplitDeltaLayerWriter {
|
||||
inner: DeltaLayerWriter,
|
||||
target_layer_size: u64,
|
||||
generated_layers: Vec<ResidentLayer>,
|
||||
generated_layers: Vec<SplitWriterResult>,
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
lsn_range: Range<Lsn>,
|
||||
last_key_written: Key,
|
||||
start_key: Key,
|
||||
}
|
||||
|
||||
impl SplitDeltaLayerWriter {
|
||||
@@ -147,9 +239,74 @@ impl SplitDeltaLayerWriter {
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
lsn_range,
|
||||
last_key_written: Key::MIN,
|
||||
start_key,
|
||||
})
|
||||
}
|
||||
|
||||
/// Put value into the layer writer. In the case the writer decides to produce a layer, and the discard fn returns true, no layer will be written in the end.
|
||||
pub async fn put_value_with_discard_fn<D, F>(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
val: Value,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
discard: D,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
D: FnOnce(&PersistentLayerKey) -> F,
|
||||
F: Future<Output = bool>,
|
||||
{
|
||||
// The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
|
||||
// number, and therefore the final layer size could be a little bit larger or smaller than the target.
|
||||
//
|
||||
// Also, keep all updates of a single key in a single file. TODO: split them using the legacy compaction
|
||||
// strategy. https://github.com/neondatabase/neon/issues/8837
|
||||
let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
|
||||
if self.inner.num_keys() >= 1
|
||||
&& self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
|
||||
{
|
||||
if key != self.last_key_written {
|
||||
let next_delta_writer = DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
key,
|
||||
self.lsn_range.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
|
||||
let layer_key = PersistentLayerKey {
|
||||
key_range: self.start_key..key,
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
is_delta: true,
|
||||
};
|
||||
self.start_key = key;
|
||||
if discard(&layer_key).await {
|
||||
drop(prev_delta_writer);
|
||||
self.generated_layers
|
||||
.push(SplitWriterResult::Discarded(layer_key));
|
||||
} else {
|
||||
let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
|
||||
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
|
||||
self.generated_layers
|
||||
.push(SplitWriterResult::Produced(delta_layer));
|
||||
}
|
||||
} else if self.inner.estimated_size() >= S3_UPLOAD_LIMIT {
|
||||
// We have to produce a very large file b/c a key is updated too often.
|
||||
anyhow::bail!(
|
||||
"a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced",
|
||||
key,
|
||||
self.inner.estimated_size()
|
||||
);
|
||||
}
|
||||
}
|
||||
self.last_key_written = key;
|
||||
self.inner.put_value(key, lsn, val, ctx).await
|
||||
}
|
||||
|
||||
pub async fn put_value(
|
||||
&mut self,
|
||||
key: Key,
|
||||
@@ -158,56 +315,66 @@ impl SplitDeltaLayerWriter {
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
|
||||
// number, and therefore the final layer size could be a little bit larger or smaller than the target.
|
||||
let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
|
||||
if self.inner.num_keys() >= 1
|
||||
&& self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
|
||||
{
|
||||
let next_delta_writer = DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
key,
|
||||
self.lsn_range.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
|
||||
let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
|
||||
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
|
||||
self.generated_layers.push(delta_layer);
|
||||
}
|
||||
self.inner.put_value(key, lsn, val, ctx).await
|
||||
self.put_value_with_discard_fn(key, lsn, val, tline, ctx, |_| async { false })
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn finish(
|
||||
pub(crate) async fn finish_with_discard_fn<D, F>(
|
||||
self,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
end_key: Key,
|
||||
) -> anyhow::Result<Vec<ResidentLayer>> {
|
||||
discard: D,
|
||||
) -> anyhow::Result<Vec<SplitWriterResult>>
|
||||
where
|
||||
D: FnOnce(&PersistentLayerKey) -> F,
|
||||
F: Future<Output = bool>,
|
||||
{
|
||||
let Self {
|
||||
mut generated_layers,
|
||||
inner,
|
||||
..
|
||||
} = self;
|
||||
|
||||
let (desc, path) = inner.finish(end_key, ctx).await?;
|
||||
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
|
||||
generated_layers.push(delta_layer);
|
||||
if inner.num_keys() == 0 {
|
||||
return Ok(generated_layers);
|
||||
}
|
||||
let layer_key = PersistentLayerKey {
|
||||
key_range: self.start_key..end_key,
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
is_delta: true,
|
||||
};
|
||||
if discard(&layer_key).await {
|
||||
generated_layers.push(SplitWriterResult::Discarded(layer_key));
|
||||
} else {
|
||||
let (desc, path) = inner.finish(end_key, ctx).await?;
|
||||
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
|
||||
generated_layers.push(SplitWriterResult::Produced(delta_layer));
|
||||
}
|
||||
Ok(generated_layers)
|
||||
}
|
||||
|
||||
/// When split writer fails, the caller should call this function and handle partially generated layers.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, DeltaLayerWriter)> {
|
||||
pub(crate) async fn finish(
|
||||
self,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
end_key: Key,
|
||||
) -> anyhow::Result<Vec<SplitWriterResult>> {
|
||||
self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false })
|
||||
.await
|
||||
}
|
||||
|
||||
/// When split writer fails, the caller should call this function and handle partially generated layers.
|
||||
pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, DeltaLayerWriter)> {
|
||||
Ok((self.generated_layers, self.inner))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use itertools::Itertools;
|
||||
use rand::{RngCore, SeedableRng};
|
||||
|
||||
use crate::{
|
||||
tenant::{
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
@@ -229,7 +396,10 @@ mod tests {
|
||||
}
|
||||
|
||||
fn get_large_img() -> Bytes {
|
||||
vec![0; 8192].into()
|
||||
let mut rng = rand::rngs::SmallRng::seed_from_u64(42);
|
||||
let mut data = vec![0; 8192];
|
||||
rng.fill_bytes(&mut data);
|
||||
data.into()
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -297,9 +467,16 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn write_split() {
|
||||
let harness = TenantHarness::create("split_writer_write_split")
|
||||
.await
|
||||
.unwrap();
|
||||
write_split_helper("split_writer_write_split", false).await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn write_split_discard() {
|
||||
write_split_helper("split_writer_write_split_discard", false).await;
|
||||
}
|
||||
|
||||
async fn write_split_helper(harness_name: &'static str, discard: bool) {
|
||||
let harness = TenantHarness::create(harness_name).await.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
@@ -333,16 +510,19 @@ mod tests {
|
||||
for i in 0..N {
|
||||
let i = i as u32;
|
||||
image_writer
|
||||
.put_image(get_key(i), get_large_img(), &tline, &ctx)
|
||||
.put_image_with_discard_fn(get_key(i), get_large_img(), &tline, &ctx, |_| async {
|
||||
discard
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
delta_writer
|
||||
.put_value(
|
||||
.put_value_with_discard_fn(
|
||||
get_key(i),
|
||||
Lsn(0x20),
|
||||
Value::Image(get_large_img()),
|
||||
&tline,
|
||||
&ctx,
|
||||
|_| async { discard },
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -355,22 +535,39 @@ mod tests {
|
||||
.finish(&tline, &ctx, get_key(N as u32))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(image_layers.len(), N / 512 + 1);
|
||||
assert_eq!(delta_layers.len(), N / 512 + 1);
|
||||
for idx in 0..image_layers.len() {
|
||||
assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
|
||||
assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
|
||||
assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN);
|
||||
assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX);
|
||||
if idx > 0 {
|
||||
assert_eq!(
|
||||
image_layers[idx - 1].layer_desc().key_range.end,
|
||||
image_layers[idx].layer_desc().key_range.start
|
||||
);
|
||||
assert_eq!(
|
||||
delta_layers[idx - 1].layer_desc().key_range.end,
|
||||
delta_layers[idx].layer_desc().key_range.start
|
||||
);
|
||||
if discard {
|
||||
for layer in image_layers {
|
||||
layer.into_discarded_layer();
|
||||
}
|
||||
for layer in delta_layers {
|
||||
layer.into_discarded_layer();
|
||||
}
|
||||
} else {
|
||||
let image_layers = image_layers
|
||||
.into_iter()
|
||||
.map(|x| x.into_resident_layer())
|
||||
.collect_vec();
|
||||
let delta_layers = delta_layers
|
||||
.into_iter()
|
||||
.map(|x| x.into_resident_layer())
|
||||
.collect_vec();
|
||||
assert_eq!(image_layers.len(), N / 512 + 1);
|
||||
assert_eq!(delta_layers.len(), N / 512 + 1);
|
||||
for idx in 0..image_layers.len() {
|
||||
assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
|
||||
assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
|
||||
assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN);
|
||||
assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX);
|
||||
if idx > 0 {
|
||||
assert_eq!(
|
||||
image_layers[idx - 1].layer_desc().key_range.end,
|
||||
image_layers[idx].layer_desc().key_range.start
|
||||
);
|
||||
assert_eq!(
|
||||
delta_layers[idx - 1].layer_desc().key_range.end,
|
||||
delta_layers[idx].layer_desc().key_range.start
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -451,4 +648,49 @@ mod tests {
|
||||
.unwrap();
|
||||
assert_eq!(layers.len(), 2);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn write_split_single_key() {
|
||||
let harness = TenantHarness::create("split_writer_write_split_single_key")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
const N: usize = 2000;
|
||||
let mut delta_writer = SplitDeltaLayerWriter::new(
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
get_key(0),
|
||||
Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
|
||||
4 * 1024 * 1024,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
for i in 0..N {
|
||||
let i = i as u32;
|
||||
delta_writer
|
||||
.put_value(
|
||||
get_key(0),
|
||||
Lsn(i as u64 * 16 + 0x10),
|
||||
Value::Image(get_large_img()),
|
||||
&tline,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
let delta_layers = delta_writer
|
||||
.finish(&tline, &ctx, get_key(N as u32))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(delta_layers.len(), 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -61,21 +61,12 @@ impl BackgroundLoopKind {
|
||||
}
|
||||
}
|
||||
|
||||
static PERMIT_GAUGES: once_cell::sync::Lazy<
|
||||
enum_map::EnumMap<BackgroundLoopKind, metrics::IntCounterPair>,
|
||||
> = once_cell::sync::Lazy::new(|| {
|
||||
enum_map::EnumMap::from_array(std::array::from_fn(|i| {
|
||||
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
|
||||
crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE.with_label_values(&[kind.into()])
|
||||
}))
|
||||
});
|
||||
|
||||
/// Cancellation safe.
|
||||
pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
|
||||
loop_kind: BackgroundLoopKind,
|
||||
_ctx: &RequestContext,
|
||||
) -> tokio::sync::SemaphorePermit<'static> {
|
||||
let _guard = PERMIT_GAUGES[loop_kind].guard();
|
||||
let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE.measure_acquisition(loop_kind);
|
||||
|
||||
pausable_failpoint!(
|
||||
"initial-size-calculation-permit-pause",
|
||||
@@ -98,7 +89,7 @@ pub fn start_background_loops(
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::Compaction,
|
||||
Some(tenant_shard_id),
|
||||
tenant_shard_id,
|
||||
None,
|
||||
&format!("compactor for tenant {tenant_shard_id}"),
|
||||
{
|
||||
@@ -121,7 +112,7 @@ pub fn start_background_loops(
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::GarbageCollector,
|
||||
Some(tenant_shard_id),
|
||||
tenant_shard_id,
|
||||
None,
|
||||
&format!("garbage collector for tenant {tenant_shard_id}"),
|
||||
{
|
||||
@@ -144,7 +135,7 @@ pub fn start_background_loops(
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::IngestHousekeeping,
|
||||
Some(tenant_shard_id),
|
||||
tenant_shard_id,
|
||||
None,
|
||||
&format!("ingest housekeeping for tenant {tenant_shard_id}"),
|
||||
{
|
||||
|
||||
@@ -22,8 +22,8 @@ use handle::ShardTimelineId;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::{
|
||||
key::{
|
||||
KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
|
||||
NON_INHERITED_SPARSE_RANGE,
|
||||
CompactKey, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
|
||||
NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
|
||||
},
|
||||
keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
|
||||
models::{
|
||||
@@ -44,10 +44,8 @@ use tokio::{
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
fs_ext, pausable_failpoint,
|
||||
sync::gate::{Gate, GateGuard},
|
||||
vec_map::VecMap,
|
||||
};
|
||||
|
||||
use std::pin::pin;
|
||||
@@ -137,7 +135,10 @@ use self::layer_manager::LayerManager;
|
||||
use self::logical_size::LogicalSize;
|
||||
use self::walreceiver::{WalReceiver, WalReceiverConf};
|
||||
|
||||
use super::{config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized};
|
||||
use super::{
|
||||
config::TenantConf, storage_layer::inmemory_layer, storage_layer::LayerVisibilityHint,
|
||||
upload_queue::NotInitialized,
|
||||
};
|
||||
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
|
||||
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
|
||||
use super::{
|
||||
@@ -2233,6 +2234,11 @@ impl Timeline {
|
||||
|
||||
handles: Default::default(),
|
||||
};
|
||||
|
||||
if aux_file_policy == Some(AuxFilePolicy::V1) {
|
||||
warn!("this timeline is using deprecated aux file policy V1");
|
||||
}
|
||||
|
||||
result.repartition_threshold =
|
||||
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
||||
|
||||
@@ -2281,7 +2287,7 @@ impl Timeline {
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
task_mgr::TaskKind::LayerFlushTask,
|
||||
Some(self.tenant_shard_id),
|
||||
self.tenant_shard_id,
|
||||
Some(self.timeline_id),
|
||||
"layer flush task",
|
||||
async move {
|
||||
@@ -2635,7 +2641,7 @@ impl Timeline {
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
task_mgr::TaskKind::InitialLogicalSizeCalculation,
|
||||
Some(self.tenant_shard_id),
|
||||
self.tenant_shard_id,
|
||||
Some(self.timeline_id),
|
||||
"initial size calculation",
|
||||
// NB: don't log errors here, task_mgr will do that.
|
||||
@@ -2803,7 +2809,7 @@ impl Timeline {
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
task_mgr::TaskKind::OndemandLogicalSizeCalculation,
|
||||
Some(self.tenant_shard_id),
|
||||
self.tenant_shard_id,
|
||||
Some(self.timeline_id),
|
||||
"ondemand logical size calculation",
|
||||
async move {
|
||||
@@ -2996,7 +3002,10 @@ impl Timeline {
|
||||
// - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner
|
||||
// the layer is likely to be covered by an image layer during compaction.
|
||||
layers.sort_by_key(|(desc, _meta, _atime)| {
|
||||
std::cmp::Reverse((!LayerMap::is_l0(&desc.key_range), desc.lsn_range.end))
|
||||
std::cmp::Reverse((
|
||||
!LayerMap::is_l0(&desc.key_range, desc.is_delta),
|
||||
desc.lsn_range.end,
|
||||
))
|
||||
});
|
||||
|
||||
let layers = layers
|
||||
@@ -3589,34 +3598,6 @@ impl Timeline {
|
||||
return Err(FlushLayerError::Cancelled);
|
||||
}
|
||||
|
||||
// FIXME(auxfilesv2): support multiple metadata key partitions might need initdb support as well?
|
||||
// This code path will not be hit during regression tests. After #7099 we have a single partition
|
||||
// with two key ranges. If someone wants to fix initdb optimization in the future, this might need
|
||||
// to be fixed.
|
||||
|
||||
// For metadata, always create delta layers.
|
||||
let delta_layer = if !metadata_partition.parts.is_empty() {
|
||||
assert_eq!(
|
||||
metadata_partition.parts.len(),
|
||||
1,
|
||||
"currently sparse keyspace should only contain a single metadata keyspace"
|
||||
);
|
||||
let metadata_keyspace = &metadata_partition.parts[0];
|
||||
self.create_delta_layer(
|
||||
&frozen_layer,
|
||||
Some(
|
||||
metadata_keyspace.0.ranges.first().unwrap().start
|
||||
..metadata_keyspace.0.ranges.last().unwrap().end,
|
||||
),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| FlushLayerError::from_anyhow(self, e))?
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// For image layers, we add them immediately into the layer map.
|
||||
let mut layers_to_upload = Vec::new();
|
||||
layers_to_upload.extend(
|
||||
self.create_image_layers(
|
||||
@@ -3627,13 +3608,27 @@ impl Timeline {
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
|
||||
if let Some(delta_layer) = delta_layer {
|
||||
layers_to_upload.push(delta_layer.clone());
|
||||
(layers_to_upload, Some(delta_layer))
|
||||
} else {
|
||||
(layers_to_upload, None)
|
||||
if !metadata_partition.parts.is_empty() {
|
||||
assert_eq!(
|
||||
metadata_partition.parts.len(),
|
||||
1,
|
||||
"currently sparse keyspace should only contain a single metadata keyspace"
|
||||
);
|
||||
layers_to_upload.extend(
|
||||
self.create_image_layers(
|
||||
// Safety: create_image_layers treat sparse keyspaces differently that it does not scan
|
||||
// every single key within the keyspace, and therefore, it's safe to force converting it
|
||||
// into a dense keyspace before calling this function.
|
||||
&metadata_partition.into_dense(),
|
||||
self.initdb_lsn,
|
||||
ImageLayerCreationMode::Initial,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
}
|
||||
|
||||
(layers_to_upload, None)
|
||||
} else {
|
||||
// Normal case, write out a L0 delta layer file.
|
||||
// `create_delta_layer` will not modify the layer map.
|
||||
@@ -4043,8 +4038,6 @@ impl Timeline {
|
||||
mode: ImageLayerCreationMode,
|
||||
start: Key,
|
||||
) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
|
||||
assert!(!matches!(mode, ImageLayerCreationMode::Initial));
|
||||
|
||||
// Metadata keys image layer creation.
|
||||
let mut reconstruct_state = ValuesReconstructState::default();
|
||||
let data = self
|
||||
@@ -4210,15 +4203,13 @@ impl Timeline {
|
||||
"metadata keys must be partitioned separately"
|
||||
);
|
||||
}
|
||||
if mode == ImageLayerCreationMode::Initial {
|
||||
return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
|
||||
}
|
||||
if mode == ImageLayerCreationMode::Try && !check_for_image_layers {
|
||||
// Skip compaction if there are not enough updates. Metadata compaction will do a scan and
|
||||
// might mess up with evictions.
|
||||
start = img_range.end;
|
||||
continue;
|
||||
}
|
||||
// For initial and force modes, we always generate image layers for metadata keys.
|
||||
} else if let ImageLayerCreationMode::Try = mode {
|
||||
// check_for_image_layers = false -> skip
|
||||
// check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
|
||||
@@ -4226,7 +4217,8 @@ impl Timeline {
|
||||
start = img_range.end;
|
||||
continue;
|
||||
}
|
||||
} else if let ImageLayerCreationMode::Force = mode {
|
||||
}
|
||||
if let ImageLayerCreationMode::Force = mode {
|
||||
// When forced to create image layers, we might try and create them where they already
|
||||
// exist. This mode is only used in tests/debug.
|
||||
let layers = self.layers.read().await;
|
||||
@@ -4240,6 +4232,7 @@ impl Timeline {
|
||||
img_range.start,
|
||||
img_range.end
|
||||
);
|
||||
start = img_range.end;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@@ -4595,7 +4588,7 @@ impl Timeline {
|
||||
// for compact_level0_phase1 creating an L0, which does not happen in practice
|
||||
// because we have not implemented L0 => L0 compaction.
|
||||
duplicated_layers.insert(l.layer_desc().key());
|
||||
} else if LayerMap::is_l0(&l.layer_desc().key_range) {
|
||||
} else if LayerMap::is_l0(&l.layer_desc().key_range, l.layer_desc().is_delta) {
|
||||
return Err(CompactionError::Other(anyhow::anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction.")));
|
||||
} else {
|
||||
insert_layers.push(l.clone());
|
||||
@@ -5162,7 +5155,7 @@ impl Timeline {
|
||||
let task_id = task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
task_mgr::TaskKind::DownloadAllRemoteLayers,
|
||||
Some(self.tenant_shard_id),
|
||||
self.tenant_shard_id,
|
||||
Some(self.timeline_id),
|
||||
"download all remote layers task",
|
||||
async move {
|
||||
@@ -5451,12 +5444,17 @@ impl Timeline {
|
||||
!(a.end <= b.start || b.end <= a.start)
|
||||
}
|
||||
|
||||
let guard = self.layers.read().await;
|
||||
for layer in guard.layer_map()?.iter_historic_layers() {
|
||||
if layer.is_delta()
|
||||
&& overlaps_with(&layer.lsn_range, &deltas.lsn_range)
|
||||
&& layer.lsn_range != deltas.lsn_range
|
||||
{
|
||||
if deltas.key_range.start.next() != deltas.key_range.end {
|
||||
let guard = self.layers.read().await;
|
||||
let mut invalid_layers =
|
||||
guard.layer_map()?.iter_historic_layers().filter(|layer| {
|
||||
layer.is_delta()
|
||||
&& overlaps_with(&layer.lsn_range, &deltas.lsn_range)
|
||||
&& layer.lsn_range != deltas.lsn_range
|
||||
// skip single-key layer files
|
||||
&& layer.key_range.start.next() != layer.key_range.end
|
||||
});
|
||||
if let Some(layer) = invalid_layers.next() {
|
||||
// If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic
|
||||
panic!(
|
||||
"inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}",
|
||||
@@ -5590,44 +5588,6 @@ enum OpenLayerAction {
|
||||
}
|
||||
|
||||
impl<'a> TimelineWriter<'a> {
|
||||
/// Put a new page version that can be constructed from a WAL record
|
||||
///
|
||||
/// This will implicitly extend the relation, if the page is beyond the
|
||||
/// current end-of-file.
|
||||
pub(crate) async fn put(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
value: &Value,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Avoid doing allocations for "small" values.
|
||||
// In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
|
||||
// https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
|
||||
let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
|
||||
value.ser_into(&mut buf)?;
|
||||
let buf_size: u64 = buf.len().try_into().expect("oversized value buf");
|
||||
|
||||
let action = self.get_open_layer_action(lsn, buf_size);
|
||||
let layer = self.handle_open_layer_action(lsn, action, ctx).await?;
|
||||
let res = layer.put_value(key.to_compact(), lsn, &buf, ctx).await;
|
||||
|
||||
if res.is_ok() {
|
||||
// Update the current size only when the entire write was ok.
|
||||
// In case of failures, we may have had partial writes which
|
||||
// render the size tracking out of sync. That's ok because
|
||||
// the checkpoint distance should be significantly smaller
|
||||
// than the S3 single shot upload limit of 5GiB.
|
||||
let state = self.write_guard.as_mut().unwrap();
|
||||
|
||||
state.current_size += buf_size;
|
||||
state.prev_lsn = Some(lsn);
|
||||
state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn));
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
async fn handle_open_layer_action(
|
||||
&mut self,
|
||||
at: Lsn,
|
||||
@@ -5733,18 +5693,58 @@ impl<'a> TimelineWriter<'a> {
|
||||
}
|
||||
|
||||
/// Put a batch of keys at the specified Lsns.
|
||||
///
|
||||
/// The batch is sorted by Lsn (enforced by usage of [`utils::vec_map::VecMap`].
|
||||
pub(crate) async fn put_batch(
|
||||
&mut self,
|
||||
batch: VecMap<Lsn, (Key, Value)>,
|
||||
batch: Vec<(CompactKey, Lsn, usize, Value)>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
for (lsn, (key, val)) in batch {
|
||||
self.put(key, lsn, &val, ctx).await?
|
||||
if batch.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
Ok(())
|
||||
let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch);
|
||||
let batch_max_lsn = serialized_batch.max_lsn;
|
||||
let buf_size: u64 = serialized_batch.raw.len() as u64;
|
||||
|
||||
let action = self.get_open_layer_action(batch_max_lsn, buf_size);
|
||||
let layer = self
|
||||
.handle_open_layer_action(batch_max_lsn, action, ctx)
|
||||
.await?;
|
||||
|
||||
let res = layer.put_batch(serialized_batch, ctx).await;
|
||||
|
||||
if res.is_ok() {
|
||||
// Update the current size only when the entire write was ok.
|
||||
// In case of failures, we may have had partial writes which
|
||||
// render the size tracking out of sync. That's ok because
|
||||
// the checkpoint distance should be significantly smaller
|
||||
// than the S3 single shot upload limit of 5GiB.
|
||||
let state = self.write_guard.as_mut().unwrap();
|
||||
|
||||
state.current_size += buf_size;
|
||||
state.prev_lsn = Some(batch_max_lsn);
|
||||
state.max_lsn = std::cmp::max(state.max_lsn, Some(batch_max_lsn));
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
/// Test helper, for tests that would like to poke individual values without composing a batch
|
||||
pub(crate) async fn put(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
value: &Value,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
use utils::bin_ser::BeSer;
|
||||
let val_ser_size = value.serialized_size().unwrap() as usize;
|
||||
self.put_batch(
|
||||
vec![(key.to_compact(), lsn, val_ser_size, value.clone())],
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn delete_batch(
|
||||
@@ -5885,7 +5885,7 @@ mod tests {
|
||||
};
|
||||
|
||||
// Apart from L0s, newest Layers should come first
|
||||
if !LayerMap::is_l0(layer.name.key_range()) {
|
||||
if !LayerMap::is_l0(layer.name.key_range(), layer.name.is_delta()) {
|
||||
assert!(layer_lsn <= last_lsn);
|
||||
last_lsn = layer_lsn;
|
||||
}
|
||||
|
||||
@@ -14,7 +14,7 @@ use super::{
|
||||
RecordedDuration, Timeline,
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
use bytes::Bytes;
|
||||
use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
@@ -32,6 +32,9 @@ use crate::page_cache;
|
||||
use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD};
|
||||
use crate::tenant::remote_timeline_client::WaitCompletionError;
|
||||
use crate::tenant::storage_layer::merge_iterator::MergeIterator;
|
||||
use crate::tenant::storage_layer::split_writer::{
|
||||
SplitDeltaLayerWriter, SplitImageLayerWriter, SplitWriterResult,
|
||||
};
|
||||
use crate::tenant::storage_layer::{
|
||||
AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState,
|
||||
};
|
||||
@@ -71,15 +74,60 @@ pub(crate) struct KeyHistoryRetention {
|
||||
}
|
||||
|
||||
impl KeyHistoryRetention {
|
||||
/// Hack: skip delta layer if we need to produce a layer of a same key-lsn.
|
||||
///
|
||||
/// This can happen if we have removed some deltas in "the middle" of some existing layer's key-lsn-range.
|
||||
/// For example, consider the case where a single delta with range [0x10,0x50) exists.
|
||||
/// And we have branches at LSN 0x10, 0x20, 0x30.
|
||||
/// Then we delete branch @ 0x20.
|
||||
/// Bottom-most compaction may now delete the delta [0x20,0x30).
|
||||
/// And that wouldnt' change the shape of the layer.
|
||||
///
|
||||
/// Note that bottom-most-gc-compaction never _adds_ new data in that case, only removes.
|
||||
///
|
||||
/// `discard_key` will only be called when the writer reaches its target (instead of for every key), so it's fine to grab a lock inside.
|
||||
async fn discard_key(key: &PersistentLayerKey, tline: &Arc<Timeline>, dry_run: bool) -> bool {
|
||||
if dry_run {
|
||||
return true;
|
||||
}
|
||||
let guard = tline.layers.read().await;
|
||||
if !guard.contains_key(key) {
|
||||
return false;
|
||||
}
|
||||
let layer_generation = guard.get_from_key(key).metadata().generation;
|
||||
drop(guard);
|
||||
if layer_generation == tline.generation {
|
||||
info!(
|
||||
key=%key,
|
||||
?layer_generation,
|
||||
"discard layer due to duplicated layer key in the same generation",
|
||||
);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Pipe a history of a single key to the writers.
|
||||
///
|
||||
/// If `image_writer` is none, the images will be placed into the delta layers.
|
||||
/// The delta writer will contain all images and deltas (below and above the horizon) except the bottom-most images.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn pipe_to(
|
||||
self,
|
||||
key: Key,
|
||||
delta_writer: &mut Vec<(Key, Lsn, Value)>,
|
||||
mut image_writer: Option<&mut ImageLayerWriter>,
|
||||
tline: &Arc<Timeline>,
|
||||
delta_writer: &mut SplitDeltaLayerWriter,
|
||||
mut image_writer: Option<&mut SplitImageLayerWriter>,
|
||||
stat: &mut CompactionStatistics,
|
||||
dry_run: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut first_batch = true;
|
||||
let discard = |key: &PersistentLayerKey| {
|
||||
let key = key.clone();
|
||||
async move { Self::discard_key(&key, tline, dry_run).await }
|
||||
};
|
||||
for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon {
|
||||
if first_batch {
|
||||
if logs.len() == 1 && logs[0].1.is_image() {
|
||||
@@ -88,28 +136,45 @@ impl KeyHistoryRetention {
|
||||
};
|
||||
stat.produce_image_key(img);
|
||||
if let Some(image_writer) = image_writer.as_mut() {
|
||||
image_writer.put_image(key, img.clone(), ctx).await?;
|
||||
image_writer
|
||||
.put_image_with_discard_fn(key, img.clone(), tline, ctx, discard)
|
||||
.await?;
|
||||
} else {
|
||||
delta_writer.push((key, cutoff_lsn, Value::Image(img.clone())));
|
||||
delta_writer
|
||||
.put_value_with_discard_fn(
|
||||
key,
|
||||
cutoff_lsn,
|
||||
Value::Image(img.clone()),
|
||||
tline,
|
||||
ctx,
|
||||
discard,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
} else {
|
||||
for (lsn, val) in logs {
|
||||
stat.produce_key(&val);
|
||||
delta_writer.push((key, lsn, val));
|
||||
delta_writer
|
||||
.put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
first_batch = false;
|
||||
} else {
|
||||
for (lsn, val) in logs {
|
||||
stat.produce_key(&val);
|
||||
delta_writer.push((key, lsn, val));
|
||||
delta_writer
|
||||
.put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
|
||||
for (lsn, val) in above_horizon_logs {
|
||||
stat.produce_key(&val);
|
||||
delta_writer.push((key, lsn, val));
|
||||
delta_writer
|
||||
.put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
|
||||
.await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -1814,11 +1879,27 @@ impl Timeline {
|
||||
}
|
||||
let mut selected_layers = Vec::new();
|
||||
drop(gc_info);
|
||||
// Pick all the layers intersect or below the gc_cutoff, get the largest LSN in the selected layers.
|
||||
let Some(max_layer_lsn) = layers
|
||||
.iter_historic_layers()
|
||||
.filter(|desc| desc.get_lsn_range().start <= gc_cutoff)
|
||||
.map(|desc| desc.get_lsn_range().end)
|
||||
.max()
|
||||
else {
|
||||
info!("no layers to compact with gc");
|
||||
return Ok(());
|
||||
};
|
||||
// Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
|
||||
// layers to compact.
|
||||
for desc in layers.iter_historic_layers() {
|
||||
if desc.get_lsn_range().start <= gc_cutoff {
|
||||
if desc.get_lsn_range().end <= max_layer_lsn {
|
||||
selected_layers.push(guard.get_from_desc(&desc));
|
||||
}
|
||||
}
|
||||
if selected_layers.is_empty() {
|
||||
info!("no layers to compact with gc");
|
||||
return Ok(());
|
||||
}
|
||||
retain_lsns_below_horizon.sort();
|
||||
(selected_layers, gc_cutoff, retain_lsns_below_horizon)
|
||||
};
|
||||
@@ -1848,27 +1929,53 @@ impl Timeline {
|
||||
lowest_retain_lsn
|
||||
);
|
||||
// Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
|
||||
// Also, collect the layer information to decide when to split the new delta layers.
|
||||
let mut downloaded_layers = Vec::new();
|
||||
let mut delta_split_points = BTreeSet::new();
|
||||
// Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
|
||||
let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
|
||||
for layer in &layer_selection {
|
||||
let resident_layer = layer.download_and_keep_resident().await?;
|
||||
downloaded_layers.push(resident_layer);
|
||||
|
||||
let desc = layer.layer_desc();
|
||||
if desc.is_delta() {
|
||||
// TODO: is it correct to only record split points for deltas intersecting with the GC horizon? (exclude those below/above the horizon)
|
||||
// so that we can avoid having too many small delta layers.
|
||||
let key_range = desc.get_key_range();
|
||||
delta_split_points.insert(key_range.start);
|
||||
delta_split_points.insert(key_range.end);
|
||||
// ignore single-key layer files
|
||||
if desc.key_range.start.next() != desc.key_range.end {
|
||||
let lsn_range = &desc.lsn_range;
|
||||
lsn_split_point.insert(lsn_range.start);
|
||||
lsn_split_point.insert(lsn_range.end);
|
||||
}
|
||||
stat.visit_delta_layer(desc.file_size());
|
||||
} else {
|
||||
stat.visit_image_layer(desc.file_size());
|
||||
}
|
||||
}
|
||||
for layer in &layer_selection {
|
||||
let desc = layer.layer_desc();
|
||||
let key_range = &desc.key_range;
|
||||
if desc.is_delta() && key_range.start.next() != key_range.end {
|
||||
let lsn_range = desc.lsn_range.clone();
|
||||
let intersects = lsn_split_point.range(lsn_range).collect_vec();
|
||||
if intersects.len() > 1 {
|
||||
bail!(
|
||||
"cannot run gc-compaction because it violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
|
||||
desc.key(),
|
||||
intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
// The maximum LSN we are processing in this compaction loop
|
||||
let end_lsn = layer_selection
|
||||
.iter()
|
||||
.map(|l| l.layer_desc().lsn_range.end)
|
||||
.max()
|
||||
.unwrap();
|
||||
// We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized
|
||||
// as an L0 layer.
|
||||
let hack_end_key = Key::NON_L0_MAX;
|
||||
let mut delta_layers = Vec::new();
|
||||
let mut image_layers = Vec::new();
|
||||
let mut downloaded_layers = Vec::new();
|
||||
for layer in &layer_selection {
|
||||
let resident_layer = layer.download_and_keep_resident().await?;
|
||||
downloaded_layers.push(resident_layer);
|
||||
}
|
||||
for resident_layer in &downloaded_layers {
|
||||
if resident_layer.layer_desc().is_delta() {
|
||||
let layer = resident_layer.get_as_delta(ctx).await?;
|
||||
@@ -1884,138 +1991,17 @@ impl Timeline {
|
||||
let mut accumulated_values = Vec::new();
|
||||
let mut last_key: Option<Key> = None;
|
||||
|
||||
enum FlushDeltaResult {
|
||||
/// Create a new resident layer
|
||||
CreateResidentLayer(ResidentLayer),
|
||||
/// Keep an original delta layer
|
||||
KeepLayer(PersistentLayerKey),
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn flush_deltas(
|
||||
deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>,
|
||||
last_key: Key,
|
||||
delta_split_points: &[Key],
|
||||
current_delta_split_point: &mut usize,
|
||||
tline: &Arc<Timeline>,
|
||||
lowest_retain_lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
stats: &mut CompactionStatistics,
|
||||
dry_run: bool,
|
||||
last_batch: bool,
|
||||
) -> anyhow::Result<Option<FlushDeltaResult>> {
|
||||
// Check if we need to split the delta layer. We split at the original delta layer boundary to avoid
|
||||
// overlapping layers.
|
||||
//
|
||||
// If we have a structure like this:
|
||||
//
|
||||
// | Delta 1 | | Delta 4 |
|
||||
// |---------| Delta 2 |---------|
|
||||
// | Delta 3 | | Delta 5 |
|
||||
//
|
||||
// And we choose to compact delta 2+3+5. We will get an overlapping delta layer with delta 1+4.
|
||||
// A simple solution here is to split the delta layers using the original boundary, while this
|
||||
// might produce a lot of small layers. This should be improved and fixed in the future.
|
||||
let mut need_split = false;
|
||||
while *current_delta_split_point < delta_split_points.len()
|
||||
&& last_key >= delta_split_points[*current_delta_split_point]
|
||||
{
|
||||
*current_delta_split_point += 1;
|
||||
need_split = true;
|
||||
}
|
||||
if !need_split && !last_batch {
|
||||
return Ok(None);
|
||||
}
|
||||
let deltas: Vec<(Key, Lsn, Value)> = std::mem::take(deltas);
|
||||
if deltas.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
let end_lsn = deltas.iter().map(|(_, lsn, _)| lsn).max().copied().unwrap() + 1;
|
||||
let delta_key = PersistentLayerKey {
|
||||
key_range: {
|
||||
let key_start = deltas.first().unwrap().0;
|
||||
let key_end = deltas.last().unwrap().0.next();
|
||||
key_start..key_end
|
||||
},
|
||||
lsn_range: lowest_retain_lsn..end_lsn,
|
||||
is_delta: true,
|
||||
};
|
||||
{
|
||||
// Hack: skip delta layer if we need to produce a layer of a same key-lsn.
|
||||
//
|
||||
// This can happen if we have removed some deltas in "the middle" of some existing layer's key-lsn-range.
|
||||
// For example, consider the case where a single delta with range [0x10,0x50) exists.
|
||||
// And we have branches at LSN 0x10, 0x20, 0x30.
|
||||
// Then we delete branch @ 0x20.
|
||||
// Bottom-most compaction may now delete the delta [0x20,0x30).
|
||||
// And that wouldnt' change the shape of the layer.
|
||||
//
|
||||
// Note that bottom-most-gc-compaction never _adds_ new data in that case, only removes.
|
||||
// That's why it's safe to skip.
|
||||
let guard = tline.layers.read().await;
|
||||
|
||||
if guard.contains_key(&delta_key) {
|
||||
let layer_generation = guard.get_from_key(&delta_key).metadata().generation;
|
||||
drop(guard);
|
||||
if layer_generation == tline.generation {
|
||||
stats.discard_delta_layer();
|
||||
// TODO: depending on whether we design this compaction process to run along with
|
||||
// other compactions, there could be layer map modifications after we drop the
|
||||
// layer guard, and in case it creates duplicated layer key, we will still error
|
||||
// in the end.
|
||||
info!(
|
||||
key=%delta_key,
|
||||
?layer_generation,
|
||||
"discard delta layer due to duplicated layer in the same generation"
|
||||
);
|
||||
return Ok(Some(FlushDeltaResult::KeepLayer(delta_key)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut delta_layer_writer = DeltaLayerWriter::new(
|
||||
tline.conf,
|
||||
tline.timeline_id,
|
||||
tline.tenant_shard_id,
|
||||
delta_key.key_range.start,
|
||||
lowest_retain_lsn..end_lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
for (key, lsn, val) in deltas {
|
||||
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
|
||||
}
|
||||
|
||||
stats.produce_delta_layer(delta_layer_writer.size());
|
||||
if dry_run {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let (desc, path) = delta_layer_writer
|
||||
.finish(delta_key.key_range.end, ctx)
|
||||
.await?;
|
||||
let delta_layer = Layer::finish_creating(tline.conf, tline, desc, &path)?;
|
||||
Ok(Some(FlushDeltaResult::CreateResidentLayer(delta_layer)))
|
||||
}
|
||||
|
||||
// Hack the key range to be min..(max-1). Otherwise, the image layer will be
|
||||
// interpreted as an L0 delta layer.
|
||||
let hack_image_layer_range = {
|
||||
let mut end_key = Key::MAX;
|
||||
end_key.field6 -= 1;
|
||||
Key::MIN..end_key
|
||||
};
|
||||
|
||||
// Only create image layers when there is no ancestor branches. TODO: create covering image layer
|
||||
// when some condition meet.
|
||||
let mut image_layer_writer = if self.ancestor_timeline.is_none() {
|
||||
Some(
|
||||
ImageLayerWriter::new(
|
||||
SplitImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
&hack_image_layer_range, // covers the full key range
|
||||
Key::MIN,
|
||||
lowest_retain_lsn,
|
||||
self.get_compaction_target_size(),
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
@@ -2024,6 +2010,17 @@ impl Timeline {
|
||||
None
|
||||
};
|
||||
|
||||
let mut delta_layer_writer = SplitDeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
Key::MIN,
|
||||
lowest_retain_lsn..end_lsn,
|
||||
self.get_compaction_target_size(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
/// Returns None if there is no ancestor branch. Throw an error when the key is not found.
|
||||
///
|
||||
/// Currently, we always get the ancestor image for each key in the child branch no matter whether the image
|
||||
@@ -2044,47 +2041,11 @@ impl Timeline {
|
||||
let img = tline.get(key, tline.ancestor_lsn, ctx).await?;
|
||||
Ok(Some((key, tline.ancestor_lsn, img)))
|
||||
}
|
||||
let image_layer_key = PersistentLayerKey {
|
||||
key_range: hack_image_layer_range,
|
||||
lsn_range: PersistentLayerDesc::image_layer_lsn_range(lowest_retain_lsn),
|
||||
is_delta: false,
|
||||
};
|
||||
|
||||
// Like with delta layers, it can happen that we re-produce an already existing image layer.
|
||||
// This could happen when a user triggers force compaction and image generation. In this case,
|
||||
// it's always safe to rewrite the layer.
|
||||
let discard_image_layer = {
|
||||
let guard = self.layers.read().await;
|
||||
if guard.contains_key(&image_layer_key) {
|
||||
let layer_generation = guard.get_from_key(&image_layer_key).metadata().generation;
|
||||
drop(guard);
|
||||
if layer_generation == self.generation {
|
||||
// TODO: depending on whether we design this compaction process to run along with
|
||||
// other compactions, there could be layer map modifications after we drop the
|
||||
// layer guard, and in case it creates duplicated layer key, we will still error
|
||||
// in the end.
|
||||
info!(
|
||||
key=%image_layer_key,
|
||||
?layer_generation,
|
||||
"discard image layer due to duplicated layer key in the same generation",
|
||||
);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
} else {
|
||||
false
|
||||
}
|
||||
};
|
||||
|
||||
// Actually, we can decide not to write to the image layer at all at this point because
|
||||
// the key and LSN range are determined. However, to keep things simple here, we still
|
||||
// create this writer, and discard the writer in the end.
|
||||
|
||||
let mut delta_values = Vec::new();
|
||||
let delta_split_points = delta_split_points.into_iter().collect_vec();
|
||||
let mut current_delta_split_point = 0;
|
||||
let mut delta_layers = Vec::new();
|
||||
while let Some((key, lsn, val)) = merge_iter.next().await? {
|
||||
if cancel.is_cancelled() {
|
||||
return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
|
||||
@@ -2115,27 +2076,14 @@ impl Timeline {
|
||||
retention
|
||||
.pipe_to(
|
||||
*last_key,
|
||||
&mut delta_values,
|
||||
self,
|
||||
&mut delta_layer_writer,
|
||||
image_layer_writer.as_mut(),
|
||||
&mut stat,
|
||||
dry_run,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
delta_layers.extend(
|
||||
flush_deltas(
|
||||
&mut delta_values,
|
||||
*last_key,
|
||||
&delta_split_points,
|
||||
&mut current_delta_split_point,
|
||||
self,
|
||||
lowest_retain_lsn,
|
||||
ctx,
|
||||
&mut stat,
|
||||
dry_run,
|
||||
false,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
accumulated_values.clear();
|
||||
*last_key = key;
|
||||
accumulated_values.push((key, lsn, val));
|
||||
@@ -2159,43 +2107,75 @@ impl Timeline {
|
||||
retention
|
||||
.pipe_to(
|
||||
last_key,
|
||||
&mut delta_values,
|
||||
self,
|
||||
&mut delta_layer_writer,
|
||||
image_layer_writer.as_mut(),
|
||||
&mut stat,
|
||||
dry_run,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
delta_layers.extend(
|
||||
flush_deltas(
|
||||
&mut delta_values,
|
||||
last_key,
|
||||
&delta_split_points,
|
||||
&mut current_delta_split_point,
|
||||
self,
|
||||
lowest_retain_lsn,
|
||||
ctx,
|
||||
&mut stat,
|
||||
dry_run,
|
||||
true,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
assert!(delta_values.is_empty(), "unprocessed keys");
|
||||
|
||||
let image_layer = if discard_image_layer {
|
||||
stat.discard_image_layer();
|
||||
None
|
||||
} else if let Some(writer) = image_layer_writer {
|
||||
stat.produce_image_layer(writer.size());
|
||||
let discard = |key: &PersistentLayerKey| {
|
||||
let key = key.clone();
|
||||
async move { KeyHistoryRetention::discard_key(&key, self, dry_run).await }
|
||||
};
|
||||
|
||||
let produced_image_layers = if let Some(writer) = image_layer_writer {
|
||||
if !dry_run {
|
||||
Some(writer.finish(self, ctx).await?)
|
||||
writer
|
||||
.finish_with_discard_fn(self, ctx, hack_end_key, discard)
|
||||
.await?
|
||||
} else {
|
||||
None
|
||||
let (layers, _) = writer.take()?;
|
||||
assert!(layers.is_empty(), "image layers produced in dry run mode?");
|
||||
Vec::new()
|
||||
}
|
||||
} else {
|
||||
None
|
||||
Vec::new()
|
||||
};
|
||||
|
||||
let produced_delta_layers = if !dry_run {
|
||||
delta_layer_writer
|
||||
.finish_with_discard_fn(self, ctx, hack_end_key, discard)
|
||||
.await?
|
||||
} else {
|
||||
let (layers, _) = delta_layer_writer.take()?;
|
||||
assert!(layers.is_empty(), "delta layers produced in dry run mode?");
|
||||
Vec::new()
|
||||
};
|
||||
|
||||
let mut compact_to = Vec::new();
|
||||
let mut keep_layers = HashSet::new();
|
||||
let produced_delta_layers_len = produced_delta_layers.len();
|
||||
let produced_image_layers_len = produced_image_layers.len();
|
||||
for action in produced_delta_layers {
|
||||
match action {
|
||||
SplitWriterResult::Produced(layer) => {
|
||||
stat.produce_delta_layer(layer.layer_desc().file_size());
|
||||
compact_to.push(layer);
|
||||
}
|
||||
SplitWriterResult::Discarded(l) => {
|
||||
keep_layers.insert(l);
|
||||
stat.discard_delta_layer();
|
||||
}
|
||||
}
|
||||
}
|
||||
for action in produced_image_layers {
|
||||
match action {
|
||||
SplitWriterResult::Produced(layer) => {
|
||||
stat.produce_image_layer(layer.layer_desc().file_size());
|
||||
compact_to.push(layer);
|
||||
}
|
||||
SplitWriterResult::Discarded(l) => {
|
||||
keep_layers.insert(l);
|
||||
stat.discard_image_layer();
|
||||
}
|
||||
}
|
||||
}
|
||||
let mut layer_selection = layer_selection;
|
||||
layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
|
||||
|
||||
info!(
|
||||
"gc-compaction statistics: {}",
|
||||
serde_json::to_string(&stat)?
|
||||
@@ -2206,28 +2186,11 @@ impl Timeline {
|
||||
}
|
||||
|
||||
info!(
|
||||
"produced {} delta layers and {} image layers",
|
||||
delta_layers.len(),
|
||||
if image_layer.is_some() { 1 } else { 0 }
|
||||
"produced {} delta layers and {} image layers, {} layers are kept",
|
||||
produced_delta_layers_len,
|
||||
produced_image_layers_len,
|
||||
layer_selection.len()
|
||||
);
|
||||
let mut compact_to = Vec::new();
|
||||
let mut keep_layers = HashSet::new();
|
||||
for action in delta_layers {
|
||||
match action {
|
||||
FlushDeltaResult::CreateResidentLayer(layer) => {
|
||||
compact_to.push(layer);
|
||||
}
|
||||
FlushDeltaResult::KeepLayer(l) => {
|
||||
keep_layers.insert(l);
|
||||
}
|
||||
}
|
||||
}
|
||||
if discard_image_layer {
|
||||
keep_layers.insert(image_layer_key);
|
||||
}
|
||||
let mut layer_selection = layer_selection;
|
||||
layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
|
||||
compact_to.extend(image_layer);
|
||||
|
||||
// Step 3: Place back to the layer map.
|
||||
{
|
||||
|
||||
@@ -395,7 +395,7 @@ impl DeleteTimelineFlow {
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::TimelineDeletionWorker,
|
||||
Some(tenant_shard_id),
|
||||
tenant_shard_id,
|
||||
Some(timeline_id),
|
||||
"timeline_delete",
|
||||
async move {
|
||||
|
||||
@@ -60,7 +60,7 @@ impl Timeline {
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::Eviction,
|
||||
Some(self.tenant_shard_id),
|
||||
self.tenant_shard_id,
|
||||
Some(self.timeline_id),
|
||||
&format!(
|
||||
"layer eviction for {}/{}",
|
||||
|
||||
@@ -27,8 +27,8 @@ use super::TaskStateUpdate;
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
|
||||
task_mgr::TaskKind,
|
||||
task_mgr::WALRECEIVER_RUNTIME,
|
||||
pgdatadir_mapping::DatadirModification,
|
||||
task_mgr::{TaskKind, WALRECEIVER_RUNTIME},
|
||||
tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
|
||||
walingest::WalIngest,
|
||||
walrecord::DecodedWALRecord,
|
||||
@@ -345,7 +345,10 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
// Commit every ingest_batch_size records. Even if we filtered out
|
||||
// all records, we still need to call commit to advance the LSN.
|
||||
uncommitted_records += 1;
|
||||
if uncommitted_records >= ingest_batch_size {
|
||||
if uncommitted_records >= ingest_batch_size
|
||||
|| modification.approx_pending_bytes()
|
||||
> DatadirModification::MAX_PENDING_BYTES
|
||||
{
|
||||
WAL_INGEST
|
||||
.records_committed
|
||||
.inc_by(uncommitted_records - filtered_records);
|
||||
|
||||
@@ -9,7 +9,7 @@ use utils::serde_percent::Percent;
|
||||
|
||||
use pageserver_api::models::PageserverUtilization;
|
||||
|
||||
use crate::{config::PageServerConf, tenant::mgr::TenantManager};
|
||||
use crate::{config::PageServerConf, metrics::NODE_UTILIZATION_SCORE, tenant::mgr::TenantManager};
|
||||
|
||||
pub(crate) fn regenerate(
|
||||
conf: &PageServerConf,
|
||||
@@ -58,13 +58,13 @@ pub(crate) fn regenerate(
|
||||
disk_usable_pct,
|
||||
shard_count,
|
||||
max_shard_count: MAX_SHARDS,
|
||||
utilization_score: 0,
|
||||
utilization_score: None,
|
||||
captured_at: utils::serde_system_time::SystemTime(captured_at),
|
||||
};
|
||||
|
||||
doc.refresh_score();
|
||||
|
||||
// TODO: make utilization_score into a metric
|
||||
// Initialize `PageserverUtilization::utilization_score`
|
||||
let score = doc.cached_score();
|
||||
NODE_UTILIZATION_SCORE.set(score);
|
||||
|
||||
Ok(doc)
|
||||
}
|
||||
|
||||
@@ -756,11 +756,23 @@ impl VirtualFile {
|
||||
})
|
||||
}
|
||||
|
||||
/// The function aborts the process if the error is fatal.
|
||||
async fn write_at<B: IoBuf + Send>(
|
||||
&self,
|
||||
buf: FullSlice<B>,
|
||||
offset: u64,
|
||||
_ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
|
||||
) -> (FullSlice<B>, Result<usize, Error>) {
|
||||
let (slice, result) = self.write_at_inner(buf, offset, _ctx).await;
|
||||
let result = result.maybe_fatal_err("write_at");
|
||||
(slice, result)
|
||||
}
|
||||
|
||||
async fn write_at_inner<B: IoBuf + Send>(
|
||||
&self,
|
||||
buf: FullSlice<B>,
|
||||
offset: u64,
|
||||
_ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
|
||||
) -> (FullSlice<B>, Result<usize, Error>) {
|
||||
let file_guard = match self.lock_file().await {
|
||||
Ok(file_guard) => file_guard,
|
||||
|
||||
@@ -1,13 +1,7 @@
|
||||
commit f7925d4d1406c0f0229e3c691c94b69e381899b1 (HEAD -> master)
|
||||
Author: Alexey Masterov <alexeymasterov@neon.tech>
|
||||
Date: Thu Jun 6 08:02:42 2024 +0000
|
||||
|
||||
Patch expected files to consider Neon's log messages
|
||||
|
||||
diff --git a/ext-src/pg_hint_plan-src/expected/ut-A.out b/ext-src/pg_hint_plan-src/expected/ut-A.out
|
||||
index da723b8..f8d0102 100644
|
||||
--- a/ext-src/pg_hint_plan-src/expected/ut-A.out
|
||||
+++ b/ext-src/pg_hint_plan-src/expected/ut-A.out
|
||||
diff --git a/expected/ut-A.out b/expected/ut-A.out
|
||||
index da723b8..5328114 100644
|
||||
--- a/expected/ut-A.out
|
||||
+++ b/expected/ut-A.out
|
||||
@@ -9,13 +9,16 @@ SET search_path TO public;
|
||||
----
|
||||
-- No.A-1-1-3
|
||||
@@ -25,10 +19,18 @@ index da723b8..f8d0102 100644
|
||||
DROP SCHEMA other_schema;
|
||||
----
|
||||
---- No. A-5-1 comment pattern
|
||||
diff --git a/ext-src/pg_hint_plan-src/expected/ut-fdw.out b/ext-src/pg_hint_plan-src/expected/ut-fdw.out
|
||||
@@ -3175,6 +3178,7 @@ SELECT s.query, s.calls
|
||||
FROM public.pg_stat_statements s
|
||||
JOIN pg_catalog.pg_database d
|
||||
ON (s.dbid = d.oid)
|
||||
+ WHERE s.query LIKE 'SELECT * FROM s1.t1%' OR s.query LIKE '%pg_stat_statements_reset%'
|
||||
ORDER BY 1;
|
||||
query | calls
|
||||
--------------------------------------+-------
|
||||
diff --git a/expected/ut-fdw.out b/expected/ut-fdw.out
|
||||
index d372459..6282afe 100644
|
||||
--- a/ext-src/pg_hint_plan-src/expected/ut-fdw.out
|
||||
+++ b/ext-src/pg_hint_plan-src/expected/ut-fdw.out
|
||||
--- a/expected/ut-fdw.out
|
||||
+++ b/expected/ut-fdw.out
|
||||
@@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on;
|
||||
SET client_min_messages TO LOG;
|
||||
SET pg_hint_plan.enable_hint TO on;
|
||||
@@ -37,3 +39,15 @@ index d372459..6282afe 100644
|
||||
CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw;
|
||||
CREATE USER MAPPING FOR PUBLIC SERVER file_server;
|
||||
CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename');
|
||||
diff --git a/sql/ut-A.sql b/sql/ut-A.sql
|
||||
index 7c7d58a..4fd1a07 100644
|
||||
--- a/sql/ut-A.sql
|
||||
+++ b/sql/ut-A.sql
|
||||
@@ -963,6 +963,7 @@ SELECT s.query, s.calls
|
||||
FROM public.pg_stat_statements s
|
||||
JOIN pg_catalog.pg_database d
|
||||
ON (s.dbid = d.oid)
|
||||
+ WHERE s.query LIKE 'SELECT * FROM s1.t1%' OR s.query LIKE '%pg_stat_statements_reset%'
|
||||
ORDER BY 1;
|
||||
|
||||
----
|
||||
@@ -41,6 +41,8 @@
|
||||
|
||||
#include "hll.h"
|
||||
|
||||
#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)
|
||||
|
||||
/*
|
||||
* Local file cache is used to temporary store relations pages in local file system.
|
||||
* All blocks of all relations are stored inside one file and addressed using shared hash map.
|
||||
@@ -51,19 +53,43 @@
|
||||
*
|
||||
* Cache is always reconstructed at node startup, so we do not need to save mapping somewhere and worry about
|
||||
* its consistency.
|
||||
|
||||
*
|
||||
* ## Holes
|
||||
*
|
||||
* The LFC can be resized on the fly, up to a maximum size that's determined
|
||||
* at server startup (neon.max_file_cache_size). After server startup, we
|
||||
* expand the underlying file when needed, until it reaches the soft limit
|
||||
* (neon.file_cache_size_limit). If the soft limit is later reduced, we shrink
|
||||
* the LFC by punching holes in the underlying file with a
|
||||
* fallocate(FALLOC_FL_PUNCH_HOLE) call. The nominal size of the file doesn't
|
||||
* shrink, but the disk space it uses does.
|
||||
*
|
||||
* Each hole is tracked by a dummy FileCacheEntry, which are kept in the
|
||||
* 'holes' linked list. They are entered into the chunk hash table, with a
|
||||
* special key where the blockNumber is used to store the 'offset' of the
|
||||
* hole, and all other fields are zero. Holes are never looked up in the hash
|
||||
* table, we only enter them there to have a FileCacheEntry that we can keep
|
||||
* in the linked list. If the soft limit is raised again, we reuse the holes
|
||||
* before extending the nominal size of the file.
|
||||
*/
|
||||
|
||||
/* Local file storage allocation chunk.
|
||||
* Should be power of two and not less than 32. Using larger than page chunks can
|
||||
* Should be power of two. Using larger than page chunks can
|
||||
* 1. Reduce hash-map memory footprint: 8TB database contains billion pages
|
||||
* and size of hash entry is 40 bytes, so we need 40Gb just for hash map.
|
||||
* 1Mb chunks can reduce hash map size to 320Mb.
|
||||
* 2. Improve access locality, subsequent pages will be allocated together improving seqscan speed
|
||||
*/
|
||||
#define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */
|
||||
/*
|
||||
* Smaller chunk seems to be better for OLTP workload
|
||||
*/
|
||||
// #define BLOCKS_PER_CHUNK 8 /* 64kb chunk */
|
||||
#define MB ((uint64)1024*1024)
|
||||
|
||||
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
|
||||
#define CHUNK_BITMAP_SIZE ((BLOCKS_PER_CHUNK + 31) / 32)
|
||||
|
||||
typedef struct FileCacheEntry
|
||||
{
|
||||
@@ -71,8 +97,8 @@ typedef struct FileCacheEntry
|
||||
uint32 hash;
|
||||
uint32 offset;
|
||||
uint32 access_count;
|
||||
uint32 bitmap[BLOCKS_PER_CHUNK / 32];
|
||||
dlist_node lru_node; /* LRU list node */
|
||||
uint32 bitmap[CHUNK_BITMAP_SIZE];
|
||||
dlist_node list_node; /* LRU/holes list node */
|
||||
} FileCacheEntry;
|
||||
|
||||
typedef struct FileCacheControl
|
||||
@@ -87,6 +113,7 @@ typedef struct FileCacheControl
|
||||
uint64 writes;
|
||||
dlist_head lru; /* double linked list for LRU replacement
|
||||
* algorithm */
|
||||
dlist_head holes; /* double linked list of punched holes */
|
||||
HyperLogLogState wss_estimation; /* estimation of working set size */
|
||||
} FileCacheControl;
|
||||
|
||||
@@ -135,6 +162,7 @@ lfc_disable(char const *op)
|
||||
lfc_ctl->used = 0;
|
||||
lfc_ctl->limit = 0;
|
||||
dlist_init(&lfc_ctl->lru);
|
||||
dlist_init(&lfc_ctl->holes);
|
||||
|
||||
if (lfc_desc > 0)
|
||||
{
|
||||
@@ -214,18 +242,18 @@ lfc_shmem_startup(void)
|
||||
if (!found)
|
||||
{
|
||||
int fd;
|
||||
uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
|
||||
uint32 n_chunks = SIZE_MB_TO_CHUNKS(lfc_max_size);
|
||||
|
||||
lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock");
|
||||
info.keysize = sizeof(BufferTag);
|
||||
info.entrysize = sizeof(FileCacheEntry);
|
||||
|
||||
/*
|
||||
* lfc_size+1 because we add new element to hash table before eviction
|
||||
* n_chunks+1 because we add new element to hash table before eviction
|
||||
* of victim
|
||||
*/
|
||||
lfc_hash = ShmemInitHash("lfc_hash",
|
||||
lfc_size + 1, lfc_size + 1,
|
||||
n_chunks + 1, n_chunks + 1,
|
||||
&info,
|
||||
HASH_ELEM | HASH_BLOBS);
|
||||
lfc_ctl->generation = 0;
|
||||
@@ -235,6 +263,7 @@ lfc_shmem_startup(void)
|
||||
lfc_ctl->misses = 0;
|
||||
lfc_ctl->writes = 0;
|
||||
dlist_init(&lfc_ctl->lru);
|
||||
dlist_init(&lfc_ctl->holes);
|
||||
|
||||
/* Initialize hyper-log-log structure for estimating working set size */
|
||||
initSHLL(&lfc_ctl->wss_estimation);
|
||||
@@ -310,14 +339,31 @@ lfc_change_limit_hook(int newval, void *extra)
|
||||
* Shrink cache by throwing away least recently accessed chunks and
|
||||
* returning their space to file system
|
||||
*/
|
||||
FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
|
||||
FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
|
||||
FileCacheEntry *hole;
|
||||
uint32 offset = victim->offset;
|
||||
uint32 hash;
|
||||
bool found;
|
||||
BufferTag holetag;
|
||||
|
||||
Assert(victim->access_count == 0);
|
||||
CriticalAssert(victim->access_count == 0);
|
||||
#ifdef FALLOC_FL_PUNCH_HOLE
|
||||
if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
|
||||
neon_log(LOG, "Failed to punch hole in file: %m");
|
||||
#endif
|
||||
/* We remove the old entry, and re-enter a hole to the hash table */
|
||||
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
|
||||
|
||||
memset(&holetag, 0, sizeof(holetag));
|
||||
holetag.blockNum = offset;
|
||||
hash = get_hash_value(lfc_hash, &holetag);
|
||||
hole = hash_search_with_hash_value(lfc_hash, &holetag, hash, HASH_ENTER, &found);
|
||||
hole->hash = hash;
|
||||
hole->offset = offset;
|
||||
hole->access_count = 0;
|
||||
CriticalAssert(!found);
|
||||
dlist_push_tail(&lfc_ctl->holes, &hole->list_node);
|
||||
|
||||
lfc_ctl->used -= 1;
|
||||
}
|
||||
lfc_ctl->limit = new_size;
|
||||
@@ -409,6 +455,8 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
|
||||
CopyNRelFileInfoToBufTag(tag, rinfo);
|
||||
tag.forkNum = forkNum;
|
||||
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
|
||||
|
||||
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
|
||||
hash = get_hash_value(lfc_hash, &tag);
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_SHARED);
|
||||
@@ -440,6 +488,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
|
||||
tag.forkNum = forkNum;
|
||||
tag.blockNum = (blkno & ~(BLOCKS_PER_CHUNK - 1));
|
||||
|
||||
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
|
||||
hash = get_hash_value(lfc_hash, &tag);
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||
@@ -470,7 +519,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
|
||||
{
|
||||
bool has_remaining_pages;
|
||||
|
||||
for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++)
|
||||
for (int i = 0; i < CHUNK_BITMAP_SIZE; i++)
|
||||
{
|
||||
if (entry->bitmap[i] != 0)
|
||||
{
|
||||
@@ -485,8 +534,8 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
|
||||
*/
|
||||
if (!has_remaining_pages)
|
||||
{
|
||||
dlist_delete(&entry->lru_node);
|
||||
dlist_push_head(&lfc_ctl->lru, &entry->lru_node);
|
||||
dlist_delete(&entry->list_node);
|
||||
dlist_push_head(&lfc_ctl->lru, &entry->list_node);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -525,6 +574,8 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
CopyNRelFileInfoToBufTag(tag, rinfo);
|
||||
tag.forkNum = forkNum;
|
||||
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
|
||||
|
||||
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
|
||||
hash = get_hash_value(lfc_hash, &tag);
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||
@@ -551,7 +602,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
}
|
||||
/* Unlink entry from LRU list to pin it for the duration of IO operation */
|
||||
if (entry->access_count++ == 0)
|
||||
dlist_delete(&entry->lru_node);
|
||||
dlist_delete(&entry->list_node);
|
||||
generation = lfc_ctl->generation;
|
||||
entry_offset = entry->offset;
|
||||
|
||||
@@ -569,12 +620,12 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
|
||||
if (lfc_ctl->generation == generation)
|
||||
{
|
||||
Assert(LFC_ENABLED());
|
||||
CriticalAssert(LFC_ENABLED());
|
||||
lfc_ctl->hits += 1;
|
||||
pgBufferUsage.file_cache.hits += 1;
|
||||
Assert(entry->access_count > 0);
|
||||
CriticalAssert(entry->access_count > 0);
|
||||
if (--entry->access_count == 0)
|
||||
dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
|
||||
dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
|
||||
}
|
||||
else
|
||||
result = false;
|
||||
@@ -613,6 +664,8 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
|
||||
tag.forkNum = forkNum;
|
||||
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
|
||||
CopyNRelFileInfoToBufTag(tag, rinfo);
|
||||
|
||||
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
|
||||
hash = get_hash_value(lfc_hash, &tag);
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||
@@ -632,7 +685,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
|
||||
* operation
|
||||
*/
|
||||
if (entry->access_count++ == 0)
|
||||
dlist_delete(&entry->lru_node);
|
||||
dlist_delete(&entry->list_node);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -655,13 +708,26 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
|
||||
if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru))
|
||||
{
|
||||
/* Cache overflow: evict least recently used chunk */
|
||||
FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
|
||||
FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
|
||||
|
||||
Assert(victim->access_count == 0);
|
||||
CriticalAssert(victim->access_count == 0);
|
||||
entry->offset = victim->offset; /* grab victim's chunk */
|
||||
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
|
||||
neon_log(DEBUG2, "Swap file cache page");
|
||||
}
|
||||
else if (!dlist_is_empty(&lfc_ctl->holes))
|
||||
{
|
||||
/* We can reuse a hole that was left behind when the LFC was shrunk previously */
|
||||
FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes));
|
||||
uint32 offset = hole->offset;
|
||||
bool found;
|
||||
|
||||
hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found);
|
||||
CriticalAssert(found);
|
||||
|
||||
lfc_ctl->used += 1;
|
||||
entry->offset = offset; /* reuse the hole */
|
||||
}
|
||||
else
|
||||
{
|
||||
lfc_ctl->used += 1;
|
||||
@@ -689,11 +755,11 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
|
||||
|
||||
if (lfc_ctl->generation == generation)
|
||||
{
|
||||
Assert(LFC_ENABLED());
|
||||
CriticalAssert(LFC_ENABLED());
|
||||
/* Place entry to the head of LRU list */
|
||||
Assert(entry->access_count > 0);
|
||||
CriticalAssert(entry->access_count > 0);
|
||||
if (--entry->access_count == 0)
|
||||
dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
|
||||
dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
|
||||
|
||||
entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31));
|
||||
}
|
||||
@@ -708,7 +774,6 @@ typedef struct
|
||||
} NeonGetStatsCtx;
|
||||
|
||||
#define NUM_NEON_GET_STATS_COLS 2
|
||||
#define NUM_NEON_GET_STATS_ROWS 3
|
||||
|
||||
PG_FUNCTION_INFO_V1(neon_get_lfc_stats);
|
||||
Datum
|
||||
@@ -744,7 +809,6 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
|
||||
INT8OID, -1, 0);
|
||||
|
||||
fctx->tupdesc = BlessTupleDesc(tupledesc);
|
||||
funcctx->max_calls = NUM_NEON_GET_STATS_ROWS;
|
||||
funcctx->user_fctx = fctx;
|
||||
|
||||
/* Return to original context when allocating transient memory */
|
||||
@@ -778,6 +842,11 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->writes;
|
||||
break;
|
||||
case 4:
|
||||
key = "file_cache_size";
|
||||
if (lfc_ctl)
|
||||
value = lfc_ctl->size;
|
||||
break;
|
||||
default:
|
||||
SRF_RETURN_DONE(funcctx);
|
||||
}
|
||||
@@ -901,7 +970,7 @@ local_cache_pages(PG_FUNCTION_ARGS)
|
||||
hash_seq_init(&status, lfc_hash);
|
||||
while ((entry = hash_seq_search(&status)) != NULL)
|
||||
{
|
||||
for (int i = 0; i < BLOCKS_PER_CHUNK / 32; i++)
|
||||
for (int i = 0; i < CHUNK_BITMAP_SIZE; i++)
|
||||
n_pages += pg_popcount32(entry->bitmap[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -550,9 +550,6 @@ pageserver_connect(shardno_t shard_no, int elevel)
|
||||
case 2:
|
||||
pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
|
||||
break;
|
||||
case 1:
|
||||
pagestream_query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
|
||||
break;
|
||||
default:
|
||||
elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version);
|
||||
}
|
||||
@@ -1063,7 +1060,7 @@ pg_init_libpagestore(void)
|
||||
NULL,
|
||||
&neon_protocol_version,
|
||||
2, /* use protocol version 2 */
|
||||
1, /* min */
|
||||
2, /* min */
|
||||
2, /* max */
|
||||
PGC_SU_BACKEND,
|
||||
0, /* no flags required */
|
||||
|
||||
@@ -54,6 +54,10 @@
|
||||
|
||||
#define BufTagGetNRelFileInfo(tag) tag.rnode
|
||||
|
||||
#define BufTagGetRelNumber(tagp) ((tagp)->rnode.relNode)
|
||||
|
||||
#define InvalidRelFileNumber InvalidOid
|
||||
|
||||
#define SMgrRelGetRelInfo(reln) \
|
||||
(reln->smgr_rnode.node)
|
||||
|
||||
|
||||
@@ -87,9 +87,8 @@ typedef enum {
|
||||
* can skip traversing through recent layers which we know to not contain any
|
||||
* versions for the requested page.
|
||||
*
|
||||
* These structs describe the V2 of these requests. The old V1 protocol contained
|
||||
* just one LSN and a boolean 'latest' flag. If the neon_protocol_version GUC is
|
||||
* set to 1, we will convert these to the V1 requests before sending.
|
||||
* These structs describe the V2 of these requests. (The old now-defunct V1
|
||||
* protocol contained just one LSN and a boolean 'latest' flag.)
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
|
||||
@@ -1001,51 +1001,10 @@ nm_pack_request(NeonRequest *msg)
|
||||
|
||||
initStringInfo(&s);
|
||||
|
||||
if (neon_protocol_version >= 2)
|
||||
{
|
||||
pq_sendbyte(&s, msg->tag);
|
||||
pq_sendint64(&s, msg->lsn);
|
||||
pq_sendint64(&s, msg->not_modified_since);
|
||||
}
|
||||
else
|
||||
{
|
||||
bool latest;
|
||||
XLogRecPtr lsn;
|
||||
pq_sendbyte(&s, msg->tag);
|
||||
pq_sendint64(&s, msg->lsn);
|
||||
pq_sendint64(&s, msg->not_modified_since);
|
||||
|
||||
/*
|
||||
* In primary, we always request the latest page version.
|
||||
*/
|
||||
if (!RecoveryInProgress())
|
||||
{
|
||||
latest = true;
|
||||
lsn = msg->not_modified_since;
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* In the protocol V1, we cannot represent that we want to read
|
||||
* page at LSN X, and we know that it hasn't been modified since
|
||||
* Y. We can either use 'not_modified_lsn' as the request LSN, and
|
||||
* risk getting an error if that LSN is too old and has already
|
||||
* fallen out of the pageserver's GC horizon, or we can send
|
||||
* 'request_lsn', causing the pageserver to possibly wait for the
|
||||
* recent WAL to arrive unnecessarily. Or something in between. We
|
||||
* choose to use the old LSN and risk GC errors, because that's
|
||||
* what we've done historically.
|
||||
*/
|
||||
latest = false;
|
||||
lsn = msg->not_modified_since;
|
||||
}
|
||||
|
||||
pq_sendbyte(&s, msg->tag);
|
||||
pq_sendbyte(&s, latest);
|
||||
pq_sendint64(&s, lsn);
|
||||
}
|
||||
|
||||
/*
|
||||
* The rest of the request messages are the same between protocol V1 and
|
||||
* V2
|
||||
*/
|
||||
switch (messageTag(msg))
|
||||
{
|
||||
/* pagestore_client -> pagestore */
|
||||
|
||||
@@ -110,7 +110,8 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
|
||||
|
||||
tag.rinfo = rinfo;
|
||||
tag.forknum = forknum;
|
||||
LWLockAcquire(relsize_lock, LW_SHARED);
|
||||
/* We need exclusive lock here because of LRU list manipulation */
|
||||
LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
|
||||
entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL);
|
||||
if (entry != NULL)
|
||||
{
|
||||
|
||||
@@ -220,6 +220,64 @@ nwp_register_gucs(void)
|
||||
NULL, NULL, NULL);
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
split_safekeepers_list(char *safekeepers_list, char *safekeepers[])
|
||||
{
|
||||
int n_safekeepers = 0;
|
||||
char *curr_sk = safekeepers_list;
|
||||
|
||||
for (char *coma = safekeepers_list; coma != NULL && *coma != '\0'; curr_sk = coma)
|
||||
{
|
||||
if (++n_safekeepers >= MAX_SAFEKEEPERS) {
|
||||
wpg_log(FATAL, "too many safekeepers");
|
||||
}
|
||||
|
||||
coma = strchr(coma, ',');
|
||||
safekeepers[n_safekeepers-1] = curr_sk;
|
||||
|
||||
if (coma != NULL) {
|
||||
*coma++ = '\0';
|
||||
}
|
||||
}
|
||||
|
||||
return n_safekeepers;
|
||||
}
|
||||
|
||||
/*
|
||||
* Accept two coma-separated strings with list of safekeeper host:port addresses.
|
||||
* Split them into arrays and return false if two sets do not match, ignoring the order.
|
||||
*/
|
||||
static bool
|
||||
safekeepers_cmp(char *old, char *new)
|
||||
{
|
||||
char *safekeepers_old[MAX_SAFEKEEPERS];
|
||||
char *safekeepers_new[MAX_SAFEKEEPERS];
|
||||
int len_old = 0;
|
||||
int len_new = 0;
|
||||
|
||||
len_old = split_safekeepers_list(old, safekeepers_old);
|
||||
len_new = split_safekeepers_list(new, safekeepers_new);
|
||||
|
||||
if (len_old != len_new)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
qsort(&safekeepers_old, len_old, sizeof(char *), pg_qsort_strcmp);
|
||||
qsort(&safekeepers_new, len_new, sizeof(char *), pg_qsort_strcmp);
|
||||
|
||||
for (int i = 0; i < len_new; i++)
|
||||
{
|
||||
if (strcmp(safekeepers_old[i], safekeepers_new[i]) != 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* GUC assign_hook for neon.safekeepers. Restarts walproposer through FATAL if
|
||||
* the list changed.
|
||||
@@ -235,19 +293,26 @@ assign_neon_safekeepers(const char *newval, void *extra)
|
||||
wpg_log(FATAL, "neon.safekeepers is empty");
|
||||
}
|
||||
|
||||
/* Copy values because we will modify them in split_safekeepers_list() */
|
||||
char *newval_copy = pstrdup(newval);
|
||||
char *oldval = pstrdup(wal_acceptors_list);
|
||||
|
||||
/*
|
||||
* TODO: restarting through FATAL is stupid and introduces 1s delay before
|
||||
* next bgw start. We should refactor walproposer to allow graceful exit and
|
||||
* thus remove this delay.
|
||||
* XXX: If you change anything here, sync with test_safekeepers_reconfigure_reorder.
|
||||
*/
|
||||
if (strcmp(wal_acceptors_list, newval) != 0)
|
||||
if (!safekeepers_cmp(oldval, newval_copy))
|
||||
{
|
||||
wpg_log(FATAL, "restarting walproposer to change safekeeper list from %s to %s",
|
||||
wal_acceptors_list, newval);
|
||||
}
|
||||
pfree(newval_copy);
|
||||
pfree(oldval);
|
||||
}
|
||||
|
||||
/* Check if we need to suspend inserts because of lagging replication. */
|
||||
/* Check if we need to suspend inserts because of lagging replication. */
|
||||
static uint64
|
||||
backpressure_lag_impl(void)
|
||||
{
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
import argparse
|
||||
import enum
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from typing import List
|
||||
@@ -93,7 +94,7 @@ if __name__ == "__main__":
|
||||
"--no-color",
|
||||
action="store_true",
|
||||
help="disable colored output",
|
||||
default=not sys.stdout.isatty(),
|
||||
default=not sys.stdout.isatty() or os.getenv("TERM") == "dumb",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme a
|
||||
new SCRAM-based console API; uses SNI info to select the destination project (endpoint soon)
|
||||
* postgres
|
||||
uses postgres to select auth secrets of existing roles. Useful for local testing
|
||||
* link
|
||||
* web (or link)
|
||||
sends login link for all usernames
|
||||
|
||||
Also proxy can expose following services to the external world:
|
||||
@@ -36,7 +36,7 @@ To play with it locally one may start proxy over a local postgres installation
|
||||
```
|
||||
|
||||
If both postgres and proxy are running you may send a SQL query:
|
||||
```json
|
||||
```console
|
||||
curl -k -X POST 'https://proxy.localtest.me:4444/sql' \
|
||||
-H 'Neon-Connection-String: postgres://stas:pass@proxy.localtest.me:4444/postgres' \
|
||||
-H 'Content-Type: application/json' \
|
||||
@@ -44,7 +44,8 @@ curl -k -X POST 'https://proxy.localtest.me:4444/sql' \
|
||||
"query":"SELECT $1::int[] as arr, $2::jsonb as obj, 42 as num",
|
||||
"params":[ "{{1,2},{\"3\",4}}", {"key":"val", "ikey":4242}]
|
||||
}' | jq
|
||||
|
||||
```
|
||||
```json
|
||||
{
|
||||
"command": "SELECT",
|
||||
"fields": [
|
||||
|
||||
@@ -1,20 +1,20 @@
|
||||
//! Client authentication mechanisms.
|
||||
|
||||
pub mod backend;
|
||||
pub use backend::BackendType;
|
||||
pub use backend::Backend;
|
||||
|
||||
mod credentials;
|
||||
pub use credentials::{
|
||||
pub(crate) use credentials::{
|
||||
check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint,
|
||||
ComputeUserInfoParseError, IpPattern,
|
||||
};
|
||||
|
||||
mod password_hack;
|
||||
pub use password_hack::parse_endpoint_param;
|
||||
pub(crate) use password_hack::parse_endpoint_param;
|
||||
use password_hack::PasswordHackPayload;
|
||||
|
||||
mod flow;
|
||||
pub use flow::*;
|
||||
pub(crate) use flow::*;
|
||||
use tokio::time::error::Elapsed;
|
||||
|
||||
use crate::{
|
||||
@@ -25,13 +25,13 @@ use std::{io, net::IpAddr};
|
||||
use thiserror::Error;
|
||||
|
||||
/// Convenience wrapper for the authentication error.
|
||||
pub type Result<T> = std::result::Result<T, AuthError>;
|
||||
pub(crate) type Result<T> = std::result::Result<T, AuthError>;
|
||||
|
||||
/// Common authentication error.
|
||||
#[derive(Debug, Error)]
|
||||
pub enum AuthErrorImpl {
|
||||
pub(crate) enum AuthErrorImpl {
|
||||
#[error(transparent)]
|
||||
Link(#[from] backend::LinkAuthError),
|
||||
Web(#[from] backend::WebAuthError),
|
||||
|
||||
#[error(transparent)]
|
||||
GetAuthInfo(#[from] console::errors::GetAuthInfoError),
|
||||
@@ -77,30 +77,30 @@ pub enum AuthErrorImpl {
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
#[error(transparent)]
|
||||
pub struct AuthError(Box<AuthErrorImpl>);
|
||||
pub(crate) struct AuthError(Box<AuthErrorImpl>);
|
||||
|
||||
impl AuthError {
|
||||
pub fn bad_auth_method(name: impl Into<Box<str>>) -> Self {
|
||||
pub(crate) fn bad_auth_method(name: impl Into<Box<str>>) -> Self {
|
||||
AuthErrorImpl::BadAuthMethod(name.into()).into()
|
||||
}
|
||||
|
||||
pub fn auth_failed(user: impl Into<Box<str>>) -> Self {
|
||||
pub(crate) fn auth_failed(user: impl Into<Box<str>>) -> Self {
|
||||
AuthErrorImpl::AuthFailed(user.into()).into()
|
||||
}
|
||||
|
||||
pub fn ip_address_not_allowed(ip: IpAddr) -> Self {
|
||||
pub(crate) fn ip_address_not_allowed(ip: IpAddr) -> Self {
|
||||
AuthErrorImpl::IpAddressNotAllowed(ip).into()
|
||||
}
|
||||
|
||||
pub fn too_many_connections() -> Self {
|
||||
pub(crate) fn too_many_connections() -> Self {
|
||||
AuthErrorImpl::TooManyConnections.into()
|
||||
}
|
||||
|
||||
pub fn is_auth_failed(&self) -> bool {
|
||||
pub(crate) fn is_auth_failed(&self) -> bool {
|
||||
matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_))
|
||||
}
|
||||
|
||||
pub fn user_timeout(elapsed: Elapsed) -> Self {
|
||||
pub(crate) fn user_timeout(elapsed: Elapsed) -> Self {
|
||||
AuthErrorImpl::UserTimeout(elapsed).into()
|
||||
}
|
||||
}
|
||||
@@ -113,38 +113,36 @@ impl<E: Into<AuthErrorImpl>> From<E> for AuthError {
|
||||
|
||||
impl UserFacingError for AuthError {
|
||||
fn to_string_client(&self) -> String {
|
||||
use AuthErrorImpl::*;
|
||||
match self.0.as_ref() {
|
||||
Link(e) => e.to_string_client(),
|
||||
GetAuthInfo(e) => e.to_string_client(),
|
||||
Sasl(e) => e.to_string_client(),
|
||||
AuthFailed(_) => self.to_string(),
|
||||
BadAuthMethod(_) => self.to_string(),
|
||||
MalformedPassword(_) => self.to_string(),
|
||||
MissingEndpointName => self.to_string(),
|
||||
Io(_) => "Internal error".to_string(),
|
||||
IpAddressNotAllowed(_) => self.to_string(),
|
||||
TooManyConnections => self.to_string(),
|
||||
UserTimeout(_) => self.to_string(),
|
||||
AuthErrorImpl::Web(e) => e.to_string_client(),
|
||||
AuthErrorImpl::GetAuthInfo(e) => e.to_string_client(),
|
||||
AuthErrorImpl::Sasl(e) => e.to_string_client(),
|
||||
AuthErrorImpl::AuthFailed(_) => self.to_string(),
|
||||
AuthErrorImpl::BadAuthMethod(_) => self.to_string(),
|
||||
AuthErrorImpl::MalformedPassword(_) => self.to_string(),
|
||||
AuthErrorImpl::MissingEndpointName => self.to_string(),
|
||||
AuthErrorImpl::Io(_) => "Internal error".to_string(),
|
||||
AuthErrorImpl::IpAddressNotAllowed(_) => self.to_string(),
|
||||
AuthErrorImpl::TooManyConnections => self.to_string(),
|
||||
AuthErrorImpl::UserTimeout(_) => self.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ReportableError for AuthError {
|
||||
fn get_error_kind(&self) -> crate::error::ErrorKind {
|
||||
use AuthErrorImpl::*;
|
||||
match self.0.as_ref() {
|
||||
Link(e) => e.get_error_kind(),
|
||||
GetAuthInfo(e) => e.get_error_kind(),
|
||||
Sasl(e) => e.get_error_kind(),
|
||||
AuthFailed(_) => crate::error::ErrorKind::User,
|
||||
BadAuthMethod(_) => crate::error::ErrorKind::User,
|
||||
MalformedPassword(_) => crate::error::ErrorKind::User,
|
||||
MissingEndpointName => crate::error::ErrorKind::User,
|
||||
Io(_) => crate::error::ErrorKind::ClientDisconnect,
|
||||
IpAddressNotAllowed(_) => crate::error::ErrorKind::User,
|
||||
TooManyConnections => crate::error::ErrorKind::RateLimit,
|
||||
UserTimeout(_) => crate::error::ErrorKind::User,
|
||||
AuthErrorImpl::Web(e) => e.get_error_kind(),
|
||||
AuthErrorImpl::GetAuthInfo(e) => e.get_error_kind(),
|
||||
AuthErrorImpl::Sasl(e) => e.get_error_kind(),
|
||||
AuthErrorImpl::AuthFailed(_) => crate::error::ErrorKind::User,
|
||||
AuthErrorImpl::BadAuthMethod(_) => crate::error::ErrorKind::User,
|
||||
AuthErrorImpl::MalformedPassword(_) => crate::error::ErrorKind::User,
|
||||
AuthErrorImpl::MissingEndpointName => crate::error::ErrorKind::User,
|
||||
AuthErrorImpl::Io(_) => crate::error::ErrorKind::ClientDisconnect,
|
||||
AuthErrorImpl::IpAddressNotAllowed(_) => crate::error::ErrorKind::User,
|
||||
AuthErrorImpl::TooManyConnections => crate::error::ErrorKind::RateLimit,
|
||||
AuthErrorImpl::UserTimeout(_) => crate::error::ErrorKind::User,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,17 +1,19 @@
|
||||
mod classic;
|
||||
mod hacks;
|
||||
pub mod jwt;
|
||||
mod link;
|
||||
pub mod local;
|
||||
mod web;
|
||||
|
||||
use std::net::IpAddr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use ipnet::{Ipv4Net, Ipv6Net};
|
||||
pub use link::LinkAuthError;
|
||||
use local::LocalBackend;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_postgres::config::AuthKeys;
|
||||
use tracing::{info, warn};
|
||||
pub(crate) use web::WebAuthError;
|
||||
|
||||
use crate::auth::credentials::check_peer_addr_is_in_list;
|
||||
use crate::auth::{validate_password_and_exchange, AuthError};
|
||||
@@ -63,26 +65,27 @@ impl<T> std::ops::Deref for MaybeOwned<'_, T> {
|
||||
/// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`],
|
||||
/// this helps us provide the credentials only to those auth
|
||||
/// backends which require them for the authentication process.
|
||||
pub enum BackendType<'a, T, D> {
|
||||
pub enum Backend<'a, T, D> {
|
||||
/// Cloud API (V2).
|
||||
Console(MaybeOwned<'a, ConsoleBackend>, T),
|
||||
/// Authentication via a web browser.
|
||||
Link(MaybeOwned<'a, url::ApiUrl>, D),
|
||||
Web(MaybeOwned<'a, url::ApiUrl>, D),
|
||||
/// Local proxy uses configured auth credentials and does not wake compute
|
||||
Local(MaybeOwned<'a, LocalBackend>),
|
||||
}
|
||||
|
||||
pub trait TestBackend: Send + Sync + 'static {
|
||||
#[cfg(test)]
|
||||
pub(crate) trait TestBackend: Send + Sync + 'static {
|
||||
fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
|
||||
fn get_allowed_ips_and_secret(
|
||||
&self,
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>;
|
||||
fn get_role_secret(&self) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError>;
|
||||
}
|
||||
|
||||
impl std::fmt::Display for BackendType<'_, (), ()> {
|
||||
impl std::fmt::Display for Backend<'_, (), ()> {
|
||||
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
use BackendType::*;
|
||||
match self {
|
||||
Console(api, _) => match &**api {
|
||||
Self::Console(api, ()) => match &**api {
|
||||
ConsoleBackend::Console(endpoint) => {
|
||||
fmt.debug_tuple("Console").field(&endpoint.url()).finish()
|
||||
}
|
||||
@@ -93,74 +96,76 @@ impl std::fmt::Display for BackendType<'_, (), ()> {
|
||||
#[cfg(test)]
|
||||
ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(),
|
||||
},
|
||||
Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
|
||||
Self::Web(url, ()) => fmt.debug_tuple("Web").field(&url.as_str()).finish(),
|
||||
Self::Local(_) => fmt.debug_tuple("Local").finish(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T, D> BackendType<'_, T, D> {
|
||||
impl<T, D> Backend<'_, T, D> {
|
||||
/// Very similar to [`std::option::Option::as_ref`].
|
||||
/// This helps us pass structured config to async tasks.
|
||||
pub fn as_ref(&self) -> BackendType<'_, &T, &D> {
|
||||
use BackendType::*;
|
||||
pub(crate) fn as_ref(&self) -> Backend<'_, &T, &D> {
|
||||
match self {
|
||||
Console(c, x) => Console(MaybeOwned::Borrowed(c), x),
|
||||
Link(c, x) => Link(MaybeOwned::Borrowed(c), x),
|
||||
Self::Console(c, x) => Backend::Console(MaybeOwned::Borrowed(c), x),
|
||||
Self::Web(c, x) => Backend::Web(MaybeOwned::Borrowed(c), x),
|
||||
Self::Local(l) => Backend::Local(MaybeOwned::Borrowed(l)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T, D> BackendType<'a, T, D> {
|
||||
impl<'a, T, D> Backend<'a, T, D> {
|
||||
/// Very similar to [`std::option::Option::map`].
|
||||
/// Maps [`BackendType<T>`] to [`BackendType<R>`] by applying
|
||||
/// Maps [`Backend<T>`] to [`Backend<R>`] by applying
|
||||
/// a function to a contained value.
|
||||
pub fn map<R>(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R, D> {
|
||||
use BackendType::*;
|
||||
pub(crate) fn map<R>(self, f: impl FnOnce(T) -> R) -> Backend<'a, R, D> {
|
||||
match self {
|
||||
Console(c, x) => Console(c, f(x)),
|
||||
Link(c, x) => Link(c, x),
|
||||
Self::Console(c, x) => Backend::Console(c, f(x)),
|
||||
Self::Web(c, x) => Backend::Web(c, x),
|
||||
Self::Local(l) => Backend::Local(l),
|
||||
}
|
||||
}
|
||||
}
|
||||
impl<'a, T, D, E> BackendType<'a, Result<T, E>, D> {
|
||||
impl<'a, T, D, E> Backend<'a, Result<T, E>, D> {
|
||||
/// Very similar to [`std::option::Option::transpose`].
|
||||
/// This is most useful for error handling.
|
||||
pub fn transpose(self) -> Result<BackendType<'a, T, D>, E> {
|
||||
use BackendType::*;
|
||||
pub(crate) fn transpose(self) -> Result<Backend<'a, T, D>, E> {
|
||||
match self {
|
||||
Console(c, x) => x.map(|x| Console(c, x)),
|
||||
Link(c, x) => Ok(Link(c, x)),
|
||||
Self::Console(c, x) => x.map(|x| Backend::Console(c, x)),
|
||||
Self::Web(c, x) => Ok(Backend::Web(c, x)),
|
||||
Self::Local(l) => Ok(Backend::Local(l)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ComputeCredentials {
|
||||
pub info: ComputeUserInfo,
|
||||
pub keys: ComputeCredentialKeys,
|
||||
pub(crate) struct ComputeCredentials {
|
||||
pub(crate) info: ComputeUserInfo,
|
||||
pub(crate) keys: ComputeCredentialKeys,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ComputeUserInfoNoEndpoint {
|
||||
pub user: RoleName,
|
||||
pub options: NeonOptions,
|
||||
pub(crate) struct ComputeUserInfoNoEndpoint {
|
||||
pub(crate) user: RoleName,
|
||||
pub(crate) options: NeonOptions,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ComputeUserInfo {
|
||||
pub endpoint: EndpointId,
|
||||
pub user: RoleName,
|
||||
pub options: NeonOptions,
|
||||
pub(crate) struct ComputeUserInfo {
|
||||
pub(crate) endpoint: EndpointId,
|
||||
pub(crate) user: RoleName,
|
||||
pub(crate) options: NeonOptions,
|
||||
}
|
||||
|
||||
impl ComputeUserInfo {
|
||||
pub fn endpoint_cache_key(&self) -> EndpointCacheKey {
|
||||
pub(crate) fn endpoint_cache_key(&self) -> EndpointCacheKey {
|
||||
self.options.get_cache_key(&self.endpoint)
|
||||
}
|
||||
}
|
||||
|
||||
pub enum ComputeCredentialKeys {
|
||||
pub(crate) enum ComputeCredentialKeys {
|
||||
Password(Vec<u8>),
|
||||
AuthKeys(AuthKeys),
|
||||
None,
|
||||
}
|
||||
|
||||
impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
|
||||
@@ -217,7 +222,7 @@ impl RateBucketInfo {
|
||||
}
|
||||
|
||||
impl AuthenticationConfig {
|
||||
pub fn check_rate_limit(
|
||||
pub(crate) fn check_rate_limit(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
config: &AuthenticationConfig,
|
||||
@@ -293,7 +298,9 @@ async fn auth_quirks(
|
||||
ctx.set_endpoint_id(res.info.endpoint.clone());
|
||||
let password = match res.keys {
|
||||
ComputeCredentialKeys::Password(p) => p,
|
||||
_ => unreachable!("password hack should return a password"),
|
||||
ComputeCredentialKeys::AuthKeys(_) | ComputeCredentialKeys::None => {
|
||||
unreachable!("password hack should return a password")
|
||||
}
|
||||
};
|
||||
(res.info, Some(password))
|
||||
}
|
||||
@@ -317,21 +324,20 @@ async fn auth_quirks(
|
||||
};
|
||||
let (cached_entry, secret) = cached_secret.take_value();
|
||||
|
||||
let secret = match secret {
|
||||
Some(secret) => config.check_rate_limit(
|
||||
let secret = if let Some(secret) = secret {
|
||||
config.check_rate_limit(
|
||||
ctx,
|
||||
config,
|
||||
secret,
|
||||
&info.endpoint,
|
||||
unauthenticated_password.is_some() || allow_cleartext,
|
||||
)?,
|
||||
None => {
|
||||
// If we don't have an authentication secret, we mock one to
|
||||
// prevent malicious probing (possible due to missing protocol steps).
|
||||
// This mocked secret will never lead to successful authentication.
|
||||
info!("authentication info not found, mocking it");
|
||||
AuthSecret::Scram(scram::ServerSecret::mock(rand::random()))
|
||||
}
|
||||
)?
|
||||
} else {
|
||||
// If we don't have an authentication secret, we mock one to
|
||||
// prevent malicious probing (possible due to missing protocol steps).
|
||||
// This mocked secret will never lead to successful authentication.
|
||||
info!("authentication info not found, mocking it");
|
||||
AuthSecret::Scram(scram::ServerSecret::mock(rand::random()))
|
||||
};
|
||||
|
||||
match authenticate_with_secret(
|
||||
@@ -397,41 +403,28 @@ async fn authenticate_with_secret(
|
||||
classic::authenticate(ctx, info, client, config, secret).await
|
||||
}
|
||||
|
||||
impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
|
||||
/// Get compute endpoint name from the credentials.
|
||||
pub fn get_endpoint(&self) -> Option<EndpointId> {
|
||||
use BackendType::*;
|
||||
|
||||
match self {
|
||||
Console(_, user_info) => user_info.endpoint_id.clone(),
|
||||
Link(_, _) => Some("link".into()),
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> {
|
||||
/// Get username from the credentials.
|
||||
pub fn get_user(&self) -> &str {
|
||||
use BackendType::*;
|
||||
|
||||
pub(crate) fn get_user(&self) -> &str {
|
||||
match self {
|
||||
Console(_, user_info) => &user_info.user,
|
||||
Link(_, _) => "link",
|
||||
Self::Console(_, user_info) => &user_info.user,
|
||||
Self::Web(_, ()) => "web",
|
||||
Self::Local(_) => "local",
|
||||
}
|
||||
}
|
||||
|
||||
/// Authenticate the client via the requested backend, possibly using credentials.
|
||||
#[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
|
||||
pub async fn authenticate(
|
||||
pub(crate) async fn authenticate(
|
||||
self,
|
||||
ctx: &RequestMonitoring,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
allow_cleartext: bool,
|
||||
config: &'static AuthenticationConfig,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
) -> auth::Result<BackendType<'a, ComputeCredentials, NodeInfo>> {
|
||||
use BackendType::*;
|
||||
|
||||
) -> auth::Result<Backend<'a, ComputeCredentials, NodeInfo>> {
|
||||
let res = match self {
|
||||
Console(api, user_info) => {
|
||||
Self::Console(api, user_info) => {
|
||||
info!(
|
||||
user = &*user_info.user,
|
||||
project = user_info.endpoint(),
|
||||
@@ -448,15 +441,18 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
|
||||
endpoint_rate_limiter,
|
||||
)
|
||||
.await?;
|
||||
BackendType::Console(api, credentials)
|
||||
Backend::Console(api, credentials)
|
||||
}
|
||||
// NOTE: this auth backend doesn't use client credentials.
|
||||
Link(url, _) => {
|
||||
info!("performing link authentication");
|
||||
Self::Web(url, ()) => {
|
||||
info!("performing web authentication");
|
||||
|
||||
let info = link::authenticate(ctx, &url, client).await?;
|
||||
let info = web::authenticate(ctx, &url, client).await?;
|
||||
|
||||
BackendType::Link(url, info)
|
||||
Backend::Web(url, info)
|
||||
}
|
||||
Self::Local(_) => {
|
||||
return Err(auth::AuthError::bad_auth_method("invalid for local proxy"))
|
||||
}
|
||||
};
|
||||
|
||||
@@ -465,70 +461,72 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
|
||||
}
|
||||
}
|
||||
|
||||
impl BackendType<'_, ComputeUserInfo, &()> {
|
||||
pub async fn get_role_secret(
|
||||
impl Backend<'_, ComputeUserInfo, &()> {
|
||||
pub(crate) async fn get_role_secret(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
) -> Result<CachedRoleSecret, GetAuthInfoError> {
|
||||
use BackendType::*;
|
||||
match self {
|
||||
Console(api, user_info) => api.get_role_secret(ctx, user_info).await,
|
||||
Link(_, _) => Ok(Cached::new_uncached(None)),
|
||||
Self::Console(api, user_info) => api.get_role_secret(ctx, user_info).await,
|
||||
Self::Web(_, ()) => Ok(Cached::new_uncached(None)),
|
||||
Self::Local(_) => Ok(Cached::new_uncached(None)),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn get_allowed_ips_and_secret(
|
||||
pub(crate) async fn get_allowed_ips_and_secret(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
|
||||
use BackendType::*;
|
||||
match self {
|
||||
Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
|
||||
Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
|
||||
Self::Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
|
||||
Self::Web(_, ()) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
|
||||
Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
|
||||
impl ComputeConnectBackend for Backend<'_, ComputeCredentials, NodeInfo> {
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
|
||||
use BackendType::*;
|
||||
|
||||
match self {
|
||||
Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
|
||||
Link(_, info) => Ok(Cached::new_uncached(info.clone())),
|
||||
Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
|
||||
Self::Web(_, info) => Ok(Cached::new_uncached(info.clone())),
|
||||
Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_keys(&self) -> Option<&ComputeCredentialKeys> {
|
||||
fn get_keys(&self) -> &ComputeCredentialKeys {
|
||||
match self {
|
||||
BackendType::Console(_, creds) => Some(&creds.keys),
|
||||
BackendType::Link(_, _) => None,
|
||||
Self::Console(_, creds) => &creds.keys,
|
||||
Self::Web(_, _) => &ComputeCredentialKeys::None,
|
||||
Self::Local(_) => &ComputeCredentialKeys::None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
|
||||
impl ComputeConnectBackend for Backend<'_, ComputeCredentials, &()> {
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
|
||||
use BackendType::*;
|
||||
|
||||
match self {
|
||||
Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
|
||||
Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"),
|
||||
Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
|
||||
Self::Web(_, ()) => {
|
||||
unreachable!("web auth flow doesn't support waking the compute")
|
||||
}
|
||||
Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_keys(&self) -> Option<&ComputeCredentialKeys> {
|
||||
fn get_keys(&self) -> &ComputeCredentialKeys {
|
||||
match self {
|
||||
BackendType::Console(_, creds) => Some(&creds.keys),
|
||||
BackendType::Link(_, _) => None,
|
||||
Self::Console(_, creds) => &creds.keys,
|
||||
Self::Web(_, ()) => &ComputeCredentialKeys::None,
|
||||
Self::Local(_) => &ComputeCredentialKeys::None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,7 +17,7 @@ use tracing::{info, warn};
|
||||
/// one round trip and *expensive* computations (>= 4096 HMAC iterations).
|
||||
/// These properties are benefical for serverless JS workers, so we
|
||||
/// use this mechanism for websocket connections.
|
||||
pub async fn authenticate_cleartext(
|
||||
pub(crate) async fn authenticate_cleartext(
|
||||
ctx: &RequestMonitoring,
|
||||
info: ComputeUserInfo,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
@@ -59,7 +59,7 @@ pub async fn authenticate_cleartext(
|
||||
/// Workaround for clients which don't provide an endpoint (project) name.
|
||||
/// Similar to [`authenticate_cleartext`], but there's a specific password format,
|
||||
/// and passwords are not yet validated (we don't know how to validate them!)
|
||||
pub async fn password_hack_no_authentication(
|
||||
pub(crate) async fn password_hack_no_authentication(
|
||||
ctx: &RequestMonitoring,
|
||||
info: ComputeUserInfoNoEndpoint,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
|
||||
@@ -1,49 +1,81 @@
|
||||
use std::{future::Future, sync::Arc, time::Duration};
|
||||
use std::{
|
||||
future::Future,
|
||||
sync::Arc,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use arc_swap::ArcSwapOption;
|
||||
use dashmap::DashMap;
|
||||
use jose_jwk::crypto::KeyInfo;
|
||||
use serde::{Deserialize, Deserializer};
|
||||
use signature::Verifier;
|
||||
use tokio::time::Instant;
|
||||
|
||||
use crate::{http::parse_json_body_with_limit, intern::EndpointIdInt};
|
||||
use crate::{context::RequestMonitoring, http::parse_json_body_with_limit, EndpointId, RoleName};
|
||||
|
||||
// TODO(conrad): make these configurable.
|
||||
const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30);
|
||||
const MIN_RENEW: Duration = Duration::from_secs(30);
|
||||
const AUTO_RENEW: Duration = Duration::from_secs(300);
|
||||
const MAX_RENEW: Duration = Duration::from_secs(3600);
|
||||
const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
|
||||
|
||||
/// How to get the JWT auth rules
|
||||
pub trait FetchAuthRules: Clone + Send + Sync + 'static {
|
||||
fn fetch_auth_rules(&self) -> impl Future<Output = anyhow::Result<AuthRules>> + Send;
|
||||
pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
|
||||
fn fetch_auth_rules(
|
||||
&self,
|
||||
role_name: RoleName,
|
||||
) -> impl Future<Output = anyhow::Result<Vec<AuthRule>>> + Send;
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct FetchAuthRulesFromCplane {
|
||||
#[allow(dead_code)]
|
||||
endpoint: EndpointIdInt,
|
||||
}
|
||||
|
||||
impl FetchAuthRules for FetchAuthRulesFromCplane {
|
||||
async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
|
||||
Err(anyhow::anyhow!("not yet implemented"))
|
||||
}
|
||||
}
|
||||
|
||||
pub struct AuthRules {
|
||||
jwks_urls: Vec<url::Url>,
|
||||
pub(crate) struct AuthRule {
|
||||
pub(crate) id: String,
|
||||
pub(crate) jwks_url: url::Url,
|
||||
pub(crate) audience: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct JwkCache {
|
||||
pub(crate) struct JwkCache {
|
||||
client: reqwest::Client,
|
||||
|
||||
map: DashMap<EndpointIdInt, Arc<JwkCacheEntryLock>>,
|
||||
map: DashMap<(EndpointId, RoleName), Arc<JwkCacheEntryLock>>,
|
||||
}
|
||||
|
||||
pub struct JwkCacheEntryLock {
|
||||
pub(crate) struct JwkCacheEntry {
|
||||
/// Should refetch at least every hour to verify when old keys have been removed.
|
||||
/// Should refetch when new key IDs are seen only every 5 minutes or so
|
||||
last_retrieved: Instant,
|
||||
|
||||
/// cplane will return multiple JWKs urls that we need to scrape.
|
||||
key_sets: ahash::HashMap<String, KeySet>,
|
||||
}
|
||||
|
||||
impl JwkCacheEntry {
|
||||
fn find_jwk_and_audience(&self, key_id: &str) -> Option<(&jose_jwk::Jwk, Option<&str>)> {
|
||||
self.key_sets.values().find_map(|key_set| {
|
||||
key_set
|
||||
.find_key(key_id)
|
||||
.map(|jwk| (jwk, key_set.audience.as_deref()))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
struct KeySet {
|
||||
jwks: jose_jwk::JwkSet,
|
||||
audience: Option<String>,
|
||||
}
|
||||
|
||||
impl KeySet {
|
||||
fn find_key(&self, key_id: &str) -> Option<&jose_jwk::Jwk> {
|
||||
self.jwks
|
||||
.keys
|
||||
.iter()
|
||||
.find(|jwk| jwk.prm.kid.as_deref() == Some(key_id))
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct JwkCacheEntryLock {
|
||||
cached: ArcSwapOption<JwkCacheEntry>,
|
||||
lookup: tokio::sync::Semaphore,
|
||||
}
|
||||
@@ -57,15 +89,6 @@ impl Default for JwkCacheEntryLock {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct JwkCacheEntry {
|
||||
/// Should refetch at least every hour to verify when old keys have been removed.
|
||||
/// Should refetch when new key IDs are seen only every 5 minutes or so
|
||||
last_retrieved: Instant,
|
||||
|
||||
/// cplane will return multiple JWKs urls that we need to scrape.
|
||||
key_sets: ahash::HashMap<url::Url, jose_jwk::JwkSet>,
|
||||
}
|
||||
|
||||
impl JwkCacheEntryLock {
|
||||
async fn acquire_permit<'a>(self: &'a Arc<Self>) -> JwkRenewalPermit<'a> {
|
||||
JwkRenewalPermit::acquire_permit(self).await
|
||||
@@ -79,6 +102,7 @@ impl JwkCacheEntryLock {
|
||||
&self,
|
||||
_permit: JwkRenewalPermit<'_>,
|
||||
client: &reqwest::Client,
|
||||
role_name: RoleName,
|
||||
auth_rules: &F,
|
||||
) -> anyhow::Result<Arc<JwkCacheEntry>> {
|
||||
// double check that no one beat us to updating the cache.
|
||||
@@ -91,20 +115,19 @@ impl JwkCacheEntryLock {
|
||||
}
|
||||
}
|
||||
|
||||
let rules = auth_rules.fetch_auth_rules().await?;
|
||||
let mut key_sets = ahash::HashMap::with_capacity_and_hasher(
|
||||
rules.jwks_urls.len(),
|
||||
ahash::RandomState::new(),
|
||||
);
|
||||
let rules = auth_rules.fetch_auth_rules(role_name).await?;
|
||||
let mut key_sets =
|
||||
ahash::HashMap::with_capacity_and_hasher(rules.len(), ahash::RandomState::new());
|
||||
// TODO(conrad): run concurrently
|
||||
// TODO(conrad): strip the JWKs urls (should be checked by cplane as well - cloud#16284)
|
||||
for url in rules.jwks_urls {
|
||||
let req = client.get(url.clone());
|
||||
for rule in rules {
|
||||
let req = client.get(rule.jwks_url.clone());
|
||||
// TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`.
|
||||
// TODO(conrad): We need to filter out URLs that point to local resources. Public internet only.
|
||||
match req.send().await.and_then(|r| r.error_for_status()) {
|
||||
// todo: should we re-insert JWKs if we want to keep this JWKs URL?
|
||||
// I expect these failures would be quite sparse.
|
||||
Err(e) => tracing::warn!(?url, error=?e, "could not fetch JWKs"),
|
||||
Err(e) => tracing::warn!(url=?rule.jwks_url, error=?e, "could not fetch JWKs"),
|
||||
Ok(r) => {
|
||||
let resp: http::Response<reqwest::Body> = r.into();
|
||||
match parse_json_body_with_limit::<jose_jwk::JwkSet>(
|
||||
@@ -113,9 +136,17 @@ impl JwkCacheEntryLock {
|
||||
)
|
||||
.await
|
||||
{
|
||||
Err(e) => tracing::warn!(?url, error=?e, "could not decode JWKs"),
|
||||
Err(e) => {
|
||||
tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs");
|
||||
}
|
||||
Ok(jwks) => {
|
||||
key_sets.insert(url, jwks);
|
||||
key_sets.insert(
|
||||
rule.id,
|
||||
KeySet {
|
||||
jwks,
|
||||
audience: rule.audience,
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -133,7 +164,9 @@ impl JwkCacheEntryLock {
|
||||
|
||||
async fn get_or_update_jwk_cache<F: FetchAuthRules>(
|
||||
self: &Arc<Self>,
|
||||
ctx: &RequestMonitoring,
|
||||
client: &reqwest::Client,
|
||||
role_name: RoleName,
|
||||
fetch: &F,
|
||||
) -> Result<Arc<JwkCacheEntry>, anyhow::Error> {
|
||||
let now = Instant::now();
|
||||
@@ -141,18 +174,20 @@ impl JwkCacheEntryLock {
|
||||
|
||||
// if we have no cached JWKs, try and get some
|
||||
let Some(cached) = guard else {
|
||||
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
|
||||
let permit = self.acquire_permit().await;
|
||||
return self.renew_jwks(permit, client, fetch).await;
|
||||
return self.renew_jwks(permit, client, role_name, fetch).await;
|
||||
};
|
||||
|
||||
let last_update = now.duration_since(cached.last_retrieved);
|
||||
|
||||
// check if the cached JWKs need updating.
|
||||
if last_update > MAX_RENEW {
|
||||
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
|
||||
let permit = self.acquire_permit().await;
|
||||
|
||||
// it's been too long since we checked the keys. wait for them to update.
|
||||
return self.renew_jwks(permit, client, fetch).await;
|
||||
return self.renew_jwks(permit, client, role_name, fetch).await;
|
||||
}
|
||||
|
||||
// every 5 minutes we should spawn a job to eagerly update the token.
|
||||
@@ -164,7 +199,7 @@ impl JwkCacheEntryLock {
|
||||
let client = client.clone();
|
||||
let fetch = fetch.clone();
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = entry.renew_jwks(permit, &client, &fetch).await {
|
||||
if let Err(e) = entry.renew_jwks(permit, &client, role_name, &fetch).await {
|
||||
tracing::warn!(error=?e, "could not fetch JWKs in background job");
|
||||
}
|
||||
});
|
||||
@@ -178,8 +213,10 @@ impl JwkCacheEntryLock {
|
||||
|
||||
async fn check_jwt<F: FetchAuthRules>(
|
||||
self: &Arc<Self>,
|
||||
jwt: String,
|
||||
ctx: &RequestMonitoring,
|
||||
jwt: &str,
|
||||
client: &reqwest::Client,
|
||||
role_name: RoleName,
|
||||
fetch: &F,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
// JWT compact form is defined to be
|
||||
@@ -187,38 +224,38 @@ impl JwkCacheEntryLock {
|
||||
// where Signature = alg(<B64(Header)> || . || <B64(Payload)>);
|
||||
|
||||
let (header_payload, signature) = jwt
|
||||
.rsplit_once(".")
|
||||
.rsplit_once('.')
|
||||
.context("Provided authentication token is not a valid JWT encoding")?;
|
||||
let (header, _payload) = header_payload
|
||||
.split_once(".")
|
||||
let (header, payload) = header_payload
|
||||
.split_once('.')
|
||||
.context("Provided authentication token is not a valid JWT encoding")?;
|
||||
|
||||
let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)
|
||||
.context("Provided authentication token is not a valid JWT encoding")?;
|
||||
let header = serde_json::from_slice::<JWTHeader>(&header)
|
||||
let header = serde_json::from_slice::<JwtHeader<'_>>(&header)
|
||||
.context("Provided authentication token is not a valid JWT encoding")?;
|
||||
|
||||
let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)
|
||||
.context("Provided authentication token is not a valid JWT encoding")?;
|
||||
|
||||
ensure!(header.typ == "JWT");
|
||||
let kid = header.kid.context("missing key id")?;
|
||||
let kid = header.key_id.context("missing key id")?;
|
||||
|
||||
let mut guard = self.get_or_update_jwk_cache(client, fetch).await?;
|
||||
let mut guard = self
|
||||
.get_or_update_jwk_cache(ctx, client, role_name.clone(), fetch)
|
||||
.await?;
|
||||
|
||||
// get the key from the JWKs if possible. If not, wait for the keys to update.
|
||||
let jwk = loop {
|
||||
let jwk = guard
|
||||
.key_sets
|
||||
.values()
|
||||
.flat_map(|jwks| &jwks.keys)
|
||||
.find(|jwk| jwk.prm.kid.as_deref() == Some(kid));
|
||||
|
||||
match jwk {
|
||||
let (jwk, expected_audience) = loop {
|
||||
match guard.find_jwk_and_audience(kid) {
|
||||
Some(jwk) => break jwk,
|
||||
None if guard.last_retrieved.elapsed() > MIN_RENEW => {
|
||||
let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
|
||||
|
||||
let permit = self.acquire_permit().await;
|
||||
guard = self.renew_jwks(permit, client, fetch).await?;
|
||||
guard = self
|
||||
.renew_jwks(permit, client, role_name.clone(), fetch)
|
||||
.await?;
|
||||
}
|
||||
_ => {
|
||||
bail!("jwk not found");
|
||||
@@ -227,7 +264,7 @@ impl JwkCacheEntryLock {
|
||||
};
|
||||
|
||||
ensure!(
|
||||
jwk.is_supported(&header.alg),
|
||||
jwk.is_supported(&header.algorithm),
|
||||
"signature algorithm not supported"
|
||||
);
|
||||
|
||||
@@ -241,31 +278,57 @@ impl JwkCacheEntryLock {
|
||||
key => bail!("unsupported key type {key:?}"),
|
||||
};
|
||||
|
||||
// TODO(conrad): verify iss, exp, nbf, etc...
|
||||
let payload = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)
|
||||
.context("Provided authentication token is not a valid JWT encoding")?;
|
||||
let payload = serde_json::from_slice::<JwtPayload<'_>>(&payload)
|
||||
.context("Provided authentication token is not a valid JWT encoding")?;
|
||||
|
||||
tracing::debug!(?payload, "JWT signature valid with claims");
|
||||
|
||||
match (expected_audience, payload.audience) {
|
||||
// check the audience matches
|
||||
(Some(aud1), Some(aud2)) => ensure!(aud1 == aud2, "invalid JWT token audience"),
|
||||
// the audience is expected but is missing
|
||||
(Some(_), None) => bail!("invalid JWT token audience"),
|
||||
// we don't care for the audience field
|
||||
(None, _) => {}
|
||||
}
|
||||
|
||||
let now = SystemTime::now();
|
||||
|
||||
if let Some(exp) = payload.expiration {
|
||||
ensure!(now < exp + CLOCK_SKEW_LEEWAY);
|
||||
}
|
||||
|
||||
if let Some(nbf) = payload.not_before {
|
||||
ensure!(nbf < now + CLOCK_SKEW_LEEWAY);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl JwkCache {
|
||||
pub async fn check_jwt(
|
||||
pub(crate) async fn check_jwt<F: FetchAuthRules>(
|
||||
&self,
|
||||
endpoint: EndpointIdInt,
|
||||
jwt: String,
|
||||
ctx: &RequestMonitoring,
|
||||
endpoint: EndpointId,
|
||||
role_name: RoleName,
|
||||
fetch: &F,
|
||||
jwt: &str,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
// try with just a read lock first
|
||||
let entry = self.map.get(&endpoint).as_deref().map(Arc::clone);
|
||||
let entry = match entry {
|
||||
Some(entry) => entry,
|
||||
None => {
|
||||
// acquire a write lock after to insert.
|
||||
let entry = self.map.entry(endpoint).or_default();
|
||||
Arc::clone(&*entry)
|
||||
}
|
||||
};
|
||||
let key = (endpoint, role_name.clone());
|
||||
let entry = self.map.get(&key).as_deref().map(Arc::clone);
|
||||
let entry = entry.unwrap_or_else(|| {
|
||||
// acquire a write lock after to insert.
|
||||
let entry = self.map.entry(key).or_default();
|
||||
Arc::clone(&*entry)
|
||||
});
|
||||
|
||||
let fetch = FetchAuthRulesFromCplane { endpoint };
|
||||
entry.check_jwt(jwt, &self.client, &fetch).await
|
||||
entry
|
||||
.check_jwt(ctx, jwt, &self.client, role_name, fetch)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -315,13 +378,49 @@ fn verify_rsa_signature(
|
||||
|
||||
/// <https://datatracker.ietf.org/doc/html/rfc7515#section-4.1>
|
||||
#[derive(serde::Deserialize, serde::Serialize)]
|
||||
struct JWTHeader<'a> {
|
||||
struct JwtHeader<'a> {
|
||||
/// must be "JWT"
|
||||
#[serde(rename = "typ")]
|
||||
typ: &'a str,
|
||||
/// must be a supported alg
|
||||
alg: jose_jwa::Algorithm,
|
||||
#[serde(rename = "alg")]
|
||||
algorithm: jose_jwa::Algorithm,
|
||||
/// key id, must be provided for our usecase
|
||||
kid: Option<&'a str>,
|
||||
#[serde(rename = "kid")]
|
||||
key_id: Option<&'a str>,
|
||||
}
|
||||
|
||||
/// <https://datatracker.ietf.org/doc/html/rfc7519#section-4.1>
|
||||
#[derive(serde::Deserialize, serde::Serialize, Debug)]
|
||||
struct JwtPayload<'a> {
|
||||
/// Audience - Recipient for which the JWT is intended
|
||||
#[serde(rename = "aud")]
|
||||
audience: Option<&'a str>,
|
||||
/// Expiration - Time after which the JWT expires
|
||||
#[serde(deserialize_with = "numeric_date_opt", rename = "exp", default)]
|
||||
expiration: Option<SystemTime>,
|
||||
/// Not before - Time after which the JWT expires
|
||||
#[serde(deserialize_with = "numeric_date_opt", rename = "nbf", default)]
|
||||
not_before: Option<SystemTime>,
|
||||
|
||||
// the following entries are only extracted for the sake of debug logging.
|
||||
/// Issuer of the JWT
|
||||
#[serde(rename = "iss")]
|
||||
issuer: Option<&'a str>,
|
||||
/// Subject of the JWT (the user)
|
||||
#[serde(rename = "sub")]
|
||||
subject: Option<&'a str>,
|
||||
/// Unique token identifier
|
||||
#[serde(rename = "jti")]
|
||||
jwt_id: Option<&'a str>,
|
||||
/// Unique session identifier
|
||||
#[serde(rename = "sid")]
|
||||
session_id: Option<&'a str>,
|
||||
}
|
||||
|
||||
fn numeric_date_opt<'de, D: Deserializer<'de>>(d: D) -> Result<Option<SystemTime>, D::Error> {
|
||||
let d = <Option<u64>>::deserialize(d)?;
|
||||
Ok(d.map(|n| SystemTime::UNIX_EPOCH + Duration::from_secs(n)))
|
||||
}
|
||||
|
||||
struct JwkRenewalPermit<'a> {
|
||||
@@ -340,7 +439,7 @@ impl JwkRenewalPermit<'_> {
|
||||
}
|
||||
}
|
||||
|
||||
async fn acquire_permit(from: &Arc<JwkCacheEntryLock>) -> JwkRenewalPermit {
|
||||
async fn acquire_permit(from: &Arc<JwkCacheEntryLock>) -> JwkRenewalPermit<'_> {
|
||||
match from.lookup.acquire().await {
|
||||
Ok(permit) => {
|
||||
permit.forget();
|
||||
@@ -352,7 +451,7 @@ impl JwkRenewalPermit<'_> {
|
||||
}
|
||||
}
|
||||
|
||||
fn try_acquire_permit(from: &Arc<JwkCacheEntryLock>) -> Option<JwkRenewalPermit> {
|
||||
fn try_acquire_permit(from: &Arc<JwkCacheEntryLock>) -> Option<JwkRenewalPermit<'_>> {
|
||||
match from.lookup.try_acquire() {
|
||||
Ok(permit) => {
|
||||
permit.forget();
|
||||
@@ -388,6 +487,8 @@ impl Drop for JwkRenewalPermit<'_> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::RoleName;
|
||||
|
||||
use super::*;
|
||||
|
||||
use std::{future::IntoFuture, net::SocketAddr, time::SystemTime};
|
||||
@@ -431,10 +532,10 @@ mod tests {
|
||||
}
|
||||
|
||||
fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String {
|
||||
let header = JWTHeader {
|
||||
let header = JwtHeader {
|
||||
typ: "JWT",
|
||||
alg: jose_jwa::Algorithm::Signing(sig),
|
||||
kid: Some(&kid),
|
||||
algorithm: jose_jwa::Algorithm::Signing(sig),
|
||||
key_id: Some(&kid),
|
||||
};
|
||||
let body = typed_json::json! {{
|
||||
"exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600,
|
||||
@@ -524,33 +625,40 @@ mod tests {
|
||||
struct Fetch(SocketAddr);
|
||||
|
||||
impl FetchAuthRules for Fetch {
|
||||
async fn fetch_auth_rules(&self) -> anyhow::Result<AuthRules> {
|
||||
Ok(AuthRules {
|
||||
jwks_urls: vec![
|
||||
format!("http://{}/foo", self.0).parse().unwrap(),
|
||||
format!("http://{}/bar", self.0).parse().unwrap(),
|
||||
],
|
||||
})
|
||||
async fn fetch_auth_rules(
|
||||
&self,
|
||||
_role_name: RoleName,
|
||||
) -> anyhow::Result<Vec<AuthRule>> {
|
||||
Ok(vec![
|
||||
AuthRule {
|
||||
id: "foo".to_owned(),
|
||||
jwks_url: format!("http://{}/foo", self.0).parse().unwrap(),
|
||||
audience: None,
|
||||
},
|
||||
AuthRule {
|
||||
id: "bar".to_owned(),
|
||||
jwks_url: format!("http://{}/bar", self.0).parse().unwrap(),
|
||||
audience: None,
|
||||
},
|
||||
])
|
||||
}
|
||||
}
|
||||
|
||||
let role_name = RoleName::from("user");
|
||||
|
||||
let jwk_cache = Arc::new(JwkCacheEntryLock::default());
|
||||
|
||||
jwk_cache
|
||||
.check_jwt(jwt1, &client, &Fetch(addr))
|
||||
.await
|
||||
.unwrap();
|
||||
jwk_cache
|
||||
.check_jwt(jwt2, &client, &Fetch(addr))
|
||||
.await
|
||||
.unwrap();
|
||||
jwk_cache
|
||||
.check_jwt(jwt3, &client, &Fetch(addr))
|
||||
.await
|
||||
.unwrap();
|
||||
jwk_cache
|
||||
.check_jwt(jwt4, &client, &Fetch(addr))
|
||||
.await
|
||||
.unwrap();
|
||||
for token in [jwt1, jwt2, jwt3, jwt4] {
|
||||
jwk_cache
|
||||
.check_jwt(
|
||||
&RequestMonitoring::test(),
|
||||
&token,
|
||||
&client,
|
||||
role_name.clone(),
|
||||
&Fetch(addr),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
77
proxy/src/auth/backend/local.rs
Normal file
77
proxy/src/auth/backend/local.rs
Normal file
@@ -0,0 +1,77 @@
|
||||
use std::{collections::HashMap, net::SocketAddr};
|
||||
|
||||
use anyhow::Context;
|
||||
use arc_swap::ArcSwapOption;
|
||||
|
||||
use crate::{
|
||||
compute::ConnCfg,
|
||||
console::{
|
||||
messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo},
|
||||
NodeInfo,
|
||||
},
|
||||
intern::{BranchIdInt, BranchIdTag, EndpointIdTag, InternId, ProjectIdInt, ProjectIdTag},
|
||||
RoleName,
|
||||
};
|
||||
|
||||
use super::jwt::{AuthRule, FetchAuthRules, JwkCache};
|
||||
|
||||
pub struct LocalBackend {
|
||||
pub(crate) jwks_cache: JwkCache,
|
||||
pub(crate) node_info: NodeInfo,
|
||||
}
|
||||
|
||||
impl LocalBackend {
|
||||
pub fn new(postgres_addr: SocketAddr) -> Self {
|
||||
LocalBackend {
|
||||
jwks_cache: JwkCache::default(),
|
||||
node_info: NodeInfo {
|
||||
config: {
|
||||
let mut cfg = ConnCfg::new();
|
||||
cfg.host(&postgres_addr.ip().to_string());
|
||||
cfg.port(postgres_addr.port());
|
||||
cfg
|
||||
},
|
||||
// TODO(conrad): make this better reflect compute info rather than endpoint info.
|
||||
aux: MetricsAuxInfo {
|
||||
endpoint_id: EndpointIdTag::get_interner().get_or_intern("local"),
|
||||
project_id: ProjectIdTag::get_interner().get_or_intern("local"),
|
||||
branch_id: BranchIdTag::get_interner().get_or_intern("local"),
|
||||
cold_start_info: ColdStartInfo::WarmCached,
|
||||
},
|
||||
allow_self_signed_compute: false,
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub(crate) struct StaticAuthRules;
|
||||
|
||||
pub static JWKS_ROLE_MAP: ArcSwapOption<JwksRoleSettings> = ArcSwapOption::const_empty();
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct JwksRoleSettings {
|
||||
pub roles: HashMap<RoleName, EndpointJwksResponse>,
|
||||
pub project_id: ProjectIdInt,
|
||||
pub branch_id: BranchIdInt,
|
||||
}
|
||||
|
||||
impl FetchAuthRules for StaticAuthRules {
|
||||
async fn fetch_auth_rules(&self, role_name: RoleName) -> anyhow::Result<Vec<AuthRule>> {
|
||||
let mappings = JWKS_ROLE_MAP.load();
|
||||
let role_mappings = mappings
|
||||
.as_deref()
|
||||
.and_then(|m| m.roles.get(&role_name))
|
||||
.context("JWKs settings for this role were not configured")?;
|
||||
let mut rules = vec![];
|
||||
for setting in &role_mappings.jwks {
|
||||
rules.push(AuthRule {
|
||||
id: setting.id.clone(),
|
||||
jwks_url: setting.jwks_url.clone(),
|
||||
audience: setting.jwt_audience.clone(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(rules)
|
||||
}
|
||||
}
|
||||
@@ -13,7 +13,7 @@ use tokio_postgres::config::SslMode;
|
||||
use tracing::{info, info_span};
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum LinkAuthError {
|
||||
pub(crate) enum WebAuthError {
|
||||
#[error(transparent)]
|
||||
WaiterRegister(#[from] waiters::RegisterError),
|
||||
|
||||
@@ -24,18 +24,18 @@ pub enum LinkAuthError {
|
||||
Io(#[from] std::io::Error),
|
||||
}
|
||||
|
||||
impl UserFacingError for LinkAuthError {
|
||||
impl UserFacingError for WebAuthError {
|
||||
fn to_string_client(&self) -> String {
|
||||
"Internal error".to_string()
|
||||
}
|
||||
}
|
||||
|
||||
impl ReportableError for LinkAuthError {
|
||||
impl ReportableError for WebAuthError {
|
||||
fn get_error_kind(&self) -> crate::error::ErrorKind {
|
||||
match self {
|
||||
LinkAuthError::WaiterRegister(_) => crate::error::ErrorKind::Service,
|
||||
LinkAuthError::WaiterWait(_) => crate::error::ErrorKind::Service,
|
||||
LinkAuthError::Io(_) => crate::error::ErrorKind::ClientDisconnect,
|
||||
Self::WaiterRegister(_) => crate::error::ErrorKind::Service,
|
||||
Self::WaiterWait(_) => crate::error::ErrorKind::Service,
|
||||
Self::Io(_) => crate::error::ErrorKind::ClientDisconnect,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -52,7 +52,7 @@ fn hello_message(redirect_uri: &reqwest::Url, session_id: &str) -> String {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn new_psql_session_id() -> String {
|
||||
pub(crate) fn new_psql_session_id() -> String {
|
||||
hex::encode(rand::random::<[u8; 8]>())
|
||||
}
|
||||
|
||||
@@ -74,7 +74,7 @@ pub(super) async fn authenticate(
|
||||
}
|
||||
};
|
||||
|
||||
let span = info_span!("link", psql_session_id = &psql_session_id);
|
||||
let span = info_span!("web", psql_session_id = &psql_session_id);
|
||||
let greeting = hello_message(link_uri, &psql_session_id);
|
||||
|
||||
// Give user a URL to spawn a new database.
|
||||
@@ -87,7 +87,7 @@ pub(super) async fn authenticate(
|
||||
|
||||
// Wait for web console response (see `mgmt`).
|
||||
info!(parent: &span, "waiting for console's reply...");
|
||||
let db_info = waiter.await.map_err(LinkAuthError::from)?;
|
||||
let db_info = waiter.await.map_err(WebAuthError::from)?;
|
||||
|
||||
client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?;
|
||||
|
||||
@@ -16,7 +16,7 @@ use thiserror::Error;
|
||||
use tracing::{info, warn};
|
||||
|
||||
#[derive(Debug, Error, PartialEq, Eq, Clone)]
|
||||
pub enum ComputeUserInfoParseError {
|
||||
pub(crate) enum ComputeUserInfoParseError {
|
||||
#[error("Parameter '{0}' is missing in startup packet.")]
|
||||
MissingKey(&'static str),
|
||||
|
||||
@@ -51,20 +51,20 @@ impl ReportableError for ComputeUserInfoParseError {
|
||||
/// Various client credentials which we use for authentication.
|
||||
/// Note that we don't store any kind of client key or password here.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct ComputeUserInfoMaybeEndpoint {
|
||||
pub user: RoleName,
|
||||
pub endpoint_id: Option<EndpointId>,
|
||||
pub options: NeonOptions,
|
||||
pub(crate) struct ComputeUserInfoMaybeEndpoint {
|
||||
pub(crate) user: RoleName,
|
||||
pub(crate) endpoint_id: Option<EndpointId>,
|
||||
pub(crate) options: NeonOptions,
|
||||
}
|
||||
|
||||
impl ComputeUserInfoMaybeEndpoint {
|
||||
#[inline]
|
||||
pub fn endpoint(&self) -> Option<&str> {
|
||||
pub(crate) fn endpoint(&self) -> Option<&str> {
|
||||
self.endpoint_id.as_deref()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn endpoint_sni(
|
||||
pub(crate) fn endpoint_sni(
|
||||
sni: &str,
|
||||
common_names: &HashSet<String>,
|
||||
) -> Result<Option<EndpointId>, ComputeUserInfoParseError> {
|
||||
@@ -83,16 +83,18 @@ pub fn endpoint_sni(
|
||||
}
|
||||
|
||||
impl ComputeUserInfoMaybeEndpoint {
|
||||
pub fn parse(
|
||||
pub(crate) fn parse(
|
||||
ctx: &RequestMonitoring,
|
||||
params: &StartupMessageParams,
|
||||
sni: Option<&str>,
|
||||
common_names: Option<&HashSet<String>>,
|
||||
) -> Result<Self, ComputeUserInfoParseError> {
|
||||
use ComputeUserInfoParseError::*;
|
||||
|
||||
// Some parameters are stored in the startup message.
|
||||
let get_param = |key| params.get(key).ok_or(MissingKey(key));
|
||||
let get_param = |key| {
|
||||
params
|
||||
.get(key)
|
||||
.ok_or(ComputeUserInfoParseError::MissingKey(key))
|
||||
};
|
||||
let user: RoleName = get_param("user")?.into();
|
||||
|
||||
// Project name might be passed via PG's command-line options.
|
||||
@@ -122,12 +124,18 @@ impl ComputeUserInfoMaybeEndpoint {
|
||||
let endpoint = match (endpoint_option, endpoint_from_domain) {
|
||||
// Invariant: if we have both project name variants, they should match.
|
||||
(Some(option), Some(domain)) if option != domain => {
|
||||
Some(Err(InconsistentProjectNames { domain, option }))
|
||||
Some(Err(ComputeUserInfoParseError::InconsistentProjectNames {
|
||||
domain,
|
||||
option,
|
||||
}))
|
||||
}
|
||||
// Invariant: project name may not contain certain characters.
|
||||
(a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) {
|
||||
false => Err(MalformedProjectName(name)),
|
||||
true => Ok(name),
|
||||
(a, b) => a.or(b).map(|name| {
|
||||
if project_name_valid(name.as_ref()) {
|
||||
Ok(name)
|
||||
} else {
|
||||
Err(ComputeUserInfoParseError::MalformedProjectName(name))
|
||||
}
|
||||
}),
|
||||
}
|
||||
.transpose()?;
|
||||
@@ -165,12 +173,12 @@ impl ComputeUserInfoMaybeEndpoint {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &[IpPattern]) -> bool {
|
||||
pub(crate) fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &[IpPattern]) -> bool {
|
||||
ip_list.is_empty() || ip_list.iter().any(|pattern| check_ip(peer_addr, pattern))
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Eq, PartialEq)]
|
||||
pub enum IpPattern {
|
||||
pub(crate) enum IpPattern {
|
||||
Subnet(ipnet::IpNet),
|
||||
Range(IpAddr, IpAddr),
|
||||
Single(IpAddr),
|
||||
@@ -186,7 +194,7 @@ impl<'de> serde::de::Deserialize<'de> for IpPattern {
|
||||
impl<'de> serde::de::Visitor<'de> for StrVisitor {
|
||||
type Value = IpPattern;
|
||||
|
||||
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(formatter, "comma separated list with ip address, ip address range, or ip address subnet mask")
|
||||
}
|
||||
|
||||
|
||||
@@ -17,17 +17,20 @@ use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::info;
|
||||
|
||||
/// Every authentication selector is supposed to implement this trait.
|
||||
pub trait AuthMethod {
|
||||
pub(crate) trait AuthMethod {
|
||||
/// Any authentication selector should provide initial backend message
|
||||
/// containing auth method name and parameters, e.g. md5 salt.
|
||||
fn first_message(&self, channel_binding: bool) -> BeMessage<'_>;
|
||||
}
|
||||
|
||||
/// Initial state of [`AuthFlow`].
|
||||
pub struct Begin;
|
||||
pub(crate) struct Begin;
|
||||
|
||||
/// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`].
|
||||
pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a RequestMonitoring);
|
||||
pub(crate) struct Scram<'a>(
|
||||
pub(crate) &'a scram::ServerSecret,
|
||||
pub(crate) &'a RequestMonitoring,
|
||||
);
|
||||
|
||||
impl AuthMethod for Scram<'_> {
|
||||
#[inline(always)]
|
||||
@@ -44,7 +47,7 @@ impl AuthMethod for Scram<'_> {
|
||||
|
||||
/// Use an ad hoc auth flow (for clients which don't support SNI) proposed in
|
||||
/// <https://github.com/neondatabase/cloud/issues/1620#issuecomment-1165332290>.
|
||||
pub struct PasswordHack;
|
||||
pub(crate) struct PasswordHack;
|
||||
|
||||
impl AuthMethod for PasswordHack {
|
||||
#[inline(always)]
|
||||
@@ -55,10 +58,10 @@ impl AuthMethod for PasswordHack {
|
||||
|
||||
/// Use clear-text password auth called `password` in docs
|
||||
/// <https://www.postgresql.org/docs/current/auth-password.html>
|
||||
pub struct CleartextPassword {
|
||||
pub pool: Arc<ThreadPool>,
|
||||
pub endpoint: EndpointIdInt,
|
||||
pub secret: AuthSecret,
|
||||
pub(crate) struct CleartextPassword {
|
||||
pub(crate) pool: Arc<ThreadPool>,
|
||||
pub(crate) endpoint: EndpointIdInt,
|
||||
pub(crate) secret: AuthSecret,
|
||||
}
|
||||
|
||||
impl AuthMethod for CleartextPassword {
|
||||
@@ -70,7 +73,7 @@ impl AuthMethod for CleartextPassword {
|
||||
|
||||
/// This wrapper for [`PqStream`] performs client authentication.
|
||||
#[must_use]
|
||||
pub struct AuthFlow<'a, S, State> {
|
||||
pub(crate) struct AuthFlow<'a, S, State> {
|
||||
/// The underlying stream which implements libpq's protocol.
|
||||
stream: &'a mut PqStream<Stream<S>>,
|
||||
/// State might contain ancillary data (see [`Self::begin`]).
|
||||
@@ -81,7 +84,7 @@ pub struct AuthFlow<'a, S, State> {
|
||||
/// Initial state of the stream wrapper.
|
||||
impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> {
|
||||
/// Create a new wrapper for client authentication.
|
||||
pub fn new(stream: &'a mut PqStream<Stream<S>>) -> Self {
|
||||
pub(crate) fn new(stream: &'a mut PqStream<Stream<S>>) -> Self {
|
||||
let tls_server_end_point = stream.get_ref().tls_server_end_point();
|
||||
|
||||
Self {
|
||||
@@ -92,7 +95,7 @@ impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> {
|
||||
}
|
||||
|
||||
/// Move to the next step by sending auth method's name & params to client.
|
||||
pub async fn begin<M: AuthMethod>(self, method: M) -> io::Result<AuthFlow<'a, S, M>> {
|
||||
pub(crate) async fn begin<M: AuthMethod>(self, method: M) -> io::Result<AuthFlow<'a, S, M>> {
|
||||
self.stream
|
||||
.write_message(&method.first_message(self.tls_server_end_point.supported()))
|
||||
.await?;
|
||||
@@ -107,7 +110,7 @@ impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> {
|
||||
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
|
||||
/// Perform user authentication. Raise an error in case authentication failed.
|
||||
pub async fn get_password(self) -> super::Result<PasswordHackPayload> {
|
||||
pub(crate) async fn get_password(self) -> super::Result<PasswordHackPayload> {
|
||||
let msg = self.stream.read_password_message().await?;
|
||||
let password = msg
|
||||
.strip_suffix(&[0])
|
||||
@@ -126,7 +129,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
|
||||
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
|
||||
/// Perform user authentication. Raise an error in case authentication failed.
|
||||
pub async fn authenticate(self) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
|
||||
pub(crate) async fn authenticate(self) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
|
||||
let msg = self.stream.read_password_message().await?;
|
||||
let password = msg
|
||||
.strip_suffix(&[0])
|
||||
@@ -151,7 +154,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
|
||||
/// Stream wrapper for handling [SCRAM](crate::scram) auth.
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
|
||||
/// Perform user authentication. Raise an error in case authentication failed.
|
||||
pub async fn authenticate(self) -> super::Result<sasl::Outcome<scram::ScramKey>> {
|
||||
pub(crate) async fn authenticate(self) -> super::Result<sasl::Outcome<scram::ScramKey>> {
|
||||
let Scram(secret, ctx) = self.state;
|
||||
|
||||
// pause the timer while we communicate with the client
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
//! Payload for ad hoc authentication method for clients that don't support SNI.
|
||||
//! See the `impl` for [`super::backend::BackendType<ClientCredentials>`].
|
||||
//! See the `impl` for [`super::backend::Backend<ClientCredentials>`].
|
||||
//! Read more: <https://github.com/neondatabase/cloud/issues/1620#issuecomment-1165332290>.
|
||||
//! UPDATE (Mon Aug 8 13:20:34 UTC 2022): the payload format has been simplified.
|
||||
|
||||
@@ -7,13 +7,13 @@ use bstr::ByteSlice;
|
||||
|
||||
use crate::EndpointId;
|
||||
|
||||
pub struct PasswordHackPayload {
|
||||
pub endpoint: EndpointId,
|
||||
pub password: Vec<u8>,
|
||||
pub(crate) struct PasswordHackPayload {
|
||||
pub(crate) endpoint: EndpointId,
|
||||
pub(crate) password: Vec<u8>,
|
||||
}
|
||||
|
||||
impl PasswordHackPayload {
|
||||
pub fn parse(bytes: &[u8]) -> Option<Self> {
|
||||
pub(crate) fn parse(bytes: &[u8]) -> Option<Self> {
|
||||
// The format is `project=<utf-8>;<password-bytes>` or `project=<utf-8>$<password-bytes>`.
|
||||
let separators = [";", "$"];
|
||||
for sep in separators {
|
||||
@@ -30,7 +30,7 @@ impl PasswordHackPayload {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn parse_endpoint_param(bytes: &str) -> Option<&str> {
|
||||
pub(crate) fn parse_endpoint_param(bytes: &str) -> Option<&str> {
|
||||
bytes
|
||||
.strip_prefix("project=")
|
||||
.or_else(|| bytes.strip_prefix("endpoint="))
|
||||
|
||||
316
proxy/src/bin/local_proxy.rs
Normal file
316
proxy/src/bin/local_proxy.rs
Normal file
@@ -0,0 +1,316 @@
|
||||
use std::{
|
||||
net::SocketAddr,
|
||||
path::{Path, PathBuf},
|
||||
pin::pin,
|
||||
sync::Arc,
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use anyhow::{bail, ensure};
|
||||
use dashmap::DashMap;
|
||||
use futures::{future::Either, FutureExt};
|
||||
use proxy::{
|
||||
auth::backend::local::{JwksRoleSettings, LocalBackend, JWKS_ROLE_MAP},
|
||||
cancellation::CancellationHandlerMain,
|
||||
config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig},
|
||||
console::{locks::ApiLocks, messages::JwksRoleMapping},
|
||||
http::health_server::AppMetrics,
|
||||
metrics::{Metrics, ThreadPoolMetrics},
|
||||
rate_limiter::{BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo},
|
||||
scram::threadpool::ThreadPool,
|
||||
serverless::{self, cancel_set::CancelSet, GlobalConnPoolOptions},
|
||||
};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
project_build_tag!(BUILD_TAG);
|
||||
|
||||
use clap::Parser;
|
||||
use tokio::{net::TcpListener, task::JoinSet};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info, warn};
|
||||
use utils::{project_build_tag, project_git_version, sentry_init::init_sentry};
|
||||
|
||||
#[global_allocator]
|
||||
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
|
||||
|
||||
/// Neon proxy/router
|
||||
#[derive(Parser)]
|
||||
#[command(version = GIT_VERSION, about)]
|
||||
struct LocalProxyCliArgs {
|
||||
/// listen for incoming metrics connections on ip:port
|
||||
#[clap(long, default_value = "127.0.0.1:7001")]
|
||||
metrics: String,
|
||||
/// listen for incoming http connections on ip:port
|
||||
#[clap(long)]
|
||||
http: String,
|
||||
/// timeout for the TLS handshake
|
||||
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
|
||||
handshake_timeout: tokio::time::Duration,
|
||||
/// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
|
||||
#[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)]
|
||||
connect_compute_lock: String,
|
||||
#[clap(flatten)]
|
||||
sql_over_http: SqlOverHttpArgs,
|
||||
/// User rate limiter max number of requests per second.
|
||||
///
|
||||
/// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
|
||||
/// Can be given multiple times for different bucket sizes.
|
||||
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
|
||||
user_rps_limit: Vec<RateBucketInfo>,
|
||||
/// Whether the auth rate limiter actually takes effect (for testing)
|
||||
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
auth_rate_limit_enabled: bool,
|
||||
/// Authentication rate limiter max number of hashes per second.
|
||||
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)]
|
||||
auth_rate_limit: Vec<RateBucketInfo>,
|
||||
/// The IP subnet to use when considering whether two IP addresses are considered the same.
|
||||
#[clap(long, default_value_t = 64)]
|
||||
auth_rate_limit_ip_subnet: u8,
|
||||
/// Whether to retry the connection to the compute node
|
||||
#[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)]
|
||||
connect_to_compute_retry: String,
|
||||
/// Address of the postgres server
|
||||
#[clap(long, default_value = "127.0.0.1:5432")]
|
||||
compute: SocketAddr,
|
||||
/// File address of the local proxy config file
|
||||
#[clap(long, default_value = "./localproxy.json")]
|
||||
config_path: PathBuf,
|
||||
}
|
||||
|
||||
#[derive(clap::Args, Clone, Copy, Debug)]
|
||||
struct SqlOverHttpArgs {
|
||||
/// How many connections to pool for each endpoint. Excess connections are discarded
|
||||
#[clap(long, default_value_t = 200)]
|
||||
sql_over_http_pool_max_total_conns: usize,
|
||||
|
||||
/// How long pooled connections should remain idle for before closing
|
||||
#[clap(long, default_value = "5m", value_parser = humantime::parse_duration)]
|
||||
sql_over_http_idle_timeout: tokio::time::Duration,
|
||||
|
||||
#[clap(long, default_value_t = 100)]
|
||||
sql_over_http_client_conn_threshold: u64,
|
||||
|
||||
#[clap(long, default_value_t = 16)]
|
||||
sql_over_http_cancel_set_shards: usize,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let _logging_guard = proxy::logging::init().await?;
|
||||
let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
|
||||
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
|
||||
|
||||
Metrics::install(Arc::new(ThreadPoolMetrics::new(0)));
|
||||
|
||||
info!("Version: {GIT_VERSION}");
|
||||
info!("Build_tag: {BUILD_TAG}");
|
||||
let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo {
|
||||
revision: GIT_VERSION,
|
||||
build_tag: BUILD_TAG,
|
||||
});
|
||||
|
||||
let jemalloc = match proxy::jemalloc::MetricRecorder::new() {
|
||||
Ok(t) => Some(t),
|
||||
Err(e) => {
|
||||
tracing::error!(error = ?e, "could not start jemalloc metrics loop");
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
let args = LocalProxyCliArgs::parse();
|
||||
let config = build_config(&args)?;
|
||||
|
||||
let metrics_listener = TcpListener::bind(args.metrics).await?.into_std()?;
|
||||
let http_listener = TcpListener::bind(args.http).await?;
|
||||
let shutdown = CancellationToken::new();
|
||||
|
||||
// todo: should scale with CU
|
||||
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
|
||||
LeakyBucketConfig {
|
||||
rps: 10.0,
|
||||
max: 100.0,
|
||||
},
|
||||
16,
|
||||
));
|
||||
|
||||
refresh_config(args.config_path.clone()).await;
|
||||
|
||||
let mut maintenance_tasks = JoinSet::new();
|
||||
maintenance_tasks.spawn(proxy::handle_signals(shutdown.clone(), move || {
|
||||
refresh_config(args.config_path.clone()).map(Ok)
|
||||
}));
|
||||
maintenance_tasks.spawn(proxy::http::health_server::task_main(
|
||||
metrics_listener,
|
||||
AppMetrics {
|
||||
jemalloc,
|
||||
neon_metrics,
|
||||
proxy: proxy::metrics::Metrics::get(),
|
||||
},
|
||||
));
|
||||
|
||||
let task = serverless::task_main(
|
||||
config,
|
||||
http_listener,
|
||||
shutdown.clone(),
|
||||
Arc::new(CancellationHandlerMain::new(
|
||||
Arc::new(DashMap::new()),
|
||||
None,
|
||||
proxy::metrics::CancellationSource::Local,
|
||||
)),
|
||||
endpoint_rate_limiter,
|
||||
);
|
||||
|
||||
match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await {
|
||||
// exit immediately on maintenance task completion
|
||||
Either::Left((Some(res), _)) => match proxy::flatten_err(res)? {},
|
||||
// exit with error immediately if all maintenance tasks have ceased (should be caught by branch above)
|
||||
Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"),
|
||||
// exit immediately on client task error
|
||||
Either::Right((res, _)) => res?,
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// ProxyConfig is created at proxy startup, and lives forever.
|
||||
fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
let config::ConcurrencyLockOptions {
|
||||
shards,
|
||||
limiter,
|
||||
epoch,
|
||||
timeout,
|
||||
} = args.connect_compute_lock.parse()?;
|
||||
info!(
|
||||
?limiter,
|
||||
shards,
|
||||
?epoch,
|
||||
"Using NodeLocks (connect_compute)"
|
||||
);
|
||||
let connect_compute_locks = ApiLocks::new(
|
||||
"connect_compute_lock",
|
||||
limiter,
|
||||
shards,
|
||||
timeout,
|
||||
epoch,
|
||||
&Metrics::get().proxy.connect_compute_lock,
|
||||
)?;
|
||||
|
||||
let http_config = HttpConfig {
|
||||
accept_websockets: false,
|
||||
pool_options: GlobalConnPoolOptions {
|
||||
gc_epoch: Duration::from_secs(60),
|
||||
pool_shards: 2,
|
||||
idle_timeout: args.sql_over_http.sql_over_http_idle_timeout,
|
||||
opt_in: false,
|
||||
|
||||
max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_total_conns,
|
||||
max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns,
|
||||
},
|
||||
cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
|
||||
client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
|
||||
};
|
||||
|
||||
Ok(Box::leak(Box::new(ProxyConfig {
|
||||
tls_config: None,
|
||||
auth_backend: proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned(
|
||||
LocalBackend::new(args.compute),
|
||||
)),
|
||||
metric_collection: None,
|
||||
allow_self_signed_compute: false,
|
||||
http_config,
|
||||
authentication_config: AuthenticationConfig {
|
||||
thread_pool: ThreadPool::new(0),
|
||||
scram_protocol_timeout: Duration::from_secs(10),
|
||||
rate_limiter_enabled: false,
|
||||
rate_limiter: BucketRateLimiter::new(vec![]),
|
||||
rate_limit_ip_subnet: 64,
|
||||
},
|
||||
require_client_ip: false,
|
||||
handshake_timeout: Duration::from_secs(10),
|
||||
region: "local".into(),
|
||||
wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?,
|
||||
connect_compute_locks,
|
||||
connect_to_compute_retry_config: RetryConfig::parse(
|
||||
RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES,
|
||||
)?,
|
||||
})))
|
||||
}
|
||||
|
||||
async fn refresh_config(path: PathBuf) {
|
||||
match refresh_config_inner(&path).await {
|
||||
Ok(()) => {}
|
||||
Err(e) => {
|
||||
error!(error=?e, ?path, "could not read config file");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn refresh_config_inner(path: &Path) -> anyhow::Result<()> {
|
||||
let bytes = tokio::fs::read(&path).await?;
|
||||
let mut data: JwksRoleMapping = serde_json::from_slice(&bytes)?;
|
||||
|
||||
let mut settings = None;
|
||||
|
||||
for mapping in data.roles.values_mut() {
|
||||
for jwks in &mut mapping.jwks {
|
||||
ensure!(
|
||||
jwks.jwks_url.has_authority()
|
||||
&& (jwks.jwks_url.scheme() == "http" || jwks.jwks_url.scheme() == "https"),
|
||||
"Invalid JWKS url. Must be HTTP",
|
||||
);
|
||||
|
||||
ensure!(
|
||||
jwks.jwks_url
|
||||
.host()
|
||||
.is_some_and(|h| h != url::Host::Domain("")),
|
||||
"Invalid JWKS url. No domain listed",
|
||||
);
|
||||
|
||||
// clear username, password and ports
|
||||
jwks.jwks_url.set_username("").expect(
|
||||
"url can be a base and has a valid host and is not a file. should not error",
|
||||
);
|
||||
jwks.jwks_url.set_password(None).expect(
|
||||
"url can be a base and has a valid host and is not a file. should not error",
|
||||
);
|
||||
// local testing is hard if we need to have a specific restricted port
|
||||
if cfg!(not(feature = "testing")) {
|
||||
jwks.jwks_url.set_port(None).expect(
|
||||
"url can be a base and has a valid host and is not a file. should not error",
|
||||
);
|
||||
}
|
||||
|
||||
// clear query params
|
||||
jwks.jwks_url.set_fragment(None);
|
||||
jwks.jwks_url.query_pairs_mut().clear().finish();
|
||||
|
||||
if jwks.jwks_url.scheme() != "https" {
|
||||
// local testing is hard if we need to set up https support.
|
||||
if cfg!(not(feature = "testing")) {
|
||||
jwks.jwks_url
|
||||
.set_scheme("https")
|
||||
.expect("should not error to set the scheme to https if it was http");
|
||||
} else {
|
||||
warn!(scheme = jwks.jwks_url.scheme(), "JWKS url is not HTTPS");
|
||||
}
|
||||
}
|
||||
|
||||
let (pr, br) = settings.get_or_insert((jwks.project_id, jwks.branch_id));
|
||||
ensure!(
|
||||
*pr == jwks.project_id,
|
||||
"inconsistent project IDs configured"
|
||||
);
|
||||
ensure!(*br == jwks.branch_id, "inconsistent branch IDs configured");
|
||||
}
|
||||
}
|
||||
|
||||
if let Some((project_id, branch_id)) = settings {
|
||||
JWKS_ROLE_MAP.store(Some(Arc::new(JwksRoleSettings {
|
||||
roles: data.roles,
|
||||
project_id,
|
||||
branch_id,
|
||||
})));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -133,7 +133,9 @@ async fn main() -> anyhow::Result<()> {
|
||||
proxy_listener,
|
||||
cancellation_token.clone(),
|
||||
));
|
||||
let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token));
|
||||
let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token, || async {
|
||||
Ok(())
|
||||
}));
|
||||
|
||||
// the signal task cant ever succeed.
|
||||
// the main task can error, or can succeed on cancellation.
|
||||
|
||||
@@ -60,11 +60,14 @@ use clap::{Parser, ValueEnum};
|
||||
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
|
||||
|
||||
#[derive(Clone, Debug, ValueEnum)]
|
||||
enum AuthBackend {
|
||||
enum AuthBackendType {
|
||||
Console,
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres,
|
||||
Link,
|
||||
// clap only shows the name, not the alias, in usage text.
|
||||
// TODO: swap name/alias and deprecate "link"
|
||||
#[value(name("link"), alias("web"))]
|
||||
Web,
|
||||
}
|
||||
|
||||
/// Neon proxy/router
|
||||
@@ -77,8 +80,8 @@ struct ProxyCliArgs {
|
||||
/// listen for incoming client connections on ip:port
|
||||
#[clap(short, long, default_value = "127.0.0.1:4432")]
|
||||
proxy: String,
|
||||
#[clap(value_enum, long, default_value_t = AuthBackend::Link)]
|
||||
auth_backend: AuthBackend,
|
||||
#[clap(value_enum, long, default_value_t = AuthBackendType::Web)]
|
||||
auth_backend: AuthBackendType,
|
||||
/// listen for management callback connection on ip:port
|
||||
#[clap(short, long, default_value = "127.0.0.1:7000")]
|
||||
mgmt: String,
|
||||
@@ -88,7 +91,7 @@ struct ProxyCliArgs {
|
||||
/// listen for incoming wss connections on ip:port
|
||||
#[clap(long)]
|
||||
wss: Option<String>,
|
||||
/// redirect unauthenticated users to the given uri in case of link auth
|
||||
/// redirect unauthenticated users to the given uri in case of web auth
|
||||
#[clap(short, long, default_value = "http://localhost:3000/psql_session/")]
|
||||
uri: String,
|
||||
/// cloud API endpoint for authenticating users
|
||||
@@ -148,7 +151,7 @@ struct ProxyCliArgs {
|
||||
disable_dynamic_rate_limiter: bool,
|
||||
/// Endpoint rate limiter max number of requests per second.
|
||||
///
|
||||
/// Provided in the form '<Requests Per Second>@<Bucket Duration Size>'.
|
||||
/// Provided in the form `<Requests Per Second>@<Bucket Duration Size>`.
|
||||
/// Can be given multiple times for different bucket sizes.
|
||||
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
|
||||
endpoint_rps_limit: Vec<RateBucketInfo>,
|
||||
@@ -173,9 +176,6 @@ struct ProxyCliArgs {
|
||||
/// cache for `role_secret` (use `size=0` to disable)
|
||||
#[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
|
||||
role_secret_cache: String,
|
||||
/// disable ip check for http requests. If it is too time consuming, it could be turned off.
|
||||
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
disable_ip_check_for_http: bool,
|
||||
/// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections)
|
||||
#[clap(long)]
|
||||
redis_notifications: Option<String>,
|
||||
@@ -450,7 +450,10 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
// maintenance tasks. these never return unless there's an error
|
||||
let mut maintenance_tasks = JoinSet::new();
|
||||
maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone()));
|
||||
maintenance_tasks.spawn(proxy::handle_signals(
|
||||
cancellation_token.clone(),
|
||||
|| async { Ok(()) },
|
||||
));
|
||||
maintenance_tasks.spawn(http::health_server::task_main(
|
||||
http_listener,
|
||||
AppMetrics {
|
||||
@@ -470,7 +473,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
));
|
||||
}
|
||||
|
||||
if let auth::BackendType::Console(api, _) = &config.auth_backend {
|
||||
if let auth::Backend::Console(api, _) = &config.auth_backend {
|
||||
if let proxy::console::provider::ConsoleBackend::Console(api) = &**api {
|
||||
match (redis_notifications_client, regional_redis_client.clone()) {
|
||||
(None, None) => {}
|
||||
@@ -575,7 +578,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
}
|
||||
|
||||
let auth_backend = match &args.auth_backend {
|
||||
AuthBackend::Console => {
|
||||
AuthBackendType::Console => {
|
||||
let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
|
||||
let project_info_cache_config: ProjectInfoCacheOptions =
|
||||
args.project_info_cache.parse()?;
|
||||
@@ -624,18 +627,18 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
wake_compute_endpoint_rate_limiter,
|
||||
);
|
||||
let api = console::provider::ConsoleBackend::Console(api);
|
||||
auth::BackendType::Console(MaybeOwned::Owned(api), ())
|
||||
auth::Backend::Console(MaybeOwned::Owned(api), ())
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
AuthBackend::Postgres => {
|
||||
AuthBackendType::Postgres => {
|
||||
let url = args.auth_endpoint.parse()?;
|
||||
let api = console::provider::mock::Api::new(url);
|
||||
let api = console::provider::ConsoleBackend::Postgres(api);
|
||||
auth::BackendType::Console(MaybeOwned::Owned(api), ())
|
||||
auth::Backend::Console(MaybeOwned::Owned(api), ())
|
||||
}
|
||||
AuthBackend::Link => {
|
||||
AuthBackendType::Web => {
|
||||
let url = args.uri.parse()?;
|
||||
auth::BackendType::Link(MaybeOwned::Owned(url), ())
|
||||
auth::Backend::Web(MaybeOwned::Owned(url), ())
|
||||
}
|
||||
};
|
||||
|
||||
@@ -661,6 +664,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
)?;
|
||||
|
||||
let http_config = HttpConfig {
|
||||
accept_websockets: true,
|
||||
pool_options: GlobalConnPoolOptions {
|
||||
max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
|
||||
gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user