Merge pull request #9710 from neondatabase/rc/2024-11-11

Storage & Compute release 2024-11-11
Author: Alex Chi Z. (committed via GitHub)
Date: 2024-11-11 11:05:41 -05:00
144 changed files with 5359 additions and 2627 deletions


@@ -1,36 +0,0 @@
name: "Set custom docker config directory"
description: "Create a directory for docker config and set DOCKER_CONFIG"
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
runs:
using: "composite"
steps:
- name: Show warning on GitHub-hosted runners
if: runner.environment == 'github-hosted'
shell: bash -euo pipefail {0}
run: |
# Using the following environment variables to find a path to the workflow file
# ${GITHUB_WORKFLOW_REF} - octocat/hello-world/.github/workflows/my-workflow.yml@refs/heads/my_branch
# ${GITHUB_REPOSITORY} - octocat/hello-world
# ${GITHUB_REF} - refs/heads/my_branch
# From https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/variables
filename_with_ref=${GITHUB_WORKFLOW_REF#"$GITHUB_REPOSITORY/"}
filename=${filename_with_ref%"@$GITHUB_REF"}
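# Worked example with the sample values documented above (illustration only):
#   GITHUB_WORKFLOW_REF = octocat/hello-world/.github/workflows/my-workflow.yml@refs/heads/my_branch
#   filename_with_ref   = .github/workflows/my-workflow.yml@refs/heads/my_branch
#   filename            = .github/workflows/my-workflow.yml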
# https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-a-warning-message
title='Unnecessary usage of `.github/actions/set-docker-config-dir`'
message='No need to use `.github/actions/set-docker-config-dir` action on GitHub-hosted runners'
echo "::warning file=${filename},title=${title}::${message}"
- uses: pyTooling/Actions/with-post-step@74afc5a42a17a046c90c68cb5cfa627e5c6c5b6b # v1.0.7
env:
DOCKER_CONFIG: .docker-custom-${{ github.run_id }}-${{ github.run_attempt }}
with:
main: |
mkdir -p "${DOCKER_CONFIG}"
echo DOCKER_CONFIG=${DOCKER_CONFIG} | tee -a $GITHUB_ENV
post: |
if [ -d "${DOCKER_CONFIG}" ]; then
rm -r "${DOCKER_CONFIG}"
fi


@@ -0,0 +1,37 @@
name: Check Codestyle Python
on:
workflow_call:
inputs:
build-tools-image:
description: 'build-tools image'
required: true
type: string
defaults:
run:
shell: bash -euxo pipefail {0}
jobs:
check-codestyle-python:
runs-on: [ self-hosted, small ]
container:
image: ${{ inputs.build-tools-image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
steps:
- uses: actions/checkout@v4
- uses: actions/cache@v4
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}
- run: ./scripts/pysync
- run: poetry run ruff check .
- run: poetry run ruff format --check .
- run: poetry run mypy .


@@ -64,7 +64,7 @@ jobs:
- uses: actions/checkout@v4
- uses: ./.github/actions/set-docker-config-dir
- uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
- uses: docker/setup-buildx-action@v3
with:
cache-binary: false


@@ -90,35 +90,10 @@ jobs:
check-codestyle-python:
needs: [ check-permissions, build-build-tools-image ]
runs-on: [ self-hosted, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Cache poetry deps
uses: actions/cache@v4
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}
- name: Install Python deps
run: ./scripts/pysync
- name: Run `ruff check` to ensure code format
run: poetry run ruff check .
- name: Run `ruff format` to ensure code format
run: poetry run ruff format --check .
- name: Run mypy to check types
run: poetry run mypy .
uses: ./.github/workflows/_check-codestyle-python.yml
with:
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
secrets: inherit
check-codestyle-jsonnet:
needs: [ check-permissions, build-build-tools-image ]
@@ -141,6 +116,7 @@ jobs:
# Check that the vendor/postgres-* submodules point to the
# corresponding REL_*_STABLE_neon branches.
check-submodules:
needs: [ check-permissions ]
runs-on: ubuntu-22.04
steps:
- name: Checkout
@@ -552,7 +528,7 @@ jobs:
with:
submodules: true
- uses: ./.github/actions/set-docker-config-dir
- uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
- uses: docker/setup-buildx-action@v3
with:
cache-binary: false
@@ -643,7 +619,7 @@ jobs:
with:
submodules: true
- uses: ./.github/actions/set-docker-config-dir
- uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
- uses: docker/setup-buildx-action@v3
with:
cache-binary: false
@@ -824,7 +800,7 @@ jobs:
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
chmod +x vm-builder
- uses: ./.github/actions/set-docker-config-dir
- uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
@@ -860,7 +836,7 @@ jobs:
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/set-docker-config-dir
- uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}

.github/workflows/pre-merge-checks.yml

@@ -0,0 +1,94 @@
name: Pre-merge checks
on:
merge_group:
branches:
- main
defaults:
run:
shell: bash -euxo pipefail {0}
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}
jobs:
get-changed-files:
runs-on: ubuntu-22.04
outputs:
python-changed: ${{ steps.python-src.outputs.any_changed }}
steps:
- uses: actions/checkout@v4
- uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4
id: python-src
with:
files: |
.github/workflows/pre-merge-checks.yml
**/**.py
poetry.lock
pyproject.toml
- name: PRINT ALL CHANGED FILES FOR DEBUG PURPOSES
env:
PYTHON_CHANGED_FILES: ${{ steps.python-src.outputs.all_changed_files }}
run: |
echo "${PYTHON_CHANGED_FILES}"
check-build-tools-image:
if: needs.get-changed-files.outputs.python-changed == 'true'
needs: [ get-changed-files ]
uses: ./.github/workflows/check-build-tools-image.yml
build-build-tools-image:
needs: [ check-build-tools-image ]
uses: ./.github/workflows/build-build-tools-image.yml
with:
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
secrets: inherit
check-codestyle-python:
if: needs.get-changed-files.outputs.python-changed == 'true'
needs: [ get-changed-files, build-build-tools-image ]
uses: ./.github/workflows/_check-codestyle-python.yml
with:
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
secrets: inherit
# To get items from the merge queue merged into main we need to satisfy "Status checks that are required".
# Currently we require 2 jobs (checks with exact name):
# - conclusion
# - neon-cloud-e2e
conclusion:
if: always()
permissions:
statuses: write # for `github.repos.createCommitStatus(...)`
needs:
- get-changed-files
- check-codestyle-python
runs-on: ubuntu-22.04
steps:
- name: Create fake `neon-cloud-e2e` check
uses: actions/github-script@v7
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
script: |
const { repo, owner } = context.repo;
const targetUrl = `${context.serverUrl}/${owner}/${repo}/actions/runs/${context.runId}`;
await github.rest.repos.createCommitStatus({
owner: owner,
repo: repo,
sha: context.sha,
context: `neon-cloud-e2e`,
state: `success`,
target_url: targetUrl,
description: `fake check for merge queue`,
});
- name: Fail the job if any of the dependencies do not succeed or are skipped
run: exit 1
if: |
(contains(needs.check-codestyle-python.result, 'skipped') && needs.get-changed-files.outputs.python-changed == 'true')
|| contains(needs.*.result, 'failure')
|| contains(needs.*.result, 'cancelled')


@@ -23,6 +23,7 @@ on:
- Test Postgres client libraries
- Trigger E2E Tests
- cleanup caches by a branch
- Pre-merge checks
types: [completed]
jobs:

Cargo.lock

@@ -4365,6 +4365,7 @@ dependencies = [
"walkdir",
"workspace_hack",
"x509-parser",
"zerocopy",
]
[[package]]
@@ -4742,6 +4743,7 @@ dependencies = [
"percent-encoding",
"pin-project-lite",
"rustls 0.22.4",
"rustls-native-certs 0.7.0",
"rustls-pemfile 2.1.1",
"rustls-pki-types",
"serde",
@@ -5145,6 +5147,7 @@ dependencies = [
"chrono",
"clap",
"crc32c",
"criterion",
"desim",
"fail",
"futures",
@@ -5152,6 +5155,7 @@ dependencies = [
"http 1.1.0",
"humantime",
"hyper 0.14.30",
"itertools 0.10.5",
"metrics",
"once_cell",
"parking_lot 0.12.1",
@@ -7374,6 +7378,7 @@ dependencies = [
"tracing",
"tracing-core",
"url",
"zerocopy",
"zeroize",
"zstd",
"zstd-safe",
@@ -7446,6 +7451,7 @@ version = "0.7.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1c4061bedbb353041c12f413700357bec76df2c7e2ca8e4df8bac24c6bf68e3d"
dependencies = [
"byteorder",
"zerocopy-derive",
]


@@ -196,6 +196,7 @@ walkdir = "2.3.2"
rustls-native-certs = "0.8"
x509-parser = "0.16"
whoami = "1.5.1"
zerocopy = { version = "0.7", features = ["derive"] }
## TODO replace this with tracing
env_logger = "0.10"
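For orientation, a rough illustration of what the newly added zerocopy dependency (with its derive feature) enables: byte-level views of fixed-layout structs without copying. The type and field names below are invented for the example and are not taken from this PR.
use zerocopy::{AsBytes, FromBytes, FromZeroes};
// Hypothetical fixed-layout header, only to illustrate the derives.
#[derive(AsBytes, FromBytes, FromZeroes, Debug, PartialEq)]
#[repr(C)]
struct RecordHeader {
    len: u32,
    crc: u32,
}
fn main() {
    let header = RecordHeader { len: 8, crc: 0xdead_beef };
    // View the struct as its raw bytes without copying.
    let bytes = header.as_bytes();
    // Parse it back; `read_from` returns None if the slice has the wrong length.
    let parsed = RecordHeader::read_from(bytes).expect("size matches");
    assert_eq!(parsed, header);
}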


@@ -1,12 +1,66 @@
ARG DEBIAN_VERSION=bullseye
FROM debian:${DEBIAN_VERSION}-slim
FROM debian:bookworm-slim AS pgcopydb_builder
ARG DEBIAN_VERSION
RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
set -e && \
apt update && \
apt install -y --no-install-recommends \
ca-certificates wget gpg && \
wget -qO - https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor -o /usr/share/keyrings/postgresql-keyring.gpg && \
echo "deb [signed-by=/usr/share/keyrings/postgresql-keyring.gpg] http://apt.postgresql.org/pub/repos/apt bookworm-pgdg main" > /etc/apt/sources.list.d/pgdg.list && \
apt-get update && \
apt install -y --no-install-recommends \
build-essential \
autotools-dev \
libedit-dev \
libgc-dev \
libpam0g-dev \
libreadline-dev \
libselinux1-dev \
libxslt1-dev \
libssl-dev \
libkrb5-dev \
zlib1g-dev \
liblz4-dev \
libpq5 \
libpq-dev \
libzstd-dev \
postgresql-16 \
postgresql-server-dev-16 \
postgresql-common \
python3-sphinx && \
wget -O /tmp/pgcopydb.tar.gz https://github.com/dimitri/pgcopydb/archive/refs/tags/v0.17.tar.gz && \
mkdir /tmp/pgcopydb && \
tar -xzf /tmp/pgcopydb.tar.gz -C /tmp/pgcopydb --strip-components=1 && \
cd /tmp/pgcopydb && \
make -s clean && \
make -s -j12 install && \
libpq_path=$(find /lib /usr/lib -name "libpq.so.5" | head -n 1) && \
mkdir -p /pgcopydb/lib && \
cp "$libpq_path" /pgcopydb/lib/; \
else \
# copy command below will fail if we don't have dummy files, so we create them for other debian versions
mkdir -p /usr/lib/postgresql/16/bin && touch /usr/lib/postgresql/16/bin/pgcopydb && \
mkdir -p /pgcopydb/lib && touch /pgcopydb/lib/libpq.so.5; \
fi
FROM debian:${DEBIAN_VERSION}-slim AS build_tools
ARG DEBIAN_VERSION
# Add nonroot user
RUN useradd -ms /bin/bash nonroot -b /home
SHELL ["/bin/bash", "-c"]
RUN mkdir -p /pgcopydb/bin && \
mkdir -p /pgcopydb/lib && \
chmod -R 755 /pgcopydb && \
chown -R nonroot:nonroot /pgcopydb
COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb
COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5
# System deps
#
# 'gdb' is included so that we get backtraces of core dumps produced in
@@ -38,7 +92,7 @@ RUN set -e \
libseccomp-dev \
libsqlite3-dev \
libssl-dev \
$([[ "${DEBIAN_VERSION}" = "bullseye" ]] && libstdc++-10-dev || libstdc++-11-dev) \
$([[ "${DEBIAN_VERSION}" = "bullseye" ]] && echo libstdc++-10-dev || echo libstdc++-11-dev) \
libtool \
libxml2-dev \
libxmlsec1-dev \
@@ -235,7 +289,13 @@ RUN whoami \
&& cargo --version --verbose \
&& rustup --version --verbose \
&& rustc --version --verbose \
&& clang --version
&& clang --version
RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
LD_LIBRARY_PATH=/pgcopydb/lib /pgcopydb/bin/pgcopydb --version; \
else \
echo "pgcopydb is not available for ${DEBIAN_VERSION}"; \
fi
# Set following flag to check in Makefile if its running in Docker
RUN touch /home/nonroot/.docker_build


@@ -559,8 +559,8 @@ RUN case "${PG_VERSION}" in \
export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \
;; \
"v17") \
export TIMESCALEDB_VERSION=2.17.0 \
export TIMESCALEDB_CHECKSUM=155bf64391d3558c42f31ca0e523cfc6252921974f75298c9039ccad1c89811a \
export TIMESCALEDB_VERSION=2.17.1 \
export TIMESCALEDB_CHECKSUM=6277cf43f5695e23dae1c5cfeba00474d730b66ed53665a84b787a6bb1a57e28 \
;; \
esac && \
wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
@@ -1151,8 +1151,8 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# The topmost commit in the `neon` branch at the time of writing this
# https://github.com/Mooncake-Labs/pg_mooncake/commits/neon/
# https://github.com/Mooncake-Labs/pg_mooncake/commit/568b5a82b5fc16136bdf4ca5aac3e0cc261ab48d
ENV PG_MOONCAKE_VERSION=568b5a82b5fc16136bdf4ca5aac3e0cc261ab48d
# https://github.com/Mooncake-Labs/pg_mooncake/commit/077c92c452bb6896a7b7776ee95f039984f076af
ENV PG_MOONCAKE_VERSION=077c92c452bb6896a7b7776ee95f039984f076af
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in \


@@ -3,7 +3,7 @@
metrics: [
import 'sql_exporter/checkpoints_req.libsonnet',
import 'sql_exporter/checkpoints_timed.libsonnet',
import 'sql_exporter/compute_backpressure_throttling_ms.libsonnet',
import 'sql_exporter/compute_backpressure_throttling_seconds.libsonnet',
import 'sql_exporter/compute_current_lsn.libsonnet',
import 'sql_exporter/compute_logical_snapshot_files.libsonnet',
import 'sql_exporter/compute_receive_lsn.libsonnet',


@@ -1 +0,0 @@
SELECT neon.backpressure_throttling_time() AS throttled;


@@ -1,10 +1,10 @@
{
metric_name: 'compute_backpressure_throttling_ms',
metric_name: 'compute_backpressure_throttling_seconds',
type: 'gauge',
help: 'Time compute has spent throttled',
key_labels: null,
values: [
'throttled',
],
query: importstr 'sql_exporter/compute_backpressure_throttling_ms.sql',
query: importstr 'sql_exporter/compute_backpressure_throttling_seconds.sql',
}


@@ -0,0 +1 @@
SELECT neon.backpressure_throttling_time()::float8 / 1000 AS throttled;


@@ -364,11 +364,29 @@ impl ComputeNode {
let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;
let basebackup_cmd = match lsn {
Lsn(0) => format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id),
_ => format!(
"basebackup {} {} {} --gzip",
spec.tenant_id, spec.timeline_id, lsn
),
Lsn(0) => {
if spec.spec.mode != ComputeMode::Primary {
format!(
"basebackup {} {} --gzip --replica",
spec.tenant_id, spec.timeline_id
)
} else {
format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id)
}
}
_ => {
if spec.spec.mode != ComputeMode::Primary {
format!(
"basebackup {} {} {} --gzip --replica",
spec.tenant_id, spec.timeline_id, lsn
)
} else {
format!(
"basebackup {} {} {} --gzip",
spec.tenant_id, spec.timeline_id, lsn
)
}
}
};
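// For illustration only (IDs made up), the four command shapes produced by the
// match above are:
//   basebackup <tenant_id> <timeline_id> --gzip
//   basebackup <tenant_id> <timeline_id> --gzip --replica
//   basebackup <tenant_id> <timeline_id> <lsn> --gzip
//   basebackup <tenant_id> <timeline_id> <lsn> --gzip --replica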
let copyreader = client.copy_out(basebackup_cmd.as_str())?;


@@ -73,6 +73,12 @@ pub fn write_postgres_conf(
)?;
}
// Locales
writeln!(file, "lc_messages='C.UTF-8'")?;
writeln!(file, "lc_monetary='C.UTF-8'")?;
writeln!(file, "lc_time='C.UTF-8'")?;
writeln!(file, "lc_numeric='C.UTF-8'")?;
match spec.mode {
ComputeMode::Primary => {}
ComputeMode::Static(lsn) => {


@@ -944,6 +944,9 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
other: Default::default(),
// Typical developer machines use disks with slow fsync, and we don't care
// about data integrity: disable disk syncs.
no_sync: true,
}
})
.collect(),


@@ -225,6 +225,7 @@ pub struct PageServerConf {
pub listen_http_addr: String,
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
pub no_sync: bool,
}
impl Default for PageServerConf {
@@ -235,6 +236,7 @@ impl Default for PageServerConf {
listen_http_addr: String::new(),
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
no_sync: false,
}
}
}
@@ -249,6 +251,8 @@ pub struct NeonLocalInitPageserverConf {
pub listen_http_addr: String,
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
pub no_sync: bool,
#[serde(flatten)]
pub other: HashMap<String, toml::Value>,
}
@@ -261,6 +265,7 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
listen_http_addr,
pg_auth_type,
http_auth_type,
no_sync,
other: _,
} = conf;
Self {
@@ -269,6 +274,7 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
listen_http_addr: listen_http_addr.clone(),
pg_auth_type: *pg_auth_type,
http_auth_type: *http_auth_type,
no_sync: *no_sync,
}
}
}
@@ -569,6 +575,8 @@ impl LocalEnv {
listen_http_addr: String,
pg_auth_type: AuthType,
http_auth_type: AuthType,
#[serde(default)]
no_sync: bool,
}
let config_toml_path = dentry.path().join("pageserver.toml");
let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str(
@@ -591,6 +599,7 @@ impl LocalEnv {
listen_http_addr,
pg_auth_type,
http_auth_type,
no_sync,
} = config_toml;
let IdentityTomlSubset {
id: identity_toml_id,
@@ -607,6 +616,7 @@ impl LocalEnv {
listen_http_addr,
pg_auth_type,
http_auth_type,
no_sync,
};
pageservers.push(conf);
}


@@ -273,6 +273,7 @@ impl PageServerNode {
)
})?;
let args = vec!["-D", datadir_path_str];
background_process::start_process(
"pageserver",
&datadir,
@@ -334,17 +335,20 @@ impl PageServerNode {
checkpoint_distance: settings
.remove("checkpoint_distance")
.map(|x| x.parse::<u64>())
.transpose()?,
.transpose()
.context("Failed to parse 'checkpoint_distance' as an integer")?,
checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()),
compaction_target_size: settings
.remove("compaction_target_size")
.map(|x| x.parse::<u64>())
.transpose()?,
.transpose()
.context("Failed to parse 'compaction_target_size' as an integer")?,
compaction_period: settings.remove("compaction_period").map(|x| x.to_string()),
compaction_threshold: settings
.remove("compaction_threshold")
.map(|x| x.parse::<usize>())
.transpose()?,
.transpose()
.context("Failed to parse 'compaction_threshold' as an integer")?,
compaction_algorithm: settings
.remove("compaction_algorithm")
.map(serde_json::from_str)
@@ -353,16 +357,19 @@ impl PageServerNode {
gc_horizon: settings
.remove("gc_horizon")
.map(|x| x.parse::<u64>())
.transpose()?,
.transpose()
.context("Failed to parse 'gc_horizon' as an integer")?,
gc_period: settings.remove("gc_period").map(|x| x.to_string()),
image_creation_threshold: settings
.remove("image_creation_threshold")
.map(|x| x.parse::<usize>())
.transpose()?,
.transpose()
.context("Failed to parse 'image_creation_threshold' as non zero integer")?,
image_layer_creation_check_threshold: settings
.remove("image_layer_creation_check_threshold")
.map(|x| x.parse::<u8>())
.transpose()?,
.transpose()
.context("Failed to parse 'image_creation_check_threshold' as integer")?,
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
walreceiver_connect_timeout: settings
.remove("walreceiver_connect_timeout")
@@ -403,6 +410,11 @@ impl PageServerNode {
lsn_lease_length_for_ts: settings
.remove("lsn_lease_length_for_ts")
.map(|x| x.to_string()),
timeline_offloading: settings
.remove("timeline_offloading")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'timeline_offloading' as bool")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
@@ -414,97 +426,9 @@ impl PageServerNode {
pub async fn tenant_config(
&self,
tenant_id: TenantId,
mut settings: HashMap<&str, &str>,
settings: HashMap<&str, &str>,
) -> anyhow::Result<()> {
let config = {
// Braces to make the diff easier to read
models::TenantConfig {
checkpoint_distance: settings
.remove("checkpoint_distance")
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'checkpoint_distance' as an integer")?,
checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()),
compaction_target_size: settings
.remove("compaction_target_size")
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'compaction_target_size' as an integer")?,
compaction_period: settings.remove("compaction_period").map(|x| x.to_string()),
compaction_threshold: settings
.remove("compaction_threshold")
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'compaction_threshold' as an integer")?,
compaction_algorithm: settings
.remove("compactin_algorithm")
.map(serde_json::from_str)
.transpose()
.context("Failed to parse 'compaction_algorithm' json")?,
gc_horizon: settings
.remove("gc_horizon")
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'gc_horizon' as an integer")?,
gc_period: settings.remove("gc_period").map(|x| x.to_string()),
image_creation_threshold: settings
.remove("image_creation_threshold")
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'image_creation_threshold' as non zero integer")?,
image_layer_creation_check_threshold: settings
.remove("image_layer_creation_check_threshold")
.map(|x| x.parse::<u8>())
.transpose()
.context("Failed to parse 'image_creation_check_threshold' as integer")?,
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
walreceiver_connect_timeout: settings
.remove("walreceiver_connect_timeout")
.map(|x| x.to_string()),
lagging_wal_timeout: settings
.remove("lagging_wal_timeout")
.map(|x| x.to_string()),
max_lsn_wal_lag: settings
.remove("max_lsn_wal_lag")
.map(|x| x.parse::<NonZeroU64>())
.transpose()
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
eviction_policy: settings
.remove("eviction_policy")
.map(serde_json::from_str)
.transpose()
.context("Failed to parse 'eviction_policy' json")?,
min_resident_size_override: settings
.remove("min_resident_size_override")
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'min_resident_size_override' as an integer")?,
evictions_low_residence_duration_metric_threshold: settings
.remove("evictions_low_residence_duration_metric_threshold")
.map(|x| x.to_string()),
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
lazy_slru_download: settings
.remove("lazy_slru_download")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'lazy_slru_download' as bool")?,
timeline_get_throttle: settings
.remove("timeline_get_throttle")
.map(serde_json::from_str)
.transpose()
.context("parse `timeline_get_throttle` from json")?,
lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()),
lsn_lease_length_for_ts: settings
.remove("lsn_lease_length_for_ts")
.map(|x| x.to_string()),
}
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
}
let config = Self::parse_config(settings)?;
self.http_client
.tenant_config(&models::TenantConfigRequest { tenant_id, config })
.await?;


@@ -110,6 +110,23 @@ static MAXRSS_KB: Lazy<IntGauge> = Lazy::new(|| {
pub const DISK_FSYNC_SECONDS_BUCKETS: &[f64] =
&[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 30.0];
/// Constructs histogram buckets that are powers of two starting at 1 (i.e. 2^0), covering the end
/// points. For example, passing start=5,end=20 yields 4,8,16,32 as does start=4,end=32.
pub fn pow2_buckets(start: usize, end: usize) -> Vec<f64> {
assert_ne!(start, 0);
assert!(start <= end);
let start = match start.checked_next_power_of_two() {
Some(n) if n == start => n, // start already power of two
Some(n) => n >> 1, // power of two below start
None => panic!("start too large"),
};
let end = end.checked_next_power_of_two().expect("end too large");
std::iter::successors(Some(start), |n| n.checked_mul(2))
.take_while(|n| n <= &end)
.map(|n| n as f64)
.collect()
}
pub struct BuildInfo {
pub revision: &'static str,
pub build_tag: &'static str,
@@ -595,3 +612,67 @@ where
self.dec.collect_into(metadata, labels, name, &mut enc.0)
}
}
#[cfg(test)]
mod tests {
use super::*;
const POW2_BUCKETS_MAX: usize = 1 << (usize::BITS - 1);
#[test]
fn pow2_buckets_cases() {
assert_eq!(pow2_buckets(1, 1), vec![1.0]);
assert_eq!(pow2_buckets(1, 2), vec![1.0, 2.0]);
assert_eq!(pow2_buckets(1, 3), vec![1.0, 2.0, 4.0]);
assert_eq!(pow2_buckets(1, 4), vec![1.0, 2.0, 4.0]);
assert_eq!(pow2_buckets(1, 5), vec![1.0, 2.0, 4.0, 8.0]);
assert_eq!(pow2_buckets(1, 6), vec![1.0, 2.0, 4.0, 8.0]);
assert_eq!(pow2_buckets(1, 7), vec![1.0, 2.0, 4.0, 8.0]);
assert_eq!(pow2_buckets(1, 8), vec![1.0, 2.0, 4.0, 8.0]);
assert_eq!(
pow2_buckets(1, 200),
vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0]
);
assert_eq!(pow2_buckets(1, 8), vec![1.0, 2.0, 4.0, 8.0]);
assert_eq!(pow2_buckets(2, 8), vec![2.0, 4.0, 8.0]);
assert_eq!(pow2_buckets(3, 8), vec![2.0, 4.0, 8.0]);
assert_eq!(pow2_buckets(4, 8), vec![4.0, 8.0]);
assert_eq!(pow2_buckets(5, 8), vec![4.0, 8.0]);
assert_eq!(pow2_buckets(6, 8), vec![4.0, 8.0]);
assert_eq!(pow2_buckets(7, 8), vec![4.0, 8.0]);
assert_eq!(pow2_buckets(8, 8), vec![8.0]);
assert_eq!(pow2_buckets(20, 200), vec![16.0, 32.0, 64.0, 128.0, 256.0]);
// Largest valid values.
assert_eq!(
pow2_buckets(1, POW2_BUCKETS_MAX).len(),
usize::BITS as usize
);
assert_eq!(pow2_buckets(POW2_BUCKETS_MAX, POW2_BUCKETS_MAX).len(), 1);
}
#[test]
#[should_panic]
fn pow2_buckets_zero_start() {
pow2_buckets(0, 1);
}
#[test]
#[should_panic]
fn pow2_buckets_end_lt_start() {
pow2_buckets(2, 1);
}
#[test]
#[should_panic]
fn pow2_buckets_end_overflow_min() {
pow2_buckets(1, POW2_BUCKETS_MAX + 1);
}
#[test]
#[should_panic]
fn pow2_buckets_end_overflow_max() {
pow2_buckets(1, usize::MAX);
}
}


@@ -64,6 +64,7 @@ pub struct ConfigToml {
#[serde(with = "humantime_serde")]
pub wal_redo_timeout: Duration,
pub superuser: String,
pub locale: String,
pub page_cache_size: usize,
pub max_file_descriptors: usize,
pub pg_distrib_dir: Option<Utf8PathBuf>,
@@ -106,6 +107,8 @@ pub struct ConfigToml {
pub ephemeral_bytes_per_memory_kb: usize,
pub l0_flush: Option<crate::models::L0FlushConfig>,
pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
#[serde(skip_serializing_if = "Option::is_none")]
pub no_sync: Option<bool>,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -259,6 +262,10 @@ pub struct TenantConfigToml {
/// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
#[serde(with = "humantime_serde")]
pub lsn_lease_length_for_ts: Duration,
/// Enable auto-offloading of timelines.
/// (either this flag or the pageserver-global one needs to be set)
pub timeline_offloading: bool,
}
pub mod defaults {
@@ -270,6 +277,7 @@ pub mod defaults {
pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
pub const DEFAULT_LOCALE: &str = "C.UTF-8";
pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
@@ -320,6 +328,7 @@ impl Default for ConfigToml {
wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
.expect("cannot parse default wal redo timeout")),
superuser: (DEFAULT_SUPERUSER.to_string()),
locale: DEFAULT_LOCALE.to_string(),
page_cache_size: (DEFAULT_PAGE_CACHE_SIZE),
max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS),
pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formerly, this was std::env::current_dir()
@@ -385,6 +394,7 @@ impl Default for ConfigToml {
l0_flush: None,
virtual_file_io_mode: None,
tenant_config: TenantConfigToml::default(),
no_sync: None,
}
}
}
@@ -471,6 +481,7 @@ impl Default for TenantConfigToml {
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
lsn_lease_length: LsnLease::DEFAULT_LENGTH,
lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
timeline_offloading: false,
}
}
}


@@ -310,6 +310,7 @@ pub struct TenantConfig {
pub image_layer_creation_check_threshold: Option<u8>,
pub lsn_lease_length: Option<String>,
pub lsn_lease_length_for_ts: Option<String>,
pub timeline_offloading: Option<bool>,
}
/// The policy for the aux file storage.


@@ -1,10 +1,10 @@
use std::ffi::CStr;
use std::ffi::{CStr, CString};
use bytes::{Bytes, BytesMut};
use crc32c::crc32c_append;
use utils::lsn::Lsn;
use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC};
use super::bindings::{RmgrId, XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC};
use super::xlog_utils::{
XlLogicalMessage, XLOG_RECORD_CRC_OFFS, XLOG_SIZE_OF_XLOG_RECORD, XLP_BKP_REMOVABLE,
XLP_FIRST_IS_CONTRECORD,
@@ -16,11 +16,65 @@ use crate::pg_constants::{
};
use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
/// Generates binary WAL records for use in tests and benchmarks. Currently only generates logical
/// messages (effectively noops) with a fixed payload. It is used as an iterator which yields
/// encoded bytes for a single WAL record, including internal page headers if it spans pages.
/// Concatenating the bytes will yield a complete, well-formed WAL, which can be chunked at segment
/// boundaries if desired. Not optimized for performance.
/// A WAL record payload. Will be prefixed by an XLogRecord header when encoded.
pub struct Record {
pub rmid: RmgrId,
pub info: u8,
pub data: Bytes,
}
impl Record {
/// Encodes the WAL record including an XLogRecord header. prev_lsn is the start position of
/// the previous record in the WAL -- this is ignored by the Safekeeper, but not Postgres.
pub fn encode(&self, prev_lsn: Lsn) -> Bytes {
// Prefix data with block ID and length.
let data_header = Bytes::from(match self.data.len() {
0 => vec![],
1..=255 => vec![XLR_BLOCK_ID_DATA_SHORT, self.data.len() as u8],
256.. => {
let len_bytes = (self.data.len() as u32).to_le_bytes();
[&[XLR_BLOCK_ID_DATA_LONG], len_bytes.as_slice()].concat()
}
});
// Construct the WAL record header.
let mut header = XLogRecord {
xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD + data_header.len() + self.data.len()) as u32,
xl_xid: 0,
xl_prev: prev_lsn.into(),
xl_info: self.info,
xl_rmid: self.rmid,
__bindgen_padding_0: [0; 2],
xl_crc: 0, // see below
};
// Compute the CRC checksum for the data, and the header up to the CRC field.
let mut crc = 0;
crc = crc32c_append(crc, &data_header);
crc = crc32c_append(crc, &self.data);
crc = crc32c_append(crc, &header.encode().unwrap()[0..XLOG_RECORD_CRC_OFFS]);
header.xl_crc = crc;
// Encode the final header and record.
let header = header.encode().unwrap();
[header, data_header, self.data.clone()].concat().into()
}
}
/// Generates WAL record payloads.
///
/// TODO: currently only provides LogicalMessageGenerator for trivial noop messages. Add a generator
/// that creates a table and inserts rows.
pub trait RecordGenerator: Iterator<Item = Record> {}
impl<I: Iterator<Item = Record>> RecordGenerator for I {}
/// Generates binary WAL for use in tests and benchmarks. The provided record generator constructs
/// the WAL records. It is used as an iterator which yields encoded bytes for a single WAL record,
/// including internal page headers if it spans pages. Concatenating the bytes will yield a
/// complete, well-formed WAL, which can be chunked at segment boundaries if desired. Not optimized
/// for performance.
///
/// The WAL format is version-dependent (see e.g. `XLOG_PAGE_MAGIC`), so make sure to import this
/// for the appropriate Postgres version (e.g. `postgres_ffi::v17::wal_generator::WalGenerator`).
@@ -31,10 +85,10 @@ use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
/// | Segment 1 | Segment 2 | Segment 3 |
/// | Page 1 | Page 2 | Page 3 | Page 4 | Page 5 | Page 6 | Page 7 | Page 8 | Page 9 |
/// | R1 | R2 |R3| R4 | R5 | R6 | R7 | R8 |
///
/// TODO: support generating actual tables and rows.
#[derive(Default)]
pub struct WalGenerator {
pub struct WalGenerator<R: RecordGenerator> {
/// Generates record payloads for the WAL.
pub record_generator: R,
/// Current LSN to append the next record at.
///
/// Callers can modify this (and prev_lsn) to restart generation at a different LSN, but should
@@ -46,73 +100,35 @@ pub struct WalGenerator {
pub prev_lsn: Lsn,
}
impl WalGenerator {
// For now, hardcode the message payload.
// TODO: support specifying the payload size.
const PREFIX: &CStr = c"prefix";
const MESSAGE: &[u8] = b"message";
// Hardcode the sys, timeline, and DB IDs. We can make them configurable if we care about them.
impl<R: RecordGenerator> WalGenerator<R> {
// Hardcode the sys and timeline ID. We can make them configurable if we care about them.
const SYS_ID: u64 = 0;
const TIMELINE_ID: u32 = 1;
const DB_ID: u32 = 0;
/// Creates a new WAL generator, which emits logical message records (noops).
pub fn new() -> Self {
Self::default()
/// Creates a new WAL generator with the given record generator.
pub fn new(record_generator: R) -> WalGenerator<R> {
Self {
record_generator,
lsn: Lsn(0),
prev_lsn: Lsn(0),
}
}
/// Encodes a logical message (basically a noop), with the given prefix and message.
pub(crate) fn encode_logical_message(prefix: &CStr, message: &[u8]) -> Bytes {
let prefix = prefix.to_bytes_with_nul();
let header = XlLogicalMessage {
db_id: Self::DB_ID,
transactional: 0,
prefix_size: prefix.len() as u64,
message_size: message.len() as u64,
};
[&header.encode(), prefix, message].concat().into()
/// Appends a record with an arbitrary payload at the current LSN, then increments the LSN.
/// Returns the WAL bytes for the record, including page headers and padding, and the start LSN.
fn append_record(&mut self, record: Record) -> (Lsn, Bytes) {
let record = record.encode(self.prev_lsn);
let record = Self::insert_pages(record, self.lsn);
let record = Self::pad_record(record, self.lsn);
let lsn = self.lsn;
self.prev_lsn = self.lsn;
self.lsn += record.len() as u64;
(lsn, record)
}
/// Encode a WAL record with the given payload data (e.g. a logical message).
pub(crate) fn encode_record(data: Bytes, rmid: u8, info: u8, prev_lsn: Lsn) -> Bytes {
// Prefix data with block ID and length.
let data_header = Bytes::from(match data.len() {
0 => vec![],
1..=255 => vec![XLR_BLOCK_ID_DATA_SHORT, data.len() as u8],
256.. => {
let len_bytes = (data.len() as u32).to_le_bytes();
[&[XLR_BLOCK_ID_DATA_LONG], len_bytes.as_slice()].concat()
}
});
// Construct the WAL record header.
let mut header = XLogRecord {
xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD + data_header.len() + data.len()) as u32,
xl_xid: 0,
xl_prev: prev_lsn.into(),
xl_info: info,
xl_rmid: rmid,
__bindgen_padding_0: [0; 2],
xl_crc: 0, // see below
};
// Compute the CRC checksum for the data, and the header up to the CRC field.
let mut crc = 0;
crc = crc32c_append(crc, &data_header);
crc = crc32c_append(crc, &data);
crc = crc32c_append(crc, &header.encode().unwrap()[0..XLOG_RECORD_CRC_OFFS]);
header.xl_crc = crc;
// Encode the final header and record.
let header = header.encode().unwrap();
[header, data_header, data].concat().into()
}
/// Injects page headers on 8KB page boundaries. Takes the current LSN position where the record
/// Inserts page headers on 8KB page boundaries. Takes the current LSN position where the record
/// is to be appended.
fn encode_pages(record: Bytes, mut lsn: Lsn) -> Bytes {
fn insert_pages(record: Bytes, mut lsn: Lsn) -> Bytes {
// Fast path: record fits in current page, and the page already has a header.
if lsn.remaining_in_block() as usize >= record.len() && lsn.block_offset() > 0 {
return record;
@@ -173,31 +189,71 @@ impl WalGenerator {
}
[record, Bytes::from(vec![0; padding])].concat().into()
}
/// Generates a record with an arbitrary payload at the current LSN, then increments the LSN.
pub fn generate_record(&mut self, data: Bytes, rmid: u8, info: u8) -> Bytes {
let record = Self::encode_record(data, rmid, info, self.prev_lsn);
let record = Self::encode_pages(record, self.lsn);
let record = Self::pad_record(record, self.lsn);
self.prev_lsn = self.lsn;
self.lsn += record.len() as u64;
record
}
/// Generates a logical message at the current LSN. Can be used to construct arbitrary messages.
pub fn generate_logical_message(&mut self, prefix: &CStr, message: &[u8]) -> Bytes {
let data = Self::encode_logical_message(prefix, message);
self.generate_record(data, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE)
}
}
/// Generate WAL records as an iterator.
impl Iterator for WalGenerator {
/// Generates WAL records as an iterator.
impl<R: RecordGenerator> Iterator for WalGenerator<R> {
type Item = (Lsn, Bytes);
fn next(&mut self) -> Option<Self::Item> {
let lsn = self.lsn;
let record = self.generate_logical_message(Self::PREFIX, Self::MESSAGE);
Some((lsn, record))
let record = self.record_generator.next()?;
Some(self.append_record(record))
}
}
/// Generates logical message records (effectively noops) with a fixed message.
pub struct LogicalMessageGenerator {
prefix: CString,
message: Vec<u8>,
}
impl LogicalMessageGenerator {
const DB_ID: u32 = 0; // hardcoded for now
const RM_ID: RmgrId = RM_LOGICALMSG_ID;
const INFO: u8 = XLOG_LOGICAL_MESSAGE;
/// Creates a new LogicalMessageGenerator.
pub fn new(prefix: &CStr, message: &[u8]) -> Self {
Self {
prefix: prefix.to_owned(),
message: message.to_owned(),
}
}
/// Encodes a logical message.
fn encode(prefix: &CStr, message: &[u8]) -> Bytes {
let prefix = prefix.to_bytes_with_nul();
let header = XlLogicalMessage {
db_id: Self::DB_ID,
transactional: 0,
prefix_size: prefix.len() as u64,
message_size: message.len() as u64,
};
[&header.encode(), prefix, message].concat().into()
}
}
impl Iterator for LogicalMessageGenerator {
type Item = Record;
fn next(&mut self) -> Option<Self::Item> {
Some(Record {
rmid: Self::RM_ID,
info: Self::INFO,
data: Self::encode(&self.prefix, &self.message),
})
}
}
impl WalGenerator<LogicalMessageGenerator> {
/// Convenience method for appending a WAL record with an arbitrary logical message at the
/// current WAL LSN position. Returns the start LSN and resulting WAL bytes.
pub fn append_logical_message(&mut self, prefix: &CStr, message: &[u8]) -> (Lsn, Bytes) {
let record = Record {
rmid: LogicalMessageGenerator::RM_ID,
info: LogicalMessageGenerator::INFO,
data: LogicalMessageGenerator::encode(prefix, message),
};
self.append_record(record)
}
}
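For orientation, a hedged usage sketch of the refactored generator API. The import path follows the doc comment above; everything else (payloads, record count) is illustrative.
use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator};
fn main() {
    // Emit identical logical-message (noop) records, as tests and benchmarks do.
    let mut wal = WalGenerator::new(LogicalMessageGenerator::new(c"prefix", b"message"));
    // The generator is an iterator of (start LSN, encoded WAL bytes).
    for (lsn, bytes) in wal.by_ref().take(3) {
        println!("record at {lsn}: {} bytes", bytes.len());
    }
    // A one-off logical message can also be appended at the current position.
    let (lsn, bytes) = wal.append_logical_message(c"test", b"hello");
    println!("appended at {lsn}: {} bytes", bytes.len());
}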


@@ -12,9 +12,9 @@ use super::bindings::{
CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz,
XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC,
};
use super::wal_generator::WalGenerator;
use super::wal_generator::LogicalMessageGenerator;
use super::PG_MAJORVERSION;
use crate::pg_constants::{self, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE};
use crate::pg_constants;
use crate::PG_TLI;
use crate::{uint32, uint64, Oid};
use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
@@ -493,12 +493,10 @@ pub fn encode_logical_message(prefix: &str, message: &str) -> Bytes {
// This function can take untrusted input, so discard any NUL bytes in the prefix string.
let prefix = CString::new(prefix.replace('\0', "")).expect("no NULs");
let message = message.as_bytes();
WalGenerator::encode_record(
WalGenerator::encode_logical_message(&prefix, message),
RM_LOGICALMSG_ID,
XLOG_LOGICAL_MESSAGE,
Lsn(0),
)
LogicalMessageGenerator::new(&prefix, message)
.next()
.unwrap()
.encode(Lsn(0))
}
#[cfg(test)]


@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -euxo pipefail
@@ -6,9 +6,44 @@ PG_BIN=$1
WAL_PATH=$2
DATA_DIR=$3
PORT=$4
PG_VERSION=$5
SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-)
# The way that initdb is invoked must match how the pageserver runs initdb.
function initdb_with_args {
local cmd=(
"$PG_BIN"/initdb
-E utf8
-U cloud_admin
-D "$DATA_DIR"
--locale 'C.UTF-8'
--lc-collate 'C.UTF-8'
--lc-ctype 'C.UTF-8'
--lc-messages 'C.UTF-8'
--lc-monetary 'C.UTF-8'
--lc-numeric 'C.UTF-8'
--lc-time 'C.UTF-8'
--sysid="$SYSID"
)
case "$PG_VERSION" in
14)
# Postgres 14 and below didn't support --locale-provider
;;
15 | 16)
cmd+=(--locale-provider 'libc')
;;
*)
# Postgres 17 added the builtin provider
cmd+=(--locale-provider 'builtin')
;;
esac
eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "${cmd[*]}"
}
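# For illustration only (not part of the change): with PG_VERSION=17 the function
# above expands to roughly
#   env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" \
#     --locale 'C.UTF-8' --lc-collate 'C.UTF-8' --lc-ctype 'C.UTF-8' --lc-messages 'C.UTF-8' \
#     --lc-monetary 'C.UTF-8' --lc-numeric 'C.UTF-8' --lc-time 'C.UTF-8' \
#     --sysid="$SYSID" --locale-provider 'builtin'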
rm -fr "$DATA_DIR"
env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID"
initdb_with_args
echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf
echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf
REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)


@@ -5,7 +5,7 @@ edition.workspace = true
license.workspace = true
[features]
testing = []
testing = ["pageserver_api/testing"]
[dependencies]
anyhow.workspace = true


@@ -2,15 +2,13 @@
//! raw bytes which represent a raw Postgres WAL record.
use crate::models::*;
use bytes::{Buf, Bytes, BytesMut};
use pageserver_api::key::rel_block_to_key;
use pageserver_api::record::NeonWalRecord;
use crate::serialized_batch::SerializedValueBatch;
use bytes::{Buf, Bytes};
use pageserver_api::reltag::{RelTag, SlruKind};
use pageserver_api::shard::ShardIdentity;
use pageserver_api::value::Value;
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
use postgres_ffi::walrecord::*;
use postgres_ffi::{page_is_new, page_set_lsn, pg_constants, BLCKSZ};
use utils::lsn::Lsn;
impl InterpretedWalRecord {
@@ -21,11 +19,12 @@ impl InterpretedWalRecord {
pub fn from_bytes_filtered(
buf: Bytes,
shard: &ShardIdentity,
lsn: Lsn,
record_end_lsn: Lsn,
pg_version: u32,
) -> anyhow::Result<InterpretedWalRecord> {
let mut decoded = DecodedWALRecord::default();
decode_wal_record(buf, &mut decoded, pg_version)?;
let xid = decoded.xl_xid;
let flush_uncommitted = if decoded.is_dbase_create_copy(pg_version) {
FlushUncommittedRecords::Yes
@@ -33,96 +32,20 @@ impl InterpretedWalRecord {
FlushUncommittedRecords::No
};
let metadata_record = MetadataRecord::from_decoded(&decoded, lsn, pg_version)?;
let mut blocks = Vec::default();
for blk in decoded.blocks.iter() {
let rel = RelTag {
spcnode: blk.rnode_spcnode,
dbnode: blk.rnode_dbnode,
relnode: blk.rnode_relnode,
forknum: blk.forknum,
};
let key = rel_block_to_key(rel, blk.blkno);
if !key.is_valid_key_on_write_path() {
anyhow::bail!("Unsupported key decoded at LSN {}: {}", lsn, key);
}
let key_is_local = shard.is_key_local(&key);
tracing::debug!(
lsn=%lsn,
key=%key,
"ingest: shard decision {}",
if !key_is_local { "drop" } else { "keep" },
);
if !key_is_local {
if shard.is_shard_zero() {
// Shard 0 tracks relation sizes. Although we will not store this block, we will observe
// its blkno in case it implicitly extends a relation.
blocks.push((key.to_compact(), None));
}
continue;
}
// Instead of storing full-page-image WAL record,
// it is better to store extracted image: we can skip wal-redo
// in this case. Also some FPI records may contain multiple (up to 32) pages,
// so them have to be copied multiple times.
//
let value = if blk.apply_image
&& blk.has_image
&& decoded.xl_rmid == pg_constants::RM_XLOG_ID
&& (decoded.xl_info == pg_constants::XLOG_FPI
|| decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
// compression of WAL is not yet supported: fall back to storing the original WAL record
&& !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version)
// do not materialize null pages because them most likely be soon replaced with real data
&& blk.bimg_len != 0
{
// Extract page image from FPI record
let img_len = blk.bimg_len as usize;
let img_offs = blk.bimg_offset as usize;
let mut image = BytesMut::with_capacity(BLCKSZ as usize);
// TODO(vlad): skip the copy
image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]);
if blk.hole_length != 0 {
let tail = image.split_off(blk.hole_offset as usize);
image.resize(image.len() + blk.hole_length as usize, 0u8);
image.unsplit(tail);
}
//
// Match the logic of XLogReadBufferForRedoExtended:
// The page may be uninitialized. If so, we can't set the LSN because
// that would corrupt the page.
//
if !page_is_new(&image) {
page_set_lsn(&mut image, lsn)
}
assert_eq!(image.len(), BLCKSZ as usize);
Value::Image(image.freeze())
} else {
Value::WalRecord(NeonWalRecord::Postgres {
will_init: blk.will_init || blk.apply_image,
rec: decoded.record.clone(),
})
};
blocks.push((key.to_compact(), Some(value)));
}
let metadata_record = MetadataRecord::from_decoded(&decoded, record_end_lsn, pg_version)?;
let batch = SerializedValueBatch::from_decoded_filtered(
decoded,
shard,
record_end_lsn,
pg_version,
)?;
Ok(InterpretedWalRecord {
metadata_record,
blocks,
lsn,
batch,
end_lsn: record_end_lsn,
flush_uncommitted,
xid: decoded.xl_xid,
xid,
})
}
}
@@ -130,7 +53,7 @@ impl InterpretedWalRecord {
impl MetadataRecord {
fn from_decoded(
decoded: &DecodedWALRecord,
lsn: Lsn,
record_end_lsn: Lsn,
pg_version: u32,
) -> anyhow::Result<Option<MetadataRecord>> {
// Note: this doesn't actually copy the bytes since
@@ -151,7 +74,7 @@ impl MetadataRecord {
Ok(None)
}
pg_constants::RM_CLOG_ID => Self::decode_clog_record(&mut buf, decoded, pg_version),
pg_constants::RM_XACT_ID => Self::decode_xact_record(&mut buf, decoded, lsn),
pg_constants::RM_XACT_ID => Self::decode_xact_record(&mut buf, decoded, record_end_lsn),
pg_constants::RM_MULTIXACT_ID => {
Self::decode_multixact_record(&mut buf, decoded, pg_version)
}
@@ -163,7 +86,7 @@ impl MetadataRecord {
//
// Alternatively, one can make the checkpoint part of the subscription protocol
// to the pageserver. This should work fine, but can be done at a later point.
pg_constants::RM_XLOG_ID => Self::decode_xlog_record(&mut buf, decoded, lsn),
pg_constants::RM_XLOG_ID => Self::decode_xlog_record(&mut buf, decoded, record_end_lsn),
pg_constants::RM_LOGICALMSG_ID => {
Self::decode_logical_message_record(&mut buf, decoded)
}
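For orientation, a hedged sketch of the updated call shape after this change; the crate paths and variable names here are assumptions for illustration, not taken from this PR.
use bytes::Bytes;
use pageserver_api::shard::ShardIdentity;
use utils::lsn::Lsn;
use wal_decoder::models::InterpretedWalRecord; // crate path assumed for illustration
fn interpret_one(
    buf: Bytes,
    shard_identity: &ShardIdentity,
    record_end_lsn: Lsn,
    pg_version: u32,
) -> anyhow::Result<()> {
    let interpreted =
        InterpretedWalRecord::from_bytes_filtered(buf, shard_identity, record_end_lsn, pg_version)?;
    // Block values now arrive pre-serialized in a SerializedValueBatch,
    // together with metadata describing each (key, LSN) entry.
    println!(
        "decoded {} values, max LSN {}",
        interpreted.batch.len, interpreted.batch.max_lsn
    );
    Ok(())
}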


@@ -1,2 +1,3 @@
pub mod decoder;
pub mod models;
pub mod serialized_batch;


@@ -2,7 +2,8 @@
//! ready for the pageserver to interpret. They are derived from the original
//! WAL records, so that each struct corresponds closely to one WAL record of
//! a specific kind. They contain the same information as the original WAL records,
//! just decoded into structs and fields for easier access.
//! but the values are already serialized in a [`SerializedValueBatch`], which
//! is the format that the pageserver is expecting them in.
//!
//! The ingestion code uses these structs to help with parsing the WAL records,
//! and it splits them into a stream of modifications to the key-value pairs that
@@ -25,9 +26,7 @@
//! |--> write to KV store within the pageserver
use bytes::Bytes;
use pageserver_api::key::CompactKey;
use pageserver_api::reltag::{RelTag, SlruKind};
use pageserver_api::value::Value;
use postgres_ffi::walrecord::{
XlMultiXactCreate, XlMultiXactTruncate, XlRelmapUpdate, XlReploriginDrop, XlReploriginSet,
XlSmgrTruncate, XlXactParsedRecord,
@@ -35,6 +34,8 @@ use postgres_ffi::walrecord::{
use postgres_ffi::{Oid, TransactionId};
use utils::lsn::Lsn;
use crate::serialized_batch::SerializedValueBatch;
pub enum FlushUncommittedRecords {
Yes,
No,
@@ -45,12 +46,11 @@ pub struct InterpretedWalRecord {
/// Optional metadata record - may cause writes to metadata keys
/// in the storage engine
pub metadata_record: Option<MetadataRecord>,
/// Images or deltas for blocks modified in the original WAL record.
/// The [`Value`] is optional to avoid sending superfluous data to
/// shard 0 for relation size tracking.
pub blocks: Vec<(CompactKey, Option<Value>)>,
/// A pre-serialized batch along with the required metadata for ingestion
/// by the pageserver
pub batch: SerializedValueBatch,
/// Byte offset within WAL for the end of the original PG WAL record
pub lsn: Lsn,
pub end_lsn: Lsn,
/// Whether to flush all uncommitted modifications to the storage engine
/// before ingesting this record. This is currently only used for legacy PG
/// database creations which read pages from a template database. Such WAL


@@ -0,0 +1,862 @@
//! This module implements batch type for serialized [`pageserver_api::value::Value`]
//! instances. Each batch contains a raw buffer (serialized values)
//! and a list of metadata for each (key, LSN) tuple present in the batch.
//!
//! Such batches are created from decoded PG wal records and ingested
//! by the pageserver by writing directly to the ephemeral file.
use std::collections::BTreeSet;
use bytes::{Bytes, BytesMut};
use pageserver_api::key::rel_block_to_key;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::RelTag;
use pageserver_api::shard::ShardIdentity;
use pageserver_api::{key::CompactKey, value::Value};
use postgres_ffi::walrecord::{DecodedBkpBlock, DecodedWALRecord};
use postgres_ffi::{page_is_new, page_set_lsn, pg_constants, BLCKSZ};
use utils::bin_ser::BeSer;
use utils::lsn::Lsn;
use pageserver_api::key::Key;
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
/// Accompanying metadata for the batch
/// A value may be serialized and stored into the batch or just "observed".
/// Shard 0 currently "observes" all values in order to accurately track
/// relation sizes. In the case of "observed" values, we only need to know
/// the key and LSN, so two types of metadata are supported to save on network
/// bandwidth.
pub enum ValueMeta {
Serialized(SerializedValueMeta),
Observed(ObservedValueMeta),
}
impl ValueMeta {
pub fn key(&self) -> CompactKey {
match self {
Self::Serialized(ser) => ser.key,
Self::Observed(obs) => obs.key,
}
}
pub fn lsn(&self) -> Lsn {
match self {
Self::Serialized(ser) => ser.lsn,
Self::Observed(obs) => obs.lsn,
}
}
}
/// Wrapper around [`ValueMeta`] that implements ordering by
/// (key, LSN) tuples
struct OrderedValueMeta(ValueMeta);
impl Ord for OrderedValueMeta {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
(self.0.key(), self.0.lsn()).cmp(&(other.0.key(), other.0.lsn()))
}
}
impl PartialOrd for OrderedValueMeta {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl PartialEq for OrderedValueMeta {
fn eq(&self, other: &Self) -> bool {
(self.0.key(), self.0.lsn()) == (other.0.key(), other.0.lsn())
}
}
impl Eq for OrderedValueMeta {}
/// Metadata for a [`Value`] serialized into the batch.
pub struct SerializedValueMeta {
pub key: CompactKey,
pub lsn: Lsn,
/// Starting offset of the value for the (key, LSN) tuple
/// in [`SerializedValueBatch::raw`]
pub batch_offset: u64,
pub len: usize,
pub will_init: bool,
}
/// Metadata for a [`Value`] observed by the batch
pub struct ObservedValueMeta {
pub key: CompactKey,
pub lsn: Lsn,
}
/// Batch of serialized [`Value`]s.
pub struct SerializedValueBatch {
/// [`Value`]s serialized in EphemeralFile's native format,
/// ready for disk write by the pageserver
pub raw: Vec<u8>,
/// Metadata to make sense of the bytes in [`Self::raw`]
/// and represent "observed" values.
///
/// Invariant: Metadata entries for any given key are ordered
/// by LSN. Note that entries for a key do not have to be contiguous.
pub metadata: Vec<ValueMeta>,
/// The highest LSN of any value in the batch
pub max_lsn: Lsn,
/// Number of values encoded by [`Self::raw`]
pub len: usize,
}
impl Default for SerializedValueBatch {
fn default() -> Self {
Self {
raw: Default::default(),
metadata: Default::default(),
max_lsn: Lsn(0),
len: 0,
}
}
}
impl SerializedValueBatch {
/// Build a batch of serialized values from a decoded PG WAL record
///
/// The batch will only contain values for keys targeting the specified
/// shard. Shard 0 is a special case, where any keys that don't belong to
/// it are "observed" by the batch (i.e. present in [`SerializedValueBatch::metadata`],
/// but absent from the raw buffer [`SerializedValueBatch::raw`]).
pub(crate) fn from_decoded_filtered(
decoded: DecodedWALRecord,
shard: &ShardIdentity,
record_end_lsn: Lsn,
pg_version: u32,
) -> anyhow::Result<SerializedValueBatch> {
// First determine how big the buffer needs to be and allocate it up-front.
// This duplicates some of the work below, but it's empirically much faster.
let estimated_buffer_size = Self::estimate_buffer_size(&decoded, shard, pg_version);
let mut buf = Vec::<u8>::with_capacity(estimated_buffer_size);
let mut metadata: Vec<ValueMeta> = Vec::with_capacity(decoded.blocks.len());
let mut max_lsn: Lsn = Lsn(0);
let mut len: usize = 0;
for blk in decoded.blocks.iter() {
let relative_off = buf.len() as u64;
let rel = RelTag {
spcnode: blk.rnode_spcnode,
dbnode: blk.rnode_dbnode,
relnode: blk.rnode_relnode,
forknum: blk.forknum,
};
let key = rel_block_to_key(rel, blk.blkno);
if !key.is_valid_key_on_write_path() {
anyhow::bail!("Unsupported key decoded at LSN {}: {}", record_end_lsn, key);
}
let key_is_local = shard.is_key_local(&key);
tracing::debug!(
lsn=%record_end_lsn,
key=%key,
"ingest: shard decision {}",
if !key_is_local { "drop" } else { "keep" },
);
if !key_is_local {
if shard.is_shard_zero() {
// Shard 0 tracks relation sizes. Although we will not store this block, we will observe
// its blkno in case it implicitly extends a relation.
metadata.push(ValueMeta::Observed(ObservedValueMeta {
key: key.to_compact(),
lsn: record_end_lsn,
}))
}
continue;
}
// Instead of storing full-page-image WAL record,
// it is better to store extracted image: we can skip wal-redo
// in this case. Also some FPI records may contain multiple (up to 32) pages,
// so they have to be copied multiple times.
//
let val = if Self::block_is_image(&decoded, blk, pg_version) {
// Extract page image from FPI record
let img_len = blk.bimg_len as usize;
let img_offs = blk.bimg_offset as usize;
let mut image = BytesMut::with_capacity(BLCKSZ as usize);
// TODO(vlad): skip the copy
image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]);
if blk.hole_length != 0 {
let tail = image.split_off(blk.hole_offset as usize);
image.resize(image.len() + blk.hole_length as usize, 0u8);
image.unsplit(tail);
}
//
// Match the logic of XLogReadBufferForRedoExtended:
// The page may be uninitialized. If so, we can't set the LSN because
// that would corrupt the page.
//
if !page_is_new(&image) {
page_set_lsn(&mut image, record_end_lsn)
}
assert_eq!(image.len(), BLCKSZ as usize);
Value::Image(image.freeze())
} else {
Value::WalRecord(NeonWalRecord::Postgres {
will_init: blk.will_init || blk.apply_image,
rec: decoded.record.clone(),
})
};
val.ser_into(&mut buf)
.expect("Writing into in-memory buffer is infallible");
let val_ser_size = buf.len() - relative_off as usize;
metadata.push(ValueMeta::Serialized(SerializedValueMeta {
key: key.to_compact(),
lsn: record_end_lsn,
batch_offset: relative_off,
len: val_ser_size,
will_init: val.will_init(),
}));
max_lsn = std::cmp::max(max_lsn, record_end_lsn);
len += 1;
}
if cfg!(any(debug_assertions, test)) {
let batch = Self {
raw: buf,
metadata,
max_lsn,
len,
};
batch.validate_lsn_order();
return Ok(batch);
}
Ok(Self {
raw: buf,
metadata,
max_lsn,
len,
})
}
/// Look into the decoded PG WAL record and determine
/// roughly how large the buffer for serialized values needs to be.
fn estimate_buffer_size(
decoded: &DecodedWALRecord,
shard: &ShardIdentity,
pg_version: u32,
) -> usize {
let mut estimate: usize = 0;
for blk in decoded.blocks.iter() {
let rel = RelTag {
spcnode: blk.rnode_spcnode,
dbnode: blk.rnode_dbnode,
relnode: blk.rnode_relnode,
forknum: blk.forknum,
};
let key = rel_block_to_key(rel, blk.blkno);
if !shard.is_key_local(&key) {
continue;
}
if Self::block_is_image(decoded, blk, pg_version) {
// 4 bytes for the Value::Image discriminator
// 8 bytes for encoding the size of the buffer
// BLCKSZ for the raw image
estimate += (4 + 8 + BLCKSZ) as usize;
} else {
// 4 bytes for the Value::WalRecord discriminator
// 4 bytes for the NeonWalRecord::Postgres discriminator
// 1 bytes for NeonWalRecord::Postgres::will_init
// 8 bytes for encoding the size of the buffer
// length of the raw record
estimate += 8 + 1 + 8 + decoded.record.len();
}
}
estimate
}
fn block_is_image(decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, pg_version: u32) -> bool {
blk.apply_image
&& blk.has_image
&& decoded.xl_rmid == pg_constants::RM_XLOG_ID
&& (decoded.xl_info == pg_constants::XLOG_FPI
|| decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
// compression of WAL is not yet supported: fall back to storing the original WAL record
&& !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version)
// do not materialize null pages because they will most likely soon be replaced with real data
&& blk.bimg_len != 0
}
/// Encode a list of values and metadata into a serialized batch
///
/// This is used by the pageserver ingest code to conveniently generate
/// batches for metadata writes.
pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> Self {
// Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by
// [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`]
let buffer_size = batch.iter().map(|i| i.2).sum::<usize>();
let mut buf = Vec::<u8>::with_capacity(buffer_size);
let mut metadata: Vec<ValueMeta> = Vec::with_capacity(batch.len());
let mut max_lsn: Lsn = Lsn(0);
let len = batch.len();
for (key, lsn, val_ser_size, val) in batch {
let relative_off = buf.len() as u64;
val.ser_into(&mut buf)
.expect("Writing into in-memory buffer is infallible");
metadata.push(ValueMeta::Serialized(SerializedValueMeta {
key,
lsn,
batch_offset: relative_off,
len: val_ser_size,
will_init: val.will_init(),
}));
max_lsn = std::cmp::max(max_lsn, lsn);
}
// Assert that we didn't do any extra allocations while building the buffer.
debug_assert!(buf.len() <= buffer_size);
if cfg!(any(debug_assertions, test)) {
let batch = Self {
raw: buf,
metadata,
max_lsn,
len,
};
batch.validate_lsn_order();
return batch;
}
Self {
raw: buf,
metadata,
max_lsn,
len,
}
}
/// Add one value to the batch
///
/// This is used by the pageserver ingest code to include metadata block
/// updates for a single key.
pub fn put(&mut self, key: CompactKey, value: Value, lsn: Lsn) {
let relative_off = self.raw.len() as u64;
value.ser_into(&mut self.raw).unwrap();
let val_ser_size = self.raw.len() - relative_off as usize;
self.metadata
.push(ValueMeta::Serialized(SerializedValueMeta {
key,
lsn,
batch_offset: relative_off,
len: val_ser_size,
will_init: value.will_init(),
}));
self.max_lsn = std::cmp::max(self.max_lsn, lsn);
self.len += 1;
if cfg!(any(debug_assertions, test)) {
self.validate_lsn_order();
}
}
/// Extend with the contents of another batch
///
/// One batch is generated for each decoded PG WAL record.
/// They are then merged to accumulate reasonably sized writes.
pub fn extend(&mut self, mut other: SerializedValueBatch) {
let extend_batch_start_offset = self.raw.len() as u64;
self.raw.extend(other.raw);
// Shift the offsets in the batch we are extending with
other.metadata.iter_mut().for_each(|meta| match meta {
ValueMeta::Serialized(ser) => {
ser.batch_offset += extend_batch_start_offset;
if cfg!(debug_assertions) {
let value_end = ser.batch_offset + ser.len as u64;
assert!((value_end as usize) <= self.raw.len());
}
}
ValueMeta::Observed(_) => {}
});
self.metadata.extend(other.metadata);
self.max_lsn = std::cmp::max(self.max_lsn, other.max_lsn);
self.len += other.len;
if cfg!(any(debug_assertions, test)) {
self.validate_lsn_order();
}
}
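// A minimal, hypothetical sketch (not part of this diff) of how per-record batches
// might be merged before flushing, per the doc comment above; names are illustrative.
//
//     let mut accum = SerializedValueBatch::default();
//     for record_batch in per_record_batches {
//         accum.extend(record_batch);
//     }
//     // accum now holds all values; offsets and max_lsn were adjusted by extend()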
/// Add zero images for the (key, LSN) tuples specified
///
/// PG versions below 16 do not zero out pages before extending
/// a relation and may leave gaps. Such gaps need to be identified
/// by the pageserver ingest logic and get patched up here.
///
/// Note that this function does not validate that the gaps have been
/// identified correctly (it does not know relation sizes), so it's up
/// to the call-site to do it properly.
pub fn zero_gaps(&mut self, gaps: Vec<(KeySpace, Lsn)>) {
// Implementation note:
//
// Values within [`SerializedValueBatch::raw`] do not have any ordering requirements,
// but the metadata entries should be ordered properly (see
// [`SerializedValueBatch::metadata`]).
//
// Exploiting this observation we do:
// 1. Drain all the metadata entries into an ordered set.
// The use of a BTreeSet keyed by (Key, Lsn) relies on the observation that Postgres never
// includes more than one update to the same block in the same WAL record.
// 2. For each (key, LSN) gap tuple, append a zero image to the raw buffer
// and add an index entry to the ordered metadata set.
// 3. Drain the ordered set back into a metadata vector
let mut ordered_metas = self
.metadata
.drain(..)
.map(OrderedValueMeta)
.collect::<BTreeSet<_>>();
for (keyspace, lsn) in gaps {
self.max_lsn = std::cmp::max(self.max_lsn, lsn);
for gap_range in keyspace.ranges {
let mut key = gap_range.start;
while key != gap_range.end {
let relative_off = self.raw.len() as u64;
// TODO(vlad): Can we be cheeky and write only one zero image, and
// make all index entries requiring a zero page point to it?
// Alternatively, we can change the index entry format to represent zero pages
// without writing them at all.
Value::Image(ZERO_PAGE.clone())
.ser_into(&mut self.raw)
.unwrap();
let val_ser_size = self.raw.len() - relative_off as usize;
ordered_metas.insert(OrderedValueMeta(ValueMeta::Serialized(
SerializedValueMeta {
key: key.to_compact(),
lsn,
batch_offset: relative_off,
len: val_ser_size,
will_init: true,
},
)));
self.len += 1;
key = key.next();
}
}
}
self.metadata = ordered_metas.into_iter().map(|ord| ord.0).collect();
if cfg!(any(debug_assertions, test)) {
self.validate_lsn_order();
}
}
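// A minimal, hypothetical sketch (not part of this diff) of patching up a detected
// gap; `gap_start_key`, `gap_end_key` and `lsn` are placeholders supplied by the caller.
//
//     let gaps = vec![(
//         KeySpace { ranges: vec![gap_start_key..gap_end_key] },
//         lsn,
//     )];
//     batch.zero_gaps(gaps);
//     // one zero image was appended per key in the gap range, at the given LSN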
/// Checks if the batch is empty
///
/// A batch is empty when it contains no serialized values.
/// Note that it may still contain observed values.
pub fn is_empty(&self) -> bool {
let empty = self.raw.is_empty();
if cfg!(debug_assertions) && empty {
assert!(self
.metadata
.iter()
.all(|meta| matches!(meta, ValueMeta::Observed(_))));
}
empty
}
/// Returns the number of values serialized in the batch
pub fn len(&self) -> usize {
self.len
}
/// Returns the size of the buffer wrapped by the batch
pub fn buffer_size(&self) -> usize {
self.raw.len()
}
pub fn updates_key(&self, key: &Key) -> bool {
self.metadata.iter().any(|meta| match meta {
ValueMeta::Serialized(ser) => key.to_compact() == ser.key,
ValueMeta::Observed(_) => false,
})
}
pub fn validate_lsn_order(&self) {
use std::collections::HashMap;
let mut last_seen_lsn_per_key: HashMap<CompactKey, Lsn> = HashMap::default();
for meta in self.metadata.iter() {
let lsn = meta.lsn();
let key = meta.key();
if let Some(prev_lsn) = last_seen_lsn_per_key.insert(key, lsn) {
assert!(
lsn >= prev_lsn,
"Ordering violated by {}: {} < {}",
Key::from_compact(key),
lsn,
prev_lsn
);
}
}
}
}
#[cfg(all(test, feature = "testing"))]
mod tests {
use super::*;
fn validate_batch(
batch: &SerializedValueBatch,
values: &[(CompactKey, Lsn, usize, Value)],
gaps: Option<&Vec<(KeySpace, Lsn)>>,
) {
// Invariant 1: The metadata for a given entry in the batch
// is correct and can be used to deserialize back to the original value.
for (key, lsn, size, value) in values.iter() {
let meta = batch
.metadata
.iter()
.find(|meta| (meta.key(), meta.lsn()) == (*key, *lsn))
.unwrap();
let meta = match meta {
ValueMeta::Serialized(ser) => ser,
ValueMeta::Observed(_) => unreachable!(),
};
assert_eq!(meta.len, *size);
assert_eq!(meta.will_init, value.will_init());
let start = meta.batch_offset as usize;
let end = meta.batch_offset as usize + meta.len;
let value_from_batch = Value::des(&batch.raw[start..end]).unwrap();
assert_eq!(&value_from_batch, value);
}
let mut expected_buffer_size: usize = values.iter().map(|(_, _, size, _)| size).sum();
let mut gap_pages_count: usize = 0;
// Invariant 2: Zero pages were added for identified gaps and their metadata
// is correct.
if let Some(gaps) = gaps {
for (gap_keyspace, lsn) in gaps {
for gap_range in &gap_keyspace.ranges {
let mut gap_key = gap_range.start;
while gap_key != gap_range.end {
let meta = batch
.metadata
.iter()
.find(|meta| (meta.key(), meta.lsn()) == (gap_key.to_compact(), *lsn))
.unwrap();
let meta = match meta {
ValueMeta::Serialized(ser) => ser,
ValueMeta::Observed(_) => unreachable!(),
};
let zero_value = Value::Image(ZERO_PAGE.clone());
let zero_value_size = zero_value.serialized_size().unwrap() as usize;
assert_eq!(meta.len, zero_value_size);
assert_eq!(meta.will_init, zero_value.will_init());
let start = meta.batch_offset as usize;
let end = meta.batch_offset as usize + meta.len;
let value_from_batch = Value::des(&batch.raw[start..end]).unwrap();
assert_eq!(value_from_batch, zero_value);
gap_pages_count += 1;
expected_buffer_size += zero_value_size;
gap_key = gap_key.next();
}
}
}
}
// Invariant 3: The length of the batch is equal to the number
// of values inserted, plus the number of gap pages. This extends
// to the raw buffer size.
assert_eq!(batch.len(), values.len() + gap_pages_count);
assert_eq!(expected_buffer_size, batch.buffer_size());
// Invariant 4: Metadata entries for any given key are sorted in LSN order.
batch.validate_lsn_order();
}
#[test]
fn test_creation_from_values() {
const LSN: Lsn = Lsn(0x10);
let key = Key::from_hex("110000000033333333444444445500000001").unwrap();
let values = vec![
(
key.to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("foo")),
),
(
key.next().to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("bar")),
),
(
key.to_compact(),
Lsn(LSN.0 + 0x10),
Value::WalRecord(NeonWalRecord::wal_append("baz")),
),
(
key.next().next().to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("taz")),
),
];
let values = values
.into_iter()
.map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
.collect::<Vec<_>>();
let batch = SerializedValueBatch::from_values(values.clone());
validate_batch(&batch, &values, None);
assert!(!batch.is_empty());
}
#[test]
fn test_put() {
const LSN: Lsn = Lsn(0x10);
let key = Key::from_hex("110000000033333333444444445500000001").unwrap();
let values = vec![
(
key.to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("foo")),
),
(
key.next().to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("bar")),
),
];
let mut values = values
.into_iter()
.map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
.collect::<Vec<_>>();
let mut batch = SerializedValueBatch::from_values(values.clone());
validate_batch(&batch, &values, None);
let value = (
key.to_compact(),
Lsn(LSN.0 + 0x10),
Value::WalRecord(NeonWalRecord::wal_append("baz")),
);
let serialized_size = value.2.serialized_size().unwrap() as usize;
let value = (value.0, value.1, serialized_size, value.2);
values.push(value.clone());
batch.put(value.0, value.3, value.1);
validate_batch(&batch, &values, None);
let value = (
key.next().next().to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("taz")),
);
let serialized_size = value.2.serialized_size().unwrap() as usize;
let value = (value.0, value.1, serialized_size, value.2);
values.push(value.clone());
batch.put(value.0, value.3, value.1);
validate_batch(&batch, &values, None);
}
#[test]
fn test_extension() {
const LSN: Lsn = Lsn(0x10);
let key = Key::from_hex("110000000033333333444444445500000001").unwrap();
let values = vec![
(
key.to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("foo")),
),
(
key.next().to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("bar")),
),
(
key.next().next().to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("taz")),
),
];
let mut values = values
.into_iter()
.map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
.collect::<Vec<_>>();
let mut batch = SerializedValueBatch::from_values(values.clone());
let other_values = vec![
(
key.to_compact(),
Lsn(LSN.0 + 0x10),
Value::WalRecord(NeonWalRecord::wal_append("foo")),
),
(
key.next().to_compact(),
Lsn(LSN.0 + 0x10),
Value::WalRecord(NeonWalRecord::wal_append("bar")),
),
(
key.next().next().to_compact(),
Lsn(LSN.0 + 0x10),
Value::WalRecord(NeonWalRecord::wal_append("taz")),
),
];
let other_values = other_values
.into_iter()
.map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
.collect::<Vec<_>>();
let other_batch = SerializedValueBatch::from_values(other_values.clone());
values.extend(other_values);
batch.extend(other_batch);
validate_batch(&batch, &values, None);
}
#[test]
fn test_gap_zeroing() {
const LSN: Lsn = Lsn(0x10);
let rel_foo_base_key = Key::from_hex("110000000033333333444444445500000001").unwrap();
let rel_bar_base_key = {
let mut key = rel_foo_base_key;
key.field4 += 1;
key
};
let values = vec![
(
rel_foo_base_key.to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("foo1")),
),
(
rel_foo_base_key.add(1).to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("foo2")),
),
(
rel_foo_base_key.add(5).to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("foo3")),
),
(
rel_foo_base_key.add(1).to_compact(),
Lsn(LSN.0 + 0x10),
Value::WalRecord(NeonWalRecord::wal_append("foo4")),
),
(
rel_foo_base_key.add(10).to_compact(),
Lsn(LSN.0 + 0x10),
Value::WalRecord(NeonWalRecord::wal_append("foo5")),
),
(
rel_foo_base_key.add(11).to_compact(),
Lsn(LSN.0 + 0x10),
Value::WalRecord(NeonWalRecord::wal_append("foo6")),
),
(
rel_foo_base_key.add(12).to_compact(),
Lsn(LSN.0 + 0x10),
Value::WalRecord(NeonWalRecord::wal_append("foo7")),
),
(
rel_bar_base_key.to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("bar1")),
),
(
rel_bar_base_key.add(4).to_compact(),
LSN,
Value::WalRecord(NeonWalRecord::wal_append("bar2")),
),
];
let values = values
.into_iter()
.map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
.collect::<Vec<_>>();
let mut batch = SerializedValueBatch::from_values(values.clone());
let gaps = vec![
(
KeySpace {
ranges: vec![
rel_foo_base_key.add(2)..rel_foo_base_key.add(5),
rel_bar_base_key.add(1)..rel_bar_base_key.add(4),
],
},
LSN,
),
(
KeySpace {
ranges: vec![rel_foo_base_key.add(6)..rel_foo_base_key.add(10)],
},
Lsn(LSN.0 + 0x10),
),
];
batch.zero_gaps(gaps.clone());
validate_batch(&batch, &values, Some(&gaps));
}
}

View File

@@ -9,7 +9,6 @@ use pageserver::{
l0_flush::{L0FlushConfig, L0FlushGlobalState},
page_cache,
task_mgr::TaskKind,
tenant::storage_layer::inmemory_layer::SerializedBatch,
tenant::storage_layer::InMemoryLayer,
virtual_file,
};
@@ -18,6 +17,7 @@ use utils::{
bin_ser::BeSer,
id::{TenantId, TimelineId},
};
use wal_decoder::serialized_batch::SerializedValueBatch;
// A very cheap hash for generating non-sequential keys.
fn murmurhash32(mut h: u32) -> u32 {
@@ -102,13 +102,13 @@ async fn ingest(
batch.push((key.to_compact(), lsn, data_ser_size, data.clone()));
if batch.len() >= BATCH_SIZE {
let this_batch = std::mem::take(&mut batch);
let serialized = SerializedBatch::from_values(this_batch).unwrap();
let serialized = SerializedValueBatch::from_values(this_batch);
layer.put_batch(serialized, &ctx).await?;
}
}
if !batch.is_empty() {
let this_batch = std::mem::take(&mut batch);
let serialized = SerializedBatch::from_values(this_batch).unwrap();
let serialized = SerializedValueBatch::from_values(this_batch);
layer.put_batch(serialized, &ctx).await?;
}
layer.freeze(lsn + 1).await;

View File

@@ -154,13 +154,17 @@ fn main() -> anyhow::Result<()> {
},
};
let started = Instant::now();
syncfs(dirfd)?;
let elapsed = started.elapsed();
info!(
elapsed_ms = elapsed.as_millis(),
"made tenant directory contents durable"
);
if conf.no_sync {
info!("Skipping syncfs on startup");
} else {
let started = Instant::now();
syncfs(dirfd)?;
let elapsed = started.elapsed();
info!(
elapsed_ms = elapsed.as_millis(),
"made tenant directory contents durable"
);
}
}
// Initialize up failpoints support

View File

@@ -69,6 +69,7 @@ pub struct PageServerConf {
pub wal_redo_timeout: Duration,
pub superuser: String,
pub locale: String,
pub page_cache_size: usize,
pub max_file_descriptors: usize,
@@ -178,6 +179,9 @@ pub struct PageServerConf {
/// Direct IO settings
pub virtual_file_io_mode: virtual_file::IoMode,
/// Optionally disable disk syncs (unsafe!)
pub no_sync: bool,
}
/// Token for authentication to safekeepers
@@ -298,6 +302,7 @@ impl PageServerConf {
wait_lsn_timeout,
wal_redo_timeout,
superuser,
locale,
page_cache_size,
max_file_descriptors,
pg_distrib_dir,
@@ -332,6 +337,7 @@ impl PageServerConf {
concurrent_tenant_size_logical_size_queries,
virtual_file_io_engine,
tenant_config,
no_sync,
} = config_toml;
let mut conf = PageServerConf {
@@ -344,6 +350,7 @@ impl PageServerConf {
wait_lsn_timeout,
wal_redo_timeout,
superuser,
locale,
page_cache_size,
max_file_descriptors,
http_auth_type,
@@ -409,6 +416,7 @@ impl PageServerConf {
.map(crate::l0_flush::L0FlushConfig::from)
.unwrap_or_default(),
virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()),
no_sync: no_sync.unwrap_or(false),
};
// ------------------------------------------------------------

View File

@@ -37,6 +37,7 @@ use pageserver_api::models::TenantShardLocation;
use pageserver_api::models::TenantShardSplitRequest;
use pageserver_api::models::TenantShardSplitResponse;
use pageserver_api::models::TenantSorting;
use pageserver_api::models::TenantState;
use pageserver_api::models::TimelineArchivalConfigRequest;
use pageserver_api::models::TimelineCreateRequestMode;
use pageserver_api::models::TimelinesInfoAndOffloaded;
@@ -295,6 +296,9 @@ impl From<GetActiveTenantError> for ApiError {
GetActiveTenantError::Broken(reason) => {
ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
}
GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
ApiError::ShuttingDown
}
GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)),
GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
GetActiveTenantError::NotFound(gte) => gte.into(),
@@ -1998,9 +2002,9 @@ async fn timeline_offload_handler(
"timeline has attached children".into(),
));
}
if !timeline.can_offload() {
if let (false, reason) = timeline.can_offload() {
return Err(ApiError::PreconditionFailed(
"Timeline::can_offload() returned false".into(),
format!("Timeline::can_offload() check failed: {}", reason) .into(),
));
}
offload_timeline(&tenant, &timeline)
@@ -2165,6 +2169,21 @@ async fn timeline_detach_ancestor_handler(
let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
let ctx = &ctx;
// Flush the upload queues of all timelines before detaching ancestor. We do the same thing again
// during shutdown. This early upload ensures the pageserver does not need to upload too many
// things and creates downtime during timeline reloads.
for timeline in tenant.list_timelines() {
timeline
.remote_client
.wait_completion()
.await
.map_err(|e| {
ApiError::PreconditionFailed(format!("cannot drain upload queue: {e}").into())
})?;
}
tracing::info!("all timeline upload queues are drained");
let timeline = tenant.get_timeline(timeline_id, true)?;
let progress = timeline

View File

@@ -1,10 +1,11 @@
//! The Page Service listens for client connections and serves their GetPage@LSN
//! requests.
use anyhow::Context;
use anyhow::{bail, Context};
use async_compression::tokio::write::GzipEncoder;
use bytes::Buf;
use futures::FutureExt;
use itertools::Itertools;
use once_cell::sync::OnceCell;
use pageserver_api::models::TenantState;
use pageserver_api::models::{
@@ -1221,6 +1222,222 @@ impl PageServerHandler {
}
}
/// `basebackup tenant timeline [lsn] [--gzip] [--replica]`
#[derive(Debug, Clone, Eq, PartialEq)]
struct BaseBackupCmd {
tenant_id: TenantId,
timeline_id: TimelineId,
lsn: Option<Lsn>,
gzip: bool,
replica: bool,
}
/// `fullbackup tenant timeline [lsn] [prev_lsn]`
#[derive(Debug, Clone, Eq, PartialEq)]
struct FullBackupCmd {
tenant_id: TenantId,
timeline_id: TimelineId,
lsn: Option<Lsn>,
prev_lsn: Option<Lsn>,
}
/// `pagestream_v2 tenant timeline`
#[derive(Debug, Clone, Eq, PartialEq)]
struct PageStreamCmd {
tenant_id: TenantId,
timeline_id: TimelineId,
}
/// `lease lsn tenant timeline lsn`
#[derive(Debug, Clone, Eq, PartialEq)]
struct LeaseLsnCmd {
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
lsn: Lsn,
}
#[derive(Debug, Clone, Eq, PartialEq)]
enum PageServiceCmd {
Set,
PageStream(PageStreamCmd),
BaseBackup(BaseBackupCmd),
FullBackup(FullBackupCmd),
LeaseLsn(LeaseLsnCmd),
}
impl PageStreamCmd {
fn parse(query: &str) -> anyhow::Result<Self> {
let parameters = query.split_whitespace().collect_vec();
if parameters.len() != 2 {
bail!(
"invalid number of parameters for pagestream command: {}",
query
);
}
let tenant_id = TenantId::from_str(parameters[0])
.with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?;
let timeline_id = TimelineId::from_str(parameters[1])
.with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?;
Ok(Self {
tenant_id,
timeline_id,
})
}
}
impl FullBackupCmd {
fn parse(query: &str) -> anyhow::Result<Self> {
let parameters = query.split_whitespace().collect_vec();
if parameters.len() < 2 || parameters.len() > 4 {
bail!(
"invalid number of parameters for basebackup command: {}",
query
);
}
let tenant_id = TenantId::from_str(parameters[0])
.with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?;
let timeline_id = TimelineId::from_str(parameters[1])
.with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?;
// The caller is responsible for providing correct lsn and prev_lsn.
let lsn = if let Some(lsn_str) = parameters.get(2) {
Some(
Lsn::from_str(lsn_str)
.with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
)
} else {
None
};
let prev_lsn = if let Some(prev_lsn_str) = parameters.get(3) {
Some(
Lsn::from_str(prev_lsn_str)
.with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?,
)
} else {
None
};
Ok(Self {
tenant_id,
timeline_id,
lsn,
prev_lsn,
})
}
}
impl BaseBackupCmd {
fn parse(query: &str) -> anyhow::Result<Self> {
let parameters = query.split_whitespace().collect_vec();
if parameters.len() < 2 {
bail!(
"invalid number of parameters for basebackup command: {}",
query
);
}
let tenant_id = TenantId::from_str(parameters[0])
.with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?;
let timeline_id = TimelineId::from_str(parameters[1])
.with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?;
let lsn;
let flags_parse_from;
if let Some(maybe_lsn) = parameters.get(2) {
if *maybe_lsn == "latest" {
lsn = None;
flags_parse_from = 3;
} else if maybe_lsn.starts_with("--") {
lsn = None;
flags_parse_from = 2;
} else {
lsn = Some(
Lsn::from_str(maybe_lsn)
.with_context(|| format!("Failed to parse lsn from {maybe_lsn}"))?,
);
flags_parse_from = 3;
}
} else {
lsn = None;
flags_parse_from = 2;
}
let mut gzip = false;
let mut replica = false;
for &param in &parameters[flags_parse_from..] {
match param {
"--gzip" => {
if gzip {
bail!("duplicate parameter for basebackup command: {param}")
}
gzip = true
}
"--replica" => {
if replica {
bail!("duplicate parameter for basebackup command: {param}")
}
replica = true
}
_ => bail!("invalid parameter for basebackup command: {param}"),
}
}
Ok(Self {
tenant_id,
timeline_id,
lsn,
gzip,
replica,
})
}
}
impl LeaseLsnCmd {
fn parse(query: &str) -> anyhow::Result<Self> {
let parameters = query.split_whitespace().collect_vec();
if parameters.len() != 3 {
bail!(
"invalid number of parameters for lease lsn command: {}",
query
);
}
let tenant_shard_id = TenantShardId::from_str(parameters[0])
.with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?;
let timeline_id = TimelineId::from_str(parameters[1])
.with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?;
let lsn = Lsn::from_str(parameters[2])
.with_context(|| format!("Failed to parse lsn from {}", parameters[2]))?;
Ok(Self {
tenant_shard_id,
timeline_id,
lsn,
})
}
}
impl PageServiceCmd {
fn parse(query: &str) -> anyhow::Result<Self> {
let query = query.trim();
let Some((cmd, other)) = query.split_once(' ') else {
bail!("cannot parse query: {query}")
};
match cmd.to_ascii_lowercase().as_str() {
"pagestream_v2" => Ok(Self::PageStream(PageStreamCmd::parse(other)?)),
"basebackup" => Ok(Self::BaseBackup(BaseBackupCmd::parse(other)?)),
"fullbackup" => Ok(Self::FullBackup(FullBackupCmd::parse(other)?)),
"lease" => {
let Some((cmd2, other)) = other.split_once(' ') else {
bail!("invalid lease command: {cmd}");
};
let cmd2 = cmd2.to_ascii_lowercase();
if cmd2 == "lsn" {
Ok(Self::LeaseLsn(LeaseLsnCmd::parse(other)?))
} else {
bail!("invalid lease command: {cmd}");
}
}
"set" => Ok(Self::Set),
_ => Err(anyhow::anyhow!("unsupported command {cmd} in {query}")),
}
}
}
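// A minimal, hypothetical sketch (not part of this diff) of how a query string maps
// onto the parsed command; the tenant/timeline ids are placeholders.
//
//     let cmd = PageServiceCmd::parse("basebackup <tenant_id> <timeline_id> 0/16ABCDE --gzip")?;
//     // => PageServiceCmd::BaseBackup(BaseBackupCmd {
//     //        lsn: Some(Lsn::from_str("0/16ABCDE")?),
//     //        gzip: true,
//     //        replica: false,
//     //        ..
//     //    })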
impl<IO> postgres_backend::Handler<IO> for PageServerHandler
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
@@ -1277,206 +1494,137 @@ where
fail::fail_point!("ps::connection-start::process-query");
let ctx = self.connection_ctx.attached_child();
debug!("process query {query_string:?}");
let parts = query_string.split_whitespace().collect::<Vec<_>>();
if let Some(params) = parts.strip_prefix(&["pagestream_v2"]) {
if params.len() != 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for pagestream command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::PageStreamV2)
.inc();
self.handle_pagerequests(
pgb,
debug!("process query {query_string}");
let query = PageServiceCmd::parse(query_string)?;
match query {
PageServiceCmd::PageStream(PageStreamCmd {
tenant_id,
timeline_id,
PagestreamProtocolVersion::V2,
ctx,
)
.await?;
} else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
if params.len() < 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for basebackup command"
)));
}) => {
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::PageStreamV2)
.inc();
self.handle_pagerequests(
pgb,
tenant_id,
timeline_id,
PagestreamProtocolVersion::V2,
ctx,
)
.await?;
}
PageServiceCmd::BaseBackup(BaseBackupCmd {
tenant_id,
timeline_id,
lsn,
gzip,
replica,
}) => {
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
self.check_permission(Some(tenant_id))?;
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::Basebackup)
.inc();
let mut lsn = None;
let mut replica = false;
let mut gzip = false;
for param in &params[2..] {
if param.starts_with("--") {
match *param {
"--gzip" => gzip = true,
"--replica" => replica = true,
_ => {
return Err(QueryError::Other(anyhow::anyhow!(
"Unknown parameter {param}",
)))
}
}
} else {
lsn = Some(
Lsn::from_str(param)
.with_context(|| format!("Failed to parse Lsn from {param}"))?,
);
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::Basebackup)
.inc();
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
let res = async {
self.handle_basebackup_request(
pgb,
tenant_id,
timeline_id,
lsn,
None,
false,
gzip,
replica,
&ctx,
)
.await?;
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
Result::<(), QueryError>::Ok(())
}
.await;
metric_recording.observe(&res);
res?;
}
// same as basebackup, but result includes relational data as well
PageServiceCmd::FullBackup(FullBackupCmd {
tenant_id,
timeline_id,
lsn,
prev_lsn,
}) => {
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
let res = async {
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::Fullbackup)
.inc();
// Check that the timeline exists
self.handle_basebackup_request(
pgb,
tenant_id,
timeline_id,
lsn,
None,
prev_lsn,
true,
false,
false,
gzip,
replica,
&ctx,
)
.await?;
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
Result::<(), QueryError>::Ok(())
}
.await;
metric_recording.observe(&res);
res?;
}
// same as basebackup, but result includes relational data as well
else if let Some(params) = parts.strip_prefix(&["fullbackup"]) {
if params.len() < 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for fullbackup command"
)));
PageServiceCmd::Set => {
// important because psycopg2 executes "SET datestyle TO 'ISO'"
// on connect
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
// The caller is responsible for providing correct lsn and prev_lsn.
let lsn = if let Some(lsn_str) = params.get(2) {
Some(
Lsn::from_str(lsn_str)
.with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
)
} else {
None
};
let prev_lsn = if let Some(prev_lsn_str) = params.get(3) {
Some(
Lsn::from_str(prev_lsn_str)
.with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?,
)
} else {
None
};
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::Fullbackup)
.inc();
// Check that the timeline exists
self.handle_basebackup_request(
pgb,
tenant_id,
PageServiceCmd::LeaseLsn(LeaseLsnCmd {
tenant_shard_id,
timeline_id,
lsn,
prev_lsn,
true,
false,
false,
&ctx,
)
.await?;
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.to_ascii_lowercase().starts_with("set ") {
// important because psycopg2 executes "SET datestyle TO 'ISO'"
// on connect
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with("lease lsn ") {
let params = &parts[2..];
if params.len() != 3 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number {} for lease lsn command",
params.len()
)));
}) => {
tracing::Span::current()
.record("tenant_id", field::display(tenant_shard_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_shard_id.tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::LeaseLsn)
.inc();
match self
.handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
.await
{
Ok(()) => {
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?
}
Err(e) => {
error!("error obtaining lsn lease for {lsn}: {e:?}");
pgb.write_message_noflush(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?
}
};
}
let tenant_shard_id = TenantShardId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
tracing::Span::current()
.record("tenant_id", field::display(tenant_shard_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_shard_id.tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::LeaseLsn)
.inc();
// The caller is responsible for providing correct lsn.
let lsn = Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
match self
.handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
.await
{
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
Err(e) => {
error!("error obtaining lsn lease for {lsn}: {e:?}");
pgb.write_message_noflush(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?
}
};
} else {
return Err(QueryError::Other(anyhow::anyhow!(
"unknown command {query_string}"
)));
}
Ok(())
@@ -1525,3 +1673,181 @@ fn set_tracing_field_shard_id(timeline: &Timeline) {
);
debug_assert_current_span_has_tenant_and_timeline_id();
}
#[cfg(test)]
mod tests {
use utils::shard::ShardCount;
use super::*;
#[test]
fn pageservice_cmd_parse() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let cmd =
PageServiceCmd::parse(&format!("pagestream_v2 {tenant_id} {timeline_id}")).unwrap();
assert_eq!(
cmd,
PageServiceCmd::PageStream(PageStreamCmd {
tenant_id,
timeline_id
})
);
let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id}")).unwrap();
assert_eq!(
cmd,
PageServiceCmd::BaseBackup(BaseBackupCmd {
tenant_id,
timeline_id,
lsn: None,
gzip: false,
replica: false
})
);
let cmd =
PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} --gzip")).unwrap();
assert_eq!(
cmd,
PageServiceCmd::BaseBackup(BaseBackupCmd {
tenant_id,
timeline_id,
lsn: None,
gzip: true,
replica: false
})
);
let cmd =
PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} latest")).unwrap();
assert_eq!(
cmd,
PageServiceCmd::BaseBackup(BaseBackupCmd {
tenant_id,
timeline_id,
lsn: None,
gzip: false,
replica: false
})
);
let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} 0/16ABCDE"))
.unwrap();
assert_eq!(
cmd,
PageServiceCmd::BaseBackup(BaseBackupCmd {
tenant_id,
timeline_id,
lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
gzip: false,
replica: false
})
);
let cmd = PageServiceCmd::parse(&format!(
"basebackup {tenant_id} {timeline_id} --replica --gzip"
))
.unwrap();
assert_eq!(
cmd,
PageServiceCmd::BaseBackup(BaseBackupCmd {
tenant_id,
timeline_id,
lsn: None,
gzip: true,
replica: true
})
);
let cmd = PageServiceCmd::parse(&format!(
"basebackup {tenant_id} {timeline_id} 0/16ABCDE --replica --gzip"
))
.unwrap();
assert_eq!(
cmd,
PageServiceCmd::BaseBackup(BaseBackupCmd {
tenant_id,
timeline_id,
lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
gzip: true,
replica: true
})
);
let cmd = PageServiceCmd::parse(&format!("fullbackup {tenant_id} {timeline_id}")).unwrap();
assert_eq!(
cmd,
PageServiceCmd::FullBackup(FullBackupCmd {
tenant_id,
timeline_id,
lsn: None,
prev_lsn: None
})
);
let cmd = PageServiceCmd::parse(&format!(
"fullbackup {tenant_id} {timeline_id} 0/16ABCDE 0/16ABCDF"
))
.unwrap();
assert_eq!(
cmd,
PageServiceCmd::FullBackup(FullBackupCmd {
tenant_id,
timeline_id,
lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
prev_lsn: Some(Lsn::from_str("0/16ABCDF").unwrap()),
})
);
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let cmd = PageServiceCmd::parse(&format!(
"lease lsn {tenant_shard_id} {timeline_id} 0/16ABCDE"
))
.unwrap();
assert_eq!(
cmd,
PageServiceCmd::LeaseLsn(LeaseLsnCmd {
tenant_shard_id,
timeline_id,
lsn: Lsn::from_str("0/16ABCDE").unwrap(),
})
);
let tenant_shard_id = TenantShardId::split(&tenant_shard_id, ShardCount(8))[1];
let cmd = PageServiceCmd::parse(&format!(
"lease lsn {tenant_shard_id} {timeline_id} 0/16ABCDE"
))
.unwrap();
assert_eq!(
cmd,
PageServiceCmd::LeaseLsn(LeaseLsnCmd {
tenant_shard_id,
timeline_id,
lsn: Lsn::from_str("0/16ABCDE").unwrap(),
})
);
let cmd = PageServiceCmd::parse("set a = b").unwrap();
assert_eq!(cmd, PageServiceCmd::Set);
let cmd = PageServiceCmd::parse("SET foo").unwrap();
assert_eq!(cmd, PageServiceCmd::Set);
}
#[test]
fn pageservice_cmd_err_handling() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let cmd = PageServiceCmd::parse("unknown_command");
assert!(cmd.is_err());
let cmd = PageServiceCmd::parse("pagestream_v2");
assert!(cmd.is_err());
let cmd = PageServiceCmd::parse(&format!("pagestream_v2 {tenant_id}xxx"));
assert!(cmd.is_err());
let cmd = PageServiceCmd::parse(&format!("pagestream_v2 {tenant_id}xxx {timeline_id}xxx"));
assert!(cmd.is_err());
let cmd = PageServiceCmd::parse(&format!(
"basebackup {tenant_id} {timeline_id} --gzip --gzip"
));
assert!(cmd.is_err());
let cmd = PageServiceCmd::parse(&format!(
"basebackup {tenant_id} {timeline_id} --gzip --unknown"
));
assert!(cmd.is_err());
let cmd = PageServiceCmd::parse(&format!(
"basebackup {tenant_id} {timeline_id} --gzip 0/16ABCDE"
));
assert!(cmd.is_err());
let cmd = PageServiceCmd::parse(&format!("lease {tenant_id} {timeline_id} gzip 0/16ABCDE"));
assert!(cmd.is_err());
}
}

View File

@@ -24,6 +24,7 @@ use pageserver_api::key::{
use pageserver_api::keyspace::SparseKeySpace;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use pageserver_api::shard::ShardIdentity;
use pageserver_api::value::Value;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::BLCKSZ;
@@ -38,12 +39,13 @@ use tracing::{debug, trace, warn};
use utils::bin_ser::DeserializeError;
use utils::pausable_failpoint;
use utils::{bin_ser::BeSer, lsn::Lsn};
use wal_decoder::serialized_batch::SerializedValueBatch;
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
pub const MAX_AUX_FILE_DELTAS: usize = 1024;
/// Max number of aux-file-related delta layers. The compaction will create a new image layer once this threshold is reached.
pub const MAX_AUX_FILE_V2_DELTAS: usize = 64;
pub const MAX_AUX_FILE_V2_DELTAS: usize = 16;
#[derive(Debug)]
pub enum LsnForTimestamp {
@@ -170,12 +172,11 @@ impl Timeline {
tline: self,
pending_lsns: Vec::new(),
pending_metadata_pages: HashMap::new(),
pending_data_pages: Vec::new(),
pending_zero_data_pages: Default::default(),
pending_data_batch: None,
pending_deletions: Vec::new(),
pending_nblocks: 0,
pending_directory_entries: Vec::new(),
pending_bytes: 0,
pending_metadata_bytes: 0,
lsn,
}
}
@@ -1025,21 +1026,14 @@ pub struct DatadirModification<'a> {
/// Data writes, ready to be flushed into an ephemeral layer. See [`Self::is_data_key`] for
/// which keys are stored here.
pending_data_pages: Vec<(CompactKey, Lsn, usize, Value)>,
// Sometimes during ingest, for example when extending a relation, we would like to write a zero page. However,
// if we encounter a write from postgres in the same wal record, we will drop this entry.
//
// Unlike other 'pending' fields, this does not last until the next call to commit(): it is flushed
// at the end of each wal record, and all these writes implicitly are at lsn Self::lsn
pending_zero_data_pages: HashSet<CompactKey>,
pending_data_batch: Option<SerializedValueBatch>,
/// For special "directory" keys that store key-value maps, track the size of the map
/// if it was updated in this modification.
pending_directory_entries: Vec<(DirectoryKind, usize)>,
/// An **approximation** of how large our EphemeralFile write will be when committed.
pending_bytes: usize,
/// An **approximation** of how many metadata bytes will be written to the EphemeralFile.
pending_metadata_bytes: usize,
}
impl<'a> DatadirModification<'a> {
@@ -1054,11 +1048,17 @@ impl<'a> DatadirModification<'a> {
}
pub(crate) fn approx_pending_bytes(&self) -> usize {
self.pending_bytes
self.pending_data_batch
.as_ref()
.map_or(0, |b| b.buffer_size())
+ self.pending_metadata_bytes
}
pub(crate) fn has_dirty_data_pages(&self) -> bool {
(!self.pending_data_pages.is_empty()) || (!self.pending_zero_data_pages.is_empty())
pub(crate) fn has_dirty_data(&self) -> bool {
!self
.pending_data_batch
.as_ref()
.map_or(true, |b| b.is_empty())
}
/// Set the current lsn
@@ -1070,9 +1070,6 @@ impl<'a> DatadirModification<'a> {
self.lsn
);
// If we are advancing LSN, then state from previous wal record should have been flushed.
assert!(self.pending_zero_data_pages.is_empty());
if lsn > self.lsn {
self.pending_lsns.push(self.lsn);
self.lsn = lsn;
@@ -1147,6 +1144,107 @@ impl<'a> DatadirModification<'a> {
Ok(())
}
/// Creates a relation if it is not already present.
/// Returns the current size of the relation
pub(crate) async fn create_relation_if_required(
&mut self,
rel: RelTag,
ctx: &RequestContext,
) -> Result<u32, PageReconstructError> {
// Get current size and put rel creation if rel doesn't exist
//
// NOTE: we check the cache first even though get_rel_exists and get_rel_size would
// check the cache too. This is because eagerly checking the cache results in
// less work overall and 10% better performance. It's more work on cache miss
// but cache miss is rare.
if let Some(nblocks) = self.tline.get_cached_rel_size(&rel, self.get_lsn()) {
Ok(nblocks)
} else if !self
.tline
.get_rel_exists(rel, Version::Modified(self), ctx)
.await?
{
// create it with 0 size initially, the logic below will extend it
self.put_rel_creation(rel, 0, ctx)
.await
.context("Relation Error")?;
Ok(0)
} else {
self.tline
.get_rel_size(rel, Version::Modified(self), ctx)
.await
}
}
/// Given a block number for a relation (which represents a newly written block),
/// the previous block count of the relation, and the shard info, find the gaps
/// that were created by the newly written block if any.
fn find_gaps(
rel: RelTag,
blkno: u32,
previous_nblocks: u32,
shard: &ShardIdentity,
) -> Option<KeySpace> {
let mut key = rel_block_to_key(rel, blkno);
let mut gap_accum = None;
for gap_blkno in previous_nblocks..blkno {
key.field6 = gap_blkno;
if shard.get_shard_number(&key) != shard.number {
continue;
}
gap_accum
.get_or_insert_with(KeySpaceAccum::new)
.add_key(key);
}
gap_accum.map(|accum| accum.to_keyspace())
}
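// An illustrative example (not part of this diff): on an unsharded tenant, if the
// relation previously had 3 blocks (0..=2) and block 7 is written next, blocks 3..=6
// form the gap.
//
//     let gaps = DatadirModification::find_gaps(rel, 7, 3, &ShardIdentity::unsharded());
//     assert_eq!(
//         gaps.unwrap(),
//         KeySpace::single(rel_block_to_key(rel, 3)..rel_block_to_key(rel, 7))
//     );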
pub async fn ingest_batch(
&mut self,
mut batch: SerializedValueBatch,
// TODO(vlad): remove this argument and replace the shard check with is_key_local
shard: &ShardIdentity,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let mut gaps_at_lsns = Vec::default();
for meta in batch.metadata.iter() {
let (rel, blkno) = Key::from_compact(meta.key()).to_rel_block()?;
let new_nblocks = blkno + 1;
let old_nblocks = self.create_relation_if_required(rel, ctx).await?;
if new_nblocks > old_nblocks {
self.put_rel_extend(rel, new_nblocks, ctx).await?;
}
if let Some(gaps) = Self::find_gaps(rel, blkno, old_nblocks, shard) {
gaps_at_lsns.push((gaps, meta.lsn()));
}
}
if !gaps_at_lsns.is_empty() {
batch.zero_gaps(gaps_at_lsns);
}
match self.pending_data_batch.as_mut() {
Some(pending_batch) => {
pending_batch.extend(batch);
}
None if !batch.is_empty() => {
self.pending_data_batch = Some(batch);
}
None => {
// Nothing to initialize the batch with
}
}
Ok(())
}
/// Put a new page version that can be constructed from a WAL record
///
/// NOTE: this will *not* implicitly extend the relation, if the page is beyond the
@@ -1229,8 +1327,13 @@ impl<'a> DatadirModification<'a> {
self.lsn
);
}
self.pending_zero_data_pages.insert(key.to_compact());
self.pending_bytes += ZERO_PAGE.len();
let batch = self
.pending_data_batch
.get_or_insert_with(SerializedValueBatch::default);
batch.put(key.to_compact(), Value::Image(ZERO_PAGE.clone()), self.lsn);
Ok(())
}
@@ -1248,17 +1351,14 @@ impl<'a> DatadirModification<'a> {
self.lsn
);
}
self.pending_zero_data_pages.insert(key.to_compact());
self.pending_bytes += ZERO_PAGE.len();
Ok(())
}
/// Call this at the end of each WAL record.
pub(crate) fn on_record_end(&mut self) {
let pending_zero_data_pages = std::mem::take(&mut self.pending_zero_data_pages);
for key in pending_zero_data_pages {
self.put_data(key, Value::Image(ZERO_PAGE.clone()));
}
let batch = self
.pending_data_batch
.get_or_insert_with(SerializedValueBatch::default);
batch.put(key.to_compact(), Value::Image(ZERO_PAGE.clone()), self.lsn);
Ok(())
}
/// Store a relmapper file (pg_filenode.map) in the repository
@@ -1750,12 +1850,17 @@ impl<'a> DatadirModification<'a> {
let mut writer = self.tline.writer().await;
// Flush relation and SLRU data blocks, keep metadata.
let pending_data_pages = std::mem::take(&mut self.pending_data_pages);
if let Some(batch) = self.pending_data_batch.take() {
tracing::debug!(
"Flushing batch with max_lsn={}. Last record LSN is {}",
batch.max_lsn,
self.tline.get_last_record_lsn()
);
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
writer.put_batch(pending_data_pages, ctx).await?;
self.pending_bytes = 0;
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
writer.put_batch(batch, ctx).await?;
}
if pending_nblocks != 0 {
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1775,9 +1880,6 @@ impl<'a> DatadirModification<'a> {
/// All the modifications in this atomic update are stamped by the specified LSN.
///
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
// Commit should never be called mid-wal-record
assert!(self.pending_zero_data_pages.is_empty());
let mut writer = self.tline.writer().await;
let pending_nblocks = self.pending_nblocks;
@@ -1785,21 +1887,49 @@ impl<'a> DatadirModification<'a> {
// Ordering: the items in this batch do not need to be in any global order, but values for
// a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on
// this to do efficient updates to its index.
let mut write_batch = std::mem::take(&mut self.pending_data_pages);
// this to do efficient updates to its index. See [`wal_decoder::serialized_batch`] for
// more details.
write_batch.extend(
self.pending_metadata_pages
let metadata_batch = {
let pending_meta = self
.pending_metadata_pages
.drain()
.flat_map(|(key, values)| {
values
.into_iter()
.map(move |(lsn, value_size, value)| (key, lsn, value_size, value))
}),
);
})
.collect::<Vec<_>>();
if !write_batch.is_empty() {
writer.put_batch(write_batch, ctx).await?;
if pending_meta.is_empty() {
None
} else {
Some(SerializedValueBatch::from_values(pending_meta))
}
};
let data_batch = self.pending_data_batch.take();
let maybe_batch = match (data_batch, metadata_batch) {
(Some(mut data), Some(metadata)) => {
data.extend(metadata);
Some(data)
}
(Some(data), None) => Some(data),
(None, Some(metadata)) => Some(metadata),
(None, None) => None,
};
if let Some(batch) = maybe_batch {
tracing::debug!(
"Flushing batch with max_lsn={}. Last record LSN is {}",
batch.max_lsn,
self.tline.get_last_record_lsn()
);
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
writer.put_batch(batch, ctx).await?;
}
if !self.pending_deletions.is_empty() {
@@ -1809,6 +1939,9 @@ impl<'a> DatadirModification<'a> {
self.pending_lsns.push(self.lsn);
for pending_lsn in self.pending_lsns.drain(..) {
// TODO(vlad): pretty sure the comment below is not valid anymore
// and we can call finish write with the latest LSN
//
// Ideally, we should be able to call writer.finish_write() only once
// with the highest LSN. However, the last_record_lsn variable in the
// timeline keeps track of the latest LSN and the immediate previous LSN
@@ -1824,14 +1957,14 @@ impl<'a> DatadirModification<'a> {
writer.update_directory_entries_count(kind, count as u64);
}
self.pending_bytes = 0;
self.pending_metadata_bytes = 0;
Ok(())
}
pub(crate) fn len(&self) -> usize {
self.pending_metadata_pages.len()
+ self.pending_data_pages.len()
+ self.pending_data_batch.as_ref().map_or(0, |b| b.len())
+ self.pending_deletions.len()
}
@@ -1873,11 +2006,10 @@ impl<'a> DatadirModification<'a> {
// modifications before ingesting DB create operations, which are the only kind that reads
// data pages during ingest.
if cfg!(debug_assertions) {
for (dirty_key, _, _, _) in &self.pending_data_pages {
debug_assert!(&key.to_compact() != dirty_key);
}
debug_assert!(!self.pending_zero_data_pages.contains(&key.to_compact()))
assert!(!self
.pending_data_batch
.as_ref()
.map_or(false, |b| b.updates_key(&key)));
}
}
@@ -1895,18 +2027,10 @@ impl<'a> DatadirModification<'a> {
}
fn put_data(&mut self, key: CompactKey, val: Value) {
let val_serialized_size = val.serialized_size().unwrap() as usize;
// If this page was previously zero'd in the same WalRecord, then drop the previous zero page write. This
// is an optimization that avoids persisting both the zero page generated by us (e.g. during a relation extend),
// and the subsequent postgres-originating write
if self.pending_zero_data_pages.remove(&key) {
self.pending_bytes -= ZERO_PAGE.len();
}
self.pending_bytes += val_serialized_size;
self.pending_data_pages
.push((key, self.lsn, val_serialized_size, val))
let batch = self
.pending_data_batch
.get_or_insert_with(SerializedValueBatch::default);
batch.put(key, val, self.lsn);
}
fn put_metadata(&mut self, key: CompactKey, val: Value) {
@@ -1914,10 +2038,10 @@ impl<'a> DatadirModification<'a> {
// Replace the previous value if it exists at the same lsn
if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() {
if *last_lsn == self.lsn {
// Update the pending_bytes contribution from this entry, and update the serialized size in place
self.pending_bytes -= *last_value_ser_size;
// Update the pending_metadata_bytes contribution from this entry, and update the serialized size in place
self.pending_metadata_bytes -= *last_value_ser_size;
*last_value_ser_size = val.serialized_size().unwrap() as usize;
self.pending_bytes += *last_value_ser_size;
self.pending_metadata_bytes += *last_value_ser_size;
// Use the latest value; this replaces any earlier write to the same (key, lsn), such as might
// have been generated by synthesized zero page writes prior to the first real write to a page.
@@ -1927,8 +2051,12 @@ impl<'a> DatadirModification<'a> {
}
let val_serialized_size = val.serialized_size().unwrap() as usize;
self.pending_bytes += val_serialized_size;
self.pending_metadata_bytes += val_serialized_size;
values.push((self.lsn, val_serialized_size, val));
if key == CHECKPOINT_KEY.to_compact() {
tracing::debug!("Checkpoint key added to pending with size {val_serialized_size}");
}
}
fn delete(&mut self, key_range: Range<Key>) {
@@ -2037,7 +2165,11 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
#[cfg(test)]
mod tests {
use hex_literal::hex;
use utils::id::TimelineId;
use pageserver_api::{models::ShardParameters, shard::ShardStripeSize};
use utils::{
id::TimelineId,
shard::{ShardCount, ShardNumber},
};
use super::*;
@@ -2091,6 +2223,93 @@ mod tests {
Ok(())
}
#[test]
fn gap_finding() {
let rel = RelTag {
spcnode: 1663,
dbnode: 208101,
relnode: 2620,
forknum: 0,
};
let base_blkno = 1;
let base_key = rel_block_to_key(rel, base_blkno);
let before_base_key = rel_block_to_key(rel, base_blkno - 1);
let shard = ShardIdentity::unsharded();
let mut previous_nblocks = 0;
for i in 0..10 {
let crnt_blkno = base_blkno + i;
let gaps = DatadirModification::find_gaps(rel, crnt_blkno, previous_nblocks, &shard);
previous_nblocks = crnt_blkno + 1;
if i == 0 {
// The first block we write is 1, so we should find the gap.
assert_eq!(gaps.unwrap(), KeySpace::single(before_base_key..base_key));
} else {
assert!(gaps.is_none());
}
}
// This is an update to an already existing block. No gaps here.
let update_blkno = 5;
let gaps = DatadirModification::find_gaps(rel, update_blkno, previous_nblocks, &shard);
assert!(gaps.is_none());
// This is an update past the current end block.
let after_gap_blkno = 20;
let gaps = DatadirModification::find_gaps(rel, after_gap_blkno, previous_nblocks, &shard);
let gap_start_key = rel_block_to_key(rel, previous_nblocks);
let after_gap_key = rel_block_to_key(rel, after_gap_blkno);
assert_eq!(
gaps.unwrap(),
KeySpace::single(gap_start_key..after_gap_key)
);
}
#[test]
fn sharded_gap_finding() {
let rel = RelTag {
spcnode: 1663,
dbnode: 208101,
relnode: 2620,
forknum: 0,
};
let first_blkno = 6;
// This shard will get the even blocks
let shard = ShardIdentity::from_params(
ShardNumber(0),
&ShardParameters {
count: ShardCount(2),
stripe_size: ShardStripeSize(1),
},
);
// Only keys belonging to this shard are considered as gaps.
let mut previous_nblocks = 0;
let gaps =
DatadirModification::find_gaps(rel, first_blkno, previous_nblocks, &shard).unwrap();
assert!(!gaps.ranges.is_empty());
for gap_range in gaps.ranges {
let mut k = gap_range.start;
while k != gap_range.end {
assert_eq!(shard.get_shard_number(&k), shard.number);
k = k.next();
}
}
previous_nblocks = first_blkno;
let update_blkno = 2;
let gaps = DatadirModification::find_gaps(rel, update_blkno, previous_nblocks, &shard);
assert!(gaps.is_none());
}
/*
fn assert_current_logical_size<R: Repository>(timeline: &DatadirTimeline<R>, lsn: Lsn) {
let incremental = timeline.get_current_logical_size();

View File

@@ -2493,14 +2493,22 @@ impl Tenant {
timelines_to_compact_or_offload = timelines
.iter()
.filter_map(|(timeline_id, timeline)| {
let (is_active, can_offload) = (timeline.is_active(), timeline.can_offload());
let (is_active, (can_offload, _)) =
(timeline.is_active(), timeline.can_offload());
let has_no_unoffloaded_children = {
!timelines
.iter()
.any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(*timeline_id))
};
let config_allows_offload = self.conf.timeline_offloading
|| self
.tenant_conf
.load()
.tenant_conf
.timeline_offloading
.unwrap_or_default();
let can_offload =
can_offload && has_no_unoffloaded_children && self.conf.timeline_offloading;
can_offload && has_no_unoffloaded_children && config_allows_offload;
if (is_active, can_offload) == (false, false) {
None
} else {
@@ -4772,10 +4780,18 @@ async fn run_initdb(
let _permit = INIT_DB_SEMAPHORE.acquire().await;
let initdb_command = tokio::process::Command::new(&initdb_bin_path)
let mut initdb_command = tokio::process::Command::new(&initdb_bin_path);
initdb_command
.args(["--pgdata", initdb_target_dir.as_ref()])
.args(["--username", &conf.superuser])
.args(["--encoding", "utf8"])
.args(["--locale", &conf.locale])
.args(["--lc-collate", &conf.locale])
.args(["--lc-ctype", &conf.locale])
.args(["--lc-messages", &conf.locale])
.args(["--lc-monetary", &conf.locale])
.args(["--lc-numeric", &conf.locale])
.args(["--lc-time", &conf.locale])
.arg("--no-instructions")
.arg("--no-sync")
.env_clear()
@@ -4785,15 +4801,27 @@ async fn run_initdb(
// stdout invocation produces the same output every time, we don't need it
.stdout(std::process::Stdio::null())
// we would be interested in the stderr output, if there was any
.stderr(std::process::Stdio::piped())
.spawn()?;
.stderr(std::process::Stdio::piped());
// Before version 15, only the libc locale provider was available.
if pg_version > 14 {
// Version 17 brought with it a builtin locale provider which only provides
// C and C.UTF-8. It is safer for collation purposes, since it is
// guaranteed to be consistent throughout a major release, and it is also more
// performant.
let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" };
initdb_command.args(["--locale-provider", locale_provider]);
}
let initdb_proc = initdb_command.spawn()?;
// Ideally we'd select here with the cancellation token, but the problem is that
// we can't safely terminate initdb: it launches processes of its own, and killing
// initdb doesn't kill them. After we return from this function, we want the target
// directory to be able to be cleaned up.
// See https://github.com/neondatabase/neon/issues/6385
let initdb_output = initdb_command.wait_with_output().await?;
let initdb_output = initdb_proc.wait_with_output().await?;
if !initdb_output.status.success() {
return Err(InitdbError::Failed(
initdb_output.status,
@@ -4902,6 +4930,7 @@ pub(crate) mod harness {
),
lsn_lease_length: Some(tenant_conf.lsn_lease_length),
lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
timeline_offloading: Some(tenant_conf.timeline_offloading),
}
}
}

View File

@@ -349,6 +349,10 @@ pub struct TenantConfOpt {
#[serde(with = "humantime_serde")]
#[serde(default)]
pub lsn_lease_length_for_ts: Option<Duration>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub timeline_offloading: Option<bool>,
}
impl TenantConfOpt {
@@ -411,6 +415,9 @@ impl TenantConfOpt {
lsn_lease_length_for_ts: self
.lsn_lease_length_for_ts
.unwrap_or(global_conf.lsn_lease_length_for_ts),
timeline_offloading: self
.timeline_offloading
.unwrap_or(global_conf.timeline_offloading),
}
}
}
@@ -464,6 +471,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
lsn_lease_length: value.lsn_lease_length.map(humantime),
lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
timeline_offloading: value.timeline_offloading,
}
}
}

View File

@@ -1959,7 +1959,7 @@ impl TenantManager {
attempt.before_reset_tenant();
let (_guard, progress) = utils::completion::channel();
match tenant.shutdown(progress, ShutdownMode::Hard).await {
match tenant.shutdown(progress, ShutdownMode::Flush).await {
Ok(()) => {
slot_guard.drop_old_value().expect("it was just shutdown");
}

View File

@@ -1445,7 +1445,7 @@ impl RemoteTimelineClient {
let remote_path = remote_layer_path(
&self.tenant_shard_id.tenant_id,
&self.timeline_id,
self.tenant_shard_id.to_index(),
uploaded.metadata().shard,
&uploaded.layer_desc().layer_name(),
uploaded.metadata().generation,
);
@@ -1486,7 +1486,7 @@ impl RemoteTimelineClient {
&adopted
.get_timeline_id()
.expect("Source timeline should be alive"),
self.tenant_shard_id.to_index(),
adopted.metadata().shard,
&adopted.layer_desc().layer_name(),
adopted.metadata().generation,
);
@@ -1494,7 +1494,7 @@ impl RemoteTimelineClient {
let target_remote_path = remote_layer_path(
&self.tenant_shard_id.tenant_id,
&self.timeline_id,
self.tenant_shard_id.to_index(),
adopted_as.metadata().shard,
&adopted_as.layer_desc().layer_name(),
adopted_as.metadata().generation,
);
@@ -2201,6 +2201,18 @@ impl RemoteTimelineClient {
inner.initialized_mut()?;
Ok(UploadQueueAccessor { inner })
}
pub(crate) fn no_pending_work(&self) -> bool {
let inner = self.upload_queue.lock().unwrap();
match &*inner {
UploadQueue::Uninitialized
| UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => true,
UploadQueue::Stopped(UploadQueueStopped::Deletable(x)) => {
x.upload_queue_for_deletion.no_pending_work()
}
UploadQueue::Initialized(x) => x.no_pending_work(),
}
}
}
pub(crate) struct UploadQueueAccessor<'a> {

View File

@@ -12,7 +12,7 @@ pub mod merge_iterator;
use crate::context::{AccessStatsBehavior, RequestContext};
use bytes::Bytes;
use pageserver_api::key::Key;
use pageserver_api::key::{Key, NON_INHERITED_SPARSE_RANGE};
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
use pageserver_api::record::NeonWalRecord;
use pageserver_api::value::Value;
@@ -196,6 +196,9 @@ impl ValuesReconstructState {
/// Returns true if this was the last value needed for the key and false otherwise.
///
/// If the key is done after the update, mark it as such.
///
/// If the key is in the sparse keyspace (i.e., aux files), we do not track it in
/// `keys_done`.
pub(crate) fn update_key(
&mut self,
key: &Key,
@@ -206,10 +209,18 @@ impl ValuesReconstructState {
.keys
.entry(*key)
.or_insert(Ok(VectoredValueReconstructState::default()));
let is_sparse_key = NON_INHERITED_SPARSE_RANGE.contains(key);
if let Ok(state) = state {
let key_done = match state.situation {
ValueReconstructSituation::Complete => unreachable!(),
ValueReconstructSituation::Complete => {
if is_sparse_key {
// Sparse keyspace might be visited multiple times because
// we don't track unmapped keyspaces.
return ValueReconstructSituation::Complete;
} else {
unreachable!()
}
}
ValueReconstructSituation::Continue => match value {
Value::Image(img) => {
state.img = Some((lsn, img));
@@ -234,7 +245,9 @@ impl ValuesReconstructState {
if key_done && state.situation == ValueReconstructSituation::Continue {
state.situation = ValueReconstructSituation::Complete;
self.keys_done.add_key(*key);
if !is_sparse_key {
self.keys_done.add_key(*key);
}
}
state.situation
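A minimal illustration of why keys in the sparse keyspace may be revisited (hypothetical u64 keys and a plain Range in place of the real Key/KeySpace types): sparse keys are never added to the done set, so seeing one again on a later layer visit is tolerated instead of being treated as a bug.

use std::collections::HashSet;
use std::ops::Range;

struct ReconstructSketch {
    sparse: Range<u64>,
    keys_done: HashSet<u64>,
}

impl ReconstructSketch {
    fn update_key(&mut self, key: u64, complete: bool) {
        if self.keys_done.contains(&key) {
            // Dense keys must never be revisited; sparse keys may be, because
            // unmapped sparse keyspace is not tracked.
            assert!(self.sparse.contains(&key));
            return;
        }
        if complete && !self.sparse.contains(&key) {
            self.keys_done.insert(key);
        }
    }
}

fn main() {
    let mut state = ReconstructSketch { sparse: 1000..2000, keys_done: HashSet::new() };
    state.update_key(42, true);   // dense key: tracked as done
    state.update_key(1500, true); // sparse key: not tracked
    state.update_key(1500, true); // revisiting a sparse key is fine
    assert!(state.keys_done.contains(&42));
    assert!(!state.keys_done.contains(&1500));
}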

View File

@@ -12,7 +12,7 @@ use crate::tenant::timeline::GetVectoredError;
use crate::tenant::PageReconstructError;
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
use crate::{l0_flush, page_cache};
use anyhow::{anyhow, Context, Result};
use anyhow::{anyhow, Result};
use camino::Utf8PathBuf;
use pageserver_api::key::CompactKey;
use pageserver_api::key::Key;
@@ -25,6 +25,7 @@ use std::sync::{Arc, OnceLock};
use std::time::Instant;
use tracing::*;
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta};
// avoid binding to Write (conflicts with std::io::Write)
// while being able to use std::fmt::Write's methods
use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
@@ -66,6 +67,8 @@ pub struct InMemoryLayer {
/// The above fields never change, except for `end_lsn`, which is only set once.
/// All other changing parts are in `inner`, and protected by a mutex.
inner: RwLock<InMemoryLayerInner>,
estimated_in_mem_size: AtomicU64,
}
impl std::fmt::Debug for InMemoryLayer {
@@ -452,6 +455,7 @@ impl InMemoryLayer {
len,
will_init,
} = index_entry.unpack();
reads.entry(key).or_default().push(ValueRead {
entry_lsn: *entry_lsn,
read: vectored_dio_read::LogicalRead::new(
@@ -513,68 +517,6 @@ impl InMemoryLayer {
}
}
/// Offset of a particular Value within a serialized batch.
struct SerializedBatchOffset {
key: CompactKey,
lsn: Lsn,
// TODO: separate type when we start serde-serializing this value, to avoid coupling
// in-memory representation to serialization format.
index_entry: IndexEntry,
}
pub struct SerializedBatch {
/// Blobs serialized in EphemeralFile's native format, ready for passing to [`EphemeralFile::write_raw`].
pub(crate) raw: Vec<u8>,
/// Index of values in [`Self::raw`], using offsets relative to the start of the buffer.
offsets: Vec<SerializedBatchOffset>,
/// The highest LSN of any value in the batch
pub(crate) max_lsn: Lsn,
}
impl SerializedBatch {
pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> anyhow::Result<Self> {
// Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by
// [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`]
let buffer_size = batch.iter().map(|i| i.2).sum::<usize>();
let mut cursor = std::io::Cursor::new(Vec::<u8>::with_capacity(buffer_size));
let mut offsets: Vec<SerializedBatchOffset> = Vec::with_capacity(batch.len());
let mut max_lsn: Lsn = Lsn(0);
for (key, lsn, val_ser_size, val) in batch {
let relative_off = cursor.position();
val.ser_into(&mut cursor)
.expect("Writing into in-memory buffer is infallible");
offsets.push(SerializedBatchOffset {
key,
lsn,
index_entry: IndexEntry::new(IndexEntryNewArgs {
base_offset: 0,
batch_offset: relative_off,
len: val_ser_size,
will_init: val.will_init(),
})
.context("higher-level code ensures that values are within supported ranges")?,
});
max_lsn = std::cmp::max(max_lsn, lsn);
}
let buffer = cursor.into_inner();
// Assert that we didn't do any extra allocations while building buffer.
debug_assert!(buffer.len() <= buffer_size);
Ok(Self {
raw: buffer,
offsets,
max_lsn,
})
}
}
fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
}
@@ -603,6 +545,10 @@ impl InMemoryLayer {
Ok(inner.file.len())
}
pub fn estimated_in_mem_size(&self) -> u64 {
self.estimated_in_mem_size.load(AtomicOrdering::Relaxed)
}
/// Create a new, empty, in-memory layer
pub async fn create(
conf: &'static PageServerConf,
@@ -632,6 +578,7 @@ impl InMemoryLayer {
file,
resource_units: GlobalResourceUnits::new(),
}),
estimated_in_mem_size: AtomicU64::new(0),
})
}
@@ -642,7 +589,7 @@ impl InMemoryLayer {
/// TODO: it can be made retryable if we aborted the process on EphemeralFile write errors.
pub async fn put_batch(
&self,
serialized_batch: SerializedBatch,
serialized_batch: SerializedValueBatch,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let mut inner = self.inner.write().await;
@@ -650,27 +597,13 @@ impl InMemoryLayer {
let base_offset = inner.file.len();
let SerializedBatch {
let SerializedValueBatch {
raw,
mut offsets,
metadata,
max_lsn: _,
len: _,
} = serialized_batch;
// Add the base_offset to the batch's index entries which are relative to the batch start.
for offset in &mut offsets {
let IndexEntryUnpacked {
will_init,
len,
pos,
} = offset.index_entry.unpack();
offset.index_entry = IndexEntry::new(IndexEntryNewArgs {
base_offset,
batch_offset: pos,
len: len.into_usize(),
will_init,
})?;
}
// Write the batch to the file
inner.file.write_raw(&raw, ctx).await?;
let new_size = inner.file.len();
@@ -683,12 +616,28 @@ impl InMemoryLayer {
assert_eq!(new_size, expected_new_len);
// Update the index with the new entries
for SerializedBatchOffset {
key,
lsn,
index_entry,
} in offsets
{
for meta in metadata {
let SerializedValueMeta {
key,
lsn,
batch_offset,
len,
will_init,
} = match meta {
ValueMeta::Serialized(ser) => ser,
ValueMeta::Observed(_) => {
continue;
}
};
// Add the base_offset to the batch's index entries which are relative to the batch start.
let index_entry = IndexEntry::new(IndexEntryNewArgs {
base_offset,
batch_offset,
len,
will_init,
})?;
let vec_map = inner.index.entry(key).or_default();
let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
if old.is_some() {
@@ -700,6 +649,12 @@ impl InMemoryLayer {
// because this case is unexpected, and we would like tests to fail if this happens.
warn!("Key {} at {} written twice at same LSN", key, lsn);
}
self.estimated_in_mem_size.fetch_add(
(std::mem::size_of::<CompactKey>()
+ std::mem::size_of::<Lsn>()
+ std::mem::size_of::<IndexEntry>()) as u64,
AtomicOrdering::Relaxed,
);
}
inner.resource_units.maybe_publish_size(new_size);

View File

@@ -23,9 +23,10 @@ use handle::ShardTimelineId;
use offload::OffloadError;
use once_cell::sync::Lazy;
use pageserver_api::{
config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD,
key::{
CompactKey, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
NON_INHERITED_SPARSE_RANGE,
},
keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
models::{
@@ -49,6 +50,7 @@ use utils::{
fs_ext, pausable_failpoint,
sync::gate::{Gate, GateGuard},
};
use wal_decoder::serialized_batch::SerializedValueBatch;
use std::sync::atomic::Ordering as AtomicOrdering;
use std::sync::{Arc, Mutex, RwLock, Weak};
@@ -131,7 +133,6 @@ use crate::task_mgr::TaskKind;
use crate::tenant::gc_result::GcResult;
use crate::ZERO_PAGE;
use pageserver_api::key::Key;
use pageserver_api::value::Value;
use self::delete::DeleteTimelineFlow;
pub(super) use self::eviction_task::EvictionTaskTenantState;
@@ -141,9 +142,7 @@ use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::{
config::TenantConf,
storage_layer::{inmemory_layer, LayerVisibilityHint},
upload_queue::NotInitialized,
config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized,
MaybeOffloaded,
};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
@@ -157,6 +156,9 @@ use super::{
GcError,
};
#[cfg(test)]
use pageserver_api::value::Value;
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub(crate) enum FlushLoopState {
NotStarted,
@@ -851,6 +853,10 @@ pub(crate) enum ShutdownMode {
/// While we are flushing, we continue to accept read I/O for LSNs ingested before
/// the call to [`Timeline::shutdown`].
FreezeAndFlush,
/// Only flush the layers to the remote storage without freezing any open layers. This is the
/// mode used by ancestor detach and any other operation that reloads a tenant without increasing
/// the generation number.
Flush,
/// Shut down immediately, without waiting for any open layers to flush.
Hard,
}
@@ -1564,12 +1570,16 @@ impl Timeline {
///
/// This is necessary but not sufficient for offloading the timeline, as it might have
/// child timelines that are not offloaded yet.
pub(crate) fn can_offload(&self) -> bool {
pub(crate) fn can_offload(&self) -> (bool, &'static str) {
if self.remote_client.is_archived() != Some(true) {
return false;
return (false, "the timeline is not archived");
}
if !self.remote_client.no_pending_work() {
// if the remote client is still processing some work, we can't offload
return (false, "the upload queue is not drained yet");
}
true
(true, "ok")
}
/// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending
@@ -1677,11 +1687,6 @@ impl Timeline {
pub(crate) async fn shutdown(&self, mode: ShutdownMode) {
debug_assert_current_span_has_tenant_and_timeline_id();
let try_freeze_and_flush = match mode {
ShutdownMode::FreezeAndFlush => true,
ShutdownMode::Hard => false,
};
// Regardless of whether we're going to try_freeze_and_flush
// or not, stop ingesting any more data. Walreceiver only provides
// cancellation but no "wait until gone", because it uses the Timeline::gate.
@@ -1703,7 +1708,7 @@ impl Timeline {
// ... and inform any waiters for newer LSNs that there won't be any.
self.last_record_lsn.shutdown();
if try_freeze_and_flush {
if let ShutdownMode::FreezeAndFlush = mode {
if let Some((open, frozen)) = self
.layers
.read()
@@ -1745,6 +1750,20 @@ impl Timeline {
warn!("failed to freeze and flush: {e:#}");
}
}
// `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but
// we also do a final check here to ensure that the queue is empty.
if !self.remote_client.no_pending_work() {
warn!("still have pending work in remote upload queue, but continuing shutting down anyways");
}
}
if let ShutdownMode::Flush = mode {
// drain the upload queue
self.remote_client.shutdown().await;
if !self.remote_client.no_pending_work() {
warn!("still have pending work in remote upload queue, but continuing shutting down anyways");
}
}
// Signal any subscribers to our cancellation token to drop out
@@ -3487,18 +3506,37 @@ impl Timeline {
let timer = self.metrics.flush_time_histo.start_timer();
let num_frozen_layers;
let frozen_layer_total_size;
let layer_to_flush = {
let guard = self.layers.read().await;
let Ok(lm) = guard.layer_map() else {
info!("dropping out of flush loop for timeline shutdown");
return;
};
num_frozen_layers = lm.frozen_layers.len();
frozen_layer_total_size = lm
.frozen_layers
.iter()
.map(|l| l.estimated_in_mem_size())
.sum::<u64>();
lm.frozen_layers.front().cloned()
// drop 'layers' lock to allow concurrent reads and writes
};
let Some(layer_to_flush) = layer_to_flush else {
break Ok(());
};
if num_frozen_layers
> std::cmp::max(
self.get_compaction_threshold(),
DEFAULT_COMPACTION_THRESHOLD,
)
&& frozen_layer_total_size >= /* 128 MB */ 128000000
{
tracing::warn!(
"too many frozen layers: {num_frozen_layers} layers with estimated in-mem size of {frozen_layer_total_size} bytes",
);
}
match self.flush_frozen_layer(layer_to_flush, ctx).await {
Ok(this_layer_to_lsn) => {
flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn);
@@ -4089,6 +4127,7 @@ impl Timeline {
) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
// Metadata keys image layer creation.
let mut reconstruct_state = ValuesReconstructState::default();
let begin = Instant::now();
let data = self
.get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
.await?;
@@ -4105,14 +4144,11 @@ impl Timeline {
(new_data, total_kb_retrieved / 1024, total_keys_retrieved)
};
let delta_files_accessed = reconstruct_state.get_delta_layers_visited();
let elapsed = begin.elapsed();
let trigger_generation = delta_files_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
debug!(
trigger_generation,
delta_files_accessed,
total_kb_retrieved,
total_keys_retrieved,
"generate metadata images"
info!(
"metadata key compaction: trigger_generation={trigger_generation}, delta_files_accessed={delta_files_accessed}, total_kb_retrieved={total_kb_retrieved}, total_keys_retrieved={total_keys_retrieved}, read_time={}s", elapsed.as_secs_f64()
);
if !trigger_generation && mode == ImageLayerCreationMode::Try {
@@ -5736,23 +5772,22 @@ impl<'a> TimelineWriter<'a> {
/// Put a batch of keys at the specified Lsns.
pub(crate) async fn put_batch(
&mut self,
batch: Vec<(CompactKey, Lsn, usize, Value)>,
batch: SerializedValueBatch,
ctx: &RequestContext,
) -> anyhow::Result<()> {
if batch.is_empty() {
return Ok(());
}
let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch)?;
let batch_max_lsn = serialized_batch.max_lsn;
let buf_size: u64 = serialized_batch.raw.len() as u64;
let batch_max_lsn = batch.max_lsn;
let buf_size: u64 = batch.buffer_size() as u64;
let action = self.get_open_layer_action(batch_max_lsn, buf_size);
let layer = self
.handle_open_layer_action(batch_max_lsn, action, ctx)
.await?;
let res = layer.put_batch(serialized_batch, ctx).await;
let res = layer.put_batch(batch, ctx).await;
if res.is_ok() {
// Update the current size only when the entire write was ok.
@@ -5787,11 +5822,14 @@ impl<'a> TimelineWriter<'a> {
);
}
let val_ser_size = value.serialized_size().unwrap() as usize;
self.put_batch(
vec![(key.to_compact(), lsn, val_ser_size, value.clone())],
ctx,
)
.await
let batch = SerializedValueBatch::from_values(vec![(
key.to_compact(),
lsn,
val_ser_size,
value.clone(),
)]);
self.put_batch(batch, ctx).await
}
pub(crate) async fn delete_batch(

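A hypothetical illustration of when each ShutdownMode variant is intended, based only on the variant documentation in this diff; the Reason type and the mapping below are assumptions, not actual call sites.

#[allow(dead_code)]
enum ShutdownMode {
    FreezeAndFlush,
    Flush,
    Hard,
}

#[allow(dead_code)]
enum Reason {
    ProcessShutdown,
    TimelineOffloadOrTenantReset,
    ImmediateStop,
}

fn pick_mode(reason: Reason) -> ShutdownMode {
    match reason {
        // persist open layers before the pageserver exits
        Reason::ProcessShutdown => ShutdownMode::FreezeAndFlush,
        // drain the upload queue without freezing open layers; the tenant is
        // reloaded under the same generation (offload, ancestor detach, reset)
        Reason::TimelineOffloadOrTenantReset => ShutdownMode::Flush,
        // stop immediately, without waiting for flushes or uploads
        Reason::ImmediateStop => ShutdownMode::Hard,
    }
}

fn main() {
    let _ = pick_mode(Reason::TimelineOffloadOrTenantReset);
}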
View File

@@ -12,7 +12,7 @@ use crate::{
virtual_file::{MaybeFatalIo, VirtualFile},
};
use anyhow::Context;
use pageserver_api::models::detach_ancestor::AncestorDetached;
use pageserver_api::{models::detach_ancestor::AncestorDetached, shard::ShardIdentity};
use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
@@ -376,8 +376,14 @@ pub(super) async fn prepare(
tasks.spawn(
async move {
let _permit = limiter.acquire().await;
let owned =
remote_copy(&adopted, &timeline, timeline.generation, &timeline.cancel).await?;
let owned = remote_copy(
&adopted,
&timeline,
timeline.generation,
timeline.shard_identity,
&timeline.cancel,
)
.await?;
tracing::info!(layer=%owned, "remote copied");
Ok(owned)
}
@@ -629,6 +635,7 @@ async fn remote_copy(
adopted: &Layer,
adoptee: &Arc<Timeline>,
generation: Generation,
shard_identity: ShardIdentity,
cancel: &CancellationToken,
) -> Result<Layer, Error> {
// depending on whether Layer::keep_resident holds, we could hardlink
@@ -636,6 +643,7 @@ async fn remote_copy(
let mut metadata = adopted.metadata();
debug_assert!(metadata.generation <= generation);
metadata.generation = generation;
metadata.shard = shard_identity.shard_index();
let owned = crate::tenant::storage_layer::Layer::for_evicted(
adoptee.conf,

View File

@@ -47,21 +47,18 @@ pub(crate) async fn offload_timeline(
match is_archived {
Some(true) => (),
Some(false) => {
tracing::warn!(?is_archived, "tried offloading a non-archived timeline");
tracing::warn!("tried offloading a non-archived timeline");
return Err(OffloadError::NotArchived);
}
None => {
// This is legal: calls to this function can race with the timeline shutting down
tracing::info!(
?is_archived,
"tried offloading a timeline whose remote storage is not initialized"
);
tracing::info!("tried offloading a timeline whose remote storage is not initialized");
return Err(OffloadError::Cancelled);
}
}
// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
timeline.shutdown(super::ShutdownMode::Hard).await;
timeline.shutdown(super::ShutdownMode::Flush).await;
// TODO extend guard mechanism above with method
// to make deletions possible while offloading is in progress
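A hypothetical caller of the new can_offload() signature, with a stubbed Timeline type since the real call site is not shown in this hunk; the &'static str is a human-readable reason that can be logged when offloading is skipped.

struct Timeline;

impl Timeline {
    fn can_offload(&self) -> (bool, &'static str) {
        (false, "the upload queue is not drained yet")
    }
}

fn maybe_offload(timeline: &Timeline) {
    let (ok, reason) = timeline.can_offload();
    if !ok {
        println!("not offloading timeline: {reason}");
        return;
    }
    // proceed with the actual offload here
}

fn main() {
    maybe_offload(&Timeline);
}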

View File

@@ -331,11 +331,11 @@ pub(super) async fn handle_walreceiver_connection(
Ok(())
}
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
while let Some((record_end_lsn, recdata)) = waldecoder.poll_decode()? {
// It is important to deal with aligned records, as the LSN in getPage@LSN is
// aligned and can be several bytes bigger. Without this alignment we are
// at risk of hitting a deadlock.
if !lsn.is_aligned() {
if !record_end_lsn.is_aligned() {
return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
}
@@ -343,7 +343,7 @@ pub(super) async fn handle_walreceiver_connection(
let interpreted = InterpretedWalRecord::from_bytes_filtered(
recdata,
modification.tline.get_shard_identity(),
lsn,
record_end_lsn,
modification.tline.pg_version,
)?;
@@ -366,9 +366,11 @@ pub(super) async fn handle_walreceiver_connection(
let ingested = walingest
.ingest_record(interpreted, &mut modification, &ctx)
.await
.with_context(|| format!("could not ingest record at {lsn}"))?;
.with_context(|| {
format!("could not ingest record at {record_end_lsn}")
})?;
if !ingested {
tracing::debug!("ingest: filtered out record @ LSN {lsn}");
tracing::debug!("ingest: filtered out record @ LSN {record_end_lsn}");
WAL_INGEST.records_filtered.inc();
filtered_records += 1;
}
@@ -378,7 +380,7 @@ pub(super) async fn handle_walreceiver_connection(
// to timeout the tests.
fail_point!("walreceiver-after-ingest");
last_rec_lsn = lsn;
last_rec_lsn = record_end_lsn;
// Commit every ingest_batch_size records. Even if we filtered out
// all records, we still need to call commit to advance the LSN.

View File

@@ -28,14 +28,13 @@ use std::time::Duration;
use std::time::Instant;
use std::time::SystemTime;
use pageserver_api::key::Key;
use pageserver_api::shard::ShardIdentity;
use postgres_ffi::fsm_logical_to_physical;
use postgres_ffi::walrecord::*;
use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz};
use wal_decoder::models::*;
use anyhow::{bail, Context, Result};
use anyhow::{bail, Result};
use bytes::{Buf, Bytes};
use tracing::*;
use utils::failpoint_support;
@@ -51,7 +50,6 @@ use crate::ZERO_PAGE;
use pageserver_api::key::rel_block_to_key;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use pageserver_api::value::Value;
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::TransactionId;
@@ -156,12 +154,12 @@ impl WalIngest {
WAL_INGEST.records_received.inc();
let prev_len = modification.len();
modification.set_lsn(interpreted.lsn)?;
modification.set_lsn(interpreted.end_lsn)?;
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) {
// Records of this type should always be preceded by a commit(), as they
// rely on reading data pages back from the Timeline.
assert!(!modification.has_dirty_data_pages());
assert!(!modification.has_dirty_data());
}
assert!(!self.checkpoint_modified);
@@ -275,28 +273,9 @@ impl WalIngest {
}
}
// Iterate through all the key value pairs provided in the interpreted block
// and update the modification currently in-flight to include them.
for (compact_key, maybe_value) in interpreted.blocks.into_iter() {
let (rel, blk) = Key::from_compact(compact_key).to_rel_block()?;
match maybe_value {
Some(Value::Image(img)) => {
self.put_rel_page_image(modification, rel, blk, img, ctx)
.await?;
}
Some(Value::WalRecord(rec)) => {
self.put_rel_wal_record(modification, rel, blk, rec, ctx)
.await?;
}
None => {
// Shard 0 tracks relation sizes. We will observe
// its blkno in case it implicitly extends a relation.
assert!(self.shard.is_shard_zero());
self.observe_decoded_block(modification, rel, blk, ctx)
.await?;
}
}
}
modification
.ingest_batch(interpreted.batch, &self.shard, ctx)
.await?;
// If checkpoint data was updated, store the new version in the repository
if self.checkpoint_modified {
@@ -310,8 +289,6 @@ impl WalIngest {
// until commit() is called to flush the data into the repository and update
// the latest LSN.
modification.on_record_end();
Ok(modification.len() > prev_len)
}
@@ -334,17 +311,6 @@ impl WalIngest {
Ok((epoch as u64) << 32 | xid as u64)
}
/// Do not store this block, but observe it for the purposes of updating our relation size state.
async fn observe_decoded_block(
&mut self,
modification: &mut DatadirModification<'_>,
rel: RelTag,
blkno: BlockNumber,
ctx: &RequestContext,
) -> Result<(), PageReconstructError> {
self.handle_rel_extend(modification, rel, blkno, ctx).await
}
async fn ingest_clear_vm_bits(
&mut self,
clear_vm_bits: ClearVmBits,
@@ -1248,6 +1214,7 @@ impl WalIngest {
Ok(())
}
#[cfg(test)]
async fn put_rel_page_image(
&mut self,
modification: &mut DatadirModification<'_>,
@@ -1297,36 +1264,7 @@ impl WalIngest {
let new_nblocks = blknum + 1;
// Check if the relation exists. We implicitly create relations on first
// record.
// TODO: would be nice if to be more explicit about it
// Get current size and put rel creation if rel doesn't exist
//
// NOTE: we check the cache first even though get_rel_exists and get_rel_size would
// check the cache too. This is because eagerly checking the cache results in
// less work overall and 10% better performance. It's more work on cache miss
// but cache miss is rare.
let old_nblocks = if let Some(nblocks) = modification
.tline
.get_cached_rel_size(&rel, modification.get_lsn())
{
nblocks
} else if !modification
.tline
.get_rel_exists(rel, Version::Modified(modification), ctx)
.await?
{
// create it with 0 size initially, the logic below will extend it
modification
.put_rel_creation(rel, 0, ctx)
.await
.context("Relation Error")?;
0
} else {
modification
.tline
.get_rel_size(rel, Version::Modified(modification), ctx)
.await?
};
let old_nblocks = modification.create_relation_if_required(rel, ctx).await?;
if new_nblocks > old_nblocks {
//info!("extending {} {} to {}", rel, old_nblocks, new_nblocks);
@@ -1553,25 +1491,21 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx)
.await?;
m.on_record_end();
m.commit(&ctx).await?;
let mut m = tline.begin_modification(Lsn(0x30));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 3"), &ctx)
.await?;
m.on_record_end();
m.commit(&ctx).await?;
let mut m = tline.begin_modification(Lsn(0x40));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1 at 4"), &ctx)
.await?;
m.on_record_end();
m.commit(&ctx).await?;
let mut m = tline.begin_modification(Lsn(0x50));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 2, test_img("foo blk 2 at 5"), &ctx)
.await?;
m.on_record_end();
m.commit(&ctx).await?;
assert_current_logical_size(&tline, Lsn(0x50));
@@ -1713,7 +1647,6 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1"), &ctx)
.await?;
m.on_record_end();
m.commit(&ctx).await?;
assert_eq!(
tline
@@ -1739,7 +1672,6 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 1500, test_img("foo blk 1500"), &ctx)
.await?;
m.on_record_end();
m.commit(&ctx).await?;
assert_eq!(
tline

View File

@@ -611,6 +611,17 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun
recptr = startptr;
nbytes = count;
/* Try to read directly from WAL buffers first. */
#if PG_MAJORVERSION_NUM >= 17
{
Size rbytes;
rbytes = WALReadFromBuffers(p, recptr, nbytes, tli);
recptr += rbytes;
nbytes -= rbytes;
p += rbytes;
}
#endif
while (nbytes > 0)
{
uint32 startoff;

View File

@@ -1361,29 +1361,35 @@ SendAppendRequests(Safekeeper *sk)
if (sk->active_state == SS_ACTIVE_READ_WAL)
{
char *errmsg;
int req_len;
req = &sk->appendRequest;
req_len = req->endLsn - req->beginLsn;
switch (wp->api.wal_read(sk,
&sk->outbuf.data[sk->outbuf.len],
req->beginLsn,
req->endLsn - req->beginLsn,
&errmsg))
/* We send zero-sized AppendRequests as heartbeats; don't wal_read for these. */
if (req_len > 0)
{
case NEON_WALREAD_SUCCESS:
break;
case NEON_WALREAD_WOULDBLOCK:
return true;
case NEON_WALREAD_ERROR:
wp_log(WARNING, "WAL reading for node %s:%s failed: %s",
sk->host, sk->port, errmsg);
ShutdownConnection(sk);
return false;
default:
Assert(false);
switch (wp->api.wal_read(sk,
&sk->outbuf.data[sk->outbuf.len],
req->beginLsn,
req_len,
&errmsg))
{
case NEON_WALREAD_SUCCESS:
break;
case NEON_WALREAD_WOULDBLOCK:
return true;
case NEON_WALREAD_ERROR:
wp_log(WARNING, "WAL reading for node %s:%s failed: %s",
sk->host, sk->port, errmsg);
ShutdownConnection(sk);
return false;
default:
Assert(false);
}
}
sk->outbuf.len += req->endLsn - req->beginLsn;
sk->outbuf.len += req_len;
writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len);

View File

@@ -1489,33 +1489,11 @@ walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count,
{
NeonWALReadResult res;
#if PG_MAJORVERSION_NUM >= 17
if (!sk->wp->config->syncSafekeepers)
{
Size rbytes;
rbytes = WALReadFromBuffers(buf, startptr, count,
walprop_pg_get_timeline_id());
startptr += rbytes;
count -= rbytes;
}
#endif
if (count == 0)
{
res = NEON_WALREAD_SUCCESS;
}
else
{
Assert(count > 0);
/* Now read the remaining WAL from the WAL file */
res = NeonWALRead(sk->xlogreader,
buf,
startptr,
count,
walprop_pg_get_timeline_id());
}
res = NeonWALRead(sk->xlogreader,
buf,
startptr,
count,
walprop_pg_get_timeline_id());
if (res == NEON_WALREAD_SUCCESS)
{

poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
[[package]]
name = "aiohappyeyeballs"
@@ -2106,83 +2106,78 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"]
[[package]]
name = "psycopg2-binary"
version = "2.9.9"
version = "2.9.10"
description = "psycopg2 - Python-PostgreSQL Database Adapter"
optional = false
python-versions = ">=3.7"
python-versions = ">=3.8"
files = [
{file = "psycopg2-binary-2.9.9.tar.gz", hash = "sha256:7f01846810177d829c7692f1f5ada8096762d9172af1b1a28d4ab5b77c923c1c"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c2470da5418b76232f02a2fcd2229537bb2d5a7096674ce61859c3229f2eb202"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c6af2a6d4b7ee9615cbb162b0738f6e1fd1f5c3eda7e5da17861eacf4c717ea7"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75723c3c0fbbf34350b46a3199eb50638ab22a0228f93fb472ef4d9becc2382b"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:83791a65b51ad6ee6cf0845634859d69a038ea9b03d7b26e703f94c7e93dbcf9"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0ef4854e82c09e84cc63084a9e4ccd6d9b154f1dbdd283efb92ecd0b5e2b8c84"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed1184ab8f113e8d660ce49a56390ca181f2981066acc27cf637d5c1e10ce46e"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d2997c458c690ec2bc6b0b7ecbafd02b029b7b4283078d3b32a852a7ce3ddd98"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:b58b4710c7f4161b5e9dcbe73bb7c62d65670a87df7bcce9e1faaad43e715245"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0c009475ee389757e6e34611d75f6e4f05f0cf5ebb76c6037508318e1a1e0d7e"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8dbf6d1bc73f1d04ec1734bae3b4fb0ee3cb2a493d35ede9badbeb901fb40f6f"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-win32.whl", hash = "sha256:3f78fd71c4f43a13d342be74ebbc0666fe1f555b8837eb113cb7416856c79682"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-win_amd64.whl", hash = "sha256:876801744b0dee379e4e3c38b76fc89f88834bb15bf92ee07d94acd06ec890a0"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ee825e70b1a209475622f7f7b776785bd68f34af6e7a46e2e42f27b659b5bc26"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1ea665f8ce695bcc37a90ee52de7a7980be5161375d42a0b6c6abedbf0d81f0f"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:143072318f793f53819048fdfe30c321890af0c3ec7cb1dfc9cc87aa88241de2"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c332c8d69fb64979ebf76613c66b985414927a40f8defa16cf1bc028b7b0a7b0"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7fc5a5acafb7d6ccca13bfa8c90f8c51f13d8fb87d95656d3950f0158d3ce53"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:977646e05232579d2e7b9c59e21dbe5261f403a88417f6a6512e70d3f8a046be"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b6356793b84728d9d50ead16ab43c187673831e9d4019013f1402c41b1db9b27"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:bc7bb56d04601d443f24094e9e31ae6deec9ccb23581f75343feebaf30423359"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:77853062a2c45be16fd6b8d6de2a99278ee1d985a7bd8b103e97e41c034006d2"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:78151aa3ec21dccd5cdef6c74c3e73386dcdfaf19bced944169697d7ac7482fc"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0605eaed3eb239e87df0d5e3c6489daae3f7388d455d0c0b4df899519c6a38d"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e6f98446430fdf41bd36d4faa6cb409f5140c1c2cf58ce0bbdaf16af7d3f119"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c77e3d1862452565875eb31bdb45ac62502feabbd53429fdc39a1cc341d681ba"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-win32.whl", hash = "sha256:64cf30263844fa208851ebb13b0732ce674d8ec6a0c86a4e160495d299ba3c93"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl", hash = "sha256:81ff62668af011f9a48787564ab7eded4e9fb17a4a6a74af5ffa6a457400d2ab"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8359bf4791968c5a78c56103702000105501adb557f3cf772b2c207284273984"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:275ff571376626195ab95a746e6a04c7df8ea34638b99fc11160de91f2fef503"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:f9b5571d33660d5009a8b3c25dc1db560206e2d2f89d3df1cb32d72c0d117d52"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:420f9bbf47a02616e8554e825208cb947969451978dceb77f95ad09c37791dae"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:4154ad09dac630a0f13f37b583eae260c6aa885d67dfbccb5b02c33f31a6d420"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a148c5d507bb9b4f2030a2025c545fccb0e1ef317393eaba42e7eabd28eb6041"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-win32.whl", hash = "sha256:68fc1f1ba168724771e38bee37d940d2865cb0f562380a1fb1ffb428b75cb692"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-win_amd64.whl", hash = "sha256:281309265596e388ef483250db3640e5f414168c5a67e9c665cafce9492eda2f"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:60989127da422b74a04345096c10d416c2b41bd7bf2a380eb541059e4e999980"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:246b123cc54bb5361588acc54218c8c9fb73068bf227a4a531d8ed56fa3ca7d6"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34eccd14566f8fe14b2b95bb13b11572f7c7d5c36da61caf414d23b91fcc5d94"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18d0ef97766055fec15b5de2c06dd8e7654705ce3e5e5eed3b6651a1d2a9a152"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d3f82c171b4ccd83bbaf35aa05e44e690113bd4f3b7b6cc54d2219b132f3ae55"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ead20f7913a9c1e894aebe47cccf9dc834e1618b7aa96155d2091a626e59c972"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ca49a8119c6cbd77375ae303b0cfd8c11f011abbbd64601167ecca18a87e7cdd"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:323ba25b92454adb36fa425dc5cf6f8f19f78948cbad2e7bc6cdf7b0d7982e59"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:1236ed0952fbd919c100bc839eaa4a39ebc397ed1c08a97fc45fee2a595aa1b3"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:729177eaf0aefca0994ce4cffe96ad3c75e377c7b6f4efa59ebf003b6d398716"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-win32.whl", hash = "sha256:804d99b24ad523a1fe18cc707bf741670332f7c7412e9d49cb5eab67e886b9b5"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-win_amd64.whl", hash = "sha256:a6cdcc3ede532f4a4b96000b6362099591ab4a3e913d70bcbac2b56c872446f7"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:72dffbd8b4194858d0941062a9766f8297e8868e1dd07a7b36212aaa90f49472"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:30dcc86377618a4c8f3b72418df92e77be4254d8f89f14b8e8f57d6d43603c0f"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:31a34c508c003a4347d389a9e6fcc2307cc2150eb516462a7a17512130de109e"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:15208be1c50b99203fe88d15695f22a5bed95ab3f84354c494bcb1d08557df67"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1873aade94b74715be2246321c8650cabf5a0d098a95bab81145ffffa4c13876"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a58c98a7e9c021f357348867f537017057c2ed7f77337fd914d0bedb35dace7"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4686818798f9194d03c9129a4d9a702d9e113a89cb03bffe08c6cf799e053291"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ebdc36bea43063116f0486869652cb2ed7032dbc59fbcb4445c4862b5c1ecf7f"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:ca08decd2697fdea0aea364b370b1249d47336aec935f87b8bbfd7da5b2ee9c1"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ac05fb791acf5e1a3e39402641827780fe44d27e72567a000412c648a85ba860"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-win32.whl", hash = "sha256:9dba73be7305b399924709b91682299794887cbbd88e38226ed9f6712eabee90"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-win_amd64.whl", hash = "sha256:f7ae5d65ccfbebdfa761585228eb4d0df3a8b15cfb53bd953e713e09fbb12957"},
{file = "psycopg2-binary-2.9.10.tar.gz", hash = "sha256:4b3df0e6990aa98acda57d983942eff13d824135fe2250e6522edaa782a06de2"},
{file = "psycopg2_binary-2.9.10-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:0ea8e3d0ae83564f2fc554955d327fa081d065c8ca5cc6d2abb643e2c9c1200f"},
{file = "psycopg2_binary-2.9.10-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:3e9c76f0ac6f92ecfc79516a8034a544926430f7b080ec5a0537bca389ee0906"},
{file = "psycopg2_binary-2.9.10-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ad26b467a405c798aaa1458ba09d7e2b6e5f96b1ce0ac15d82fd9f95dc38a92"},
{file = "psycopg2_binary-2.9.10-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:270934a475a0e4b6925b5f804e3809dd5f90f8613621d062848dd82f9cd62007"},
{file = "psycopg2_binary-2.9.10-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:48b338f08d93e7be4ab2b5f1dbe69dc5e9ef07170fe1f86514422076d9c010d0"},
{file = "psycopg2_binary-2.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f4152f8f76d2023aac16285576a9ecd2b11a9895373a1f10fd9db54b3ff06b4"},
{file = "psycopg2_binary-2.9.10-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:32581b3020c72d7a421009ee1c6bf4a131ef5f0a968fab2e2de0c9d2bb4577f1"},
{file = "psycopg2_binary-2.9.10-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:2ce3e21dc3437b1d960521eca599d57408a695a0d3c26797ea0f72e834c7ffe5"},
{file = "psycopg2_binary-2.9.10-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:e984839e75e0b60cfe75e351db53d6db750b00de45644c5d1f7ee5d1f34a1ce5"},
{file = "psycopg2_binary-2.9.10-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3c4745a90b78e51d9ba06e2088a2fe0c693ae19cc8cb051ccda44e8df8a6eb53"},
{file = "psycopg2_binary-2.9.10-cp310-cp310-win32.whl", hash = "sha256:e5720a5d25e3b99cd0dc5c8a440570469ff82659bb09431c1439b92caf184d3b"},
{file = "psycopg2_binary-2.9.10-cp310-cp310-win_amd64.whl", hash = "sha256:3c18f74eb4386bf35e92ab2354a12c17e5eb4d9798e4c0ad3a00783eae7cd9f1"},
{file = "psycopg2_binary-2.9.10-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:04392983d0bb89a8717772a193cfaac58871321e3ec69514e1c4e0d4957b5aff"},
{file = "psycopg2_binary-2.9.10-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:1a6784f0ce3fec4edc64e985865c17778514325074adf5ad8f80636cd029ef7c"},
{file = "psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b5f86c56eeb91dc3135b3fd8a95dc7ae14c538a2f3ad77a19645cf55bab1799c"},
{file = "psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b3d2491d4d78b6b14f76881905c7a8a8abcf974aad4a8a0b065273a0ed7a2cb"},
{file = "psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2286791ececda3a723d1910441c793be44625d86d1a4e79942751197f4d30341"},
{file = "psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:512d29bb12608891e349af6a0cccedce51677725a921c07dba6342beaf576f9a"},
{file = "psycopg2_binary-2.9.10-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5a507320c58903967ef7384355a4da7ff3f28132d679aeb23572753cbf2ec10b"},
{file = "psycopg2_binary-2.9.10-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:6d4fa1079cab9018f4d0bd2db307beaa612b0d13ba73b5c6304b9fe2fb441ff7"},
{file = "psycopg2_binary-2.9.10-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:851485a42dbb0bdc1edcdabdb8557c09c9655dfa2ca0460ff210522e073e319e"},
{file = "psycopg2_binary-2.9.10-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:35958ec9e46432d9076286dda67942ed6d968b9c3a6a2fd62b48939d1d78bf68"},
{file = "psycopg2_binary-2.9.10-cp311-cp311-win32.whl", hash = "sha256:ecced182e935529727401b24d76634a357c71c9275b356efafd8a2a91ec07392"},
{file = "psycopg2_binary-2.9.10-cp311-cp311-win_amd64.whl", hash = "sha256:ee0e8c683a7ff25d23b55b11161c2663d4b099770f6085ff0a20d4505778d6b4"},
{file = "psycopg2_binary-2.9.10-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:880845dfe1f85d9d5f7c412efea7a08946a46894537e4e5d091732eb1d34d9a0"},
{file = "psycopg2_binary-2.9.10-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9440fa522a79356aaa482aa4ba500b65f28e5d0e63b801abf6aa152a29bd842a"},
{file = "psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e3923c1d9870c49a2d44f795df0c889a22380d36ef92440ff618ec315757e539"},
{file = "psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7b2c956c028ea5de47ff3a8d6b3cc3330ab45cf0b7c3da35a2d6ff8420896526"},
{file = "psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f758ed67cab30b9a8d2833609513ce4d3bd027641673d4ebc9c067e4d208eec1"},
{file = "psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cd9b4f2cfab88ed4a9106192de509464b75a906462fb846b936eabe45c2063e"},
{file = "psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dc08420625b5a20b53551c50deae6e231e6371194fa0651dbe0fb206452ae1f"},
{file = "psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:d7cd730dfa7c36dbe8724426bf5612798734bff2d3c3857f36f2733f5bfc7c00"},
{file = "psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:155e69561d54d02b3c3209545fb08938e27889ff5a10c19de8d23eb5a41be8a5"},
{file = "psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c3cc28a6fd5a4a26224007712e79b81dbaee2ffb90ff406256158ec4d7b52b47"},
{file = "psycopg2_binary-2.9.10-cp312-cp312-win32.whl", hash = "sha256:ec8a77f521a17506a24a5f626cb2aee7850f9b69a0afe704586f63a464f3cd64"},
{file = "psycopg2_binary-2.9.10-cp312-cp312-win_amd64.whl", hash = "sha256:18c5ee682b9c6dd3696dad6e54cc7ff3a1a9020df6a5c0f861ef8bfd338c3ca0"},
{file = "psycopg2_binary-2.9.10-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:26540d4a9a4e2b096f1ff9cce51253d0504dca5a85872c7f7be23be5a53eb18d"},
{file = "psycopg2_binary-2.9.10-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:e217ce4d37667df0bc1c397fdcd8de5e81018ef305aed9415c3b093faaeb10fb"},
{file = "psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:245159e7ab20a71d989da00f280ca57da7641fa2cdcf71749c193cea540a74f7"},
{file = "psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c4ded1a24b20021ebe677b7b08ad10bf09aac197d6943bfe6fec70ac4e4690d"},
{file = "psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3abb691ff9e57d4a93355f60d4f4c1dd2d68326c968e7db17ea96df3c023ef73"},
{file = "psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8608c078134f0b3cbd9f89b34bd60a943b23fd33cc5f065e8d5f840061bd0673"},
{file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:230eeae2d71594103cd5b93fd29d1ace6420d0b86f4778739cb1a5a32f607d1f"},
{file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bb89f0a835bcfc1d42ccd5f41f04870c1b936d8507c6df12b7737febc40f0909"},
{file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f0c2d907a1e102526dd2986df638343388b94c33860ff3bbe1384130828714b1"},
{file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8157bed2f51db683f31306aa497311b560f2265998122abe1dce6428bd86567"},
{file = "psycopg2_binary-2.9.10-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:eb09aa7f9cecb45027683bb55aebaaf45a0df8bf6de68801a6afdc7947bb09d4"},
{file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b73d6d7f0ccdad7bc43e6d34273f70d587ef62f824d7261c4ae9b8b1b6af90e8"},
{file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce5ab4bf46a211a8e924d307c1b1fcda82368586a19d0a24f8ae166f5c784864"},
{file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:056470c3dc57904bbf63d6f534988bafc4e970ffd50f6271fc4ee7daad9498a5"},
{file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:73aa0e31fa4bb82578f3a6c74a73c273367727de397a7a0f07bd83cbea696baa"},
{file = "psycopg2_binary-2.9.10-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:8de718c0e1c4b982a54b41779667242bc630b2197948405b7bd8ce16bcecac92"},
{file = "psycopg2_binary-2.9.10-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:5c370b1e4975df846b0277b4deba86419ca77dbc25047f535b0bb03d1a544d44"},
{file = "psycopg2_binary-2.9.10-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:ffe8ed017e4ed70f68b7b371d84b7d4a790368db9203dfc2d222febd3a9c8863"},
{file = "psycopg2_binary-2.9.10-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:8aecc5e80c63f7459a1a2ab2c64df952051df196294d9f739933a9f6687e86b3"},
{file = "psycopg2_binary-2.9.10-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:7a813c8bdbaaaab1f078014b9b0b13f5de757e2b5d9be6403639b298a04d218b"},
{file = "psycopg2_binary-2.9.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d00924255d7fc916ef66e4bf22f354a940c67179ad3fd7067d7a0a9c84d2fbfc"},
{file = "psycopg2_binary-2.9.10-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7559bce4b505762d737172556a4e6ea8a9998ecac1e39b5233465093e8cee697"},
{file = "psycopg2_binary-2.9.10-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e8b58f0a96e7a1e341fc894f62c1177a7c83febebb5ff9123b579418fdc8a481"},
{file = "psycopg2_binary-2.9.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b269105e59ac96aba877c1707c600ae55711d9dcd3fc4b5012e4af68e30c648"},
{file = "psycopg2_binary-2.9.10-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:79625966e176dc97ddabc142351e0409e28acf4660b88d1cf6adb876d20c490d"},
{file = "psycopg2_binary-2.9.10-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:8aabf1c1a04584c168984ac678a668094d831f152859d06e055288fa515e4d30"},
{file = "psycopg2_binary-2.9.10-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:19721ac03892001ee8fdd11507e6a2e01f4e37014def96379411ca99d78aeb2c"},
{file = "psycopg2_binary-2.9.10-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7f5d859928e635fa3ce3477704acee0f667b3a3d3e4bb109f2b18d4005f38287"},
{file = "psycopg2_binary-2.9.10-cp39-cp39-win32.whl", hash = "sha256:3216ccf953b3f267691c90c6fe742e45d890d8272326b4a8b20850a03d05b7b8"},
{file = "psycopg2_binary-2.9.10-cp39-cp39-win_amd64.whl", hash = "sha256:30e34c4e97964805f715206c7b789d54a78b70f3ff19fbe590104b71c45600e5"},
]
[[package]]
@@ -3013,13 +3008,13 @@ files = [
[[package]]
name = "types-psycopg2"
version = "2.9.21.10"
version = "2.9.21.20241019"
description = "Typing stubs for psycopg2"
optional = false
python-versions = "*"
python-versions = ">=3.8"
files = [
{file = "types-psycopg2-2.9.21.10.tar.gz", hash = "sha256:c2600892312ae1c34e12f145749795d93dc4eac3ef7dbf8a9c1bfd45385e80d7"},
{file = "types_psycopg2-2.9.21.10-py3-none-any.whl", hash = "sha256:918224a0731a3650832e46633e720703b5beef7693a064e777d9748654fcf5e5"},
{file = "types-psycopg2-2.9.21.20241019.tar.gz", hash = "sha256:bca89b988d2ebd19bcd08b177d22a877ea8b841decb10ed130afcf39404612fa"},
{file = "types_psycopg2-2.9.21.20241019-py3-none-any.whl", hash = "sha256:44d091e67732d16a941baae48cd7b53bf91911bc36888652447cf1ef0c1fb3f6"},
]
[[package]]
@@ -3489,4 +3484,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "13bfc7479aacfe051abb92252b8ddc2e0c429f4607b2d9d8c4b353d2f75c1927"
content-hash = "c656496f9fbb7c29b2df3143c1d72c95b5e121cb6340134c0b8d070f54a08508"

View File

@@ -23,7 +23,7 @@ bstr.workspace = true
bytes = { workspace = true, features = ["serde"] }
camino.workspace = true
chrono.workspace = true
clap.workspace = true
clap = { workspace = true, features = ["derive", "env"] }
compute_api.workspace = true
consumption_metrics.workspace = true
dashmap.workspace = true
@@ -60,7 +60,7 @@ prometheus.workspace = true
rand.workspace = true
regex.workspace = true
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
reqwest.workspace = true
reqwest = { workspace = true, features = ["rustls-tls-native-roots"] }
reqwest-middleware = { workspace = true, features = ["json"] }
reqwest-retry.workspace = true
reqwest-tracing.workspace = true
@@ -98,6 +98,7 @@ rustls-native-certs.workspace = true
x509-parser.workspace = true
postgres-protocol.workspace = true
redis.workspace = true
zerocopy.workspace = true
# jwt stuff
jose-jwa = "0.1.2"

View File

@@ -51,7 +51,7 @@ pub(super) async fn authenticate(
sasl::Outcome::Success(key) => key,
sasl::Outcome::Failure(reason) => {
info!("auth backend failed with an error: {reason}");
return Err(auth::AuthError::auth_failed(&*creds.user));
return Err(auth::AuthError::password_failed(&*creds.user));
}
};

View File

@@ -9,15 +9,14 @@ use super::ComputeCredentialKeys;
use crate::cache::Cached;
use crate::config::AuthenticationConfig;
use crate::context::RequestMonitoring;
use crate::control_plane::provider::NodeInfo;
use crate::control_plane::{self, CachedNodeInfo};
use crate::control_plane::{self, CachedNodeInfo, NodeInfo};
use crate::error::{ReportableError, UserFacingError};
use crate::proxy::connect_compute::ComputeConnectBackend;
use crate::stream::PqStream;
use crate::{auth, compute, waiters};
#[derive(Debug, Error)]
pub(crate) enum WebAuthError {
pub(crate) enum ConsoleRedirectError {
#[error(transparent)]
WaiterRegister(#[from] waiters::RegisterError),
@@ -33,13 +32,13 @@ pub struct ConsoleRedirectBackend {
console_uri: reqwest::Url,
}
impl UserFacingError for WebAuthError {
impl UserFacingError for ConsoleRedirectError {
fn to_string_client(&self) -> String {
"Internal error".to_string()
}
}
impl ReportableError for WebAuthError {
impl ReportableError for ConsoleRedirectError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
Self::WaiterRegister(_) => crate::error::ErrorKind::Service,
@@ -104,7 +103,7 @@ async fn authenticate(
link_uri: &reqwest::Url,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<NodeInfo> {
ctx.set_auth_method(crate::context::AuthMethod::Web);
ctx.set_auth_method(crate::context::AuthMethod::ConsoleRedirect);
// registering waiter can fail if we get unlucky with rng.
// just try again.
@@ -117,7 +116,7 @@ async fn authenticate(
}
};
let span = info_span!("web", psql_session_id = &psql_session_id);
let span = info_span!("console_redirect", psql_session_id = &psql_session_id);
let greeting = hello_message(link_uri, &psql_session_id);
// Give user a URL to spawn a new database.
@@ -128,14 +127,16 @@ async fn authenticate(
.write_message(&Be::NoticeResponse(&greeting))
.await?;
// Wait for web console response (see `mgmt`).
// Wait for console response via control plane (see `mgmt`).
info!(parent: &span, "waiting for console's reply...");
let db_info = tokio::time::timeout(auth_config.webauth_confirmation_timeout, waiter)
let db_info = tokio::time::timeout(auth_config.console_redirect_confirmation_timeout, waiter)
.await
.map_err(|_elapsed| {
auth::AuthError::confirmation_timeout(auth_config.webauth_confirmation_timeout.into())
auth::AuthError::confirmation_timeout(
auth_config.console_redirect_confirmation_timeout.into(),
)
})?
.map_err(WebAuthError::from)?;
.map_err(ConsoleRedirectError::from)?;
if auth_config.ip_allowlist_check_enabled {
if let Some(allowed_ips) = &db_info.allowed_ips {

View File

@@ -46,7 +46,7 @@ pub(crate) async fn authenticate_cleartext(
sasl::Outcome::Success(key) => key,
sasl::Outcome::Failure(reason) => {
info!("auth backend failed with an error: {reason}");
return Err(auth::AuthError::auth_failed(&*info.user));
return Err(auth::AuthError::password_failed(&*info.user));
}
};

View File

@@ -7,8 +7,11 @@ use arc_swap::ArcSwapOption;
use dashmap::DashMap;
use jose_jwk::crypto::KeyInfo;
use reqwest::{redirect, Client};
use reqwest_retry::policies::ExponentialBackoff;
use reqwest_retry::RetryTransientMiddleware;
use serde::de::Visitor;
use serde::{Deserialize, Deserializer};
use serde_json::value::RawValue;
use signature::Verifier;
use thiserror::Error;
use tokio::time::Instant;
@@ -16,7 +19,7 @@ use tokio::time::Instant;
use crate::auth::backend::ComputeCredentialKeys;
use crate::context::RequestMonitoring;
use crate::control_plane::errors::GetEndpointJwksError;
use crate::http::parse_json_body_with_limit;
use crate::http::read_body_with_limit;
use crate::intern::RoleNameInt;
use crate::types::{EndpointId, RoleName};
@@ -28,6 +31,10 @@ const MAX_RENEW: Duration = Duration::from_secs(3600);
const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
const JWKS_USER_AGENT: &str = "neon-proxy";
const JWKS_CONNECT_TIMEOUT: Duration = Duration::from_secs(2);
const JWKS_FETCH_TIMEOUT: Duration = Duration::from_secs(5);
const JWKS_FETCH_RETRIES: u32 = 3;
/// How to get the JWT auth rules
pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
fn fetch_auth_rules(
@@ -55,7 +62,7 @@ pub(crate) struct AuthRule {
}
pub struct JwkCache {
client: reqwest::Client,
client: reqwest_middleware::ClientWithMiddleware,
map: DashMap<(EndpointId, RoleName), Arc<JwkCacheEntryLock>>,
}
@@ -117,6 +124,14 @@ impl Default for JwkCacheEntryLock {
}
}
#[derive(Deserialize)]
struct JwkSet<'a> {
/// we parse into raw-value because not all keys in a JWKS are ones
/// we can parse directly, so we parse them lazily.
#[serde(borrow)]
keys: Vec<&'a RawValue>,
}
impl JwkCacheEntryLock {
async fn acquire_permit<'a>(self: &'a Arc<Self>) -> JwkRenewalPermit<'a> {
JwkRenewalPermit::acquire_permit(self).await
@@ -130,7 +145,7 @@ impl JwkCacheEntryLock {
&self,
_permit: JwkRenewalPermit<'_>,
ctx: &RequestMonitoring,
client: &reqwest::Client,
client: &reqwest_middleware::ClientWithMiddleware,
endpoint: EndpointId,
auth_rules: &F,
) -> Result<Arc<JwkCacheEntry>, JwtError> {
@@ -154,22 +169,73 @@ impl JwkCacheEntryLock {
let req = client.get(rule.jwks_url.clone());
// TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`.
// TODO(conrad): We need to filter out URLs that point to local resources. Public internet only.
match req.send().await.and_then(|r| r.error_for_status()) {
match req.send().await.and_then(|r| {
r.error_for_status()
.map_err(reqwest_middleware::Error::Reqwest)
}) {
// todo: should we re-insert JWKs if we want to keep this JWKs URL?
// I expect these failures would be quite sparse.
Err(e) => tracing::warn!(url=?rule.jwks_url, error=?e, "could not fetch JWKs"),
Ok(r) => {
let resp: http::Response<reqwest::Body> = r.into();
match parse_json_body_with_limit::<jose_jwk::JwkSet>(
resp.into_body(),
MAX_JWK_BODY_SIZE,
)
.await
let bytes = match read_body_with_limit(resp.into_body(), MAX_JWK_BODY_SIZE)
.await
{
Ok(bytes) => bytes,
Err(e) => {
tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs");
continue;
}
};
match serde_json::from_slice::<JwkSet>(&bytes) {
Err(e) => {
tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs");
}
Ok(jwks) => {
// size_of::<&RawValue>() == 16
// size_of::<jose_jwk::Jwk>() == 288
// better to not pre-allocate this as it might be pretty large - especially if it has many
// keys we don't want or need.
// trivial 'attack': `{"keys":[` + repeat(`0`).take(30000).join(`,`) + `]}`
// this would consume 8MiB just like that!
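// (arithmetic on the sizes above: 30_000 * 288 B ≈ 8.2 MiB if pre-allocated as `jose_jwk::Jwk`,
// vs. 30_000 * 16 B ≈ 0.5 MiB of `&RawValue` pointers.)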
let mut keys = vec![];
let mut failed = 0;
for key in jwks.keys {
match serde_json::from_str::<jose_jwk::Jwk>(key.get()) {
Ok(key) => {
// if `use` (called `cls` in rust) is specified to be something other than signing,
// we can skip storing it.
if key
.prm
.cls
.as_ref()
.is_some_and(|c| *c != jose_jwk::Class::Signing)
{
continue;
}
keys.push(key);
}
Err(e) => {
tracing::debug!(url=?rule.jwks_url, failed=?e, "could not decode JWK");
failed += 1;
}
}
}
keys.shrink_to_fit();
if failed > 0 {
tracing::warn!(url=?rule.jwks_url, failed, "could not decode JWKs");
}
if keys.is_empty() {
tracing::warn!(url=?rule.jwks_url, "no valid JWKs found inside the response body");
continue;
}
let jwks = jose_jwk::JwkSet { keys };
key_sets.insert(
rule.id,
KeySet {
@@ -179,7 +245,7 @@ impl JwkCacheEntryLock {
},
);
}
}
};
}
}
}
@@ -196,7 +262,7 @@ impl JwkCacheEntryLock {
async fn get_or_update_jwk_cache<F: FetchAuthRules>(
self: &Arc<Self>,
ctx: &RequestMonitoring,
client: &reqwest::Client,
client: &reqwest_middleware::ClientWithMiddleware,
endpoint: EndpointId,
fetch: &F,
) -> Result<Arc<JwkCacheEntry>, JwtError> {
@@ -250,7 +316,7 @@ impl JwkCacheEntryLock {
self: &Arc<Self>,
ctx: &RequestMonitoring,
jwt: &str,
client: &reqwest::Client,
client: &reqwest_middleware::ClientWithMiddleware,
endpoint: EndpointId,
role_name: &RoleName,
fetch: &F,
@@ -369,8 +435,19 @@ impl Default for JwkCache {
let client = Client::builder()
.user_agent(JWKS_USER_AGENT)
.redirect(redirect::Policy::none())
.tls_built_in_native_certs(true)
.connect_timeout(JWKS_CONNECT_TIMEOUT)
.timeout(JWKS_FETCH_TIMEOUT)
.build()
.expect("using &str and standard redirect::Policy");
.expect("client config should be valid");
// Retry up to 3 times with increasing intervals between attempts.
let retry_policy = ExponentialBackoff::builder().build_with_max_retries(JWKS_FETCH_RETRIES);
let client = reqwest_middleware::ClientBuilder::new(client)
.with(RetryTransientMiddleware::new_with_policy(retry_policy))
.build();
JwkCache {
client,
map: DashMap::default(),
@@ -1209,4 +1286,63 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
}
}
}
#[tokio::test]
async fn check_jwk_keycloak_regression() {
let (rs, valid_jwk) = new_rsa_jwk(RS1, "rs1".into());
let valid_jwk = serde_json::to_value(valid_jwk).unwrap();
// This is valid, but we cannot parse it as we have no support for encryption JWKs, only signature-based ones.
// This is taken directly from Keycloak.
let invalid_jwk = serde_json::json! {
{
"kid": "U-Jc9xRli84eNqRpYQoIPF-GNuRWV3ZvAIhziRW2sbQ",
"kty": "RSA",
"alg": "RSA-OAEP",
"use": "enc",
"n": "yypYWsEKmM_wWdcPnSGLSm5ytw1WG7P7EVkKSulcDRlrM6HWj3PR68YS8LySYM2D9Z-79oAdZGKhIfzutqL8rK1vS14zDuPpAM-RWY3JuQfm1O_-1DZM8-07PmVRegP5KPxsKblLf_My8ByH6sUOIa1p2rbe2q_b0dSTXYu1t0dW-cGL5VShc400YymvTwpc-5uYNsaVxZajnB7JP1OunOiuCJ48AuVp3PqsLzgoXqlXEB1ZZdch3xT3bxaTtNruGvG4xmLZY68O_T3yrwTCNH2h_jFdGPyXdyZToCMSMK2qSbytlfwfN55pT9Vv42Lz1YmoB7XRjI9aExKPc5AxFw",
"e": "AQAB",
"x5c": [
"MIICmzCCAYMCBgGS41E6azANBgkqhkiG9w0BAQsFADARMQ8wDQYDVQQDDAZtYXN0ZXIwHhcNMjQxMDMxMTYwMTQ0WhcNMzQxMDMxMTYwMzI0WjARMQ8wDQYDVQQDDAZtYXN0ZXIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDLKlhawQqYz/BZ1w+dIYtKbnK3DVYbs/sRWQpK6VwNGWszodaPc9HrxhLwvJJgzYP1n7v2gB1kYqEh/O62ovysrW9LXjMO4+kAz5FZjcm5B+bU7/7UNkzz7Ts+ZVF6A/ko/GwpuUt/8zLwHIfqxQ4hrWnatt7ar9vR1JNdi7W3R1b5wYvlVKFzjTRjKa9PClz7m5g2xpXFlqOcHsk/U66c6K4InjwC5Wnc+qwvOCheqVcQHVll1yHfFPdvFpO02u4a8bjGYtljrw79PfKvBMI0faH+MV0Y/Jd3JlOgIxIwrapJvK2V/B83nmlP1W/jYvPViagHtdGMj1oTEo9zkDEXAgMBAAEwDQYJKoZIhvcNAQELBQADggEBAECYX59+Q9v6c9sb6Q0/C6IgLWG2nVCgVE1YWwIzz+68WrhlmNCRuPjY94roB+tc2tdHbj+Nh3LMzJk7L1KCQoW1+LPK6A6E8W9ad0YPcuw8csV2pUA3+H56exQMH0fUAPQAU7tXWvnQ7otcpV1XA8afn/NTMTsnxi9mSkor8MLMYQ3aeRyh1+LAchHBthWiltqsSUqXrbJF59u5p0ghquuKcWR3TXsA7klGYBgGU5KAJifr9XT87rN0bOkGvbeWAgKvnQnjZwxdnLqTfp/pRY/PiJJHhgIBYPIA7STGnMPjmJ995i34zhnbnd8WHXJA3LxrIMqLW/l8eIdvtM1w8KI="
],
"x5t": "QhfzMMnuAfkReTgZ1HtrfyOeeZs",
"x5t#S256": "cmHDUdKgLiRCEN28D5FBy9IJLFmR7QWfm77SLhGTCTU"
}
};
let jwks = serde_json::json! {{ "keys": [invalid_jwk, valid_jwk ] }};
let jwks_addr = jwks_server(move |path| match path {
"/" => Some(serde_json::to_vec(&jwks).unwrap()),
_ => None,
})
.await;
let role_name = RoleName::from("anonymous");
let role = RoleNameInt::from(&role_name);
let rules = vec![AuthRule {
id: "foo".to_owned(),
jwks_url: format!("http://{jwks_addr}/").parse().unwrap(),
audience: None,
role_names: vec![role],
}];
let fetch = Fetch(rules);
let jwk_cache = JwkCache::default();
let endpoint = EndpointId::from("ep");
let token = new_rsa_jwt("rs1".into(), rs);
jwk_cache
.check_jwt(
&RequestMonitoring::test(),
endpoint.clone(),
&role_name,
&fetch,
&token,
)
.await
.unwrap();
}
}

View File

@@ -9,7 +9,7 @@ use std::sync::Arc;
use std::time::Duration;
pub use console_redirect::ConsoleRedirectBackend;
pub(crate) use console_redirect::WebAuthError;
pub(crate) use console_redirect::ConsoleRedirectError;
use ipnet::{Ipv4Net, Ipv6Net};
use local::LocalBackend;
use tokio::io::{AsyncRead, AsyncWrite};
@@ -21,11 +21,11 @@ use crate::auth::{self, validate_password_and_exchange, AuthError, ComputeUserIn
use crate::cache::Cached;
use crate::config::AuthenticationConfig;
use crate::context::RequestMonitoring;
use crate::control_plane::client::ControlPlaneClient;
use crate::control_plane::errors::GetAuthInfoError;
use crate::control_plane::provider::{
CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneBackend,
use crate::control_plane::{
self, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi,
};
use crate::control_plane::{self, Api, AuthSecret};
use crate::intern::EndpointIdInt;
use crate::metrics::Metrics;
use crate::proxy::connect_compute::ComputeConnectBackend;
@@ -62,42 +62,26 @@ impl<T> std::ops::Deref for MaybeOwned<'_, T> {
/// backends which require them for the authentication process.
pub enum Backend<'a, T> {
/// Cloud API (V2).
ControlPlane(MaybeOwned<'a, ControlPlaneBackend>, T),
ControlPlane(MaybeOwned<'a, ControlPlaneClient>, T),
/// Local proxy uses configured auth credentials and does not wake compute
Local(MaybeOwned<'a, LocalBackend>),
}
#[cfg(test)]
pub(crate) trait TestBackend: Send + Sync + 'static {
fn wake_compute(&self) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError>;
fn get_allowed_ips_and_secret(
&self,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), control_plane::errors::GetAuthInfoError>;
fn dyn_clone(&self) -> Box<dyn TestBackend>;
}
#[cfg(test)]
impl Clone for Box<dyn TestBackend> {
fn clone(&self) -> Self {
TestBackend::dyn_clone(&**self)
}
}
impl std::fmt::Display for Backend<'_, ()> {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::ControlPlane(api, ()) => match &**api {
ControlPlaneBackend::Management(endpoint) => fmt
.debug_tuple("ControlPlane::Management")
ControlPlaneClient::Neon(endpoint) => fmt
.debug_tuple("ControlPlane::Neon")
.field(&endpoint.url())
.finish(),
#[cfg(any(test, feature = "testing"))]
ControlPlaneBackend::PostgresMock(endpoint) => fmt
ControlPlaneClient::PostgresMock(endpoint) => fmt
.debug_tuple("ControlPlane::PostgresMock")
.field(&endpoint.url())
.finish(),
#[cfg(test)]
ControlPlaneBackend::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(),
ControlPlaneClient::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(),
},
Self::Local(_) => fmt.debug_tuple("Local").finish(),
}
@@ -282,7 +266,7 @@ impl AuthenticationConfig {
/// All authentication flows will emit an AuthenticationOk message if successful.
async fn auth_quirks(
ctx: &RequestMonitoring,
api: &impl control_plane::Api,
api: &impl control_plane::ControlPlaneApi,
user_info: ComputeUserInfoMaybeEndpoint,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
allow_cleartext: bool,
@@ -349,7 +333,7 @@ async fn auth_quirks(
{
Ok(keys) => Ok(keys),
Err(e) => {
if e.is_auth_failed() {
if e.is_password_failed() {
// The password could have been changed, so we invalidate the cache.
cached_entry.invalidate();
}
@@ -376,7 +360,7 @@ async fn authenticate_with_secret(
crate::sasl::Outcome::Success(key) => key,
crate::sasl::Outcome::Failure(reason) => {
info!("auth backend failed with an error: {reason}");
return Err(auth::AuthError::auth_failed(&*info.user));
return Err(auth::AuthError::password_failed(&*info.user));
}
};
@@ -499,12 +483,12 @@ mod tests {
use std::time::Duration;
use bytes::BytesMut;
use control_plane::AuthSecret;
use fallible_iterator::FallibleIterator;
use once_cell::sync::Lazy;
use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
use postgres_protocol::message::backend::Message as PgMessage;
use postgres_protocol::message::frontend;
use provider::AuthSecret;
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt};
use super::jwt::JwkCache;
@@ -513,8 +497,7 @@ mod tests {
use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern};
use crate::config::AuthenticationConfig;
use crate::context::RequestMonitoring;
use crate::control_plane::provider::{self, CachedAllowedIps, CachedRoleSecret};
use crate::control_plane::{self, CachedNodeInfo};
use crate::control_plane::{self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret};
use crate::proxy::NeonOptions;
use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo};
use crate::scram::threadpool::ThreadPool;
@@ -526,7 +509,7 @@ mod tests {
secret: AuthSecret,
}
impl control_plane::Api for Auth {
impl control_plane::ControlPlaneApi for Auth {
async fn get_role_secret(
&self,
_ctx: &RequestMonitoring,
@@ -577,7 +560,7 @@ mod tests {
ip_allowlist_check_enabled: true,
is_auth_broker: false,
accept_jwts: false,
webauth_confirmation_timeout: std::time::Duration::from_secs(5),
console_redirect_confirmation_timeout: std::time::Duration::from_secs(5),
});
async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage {

View File

@@ -21,6 +21,7 @@ pub(crate) use flow::*;
use thiserror::Error;
use tokio::time::error::Elapsed;
use crate::auth::backend::jwt::JwtError;
use crate::control_plane;
use crate::error::{ReportableError, UserFacingError};
@@ -31,7 +32,7 @@ pub(crate) type Result<T> = std::result::Result<T, AuthError>;
#[derive(Debug, Error)]
pub(crate) enum AuthError {
#[error(transparent)]
Web(#[from] backend::WebAuthError),
ConsoleRedirect(#[from] backend::ConsoleRedirectError),
#[error(transparent)]
GetAuthInfo(#[from] control_plane::errors::GetAuthInfoError),
@@ -55,7 +56,7 @@ pub(crate) enum AuthError {
MissingEndpointName,
#[error("password authentication failed for user '{0}'")]
AuthFailed(Box<str>),
PasswordFailed(Box<str>),
/// Errors produced by e.g. [`crate::stream::PqStream`].
#[error(transparent)]
@@ -76,6 +77,9 @@ pub(crate) enum AuthError {
#[error("Disconnected due to inactivity after {0}.")]
ConfirmationTimeout(humantime::Duration),
#[error(transparent)]
Jwt(#[from] JwtError),
}
impl AuthError {
@@ -83,8 +87,8 @@ impl AuthError {
AuthError::BadAuthMethod(name.into())
}
pub(crate) fn auth_failed(user: impl Into<Box<str>>) -> Self {
AuthError::AuthFailed(user.into())
pub(crate) fn password_failed(user: impl Into<Box<str>>) -> Self {
AuthError::PasswordFailed(user.into())
}
pub(crate) fn ip_address_not_allowed(ip: IpAddr) -> Self {
@@ -95,8 +99,8 @@ impl AuthError {
AuthError::TooManyConnections
}
pub(crate) fn is_auth_failed(&self) -> bool {
matches!(self, AuthError::AuthFailed(_))
pub(crate) fn is_password_failed(&self) -> bool {
matches!(self, AuthError::PasswordFailed(_))
}
pub(crate) fn user_timeout(elapsed: Elapsed) -> Self {
@@ -111,10 +115,10 @@ impl AuthError {
impl UserFacingError for AuthError {
fn to_string_client(&self) -> String {
match self {
Self::Web(e) => e.to_string_client(),
Self::ConsoleRedirect(e) => e.to_string_client(),
Self::GetAuthInfo(e) => e.to_string_client(),
Self::Sasl(e) => e.to_string_client(),
Self::AuthFailed(_) => self.to_string(),
Self::PasswordFailed(_) => self.to_string(),
Self::BadAuthMethod(_) => self.to_string(),
Self::MalformedPassword(_) => self.to_string(),
Self::MissingEndpointName => self.to_string(),
@@ -123,6 +127,7 @@ impl UserFacingError for AuthError {
Self::TooManyConnections => self.to_string(),
Self::UserTimeout(_) => self.to_string(),
Self::ConfirmationTimeout(_) => self.to_string(),
Self::Jwt(_) => self.to_string(),
}
}
}
@@ -130,10 +135,10 @@ impl UserFacingError for AuthError {
impl ReportableError for AuthError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
Self::Web(e) => e.get_error_kind(),
Self::ConsoleRedirect(e) => e.get_error_kind(),
Self::GetAuthInfo(e) => e.get_error_kind(),
Self::Sasl(e) => e.get_error_kind(),
Self::AuthFailed(_) => crate::error::ErrorKind::User,
Self::PasswordFailed(_) => crate::error::ErrorKind::User,
Self::BadAuthMethod(_) => crate::error::ErrorKind::User,
Self::MalformedPassword(_) => crate::error::ErrorKind::User,
Self::MissingEndpointName => crate::error::ErrorKind::User,
@@ -142,6 +147,7 @@ impl ReportableError for AuthError {
Self::TooManyConnections => crate::error::ErrorKind::RateLimit,
Self::UserTimeout(_) => crate::error::ErrorKind::User,
Self::ConfirmationTimeout(_) => crate::error::ErrorKind::User,
Self::Jwt(_) => crate::error::ErrorKind::User,
}
}
}

View File

@@ -281,7 +281,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
ip_allowlist_check_enabled: true,
is_auth_broker: false,
accept_jwts: true,
webauth_confirmation_timeout: Duration::ZERO,
console_redirect_confirmation_timeout: Duration::ZERO,
},
proxy_protocol_v2: config::ProxyProtocolV2::Rejected,
handshake_timeout: Duration::from_secs(10),

View File

@@ -51,11 +51,11 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
#[derive(Clone, Debug, ValueEnum)]
enum AuthBackendType {
Console,
// clap only shows the name, not the alias, in usage text.
// TODO: swap name/alias and deprecate "link"
#[value(name("link"), alias("web"))]
Web,
#[value(name("console"), alias("cplane"))]
ControlPlane,
#[value(name("link"), alias("control-redirect"))]
ConsoleRedirect,
#[cfg(feature = "testing")]
Postgres,
@@ -71,7 +71,7 @@ struct ProxyCliArgs {
/// listen for incoming client connections on ip:port
#[clap(short, long, default_value = "127.0.0.1:4432")]
proxy: String,
#[clap(value_enum, long, default_value_t = AuthBackendType::Web)]
#[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)]
auth_backend: AuthBackendType,
/// listen for management callback connection on ip:port
#[clap(short, long, default_value = "127.0.0.1:7000")]
@@ -82,7 +82,7 @@ struct ProxyCliArgs {
/// listen for incoming wss connections on ip:port
#[clap(long)]
wss: Option<String>,
/// redirect unauthenticated users to the given uri in case of web auth
/// redirect unauthenticated users to the given uri in case of console redirect auth
#[clap(short, long, default_value = "http://localhost:3000/psql_session/")]
uri: String,
/// cloud API endpoint for authenticating users
@@ -92,6 +92,14 @@ struct ProxyCliArgs {
default_value = "http://localhost:3000/authenticate_proxy_request/"
)]
auth_endpoint: String,
/// JWT used to connect to control plane.
#[clap(
long,
value_name = "JWT",
default_value = "",
env = "NEON_PROXY_TO_CONTROLPLANE_TOKEN"
)]
control_plane_token: Arc<str>,
/// if this is not local proxy, this toggles whether we accept jwt or passwords for http
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
is_auth_broker: bool,
@@ -223,6 +231,7 @@ struct ProxyCliArgs {
proxy_protocol_v2: ProxyProtocolV2,
/// Time the proxy waits for the webauth session to be confirmed by the control plane.
// TODO: rename to `console_redirect_confirmation_timeout`.
#[clap(long, default_value = "2m", value_parser = humantime::parse_duration)]
webauth_confirmation_timeout: std::time::Duration,
}
@@ -513,7 +522,7 @@ async fn main() -> anyhow::Result<()> {
}
if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend {
if let proxy::control_plane::provider::ControlPlaneBackend::Management(api) = &**api {
if let proxy::control_plane::client::ControlPlaneClient::Neon(api) = &**api {
match (redis_notifications_client, regional_redis_client.clone()) {
(None, None) => {}
(client1, client2) => {
@@ -659,7 +668,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
ip_allowlist_check_enabled: !args.is_private_access_proxy,
is_auth_broker: args.is_auth_broker,
accept_jwts: args.is_auth_broker,
webauth_confirmation_timeout: args.webauth_confirmation_timeout,
console_redirect_confirmation_timeout: args.webauth_confirmation_timeout,
};
let config = ProxyConfig {
@@ -690,7 +699,7 @@ fn build_auth_backend(
args: &ProxyCliArgs,
) -> anyhow::Result<Either<&'static auth::Backend<'static, ()>, &'static ConsoleRedirectBackend>> {
match &args.auth_backend {
AuthBackendType::Console => {
AuthBackendType::ControlPlane => {
let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
let project_info_cache_config: ProjectInfoCacheOptions =
args.project_info_cache.parse()?;
@@ -732,13 +741,14 @@ fn build_auth_backend(
RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
let wake_compute_endpoint_rate_limiter =
Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit));
let api = control_plane::provider::neon::Api::new(
let api = control_plane::client::neon::NeonControlPlaneClient::new(
endpoint,
args.control_plane_token.clone(),
caches,
locks,
wake_compute_endpoint_rate_limiter,
);
let api = control_plane::provider::ControlPlaneBackend::Management(api);
let api = control_plane::client::ControlPlaneClient::Neon(api);
let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ());
let config = Box::leak(Box::new(auth_backend));
@@ -749,8 +759,11 @@ fn build_auth_backend(
#[cfg(feature = "testing")]
AuthBackendType::Postgres => {
let url = args.auth_endpoint.parse()?;
let api = control_plane::provider::mock::Api::new(url, !args.is_private_access_proxy);
let api = control_plane::provider::ControlPlaneBackend::PostgresMock(api);
let api = control_plane::client::mock::MockControlPlane::new(
url,
!args.is_private_access_proxy,
);
let api = control_plane::client::ControlPlaneClient::PostgresMock(api);
let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ());
@@ -759,7 +772,7 @@ fn build_auth_backend(
Ok(Either::Left(config))
}
AuthBackendType::Web => {
AuthBackendType::ConsoleRedirect => {
let url = args.uri.parse()?;
let backend = ConsoleRedirectBackend::new(url);

View File

@@ -1,13 +1,12 @@
use std::convert::Infallible;
use std::future::pending;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::time::Duration;
use std::sync::{Arc, Mutex};
use dashmap::DashSet;
use redis::streams::{StreamReadOptions, StreamReadReply};
use redis::{AsyncCommands, FromRedisValue, Value};
use serde::Deserialize;
use tokio::sync::Mutex;
use tokio_util::sync::CancellationToken;
use tracing::info;
@@ -19,23 +18,38 @@ use crate::rate_limiter::GlobalRateLimiter;
use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
use crate::types::EndpointId;
#[derive(Deserialize, Debug, Clone)]
pub(crate) struct ControlPlaneEventKey {
// TODO: this could be an enum, but events in Redis need to be fixed first.
// ProjectCreated was sent with type:branch_created. So we ignore type.
#[derive(Deserialize, Debug, Clone, PartialEq)]
struct ControlPlaneEvent {
endpoint_created: Option<EndpointCreated>,
branch_created: Option<BranchCreated>,
project_created: Option<ProjectCreated>,
#[serde(rename = "type")]
_type: Option<String>,
}
#[derive(Deserialize, Debug, Clone)]
#[derive(Deserialize, Debug, Clone, PartialEq)]
struct EndpointCreated {
endpoint_id: String,
endpoint_id: EndpointIdInt,
}
#[derive(Deserialize, Debug, Clone)]
#[derive(Deserialize, Debug, Clone, PartialEq)]
struct BranchCreated {
branch_id: String,
branch_id: BranchIdInt,
}
#[derive(Deserialize, Debug, Clone)]
#[derive(Deserialize, Debug, Clone, PartialEq)]
struct ProjectCreated {
project_id: String,
project_id: ProjectIdInt,
}
impl TryFrom<&Value> for ControlPlaneEvent {
type Error = anyhow::Error;
fn try_from(value: &Value) -> Result<Self, Self::Error> {
let json = String::from_redis_value(value)?;
Ok(serde_json::from_str(&json)?)
}
}
pub struct EndpointsCache {
@@ -60,60 +74,80 @@ impl EndpointsCache {
ready: AtomicBool::new(false),
}
}
pub(crate) async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool {
pub(crate) fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool {
if !self.ready.load(Ordering::Acquire) {
// the endpoint cache is not yet fully initialised.
return true;
}
let rejected = self.should_reject(endpoint);
ctx.set_rejected(rejected);
info!(?rejected, "check endpoint is valid, disabled cache");
// If cache is disabled, just collect the metrics and return or
// If the limiter allows, we don't need to check the cache.
if self.config.disable_cache || self.limiter.lock().await.check() {
if !self.should_reject(endpoint) {
ctx.set_rejected(false);
return true;
}
!rejected
// report that we might want to reject this endpoint
ctx.set_rejected(true);
// If cache is disabled, just collect the metrics and return.
if self.config.disable_cache {
return true;
}
// If the limiter allows, we can pretend like it's valid
// (in case it is, due to redis channel lag).
if self.limiter.lock().unwrap().check() {
return true;
}
// endpoint not found, and there's too much load.
false
}
fn should_reject(&self, endpoint: &EndpointId) -> bool {
if endpoint.is_endpoint() {
!self.endpoints.contains(&EndpointIdInt::from(endpoint))
let Some(endpoint) = EndpointIdInt::get(endpoint) else {
// if we haven't interned this endpoint, it's not in the cache.
return true;
};
!self.endpoints.contains(&endpoint)
} else if endpoint.is_branch() {
!self
.branches
.contains(&BranchIdInt::from(&endpoint.as_branch()))
let Some(branch) = BranchIdInt::get(endpoint) else {
// if we haven't interned this branch, it's not in the cache.
return true;
};
!self.branches.contains(&branch)
} else {
!self
.projects
.contains(&ProjectIdInt::from(&endpoint.as_project()))
let Some(project) = ProjectIdInt::get(endpoint) else {
// if we haven't interned this project, it's not in the cache.
return true;
};
!self.projects.contains(&project)
}
}
fn insert_event(&self, key: ControlPlaneEventKey) {
// Do not do normalization here, we expect the events to be normalized.
if let Some(endpoint_created) = key.endpoint_created {
self.endpoints
.insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into()));
fn insert_event(&self, event: ControlPlaneEvent) {
if let Some(endpoint_created) = event.endpoint_created {
self.endpoints.insert(endpoint_created.endpoint_id);
Metrics::get()
.proxy
.redis_events_count
.inc(RedisEventsCount::EndpointCreated);
}
if let Some(branch_created) = key.branch_created {
self.branches
.insert(BranchIdInt::from(&branch_created.branch_id.into()));
} else if let Some(branch_created) = event.branch_created {
self.branches.insert(branch_created.branch_id);
Metrics::get()
.proxy
.redis_events_count
.inc(RedisEventsCount::BranchCreated);
}
if let Some(project_created) = key.project_created {
self.projects
.insert(ProjectIdInt::from(&project_created.project_id.into()));
} else if let Some(project_created) = event.project_created {
self.projects.insert(project_created.project_id);
Metrics::get()
.proxy
.redis_events_count
.inc(RedisEventsCount::ProjectCreated);
}
}
pub async fn do_read(
&self,
mut con: ConnectionWithCredentialsProvider,
@@ -131,12 +165,13 @@ impl EndpointsCache {
}
if cancellation_token.is_cancelled() {
info!("cancellation token is cancelled, exiting");
tokio::time::sleep(Duration::from_secs(60 * 60 * 24 * 7)).await;
// 1 week.
// Maintenance tasks run forever. Sleep forever when canceled.
pending::<()>().await;
}
tokio::time::sleep(self.config.retry_interval).await;
}
}
async fn read_from_stream(
&self,
con: &mut ConnectionWithCredentialsProvider,
@@ -162,10 +197,7 @@ impl EndpointsCache {
)
.await
}
fn parse_key_value(value: &Value) -> anyhow::Result<ControlPlaneEventKey> {
let s: String = FromRedisValue::from_redis_value(value)?;
Ok(serde_json::from_str(&s)?)
}
async fn batch_read(
&self,
conn: &mut ConnectionWithCredentialsProvider,
@@ -196,27 +228,25 @@ impl EndpointsCache {
anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name);
}
let res = res.keys.pop().expect("Checked length above");
let len = res.ids.len();
for x in res.ids {
let key = res.keys.pop().expect("Checked length above");
let len = key.ids.len();
for stream_id in key.ids {
total += 1;
for (_, v) in x.map {
let key = match Self::parse_key_value(&v) {
Ok(x) => x,
Err(e) => {
for value in stream_id.map.values() {
match value.try_into() {
Ok(event) => self.insert_event(event),
Err(err) => {
Metrics::get().proxy.redis_errors_total.inc(RedisErrors {
channel: &self.config.stream_name,
});
tracing::error!("error parsing value {v:?}: {e:?}");
continue;
tracing::error!("error parsing value {value:?}: {err:?}");
}
};
self.insert_event(key);
}
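// Log progress only when the running total hits a power of two, to avoid flooding the logs.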
if total.is_power_of_two() {
tracing::debug!("endpoints read {}", total);
}
*last_id = x.id;
*last_id = stream_id.id;
}
if return_when_finish && len <= self.config.default_batch_size {
break;
@@ -229,11 +259,24 @@ impl EndpointsCache {
#[cfg(test)]
mod tests {
use super::ControlPlaneEventKey;
use super::*;
#[test]
fn test() {
let s = "{\"branch_created\":null,\"endpoint_created\":{\"endpoint_id\":\"ep-rapid-thunder-w0qqw2q9\"},\"project_created\":null,\"type\":\"endpoint_created\"}";
serde_json::from_str::<ControlPlaneEventKey>(s).unwrap();
fn test_parse_control_plane_event() {
let s = r#"{"branch_created":null,"endpoint_created":{"endpoint_id":"ep-rapid-thunder-w0qqw2q9"},"project_created":null,"type":"endpoint_created"}"#;
let endpoint_id: EndpointId = "ep-rapid-thunder-w0qqw2q9".into();
assert_eq!(
serde_json::from_str::<ControlPlaneEvent>(s).unwrap(),
ControlPlaneEvent {
endpoint_created: Some(EndpointCreated {
endpoint_id: endpoint_id.into(),
}),
branch_created: None,
project_created: None,
_type: Some("endpoint_created".into()),
}
);
}
}

View File

@@ -19,9 +19,9 @@ use tracing::{error, info, warn};
use crate::auth::parse_endpoint_param;
use crate::cancellation::CancelClosure;
use crate::context::RequestMonitoring;
use crate::control_plane::client::ApiLockError;
use crate::control_plane::errors::WakeComputeError;
use crate::control_plane::messages::MetricsAuxInfo;
use crate::control_plane::provider::ApiLockError;
use crate::error::{ReportableError, UserFacingError};
use crate::metrics::{Metrics, NumDbConnectionsGuard};
use crate::proxy::neon_option;
@@ -135,13 +135,13 @@ impl ConnCfg {
/// Apply startup message params to the connection config.
pub(crate) fn set_startup_params(&mut self, params: &StartupMessageParams) {
// Only set `user` if it's not present in the config.
// Web auth flow takes username from the console's response.
// Console redirect auth flow takes username from the console's response.
if let (None, Some(user)) = (self.get_user(), params.get("user")) {
self.user(user);
}
// Only set `dbname` if it's not present in the config.
// Web auth flow takes dbname from the console's response.
// Console redirect auth flow takes dbname from the console's response.
if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) {
self.dbname(dbname);
}

View File

@@ -78,7 +78,7 @@ pub struct AuthenticationConfig {
pub jwks_cache: JwkCache,
pub is_auth_broker: bool,
pub accept_jwts: bool,
pub webauth_confirmation_timeout: tokio::time::Duration,
pub console_redirect_confirmation_timeout: tokio::time::Duration,
}
impl TlsConfig {
@@ -271,7 +271,7 @@ impl CertResolver {
// auth-broker does not use SNI and instead uses the Neon-Connection-String header.
// Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String.
//
// Console Web proxy does not use any wildcard domains and does not need any certificate selection or conn string
// Console Redirect proxy does not use any wildcard domains and does not need any certificate selection or conn string
// validation, so we can continue with any common name
let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") {
s.to_string()
@@ -366,7 +366,7 @@ pub struct EndpointCacheConfig {
}
impl EndpointCacheConfig {
/// Default options for [`crate::control_plane::provider::NodeInfoCache`].
/// Default options for [`crate::control_plane::NodeInfoCache`].
/// Notice that by default the limiter is empty, which means that cache is disabled.
pub const CACHE_DEFAULT_OPTIONS: &'static str =
"initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s";
@@ -441,7 +441,7 @@ pub struct CacheOptions {
}
impl CacheOptions {
/// Default options for [`crate::control_plane::provider::NodeInfoCache`].
/// Default options for [`crate::control_plane::NodeInfoCache`].
pub const CACHE_DEFAULT_OPTIONS: &'static str = "size=4000,ttl=4m";
/// Parse cache options passed via cmdline.
@@ -497,7 +497,7 @@ pub struct ProjectInfoCacheOptions {
}
impl ProjectInfoCacheOptions {
/// Default options for [`crate::control_plane::provider::NodeInfoCache`].
/// Default options for [`crate::control_plane::NodeInfoCache`].
pub const CACHE_DEFAULT_OPTIONS: &'static str =
"size=10000,ttl=4m,max_roles=10,gc_interval=60m";
@@ -616,9 +616,9 @@ pub struct ConcurrencyLockOptions {
}
impl ConcurrencyLockOptions {
/// Default options for [`crate::control_plane::provider::ApiLocks`].
/// Default options for [`crate::control_plane::client::ApiLocks`].
pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0";
/// Default options for [`crate::control_plane::provider::ApiLocks`].
/// Default options for [`crate::control_plane::client::ApiLocks`].
pub const DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK: &'static str =
"shards=64,permits=100,epoch=10m,timeout=10ms";

View File

@@ -3,7 +3,7 @@ use std::sync::Arc;
use futures::TryFutureExt;
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
use tokio_util::sync::CancellationToken;
use tracing::{error, info, Instrument};
use tracing::{debug, error, info, Instrument};
use crate::auth::backend::ConsoleRedirectBackend;
use crate::cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal};
@@ -11,7 +11,7 @@ use crate::config::{ProxyConfig, ProxyProtocolV2};
use crate::context::RequestMonitoring;
use crate::error::ReportableError;
use crate::metrics::{Metrics, NumClientConnectionsGuard};
use crate::protocol2::{read_proxy_protocol, ConnectionInfo};
use crate::protocol2::{read_proxy_protocol, ConnectHeader, ConnectionInfo};
use crate::proxy::connect_compute::{connect_to_compute, TcpMechanism};
use crate::proxy::handshake::{handshake, HandshakeData};
use crate::proxy::passthrough::ProxyPassthrough;
@@ -49,7 +49,7 @@ pub async fn task_main(
let session_id = uuid::Uuid::new_v4();
let cancellation_handler = Arc::clone(&cancellation_handler);
tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection");
debug!(protocol = "tcp", %session_id, "accepted new TCP connection");
connections.spawn(async move {
let (socket, peer_addr) = match read_proxy_protocol(socket).await {
@@ -57,16 +57,21 @@ pub async fn task_main(
error!("per-client task finished with an error: {e:#}");
return;
}
Ok((_socket, None)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => {
// our load balancers will not send any more data. let's just exit immediately
Ok((_socket, ConnectHeader::Local)) => {
debug!("healthcheck received");
return;
}
Ok((_socket, ConnectHeader::Missing)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => {
error!("missing required proxy protocol header");
return;
}
Ok((_socket, Some(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => {
Ok((_socket, ConnectHeader::Proxy(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => {
error!("proxy protocol header not supported");
return;
}
Ok((socket, Some(info))) => (socket, info),
Ok((socket, None)) => (socket, ConnectionInfo{ addr: peer_addr, extra: None }),
Ok((socket, ConnectHeader::Proxy(info))) => (socket, info),
Ok((socket, ConnectHeader::Missing)) => (socket, ConnectionInfo{ addr: peer_addr, extra: None }),
};
match socket.inner.set_nodelay(true) {

View File

@@ -75,7 +75,7 @@ struct RequestMonitoringInner {
#[derive(Clone, Debug)]
pub(crate) enum AuthMethod {
// aka passwordless, fka link
Web,
ConsoleRedirect,
ScramSha256,
ScramSha256Plus,
Cleartext,

View File

@@ -134,7 +134,7 @@ impl From<&RequestMonitoringInner> for RequestData {
.as_ref()
.and_then(|options| serde_json::to_string(&Options { options }).ok()),
auth_method: value.auth_method.as_ref().map(|x| match x {
super::AuthMethod::Web => "web",
super::AuthMethod::ConsoleRedirect => "console_redirect",
super::AuthMethod::ScramSha256 => "scram_sha_256",
super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus",
super::AuthMethod::Cleartext => "cleartext",

View File

@@ -9,16 +9,17 @@ use tokio_postgres::config::SslMode;
use tokio_postgres::Client;
use tracing::{error, info, info_span, warn, Instrument};
use super::errors::{ApiError, GetAuthInfoError, WakeComputeError};
use super::{AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo};
use crate::auth::backend::jwt::AuthRule;
use crate::auth::backend::ComputeUserInfo;
use crate::auth::IpPattern;
use crate::cache::Cached;
use crate::context::RequestMonitoring;
use crate::control_plane::errors::GetEndpointJwksError;
use crate::control_plane::client::{CachedAllowedIps, CachedRoleSecret};
use crate::control_plane::errors::{
ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError,
};
use crate::control_plane::messages::MetricsAuxInfo;
use crate::control_plane::provider::{CachedAllowedIps, CachedRoleSecret};
use crate::control_plane::{AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo};
use crate::error::io_error;
use crate::intern::RoleNameInt;
use crate::types::{BranchId, EndpointId, ProjectId, RoleName};
@@ -31,25 +32,25 @@ enum MockApiError {
PasswordNotSet(tokio_postgres::Error),
}
impl From<MockApiError> for ApiError {
impl From<MockApiError> for ControlPlaneError {
fn from(e: MockApiError) -> Self {
io_error(e).into()
}
}
impl From<tokio_postgres::Error> for ApiError {
impl From<tokio_postgres::Error> for ControlPlaneError {
fn from(e: tokio_postgres::Error) -> Self {
io_error(e).into()
}
}
#[derive(Clone)]
pub struct Api {
pub struct MockControlPlane {
endpoint: ApiUrl,
ip_allowlist_check_enabled: bool,
}
impl Api {
impl MockControlPlane {
pub fn new(endpoint: ApiUrl, ip_allowlist_check_enabled: bool) -> Self {
Self {
endpoint,
@@ -201,7 +202,7 @@ async fn get_execute_postgres_query(
Ok(Some(entry))
}
impl super::Api for Api {
impl super::ControlPlaneApi for MockControlPlane {
#[tracing::instrument(skip_all)]
async fn get_role_secret(
&self,

View File

@@ -0,0 +1,281 @@
#[cfg(any(test, feature = "testing"))]
pub mod mock;
pub mod neon;
use std::hash::Hash;
use std::sync::Arc;
use std::time::Duration;
use dashmap::DashMap;
use tokio::time::Instant;
use tracing::info;
use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError};
use crate::auth::backend::ComputeUserInfo;
use crate::cache::endpoints::EndpointsCache;
use crate::cache::project_info::ProjectInfoCacheImpl;
use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions};
use crate::context::RequestMonitoring;
use crate::control_plane::{
errors, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, NodeInfoCache,
};
use crate::error::ReportableError;
use crate::metrics::ApiLockMetrics;
use crate::rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token};
use crate::types::EndpointId;
#[non_exhaustive]
#[derive(Clone)]
pub enum ControlPlaneClient {
/// Current Management API (V2).
Neon(neon::NeonControlPlaneClient),
/// Local mock control plane.
#[cfg(any(test, feature = "testing"))]
PostgresMock(mock::MockControlPlane),
/// Internal testing
#[cfg(test)]
#[allow(private_interfaces)]
Test(Box<dyn TestControlPlaneClient>),
}
impl ControlPlaneApi for ControlPlaneClient {
async fn get_role_secret(
&self,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
match self {
Self::Neon(api) => api.get_role_secret(ctx, user_info).await,
#[cfg(any(test, feature = "testing"))]
Self::PostgresMock(api) => api.get_role_secret(ctx, user_info).await,
#[cfg(test)]
Self::Test(_) => {
unreachable!("this function should never be called in the test backend")
}
}
}
async fn get_allowed_ips_and_secret(
&self,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
match self {
Self::Neon(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
#[cfg(any(test, feature = "testing"))]
Self::PostgresMock(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
#[cfg(test)]
Self::Test(api) => api.get_allowed_ips_and_secret(),
}
}
async fn get_endpoint_jwks(
&self,
ctx: &RequestMonitoring,
endpoint: EndpointId,
) -> Result<Vec<AuthRule>, errors::GetEndpointJwksError> {
match self {
Self::Neon(api) => api.get_endpoint_jwks(ctx, endpoint).await,
#[cfg(any(test, feature = "testing"))]
Self::PostgresMock(api) => api.get_endpoint_jwks(ctx, endpoint).await,
#[cfg(test)]
Self::Test(_api) => Ok(vec![]),
}
}
async fn wake_compute(
&self,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, errors::WakeComputeError> {
match self {
Self::Neon(api) => api.wake_compute(ctx, user_info).await,
#[cfg(any(test, feature = "testing"))]
Self::PostgresMock(api) => api.wake_compute(ctx, user_info).await,
#[cfg(test)]
Self::Test(api) => api.wake_compute(),
}
}
}
#[cfg(test)]
pub(crate) trait TestControlPlaneClient: Send + Sync + 'static {
fn wake_compute(&self) -> Result<CachedNodeInfo, errors::WakeComputeError>;
fn get_allowed_ips_and_secret(
&self,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError>;
fn dyn_clone(&self) -> Box<dyn TestControlPlaneClient>;
}
#[cfg(test)]
impl Clone for Box<dyn TestControlPlaneClient> {
fn clone(&self) -> Self {
TestControlPlaneClient::dyn_clone(&**self)
}
}
/// Various caches for [`control_plane`](super).
pub struct ApiCaches {
/// Cache for the `wake_compute` API method.
pub(crate) node_info: NodeInfoCache,
/// Cache which stores project_id -> endpoint_ids mapping.
pub project_info: Arc<ProjectInfoCacheImpl>,
/// List of all valid endpoints.
pub endpoints_cache: Arc<EndpointsCache>,
}
impl ApiCaches {
pub fn new(
wake_compute_cache_config: CacheOptions,
project_info_cache_config: ProjectInfoCacheOptions,
endpoint_cache_config: EndpointCacheConfig,
) -> Self {
Self {
node_info: NodeInfoCache::new(
"node_info_cache",
wake_compute_cache_config.size,
wake_compute_cache_config.ttl,
true,
),
project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)),
endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)),
}
}
}
/// Various caches for [`control_plane`](super).
pub struct ApiLocks<K> {
name: &'static str,
node_locks: DashMap<K, Arc<DynamicLimiter>>,
config: RateLimiterConfig,
timeout: Duration,
epoch: std::time::Duration,
metrics: &'static ApiLockMetrics,
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum ApiLockError {
#[error("timeout acquiring resource permit")]
TimeoutError(#[from] tokio::time::error::Elapsed),
}
impl ReportableError for ApiLockError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
ApiLockError::TimeoutError(_) => crate::error::ErrorKind::RateLimit,
}
}
}
impl<K: Hash + Eq + Clone> ApiLocks<K> {
pub fn new(
name: &'static str,
config: RateLimiterConfig,
shards: usize,
timeout: Duration,
epoch: std::time::Duration,
metrics: &'static ApiLockMetrics,
) -> prometheus::Result<Self> {
Ok(Self {
name,
node_locks: DashMap::with_shard_amount(shards),
config,
timeout,
epoch,
metrics,
})
}
pub(crate) async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, ApiLockError> {
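// An initial limit of 0 means the lock is disabled: hand out a no-op permit without acquiring anything.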
if self.config.initial_limit == 0 {
return Ok(WakeComputePermit {
permit: Token::disabled(),
});
}
let now = Instant::now();
let semaphore = {
// get fast path
if let Some(semaphore) = self.node_locks.get(key) {
semaphore.clone()
} else {
self.node_locks
.entry(key.clone())
.or_insert_with(|| {
self.metrics.semaphores_registered.inc();
DynamicLimiter::new(self.config)
})
.clone()
}
};
let permit = semaphore.acquire_timeout(self.timeout).await;
self.metrics
.semaphore_acquire_seconds
.observe(now.elapsed().as_secs_f64());
info!("acquired permit {:?}", now.elapsed().as_secs_f64());
Ok(WakeComputePermit { permit: permit? })
}
pub async fn garbage_collect_worker(&self) {
if self.config.initial_limit == 0 {
return;
}
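// Visit one shard per tick so that a full sweep over all shards takes roughly one epoch.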
let mut interval =
tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32);
loop {
for (i, shard) in self.node_locks.shards().iter().enumerate() {
interval.tick().await;
// temporarily lock a single shard and then clear any semaphores that aren't currently checked out
// race conditions: if strong_count == 1, there's no way that it can increase while the shard is locked
// therefore releasing it is safe from race conditions
info!(
name = self.name,
shard = i,
"performing epoch reclamation on api lock"
);
let mut lock = shard.write();
let timer = self.metrics.reclamation_lag_seconds.start_timer();
let count = lock
.extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1)
.count();
drop(lock);
self.metrics.semaphores_unregistered.inc_by(count as u64);
timer.observe();
}
}
}
}
pub(crate) struct WakeComputePermit {
permit: Token,
}
impl WakeComputePermit {
pub(crate) fn should_check_cache(&self) -> bool {
!self.permit.is_disabled()
}
pub(crate) fn release(self, outcome: Outcome) {
self.permit.release(outcome);
}
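/// Report the outcome of the guarded operation back to the limiter:
/// `Ok` counts as `Outcome::Success`, `Err` as `Outcome::Overload`.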
pub(crate) fn release_result<T, E>(self, res: Result<T, E>) -> Result<T, E> {
match res {
Ok(_) => self.release(Outcome::Success),
Err(_) => self.release(Outcome::Overload),
}
res
}
}
impl FetchAuthRules for ControlPlaneClient {
async fn fetch_auth_rules(
&self,
ctx: &RequestMonitoring,
endpoint: EndpointId,
) -> Result<Vec<AuthRule>, FetchAuthRulesError> {
self.get_endpoint_jwks(ctx, endpoint)
.await
.map_err(FetchAuthRulesError::GetEndpointJwks)
}
}

View File

@@ -10,18 +10,20 @@ use tokio::time::Instant;
use tokio_postgres::config::SslMode;
use tracing::{debug, info, info_span, warn, Instrument};
use super::super::messages::{ControlPlaneError, GetRoleSecret, WakeCompute};
use super::errors::{ApiError, GetAuthInfoError, WakeComputeError};
use super::{
ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret,
NodeInfo,
};
use super::super::messages::{ControlPlaneErrorMessage, GetRoleSecret, WakeCompute};
use crate::auth::backend::jwt::AuthRule;
use crate::auth::backend::ComputeUserInfo;
use crate::cache::Cached;
use crate::context::RequestMonitoring;
use crate::control_plane::errors::GetEndpointJwksError;
use crate::control_plane::caches::ApiCaches;
use crate::control_plane::errors::{
ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError,
};
use crate::control_plane::locks::ApiLocks;
use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason};
use crate::control_plane::{
AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo,
};
use crate::metrics::{CacheOutcome, Metrics};
use crate::rate_limiter::WakeComputeRateLimiter;
use crate::types::{EndpointCacheKey, EndpointId};
@@ -30,7 +32,7 @@ use crate::{compute, http, scram};
const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id");
#[derive(Clone)]
pub struct Api {
pub struct NeonControlPlaneClient {
endpoint: http::Endpoint,
pub caches: &'static ApiCaches,
pub(crate) locks: &'static ApiLocks<EndpointCacheKey>,
@@ -39,17 +41,15 @@ pub struct Api {
jwt: Arc<str>,
}
impl Api {
impl NeonControlPlaneClient {
/// Construct an API object containing the auth parameters.
pub fn new(
endpoint: http::Endpoint,
jwt: Arc<str>,
caches: &'static ApiCaches,
locks: &'static ApiLocks<EndpointCacheKey>,
wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
) -> Self {
let jwt = std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN")
.unwrap_or_default()
.into();
Self {
endpoint,
caches,
@@ -72,7 +72,6 @@ impl Api {
.caches
.endpoints_cache
.is_valid(ctx, &user_info.endpoint.normalize())
.await
{
info!("endpoint is not valid, skipping the request");
return Ok(AuthInfo::default());
@@ -145,7 +144,6 @@ impl Api {
.caches
.endpoints_cache
.is_valid(ctx, &endpoint.normalize())
.await
{
return Err(GetEndpointJwksError::EndpointNotFound);
}
@@ -256,7 +254,7 @@ impl Api {
}
}
impl super::Api for Api {
impl super::ControlPlaneApi for NeonControlPlaneClient {
#[tracing::instrument(skip_all)]
async fn get_role_secret(
&self,
@@ -356,7 +354,7 @@ impl super::Api for Api {
let (cached, info) = cached.take_value();
let info = info.map_err(|c| {
info!(key = &*key, "found cached wake_compute error");
WakeComputeError::ApiError(ApiError::ControlPlane(Box::new(*c)))
WakeComputeError::ControlPlane(ControlPlaneError::Message(Box::new(*c)))
})?;
debug!(key = &*key, "found cached compute node info");
@@ -403,9 +401,11 @@ impl super::Api for Api {
Ok(cached.map(|()| node))
}
Err(err) => match err {
WakeComputeError::ApiError(ApiError::ControlPlane(err)) => {
WakeComputeError::ControlPlane(ControlPlaneError::Message(err)) => {
let Some(status) = &err.status else {
return Err(WakeComputeError::ApiError(ApiError::ControlPlane(err)));
return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
err,
)));
};
let reason = status
@@ -415,7 +415,9 @@ impl super::Api for Api {
// if we can retry this error, do not cache it.
if reason.can_retry() {
return Err(WakeComputeError::ApiError(ApiError::ControlPlane(err)));
return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
err,
)));
}
// at this point, we should only have quota errors.
@@ -430,7 +432,9 @@ impl super::Api for Api {
Duration::from_secs(30),
);
Err(WakeComputeError::ApiError(ApiError::ControlPlane(err)))
Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
err,
)))
}
err => return Err(err),
},
@@ -441,7 +445,7 @@ impl super::Api for Api {
/// Parse http response body, taking status code into account.
async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
response: http::Response,
) -> Result<T, ApiError> {
) -> Result<T, ControlPlaneError> {
let status = response.status();
if status.is_success() {
// We shouldn't log raw body because it may contain secrets.
@@ -456,7 +460,7 @@ async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
// as the fact that the request itself has failed.
let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| {
warn!("failed to parse error body: {e}");
ControlPlaneError {
ControlPlaneErrorMessage {
error: "reason unclear (malformed error message)".into(),
http_status_code: status,
status: None,
@@ -465,7 +469,7 @@ async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
body.http_status_code = status;
warn!("console responded with an error ({status}): {body:?}");
Err(ApiError::ControlPlane(Box::new(body)))
Err(ControlPlaneError::Message(Box::new(body)))
}
fn parse_host_port(input: &str) -> Option<(&str, u16)> {

View File

@@ -0,0 +1,216 @@
use thiserror::Error;
use crate::control_plane::client::ApiLockError;
use crate::control_plane::messages::{self, ControlPlaneErrorMessage, Reason};
use crate::error::{io_error, ErrorKind, ReportableError, UserFacingError};
use crate::proxy::retry::CouldRetry;
/// A go-to error message which doesn't leak any detail.
pub(crate) const REQUEST_FAILED: &str = "Console request failed";
/// Common console API error.
#[derive(Debug, Error)]
pub(crate) enum ControlPlaneError {
/// Error returned by the console itself.
#[error("{REQUEST_FAILED} with {0}")]
Message(Box<ControlPlaneErrorMessage>),
/// Various IO errors like broken pipe or malformed payload.
#[error("{REQUEST_FAILED}: {0}")]
Transport(#[from] std::io::Error),
}
impl ControlPlaneError {
/// Returns HTTP status code if it's the reason for failure.
pub(crate) fn get_reason(&self) -> messages::Reason {
match self {
ControlPlaneError::Message(e) => e.get_reason(),
ControlPlaneError::Transport(_) => messages::Reason::Unknown,
}
}
}
impl UserFacingError for ControlPlaneError {
fn to_string_client(&self) -> String {
match self {
// To minimize risks, only select errors are forwarded to users.
ControlPlaneError::Message(c) => c.get_user_facing_message(),
ControlPlaneError::Transport(_) => REQUEST_FAILED.to_owned(),
}
}
}
impl ReportableError for ControlPlaneError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
ControlPlaneError::Message(e) => match e.get_reason() {
Reason::RoleProtected => ErrorKind::User,
Reason::ResourceNotFound => ErrorKind::User,
Reason::ProjectNotFound => ErrorKind::User,
Reason::EndpointNotFound => ErrorKind::User,
Reason::BranchNotFound => ErrorKind::User,
Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit,
Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::Quota,
Reason::ActiveTimeQuotaExceeded => ErrorKind::Quota,
Reason::ComputeTimeQuotaExceeded => ErrorKind::Quota,
Reason::WrittenDataQuotaExceeded => ErrorKind::Quota,
Reason::DataTransferQuotaExceeded => ErrorKind::Quota,
Reason::LogicalSizeQuotaExceeded => ErrorKind::Quota,
Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane,
Reason::LockAlreadyTaken => ErrorKind::ControlPlane,
Reason::RunningOperations => ErrorKind::ControlPlane,
Reason::ActiveEndpointsLimitExceeded => ErrorKind::ControlPlane,
Reason::Unknown => ErrorKind::ControlPlane,
},
ControlPlaneError::Transport(_) => crate::error::ErrorKind::ControlPlane,
}
}
}
impl CouldRetry for ControlPlaneError {
fn could_retry(&self) -> bool {
match self {
// retry some transport errors
Self::Transport(io) => io.could_retry(),
Self::Message(e) => e.could_retry(),
}
}
}
impl From<reqwest::Error> for ControlPlaneError {
fn from(e: reqwest::Error) -> Self {
io_error(e).into()
}
}
impl From<reqwest_middleware::Error> for ControlPlaneError {
fn from(e: reqwest_middleware::Error) -> Self {
io_error(e).into()
}
}
#[derive(Debug, Error)]
pub(crate) enum GetAuthInfoError {
// We shouldn't include the actual secret here.
#[error("Console responded with a malformed auth secret")]
BadSecret,
#[error(transparent)]
ApiError(ControlPlaneError),
}
// This allows more useful interactions than `#[from]`.
impl<E: Into<ControlPlaneError>> From<E> for GetAuthInfoError {
fn from(e: E) -> Self {
Self::ApiError(e.into())
}
}
impl UserFacingError for GetAuthInfoError {
fn to_string_client(&self) -> String {
match self {
// We absolutely should not leak any secrets!
Self::BadSecret => REQUEST_FAILED.to_owned(),
// However, API might return a meaningful error.
Self::ApiError(e) => e.to_string_client(),
}
}
}
impl ReportableError for GetAuthInfoError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
Self::BadSecret => crate::error::ErrorKind::ControlPlane,
Self::ApiError(_) => crate::error::ErrorKind::ControlPlane,
}
}
}
#[derive(Debug, Error)]
pub(crate) enum WakeComputeError {
#[error("Console responded with a malformed compute address: {0}")]
BadComputeAddress(Box<str>),
#[error(transparent)]
ControlPlane(ControlPlaneError),
#[error("Too many connections attempts")]
TooManyConnections,
#[error("error acquiring resource permit: {0}")]
TooManyConnectionAttempts(#[from] ApiLockError),
}
// This allows more useful interactions than `#[from]`.
impl<E: Into<ControlPlaneError>> From<E> for WakeComputeError {
fn from(e: E) -> Self {
Self::ControlPlane(e.into())
}
}
impl UserFacingError for WakeComputeError {
fn to_string_client(&self) -> String {
match self {
// We shouldn't show the user the address even if it's broken.
// Besides, the user is unlikely to care about this detail.
Self::BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
// However, control plane might return a meaningful error.
Self::ControlPlane(e) => e.to_string_client(),
Self::TooManyConnections => self.to_string(),
Self::TooManyConnectionAttempts(_) => {
"Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
}
}
}
}
impl ReportableError for WakeComputeError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
Self::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
Self::ControlPlane(e) => e.get_error_kind(),
Self::TooManyConnections => crate::error::ErrorKind::RateLimit,
Self::TooManyConnectionAttempts(e) => e.get_error_kind(),
}
}
}
impl CouldRetry for WakeComputeError {
fn could_retry(&self) -> bool {
match self {
Self::BadComputeAddress(_) => false,
Self::ControlPlane(e) => e.could_retry(),
Self::TooManyConnections => false,
Self::TooManyConnectionAttempts(_) => false,
}
}
}
#[derive(Debug, Error)]
pub enum GetEndpointJwksError {
#[error("endpoint not found")]
EndpointNotFound,
#[error("failed to build control plane request: {0}")]
RequestBuild(#[source] reqwest::Error),
#[error("failed to send control plane request: {0}")]
RequestExecute(#[source] reqwest_middleware::Error),
#[error(transparent)]
ControlPlane(#[from] ControlPlaneError),
#[cfg(any(test, feature = "testing"))]
#[error(transparent)]
TokioPostgres(#[from] tokio_postgres::Error),
#[cfg(any(test, feature = "testing"))]
#[error(transparent)]
ParseUrl(#[from] url::ParseError),
#[cfg(any(test, feature = "testing"))]
#[error(transparent)]
TaskJoin(#[from] tokio::task::JoinError),
}

View File

@@ -10,14 +10,14 @@ use crate::proxy::retry::CouldRetry;
/// Generic error response with human-readable description.
/// Note that we can't always present it to the user as is.
#[derive(Debug, Deserialize, Clone)]
pub(crate) struct ControlPlaneError {
pub(crate) struct ControlPlaneErrorMessage {
pub(crate) error: Box<str>,
#[serde(skip)]
pub(crate) http_status_code: http::StatusCode,
pub(crate) status: Option<Status>,
}
impl ControlPlaneError {
impl ControlPlaneErrorMessage {
pub(crate) fn get_reason(&self) -> Reason {
self.status
.as_ref()
@@ -26,7 +26,7 @@ impl ControlPlaneError {
}
pub(crate) fn get_user_facing_message(&self) -> String {
use super::provider::errors::REQUEST_FAILED;
use super::errors::REQUEST_FAILED;
self.status
.as_ref()
.and_then(|s| s.details.user_facing_message.as_ref())
@@ -51,7 +51,7 @@ impl ControlPlaneError {
}
}
impl Display for ControlPlaneError {
impl Display for ControlPlaneErrorMessage {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let msg: &str = self
.status
@@ -62,7 +62,7 @@ impl Display for ControlPlaneError {
}
}
impl CouldRetry for ControlPlaneError {
impl CouldRetry for ControlPlaneErrorMessage {
fn could_retry(&self) -> bool {
// If the error message does not have a status,
// the error is unknown and probably should not retry automatically
@@ -245,7 +245,7 @@ pub(crate) struct WakeCompute {
pub(crate) aux: MetricsAuxInfo,
}
/// Async response which concludes the web auth flow.
/// Async response which concludes the console redirect auth flow.
/// Also known as `kickResponse` in the console.
#[derive(Debug, Deserialize)]
pub(crate) struct KickSession<'a> {

View File

@@ -24,8 +24,8 @@ pub(crate) fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), wai
CPLANE_WAITERS.notify(psql_session_id, msg)
}
/// Console management API listener task.
/// It spawns console response handlers needed for the web auth.
/// Management API listener task.
/// It spawns management response handlers needed for the console redirect auth flow.
pub async fn task_main(listener: TcpListener) -> anyhow::Result<Infallible> {
scopeguard::defer! {
info!("mgmt has shut down");
@@ -43,13 +43,13 @@ pub async fn task_main(listener: TcpListener) -> anyhow::Result<Infallible> {
tokio::task::spawn(
async move {
info!("serving a new console management API connection");
info!("serving a new management API connection");
// these might be long-running connections; use separate logging for cancellation
// on shutdown and other ways of stopping.
let cancelled = scopeguard::guard(tracing::Span::current(), |span| {
let _e = span.entered();
info!("console management API task cancelled");
info!("management API task cancelled");
});
if let Err(e) = handle_connection(socket).await {

View File

@@ -5,18 +5,137 @@
pub mod messages;
/// Wrappers for console APIs and their mocks.
pub mod provider;
pub(crate) use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo};
pub mod client;
pub(crate) mod errors;
use std::sync::Arc;
use std::time::Duration;
use crate::auth::backend::jwt::AuthRule;
use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo};
use crate::auth::IpPattern;
use crate::cache::project_info::ProjectInfoCacheImpl;
use crate::cache::{Cached, TimedLru};
use crate::context::RequestMonitoring;
use crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo};
use crate::intern::ProjectIdInt;
use crate::types::{EndpointCacheKey, EndpointId};
use crate::{compute, scram};
/// Various cache-related types.
pub mod caches {
pub use super::provider::ApiCaches;
pub use super::client::ApiCaches;
}
/// Various lock-related types.
pub mod locks {
pub use super::provider::ApiLocks;
pub use super::client::ApiLocks;
}
/// Console's management API.
pub mod mgmt;
/// Auth secret which is managed by the cloud.
#[derive(Clone, Eq, PartialEq, Debug)]
pub(crate) enum AuthSecret {
#[cfg(any(test, feature = "testing"))]
/// Md5 hash of user's password.
Md5([u8; 16]),
/// [SCRAM](crate::scram) authentication info.
Scram(scram::ServerSecret),
}
#[derive(Default)]
pub(crate) struct AuthInfo {
pub(crate) secret: Option<AuthSecret>,
/// List of IP addresses allowed for authorization.
pub(crate) allowed_ips: Vec<IpPattern>,
/// Project ID. This is used for cache invalidation.
pub(crate) project_id: Option<ProjectIdInt>,
}
/// Info for establishing a connection to a compute node.
/// This is what we get after auth succeeded, but not before!
#[derive(Clone)]
pub(crate) struct NodeInfo {
/// Compute node connection params.
/// It's sad that we have to clone this, but this will improve
/// once we migrate to bespoke connection logic.
pub(crate) config: compute::ConnCfg,
/// Labels for proxy's metrics.
pub(crate) aux: MetricsAuxInfo,
/// Whether we should accept self-signed certificates (for testing)
pub(crate) allow_self_signed_compute: bool,
}
impl NodeInfo {
pub(crate) async fn connect(
&self,
ctx: &RequestMonitoring,
timeout: Duration,
) -> Result<compute::PostgresConnection, compute::ConnectionError> {
self.config
.connect(
ctx,
self.allow_self_signed_compute,
self.aux.clone(),
timeout,
)
.await
}
pub(crate) fn reuse_settings(&mut self, other: Self) {
self.allow_self_signed_compute = other.allow_self_signed_compute;
self.config.reuse_password(other.config);
}
pub(crate) fn set_keys(&mut self, keys: &ComputeCredentialKeys) {
match keys {
#[cfg(any(test, feature = "testing"))]
ComputeCredentialKeys::Password(password) => self.config.password(password),
ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys),
ComputeCredentialKeys::JwtPayload(_) | ComputeCredentialKeys::None => &mut self.config,
};
}
}
pub(crate) type NodeInfoCache =
TimedLru<EndpointCacheKey, Result<NodeInfo, Box<ControlPlaneErrorMessage>>>;
pub(crate) type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>;
pub(crate) type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option<AuthSecret>>;
pub(crate) type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPattern>>>;
/// This will allocate on each call, but the HTTP requests alone
/// already require a few allocations, so it should be fine.
pub(crate) trait ControlPlaneApi {
/// Get the client's auth secret for authentication.
/// Returns an option because the user-not-found case is special:
/// we still have to mock SCRAM to avoid leaking that the user doesn't exist.
async fn get_role_secret(
&self,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, errors::GetAuthInfoError>;
async fn get_allowed_ips_and_secret(
&self,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError>;
async fn get_endpoint_jwks(
&self,
ctx: &RequestMonitoring,
endpoint: EndpointId,
) -> Result<Vec<AuthRule>, errors::GetEndpointJwksError>;
/// Wake up the compute node and return the corresponding connection info.
async fn wake_compute(
&self,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, errors::WakeComputeError>;
}
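// The NodeInfoCache alias above stores a Result, so failed wake_compute results
// can be cached alongside successes and age out on the same TTL. A standalone
// sketch of that negative-caching shape with stand-in types (not the real TimedLru):
use std::collections::HashMap;
use std::time::{Duration, Instant};

type CachedWake = Result<String /* compute address */, String /* control plane error */>;

struct NegativeCache {
    ttl: Duration,
    map: HashMap<String, (Instant, CachedWake)>,
}

impl NegativeCache {
    fn get(&self, endpoint: &str) -> Option<&CachedWake> {
        let (inserted_at, value) = self.map.get(endpoint)?;
        // expired entries count as misses; Ok and Err entries age out identically
        (inserted_at.elapsed() < self.ttl).then_some(value)
    }

    fn insert(&mut self, endpoint: String, value: CachedWake) {
        self.map.insert(endpoint, (Instant::now(), value));
    }
}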

View File

@@ -1,588 +0,0 @@
#[cfg(any(test, feature = "testing"))]
pub mod mock;
pub mod neon;
use std::hash::Hash;
use std::sync::Arc;
use std::time::Duration;
use dashmap::DashMap;
use tokio::time::Instant;
use tracing::info;
use super::messages::{ControlPlaneError, MetricsAuxInfo};
use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError};
use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo};
use crate::auth::IpPattern;
use crate::cache::endpoints::EndpointsCache;
use crate::cache::project_info::ProjectInfoCacheImpl;
use crate::cache::{Cached, TimedLru};
use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions};
use crate::context::RequestMonitoring;
use crate::error::ReportableError;
use crate::intern::ProjectIdInt;
use crate::metrics::ApiLockMetrics;
use crate::rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token};
use crate::types::{EndpointCacheKey, EndpointId};
use crate::{compute, scram};
pub(crate) mod errors {
use thiserror::Error;
use super::ApiLockError;
use crate::control_plane::messages::{self, ControlPlaneError, Reason};
use crate::error::{io_error, ErrorKind, ReportableError, UserFacingError};
use crate::proxy::retry::CouldRetry;
/// A go-to error message which doesn't leak any detail.
pub(crate) const REQUEST_FAILED: &str = "Console request failed";
/// Common console API error.
#[derive(Debug, Error)]
pub(crate) enum ApiError {
/// Error returned by the console itself.
#[error("{REQUEST_FAILED} with {0}")]
ControlPlane(Box<ControlPlaneError>),
/// Various IO errors like broken pipe or malformed payload.
#[error("{REQUEST_FAILED}: {0}")]
Transport(#[from] std::io::Error),
}
impl ApiError {
/// Returns HTTP status code if it's the reason for failure.
pub(crate) fn get_reason(&self) -> messages::Reason {
match self {
ApiError::ControlPlane(e) => e.get_reason(),
ApiError::Transport(_) => messages::Reason::Unknown,
}
}
}
impl UserFacingError for ApiError {
fn to_string_client(&self) -> String {
match self {
// To minimize risks, only select errors are forwarded to users.
ApiError::ControlPlane(c) => c.get_user_facing_message(),
ApiError::Transport(_) => REQUEST_FAILED.to_owned(),
}
}
}
impl ReportableError for ApiError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
ApiError::ControlPlane(e) => match e.get_reason() {
Reason::RoleProtected => ErrorKind::User,
Reason::ResourceNotFound => ErrorKind::User,
Reason::ProjectNotFound => ErrorKind::User,
Reason::EndpointNotFound => ErrorKind::User,
Reason::BranchNotFound => ErrorKind::User,
Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit,
Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::Quota,
Reason::ActiveTimeQuotaExceeded => ErrorKind::Quota,
Reason::ComputeTimeQuotaExceeded => ErrorKind::Quota,
Reason::WrittenDataQuotaExceeded => ErrorKind::Quota,
Reason::DataTransferQuotaExceeded => ErrorKind::Quota,
Reason::LogicalSizeQuotaExceeded => ErrorKind::Quota,
Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane,
Reason::LockAlreadyTaken => ErrorKind::ControlPlane,
Reason::RunningOperations => ErrorKind::ControlPlane,
Reason::ActiveEndpointsLimitExceeded => ErrorKind::ControlPlane,
Reason::Unknown => ErrorKind::ControlPlane,
},
ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane,
}
}
}
impl CouldRetry for ApiError {
fn could_retry(&self) -> bool {
match self {
// retry some transport errors
Self::Transport(io) => io.could_retry(),
Self::ControlPlane(e) => e.could_retry(),
}
}
}
impl From<reqwest::Error> for ApiError {
fn from(e: reqwest::Error) -> Self {
io_error(e).into()
}
}
impl From<reqwest_middleware::Error> for ApiError {
fn from(e: reqwest_middleware::Error) -> Self {
io_error(e).into()
}
}
#[derive(Debug, Error)]
pub(crate) enum GetAuthInfoError {
// We shouldn't include the actual secret here.
#[error("Console responded with a malformed auth secret")]
BadSecret,
#[error(transparent)]
ApiError(ApiError),
}
// This allows more useful interactions than `#[from]`.
impl<E: Into<ApiError>> From<E> for GetAuthInfoError {
fn from(e: E) -> Self {
Self::ApiError(e.into())
}
}
impl UserFacingError for GetAuthInfoError {
fn to_string_client(&self) -> String {
match self {
// We absolutely should not leak any secrets!
Self::BadSecret => REQUEST_FAILED.to_owned(),
// However, API might return a meaningful error.
Self::ApiError(e) => e.to_string_client(),
}
}
}
impl ReportableError for GetAuthInfoError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
Self::BadSecret => crate::error::ErrorKind::ControlPlane,
Self::ApiError(_) => crate::error::ErrorKind::ControlPlane,
}
}
}
#[derive(Debug, Error)]
pub(crate) enum WakeComputeError {
#[error("Console responded with a malformed compute address: {0}")]
BadComputeAddress(Box<str>),
#[error(transparent)]
ApiError(ApiError),
#[error("Too many connections attempts")]
TooManyConnections,
#[error("error acquiring resource permit: {0}")]
TooManyConnectionAttempts(#[from] ApiLockError),
}
// This allows more useful interactions than `#[from]`.
impl<E: Into<ApiError>> From<E> for WakeComputeError {
fn from(e: E) -> Self {
Self::ApiError(e.into())
}
}
impl UserFacingError for WakeComputeError {
fn to_string_client(&self) -> String {
match self {
// We shouldn't show user the address even if it's broken.
// Besides, user is unlikely to care about this detail.
Self::BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
// However, API might return a meaningful error.
Self::ApiError(e) => e.to_string_client(),
Self::TooManyConnections => self.to_string(),
Self::TooManyConnectionAttempts(_) => {
"Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
}
}
}
}
impl ReportableError for WakeComputeError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
Self::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
Self::ApiError(e) => e.get_error_kind(),
Self::TooManyConnections => crate::error::ErrorKind::RateLimit,
Self::TooManyConnectionAttempts(e) => e.get_error_kind(),
}
}
}
impl CouldRetry for WakeComputeError {
fn could_retry(&self) -> bool {
match self {
Self::BadComputeAddress(_) => false,
Self::ApiError(e) => e.could_retry(),
Self::TooManyConnections => false,
Self::TooManyConnectionAttempts(_) => false,
}
}
}
#[derive(Debug, Error)]
pub enum GetEndpointJwksError {
#[error("endpoint not found")]
EndpointNotFound,
#[error("failed to build control plane request: {0}")]
RequestBuild(#[source] reqwest::Error),
#[error("failed to send control plane request: {0}")]
RequestExecute(#[source] reqwest_middleware::Error),
#[error(transparent)]
ControlPlane(#[from] ApiError),
#[cfg(any(test, feature = "testing"))]
#[error(transparent)]
TokioPostgres(#[from] tokio_postgres::Error),
#[cfg(any(test, feature = "testing"))]
#[error(transparent)]
ParseUrl(#[from] url::ParseError),
#[cfg(any(test, feature = "testing"))]
#[error(transparent)]
TaskJoin(#[from] tokio::task::JoinError),
}
}
/// Auth secret which is managed by the cloud.
#[derive(Clone, Eq, PartialEq, Debug)]
pub(crate) enum AuthSecret {
#[cfg(any(test, feature = "testing"))]
/// Md5 hash of user's password.
Md5([u8; 16]),
/// [SCRAM](crate::scram) authentication info.
Scram(scram::ServerSecret),
}
#[derive(Default)]
pub(crate) struct AuthInfo {
pub(crate) secret: Option<AuthSecret>,
/// List of IP addresses allowed for authorization.
pub(crate) allowed_ips: Vec<IpPattern>,
/// Project ID. This is used for cache invalidation.
pub(crate) project_id: Option<ProjectIdInt>,
}
/// Info for establishing a connection to a compute node.
/// This is what we get after auth succeeded, but not before!
#[derive(Clone)]
pub(crate) struct NodeInfo {
/// Compute node connection params.
/// It's sad that we have to clone this, but this will improve
/// once we migrate to bespoke connection logic.
pub(crate) config: compute::ConnCfg,
/// Labels for proxy's metrics.
pub(crate) aux: MetricsAuxInfo,
/// Whether we should accept self-signed certificates (for testing)
pub(crate) allow_self_signed_compute: bool,
}
impl NodeInfo {
pub(crate) async fn connect(
&self,
ctx: &RequestMonitoring,
timeout: Duration,
) -> Result<compute::PostgresConnection, compute::ConnectionError> {
self.config
.connect(
ctx,
self.allow_self_signed_compute,
self.aux.clone(),
timeout,
)
.await
}
pub(crate) fn reuse_settings(&mut self, other: Self) {
self.allow_self_signed_compute = other.allow_self_signed_compute;
self.config.reuse_password(other.config);
}
pub(crate) fn set_keys(&mut self, keys: &ComputeCredentialKeys) {
match keys {
#[cfg(any(test, feature = "testing"))]
ComputeCredentialKeys::Password(password) => self.config.password(password),
ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys),
ComputeCredentialKeys::JwtPayload(_) | ComputeCredentialKeys::None => &mut self.config,
};
}
}
pub(crate) type NodeInfoCache =
TimedLru<EndpointCacheKey, Result<NodeInfo, Box<ControlPlaneError>>>;
pub(crate) type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>;
pub(crate) type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option<AuthSecret>>;
pub(crate) type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPattern>>>;
/// This will allocate on each call, but the HTTP requests alone
/// already require a few allocations, so it should be fine.
pub(crate) trait Api {
/// Get the client's auth secret for authentication.
/// Returns an option because the user-not-found case is special:
/// we still have to mock SCRAM to avoid leaking that the user doesn't exist.
async fn get_role_secret(
&self,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, errors::GetAuthInfoError>;
async fn get_allowed_ips_and_secret(
&self,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError>;
async fn get_endpoint_jwks(
&self,
ctx: &RequestMonitoring,
endpoint: EndpointId,
) -> Result<Vec<AuthRule>, errors::GetEndpointJwksError>;
/// Wake up the compute node and return the corresponding connection info.
async fn wake_compute(
&self,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, errors::WakeComputeError>;
}
#[non_exhaustive]
#[derive(Clone)]
pub enum ControlPlaneBackend {
/// Current Management API (V2).
Management(neon::Api),
/// Local mock control plane.
#[cfg(any(test, feature = "testing"))]
PostgresMock(mock::Api),
/// Internal testing
#[cfg(test)]
#[allow(private_interfaces)]
Test(Box<dyn crate::auth::backend::TestBackend>),
}
impl Api for ControlPlaneBackend {
async fn get_role_secret(
&self,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
match self {
Self::Management(api) => api.get_role_secret(ctx, user_info).await,
#[cfg(any(test, feature = "testing"))]
Self::PostgresMock(api) => api.get_role_secret(ctx, user_info).await,
#[cfg(test)]
Self::Test(_) => {
unreachable!("this function should never be called in the test backend")
}
}
}
async fn get_allowed_ips_and_secret(
&self,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
match self {
Self::Management(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
#[cfg(any(test, feature = "testing"))]
Self::PostgresMock(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
#[cfg(test)]
Self::Test(api) => api.get_allowed_ips_and_secret(),
}
}
async fn get_endpoint_jwks(
&self,
ctx: &RequestMonitoring,
endpoint: EndpointId,
) -> Result<Vec<AuthRule>, errors::GetEndpointJwksError> {
match self {
Self::Management(api) => api.get_endpoint_jwks(ctx, endpoint).await,
#[cfg(any(test, feature = "testing"))]
Self::PostgresMock(api) => api.get_endpoint_jwks(ctx, endpoint).await,
#[cfg(test)]
Self::Test(_api) => Ok(vec![]),
}
}
async fn wake_compute(
&self,
ctx: &RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, errors::WakeComputeError> {
match self {
Self::Management(api) => api.wake_compute(ctx, user_info).await,
#[cfg(any(test, feature = "testing"))]
Self::PostgresMock(api) => api.wake_compute(ctx, user_info).await,
#[cfg(test)]
Self::Test(api) => api.wake_compute(),
}
}
}
/// Various caches for [`control_plane`](super).
pub struct ApiCaches {
/// Cache for the `wake_compute` API method.
pub(crate) node_info: NodeInfoCache,
/// Cache which stores project_id -> endpoint_ids mapping.
pub project_info: Arc<ProjectInfoCacheImpl>,
/// List of all valid endpoints.
pub endpoints_cache: Arc<EndpointsCache>,
}
impl ApiCaches {
pub fn new(
wake_compute_cache_config: CacheOptions,
project_info_cache_config: ProjectInfoCacheOptions,
endpoint_cache_config: EndpointCacheConfig,
) -> Self {
Self {
node_info: NodeInfoCache::new(
"node_info_cache",
wake_compute_cache_config.size,
wake_compute_cache_config.ttl,
true,
),
project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)),
endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)),
}
}
}
/// Per-key concurrency limits for [`control_plane`](super) requests.
pub struct ApiLocks<K> {
name: &'static str,
node_locks: DashMap<K, Arc<DynamicLimiter>>,
config: RateLimiterConfig,
timeout: Duration,
epoch: std::time::Duration,
metrics: &'static ApiLockMetrics,
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum ApiLockError {
#[error("timeout acquiring resource permit")]
TimeoutError(#[from] tokio::time::error::Elapsed),
}
impl ReportableError for ApiLockError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
ApiLockError::TimeoutError(_) => crate::error::ErrorKind::RateLimit,
}
}
}
impl<K: Hash + Eq + Clone> ApiLocks<K> {
pub fn new(
name: &'static str,
config: RateLimiterConfig,
shards: usize,
timeout: Duration,
epoch: std::time::Duration,
metrics: &'static ApiLockMetrics,
) -> prometheus::Result<Self> {
Ok(Self {
name,
node_locks: DashMap::with_shard_amount(shards),
config,
timeout,
epoch,
metrics,
})
}
pub(crate) async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, ApiLockError> {
if self.config.initial_limit == 0 {
return Ok(WakeComputePermit {
permit: Token::disabled(),
});
}
let now = Instant::now();
let semaphore = {
// get fast path
if let Some(semaphore) = self.node_locks.get(key) {
semaphore.clone()
} else {
self.node_locks
.entry(key.clone())
.or_insert_with(|| {
self.metrics.semaphores_registered.inc();
DynamicLimiter::new(self.config)
})
.clone()
}
};
let permit = semaphore.acquire_timeout(self.timeout).await;
self.metrics
.semaphore_acquire_seconds
.observe(now.elapsed().as_secs_f64());
info!("acquired permit {:?}", now.elapsed().as_secs_f64());
Ok(WakeComputePermit { permit: permit? })
}
pub async fn garbage_collect_worker(&self) {
if self.config.initial_limit == 0 {
return;
}
let mut interval =
tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32);
loop {
for (i, shard) in self.node_locks.shards().iter().enumerate() {
interval.tick().await;
// temporarily lock a single shard and then clear any semaphores that aren't currently checked out
// race conditions: if strong_count == 1, the count cannot increase while the shard is locked,
// therefore releasing the entry is safe from races
info!(
name = self.name,
shard = i,
"performing epoch reclamation on api lock"
);
let mut lock = shard.write();
let timer = self.metrics.reclamation_lag_seconds.start_timer();
let count = lock
.extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1)
.count();
drop(lock);
self.metrics.semaphores_unregistered.inc_by(count as u64);
timer.observe();
}
}
}
}
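// A standalone sketch of the reclamation predicate used in the epoch worker above:
// an entry whose Arc strong count is 1 is referenced only by the map itself, and
// the count cannot grow while the shard is write-locked, so the entry can be
// dropped safely. Stand-in types only, not the real DashMap shards or limiters.
use std::collections::HashMap;
use std::sync::Arc;

fn reclaim_unused(locks: &mut HashMap<String, Arc<()>>) -> usize {
    let before = locks.len();
    // keep only entries that some permit holder still references
    locks.retain(|_, semaphore| Arc::strong_count(semaphore) > 1);
    before - locks.len()
}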
pub(crate) struct WakeComputePermit {
permit: Token,
}
impl WakeComputePermit {
pub(crate) fn should_check_cache(&self) -> bool {
!self.permit.is_disabled()
}
pub(crate) fn release(self, outcome: Outcome) {
self.permit.release(outcome);
}
pub(crate) fn release_result<T, E>(self, res: Result<T, E>) -> Result<T, E> {
match res {
Ok(_) => self.release(Outcome::Success),
Err(_) => self.release(Outcome::Overload),
}
res
}
}
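// The permit release pattern above in standalone form: the guarded call's result
// is translated into a Success/Overload signal for the dynamic limiter. The types
// here are stand-ins, not the real Token or rate limiter.
#[derive(Debug)]
enum Outcome {
    Success,
    Overload,
}

struct Permit;

impl Permit {
    fn release(self, outcome: Outcome) {
        // the real implementation would adjust the concurrency limit here
        println!("released with {outcome:?}");
    }

    fn release_result<T, E>(self, res: Result<T, E>) -> Result<T, E> {
        match res {
            Ok(_) => self.release(Outcome::Success),
            Err(_) => self.release(Outcome::Overload),
        }
        res
    }
}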
impl FetchAuthRules for ControlPlaneBackend {
async fn fetch_auth_rules(
&self,
ctx: &RequestMonitoring,
endpoint: EndpointId,
) -> Result<Vec<AuthRule>, FetchAuthRulesError> {
self.get_endpoint_jwks(ctx, endpoint)
.await
.map_err(FetchAuthRulesError::GetEndpointJwks)
}
}

View File

@@ -6,7 +6,6 @@ pub mod health_server;
use std::time::Duration;
use anyhow::bail;
use bytes::Bytes;
use http::Method;
use http_body_util::BodyExt;
@@ -16,7 +15,7 @@ use reqwest_middleware::RequestBuilder;
pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error};
pub(crate) use reqwest_retry::policies::ExponentialBackoff;
pub(crate) use reqwest_retry::RetryTransientMiddleware;
use serde::de::DeserializeOwned;
use thiserror::Error;
use crate::metrics::{ConsoleRequest, Metrics};
use crate::url::ApiUrl;
@@ -122,10 +121,19 @@ impl Endpoint {
}
}
pub(crate) async fn parse_json_body_with_limit<D: DeserializeOwned>(
#[derive(Error, Debug)]
pub(crate) enum ReadBodyError {
#[error("Content length exceeds limit of {limit} bytes")]
BodyTooLarge { limit: usize },
#[error(transparent)]
Read(#[from] reqwest::Error),
}
pub(crate) async fn read_body_with_limit(
mut b: impl Body<Data = Bytes, Error = reqwest::Error> + Unpin,
limit: usize,
) -> anyhow::Result<D> {
) -> Result<Vec<u8>, ReadBodyError> {
// We could use `b.limited().collect().await.to_bytes()` here
// but this ends up being slightly more efficient as far as I can tell.
@@ -133,20 +141,20 @@ pub(crate) async fn parse_json_body_with_limit<D: DeserializeOwned>(
// in reqwest, this value is influenced by the Content-Length header.
let lower_bound = match usize::try_from(b.size_hint().lower()) {
Ok(bound) if bound <= limit => bound,
_ => bail!("Content length exceeds limit of {limit} bytes"),
_ => return Err(ReadBodyError::BodyTooLarge { limit }),
};
let mut bytes = Vec::with_capacity(lower_bound);
while let Some(frame) = b.frame().await.transpose()? {
if let Ok(data) = frame.into_data() {
if bytes.len() + data.len() > limit {
bail!("Content length exceeds limit of {limit} bytes")
return Err(ReadBodyError::BodyTooLarge { limit });
}
bytes.extend_from_slice(&data);
}
}
Ok(serde_json::from_slice::<D>(&bytes)?)
Ok(bytes)
}
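// A standalone sketch of the two-stage limit check above, with stand-in types:
// reject early when the declared lower bound (derived from Content-Length)
// already exceeds the limit, then enforce the limit again for each received
// frame. Callers that want JSON presumably run serde_json::from_slice on the
// returned bytes themselves, since this helper no longer deserializes.
struct FakeBody {
    declared_len: usize,
    frames: Vec<Vec<u8>>,
}

fn read_with_limit(body: FakeBody, limit: usize) -> Result<Vec<u8>, String> {
    if body.declared_len > limit {
        return Err(format!("Content length exceeds limit of {limit} bytes"));
    }
    let mut bytes = Vec::with_capacity(body.declared_len);
    for frame in body.frames {
        if bytes.len() + frame.len() > limit {
            return Err(format!("Content length exceeds limit of {limit} bytes"));
        }
        bytes.extend_from_slice(&frame);
    }
    Ok(bytes)
}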
#[cfg(test)]

View File

@@ -1,12 +1,6 @@
// rustc lints/lint groups
// https://doc.rust-lang.org/rustc/lints/groups.html
#![deny(
deprecated,
future_incompatible,
let_underscore,
nonstandard_style,
rust_2024_compatibility
)]
#![deny(deprecated, future_incompatible, let_underscore, nonstandard_style)]
#![warn(clippy::all, clippy::pedantic, clippy::cargo)]
// List of denied lints from the clippy::restriction group.
// https://rust-lang.github.io/rust-clippy/master/index.html#?groups=restriction

View File

@@ -18,6 +18,7 @@ pub async fn init() -> anyhow::Result<LoggingGuard> {
let env_filter = EnvFilter::builder()
.with_default_directive(LevelFilter::INFO.into())
.from_env_lossy()
.add_directive("aws_config=info".parse().unwrap())
.add_directive("azure_core::policies::transport=off".parse().unwrap());
let fmt_layer = tracing_subscriber::fmt::layer()

View File

@@ -11,6 +11,7 @@ use bytes::{Buf, Bytes, BytesMut};
use pin_project_lite::pin_project;
use strum_macros::FromRepr;
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf};
use zerocopy::{FromBytes, FromZeroes};
pin_project! {
/// A chained [`AsyncRead`] with [`AsyncWrite`] passthrough
@@ -57,16 +58,31 @@ impl<T: AsyncWrite> AsyncWrite for ChainRW<T> {
}
/// Proxy Protocol Version 2 Header
const HEADER: [u8; 12] = [
const SIGNATURE: [u8; 12] = [
0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A,
];
const LOCAL_V2: u8 = 0x20;
const PROXY_V2: u8 = 0x21;
const TCP_OVER_IPV4: u8 = 0x11;
const UDP_OVER_IPV4: u8 = 0x12;
const TCP_OVER_IPV6: u8 = 0x21;
const UDP_OVER_IPV6: u8 = 0x22;
#[derive(PartialEq, Eq, Clone, Debug)]
pub struct ConnectionInfo {
pub addr: SocketAddr,
pub extra: Option<ConnectionInfoExtra>,
}
#[derive(PartialEq, Eq, Clone, Debug)]
pub enum ConnectHeader {
Missing,
Local,
Proxy(ConnectionInfo),
}
impl fmt::Display for ConnectionInfo {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match &self.extra {
@@ -89,96 +105,31 @@ pub enum ConnectionInfoExtra {
pub(crate) async fn read_proxy_protocol<T: AsyncRead + Unpin>(
mut read: T,
) -> std::io::Result<(ChainRW<T>, Option<ConnectionInfo>)> {
) -> std::io::Result<(ChainRW<T>, ConnectHeader)> {
let mut buf = BytesMut::with_capacity(128);
while buf.len() < 16 {
let header = loop {
let bytes_read = read.read_buf(&mut buf).await?;
// exit for bad header
let len = usize::min(buf.len(), HEADER.len());
if buf[..len] != HEADER[..len] {
return Ok((ChainRW { inner: read, buf }, None));
// exit for bad header signature
let len = usize::min(buf.len(), SIGNATURE.len());
if buf[..len] != SIGNATURE[..len] {
return Ok((ChainRW { inner: read, buf }, ConnectHeader::Missing));
}
// if no more bytes available then exit
if bytes_read == 0 {
return Ok((ChainRW { inner: read, buf }, None));
return Ok((ChainRW { inner: read, buf }, ConnectHeader::Missing));
};
}
let header = buf.split_to(16);
// The next byte (the 13th one) is the protocol version and command.
// The highest four bits contains the version. As of this specification, it must
// always be sent as \x2 and the receiver must only accept this value.
let vc = header[12];
let version = vc >> 4;
let command = vc & 0b1111;
if version != 2 {
return Err(io::Error::new(
io::ErrorKind::Other,
"invalid proxy protocol version. expected version 2",
));
}
match command {
// the connection was established on purpose by the proxy
// without being relayed. The connection endpoints are the sender and the
// receiver. Such connections exist when the proxy sends health-checks to the
// server. The receiver must accept this connection as valid and must use the
// real connection endpoints and discard the protocol block including the
// family which is ignored.
0 => {}
// the connection was established on behalf of another node,
// and reflects the original connection endpoints. The receiver must then use
// the information provided in the protocol block to get the original address.
1 => {}
// other values are unassigned and must not be emitted by senders. Receivers
// must drop connections presenting unexpected values here.
_ => {
return Err(io::Error::new(
io::ErrorKind::Other,
"invalid proxy protocol command. expected local (0) or proxy (1)",
))
// check if we have enough bytes to continue
if let Some(header) = buf.try_get::<ProxyProtocolV2Header>() {
break header;
}
};
// The 14th byte contains the transport protocol and address family. The highest 4
// bits contain the address family, the lowest 4 bits contain the protocol.
let ft = header[13];
let address_length = match ft {
// - \x11 : TCP over IPv4 : the forwarded connection uses TCP over the AF_INET
// protocol family. Address length is 2*4 + 2*2 = 12 bytes.
// - \x12 : UDP over IPv4 : the forwarded connection uses UDP over the AF_INET
// protocol family. Address length is 2*4 + 2*2 = 12 bytes.
0x11 | 0x12 => 12,
// - \x21 : TCP over IPv6 : the forwarded connection uses TCP over the AF_INET6
// protocol family. Address length is 2*16 + 2*2 = 36 bytes.
// - \x22 : UDP over IPv6 : the forwarded connection uses UDP over the AF_INET6
// protocol family. Address length is 2*16 + 2*2 = 36 bytes.
0x21 | 0x22 => 36,
// unspecified or unix stream. ignore the addresses
_ => 0,
};
let remaining_length = usize::from(header.len.get());
// The 15th and 16th bytes is the address length in bytes in network endian order.
// It is used so that the receiver knows how many address bytes to skip even when
// it does not implement the presented protocol. Thus the length of the protocol
// header in bytes is always exactly 16 + this value. When a sender presents a
// LOCAL connection, it should not present any address so it sets this field to
// zero. Receivers MUST always consider this field to skip the appropriate number
// of bytes and must not assume zero is presented for LOCAL connections. When a
// receiver accepts an incoming connection showing an UNSPEC address family or
// protocol, it may or may not decide to log the address information if present.
let remaining_length = u16::from_be_bytes(header[14..16].try_into().unwrap());
if remaining_length < address_length {
return Err(io::Error::new(
io::ErrorKind::Other,
"invalid proxy protocol length. not enough to fit requested IP addresses",
));
}
drop(header);
while buf.len() < remaining_length as usize {
while buf.len() < remaining_length {
if read.read_buf(&mut buf).await? == 0 {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
@@ -186,36 +137,69 @@ pub(crate) async fn read_proxy_protocol<T: AsyncRead + Unpin>(
));
}
}
let payload = buf.split_to(remaining_length);
// Starting from the 17th byte, addresses are presented in network byte order.
// The address order is always the same :
// - source layer 3 address in network byte order
// - destination layer 3 address in network byte order
// - source layer 4 address if any, in network byte order (port)
// - destination layer 4 address if any, in network byte order (port)
let mut header = buf.split_to(usize::from(remaining_length));
let mut addr = header.split_to(usize::from(address_length));
let socket = match addr.len() {
12 => {
let src_addr = Ipv4Addr::from_bits(addr.get_u32());
let _dst_addr = Ipv4Addr::from_bits(addr.get_u32());
let src_port = addr.get_u16();
let _dst_port = addr.get_u16();
Some(SocketAddr::from((src_addr, src_port)))
let res = process_proxy_payload(header, payload)?;
Ok((ChainRW { inner: read, buf }, res))
}
fn process_proxy_payload(
header: ProxyProtocolV2Header,
mut payload: BytesMut,
) -> std::io::Result<ConnectHeader> {
match header.version_and_command {
// the connection was established on purpose by the proxy
// without being relayed. The connection endpoints are the sender and the
// receiver. Such connections exist when the proxy sends health-checks to the
// server. The receiver must accept this connection as valid and must use the
// real connection endpoints and discard the protocol block including the
// family which is ignored.
LOCAL_V2 => return Ok(ConnectHeader::Local),
// the connection was established on behalf of another node,
// and reflects the original connection endpoints. The receiver must then use
// the information provided in the protocol block to get the original address.
PROXY_V2 => {}
// other values are unassigned and must not be emitted by senders. Receivers
// must drop connections presenting unexpected values here.
#[rustfmt::skip] // https://github.com/rust-lang/rustfmt/issues/6384
_ => return Err(io::Error::new(
io::ErrorKind::Other,
format!(
"invalid proxy protocol command 0x{:02X}. expected local (0x20) or proxy (0x21)",
header.version_and_command
),
)),
};
let size_err =
"invalid proxy protocol length. payload not large enough to fit requested IP addresses";
let addr = match header.protocol_and_family {
TCP_OVER_IPV4 | UDP_OVER_IPV4 => {
let addr = payload
.try_get::<ProxyProtocolV2HeaderV4>()
.ok_or_else(|| io::Error::new(io::ErrorKind::Other, size_err))?;
SocketAddr::from((addr.src_addr.get(), addr.src_port.get()))
}
36 => {
let src_addr = Ipv6Addr::from_bits(addr.get_u128());
let _dst_addr = Ipv6Addr::from_bits(addr.get_u128());
let src_port = addr.get_u16();
let _dst_port = addr.get_u16();
Some(SocketAddr::from((src_addr, src_port)))
TCP_OVER_IPV6 | UDP_OVER_IPV6 => {
let addr = payload
.try_get::<ProxyProtocolV2HeaderV6>()
.ok_or_else(|| io::Error::new(io::ErrorKind::Other, size_err))?;
SocketAddr::from((addr.src_addr.get(), addr.src_port.get()))
}
// unspecified or unix stream. ignore the addresses
_ => {
return Err(io::Error::new(
io::ErrorKind::Other,
"invalid proxy protocol address family/transport protocol.",
))
}
_ => None,
};
let mut extra = None;
while let Some(mut tlv) = read_tlv(&mut header) {
while let Some(mut tlv) = read_tlv(&mut payload) {
match Pp2Kind::from_repr(tlv.kind) {
Some(Pp2Kind::Aws) => {
if tlv.value.is_empty() {
@@ -259,9 +243,7 @@ pub(crate) async fn read_proxy_protocol<T: AsyncRead + Unpin>(
}
}
let conn_info = socket.map(|addr| ConnectionInfo { addr, extra });
Ok((ChainRW { inner: read, buf }, conn_info))
Ok(ConnectHeader::Proxy(ConnectionInfo { addr, extra }))
}
#[derive(FromRepr, Debug, Copy, Clone)]
@@ -337,27 +319,93 @@ struct Tlv {
}
fn read_tlv(b: &mut BytesMut) -> Option<Tlv> {
if b.len() < 3 {
return None;
}
let kind = b.get_u8();
let len = usize::from(b.get_u16());
let tlv_header = b.try_get::<TlvHeader>()?;
let len = usize::from(tlv_header.len.get());
if b.len() < len {
return None;
}
let value = b.split_to(len).freeze();
Some(Tlv { kind, value })
Some(Tlv {
kind: tlv_header.kind,
value: b.split_to(len).freeze(),
})
}
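// The TLV wire format consumed above, as a standalone sketch: a 1-byte kind,
// a 2-byte big-endian length, then `length` bytes of value; None means the
// buffer cannot satisfy the header or the value yet. Plain-slice stand-in,
// not the real BytesMut-based parser.
fn read_tlv_sketch(buf: &mut &[u8]) -> Option<(u8, Vec<u8>)> {
    if buf.len() < 3 {
        return None;
    }
    let kind = buf[0];
    let len = usize::from(u16::from_be_bytes([buf[1], buf[2]]));
    if buf.len() < 3 + len {
        return None;
    }
    let (tlv, rest) = buf.split_at(3 + len);
    *buf = rest;
    Some((kind, tlv[3..].to_vec()))
}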
trait BufExt: Sized {
fn try_get<T: FromBytes>(&mut self) -> Option<T>;
}
impl BufExt for BytesMut {
fn try_get<T: FromBytes>(&mut self) -> Option<T> {
let res = T::read_from_prefix(self)?;
self.advance(size_of::<T>());
Some(res)
}
}
#[derive(FromBytes, FromZeroes, Copy, Clone)]
#[repr(C)]
struct ProxyProtocolV2Header {
signature: [u8; 12],
version_and_command: u8,
protocol_and_family: u8,
len: zerocopy::byteorder::network_endian::U16,
}
#[derive(FromBytes, FromZeroes, Copy, Clone)]
#[repr(C)]
struct ProxyProtocolV2HeaderV4 {
src_addr: NetworkEndianIpv4,
dst_addr: NetworkEndianIpv4,
src_port: zerocopy::byteorder::network_endian::U16,
dst_port: zerocopy::byteorder::network_endian::U16,
}
#[derive(FromBytes, FromZeroes, Copy, Clone)]
#[repr(C)]
struct ProxyProtocolV2HeaderV6 {
src_addr: NetworkEndianIpv6,
dst_addr: NetworkEndianIpv6,
src_port: zerocopy::byteorder::network_endian::U16,
dst_port: zerocopy::byteorder::network_endian::U16,
}
#[derive(FromBytes, FromZeroes, Copy, Clone)]
#[repr(C)]
struct TlvHeader {
kind: u8,
len: zerocopy::byteorder::network_endian::U16,
}
#[derive(FromBytes, FromZeroes, Copy, Clone)]
#[repr(transparent)]
struct NetworkEndianIpv4(zerocopy::byteorder::network_endian::U32);
impl NetworkEndianIpv4 {
#[inline]
fn get(self) -> Ipv4Addr {
Ipv4Addr::from_bits(self.0.get())
}
}
#[derive(FromBytes, FromZeroes, Copy, Clone)]
#[repr(transparent)]
struct NetworkEndianIpv6(zerocopy::byteorder::network_endian::U128);
impl NetworkEndianIpv6 {
#[inline]
fn get(self) -> Ipv6Addr {
Ipv6Addr::from_bits(self.0.get())
}
}
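// A hypothetical std-only decode of the fixed 16-byte prefix plus an IPv4 PROXY
// payload, mirroring the zerocopy structs above: 12-byte signature, one
// version/command byte (0x20 local, 0x21 proxy), one family/protocol byte
// (0x11 is TCP over IPv4), and a big-endian payload length. This is a sketch
// with stand-in names, not the real parser.
use std::net::{Ipv4Addr, SocketAddrV4};

const SIG: [u8; 12] = [
    0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A,
];

fn parse_v4(bytes: &[u8]) -> Result<Option<SocketAddrV4>, &'static str> {
    let header = bytes.get(..16).ok_or("short header")?;
    if header[..12] != SIG[..] {
        return Err("bad signature");
    }
    match header[12] {
        0x20 => return Ok(None), // LOCAL_V2: health check, ignore the payload
        0x21 => {}               // PROXY_V2: addresses follow
        _ => return Err("unknown command"),
    }
    if header[13] != 0x11 {
        return Err("not TCP over IPv4 in this sketch");
    }
    let len = usize::from(u16::from_be_bytes([header[14], header[15]]));
    let payload = bytes.get(16..16 + len).ok_or("short payload")?;
    // IPv4 payload layout: src(4) + dst(4) + src_port(2) + dst_port(2) = 12 bytes
    // (the IPv6 equivalent is 2*16 + 2*2 = 36 bytes)
    if payload.len() < 12 {
        return Err("payload too small for IPv4 addresses");
    }
    let src = Ipv4Addr::new(payload[0], payload[1], payload[2], payload[3]);
    let src_port = u16::from_be_bytes([payload[8], payload[9]]);
    Ok(Some(SocketAddrV4::new(src, src_port)))
}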
#[cfg(test)]
mod tests {
use tokio::io::AsyncReadExt;
use crate::protocol2::read_proxy_protocol;
use crate::protocol2::{
read_proxy_protocol, ConnectHeader, LOCAL_V2, PROXY_V2, TCP_OVER_IPV4, UDP_OVER_IPV6,
};
#[tokio::test]
async fn test_ipv4() {
let header = super::HEADER
let header = super::SIGNATURE
// Proxy command, IPV4 | TCP
.chain([(2 << 4) | 1, (1 << 4) | 1].as_slice())
// 12 + 3 bytes
@@ -384,15 +432,17 @@ mod tests {
assert_eq!(bytes, extra_data);
let info = info.unwrap();
let ConnectHeader::Proxy(info) = info else {
panic!()
};
assert_eq!(info.addr, ([127, 0, 0, 1], 65535).into());
}
#[tokio::test]
async fn test_ipv6() {
let header = super::HEADER
let header = super::SIGNATURE
// Proxy command, IPV6 | UDP
.chain([(2 << 4) | 1, (2 << 4) | 2].as_slice())
.chain([PROXY_V2, UDP_OVER_IPV6].as_slice())
// 36 + 3 bytes
.chain([0, 39].as_slice())
// src ip
@@ -417,7 +467,9 @@ mod tests {
assert_eq!(bytes, extra_data);
let info = info.unwrap();
let ConnectHeader::Proxy(info) = info else {
panic!()
};
assert_eq!(
info.addr,
([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into()
@@ -433,7 +485,7 @@ mod tests {
let mut bytes = vec![];
read.read_to_end(&mut bytes).await.unwrap();
assert_eq!(bytes, data);
assert_eq!(info, None);
assert_eq!(info, ConnectHeader::Missing);
}
#[tokio::test]
@@ -445,7 +497,7 @@ mod tests {
let mut bytes = vec![];
read.read_to_end(&mut bytes).await.unwrap();
assert_eq!(bytes, data);
assert_eq!(info, None);
assert_eq!(info, ConnectHeader::Missing);
}
#[tokio::test]
@@ -454,9 +506,9 @@ mod tests {
let tlv_len = (tlv.len() as u16).to_be_bytes();
let len = (12 + 3 + tlv.len() as u16).to_be_bytes();
let header = super::HEADER
let header = super::SIGNATURE
// Proxy command, Inet << 4 | Stream
.chain([(2 << 4) | 1, (1 << 4) | 1].as_slice())
.chain([PROXY_V2, TCP_OVER_IPV4].as_slice())
// 12 + 3 bytes
.chain(len.as_slice())
// src ip
@@ -483,7 +535,30 @@ mod tests {
assert_eq!(bytes, extra_data);
let info = info.unwrap();
let ConnectHeader::Proxy(info) = info else {
panic!()
};
assert_eq!(info.addr, ([55, 56, 57, 58], 65535).into());
}
#[tokio::test]
async fn test_local() {
let len = 0u16.to_be_bytes();
let header = super::SIGNATURE
.chain([LOCAL_V2, 0x00].as_slice())
.chain(len.as_slice());
let extra_data = [0xaa; 256];
let (mut read, info) = read_proxy_protocol(header.chain(extra_data.as_slice()))
.await
.unwrap();
let mut bytes = vec![];
read.read_to_end(&mut bytes).await.unwrap();
assert_eq!(bytes, extra_data);
let ConnectHeader::Local = info else { panic!() };
}
}

View File

@@ -19,7 +19,7 @@ use smol_str::{format_smolstr, SmolStr};
use thiserror::Error;
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
use tokio_util::sync::CancellationToken;
use tracing::{error, info, warn, Instrument};
use tracing::{debug, error, info, warn, Instrument};
use self::connect_compute::{connect_to_compute, TcpMechanism};
use self::passthrough::ProxyPassthrough;
@@ -28,7 +28,7 @@ use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig};
use crate::context::RequestMonitoring;
use crate::error::ReportableError;
use crate::metrics::{Metrics, NumClientConnectionsGuard};
use crate::protocol2::{read_proxy_protocol, ConnectionInfo};
use crate::protocol2::{read_proxy_protocol, ConnectHeader, ConnectionInfo};
use crate::proxy::handshake::{handshake, HandshakeData};
use crate::rate_limiter::EndpointRateLimiter;
use crate::stream::{PqStream, Stream};
@@ -83,7 +83,7 @@ pub async fn task_main(
let session_id = uuid::Uuid::new_v4();
let cancellation_handler = Arc::clone(&cancellation_handler);
tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection");
debug!(protocol = "tcp", %session_id, "accepted new TCP connection");
let endpoint_rate_limiter2 = endpoint_rate_limiter.clone();
connections.spawn(async move {
@@ -92,16 +92,21 @@ pub async fn task_main(
warn!("per-client task finished with an error: {e:#}");
return;
}
Ok((_socket, None)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => {
// our load balancers will not send any more data. let's just exit immediately
Ok((_socket, ConnectHeader::Local)) => {
debug!("healthcheck received");
return;
}
Ok((_socket, ConnectHeader::Missing)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => {
warn!("missing required proxy protocol header");
return;
}
Ok((_socket, Some(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => {
Ok((_socket, ConnectHeader::Proxy(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => {
warn!("proxy protocol header not supported");
return;
}
Ok((socket, Some(info))) => (socket, info),
Ok((socket, None)) => (socket, ConnectionInfo { addr: peer_addr, extra: None }),
Ok((socket, ConnectHeader::Proxy(info))) => (socket, info),
Ok((socket, ConnectHeader::Missing)) => (socket, ConnectionInfo { addr: peer_addr, extra: None }),
};
match socket.inner.set_nodelay(true) {
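// A stand-in sketch of the accept/reject decision above: the configured policy
// crossed with what the connection actually presented. Required and Rejected
// mirror the config values used above; the permissive third variant and all
// type names here are simplified assumptions, not the real enums.
enum Policy {
    Required,
    Rejected,
    Permissive,
}

enum Header<A> {
    Missing,
    Local,
    Proxy(A),
}

fn admit<A>(policy: Policy, header: Header<A>, peer: A) -> Result<Option<A>, &'static str> {
    match (policy, header) {
        // load-balancer health checks terminate here; nothing more will be sent
        (_, Header::Local) => Ok(None),
        (Policy::Required, Header::Missing) => Err("missing required proxy protocol header"),
        (Policy::Rejected, Header::Proxy(_)) => Err("proxy protocol header not supported"),
        (_, Header::Proxy(info)) => Ok(Some(info)),
        // no header: fall back to the socket's peer address
        (_, Header::Missing) => Ok(Some(peer)),
    }
}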

View File

@@ -20,14 +20,14 @@ use super::connect_compute::ConnectMechanism;
use super::retry::CouldRetry;
use super::*;
use crate::auth::backend::{
ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend,
ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned,
};
use crate::config::{CertResolver, RetryConfig};
use crate::control_plane::messages::{ControlPlaneError, Details, MetricsAuxInfo, Status};
use crate::control_plane::provider::{
CachedAllowedIps, CachedRoleSecret, ControlPlaneBackend, NodeInfoCache,
use crate::control_plane::client::{ControlPlaneClient, TestControlPlaneClient};
use crate::control_plane::messages::{ControlPlaneErrorMessage, Details, MetricsAuxInfo, Status};
use crate::control_plane::{
self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, NodeInfoCache,
};
use crate::control_plane::{self, CachedNodeInfo, NodeInfo};
use crate::error::ErrorKind;
use crate::types::{BranchId, EndpointId, ProjectId};
use crate::{sasl, scram};
@@ -490,7 +490,7 @@ impl ConnectMechanism for TestConnectMechanism {
fn update_connect_config(&self, _conf: &mut compute::ConnCfg) {}
}
impl TestBackend for TestConnectMechanism {
impl TestControlPlaneClient for TestConnectMechanism {
fn wake_compute(&self) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError> {
let mut counter = self.counter.lock().unwrap();
let action = self.sequence[*counter];
@@ -498,18 +498,19 @@ impl TestBackend for TestConnectMechanism {
match action {
ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)),
ConnectAction::WakeFail => {
let err =
control_plane::errors::ApiError::ControlPlane(Box::new(ControlPlaneError {
let err = control_plane::errors::ControlPlaneError::Message(Box::new(
ControlPlaneErrorMessage {
http_status_code: StatusCode::BAD_REQUEST,
error: "TEST".into(),
status: None,
}));
},
));
assert!(!err.could_retry());
Err(control_plane::errors::WakeComputeError::ApiError(err))
Err(control_plane::errors::WakeComputeError::ControlPlane(err))
}
ConnectAction::WakeRetry => {
let err =
control_plane::errors::ApiError::ControlPlane(Box::new(ControlPlaneError {
let err = control_plane::errors::ControlPlaneError::Message(Box::new(
ControlPlaneErrorMessage {
http_status_code: StatusCode::BAD_REQUEST,
error: "TEST".into(),
status: Some(Status {
@@ -523,9 +524,10 @@ impl TestBackend for TestConnectMechanism {
user_facing_message: None,
},
}),
}));
},
));
assert!(err.could_retry());
Err(control_plane::errors::WakeComputeError::ApiError(err))
Err(control_plane::errors::WakeComputeError::ControlPlane(err))
}
x => panic!("expecting action {x:?}, wake_compute is called instead"),
}
@@ -538,7 +540,7 @@ impl TestBackend for TestConnectMechanism {
unimplemented!("not used in tests")
}
fn dyn_clone(&self) -> Box<dyn TestBackend> {
fn dyn_clone(&self) -> Box<dyn TestControlPlaneClient> {
Box::new(self.clone())
}
}
@@ -562,7 +564,7 @@ fn helper_create_connect_info(
mechanism: &TestConnectMechanism,
) -> auth::Backend<'static, ComputeCredentials> {
let user_info = auth::Backend::ControlPlane(
MaybeOwned::Owned(ControlPlaneBackend::Test(Box::new(mechanism.clone()))),
MaybeOwned::Owned(ControlPlaneClient::Test(Box::new(mechanism.clone()))),
ComputeCredentials {
info: ComputeUserInfo {
endpoint: "endpoint".into(),

View File

@@ -4,7 +4,7 @@ use super::connect_compute::ComputeConnectBackend;
use crate::config::RetryConfig;
use crate::context::RequestMonitoring;
use crate::control_plane::errors::WakeComputeError;
use crate::control_plane::provider::CachedNodeInfo;
use crate::control_plane::CachedNodeInfo;
use crate::error::ReportableError;
use crate::metrics::{
ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType,

View File

@@ -14,7 +14,7 @@ use tracing::{debug, info};
use super::conn_pool::poll_client;
use super::conn_pool_lib::{Client, ConnInfo, GlobalConnPool};
use super::http_conn_pool::{self, poll_http2_client, Send};
use super::local_conn_pool::{self, LocalClient, LocalConnPool, EXT_NAME, EXT_SCHEMA, EXT_VERSION};
use super::local_conn_pool::{self, LocalConnPool, EXT_NAME, EXT_SCHEMA, EXT_VERSION};
use crate::auth::backend::local::StaticAuthRules;
use crate::auth::backend::{ComputeCredentials, ComputeUserInfo};
use crate::auth::{self, check_peer_addr_is_in_list, AuthError};
@@ -24,9 +24,9 @@ use crate::compute_ctl::{
};
use crate::config::ProxyConfig;
use crate::context::RequestMonitoring;
use crate::control_plane::client::ApiLockError;
use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError};
use crate::control_plane::locks::ApiLocks;
use crate::control_plane::provider::ApiLockError;
use crate::control_plane::CachedNodeInfo;
use crate::error::{ErrorKind, ReportableError, UserFacingError};
use crate::intern::EndpointIdInt;
@@ -81,7 +81,7 @@ impl PoolingBackend {
None => {
// If we don't have an authentication secret, for the http flow we can just return an error.
info!("authentication info not found");
return Err(AuthError::auth_failed(&*user_info.user));
return Err(AuthError::password_failed(&*user_info.user));
}
};
let ep = EndpointIdInt::from(&user_info.endpoint);
@@ -99,7 +99,7 @@ impl PoolingBackend {
}
crate::sasl::Outcome::Failure(reason) => {
info!("auth backend failed with an error: {reason}");
Err(AuthError::auth_failed(&*user_info.user))
Err(AuthError::password_failed(&*user_info.user))
}
};
res.map(|key| ComputeCredentials {
@@ -126,8 +126,7 @@ impl PoolingBackend {
&**console,
&jwt,
)
.await
.map_err(|e| AuthError::auth_failed(e.to_string()))?;
.await?;
Ok(ComputeCredentials {
info: user_info.clone(),
@@ -146,8 +145,7 @@ impl PoolingBackend {
&StaticAuthRules,
&jwt,
)
.await
.map_err(|e| AuthError::auth_failed(e.to_string()))?;
.await?;
Ok(ComputeCredentials {
info: user_info.clone(),
@@ -207,7 +205,7 @@ impl PoolingBackend {
conn_info: ConnInfo,
) -> Result<http_conn_pool::Client<Send>, HttpConnError> {
info!("pool: looking for an existing connection");
if let Some(client) = self.http_conn_pool.get(ctx, &conn_info) {
if let Ok(Some(client)) = self.http_conn_pool.get(ctx, &conn_info) {
return Ok(client);
}
@@ -250,7 +248,7 @@ impl PoolingBackend {
&self,
ctx: &RequestMonitoring,
conn_info: ConnInfo,
) -> Result<LocalClient<tokio_postgres::Client>, HttpConnError> {
) -> Result<Client<tokio_postgres::Client>, HttpConnError> {
if let Some(client) = self.local_pool.get(ctx, &conn_info)? {
return Ok(client);
}

View File

@@ -18,7 +18,9 @@ use {
std::{sync::atomic, time::Duration},
};
use super::conn_pool_lib::{Client, ClientInnerExt, ConnInfo, GlobalConnPool};
use super::conn_pool_lib::{
Client, ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, GlobalConnPool,
};
use crate::context::RequestMonitoring;
use crate::control_plane::messages::MetricsAuxInfo;
use crate::metrics::Metrics;
@@ -152,53 +154,30 @@ pub(crate) fn poll_client<C: ClientInnerExt>(
}
.instrument(span));
let inner = ClientInnerRemote {
let inner = ClientInnerCommon {
inner: client,
session: tx,
cancel,
aux,
conn_id,
data: ClientDataEnum::Remote(ClientDataRemote {
session: tx,
cancel,
}),
};
Client::new(inner, conn_info, pool_clone)
}
pub(crate) struct ClientInnerRemote<C: ClientInnerExt> {
inner: C,
pub(crate) struct ClientDataRemote {
session: tokio::sync::watch::Sender<uuid::Uuid>,
cancel: CancellationToken,
aux: MetricsAuxInfo,
conn_id: uuid::Uuid,
}
impl<C: ClientInnerExt> ClientInnerRemote<C> {
pub(crate) fn inner_mut(&mut self) -> &mut C {
&mut self.inner
}
pub(crate) fn inner(&self) -> &C {
&self.inner
}
pub(crate) fn session(&mut self) -> &mut tokio::sync::watch::Sender<uuid::Uuid> {
impl ClientDataRemote {
pub fn session(&mut self) -> &mut tokio::sync::watch::Sender<uuid::Uuid> {
&mut self.session
}
pub(crate) fn aux(&self) -> &MetricsAuxInfo {
&self.aux
}
pub(crate) fn get_conn_id(&self) -> uuid::Uuid {
self.conn_id
}
pub(crate) fn is_closed(&self) -> bool {
self.inner.is_closed()
}
}
impl<C: ClientInnerExt> Drop for ClientInnerRemote<C> {
fn drop(&mut self) {
// on client drop, tell the conn to shut down
pub fn cancel(&mut self) {
self.cancel.cancel();
}
}
@@ -228,15 +207,13 @@ mod tests {
}
}
fn create_inner() -> ClientInnerRemote<MockClient> {
fn create_inner() -> ClientInnerCommon<MockClient> {
create_inner_with(MockClient::new(false))
}
fn create_inner_with(client: MockClient) -> ClientInnerRemote<MockClient> {
ClientInnerRemote {
fn create_inner_with(client: MockClient) -> ClientInnerCommon<MockClient> {
ClientInnerCommon {
inner: client,
session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()),
cancel: CancellationToken::new(),
aux: MetricsAuxInfo {
endpoint_id: (&EndpointId::from("endpoint")).into(),
project_id: (&ProjectId::from("project")).into(),
@@ -244,6 +221,10 @@ mod tests {
cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm,
},
conn_id: uuid::Uuid::new_v4(),
data: ClientDataEnum::Remote(ClientDataRemote {
session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()),
cancel: CancellationToken::new(),
}),
}
}
@@ -280,7 +261,7 @@ mod tests {
{
let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone());
assert_eq!(0, pool.get_global_connections_count());
client.inner_mut().1.discard();
client.inner().1.discard();
// Discard should not add the connection from the pool.
assert_eq!(0, pool.get_global_connections_count());
}

View File

@@ -11,10 +11,12 @@ use tokio_postgres::ReadyForQueryStatus;
use tracing::{debug, info, Span};
use super::backend::HttpConnError;
use super::conn_pool::ClientInnerRemote;
use super::conn_pool::ClientDataRemote;
use super::http_conn_pool::ClientDataHttp;
use super::local_conn_pool::ClientDataLocal;
use crate::auth::backend::ComputeUserInfo;
use crate::context::RequestMonitoring;
use crate::control_plane::messages::ColdStartInfo;
use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
use crate::metrics::{HttpEndpointPoolsGuard, Metrics};
use crate::types::{DbName, EndpointCacheKey, RoleName};
use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
@@ -41,8 +43,46 @@ impl ConnInfo {
}
}
pub(crate) enum ClientDataEnum {
Remote(ClientDataRemote),
Local(ClientDataLocal),
#[allow(dead_code)]
Http(ClientDataHttp),
}
pub(crate) struct ClientInnerCommon<C: ClientInnerExt> {
pub(crate) inner: C,
pub(crate) aux: MetricsAuxInfo,
pub(crate) conn_id: uuid::Uuid,
pub(crate) data: ClientDataEnum, // custom client data like session, key, jti
}
impl<C: ClientInnerExt> Drop for ClientInnerCommon<C> {
fn drop(&mut self) {
match &mut self.data {
ClientDataEnum::Remote(remote_data) => {
remote_data.cancel();
}
ClientDataEnum::Local(local_data) => {
local_data.cancel();
}
ClientDataEnum::Http(_http_data) => (),
}
}
}
impl<C: ClientInnerExt> ClientInnerCommon<C> {
pub(crate) fn get_conn_id(&self) -> uuid::Uuid {
self.conn_id
}
pub(crate) fn get_data(&mut self) -> &mut ClientDataEnum {
&mut self.data
}
}
pub(crate) struct ConnPoolEntry<C: ClientInnerExt> {
pub(crate) conn: ClientInnerRemote<C>,
pub(crate) conn: ClientInnerCommon<C>,
pub(crate) _last_access: std::time::Instant,
}
@@ -55,10 +95,33 @@ pub(crate) struct EndpointConnPool<C: ClientInnerExt> {
_guard: HttpEndpointPoolsGuard<'static>,
global_connections_count: Arc<AtomicUsize>,
global_pool_size_max_conns: usize,
pool_name: String,
}
impl<C: ClientInnerExt> EndpointConnPool<C> {
fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option<ConnPoolEntry<C>> {
pub(crate) fn new(
hmap: HashMap<(DbName, RoleName), DbUserConnPool<C>>,
tconns: usize,
max_conns_per_endpoint: usize,
global_connections_count: Arc<AtomicUsize>,
max_total_conns: usize,
pname: String,
) -> Self {
Self {
pools: hmap,
total_conns: tconns,
max_conns: max_conns_per_endpoint,
_guard: Metrics::get().proxy.http_endpoint_pools.guard(),
global_connections_count,
global_pool_size_max_conns: max_total_conns,
pool_name: pname,
}
}
pub(crate) fn get_conn_entry(
&mut self,
db_user: (DbName, RoleName),
) -> Option<ConnPoolEntry<C>> {
let Self {
pools,
total_conns,
@@ -84,9 +147,10 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
..
} = self;
if let Some(pool) = pools.get_mut(&db_user) {
let old_len = pool.conns.len();
pool.conns.retain(|conn| conn.conn.get_conn_id() != conn_id);
let new_len = pool.conns.len();
let old_len = pool.get_conns().len();
pool.get_conns()
.retain(|conn| conn.conn.get_conn_id() != conn_id);
let new_len = pool.get_conns().len();
let removed = old_len - new_len;
if removed > 0 {
global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed);
@@ -103,11 +167,26 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
}
}
pub(crate) fn put(pool: &RwLock<Self>, conn_info: &ConnInfo, client: ClientInnerRemote<C>) {
let conn_id = client.get_conn_id();
pub(crate) fn get_name(&self) -> &str {
&self.pool_name
}
if client.is_closed() {
info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
pub(crate) fn get_pool(&self, db_user: (DbName, RoleName)) -> Option<&DbUserConnPool<C>> {
self.pools.get(&db_user)
}
pub(crate) fn get_pool_mut(
&mut self,
db_user: (DbName, RoleName),
) -> Option<&mut DbUserConnPool<C>> {
self.pools.get_mut(&db_user)
}
pub(crate) fn put(pool: &RwLock<Self>, conn_info: &ConnInfo, client: ClientInnerCommon<C>) {
let conn_id = client.get_conn_id();
let pool_name = pool.read().get_name().to_string();
if client.inner.is_closed() {
info!(%conn_id, "{}: throwing away connection '{conn_info}' because connection is closed", pool_name);
return;
}
@@ -118,7 +197,7 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
.load(atomic::Ordering::Relaxed)
>= global_max_conn
{
info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full");
info!(%conn_id, "{}: throwing away connection '{conn_info}' because pool is full", pool_name);
return;
}
@@ -130,13 +209,13 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
if pool.total_conns < pool.max_conns {
let pool_entries = pool.pools.entry(conn_info.db_and_user()).or_default();
pool_entries.conns.push(ConnPoolEntry {
pool_entries.get_conns().push(ConnPoolEntry {
conn: client,
_last_access: std::time::Instant::now(),
});
returned = true;
per_db_size = pool_entries.conns.len();
per_db_size = pool_entries.get_conns().len();
pool.total_conns += 1;
pool.global_connections_count
@@ -153,9 +232,9 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
// do logging outside of the mutex
if returned {
info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
info!(%conn_id, "{pool_name}: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
} else {
info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
info!(%conn_id, "{pool_name}: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
}
}
}
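// A simplified sketch of the two-level admission check in `put` above: a global
// counter shared by every endpoint pool plus a per-endpoint cap. Stand-in types
// only; the real pool also tracks per-(db, user) entries and metrics.
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;

struct EndpointPoolSketch {
    total_conns: usize,
    max_conns: usize,
    global: Arc<AtomicUsize>,
    global_max: usize,
}

impl EndpointPoolSketch {
    /// Returns true if the connection was kept, false if it was discarded.
    fn try_put(&mut self) -> bool {
        // global limit first: a cheap relaxed load, no lock needed
        if self.global.load(Ordering::Relaxed) >= self.global_max {
            return false;
        }
        // then the per-endpoint limit
        if self.total_conns >= self.max_conns {
            return false;
        }
        self.total_conns += 1;
        self.global.fetch_add(1, Ordering::Relaxed);
        true
    }
}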
@@ -176,19 +255,39 @@ impl<C: ClientInnerExt> Drop for EndpointConnPool<C> {
pub(crate) struct DbUserConnPool<C: ClientInnerExt> {
pub(crate) conns: Vec<ConnPoolEntry<C>>,
pub(crate) initialized: Option<bool>, // a bit ugly, exists only for local pools
}
impl<C: ClientInnerExt> Default for DbUserConnPool<C> {
fn default() -> Self {
Self { conns: Vec::new() }
Self {
conns: Vec::new(),
initialized: None,
}
}
}
impl<C: ClientInnerExt> DbUserConnPool<C> {
pub(crate) trait DbUserConn<C: ClientInnerExt>: Default {
fn set_initialized(&mut self);
fn is_initialized(&self) -> bool;
fn clear_closed_clients(&mut self, conns: &mut usize) -> usize;
fn get_conn_entry(&mut self, conns: &mut usize) -> (Option<ConnPoolEntry<C>>, usize);
fn get_conns(&mut self) -> &mut Vec<ConnPoolEntry<C>>;
}
impl<C: ClientInnerExt> DbUserConn<C> for DbUserConnPool<C> {
fn set_initialized(&mut self) {
self.initialized = Some(true);
}
fn is_initialized(&self) -> bool {
self.initialized.unwrap_or(false)
}
fn clear_closed_clients(&mut self, conns: &mut usize) -> usize {
let old_len = self.conns.len();
self.conns.retain(|conn| !conn.conn.is_closed());
self.conns.retain(|conn| !conn.conn.inner.is_closed());
let new_len = self.conns.len();
let removed = old_len - new_len;
@@ -196,10 +295,7 @@ impl<C: ClientInnerExt> DbUserConnPool<C> {
removed
}
pub(crate) fn get_conn_entry(
&mut self,
conns: &mut usize,
) -> (Option<ConnPoolEntry<C>>, usize) {
fn get_conn_entry(&mut self, conns: &mut usize) -> (Option<ConnPoolEntry<C>>, usize) {
let mut removed = self.clear_closed_clients(conns);
let conn = self.conns.pop();
if conn.is_some() {
@@ -215,6 +311,10 @@ impl<C: ClientInnerExt> DbUserConnPool<C> {
(conn, removed)
}
fn get_conns(&mut self) -> &mut Vec<ConnPoolEntry<C>> {
&mut self.conns
}
}
pub(crate) struct GlobalConnPool<C: ClientInnerExt> {
@@ -278,6 +378,60 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
self.config.pool_options.idle_timeout
}
pub(crate) fn get(
self: &Arc<Self>,
ctx: &RequestMonitoring,
conn_info: &ConnInfo,
) -> Result<Option<Client<C>>, HttpConnError> {
let mut client: Option<ClientInnerCommon<C>> = None;
let Some(endpoint) = conn_info.endpoint_cache_key() else {
return Ok(None);
};
let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint);
if let Some(entry) = endpoint_pool
.write()
.get_conn_entry(conn_info.db_and_user())
{
client = Some(entry.conn);
}
let endpoint_pool = Arc::downgrade(&endpoint_pool);
// ok return cached connection if found and establish a new one otherwise
if let Some(mut client) = client {
if client.inner.is_closed() {
info!("pool: cached connection '{conn_info}' is closed, opening a new one");
return Ok(None);
}
tracing::Span::current()
.record("conn_id", tracing::field::display(client.get_conn_id()));
tracing::Span::current().record(
"pid",
tracing::field::display(client.inner.get_process_id()),
);
info!(
cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
"pool: reusing connection '{conn_info}'"
);
match client.get_data() {
ClientDataEnum::Local(data) => {
data.session().send(ctx.session_id())?;
}
ClientDataEnum::Remote(data) => {
data.session().send(ctx.session_id())?;
}
ClientDataEnum::Http(_) => (),
}
ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit);
ctx.success();
return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool)));
}
Ok(None)
}
pub(crate) fn shutdown(&self) {
// drops all strong references to endpoint-pools
self.global_pool.clear();
@@ -374,6 +528,7 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
_guard: Metrics::get().proxy.http_endpoint_pools.guard(),
global_connections_count: self.global_connections_count.clone(),
global_pool_size_max_conns: self.config.pool_options.max_total_conns,
pool_name: String::from("remote"),
}));
// find or create a pool for this endpoint
@@ -400,55 +555,23 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
pool
}
}
pub(crate) fn get(
self: &Arc<Self>,
ctx: &RequestMonitoring,
conn_info: &ConnInfo,
) -> Result<Option<Client<C>>, HttpConnError> {
let mut client: Option<ClientInnerRemote<C>> = None;
let Some(endpoint) = conn_info.endpoint_cache_key() else {
return Ok(None);
};
pub(crate) struct Client<C: ClientInnerExt> {
span: Span,
inner: Option<ClientInnerCommon<C>>,
conn_info: ConnInfo,
pool: Weak<RwLock<EndpointConnPool<C>>>,
}
let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint);
if let Some(entry) = endpoint_pool
.write()
.get_conn_entry(conn_info.db_and_user())
{
client = Some(entry.conn);
}
let endpoint_pool = Arc::downgrade(&endpoint_pool);
// ok return cached connection if found and establish a new one otherwise
if let Some(mut client) = client {
if client.is_closed() {
info!("pool: cached connection '{conn_info}' is closed, opening a new one");
return Ok(None);
}
tracing::Span::current()
.record("conn_id", tracing::field::display(client.get_conn_id()));
tracing::Span::current().record(
"pid",
tracing::field::display(client.inner().get_process_id()),
);
info!(
cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
"pool: reusing connection '{conn_info}'"
);
client.session().send(ctx.session_id())?;
ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit);
ctx.success();
return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool)));
}
Ok(None)
}
pub(crate) struct Discard<'a, C: ClientInnerExt> {
conn_info: &'a ConnInfo,
pool: &'a mut Weak<RwLock<EndpointConnPool<C>>>,
}
impl<C: ClientInnerExt> Client<C> {
pub(crate) fn new(
inner: ClientInnerRemote<C>,
inner: ClientInnerCommon<C>,
conn_info: ConnInfo,
pool: Weak<RwLock<EndpointConnPool<C>>>,
) -> Self {
@@ -460,7 +583,18 @@ impl<C: ClientInnerExt> Client<C> {
}
}
pub(crate) fn inner_mut(&mut self) -> (&mut C, Discard<'_, C>) {
pub(crate) fn client_inner(&mut self) -> (&mut ClientInnerCommon<C>, Discard<'_, C>) {
let Self {
inner,
pool,
conn_info,
span: _,
} = self;
let inner_m = inner.as_mut().expect("client inner should not be removed");
(inner_m, Discard { conn_info, pool })
}
pub(crate) fn inner(&mut self) -> (&mut C, Discard<'_, C>) {
let Self {
inner,
pool,
@@ -468,12 +602,11 @@ impl<C: ClientInnerExt> Client<C> {
span: _,
} = self;
let inner = inner.as_mut().expect("client inner should not be removed");
let inner_ref = inner.inner_mut();
(inner_ref, Discard { conn_info, pool })
(&mut inner.inner, Discard { conn_info, pool })
}
pub(crate) fn metrics(&self) -> Arc<MetricCounter> {
let aux = &self.inner.as_ref().unwrap().aux();
let aux = &self.inner.as_ref().unwrap().aux;
USAGE_METRICS.register(Ids {
endpoint_id: aux.endpoint_id,
branch_id: aux.branch_id,
@@ -498,13 +631,6 @@ impl<C: ClientInnerExt> Client<C> {
}
}
pub(crate) struct Client<C: ClientInnerExt> {
span: Span,
inner: Option<ClientInnerRemote<C>>,
conn_info: ConnInfo,
pool: Weak<RwLock<EndpointConnPool<C>>>,
}
impl<C: ClientInnerExt> Drop for Client<C> {
fn drop(&mut self) {
if let Some(drop) = self.do_drop() {
@@ -517,10 +643,11 @@ impl<C: ClientInnerExt> Deref for Client<C> {
type Target = C;
fn deref(&self) -> &Self::Target {
self.inner
&self
.inner
.as_ref()
.expect("client inner should not be removed")
.inner()
.inner
}
}
@@ -539,11 +666,6 @@ impl ClientInnerExt for tokio_postgres::Client {
}
}
pub(crate) struct Discard<'a, C: ClientInnerExt> {
conn_info: &'a ConnInfo,
pool: &'a mut Weak<RwLock<EndpointConnPool<C>>>,
}
impl<C: ClientInnerExt> Discard<'_, C> {
pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) {
let conn_info = &self.conn_info;

View File

@@ -10,6 +10,7 @@ use rand::Rng;
use tokio::net::TcpStream;
use tracing::{debug, error, info, info_span, Instrument};
use super::backend::HttpConnError;
use super::conn_pool_lib::{ClientInnerExt, ConnInfo};
use crate::context::RequestMonitoring;
use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
@@ -28,6 +29,8 @@ pub(crate) struct ConnPoolEntry<C: ClientInnerExt + Clone> {
aux: MetricsAuxInfo,
}
pub(crate) struct ClientDataHttp();
// Per-endpoint connection pool
// Number of open connections is limited by the `max_conns_per_endpoint`.
pub(crate) struct EndpointConnPool<C: ClientInnerExt + Clone> {
@@ -206,14 +209,22 @@ impl<C: ClientInnerExt + Clone> GlobalConnPool<C> {
}
}
#[expect(unused_results)]
pub(crate) fn get(
self: &Arc<Self>,
ctx: &RequestMonitoring,
conn_info: &ConnInfo,
) -> Option<Client<C>> {
let endpoint = conn_info.endpoint_cache_key()?;
) -> Result<Option<Client<C>>, HttpConnError> {
let result: Result<Option<Client<C>>, HttpConnError>;
let Some(endpoint) = conn_info.endpoint_cache_key() else {
result = Ok(None);
return result;
};
let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint);
let client = endpoint_pool.write().get_conn_entry()?;
let Some(client) = endpoint_pool.write().get_conn_entry() else {
result = Ok(None);
return result;
};
tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id));
info!(
@@ -222,7 +233,7 @@ impl<C: ClientInnerExt + Clone> GlobalConnPool<C> {
);
ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit);
ctx.success();
Some(Client::new(client.conn, client.aux))
Ok(Some(Client::new(client.conn, client.aux)))
}
fn get_or_create_endpoint_pool(

View File

@@ -11,7 +11,8 @@
use std::collections::HashMap;
use std::pin::pin;
use std::sync::{Arc, Weak};
use std::sync::atomic::AtomicUsize;
use std::sync::Arc;
use std::task::{ready, Poll};
use std::time::Duration;
@@ -26,177 +27,42 @@ use signature::Signer;
use tokio::time::Instant;
use tokio_postgres::tls::NoTlsStream;
use tokio_postgres::types::ToSql;
use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket};
use tokio_postgres::{AsyncMessage, Socket};
use tokio_util::sync::CancellationToken;
use tracing::{error, info, info_span, warn, Instrument, Span};
use tracing::{error, info, info_span, warn, Instrument};
use super::backend::HttpConnError;
use super::conn_pool_lib::{ClientInnerExt, ConnInfo};
use super::conn_pool_lib::{
Client, ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, DbUserConn,
EndpointConnPool,
};
use crate::context::RequestMonitoring;
use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
use crate::metrics::Metrics;
use crate::types::{DbName, RoleName};
use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
pub(crate) const EXT_NAME: &str = "pg_session_jwt";
pub(crate) const EXT_VERSION: &str = "0.1.2";
pub(crate) const EXT_SCHEMA: &str = "auth";
struct ConnPoolEntry<C: ClientInnerExt> {
conn: ClientInner<C>,
_last_access: std::time::Instant,
pub(crate) struct ClientDataLocal {
session: tokio::sync::watch::Sender<uuid::Uuid>,
cancel: CancellationToken,
key: SigningKey,
jti: u64,
}
// Per-endpoint connection pool, (dbname, username) -> DbUserConnPool
// Number of open connections is limited by the `max_conns_per_endpoint`.
pub(crate) struct EndpointConnPool<C: ClientInnerExt> {
pools: HashMap<(DbName, RoleName), DbUserConnPool<C>>,
total_conns: usize,
max_conns: usize,
global_pool_size_max_conns: usize,
}
impl<C: ClientInnerExt> EndpointConnPool<C> {
fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option<ConnPoolEntry<C>> {
let Self {
pools, total_conns, ..
} = self;
pools
.get_mut(&db_user)
.and_then(|pool_entries| pool_entries.get_conn_entry(total_conns))
impl ClientDataLocal {
pub fn session(&mut self) -> &mut tokio::sync::watch::Sender<uuid::Uuid> {
&mut self.session
}
fn remove_client(&mut self, db_user: (DbName, RoleName), conn_id: uuid::Uuid) -> bool {
let Self {
pools, total_conns, ..
} = self;
if let Some(pool) = pools.get_mut(&db_user) {
let old_len = pool.conns.len();
pool.conns.retain(|conn| conn.conn.conn_id != conn_id);
let new_len = pool.conns.len();
let removed = old_len - new_len;
if removed > 0 {
Metrics::get()
.proxy
.http_pool_opened_connections
.get_metric()
.dec_by(removed as i64);
}
*total_conns -= removed;
removed > 0
} else {
false
}
}
fn put(pool: &RwLock<Self>, conn_info: &ConnInfo, client: ClientInner<C>) {
let conn_id = client.conn_id;
if client.is_closed() {
info!(%conn_id, "local_pool: throwing away connection '{conn_info}' because connection is closed");
return;
}
let global_max_conn = pool.read().global_pool_size_max_conns;
if pool.read().total_conns >= global_max_conn {
info!(%conn_id, "local_pool: throwing away connection '{conn_info}' because pool is full");
return;
}
// return connection to the pool
let mut returned = false;
let mut per_db_size = 0;
let total_conns = {
let mut pool = pool.write();
if pool.total_conns < pool.max_conns {
let pool_entries = pool.pools.entry(conn_info.db_and_user()).or_default();
pool_entries.conns.push(ConnPoolEntry {
conn: client,
_last_access: std::time::Instant::now(),
});
returned = true;
per_db_size = pool_entries.conns.len();
pool.total_conns += 1;
Metrics::get()
.proxy
.http_pool_opened_connections
.get_metric()
.inc();
}
pool.total_conns
};
// do logging outside of the mutex
if returned {
info!(%conn_id, "local_pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
} else {
info!(%conn_id, "local_pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
}
}
}
impl<C: ClientInnerExt> Drop for EndpointConnPool<C> {
fn drop(&mut self) {
if self.total_conns > 0 {
Metrics::get()
.proxy
.http_pool_opened_connections
.get_metric()
.dec_by(self.total_conns as i64);
}
}
}
pub(crate) struct DbUserConnPool<C: ClientInnerExt> {
conns: Vec<ConnPoolEntry<C>>,
// true if we have definitely installed the extension and
// granted the role access to the auth schema.
initialized: bool,
}
impl<C: ClientInnerExt> Default for DbUserConnPool<C> {
fn default() -> Self {
Self {
conns: Vec::new(),
initialized: false,
}
}
}
impl<C: ClientInnerExt> DbUserConnPool<C> {
fn clear_closed_clients(&mut self, conns: &mut usize) -> usize {
let old_len = self.conns.len();
self.conns.retain(|conn| !conn.conn.is_closed());
let new_len = self.conns.len();
let removed = old_len - new_len;
*conns -= removed;
removed
}
fn get_conn_entry(&mut self, conns: &mut usize) -> Option<ConnPoolEntry<C>> {
let mut removed = self.clear_closed_clients(conns);
let conn = self.conns.pop();
if conn.is_some() {
*conns -= 1;
removed += 1;
}
Metrics::get()
.proxy
.http_pool_opened_connections
.get_metric()
.dec_by(removed as i64);
conn
pub fn cancel(&mut self) {
self.cancel.cancel();
}
}
pub(crate) struct LocalConnPool<C: ClientInnerExt> {
global_pool: RwLock<EndpointConnPool<C>>,
global_pool: Arc<RwLock<EndpointConnPool<C>>>,
config: &'static crate::config::HttpConfig,
}
@@ -204,12 +70,14 @@ pub(crate) struct LocalConnPool<C: ClientInnerExt> {
impl<C: ClientInnerExt> LocalConnPool<C> {
pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc<Self> {
Arc::new(Self {
global_pool: RwLock::new(EndpointConnPool {
pools: HashMap::new(),
total_conns: 0,
max_conns: config.pool_options.max_conns_per_endpoint,
global_pool_size_max_conns: config.pool_options.max_total_conns,
}),
global_pool: Arc::new(RwLock::new(EndpointConnPool::new(
HashMap::new(),
0,
config.pool_options.max_conns_per_endpoint,
Arc::new(AtomicUsize::new(0)),
config.pool_options.max_total_conns,
String::from("local_pool"),
))),
config,
})
}
@@ -222,7 +90,7 @@ impl<C: ClientInnerExt> LocalConnPool<C> {
self: &Arc<Self>,
ctx: &RequestMonitoring,
conn_info: &ConnInfo,
) -> Result<Option<LocalClient<C>>, HttpConnError> {
) -> Result<Option<Client<C>>, HttpConnError> {
let client = self
.global_pool
.write()
@@ -230,12 +98,14 @@ impl<C: ClientInnerExt> LocalConnPool<C> {
.map(|entry| entry.conn);
// ok return cached connection if found and establish a new one otherwise
if let Some(client) = client {
if client.is_closed() {
if let Some(mut client) = client {
if client.inner.is_closed() {
info!("local_pool: cached connection '{conn_info}' is closed, opening a new one");
return Ok(None);
}
tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id));
tracing::Span::current()
.record("conn_id", tracing::field::display(client.get_conn_id()));
tracing::Span::current().record(
"pid",
tracing::field::display(client.inner.get_process_id()),
@@ -244,47 +114,59 @@ impl<C: ClientInnerExt> LocalConnPool<C> {
cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
"local_pool: reusing connection '{conn_info}'"
);
client.session.send(ctx.session_id())?;
match client.get_data() {
ClientDataEnum::Local(data) => {
data.session().send(ctx.session_id())?;
}
ClientDataEnum::Remote(data) => {
data.session().send(ctx.session_id())?;
}
ClientDataEnum::Http(_) => (),
}
ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit);
ctx.success();
return Ok(Some(LocalClient::new(
return Ok(Some(Client::new(
client,
conn_info.clone(),
Arc::downgrade(self),
Arc::downgrade(&self.global_pool),
)));
}
Ok(None)
}
pub(crate) fn initialized(self: &Arc<Self>, conn_info: &ConnInfo) -> bool {
self.global_pool
.read()
.pools
.get(&conn_info.db_and_user())
.map_or(false, |pool| pool.initialized)
if let Some(pool) = self.global_pool.read().get_pool(conn_info.db_and_user()) {
return pool.is_initialized();
}
false
}
pub(crate) fn set_initialized(self: &Arc<Self>, conn_info: &ConnInfo) {
self.global_pool
if let Some(pool) = self
.global_pool
.write()
.pools
.entry(conn_info.db_and_user())
.or_default()
.initialized = true;
.get_pool_mut(conn_info.db_and_user())
{
pool.set_initialized();
}
}
}
#[allow(clippy::too_many_arguments)]
pub(crate) fn poll_client(
global_pool: Arc<LocalConnPool<tokio_postgres::Client>>,
pub(crate) fn poll_client<C: ClientInnerExt>(
global_pool: Arc<LocalConnPool<C>>,
ctx: &RequestMonitoring,
conn_info: ConnInfo,
client: tokio_postgres::Client,
client: C,
mut connection: tokio_postgres::Connection<Socket, NoTlsStream>,
key: SigningKey,
conn_id: uuid::Uuid,
aux: MetricsAuxInfo,
) -> LocalClient<tokio_postgres::Client> {
) -> Client<C> {
let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol());
let mut session_id = ctx.session_id();
let (tx, mut rx) = tokio::sync::watch::channel(session_id);
@@ -377,111 +259,47 @@ pub(crate) fn poll_client(
}
.instrument(span));
let inner = ClientInner {
let inner = ClientInnerCommon {
inner: client,
session: tx,
cancel,
aux,
conn_id,
key,
jti: 0,
data: ClientDataEnum::Local(ClientDataLocal {
session: tx,
cancel,
key,
jti: 0,
}),
};
LocalClient::new(inner, conn_info, pool_clone)
Client::new(
inner,
conn_info,
Arc::downgrade(&pool_clone.upgrade().unwrap().global_pool),
)
}
pub(crate) struct ClientInner<C: ClientInnerExt> {
inner: C,
session: tokio::sync::watch::Sender<uuid::Uuid>,
cancel: CancellationToken,
aux: MetricsAuxInfo,
conn_id: uuid::Uuid,
// needed for pg_session_jwt state
key: SigningKey,
jti: u64,
}
impl<C: ClientInnerExt> Drop for ClientInner<C> {
fn drop(&mut self) {
// on client drop, tell the conn to shut down
self.cancel.cancel();
}
}
impl<C: ClientInnerExt> ClientInner<C> {
pub(crate) fn is_closed(&self) -> bool {
self.inner.is_closed()
}
}
impl ClientInner<tokio_postgres::Client> {
impl ClientInnerCommon<tokio_postgres::Client> {
pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), HttpConnError> {
self.jti += 1;
let token = resign_jwt(&self.key, payload, self.jti)?;
if let ClientDataEnum::Local(local_data) = &mut self.data {
local_data.jti += 1;
let token = resign_jwt(&local_data.key, payload, local_data.jti)?;
// initiates the auth session
self.inner.simple_query("discard all").await?;
self.inner
.query(
"select auth.jwt_session_init($1)",
&[&token as &(dyn ToSql + Sync)],
)
.await?;
// initiates the auth session
self.inner.simple_query("discard all").await?;
self.inner
.query(
"select auth.jwt_session_init($1)",
&[&token as &(dyn ToSql + Sync)],
)
.await?;
let pid = self.inner.get_process_id();
info!(pid, jti = self.jti, "user session state init");
Ok(())
}
}
pub(crate) struct LocalClient<C: ClientInnerExt> {
span: Span,
inner: Option<ClientInner<C>>,
conn_info: ConnInfo,
pool: Weak<LocalConnPool<C>>,
}
pub(crate) struct Discard<'a, C: ClientInnerExt> {
conn_info: &'a ConnInfo,
pool: &'a mut Weak<LocalConnPool<C>>,
}
impl<C: ClientInnerExt> LocalClient<C> {
pub(self) fn new(
inner: ClientInner<C>,
conn_info: ConnInfo,
pool: Weak<LocalConnPool<C>>,
) -> Self {
Self {
inner: Some(inner),
span: Span::current(),
conn_info,
pool,
let pid = self.inner.get_process_id();
info!(pid, jti = local_data.jti, "user session state init");
Ok(())
} else {
panic!("unexpected client data type");
}
}
pub(crate) fn client_inner(&mut self) -> (&mut ClientInner<C>, Discard<'_, C>) {
let Self {
inner,
pool,
conn_info,
span: _,
} = self;
let inner_m = inner.as_mut().expect("client inner should not be removed");
(inner_m, Discard { conn_info, pool })
}
pub(crate) fn inner(&mut self) -> (&mut C, Discard<'_, C>) {
let Self {
inner,
pool,
conn_info,
span: _,
} = self;
let inner = inner.as_mut().expect("client inner should not be removed");
(&mut inner.inner, Discard { conn_info, pool })
}
}
/// implements relatively efficient in-place json object key upserting
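The routine described by the doc comment above rewrites a key inside an already-serialized JSON object; the surrounding code relies on it when re-signing the JWT with a fresh `jti`. As a rough illustration only (the real code works on the payload bytes in place; this sketch reparses with `serde_json` and is not the actual implementation):

```rust
use serde_json::{Map, Value};

// Not the in-place version: parse, insert-or-overwrite the key, re-serialize.
fn upsert_key(payload: &[u8], key: &str, value: Value) -> serde_json::Result<Vec<u8>> {
    let mut obj: Map<String, Value> = serde_json::from_slice(payload)?;
    obj.insert(key.to_owned(), value); // replaces an existing entry, if any
    serde_json::to_vec(&obj)
}

// e.g. upsert_key(br#"{"sub":"user1"}"#, "jti", Value::from(42u64))
// yields {"sub":"user1","jti":42}.
```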
@@ -547,58 +365,6 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String {
jwt
}
impl<C: ClientInnerExt> LocalClient<C> {
pub(crate) fn metrics(&self) -> Arc<MetricCounter> {
let aux = &self.inner.as_ref().unwrap().aux;
USAGE_METRICS.register(Ids {
endpoint_id: aux.endpoint_id,
branch_id: aux.branch_id,
})
}
fn do_drop(&mut self) -> Option<impl FnOnce() + use<C>> {
let conn_info = self.conn_info.clone();
let client = self
.inner
.take()
.expect("client inner should not be removed");
if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() {
let current_span = self.span.clone();
// return connection to the pool
return Some(move || {
let _span = current_span.enter();
EndpointConnPool::put(&conn_pool.global_pool, &conn_info, client);
});
}
None
}
}
impl<C: ClientInnerExt> Drop for LocalClient<C> {
fn drop(&mut self) {
if let Some(drop) = self.do_drop() {
tokio::task::spawn_blocking(drop);
}
}
}
impl<C: ClientInnerExt> Discard<'_, C> {
pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) {
let conn_info = &self.conn_info;
if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 {
info!(
"local_pool: throwing away connection '{conn_info}' because connection is not idle"
);
}
}
pub(crate) fn discard(&mut self) {
let conn_info = &self.conn_info;
if std::mem::take(self.pool).strong_count() > 0 {
info!("local_pool: throwing away connection '{conn_info}' because connection is potentially in a broken state");
}
}
}
#[cfg(test)]
mod tests {
use p256::ecdsa::SigningKey;

View File

@@ -47,7 +47,7 @@ use crate::cancellation::CancellationHandlerMain;
use crate::config::{ProxyConfig, ProxyProtocolV2};
use crate::context::RequestMonitoring;
use crate::metrics::Metrics;
use crate::protocol2::{read_proxy_protocol, ChainRW, ConnectionInfo};
use crate::protocol2::{read_proxy_protocol, ChainRW, ConnectHeader, ConnectionInfo};
use crate::proxy::run_until_cancelled;
use crate::rate_limiter::EndpointRateLimiter;
use crate::serverless::backend::PoolingBackend;
@@ -251,16 +251,21 @@ async fn connection_startup(
};
let conn_info = match peer {
None if config.proxy_protocol_v2 == ProxyProtocolV2::Required => {
// our load balancers will not send any more data. let's just exit immediately
ConnectHeader::Local => {
tracing::debug!("healthcheck received");
return None;
}
ConnectHeader::Missing if config.proxy_protocol_v2 == ProxyProtocolV2::Required => {
tracing::warn!("missing required proxy protocol header");
return None;
}
Some(_) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => {
ConnectHeader::Proxy(_) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => {
tracing::warn!("proxy protocol header not supported");
return None;
}
Some(info) => info,
None => ConnectionInfo {
ConnectHeader::Proxy(info) => info,
ConnectHeader::Missing => ConnectionInfo {
addr: peer_addr,
extra: None,
},

View File

@@ -31,7 +31,6 @@ use super::conn_pool_lib::{self, ConnInfo};
use super::error::HttpCodeError;
use super::http_util::json_response;
use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError};
use super::local_conn_pool;
use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo};
use crate::auth::{endpoint_sni, ComputeUserInfoParseError};
use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig};
@@ -1052,12 +1051,12 @@ async fn query_to_json<T: GenericClient>(
enum Client {
Remote(conn_pool_lib::Client<tokio_postgres::Client>),
Local(local_conn_pool::LocalClient<tokio_postgres::Client>),
Local(conn_pool_lib::Client<tokio_postgres::Client>),
}
enum Discard<'a> {
Remote(conn_pool_lib::Discard<'a, tokio_postgres::Client>),
Local(local_conn_pool::Discard<'a, tokio_postgres::Client>),
Local(conn_pool_lib::Discard<'a, tokio_postgres::Client>),
}
impl Client {
@@ -1071,7 +1070,7 @@ impl Client {
fn inner(&mut self) -> (&mut tokio_postgres::Client, Discard<'_>) {
match self {
Client::Remote(client) => {
let (c, d) = client.inner_mut();
let (c, d) = client.inner();
(c, Discard::Remote(d))
}
Client::Local(local_client) => {

View File

@@ -64,24 +64,28 @@ macro_rules! smol_str_wrapper {
}
const POOLER_SUFFIX: &str = "-pooler";
pub(crate) const LOCAL_PROXY_SUFFIX: &str = "-local-proxy";
impl EndpointId {
#[must_use]
pub fn normalize(&self) -> Self {
fn normalize_str(&self) -> &str {
if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) {
stripped.into()
stripped
} else if let Some(stripped) = self.as_ref().strip_suffix(LOCAL_PROXY_SUFFIX) {
stripped
} else {
self.clone()
self
}
}
#[must_use]
pub fn normalize(&self) -> Self {
self.normalize_str().into()
}
#[must_use]
pub fn normalize_intern(&self) -> EndpointIdInt {
if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) {
EndpointIdTag::get_interner().get_or_intern(stripped)
} else {
self.into()
}
EndpointIdTag::get_interner().get_or_intern(self.normalize_str())
}
}
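For illustration, a hypothetical test (not part of this change) showing how the shared `normalize_str` above behaves for the two known suffixes and for a plain ID; both `normalize` and `normalize_intern` now route through it:

```rust
#[test]
fn normalize_strips_pooler_and_local_proxy_suffixes() {
    // Made-up endpoint IDs, purely to illustrate the suffix handling.
    for raw in [
        "ep-calm-sea-123456-pooler",
        "ep-calm-sea-123456-local-proxy",
        "ep-calm-sea-123456",
    ] {
        let normalized = EndpointId::from(raw).normalize();
        let normalized: &str = normalized.as_ref();
        assert_eq!(normalized, "ep-calm-sea-123456");
    }
}
```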
@@ -110,13 +114,4 @@ impl EndpointId {
pub(crate) fn is_branch(&self) -> bool {
self.0.starts_with("br-")
}
// pub(crate) fn is_project(&self) -> bool {
// !self.is_endpoint() && !self.is_branch()
// }
pub(crate) fn as_branch(&self) -> BranchId {
BranchId(self.0.clone())
}
pub(crate) fn as_project(&self) -> ProjectId {
ProjectId(self.0.clone())
}
}

View File

@@ -6,7 +6,7 @@ package-mode = false
[tool.poetry.dependencies]
python = "^3.9"
pytest = "^7.4.4"
psycopg2-binary = "^2.9.9"
psycopg2-binary = "^2.9.10"
typing-extensions = "^4.6.1"
PyJWT = {version = "^2.1.0", extras = ["crypto"]}
requests = "^2.32.3"
@@ -15,7 +15,7 @@ asyncpg = "^0.29.0"
aiopg = "^1.4.0"
Jinja2 = "^3.1.4"
types-requests = "^2.31.0.0"
types-psycopg2 = "^2.9.21.10"
types-psycopg2 = "^2.9.21.20241019"
boto3 = "^1.34.11"
boto3-stubs = {extras = ["s3"], version = "^1.26.16"}
moto = {extras = ["server"], version = "^5.0.6"}

View File

@@ -61,8 +61,14 @@ utils.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
criterion.workspace = true
itertools.workspace = true
walproposer.workspace = true
rand.workspace = true
desim.workspace = true
tracing.workspace = true
tracing-subscriber = { workspace = true, features = ["json"] }
[[bench]]
name = "receive_wal"
harness = false

View File

@@ -0,0 +1,22 @@
## Safekeeper Benchmarks
To run benchmarks:
```sh
# All benchmarks.
cargo bench --package safekeeper
# Specific file.
cargo bench --package safekeeper --bench receive_wal
# Specific benchmark.
cargo bench --package safekeeper --bench receive_wal process_msg/fsync=false
# List available benchmarks.
cargo bench --package safekeeper --benches -- --list
```
Additional charts and statistics are available in `target/criterion/report/index.html`.
Benchmarks are automatically compared against the previous run. To compare against other runs, see
`--baseline` and `--save-baseline`.

View File

@@ -0,0 +1,102 @@
use std::sync::Arc;
use camino_tempfile::Utf8TempDir;
use safekeeper::rate_limit::RateLimiter;
use safekeeper::safekeeper::{ProposerAcceptorMessage, ProposerElected, SafeKeeper, TermHistory};
use safekeeper::state::{TimelinePersistentState, TimelineState};
use safekeeper::timeline::{get_timeline_dir, SharedState, StateSK, Timeline};
use safekeeper::timelines_set::TimelinesSet;
use safekeeper::wal_backup::remote_timeline_path;
use safekeeper::{control_file, wal_storage, SafeKeeperConf};
use tokio::fs::create_dir_all;
use utils::id::{NodeId, TenantTimelineId};
use utils::lsn::Lsn;
/// A Safekeeper benchmarking environment. Uses a tempdir for storage, removed on drop.
pub struct Env {
/// Whether to enable fsync.
pub fsync: bool,
/// Benchmark directory. Deleted when dropped.
pub tempdir: Utf8TempDir,
}
impl Env {
/// Creates a new benchmarking environment in a temporary directory. fsync controls whether to
/// enable fsyncing.
pub fn new(fsync: bool) -> anyhow::Result<Self> {
let tempdir = camino_tempfile::tempdir()?;
Ok(Self { fsync, tempdir })
}
/// Constructs a Safekeeper config for the given node ID.
fn make_conf(&self, node_id: NodeId) -> SafeKeeperConf {
let mut conf = SafeKeeperConf::dummy();
conf.my_id = node_id;
conf.no_sync = !self.fsync;
conf.workdir = self.tempdir.path().join(format!("safekeeper-{node_id}"));
conf
}
/// Constructs a Safekeeper with the given node and tenant/timeline ID.
///
/// TODO: we should support using in-memory storage, to measure non-IO costs. This would be
/// easier if SafeKeeper used trait objects for storage rather than generics. It's also not
/// currently possible to construct a timeline using non-file storage since StateSK only accepts
/// SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>.
pub async fn make_safekeeper(
&self,
node_id: NodeId,
ttid: TenantTimelineId,
) -> anyhow::Result<SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>> {
let conf = self.make_conf(node_id);
let timeline_dir = get_timeline_dir(&conf, &ttid);
create_dir_all(&timeline_dir).await?;
let mut pstate = TimelinePersistentState::empty();
pstate.tenant_id = ttid.tenant_id;
pstate.timeline_id = ttid.timeline_id;
let wal = wal_storage::PhysicalStorage::new(&ttid, &timeline_dir, &pstate, conf.no_sync)?;
let ctrl =
control_file::FileStorage::create_new(&timeline_dir, pstate, conf.no_sync).await?;
let state = TimelineState::new(ctrl);
let mut safekeeper = SafeKeeper::new(state, wal, conf.my_id)?;
// Emulate an initial election.
safekeeper
.process_msg(&ProposerAcceptorMessage::Elected(ProposerElected {
term: 1,
start_streaming_at: Lsn(0),
term_history: TermHistory(vec![(1, Lsn(0)).into()]),
timeline_start_lsn: Lsn(0),
}))
.await?;
Ok(safekeeper)
}
/// Constructs a timeline, including a new Safekeeper with the given node ID, and spawns its
/// manager task.
pub async fn make_timeline(
&self,
node_id: NodeId,
ttid: TenantTimelineId,
) -> anyhow::Result<Arc<Timeline>> {
let conf = self.make_conf(node_id);
let timeline_dir = get_timeline_dir(&conf, &ttid);
let remote_path = remote_timeline_path(&ttid)?;
let safekeeper = self.make_safekeeper(node_id, ttid).await?;
let shared_state = SharedState::new(StateSK::Loaded(safekeeper));
let timeline = Timeline::new(ttid, &timeline_dir, &remote_path, shared_state);
timeline.bootstrap(
&mut timeline.write_shared_state().await,
&conf,
Arc::new(TimelinesSet::default()), // ignored for now
RateLimiter::new(0, 0),
);
Ok(timeline)
}
}

View File

@@ -0,0 +1,341 @@
//! WAL ingestion benchmarks.
#[path = "benchutils.rs"]
mod benchutils;
use std::io::Write as _;
use benchutils::Env;
use camino_tempfile::tempfile;
use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion};
use itertools::Itertools as _;
use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator};
use safekeeper::receive_wal::{self, WalAcceptor};
use safekeeper::safekeeper::{
AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage,
};
use tokio::io::AsyncWriteExt as _;
use utils::id::{NodeId, TenantTimelineId};
use utils::lsn::Lsn;
const KB: usize = 1024;
const MB: usize = 1024 * KB;
const GB: usize = 1024 * MB;
// Register benchmarks with Criterion.
criterion_group!(
benches,
bench_process_msg,
bench_wal_acceptor,
bench_wal_acceptor_throughput,
bench_file_write
);
criterion_main!(benches);
/// Benchmarks SafeKeeper::process_msg() as time per message and throughput. Each message is an
/// AppendRequest with a single WAL record containing an XlLogicalMessage of varying size. When
/// measuring throughput, only the logical message payload is considered, excluding
/// segment/page/record headers.
fn bench_process_msg(c: &mut Criterion) {
let mut g = c.benchmark_group("process_msg");
for fsync in [false, true] {
for commit in [false, true] {
for size in [8, KB, 8 * KB, 128 * KB, MB] {
// Kind of weird to change the group throughput per benchmark, but it's the only way
// to vary it per benchmark. It works.
g.throughput(criterion::Throughput::Bytes(size as u64));
g.bench_function(format!("fsync={fsync}/commit={commit}/size={size}"), |b| {
run_bench(b, size, fsync, commit).unwrap()
});
}
}
}
// The actual benchmark. If commit is true, advance the commit LSN on every message.
fn run_bench(b: &mut Bencher, size: usize, fsync: bool, commit: bool) -> anyhow::Result<()> {
let runtime = tokio::runtime::Builder::new_current_thread() // single is fine, sync IO only
.enable_all()
.build()?;
// Construct the payload. The prefix counts towards the payload (including NUL terminator).
let prefix = c"p";
let prefixlen = prefix.to_bytes_with_nul().len();
assert!(size >= prefixlen);
let message = vec![0; size - prefixlen];
let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message));
// Set up the Safekeeper.
let env = Env::new(fsync)?;
let mut safekeeper =
runtime.block_on(env.make_safekeeper(NodeId(1), TenantTimelineId::generate()))?;
b.iter_batched_ref(
// Pre-construct WAL records and requests. Criterion will batch them.
|| {
let (lsn, record) = walgen.next().expect("endless WAL");
ProposerAcceptorMessage::AppendRequest(AppendRequest {
h: AppendRequestHeader {
term: 1,
term_start_lsn: Lsn(0),
begin_lsn: lsn,
end_lsn: lsn + record.len() as u64,
commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record
truncate_lsn: Lsn(0),
proposer_uuid: [0; 16],
},
wal_data: record,
})
},
// Benchmark message processing (time per message).
|msg| {
runtime
.block_on(safekeeper.process_msg(msg))
.expect("message failed")
},
BatchSize::SmallInput, // automatically determine a batch size
);
Ok(())
}
}
/// Benchmarks WalAcceptor message processing time by sending it a batch of WAL records and waiting
/// for it to confirm that the last LSN has been flushed to storage. We pipeline a bunch of messages
/// instead of measuring each individual message to amortize costs (e.g. fsync), which is more
/// realistic. Records are XlLogicalMessage with a tiny payload (~64 bytes per record including
/// headers). Records are pre-constructed to avoid skewing the benchmark.
///
/// TODO: add benchmarks with in-memory storage, see comment on `Env::make_safekeeper()`.
fn bench_wal_acceptor(c: &mut Criterion) {
let mut g = c.benchmark_group("wal_acceptor");
for fsync in [false, true] {
for n in [1, 100, 10000] {
g.bench_function(format!("fsync={fsync}/n={n}"), |b| {
run_bench(b, n, fsync).unwrap()
});
}
}
/// The actual benchmark. n is the number of WAL records to send in a pipelined batch.
fn run_bench(b: &mut Bencher, n: usize, fsync: bool) -> anyhow::Result<()> {
let runtime = tokio::runtime::Runtime::new()?; // needs multithreaded
let env = Env::new(fsync)?;
let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(c"prefix", b"message"));
// Create buffered channels that can fit all requests, to avoid blocking on channels.
let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(n);
let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(n);
// Spawn the WalAcceptor task.
runtime.block_on(async {
// TODO: WalAcceptor doesn't actually need a full timeline, only
// Safekeeper::process_msg(). Consider decoupling them to simplify the setup.
let tli = env
.make_timeline(NodeId(1), TenantTimelineId::generate())
.await?
.wal_residence_guard()
.await?;
WalAcceptor::spawn(tli, msg_rx, reply_tx, Some(0));
anyhow::Ok(())
})?;
b.iter_batched(
// Pre-construct a batch of WAL records and requests.
|| {
walgen
.take(n)
.map(|(lsn, record)| AppendRequest {
h: AppendRequestHeader {
term: 1,
term_start_lsn: Lsn(0),
begin_lsn: lsn,
end_lsn: lsn + record.len() as u64,
commit_lsn: Lsn(0),
truncate_lsn: Lsn(0),
proposer_uuid: [0; 16],
},
wal_data: record,
})
.collect_vec()
},
// Benchmark batch ingestion (time per batch).
|reqs| {
runtime.block_on(async {
let final_lsn = reqs.last().unwrap().h.end_lsn;
// Stuff all the messages into the buffered channel to pipeline them.
for req in reqs {
let msg = ProposerAcceptorMessage::AppendRequest(req);
msg_tx.send(msg).await.expect("send failed");
}
// Wait for the last message to get flushed.
while let Some(reply) = reply_rx.recv().await {
if let AcceptorProposerMessage::AppendResponse(resp) = reply {
if resp.flush_lsn >= final_lsn {
return;
}
}
}
panic!("disconnected")
})
},
BatchSize::PerIteration, // only run one request batch at a time
);
Ok(())
}
}
/// Benchmarks WalAcceptor throughput by sending 1 GB of data with varying message sizes and waiting
/// for the last LSN to be flushed to storage. Only the actual message payload counts towards
/// throughput, headers are excluded and considered overhead. Records are XlLogicalMessage.
///
/// To avoid running out of memory, messages are constructed during the benchmark.
fn bench_wal_acceptor_throughput(c: &mut Criterion) {
const VOLUME: usize = GB; // NB: excludes message/page/segment headers and padding
let mut g = c.benchmark_group("wal_acceptor_throughput");
g.sample_size(10);
g.throughput(criterion::Throughput::Bytes(VOLUME as u64));
for fsync in [false, true] {
for commit in [false, true] {
for size in [KB, 8 * KB, 128 * KB, MB] {
assert_eq!(VOLUME % size, 0, "volume must be divisible by size");
let count = VOLUME / size;
g.bench_function(format!("fsync={fsync}/commit={commit}/size={size}"), |b| {
run_bench(b, count, size, fsync, commit).unwrap()
});
}
}
}
/// The actual benchmark. size is the payload size per message, count is the number of messages.
/// If commit is true, advance the commit LSN on each message.
fn run_bench(
b: &mut Bencher,
count: usize,
size: usize,
fsync: bool,
commit: bool,
) -> anyhow::Result<()> {
let runtime = tokio::runtime::Runtime::new()?; // needs multithreaded
// Construct the payload. The prefix counts towards the payload (including NUL terminator).
let prefix = c"p";
let prefixlen = prefix.to_bytes_with_nul().len();
assert!(size >= prefixlen);
let message = vec![0; size - prefixlen];
let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message));
// Construct and spawn the WalAcceptor task.
let env = Env::new(fsync)?;
let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE);
let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(receive_wal::REPLY_QUEUE_SIZE);
runtime.block_on(async {
let tli = env
.make_timeline(NodeId(1), TenantTimelineId::generate())
.await?
.wal_residence_guard()
.await?;
WalAcceptor::spawn(tli, msg_rx, reply_tx, Some(0));
anyhow::Ok(())
})?;
// Ingest the WAL.
b.iter(|| {
runtime.block_on(async {
let reqgen = walgen.take(count).map(|(lsn, record)| AppendRequest {
h: AppendRequestHeader {
term: 1,
term_start_lsn: Lsn(0),
begin_lsn: lsn,
end_lsn: lsn + record.len() as u64,
commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record
truncate_lsn: Lsn(0),
proposer_uuid: [0; 16],
},
wal_data: record,
});
// Send requests.
for req in reqgen {
_ = reply_rx.try_recv(); // discard any replies, to avoid blocking
let msg = ProposerAcceptorMessage::AppendRequest(req);
msg_tx.send(msg).await.expect("send failed");
}
// Wait for last message to get flushed.
while let Some(reply) = reply_rx.recv().await {
if let AcceptorProposerMessage::AppendResponse(resp) = reply {
if resp.flush_lsn >= walgen.lsn {
return;
}
}
}
panic!("disconnected")
})
});
Ok(())
}
}
/// Benchmarks OS write throughput by appending blocks of a given size to a file. This is intended
/// to compare Tokio and stdlib writes, and give a baseline for optimal WAL throughput.
fn bench_file_write(c: &mut Criterion) {
let mut g = c.benchmark_group("file_write");
for kind in ["stdlib", "tokio"] {
for fsync in [false, true] {
for size in [8, KB, 8 * KB, 128 * KB, MB] {
// Kind of weird to change the group throughput per benchmark, but it's the only way to
// vary it per benchmark. It works.
g.throughput(criterion::Throughput::Bytes(size as u64));
g.bench_function(
format!("{kind}/fsync={fsync}/size={size}"),
|b| match kind {
"stdlib" => run_bench_stdlib(b, size, fsync).unwrap(),
"tokio" => run_bench_tokio(b, size, fsync).unwrap(),
name => panic!("unknown kind {name}"),
},
);
}
}
}
fn run_bench_stdlib(b: &mut Bencher, size: usize, fsync: bool) -> anyhow::Result<()> {
let mut file = tempfile()?;
let buf = vec![0u8; size];
b.iter(|| {
file.write_all(&buf).unwrap();
file.flush().unwrap();
if fsync {
file.sync_data().unwrap();
}
});
Ok(())
}
fn run_bench_tokio(b: &mut Bencher, size: usize, fsync: bool) -> anyhow::Result<()> {
let runtime = tokio::runtime::Runtime::new()?; // needs multithreaded
let mut file = tokio::fs::File::from_std(tempfile()?);
let buf = vec![0u8; size];
b.iter(|| {
runtime.block_on(async {
file.write_all(&buf).await.unwrap();
file.flush().await.unwrap();
if fsync {
file.sync_data().await.unwrap();
}
})
});
Ok(())
}
}

View File

@@ -172,7 +172,7 @@ async fn copy_disk_segments(
) -> Result<()> {
let mut wal_reader = tli.get_walreader(start_lsn).await?;
let mut buf = [0u8; MAX_SEND_SIZE];
let mut buf = vec![0u8; MAX_SEND_SIZE];
let first_segment = start_lsn.segment_number(wal_seg_size);
let last_segment = end_lsn.segment_number(wal_seg_size);

View File

@@ -383,7 +383,7 @@ pub async fn calculate_digest(
let mut wal_reader = tli.get_walreader(request.from_lsn).await?;
let mut hasher = Sha256::new();
let mut buf = [0u8; MAX_SEND_SIZE];
let mut buf = vec![0u8; MAX_SEND_SIZE];
let mut bytes_left = (request.until_lsn.0 - request.from_lsn.0) as usize;
while bytes_left > 0 {

View File

@@ -112,9 +112,7 @@ impl SafeKeeperConf {
}
impl SafeKeeperConf {
#[cfg(test)]
#[allow(unused)]
fn dummy() -> Self {
pub fn dummy() -> Self {
SafeKeeperConf {
workdir: Utf8PathBuf::from("./"),
no_sync: false,

View File

@@ -5,23 +5,23 @@ use std::{
time::{Instant, SystemTime},
};
use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_FSYNC_SECONDS_BUCKETS};
use anyhow::Result;
use futures::Future;
use metrics::{
core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts},
pow2_buckets,
proto::MetricFamily,
register_histogram_vec, register_int_counter, register_int_counter_pair,
register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, Gauge,
HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, IntGaugeVec,
register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair,
register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, Gauge, GaugeVec,
Histogram, HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec,
IntGauge, IntGaugeVec, DISK_FSYNC_SECONDS_BUCKETS,
};
use once_cell::sync::Lazy;
use postgres_ffi::XLogSegNo;
use utils::pageserver_feedback::PageserverFeedback;
use utils::{id::TenantTimelineId, lsn::Lsn};
use utils::{id::TenantTimelineId, lsn::Lsn, pageserver_feedback::PageserverFeedback};
use crate::{
receive_wal::MSG_QUEUE_SIZE,
state::{TimelineMemState, TimelinePersistentState},
GlobalTimelines,
};
@@ -204,6 +204,44 @@ pub static WAL_BACKUP_TASKS: Lazy<IntCounterPair> = Lazy::new(|| {
)
.expect("Failed to register safekeeper_wal_backup_tasks_finished_total counter")
});
pub static WAL_RECEIVERS: Lazy<IntGauge> = Lazy::new(|| {
register_int_gauge!(
"safekeeper_wal_receivers",
"Number of currently connected WAL receivers (i.e. connected computes)"
)
.expect("Failed to register safekeeper_wal_receivers")
});
pub static WAL_RECEIVER_QUEUE_DEPTH: Lazy<Histogram> = Lazy::new(|| {
// Use powers of two buckets, but add a bucket at 0 and the max queue size to track empty and
// full queues respectively.
let mut buckets = pow2_buckets(1, MSG_QUEUE_SIZE);
buckets.insert(0, 0.0);
buckets.insert(buckets.len() - 1, (MSG_QUEUE_SIZE - 1) as f64);
assert!(buckets.len() <= 12, "too many histogram buckets");
register_histogram!(
"safekeeper_wal_receiver_queue_depth",
"Number of queued messages per WAL receiver (sampled every 5 seconds)",
buckets
)
.expect("Failed to register safekeeper_wal_receiver_queue_depth histogram")
});
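A worked example of the resulting bucket layout, assuming purely for illustration that `MSG_QUEUE_SIZE` were 256 and that `pow2_buckets(1, n)` returns the powers of two from 1 through n:

```rust
#[test]
fn queue_depth_bucket_layout() {
    // Assumed pow2_buckets(1, 256) output.
    let mut buckets = vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0];
    buckets.insert(0, 0.0); // dedicated bucket for an empty queue
    buckets.insert(buckets.len() - 1, 255.0); // so a full queue (256) stands alone in the last bucket
    assert_eq!(
        buckets,
        [0.0, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 255.0, 256.0]
    );
    assert!(buckets.len() <= 12); // matches the assertion above
}
```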
pub static WAL_RECEIVER_QUEUE_DEPTH_TOTAL: Lazy<IntGauge> = Lazy::new(|| {
register_int_gauge!(
"safekeeper_wal_receiver_queue_depth_total",
"Total number of queued messages across all WAL receivers",
)
.expect("Failed to register safekeeper_wal_receiver_queue_depth_total gauge")
});
// TODO: consider adding a per-receiver queue_size histogram. This will require wrapping the Tokio
// MPSC channel to update counters on send, receive, and drop, while forwarding all other methods.
pub static WAL_RECEIVER_QUEUE_SIZE_TOTAL: Lazy<IntGauge> = Lazy::new(|| {
register_int_gauge!(
"safekeeper_wal_receiver_queue_size_total",
"Total memory byte size of queued messages across all WAL receivers",
)
.expect("Failed to register safekeeper_wal_receiver_queue_size_total gauge")
});
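One possible shape for the channel wrapper mentioned in the TODO above, sketched against the gauges defined in this file; this is an assumption about how it could look, not part of the change:

```rust
use tokio::sync::mpsc;

/// Hypothetical sender wrapper: forwards sends to the inner channel and keeps
/// the queue gauges in sync. A matching receiver wrapper would decrement them
/// on recv (and drain the queue on drop), and could also feed a per-receiver
/// queue_size histogram.
struct MeteredSender<T> {
    inner: mpsc::Sender<T>,
}

impl<T> MeteredSender<T> {
    async fn send(&self, msg: T, size: usize) -> Result<(), mpsc::error::SendError<T>> {
        self.inner.send(msg).await?;
        WAL_RECEIVER_QUEUE_DEPTH_TOTAL.inc();
        WAL_RECEIVER_QUEUE_SIZE_TOTAL.add(size as i64);
        Ok(())
    }
}
```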
// Metrics collected on operations on the storage repository.
#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)]

View File

@@ -3,6 +3,10 @@
//! sends replies back.
use crate::handler::SafekeeperPostgresHandler;
use crate::metrics::{
WAL_RECEIVERS, WAL_RECEIVER_QUEUE_DEPTH, WAL_RECEIVER_QUEUE_DEPTH_TOTAL,
WAL_RECEIVER_QUEUE_SIZE_TOTAL,
};
use crate::safekeeper::AcceptorProposerMessage;
use crate::safekeeper::ProposerAcceptorMessage;
use crate::safekeeper::ServerInfo;
@@ -86,6 +90,7 @@ impl WalReceivers {
};
self.update_num(&shared);
WAL_RECEIVERS.inc();
WalReceiverGuard {
id: pos,
@@ -144,6 +149,7 @@ impl WalReceivers {
let mut shared = self.mutex.lock();
shared.slots[id] = None;
self.update_num(&shared);
WAL_RECEIVERS.dec();
}
/// Broadcast pageserver feedback to connected walproposers.
@@ -390,6 +396,7 @@ async fn read_network_loop<IO: AsyncRead + AsyncWrite + Unpin>(
loop {
let started = Instant::now();
let size = next_msg.size();
match msg_tx.send_timeout(next_msg, SLOW_THRESHOLD).await {
Ok(()) => {}
// Slow send, log a message and keep trying. Log context has timeline ID.
@@ -409,6 +416,11 @@ async fn read_network_loop<IO: AsyncRead + AsyncWrite + Unpin>(
// WalAcceptor terminated.
Err(SendTimeoutError::Closed(_)) => return Ok(()),
}
// Update metrics. Will be decremented in WalAcceptor.
WAL_RECEIVER_QUEUE_DEPTH_TOTAL.inc();
WAL_RECEIVER_QUEUE_SIZE_TOTAL.add(size as i64);
next_msg = read_message(pgb_reader).await?;
}
}
@@ -466,6 +478,12 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
/// walproposer, even when it's writing a steady stream of messages.
const FLUSH_INTERVAL: Duration = Duration::from_secs(1);
/// The metrics computation interval.
///
/// The Prometheus poll interval is 60 seconds at the time of writing. We sample the queue depth
/// every 5 seconds, for 12 samples per poll. This will give a count of up to 12x active timelines.
const METRICS_INTERVAL: Duration = Duration::from_secs(5);
/// Encapsulates a task which takes messages from msg_rx, processes and pushes
/// replies to reply_tx.
///
@@ -512,12 +530,15 @@ impl WalAcceptor {
async fn run(&mut self) -> anyhow::Result<()> {
let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id);
// Periodically flush the WAL.
// Periodically flush the WAL and compute metrics.
let mut flush_ticker = tokio::time::interval(FLUSH_INTERVAL);
flush_ticker.set_missed_tick_behavior(MissedTickBehavior::Delay);
flush_ticker.tick().await; // skip the initial, immediate tick
// Tracks unflushed appends.
let mut metrics_ticker = tokio::time::interval(METRICS_INTERVAL);
metrics_ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);
// Tracks whether we have unflushed appends.
let mut dirty = false;
loop {
@@ -529,6 +550,10 @@ impl WalAcceptor {
break;
};
// Update gauge metrics.
WAL_RECEIVER_QUEUE_DEPTH_TOTAL.dec();
WAL_RECEIVER_QUEUE_SIZE_TOTAL.sub(msg.size() as i64);
// Update walreceiver state in shmem for reporting.
if let ProposerAcceptorMessage::Elected(_) = &msg {
walreceiver_guard.get().status = WalReceiverStatus::Streaming;
@@ -565,6 +590,12 @@ impl WalAcceptor {
.process_msg(&ProposerAcceptorMessage::FlushWAL)
.await?
}
// Update histogram metrics periodically.
_ = metrics_ticker.tick() => {
WAL_RECEIVER_QUEUE_DEPTH.observe(self.msg_rx.len() as f64);
None // no reply
}
};
// Send reply, if any.
@@ -585,3 +616,14 @@ impl WalAcceptor {
Ok(())
}
}
/// On drop, drain msg_rx and update metrics to avoid leaks.
impl Drop for WalAcceptor {
fn drop(&mut self) {
self.msg_rx.close(); // prevent further sends
while let Ok(msg) = self.msg_rx.try_recv() {
WAL_RECEIVER_QUEUE_DEPTH_TOTAL.dec();
WAL_RECEIVER_QUEUE_SIZE_TOTAL.sub(msg.size() as i64);
}
}
}

View File

@@ -422,6 +422,70 @@ impl ProposerAcceptorMessage {
_ => bail!("unknown proposer-acceptor message tag: {}", tag),
}
}
/// The memory size of the message, including byte slices.
pub fn size(&self) -> usize {
const BASE_SIZE: usize = std::mem::size_of::<ProposerAcceptorMessage>();
// For most types, the size is just the base enum size including the nested structs. Some
// types also contain byte slices; add them.
//
// We explicitly list all fields, to draw attention here when new fields are added.
let mut size = BASE_SIZE;
size += match self {
Self::Greeting(ProposerGreeting {
protocol_version: _,
pg_version: _,
proposer_id: _,
system_id: _,
timeline_id: _,
tenant_id: _,
tli: _,
wal_seg_size: _,
}) => 0,
Self::VoteRequest(VoteRequest { term: _ }) => 0,
Self::Elected(ProposerElected {
term: _,
start_streaming_at: _,
term_history: _,
timeline_start_lsn: _,
}) => 0,
Self::AppendRequest(AppendRequest {
h:
AppendRequestHeader {
term: _,
term_start_lsn: _,
begin_lsn: _,
end_lsn: _,
commit_lsn: _,
truncate_lsn: _,
proposer_uuid: _,
},
wal_data,
}) => wal_data.len(),
Self::NoFlushAppendRequest(AppendRequest {
h:
AppendRequestHeader {
term: _,
term_start_lsn: _,
begin_lsn: _,
end_lsn: _,
commit_lsn: _,
truncate_lsn: _,
proposer_uuid: _,
},
wal_data,
}) => wal_data.len(),
Self::FlushWAL => 0,
};
size
}
}
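A small sketch of what the accounting above comes out to for a typical append request (hypothetical test, placeholder field values; the payload length dominates):

```rust
#[test]
fn size_counts_wal_payload() {
    use bytes::Bytes;
    use utils::lsn::Lsn;

    // An AppendRequest carrying an 8 KiB record is charged as the enum size
    // plus the payload; control messages like FlushWAL cost only the enum size.
    let msg = ProposerAcceptorMessage::AppendRequest(AppendRequest {
        h: AppendRequestHeader {
            term: 1,
            term_start_lsn: Lsn(0),
            begin_lsn: Lsn(0),
            end_lsn: Lsn(8192),
            commit_lsn: Lsn(0),
            truncate_lsn: Lsn(0),
            proposer_uuid: [0; 16],
        },
        wal_data: Bytes::from(vec![0u8; 8192]),
    });
    let base = std::mem::size_of::<ProposerAcceptorMessage>();
    assert_eq!(msg.size(), base + 8192);
    assert_eq!(ProposerAcceptorMessage::FlushWAL.size(), base);
}
```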
/// Acceptor -> Proposer messages

Some files were not shown because too many files have changed in this diff.