Compare commits

..

1 Commits

Author SHA1 Message Date
Christian Schwarz
4707d2df6d revert pageserver parts of "feat(walredo): use posix_spawn by moving close_fds() work to walredo C code (#6574)"
the addition of close_range() to the C code remains, it doesn't matter
for the purposes of this reproducer.

This reverts parts of commit 1be5e564ce.
2024-02-15 18:48:04 +00:00
123 changed files with 2556 additions and 6484 deletions

View File

@@ -59,7 +59,7 @@ runs:
BUCKET: neon-github-public-dev
# TODO: We can replace with a special docker image with Java and Allure pre-installed
- uses: actions/setup-java@v4
- uses: actions/setup-java@v3
with:
distribution: 'temurin'
java-version: '17'
@@ -76,8 +76,8 @@ runs:
rm -f ${ALLURE_ZIP}
fi
env:
ALLURE_VERSION: 2.27.0
ALLURE_ZIP_SHA256: b071858fb2fa542c65d8f152c5c40d26267b2dfb74df1f1608a589ecca38e777
ALLURE_VERSION: 2.24.0
ALLURE_ZIP_SHA256: 60b1d6ce65d9ef24b23cf9c2c19fd736a123487c38e54759f1ed1a7a77353c90
# Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
- name: Acquire lock
@@ -180,7 +180,7 @@ runs:
fi
- name: Cache poetry deps
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
@@ -215,7 +215,7 @@ runs:
rm -rf ${WORKDIR}
fi
- uses: actions/github-script@v7
- uses: actions/github-script@v6
if: always()
env:
REPORT_URL: ${{ steps.generate-report.outputs.report-url }}

View File

@@ -80,13 +80,13 @@ runs:
- name: Checkout
if: inputs.needs_postgres_source == 'true'
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
- name: Cache poetry deps
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}

View File

@@ -64,7 +64,7 @@ jobs:
steps:
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
- uses: actions/checkout@v4
- uses: actions/checkout@v3
with:
ref: main
token: ${{ secrets.CI_ACCESS_TOKEN }}

View File

@@ -66,7 +66,7 @@ jobs:
options: --init
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Download Neon artifact
uses: ./.github/actions/download
@@ -221,7 +221,7 @@ jobs:
timeout-minutes: 480
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Download Neon artifact
uses: ./.github/actions/download
@@ -366,7 +366,7 @@ jobs:
options: --init
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Download Neon artifact
uses: ./.github/actions/download
@@ -465,7 +465,7 @@ jobs:
options: --init
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Download Neon artifact
uses: ./.github/actions/download
@@ -562,7 +562,7 @@ jobs:
options: --init
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Download Neon artifact
uses: ./.github/actions/download

View File

@@ -69,7 +69,7 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
fetch-depth: 0
@@ -106,13 +106,13 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
submodules: false
fetch-depth: 1
- name: Cache poetry deps
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
@@ -138,7 +138,7 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
@@ -146,7 +146,7 @@ jobs:
# Disabled for now
# - name: Restore cargo deps cache
# id: cache_cargo
# uses: actions/cache@v4
# uses: actions/cache@v3
# with:
# path: |
# !~/.cargo/registry/src
@@ -231,7 +231,7 @@ jobs:
done
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
@@ -303,7 +303,7 @@ jobs:
# compressed crates.
# - name: Cache cargo deps
# id: cache_cargo
# uses: actions/cache@v4
# uses: actions/cache@v3
# with:
# path: |
# ~/.cargo/registry/
@@ -317,21 +317,21 @@ jobs:
- name: Cache postgres v14 build
id: cache_pg_14
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v14
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v15 build
id: cache_pg_15
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v16 build
id: cache_pg_16
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
@@ -451,7 +451,7 @@ jobs:
pg_version: [ v14, v15, v16 ]
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
@@ -472,14 +472,9 @@ jobs:
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
PAGESERVER_GET_VECTORED_IMPL: vectored
# Temporary disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540
- name: Merge and upload coverage data
if: |
false &&
matrix.build_type == 'debug' && matrix.pg_version == 'v14'
if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
uses: ./.github/actions/save-coverage-data
get-benchmarks-durations:
@@ -493,10 +488,10 @@ jobs:
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Cache poetry deps
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
@@ -530,7 +525,7 @@ jobs:
build_type: [ release ]
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Pytest benchmarks
uses: ./.github/actions/run-python-test-set
@@ -559,7 +554,7 @@ jobs:
options: --init
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Create Allure report
if: ${{ !cancelled() }}
@@ -570,7 +565,7 @@ jobs:
env:
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
- uses: actions/github-script@v7
- uses: actions/github-script@v6
if: ${{ !cancelled() }}
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
@@ -610,7 +605,7 @@ jobs:
coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }}
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
@@ -679,7 +674,7 @@ jobs:
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/summary.json
echo "summary-json=${REPORT_URL}" >> $GITHUB_OUTPUT
- uses: actions/github-script@v7
- uses: actions/github-script@v6
env:
REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
@@ -905,7 +900,7 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
fetch-depth: 0
@@ -1119,7 +1114,7 @@ jobs:
done
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
submodules: false
fetch-depth: 0
@@ -1142,7 +1137,7 @@ jobs:
- name: Create git tag
if: github.ref_name == 'release'
uses: actions/github-script@v7
uses: actions/github-script@v6
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
@@ -1156,7 +1151,7 @@ jobs:
- name: Create GitHub release
if: github.ref_name == 'release'
uses: actions/github-script@v7
uses: actions/github-script@v6
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5

View File

@@ -57,21 +57,21 @@ jobs:
- name: Cache postgres v14 build
id: cache_pg_14
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v14
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v15 build
id: cache_pg_15
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v16 build
id: cache_pg_16
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
@@ -82,7 +82,7 @@ jobs:
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
- name: Cache cargo deps
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: |
~/.cargo/registry
@@ -172,21 +172,21 @@ jobs:
- name: Cache postgres v14 build
id: cache_pg_14
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v14
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v15 build
id: cache_pg_15
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v16 build
id: cache_pg_16
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
@@ -356,7 +356,7 @@ jobs:
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
- name: Publish build stats report
uses: actions/github-script@v7
uses: actions/github-script@v6
env:
REPORT_URL: ${{ steps.upload-stats.outputs.report-url }}
SHA: ${{ github.event.pull_request.head.sha || github.sha }}

View File

@@ -28,7 +28,7 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
@@ -38,7 +38,7 @@ jobs:
uses: snok/install-poetry@v1
- name: Cache poetry deps
uses: actions/cache@v4
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }}
@@ -82,7 +82,7 @@ jobs:
# It will be fixed after switching to gen2 runner
- name: Upload python test logs
if: always()
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
retention-days: 7
name: python-test-pg_clients-${{ runner.os }}-stage-logs

View File

@@ -9,7 +9,7 @@ on:
defaults:
run:
shell: bash -euxo pipefail {0}
env:
# A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
@@ -37,7 +37,7 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
fetch-depth: 0
@@ -115,3 +115,4 @@ jobs:
\"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
}
}"

28
Cargo.lock generated
View File

@@ -286,7 +286,6 @@ dependencies = [
"git-version",
"hyper",
"metrics",
"once_cell",
"pageserver_api",
"pageserver_client",
"postgres_connection",
@@ -1157,6 +1156,16 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"
[[package]]
name = "close_fds"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3bc416f33de9d59e79e57560f450d21ff8393adcf1cdfc3e6d8fb93d5f88a2ed"
dependencies = [
"cfg-if",
"libc",
]
[[package]]
name = "colorchoice"
version = "1.0.0"
@@ -1814,7 +1823,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e875f1719c16de097dee81ed675e2d9bb63096823ed3f0ca827b7dea3028bbbb"
dependencies = [
"enumset_derive",
"serde",
]
[[package]]
@@ -2759,17 +2767,6 @@ version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]]
name = "leaky-bucket"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8eb491abd89e9794d50f93c8db610a29509123e3fbbc9c8c67a528e9391cd853"
dependencies = [
"parking_lot 0.12.1",
"tokio",
"tracing",
]
[[package]]
name = "libc"
version = "0.2.150"
@@ -3461,7 +3458,6 @@ name = "pageserver"
version = "0.1.0"
dependencies = [
"anyhow",
"arc-swap",
"async-compression",
"async-stream",
"async-trait",
@@ -3471,6 +3467,7 @@ dependencies = [
"camino-tempfile",
"chrono",
"clap",
"close_fds",
"const_format",
"consumption_metrics",
"crc32c",
@@ -3489,7 +3486,6 @@ dependencies = [
"humantime-serde",
"hyper",
"itertools",
"leaky-bucket",
"md5",
"metrics",
"nix 0.27.1",
@@ -3552,7 +3548,6 @@ dependencies = [
"enum-map",
"hex",
"humantime-serde",
"itertools",
"postgres_ffi",
"rand 0.8.5",
"serde",
@@ -6363,7 +6358,6 @@ dependencies = [
"hex-literal",
"hyper",
"jsonwebtoken",
"leaky-bucket",
"metrics",
"nix 0.27.1",
"once_cell",

View File

@@ -66,6 +66,7 @@ camino = "1.1.6"
cfg-if = "1.0.0"
chrono = { version = "0.4", default-features = false, features = ["clock"] }
clap = { version = "4.0", features = ["derive"] }
close_fds = "0.3.2"
comfy-table = "6.1"
const_format = "0.2"
crc32c = "0.6"
@@ -97,7 +98,6 @@ ipnet = "2.9.0"
itertools = "0.10"
jsonwebtoken = "9"
lasso = "0.7"
leaky-bucket = "1.0.1"
libc = "0.2"
md5 = "0.7.0"
memoffset = "0.8"

View File

@@ -47,7 +47,7 @@ COPY --chown=nonroot . .
# Show build caching stats to check if it was used in the end.
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
RUN set -e \
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
&& mold -run cargo build \
--bin pg_sni_router \
--bin pageserver \
--bin pagectl \

View File

@@ -769,24 +769,6 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install
#########################################################################################
#
# Layer "pg_ivm"
# compile pg_ivm extension
#
#########################################################################################
FROM build-deps AS pg-ivm-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
mkdir pg_ivm-src && cd pg_ivm-src && tar xvzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control
#########################################################################################
#
# Layer "neon-pg-ext-build"
@@ -828,7 +810,6 @@ COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \

View File

@@ -159,8 +159,8 @@ neon-pg-ext-%: postgres-%
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
.PHONY: neon-pg-clean-ext-%
neon-pg-clean-ext-%:
.PHONY: neon-pg-ext-clean-%
neon-pg-ext-clean-%:
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
@@ -216,11 +216,11 @@ neon-pg-ext: \
neon-pg-ext-v15 \
neon-pg-ext-v16
.PHONY: neon-pg-clean-ext
neon-pg-clean-ext: \
neon-pg-clean-ext-v14 \
neon-pg-clean-ext-v15 \
neon-pg-clean-ext-v16
.PHONY: neon-pg-ext-clean
neon-pg-ext-clean: \
neon-pg-ext-clean-v14 \
neon-pg-ext-clean-v15 \
neon-pg-ext-clean-v16
# shorthand to build all Postgres versions
.PHONY: postgres
@@ -249,7 +249,7 @@ postgres-check: \
# This doesn't remove the effects of 'configure'.
.PHONY: clean
clean: postgres-clean neon-pg-clean-ext
clean: postgres-clean neon-pg-ext-clean
$(CARGO_CMD_PREFIX) cargo clean
# This removes everything

View File

@@ -249,16 +249,6 @@ testing locally, it is convenient to run just one set of permutations, like this
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
```
## Flamegraphs
You may find yourself in need of flamegraphs for software in this repository.
You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or the original [`flamegraph.pl`](https://github.com/brendangregg/FlameGraph). Your choice!
>[!IMPORTANT]
> If you're using `lld` or `mold`, you need the `--no-rosegment` linker argument.
> It's a [general thing with Rust / lld / mold](https://crbug.com/919499#c16), not specific to this repository.
> See [this PR for further instructions](https://github.com/neondatabase/neon/pull/6764).
## Documentation
[docs](/docs) Contains a top-level overview of all available markdown documentation.

View File

@@ -45,6 +45,7 @@ use std::{thread, time::Duration};
use anyhow::{Context, Result};
use chrono::Utc;
use clap::Arg;
use nix::sys::signal::{kill, Signal};
use signal_hook::consts::{SIGQUIT, SIGTERM};
use signal_hook::{consts::SIGINT, iterator::Signals};
use tracing::{error, info};
@@ -52,9 +53,7 @@ use url::Url;
use compute_api::responses::ComputeStatus;
use compute_tools::compute::{
forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
};
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID};
use compute_tools::configurator::launch_configurator;
use compute_tools::extension_server::get_pg_version;
use compute_tools::http::api::launch_http_server;
@@ -395,15 +394,6 @@ fn main() -> Result<()> {
info!("synced safekeepers at lsn {lsn}");
}
let mut state = compute.state.lock().unwrap();
if state.status == ComputeStatus::TerminationPending {
state.status = ComputeStatus::Terminated;
compute.state_changed.notify_all();
// we were asked to terminate gracefully, don't exit to avoid restart
delay_exit = true
}
drop(state);
if let Err(err) = compute.check_for_core_dumps() {
error!("error while checking for core dumps: {err:?}");
}
@@ -533,7 +523,16 @@ fn cli() -> clap::Command {
/// wait for termination which would be easy then.
fn handle_exit_signal(sig: i32) {
info!("received {sig} termination signal");
forward_termination_signal();
let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
if ss_pid != 0 {
let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
kill(ss_pid, Signal::SIGTERM).ok();
}
let pg_pid = PG_PID.load(Ordering::SeqCst);
if pg_pid != 0 {
let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
kill(pg_pid, Signal::SIGTERM).ok();
}
exit(1);
}

View File

@@ -28,8 +28,6 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
use utils::measured_stream::MeasuredReader;
use nix::sys::signal::{kill, Signal};
use remote_storage::{DownloadError, RemotePath};
use crate::checker::create_availability_check_data;
@@ -326,8 +324,7 @@ impl ComputeNode {
let spec = compute_state.pspec.as_ref().expect("spec must be set");
let start_time = Instant::now();
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
let mut config = postgres::Config::from_str(shard0_connstr)?;
let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?;
// Use the storage auth token from the config file, if given.
// Note: this overrides any password set in the connection string.
@@ -1324,17 +1321,3 @@ LIMIT 100",
Ok(remote_ext_metrics)
}
}
pub fn forward_termination_signal() {
let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
if ss_pid != 0 {
let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
kill(ss_pid, Signal::SIGTERM).ok();
}
let pg_pid = PG_PID.load(Ordering::SeqCst);
if pg_pid != 0 {
let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
// use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html
kill(pg_pid, Signal::SIGQUIT).ok();
}
}

View File

@@ -51,9 +51,6 @@ pub fn write_postgres_conf(
if let Some(s) = &spec.pageserver_connstring {
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
}
if let Some(stripe_size) = spec.shard_stripe_size {
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
if !spec.safekeeper_connstrings.is_empty() {
writeln!(
file,

View File

@@ -5,7 +5,6 @@ use std::net::SocketAddr;
use std::sync::Arc;
use std::thread;
use crate::compute::forward_termination_signal;
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
use compute_api::requests::ConfigurationRequest;
use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
@@ -124,17 +123,6 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
}
}
(&Method::POST, "/terminate") => {
info!("serving /terminate POST request");
match handle_terminate_request(compute).await {
Ok(()) => Response::new(Body::empty()),
Err((msg, code)) => {
error!("error handling /terminate request: {msg}");
render_json_error(&msg, code)
}
}
}
// download extension files from remote extension storage on demand
(&Method::POST, route) if route.starts_with("/extension_server/") => {
info!("serving {:?} POST request", route);
@@ -309,49 +297,6 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
.unwrap()
}
async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
{
let mut state = compute.state.lock().unwrap();
if state.status == ComputeStatus::Terminated {
return Ok(());
}
if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for termination request: {:?}",
state.status.clone()
);
return Err((msg, StatusCode::PRECONDITION_FAILED));
}
state.status = ComputeStatus::TerminationPending;
compute.state_changed.notify_all();
drop(state);
}
forward_termination_signal();
info!("sent signal and notified waiters");
// Spawn a blocking thread to wait for compute to become Terminated.
// This is needed to do not block the main pool of workers and
// be able to serve other requests while some particular request
// is waiting for compute to finish configuration.
let c = compute.clone();
task::spawn_blocking(move || {
let mut state = c.state.lock().unwrap();
while state.status != ComputeStatus::Terminated {
state = c.state_changed.wait(state).unwrap();
info!(
"waiting for compute to become Terminated, current status: {:?}",
state.status
);
}
Ok(())
})
.await
.unwrap()?;
info!("terminated Postgres");
Ok(())
}
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
#[tokio::main]
async fn serve(port: u16, state: Arc<ComputeNode>) {

View File

@@ -168,29 +168,6 @@ paths:
schema:
$ref: "#/components/schemas/GenericError"
/terminate:
post:
tags:
- Terminate
summary: Terminate Postgres and wait for it to exit
description: ""
operationId: terminate
responses:
200:
description: Result
412:
description: "wrong state"
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
500:
description: "Unexpected error"
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
components:
securitySchemes:
JWT:

View File

@@ -4,11 +4,6 @@ version = "0.1.0"
edition.workspace = true
license.workspace = true
[features]
default = []
# Enables test-only APIs and behaviors
testing = []
[dependencies]
anyhow.workspace = true
aws-config.workspace = true
@@ -18,7 +13,6 @@ clap.workspace = true
futures.workspace = true
git-version.workspace = true
hyper.workspace = true
once_cell.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
postgres_connection.workspace = true

View File

@@ -3,7 +3,7 @@ use std::{collections::HashMap, time::Duration};
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
use control_plane::local_env::LocalEnv;
use hyper::{Method, StatusCode};
use pageserver_api::shard::{ShardIndex, ShardNumber, TenantShardId};
use pageserver_api::shard::{ShardCount, ShardIndex, ShardNumber, TenantShardId};
use postgres_connection::parse_host_port;
use serde::{Deserialize, Serialize};
use tokio_util::sync::CancellationToken;
@@ -77,7 +77,7 @@ impl ComputeHookTenant {
self.shards
.sort_by_key(|(shard, _node_id)| shard.shard_number);
if self.shards.len() == shard_count.count() as usize || shard_count.is_unsharded() {
if self.shards.len() == shard_count.0 as usize || shard_count == ShardCount(0) {
// We have pageservers for all the shards: emit a configuration update
return Some(ComputeHookNotifyRequest {
tenant_id,
@@ -94,7 +94,7 @@ impl ComputeHookTenant {
tracing::info!(
"ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
self.shards.len(),
shard_count.count()
shard_count.0
);
}
@@ -155,7 +155,7 @@ impl ComputeHook {
for (endpoint_name, endpoint) in &cplane.endpoints {
if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
tracing::info!("🔁 Reconfiguring endpoint {}", endpoint_name,);
endpoint.reconfigure(compute_pageservers.clone()).await?;
}
}
@@ -177,7 +177,7 @@ impl ComputeHook {
req
};
tracing::info!(
tracing::debug!(
"Sending notify request to {} ({:?})",
url,
reconfigure_request
@@ -266,7 +266,7 @@ impl ComputeHook {
/// periods, but we don't retry forever. The **caller** is responsible for handling failures and
/// ensuring that they eventually call again to ensure that the compute is eventually notified of
/// the proper pageserver nodes for a tenant.
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))]
#[tracing::instrument(skip_all, fields(tenant_shard_id, node_id))]
pub(super) async fn notify(
&self,
tenant_shard_id: TenantShardId,
@@ -298,7 +298,7 @@ impl ComputeHook {
let Some(reconfigure_request) = reconfigure_request else {
// The tenant doesn't yet have pageservers for all its shards: we won't notify anything
// until it does.
tracing::info!("Tenant isn't yet ready to emit a notification");
tracing::debug!("Tenant isn't yet ready to emit a notification",);
return Ok(());
};

View File

@@ -114,10 +114,7 @@ async fn handle_tenant_create(
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
json_response(
StatusCode::CREATED,
service.tenant_create(create_req).await?,
)
json_response(StatusCode::OK, service.tenant_create(create_req).await?)
}
// For tenant and timeline deletions, which both implement an "initially return 202, then 404 once
@@ -199,7 +196,7 @@ async fn handle_tenant_timeline_create(
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
json_response(
StatusCode::CREATED,
StatusCode::OK,
service
.tenant_timeline_create(tenant_id, create_req)
.await?,
@@ -336,22 +333,6 @@ async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiErr
json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
}
async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let state = get_state(&req);
state.service.tenants_dump()
}
async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let state = get_state(&req);
state.service.scheduler_dump()
}
async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let state = get_state(&req);
json_response(StatusCode::OK, state.service.consistency_check().await?)
}
/// Status endpoint is just used for checking that our HTTP listener is up
async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
json_response(StatusCode::OK, ())
@@ -440,13 +421,6 @@ pub fn make_router(
.post("/debug/v1/node/:node_id/drop", |r| {
request_span(r, handle_node_drop)
})
.get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump))
.get("/debug/v1/scheduler", |r| {
request_span(r, handle_scheduler_dump)
})
.post("/debug/v1/consistency_check", |r| {
request_span(r, handle_consistency_check)
})
.get("/control/v1/tenant/:tenant_id/locate", |r| {
tenant_service_handler(r, handle_tenant_locate)
})

View File

@@ -3,7 +3,6 @@ use utils::seqwait::MonotonicCounter;
mod compute_hook;
pub mod http;
pub mod metrics;
mod node;
pub mod persistence;
mod reconciler;
@@ -12,7 +11,7 @@ mod schema;
pub mod service;
mod tenant_state;
#[derive(Clone, Serialize, Deserialize, Debug)]
#[derive(Clone, Serialize, Deserialize)]
enum PlacementPolicy {
/// Cheapest way to attach a tenant: just one pageserver, no secondary
Single,
@@ -23,7 +22,7 @@ enum PlacementPolicy {
Detached,
}
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)]
struct Sequence(u64);
impl Sequence {
@@ -38,12 +37,6 @@ impl std::fmt::Display for Sequence {
}
}
impl std::fmt::Debug for Sequence {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
impl MonotonicCounter<Sequence> for Sequence {
fn cnt_advance(&mut self, v: Sequence) {
assert!(*self <= v);

View File

@@ -6,7 +6,6 @@
///
use anyhow::{anyhow, Context};
use attachment_service::http::make_router;
use attachment_service::metrics::preinitialize_metrics;
use attachment_service::persistence::Persistence;
use attachment_service::service::{Config, Service};
use aws_config::{self, BehaviorVersion, Region};
@@ -16,7 +15,6 @@ use diesel::Connection;
use metrics::launch_timestamp::LaunchTimestamp;
use std::sync::Arc;
use tokio::signal::unix::SignalKind;
use tokio_util::sync::CancellationToken;
use utils::auth::{JwtAuth, SwappableJwtAuth};
use utils::logging::{self, LogFormat};
@@ -206,8 +204,6 @@ async fn async_main() -> anyhow::Result<()> {
logging::Output::Stdout,
)?;
preinitialize_metrics();
let args = Cli::parse();
tracing::info!(
"version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}",
@@ -241,23 +237,15 @@ async fn async_main() -> anyhow::Result<()> {
let auth = secrets
.public_key
.map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth)));
let router = make_router(service.clone(), auth)
let router = make_router(service, auth)
.build()
.map_err(|err| anyhow!(err))?;
let router_service = utils::http::RouterService::new(router).unwrap();
let server = hyper::Server::from_tcp(http_listener)?.serve(router_service);
// Start HTTP server
let server_shutdown = CancellationToken::new();
let server = hyper::Server::from_tcp(http_listener)?
.serve(router_service)
.with_graceful_shutdown({
let server_shutdown = server_shutdown.clone();
async move {
server_shutdown.cancelled().await;
}
});
tracing::info!("Serving on {0}", args.listen);
let server_task = tokio::task::spawn(server);
tokio::task::spawn(server);
// Wait until we receive a signal
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
@@ -278,16 +266,5 @@ async fn async_main() -> anyhow::Result<()> {
}
}
// Stop HTTP server first, so that we don't have to service requests
// while shutting down Service
server_shutdown.cancel();
if let Err(e) = server_task.await {
tracing::error!("Error joining HTTP server task: {e}")
}
tracing::info!("Joined HTTP server task");
service.shutdown().await;
tracing::info!("Service shutdown complete");
std::process::exit(0);
}

View File

@@ -1,32 +0,0 @@
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
pub(crate) struct ReconcilerMetrics {
pub(crate) spawned: IntCounter,
pub(crate) complete: IntCounterVec,
}
impl ReconcilerMetrics {
// Labels used on [`Self::complete`]
pub(crate) const SUCCESS: &'static str = "ok";
pub(crate) const ERROR: &'static str = "success";
pub(crate) const CANCEL: &'static str = "cancel";
}
pub(crate) static RECONCILER: Lazy<ReconcilerMetrics> = Lazy::new(|| ReconcilerMetrics {
spawned: register_int_counter!(
"storage_controller_reconcile_spawn",
"Count of how many times we spawn a reconcile task",
)
.expect("failed to define a metric"),
complete: register_int_counter_vec!(
"storage_controller_reconcile_complete",
"Reconciler tasks completed, broken down by success/failure/cancelled",
&["status"],
)
.expect("failed to define a metric"),
});
pub fn preinitialize_metrics() {
Lazy::force(&RECONCILER);
}

View File

@@ -1,16 +1,9 @@
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
use serde::Serialize;
use utils::id::NodeId;
use crate::persistence::NodePersistence;
/// Represents the in-memory description of a Node.
///
/// Scheduling statistics are maintened separately in [`crate::scheduler`].
///
/// The persistent subset of the Node is defined in [`crate::persistence::NodePersistence`]: the
/// implementation of serialization on this type is only for debug dumps.
#[derive(Clone, Serialize, Eq, PartialEq)]
#[derive(Clone)]
pub(crate) struct Node {
pub(crate) id: NodeId,

View File

@@ -222,7 +222,7 @@ impl Persistence {
let tenant_shard_id = TenantShardId {
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
shard_number: ShardNumber(tsp.shard_number as u8),
shard_count: ShardCount::new(tsp.shard_count as u8),
shard_count: ShardCount(tsp.shard_count as u8),
};
tenants_map.insert(tenant_shard_id, tsp);
@@ -318,7 +318,7 @@ impl Persistence {
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())
.map_err(|e| DatabaseError::Logical(format!("Malformed tenant id: {e}")))?,
shard_number: ShardNumber(tsp.shard_number as u8),
shard_count: ShardCount::new(tsp.shard_count as u8),
shard_count: ShardCount(tsp.shard_count as u8),
};
result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
}
@@ -340,7 +340,7 @@ impl Persistence {
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
.set((
generation.eq(generation + 1),
generation_pageserver.eq(node_id.0 as i64),
@@ -362,7 +362,7 @@ impl Persistence {
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
.set((
generation_pageserver.eq(i64::MAX),
placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
@@ -392,19 +392,21 @@ impl Persistence {
conn.transaction(|conn| -> DatabaseResult<()> {
// Mark parent shards as splitting
let expect_parent_records = std::cmp::max(1, old_shard_count.0);
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(old_shard_count.literal() as i32))
.filter(shard_count.eq(old_shard_count.0 as i32))
.set((splitting.eq(1),))
.execute(conn)?;
if u8::try_from(updated)
.map_err(|_| DatabaseError::Logical(
format!("Overflow existing shard count {} while splitting", updated))
)? != old_shard_count.count() {
)? != expect_parent_records {
// Perhaps a deletion or another split raced with this attempt to split, mutating
// the parent shards that we intend to split. In this case the split request should fail.
return Err(DatabaseError::Logical(
format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count())
format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {expect_parent_records})")
));
}
@@ -416,7 +418,7 @@ impl Persistence {
let mut parent = crate::schema::tenant_shards::table
.filter(tenant_id.eq(parent_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(parent_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32))
.filter(shard_count.eq(parent_shard_id.shard_count.0 as i32))
.load::<TenantShardPersistence>(conn)?;
let parent = if parent.len() != 1 {
return Err(DatabaseError::Logical(format!(
@@ -457,7 +459,7 @@ impl Persistence {
// Drop parent shards
diesel::delete(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(old_shard_count.literal() as i32))
.filter(shard_count.eq(old_shard_count.0 as i32))
.execute(conn)?;
// Clear sharding flag
@@ -477,7 +479,7 @@ impl Persistence {
}
/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone)]
#[diesel(table_name = crate::schema::tenant_shards)]
pub(crate) struct TenantShardPersistence {
#[serde(default)]

View File

@@ -13,7 +13,6 @@ use tokio_util::sync::CancellationToken;
use utils::generation::Generation;
use utils::id::{NodeId, TimelineId};
use utils::lsn::Lsn;
use utils::sync::gate::GateGuard;
use crate::compute_hook::{ComputeHook, NotifyError};
use crate::node::Node;
@@ -27,7 +26,7 @@ pub(super) struct Reconciler {
pub(super) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity,
pub(crate) generation: Generation,
pub(crate) intent: TargetState,
pub(crate) intent: IntentState,
pub(crate) config: TenantConfig,
pub(crate) observed: ObservedState,
@@ -54,46 +53,14 @@ pub(super) struct Reconciler {
/// the tenant is changed.
pub(crate) cancel: CancellationToken,
/// Reconcilers are registered with a Gate so that during a graceful shutdown we
/// can wait for all the reconcilers to respond to their cancellation tokens.
pub(crate) _gate_guard: GateGuard,
/// Access to persistent storage for updating generation numbers
pub(crate) persistence: Arc<Persistence>,
}
/// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any
/// reference counting for Scheduler. The IntentState is what the scheduler works with,
/// and the TargetState is just the instruction for a particular Reconciler run.
#[derive(Debug)]
pub(crate) struct TargetState {
pub(crate) attached: Option<NodeId>,
pub(crate) secondary: Vec<NodeId>,
}
impl TargetState {
pub(crate) fn from_intent(intent: &IntentState) -> Self {
Self {
attached: *intent.get_attached(),
secondary: intent.get_secondary().clone(),
}
}
fn all_pageservers(&self) -> Vec<NodeId> {
let mut result = self.secondary.clone();
if let Some(node_id) = &self.attached {
result.push(*node_id);
}
result
}
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum ReconcileError {
#[error(transparent)]
Notify(#[from] NotifyError),
#[error("Cancelled")]
Cancel,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
@@ -296,7 +263,7 @@ impl Reconciler {
secondary_conf,
tenant_conf: config.clone(),
shard_number: shard.number.0,
shard_count: shard.count.literal(),
shard_count: shard.count.0,
shard_stripe_size: shard.stripe_size.0,
}
}
@@ -491,7 +458,7 @@ impl Reconciler {
generation: None,
secondary_conf: None,
shard_number: self.shard.number.0,
shard_count: self.shard.count.literal(),
shard_count: self.shard.count.0,
shard_stripe_size: self.shard.stripe_size.0,
tenant_conf: self.config.clone(),
},
@@ -499,9 +466,6 @@ impl Reconciler {
}
for (node_id, conf) in changes {
if self.cancel.is_cancelled() {
return Err(ReconcileError::Cancel);
}
self.location_config(node_id, conf, None).await?;
}
@@ -542,7 +506,7 @@ pub(crate) fn attached_location_conf(
generation: generation.into(),
secondary_conf: None,
shard_number: shard.number.0,
shard_count: shard.count.literal(),
shard_count: shard.count.0,
shard_stripe_size: shard.stripe_size.0,
tenant_conf: config.clone(),
}
@@ -557,7 +521,7 @@ pub(crate) fn secondary_location_conf(
generation: None,
secondary_conf: Some(LocationConfigSecondary { warm: true }),
shard_number: shard.number.0,
shard_count: shard.count.literal(),
shard_count: shard.count.0,
shard_stripe_size: shard.stripe_size.0,
tenant_conf: config.clone(),
}

View File

@@ -1,8 +1,9 @@
use crate::{node::Node, tenant_state::TenantState};
use serde::Serialize;
use std::collections::HashMap;
use pageserver_api::shard::TenantShardId;
use std::collections::{BTreeMap, HashMap};
use utils::{http::error::ApiError, id::NodeId};
use crate::{node::Node, tenant_state::TenantState};
/// Scenarios in which we cannot find a suitable location for a tenant shard
#[derive(thiserror::Error, Debug)]
pub enum ScheduleError {
@@ -18,179 +19,52 @@ impl From<ScheduleError> for ApiError {
}
}
#[derive(Serialize, Eq, PartialEq)]
struct SchedulerNode {
/// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
shard_count: usize,
/// Whether this node is currently elegible to have new shards scheduled (this is derived
/// from a node's availability state and scheduling policy).
may_schedule: bool,
}
/// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver
/// on which to run.
///
/// The type has no persistent state of its own: this is all populated at startup. The Serialize
/// impl is only for debug dumps.
#[derive(Serialize)]
pub(crate) struct Scheduler {
nodes: HashMap<NodeId, SchedulerNode>,
tenant_counts: HashMap<NodeId, usize>,
}
impl Scheduler {
pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
let mut scheduler_nodes = HashMap::new();
for node in nodes {
scheduler_nodes.insert(
node.id,
SchedulerNode {
shard_count: 0,
may_schedule: node.may_schedule(),
},
);
pub(crate) fn new(
tenants: &BTreeMap<TenantShardId, TenantState>,
nodes: &HashMap<NodeId, Node>,
) -> Self {
let mut tenant_counts = HashMap::new();
for node_id in nodes.keys() {
tenant_counts.insert(*node_id, 0);
}
Self {
nodes: scheduler_nodes,
}
}
/// For debug/support: check that our internal statistics are in sync with the state of
/// the nodes & tenant shards.
///
/// If anything is inconsistent, log details and return an error.
pub(crate) fn consistency_check<'a>(
&self,
nodes: impl Iterator<Item = &'a Node>,
shards: impl Iterator<Item = &'a TenantState>,
) -> anyhow::Result<()> {
let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
for node in nodes {
expect_nodes.insert(
node.id,
SchedulerNode {
shard_count: 0,
may_schedule: node.may_schedule(),
},
);
}
for shard in shards {
if let Some(node_id) = shard.intent.get_attached() {
match expect_nodes.get_mut(node_id) {
Some(node) => node.shard_count += 1,
None => anyhow::bail!(
"Tenant {} references nonexistent node {}",
shard.tenant_shard_id,
node_id
),
}
}
for node_id in shard.intent.get_secondary() {
match expect_nodes.get_mut(node_id) {
Some(node) => node.shard_count += 1,
None => anyhow::bail!(
"Tenant {} references nonexistent node {}",
shard.tenant_shard_id,
node_id
),
}
for tenant in tenants.values() {
if let Some(ps) = tenant.intent.attached {
let entry = tenant_counts.entry(ps).or_insert(0);
*entry += 1;
}
}
for (node_id, expect_node) in &expect_nodes {
let Some(self_node) = self.nodes.get(node_id) else {
anyhow::bail!("Node {node_id} not found in Self")
};
if self_node != expect_node {
tracing::error!("Inconsistency detected in scheduling state for node {node_id}");
tracing::error!("Expected state: {}", serde_json::to_string(expect_node)?);
tracing::error!("Self state: {}", serde_json::to_string(self_node)?);
anyhow::bail!("Inconsistent state on {node_id}");
for (node_id, node) in nodes {
if !node.may_schedule() {
tenant_counts.remove(node_id);
}
}
if expect_nodes.len() != self.nodes.len() {
// We just checked that all the expected nodes are present. If the lengths don't match,
// it means that we have nodes in Self that are unexpected.
for node_id in self.nodes.keys() {
if !expect_nodes.contains_key(node_id) {
anyhow::bail!("Node {node_id} found in Self but not in expected nodes");
}
}
}
Ok(())
}
/// Increment the reference count of a node. This reference count is used to guide scheduling
/// decisions, not for memory management: it represents one tenant shard whose IntentState targets
/// this node.
///
/// It is an error to call this for a node that is not known to the scheduler (i.e. passed into
/// [`Self::new`] or [`Self::node_upsert`])
pub(crate) fn node_inc_ref(&mut self, node_id: NodeId) {
let Some(node) = self.nodes.get_mut(&node_id) else {
tracing::error!("Scheduler missing node {node_id}");
debug_assert!(false);
return;
};
node.shard_count += 1;
}
/// Decrement a node's reference count. Inverse of [`Self::node_inc_ref`].
pub(crate) fn node_dec_ref(&mut self, node_id: NodeId) {
let Some(node) = self.nodes.get_mut(&node_id) else {
debug_assert!(false);
tracing::error!("Scheduler missing node {node_id}");
return;
};
node.shard_count -= 1;
}
pub(crate) fn node_upsert(&mut self, node: &Node) {
use std::collections::hash_map::Entry::*;
match self.nodes.entry(node.id) {
Occupied(mut entry) => {
entry.get_mut().may_schedule = node.may_schedule();
}
Vacant(entry) => {
entry.insert(SchedulerNode {
shard_count: 0,
may_schedule: node.may_schedule(),
});
}
}
}
pub(crate) fn node_remove(&mut self, node_id: NodeId) {
if self.nodes.remove(&node_id).is_none() {
tracing::warn!(node_id=%node_id, "Removed non-existent node from scheduler");
}
Self { tenant_counts }
}
pub(crate) fn schedule_shard(
&mut self,
hard_exclude: &[NodeId],
) -> Result<NodeId, ScheduleError> {
if self.nodes.is_empty() {
if self.tenant_counts.is_empty() {
return Err(ScheduleError::NoPageservers);
}
let mut tenant_counts: Vec<(NodeId, usize)> = self
.nodes
.tenant_counts
.iter()
.filter_map(|(k, v)| {
if hard_exclude.contains(k) || !v.may_schedule {
if hard_exclude.contains(k) {
None
} else {
Some((*k, v.shard_count))
Some((*k, *v))
}
})
.collect();
@@ -199,108 +73,17 @@ impl Scheduler {
tenant_counts.sort_by_key(|i| (i.1, i.0));
if tenant_counts.is_empty() {
// After applying constraints, no pageservers were left. We log some detail about
// the state of nodes to help understand why this happened. This is not logged as an error because
// it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard.
tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:");
for (node_id, node) in &self.nodes {
tracing::info!(
"Node {node_id}: may_schedule={} shards={}",
node.may_schedule,
node.shard_count
);
}
// After applying constraints, no pageservers were left
return Err(ScheduleError::ImpossibleConstraint);
}
for (node_id, count) in &tenant_counts {
tracing::info!("tenant_counts[{node_id}]={count}");
}
let node_id = tenant_counts.first().unwrap().0;
tracing::info!(
"scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
);
// Note that we do not update shard count here to reflect the scheduling: that
// is IntentState's job when the scheduled location is used.
tracing::info!("scheduler selected node {node_id}");
*self.tenant_counts.get_mut(&node_id).unwrap() += 1;
Ok(node_id)
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::collections::HashMap;
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
use utils::id::NodeId;
use crate::{node::Node, tenant_state::IntentState};
#[test]
fn scheduler_basic() -> anyhow::Result<()> {
let mut nodes = HashMap::new();
nodes.insert(
NodeId(1),
Node {
id: NodeId(1),
availability: NodeAvailability::Active,
scheduling: NodeSchedulingPolicy::Active,
listen_http_addr: String::new(),
listen_http_port: 0,
listen_pg_addr: String::new(),
listen_pg_port: 0,
},
);
nodes.insert(
NodeId(2),
Node {
id: NodeId(2),
availability: NodeAvailability::Active,
scheduling: NodeSchedulingPolicy::Active,
listen_http_addr: String::new(),
listen_http_port: 0,
listen_pg_addr: String::new(),
listen_pg_port: 0,
},
);
let mut scheduler = Scheduler::new(nodes.values());
let mut t1_intent = IntentState::new();
let mut t2_intent = IntentState::new();
let scheduled = scheduler.schedule_shard(&[])?;
t1_intent.set_attached(&mut scheduler, Some(scheduled));
let scheduled = scheduler.schedule_shard(&[])?;
t2_intent.set_attached(&mut scheduler, Some(scheduled));
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?;
t1_intent.push_secondary(&mut scheduler, scheduled);
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 2);
t1_intent.clear(&mut scheduler);
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0);
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
if cfg!(debug_assertions) {
// Dropping an IntentState without clearing it causes a panic in debug mode,
// because we have failed to properly update scheduler shard counts.
let result = std::panic::catch_unwind(move || {
drop(t2_intent);
});
assert!(result.is_err());
} else {
t2_intent.clear(&mut scheduler);
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0);
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 0);
}
Ok(())
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,47 +1,27 @@
use std::{collections::HashMap, sync::Arc, time::Duration};
use crate::{metrics, persistence::TenantShardPersistence};
use control_plane::attachment_service::NodeAvailability;
use pageserver_api::{
models::{LocationConfig, LocationConfigMode, TenantConfig},
shard::{ShardIdentity, TenantShardId},
};
use serde::Serialize;
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use tracing::{instrument, Instrument};
use utils::{
generation::Generation,
id::NodeId,
seqwait::{SeqWait, SeqWaitError},
sync::gate::Gate,
};
use crate::{
compute_hook::ComputeHook,
node::Node,
persistence::{split_state::SplitState, Persistence},
reconciler::{
attached_location_conf, secondary_location_conf, ReconcileError, Reconciler, TargetState,
},
persistence::Persistence,
reconciler::{attached_location_conf, secondary_location_conf, ReconcileError, Reconciler},
scheduler::{ScheduleError, Scheduler},
service, PlacementPolicy, Sequence,
};
/// Serialization helper
fn read_mutex_content<S, T>(v: &std::sync::Mutex<T>, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::ser::Serializer,
T: Clone + std::fmt::Display,
{
serializer.collect_str(&v.lock().unwrap())
}
/// In-memory state for a particular tenant shard.
///
/// This struct implement Serialize for debugging purposes, but is _not_ persisted
/// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted.
#[derive(Serialize)]
pub(crate) struct TenantState {
pub(crate) tenant_shard_id: TenantShardId,
@@ -76,29 +56,20 @@ pub(crate) struct TenantState {
/// If a reconcile task is currently in flight, it may be joined here (it is
/// only safe to join if either the result has been received or the reconciler's
/// cancellation token has been fired)
#[serde(skip)]
pub(crate) reconciler: Option<ReconcilerHandle>,
/// If a tenant is being split, then all shards with that TenantId will have a
/// SplitState set, this acts as a guard against other operations such as background
/// reconciliation, and timeline creation.
pub(crate) splitting: SplitState,
/// Optionally wait for reconciliation to complete up to a particular
/// sequence number.
#[serde(skip)]
pub(crate) waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
/// Indicates sequence number for which we have encountered an error reconciling. If
/// this advances ahead of [`Self::waiter`] then a reconciliation error has occurred,
/// and callers should stop waiting for `waiter` and propagate the error.
#[serde(skip)]
pub(crate) error_waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
/// The most recent error from a reconcile on this tenant
/// TODO: generalize to an array of recent events
/// TOOD: use a ArcSwap instead of mutex for faster reads?
#[serde(serialize_with = "read_mutex_content")]
pub(crate) last_error: std::sync::Arc<std::sync::Mutex<String>>,
/// If we have a pending compute notification that for some reason we weren't able to send,
@@ -108,112 +79,13 @@ pub(crate) struct TenantState {
pub(crate) pending_compute_notification: bool,
}
#[derive(Default, Clone, Debug, Serialize)]
#[derive(Default, Clone, Debug)]
pub(crate) struct IntentState {
attached: Option<NodeId>,
secondary: Vec<NodeId>,
pub(crate) attached: Option<NodeId>,
pub(crate) secondary: Vec<NodeId>,
}
impl IntentState {
pub(crate) fn new() -> Self {
Self {
attached: None,
secondary: vec![],
}
}
pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option<NodeId>) -> Self {
if let Some(node_id) = node_id {
scheduler.node_inc_ref(node_id);
}
Self {
attached: node_id,
secondary: vec![],
}
}
pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option<NodeId>) {
if self.attached != new_attached {
if let Some(old_attached) = self.attached.take() {
scheduler.node_dec_ref(old_attached);
}
if let Some(new_attached) = &new_attached {
scheduler.node_inc_ref(*new_attached);
}
self.attached = new_attached;
}
}
pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) {
debug_assert!(!self.secondary.contains(&new_secondary));
scheduler.node_inc_ref(new_secondary);
self.secondary.push(new_secondary);
}
/// It is legal to call this with a node that is not currently a secondary: that is a no-op
pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) {
let index = self.secondary.iter().position(|n| *n == node_id);
if let Some(index) = index {
scheduler.node_dec_ref(node_id);
self.secondary.remove(index);
}
}
pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) {
for secondary in self.secondary.drain(..) {
scheduler.node_dec_ref(secondary);
}
}
pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) {
if let Some(old_attached) = self.attached.take() {
scheduler.node_dec_ref(old_attached);
}
self.clear_secondary(scheduler);
}
pub(crate) fn all_pageservers(&self) -> Vec<NodeId> {
let mut result = Vec::new();
if let Some(p) = self.attached {
result.push(p)
}
result.extend(self.secondary.iter().copied());
result
}
pub(crate) fn get_attached(&self) -> &Option<NodeId> {
&self.attached
}
pub(crate) fn get_secondary(&self) -> &Vec<NodeId> {
&self.secondary
}
/// When a node goes offline, we update intents to avoid using it
/// as their attached pageserver.
///
/// Returns true if a change was made
pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
if self.attached == Some(node_id) {
self.attached = None;
self.secondary.push(node_id);
true
} else {
false
}
}
}
impl Drop for IntentState {
fn drop(&mut self) {
// Must clear before dropping, to avoid leaving stale refcounts in the Scheduler
debug_assert!(self.attached.is_none() && self.secondary.is_empty());
}
}
#[derive(Default, Clone, Serialize)]
#[derive(Default, Clone)]
pub(crate) struct ObservedState {
pub(crate) locations: HashMap<NodeId, ObservedStateLocation>,
}
@@ -227,7 +99,7 @@ pub(crate) struct ObservedState {
/// what it is (e.g. we failed partway through configuring it)
/// * Instance exists with conf==Some: this tells us what we last successfully configured on this node,
/// and that configuration will still be present unless something external interfered.
#[derive(Clone, Serialize)]
#[derive(Clone)]
pub(crate) struct ObservedStateLocation {
/// If None, it means we do not know the status of this shard's location on this node, but
/// we know that we might have some state on this node.
@@ -303,6 +175,46 @@ pub(crate) struct ReconcileResult {
pub(crate) pending_compute_notification: bool,
}
impl IntentState {
pub(crate) fn new() -> Self {
Self {
attached: None,
secondary: vec![],
}
}
pub(crate) fn all_pageservers(&self) -> Vec<NodeId> {
let mut result = Vec::new();
if let Some(p) = self.attached {
result.push(p)
}
result.extend(self.secondary.iter().copied());
result
}
pub(crate) fn single(node_id: Option<NodeId>) -> Self {
Self {
attached: node_id,
secondary: vec![],
}
}
/// When a node goes offline, we update intents to avoid using it
/// as their attached pageserver.
///
/// Returns true if a change was made
pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
if self.attached == Some(node_id) {
self.attached = None;
self.secondary.push(node_id);
true
} else {
false
}
}
}
impl ObservedState {
pub(crate) fn new() -> Self {
Self {
@@ -326,7 +238,6 @@ impl TenantState {
observed: ObservedState::default(),
config: TenantConfig::default(),
reconciler: None,
splitting: SplitState::Idle,
sequence: Sequence(1),
waiter: Arc::new(SeqWait::new(Sequence(0))),
error_waiter: Arc::new(SeqWait::new(Sequence(0))),
@@ -396,12 +307,12 @@ impl TenantState {
// Should have exactly one attached, and zero secondaries
if self.intent.attached.is_none() {
let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.set_attached(scheduler, Some(node_id));
self.intent.attached = Some(node_id);
used_pageservers.push(node_id);
modified = true;
}
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
self.intent.secondary.clear();
modified = true;
}
}
@@ -409,14 +320,14 @@ impl TenantState {
// Should have exactly one attached, and N secondaries
if self.intent.attached.is_none() {
let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.set_attached(scheduler, Some(node_id));
self.intent.attached = Some(node_id);
used_pageservers.push(node_id);
modified = true;
}
while self.intent.secondary.len() < secondary_count {
let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.push_secondary(scheduler, node_id);
self.intent.secondary.push(node_id);
used_pageservers.push(node_id);
modified = true;
}
@@ -424,12 +335,12 @@ impl TenantState {
Detached => {
// Should have no attached or secondary pageservers
if self.intent.attached.is_some() {
self.intent.set_attached(scheduler, None);
self.intent.attached = None;
modified = true;
}
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
self.intent.secondary.clear();
modified = true;
}
}
@@ -504,8 +415,6 @@ impl TenantState {
false
}
#[allow(clippy::too_many_arguments)]
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
pub(crate) fn maybe_reconcile(
&mut self,
result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
@@ -513,8 +422,6 @@ impl TenantState {
compute_hook: &Arc<ComputeHook>,
service_config: &service::Config,
persistence: &Arc<Persistence>,
gate: &Gate,
cancel: &CancellationToken,
) -> Option<ReconcilerWaiter> {
// If there are any ambiguous observed states, and the nodes they refer to are available,
// we should reconcile to clean them up.
@@ -536,14 +443,6 @@ impl TenantState {
return None;
}
// If we are currently splitting, then never start a reconciler task: the splitting logic
// requires that shards are not interfered with while it runs. Do this check here rather than
// up top, so that we only log this message if we would otherwise have done a reconciliation.
if !matches!(self.splitting, SplitState::Idle) {
tracing::info!("Refusing to reconcile, splitting in progress");
return None;
}
// Reconcile already in flight for the current sequence?
if let Some(handle) = &self.reconciler {
if handle.sequence == self.sequence {
@@ -561,101 +460,70 @@ impl TenantState {
// doing our sequence's work.
let old_handle = self.reconciler.take();
let Ok(gate_guard) = gate.enter() else {
// Shutting down, don't start a reconciler
return None;
};
let reconciler_cancel = cancel.child_token();
let cancel = CancellationToken::new();
let mut reconciler = Reconciler {
tenant_shard_id: self.tenant_shard_id,
shard: self.shard,
generation: self.generation,
intent: TargetState::from_intent(&self.intent),
intent: self.intent.clone(),
config: self.config.clone(),
observed: self.observed.clone(),
pageservers: pageservers.clone(),
compute_hook: compute_hook.clone(),
service_config: service_config.clone(),
_gate_guard: gate_guard,
cancel: reconciler_cancel.clone(),
cancel: cancel.clone(),
persistence: persistence.clone(),
compute_notify_failure: false,
};
let reconcile_seq = self.sequence;
tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence);
tracing::info!("Spawning Reconciler for sequence {}", self.sequence);
let must_notify = self.pending_compute_notification;
let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
tenant_id=%reconciler.tenant_shard_id.tenant_id,
shard_id=%reconciler.tenant_shard_id.shard_slug());
metrics::RECONCILER.spawned.inc();
let join_handle = tokio::task::spawn(
async move {
// Wait for any previous reconcile task to complete before we start
if let Some(old_handle) = old_handle {
old_handle.cancel.cancel();
if let Err(e) = old_handle.handle.await {
// We can't do much with this other than log it: the task is done, so
// we may proceed with our work.
tracing::error!("Unexpected join error waiting for reconcile task: {e}");
}
let join_handle = tokio::task::spawn(async move {
// Wait for any previous reconcile task to complete before we start
if let Some(old_handle) = old_handle {
old_handle.cancel.cancel();
if let Err(e) = old_handle.handle.await {
// We can't do much with this other than log it: the task is done, so
// we may proceed with our work.
tracing::error!("Unexpected join error waiting for reconcile task: {e}");
}
// Early check for cancellation before doing any work
// TODO: wrap all remote API operations in cancellation check
// as well.
if reconciler.cancel.is_cancelled() {
metrics::RECONCILER
.complete
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL])
.inc();
return;
}
// Attempt to make observed state match intent state
let result = reconciler.reconcile().await;
// If we know we had a pending compute notification from some previous action, send a notification irrespective
// of whether the above reconcile() did any work
if result.is_ok() && must_notify {
// If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
reconciler.compute_notify().await.ok();
}
// Update result counter
match &result {
Ok(_) => metrics::RECONCILER
.complete
.with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]),
Err(ReconcileError::Cancel) => metrics::RECONCILER
.complete
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL]),
Err(_) => metrics::RECONCILER
.complete
.with_label_values(&[metrics::ReconcilerMetrics::ERROR]),
}
.inc();
result_tx
.send(ReconcileResult {
sequence: reconcile_seq,
result,
tenant_shard_id: reconciler.tenant_shard_id,
generation: reconciler.generation,
observed: reconciler.observed,
pending_compute_notification: reconciler.compute_notify_failure,
})
.ok();
}
.instrument(reconciler_span),
);
// Early check for cancellation before doing any work
// TODO: wrap all remote API operations in cancellation check
// as well.
if reconciler.cancel.is_cancelled() {
return;
}
// Attempt to make observed state match intent state
let result = reconciler.reconcile().await;
// If we know we had a pending compute notification from some previous action, send a notification irrespective
// of whether the above reconcile() did any work
if result.is_ok() && must_notify {
// If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
reconciler.compute_notify().await.ok();
}
result_tx
.send(ReconcileResult {
sequence: reconcile_seq,
result,
tenant_shard_id: reconciler.tenant_shard_id,
generation: reconciler.generation,
observed: reconciler.observed,
pending_compute_notification: reconciler.compute_notify_failure,
})
.ok();
});
self.reconciler = Some(ReconcilerHandle {
sequence: self.sequence,
handle: join_handle,
cancel: reconciler_cancel,
cancel,
});
Some(ReconcilerWaiter {
@@ -680,18 +548,4 @@ impl TenantState {
debug_assert!(!self.intent.all_pageservers().contains(&node_id));
}
pub(crate) fn to_persistent(&self) -> TenantShardPersistence {
TenantShardPersistence {
tenant_id: self.tenant_shard_id.tenant_id.to_string(),
shard_number: self.tenant_shard_id.shard_number.0 as i32,
shard_count: self.tenant_shard_id.shard_count.literal() as i32,
shard_stripe_size: self.shard.stripe_size.0 as i32,
generation: self.generation.into().unwrap_or(0) as i32,
generation_pageserver: i64::MAX,
placement_policy: serde_json::to_string(&self.policy).unwrap(),
config: serde_json::to_string(&self.config).unwrap(),
splitting: SplitState::default(),
}
}
}

View File

@@ -113,7 +113,7 @@ pub struct TenantShardMigrateRequest {
pub node_id: NodeId,
}
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
#[derive(Serialize, Deserialize, Clone, Copy)]
pub enum NodeAvailability {
// Normal, happy state
Active,
@@ -137,7 +137,7 @@ impl FromStr for NodeAvailability {
/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
/// type needs to be defined with diesel traits in there.
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
#[derive(Serialize, Deserialize, Clone, Copy)]
pub enum NodeSchedulingPolicy {
Active,
Filling,

View File

@@ -450,7 +450,7 @@ async fn handle_tenant(
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation: None,
shard_parameters: ShardParameters {
count: ShardCount::new(shard_count),
count: ShardCount(shard_count),
stripe_size: shard_stripe_size
.map(ShardStripeSize)
.unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE),
@@ -652,10 +652,6 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
let name = import_match
.get_one::<String>("node-name")
.ok_or_else(|| anyhow!("No node name provided"))?;
let update_catalog = import_match
.get_one::<bool>("update-catalog")
.cloned()
.unwrap_or_default();
// Parse base inputs
let base_tarfile = import_match
@@ -698,7 +694,6 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
None,
pg_version,
ComputeMode::Primary,
!update_catalog,
)?;
println!("Done");
}
@@ -836,10 +831,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
.get_one::<String>("endpoint_id")
.map(String::to_string)
.unwrap_or_else(|| format!("ep-{branch_name}"));
let update_catalog = sub_args
.get_one::<bool>("update-catalog")
.cloned()
.unwrap_or_default();
let lsn = sub_args
.get_one::<String>("lsn")
@@ -889,7 +880,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
http_port,
pg_version,
mode,
!update_catalog,
)?;
}
"start" => {
@@ -928,11 +918,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
.get(endpoint_id.as_str())
.ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?;
let create_test_user = sub_args
.get_one::<bool>("create-test-user")
.cloned()
.unwrap_or_default();
cplane.check_conflicting_endpoints(
endpoint.mode,
endpoint.tenant_id,
@@ -987,7 +972,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
pageservers,
remote_ext_config,
stripe_size.0 as usize,
create_test_user,
)
.await?;
}
@@ -1473,18 +1457,6 @@ fn cli() -> Command {
.required(false)
.default_value("1");
let update_catalog = Arg::new("update-catalog")
.value_parser(value_parser!(bool))
.long("update-catalog")
.help("If set, will set up the catalog for neon_superuser")
.required(false);
let create_test_user = Arg::new("create-test-user")
.value_parser(value_parser!(bool))
.long("create-test-user")
.help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`")
.required(false);
Command::new("Neon CLI")
.arg_required_else_help(true)
.version(GIT_VERSION)
@@ -1545,7 +1517,6 @@ fn cli() -> Command {
.arg(Arg::new("end-lsn").long("end-lsn")
.help("Lsn the basebackup ends at"))
.arg(pg_version_arg.clone())
.arg(update_catalog.clone())
)
).subcommand(
Command::new("tenant")
@@ -1659,7 +1630,6 @@ fn cli() -> Command {
.required(false))
.arg(pg_version_arg.clone())
.arg(hot_standby_arg.clone())
.arg(update_catalog)
)
.subcommand(Command::new("start")
.about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
@@ -1667,7 +1637,6 @@ fn cli() -> Command {
.arg(endpoint_pageserver_id_arg.clone())
.arg(safekeepers_arg)
.arg(remote_ext_config_args)
.arg(create_test_user)
)
.subcommand(Command::new("reconfigure")
.about("Reconfigure the endpoint")

View File

@@ -41,15 +41,11 @@ use std::net::SocketAddr;
use std::net::TcpStream;
use std::path::PathBuf;
use std::process::Command;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use anyhow::{anyhow, bail, Context, Result};
use compute_api::spec::Database;
use compute_api::spec::PgIdent;
use compute_api::spec::RemoteExtSpec;
use compute_api::spec::Role;
use nix::sys::signal::kill;
use nix::sys::signal::Signal;
use serde::{Deserialize, Serialize};
@@ -126,7 +122,6 @@ impl ComputeControlPlane {
http_port: Option<u16>,
pg_version: u32,
mode: ComputeMode,
skip_pg_catalog_updates: bool,
) -> Result<Arc<Endpoint>> {
let pg_port = pg_port.unwrap_or_else(|| self.get_port());
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
@@ -145,7 +140,7 @@ impl ComputeControlPlane {
// before and after start are the same. So, skip catalog updates,
// with this we basically test a case of waking up an idle compute, where
// we also skip catalog updates in the cloud.
skip_pg_catalog_updates,
skip_pg_catalog_updates: true,
features: vec![],
});
@@ -160,7 +155,7 @@ impl ComputeControlPlane {
http_port,
pg_port,
pg_version,
skip_pg_catalog_updates,
skip_pg_catalog_updates: true,
features: vec![],
})?,
)?;
@@ -505,7 +500,6 @@ impl Endpoint {
pageservers: Vec<(Host, u16)>,
remote_ext_config: Option<&String>,
shard_stripe_size: usize,
create_test_user: bool,
) -> Result<()> {
if self.status() == EndpointStatus::Running {
anyhow::bail!("The endpoint is already running");
@@ -557,26 +551,8 @@ impl Endpoint {
cluster_id: None, // project ID: not used
name: None, // project name: not used
state: None,
roles: if create_test_user {
vec![Role {
name: PgIdent::from_str("test").unwrap(),
encrypted_password: None,
options: None,
}]
} else {
Vec::new()
},
databases: if create_test_user {
vec![Database {
name: PgIdent::from_str("neondb").unwrap(),
owner: PgIdent::from_str("test").unwrap(),
options: None,
restrict_conn: false,
invalid: false,
}]
} else {
Vec::new()
},
roles: vec![],
databases: vec![],
settings: None,
postgresql_conf: Some(postgresql_conf),
},
@@ -601,16 +577,11 @@ impl Endpoint {
.open(self.endpoint_path().join("compute.log"))?;
// Launch compute_ctl
let conn_str = self.connstr("cloud_admin", "postgres");
println!("Starting postgres node at '{}'", conn_str);
if create_test_user {
let conn_str = self.connstr("user", "neondb");
println!("Also at '{}'", conn_str);
}
println!("Starting postgres node at '{}'", self.connstr());
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
cmd.args(["--http-port", &self.http_address.port().to_string()])
.args(["--pgdata", self.pgdata().to_str().unwrap()])
.args(["--connstr", &conn_str])
.args(["--connstr", &self.connstr()])
.args([
"--spec-path",
self.endpoint_path().join("spec.json").to_str().unwrap(),
@@ -681,9 +652,7 @@ impl Endpoint {
}
ComputeStatus::Empty
| ComputeStatus::ConfigurationPending
| ComputeStatus::Configuration
| ComputeStatus::TerminationPending
| ComputeStatus::Terminated => {
| ComputeStatus::Configuration => {
bail!("unexpected compute status: {:?}", state.status)
}
}
@@ -814,13 +783,13 @@ impl Endpoint {
Ok(())
}
pub fn connstr(&self, user: &str, db_name: &str) -> String {
pub fn connstr(&self) -> String {
format!(
"postgresql://{}@{}:{}/{}",
user,
"cloud_admin",
self.pg_address.ip(),
self.pg_address.port(),
db_name
"postgres"
)
}
}

View File

@@ -400,11 +400,6 @@ impl PageServerNode {
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'lazy_slru_download' as bool")?,
timeline_get_throttle: settings
.remove("timeline_get_throttle")
.map(serde_json::from_str)
.transpose()
.context("parse `timeline_get_throttle` from json")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
@@ -510,11 +505,6 @@ impl PageServerNode {
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'lazy_slru_download' as bool")?,
timeline_get_throttle: settings
.remove("timeline_get_throttle")
.map(serde_json::from_str)
.transpose()
.context("parse `timeline_get_throttle` from json")?,
}
};

View File

@@ -52,10 +52,6 @@ pub enum ComputeStatus {
// compute will exit soon or is waiting for
// control-plane to terminate it.
Failed,
// Termination requested
TerminationPending,
// Terminated Postgres
Terminated,
}
fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>

View File

@@ -201,11 +201,6 @@ impl<P: Atomic> GenericCounterPairVec<P> {
pub fn with_label_values(&self, vals: &[&str]) -> GenericCounterPair<P> {
self.get_metric_with_label_values(vals).unwrap()
}
pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) {
res[0] = self.inc.remove_label_values(vals);
res[1] = self.dec.remove_label_values(vals);
}
}
impl<P: Atomic> GenericCounterPair<P> {
@@ -252,15 +247,6 @@ impl<P: Atomic> GenericCounterPair<P> {
}
}
impl<P: Atomic> Clone for GenericCounterPair<P> {
fn clone(&self) -> Self {
Self {
inc: self.inc.clone(),
dec: self.dec.clone(),
}
}
}
/// Guard returned by [`GenericCounterPair::guard`]
pub struct GenericCounterPairGuard<P: Atomic>(GenericCounter<P>);

View File

@@ -21,7 +21,6 @@ hex.workspace = true
thiserror.workspace = true
humantime-serde.workspace = true
chrono.workspace = true
itertools.workspace = true
workspace_hack.workspace = true

View File

@@ -2,7 +2,6 @@ use postgres_ffi::BLCKSZ;
use std::ops::Range;
use crate::key::Key;
use itertools::Itertools;
///
/// Represents a set of Keys, in a compact form.
@@ -64,36 +63,9 @@ impl KeySpace {
KeyPartitioning { parts }
}
/// Merge another keyspace into the current one.
/// Note: the keyspaces must not ovelap (enforced via assertions)
pub fn merge(&mut self, other: &KeySpace) {
let all_ranges = self
.ranges
.iter()
.merge_by(other.ranges.iter(), |lhs, rhs| lhs.start < rhs.start);
let mut accum = KeySpaceAccum::new();
let mut prev: Option<&Range<Key>> = None;
for range in all_ranges {
if let Some(prev) = prev {
let overlap =
std::cmp::max(range.start, prev.start) < std::cmp::min(range.end, prev.end);
assert!(
!overlap,
"Attempt to merge ovelapping keyspaces: {:?} overlaps {:?}",
prev, range
);
}
accum.add_range(range.clone());
prev = Some(range);
}
self.ranges = accum.to_keyspace().ranges;
}
/// Remove all keys in `other` from `self`.
/// This can involve splitting or removing of existing ranges.
/// Update the keyspace such that it doesn't contain any range
/// that is overlapping with `other`. This can involve splitting or
/// removing of existing ranges.
pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
let (self_start, self_end) = match (self.start(), self.end()) {
(Some(start), Some(end)) => (start, end),
@@ -248,7 +220,16 @@ impl KeySpaceAccum {
}
pub fn consume_keyspace(&mut self) -> KeySpace {
std::mem::take(self).to_keyspace()
if let Some(accum) = self.accum.take() {
self.ranges.push(accum);
}
let mut prev_accum = KeySpaceAccum::new();
std::mem::swap(self, &mut prev_accum);
KeySpace {
ranges: prev_accum.ranges,
}
}
pub fn size(&self) -> u64 {
@@ -298,13 +279,6 @@ impl KeySpaceRandomAccum {
}
KeySpace { ranges }
}
pub fn consume_keyspace(&mut self) -> KeySpace {
let mut prev_accum = KeySpaceRandomAccum::new();
std::mem::swap(self, &mut prev_accum);
prev_accum.to_keyspace()
}
}
pub fn key_range_size(key_range: &Range<Key>) -> u32 {

View File

@@ -180,7 +180,7 @@ pub enum TimelineState {
Broken { reason: String, backtrace: String },
}
#[derive(Serialize, Deserialize, Clone)]
#[derive(Serialize, Deserialize)]
pub struct TimelineCreateRequest {
pub new_timeline_id: TimelineId,
#[serde(default)]
@@ -214,14 +214,14 @@ impl ShardParameters {
pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
pub fn is_unsharded(&self) -> bool {
self.count.is_unsharded()
self.count == ShardCount(0)
}
}
impl Default for ShardParameters {
fn default() -> Self {
Self {
count: ShardCount::new(0),
count: ShardCount(0),
stripe_size: Self::DEFAULT_STRIPE_SIZE,
}
}
@@ -283,7 +283,6 @@ pub struct TenantConfig {
pub gc_feedback: Option<bool>,
pub heatmap_period: Option<String>,
pub lazy_slru_download: Option<bool>,
pub timeline_get_throttle: Option<ThrottleConfig>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -310,35 +309,6 @@ pub struct EvictionPolicyLayerAccessThreshold {
pub threshold: Duration,
}
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub struct ThrottleConfig {
pub task_kinds: Vec<String>, // TaskKind
pub initial: usize,
#[serde(with = "humantime_serde")]
pub refill_interval: Duration,
pub refill_amount: NonZeroUsize,
pub max: usize,
pub fair: bool,
}
impl ThrottleConfig {
pub fn disabled() -> Self {
Self {
task_kinds: vec![], // effectively disables the throttle
// other values don't matter with emtpy `task_kinds`.
initial: 0,
refill_interval: Duration::from_millis(1),
refill_amount: NonZeroUsize::new(1).unwrap(),
max: 1,
fair: true,
}
}
/// The requests per second allowed by the given config.
pub fn steady_rps(&self) -> f64 {
(self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64()) / 1e3
}
}
/// A flattened analog of a `pagesever::tenant::LocationMode`, which
/// lists out all possible states (and the virtual "Detached" state)
/// in a flat form rather than using rust-style enums.
@@ -720,7 +690,6 @@ pub enum PagestreamFeMessage {
GetPage(PagestreamGetPageRequest),
DbSize(PagestreamDbSizeRequest),
GetSlruSegment(PagestreamGetSlruSegmentRequest),
GetVectoredPages(PagestreamGetVectoredPagesRequest),
}
// Wrapped in libpq CopyData
@@ -732,7 +701,6 @@ pub enum PagestreamBeMessage {
Error(PagestreamErrorResponse),
DbSize(PagestreamDbSizeResponse),
GetSlruSegment(PagestreamGetSlruSegmentResponse),
GetVectoredPages(PagestreamGetVectoredPagesResponse),
}
// Keep in sync with `pagestore_client.h`
@@ -744,7 +712,6 @@ enum PagestreamBeMessageTag {
Error = 103,
DbSize = 104,
GetSlruSegment = 105,
GetVectoredPages = 106,
}
impl TryFrom<u8> for PagestreamBeMessageTag {
type Error = u8;
@@ -756,7 +723,6 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
103 => Ok(PagestreamBeMessageTag::Error),
104 => Ok(PagestreamBeMessageTag::DbSize),
105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
106 => Ok(PagestreamBeMessageTag::GetVectoredPages),
_ => Err(value),
}
}
@@ -799,15 +765,6 @@ pub struct PagestreamGetSlruSegmentRequest {
pub segno: u32,
}
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamGetVectoredPagesRequest {
pub latest: bool,
pub lsn: Lsn,
pub rel: RelTag,
pub blkno: u32,
pub count: u8,
}
#[derive(Debug)]
pub struct PagestreamExistsResponse {
pub exists: bool,
@@ -828,12 +785,6 @@ pub struct PagestreamGetSlruSegmentResponse {
pub segment: Bytes,
}
#[derive(Debug)]
pub struct PagestreamGetVectoredPagesResponse {
pub page_count: u8,
pub pages: Bytes,
}
#[derive(Debug)]
pub struct PagestreamErrorResponse {
pub message: String,
@@ -905,18 +856,6 @@ impl PagestreamFeMessage {
bytes.put_u8(req.kind);
bytes.put_u32(req.segno);
}
Self::GetVectoredPages(req) => {
bytes.put_u8(5);
bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
bytes.put_u8(req.rel.forknum);
bytes.put_u32(req.blkno);
bytes.put_u8(req.count);
}
}
bytes.into()
@@ -975,20 +914,6 @@ impl PagestreamFeMessage {
segno: body.read_u32::<BigEndian>()?,
},
)),
5 => Ok(PagestreamFeMessage::GetVectoredPages(
PagestreamGetVectoredPagesRequest {
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
relnode: body.read_u32::<BigEndian>()?,
forknum: body.read_u8()?,
},
blkno: body.read_u32::<BigEndian>()?,
count: body.read_u8()?,
},
)),
_ => bail!("unknown smgr message tag: {:?}", msg_tag),
}
}
@@ -1030,12 +955,6 @@ impl PagestreamBeMessage {
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
bytes.put(&resp.segment[..]);
}
Self::GetVectoredPages(resp) => {
bytes.put_u8(Tag::GetVectoredPages as u8);
bytes.put_u8(resp.page_count);
bytes.put(&resp.pages[..]);
}
}
bytes.into()
@@ -1084,15 +1003,6 @@ impl PagestreamBeMessage {
segment: segment.into(),
})
}
Tag::GetVectoredPages => {
let page_count = buf.read_u8()?;
let mut pages = vec![0; page_count as usize * 8192];
buf.read_exact(&mut pages)?;
Self::GetVectoredPages(PagestreamGetVectoredPagesResponse {
page_count,
pages: pages.into(),
})
}
};
let remaining = buf.into_inner();
if !remaining.is_empty() {
@@ -1112,7 +1022,6 @@ impl PagestreamBeMessage {
Self::Error(_) => "Error",
Self::DbSize(_) => "DbSize",
Self::GetSlruSegment(_) => "GetSlruSegment",
Self::GetVectoredPages(_) => "GetVectoredPages",
}
}
}

View File

@@ -13,41 +13,10 @@ use utils::id::TenantId;
pub struct ShardNumber(pub u8);
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardCount(u8);
pub struct ShardCount(pub u8);
impl ShardCount {
pub const MAX: Self = Self(u8::MAX);
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
/// legacy format for TenantShardId that excludes the shard suffix", also known
/// as `TenantShardId::unsharded`.
///
/// This method returns the actual number of shards, i.e. if our internal value is
/// zero, we return 1 (unsharded tenants have 1 shard).
pub fn count(&self) -> u8 {
if self.0 > 0 {
self.0
} else {
1
}
}
/// The literal internal value: this is **not** the number of shards in the
/// tenant, as we have a special zero value for legacy unsharded tenants. Use
/// [`Self::count`] if you want to know the cardinality of shards.
pub fn literal(&self) -> u8 {
self.0
}
pub fn is_unsharded(&self) -> bool {
self.0 == 0
}
/// `v` may be zero, or the number of shards in the tenant. `v` is what
/// [`Self::literal`] would return.
pub fn new(val: u8) -> Self {
Self(val)
}
}
impl ShardNumber {
@@ -117,7 +86,7 @@ impl TenantShardId {
}
pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
}
/// Convenience for dropping the tenant_id and just getting the ShardIndex: this
@@ -502,12 +471,10 @@ impl ShardIdentity {
pub fn is_key_disposable(&self, key: &Key) -> bool {
if key_is_shard0(key) {
// Q: Why can't we dispose of shard0 content if we're not shard 0?
// A1: because the WAL ingestion logic currently ingests some shard 0
// content on all shards, even though it's only read on shard 0. If we
// dropped it, then subsequent WAL ingest to these keys would encounter
// an error.
// A2: because key_is_shard0 also covers relation size keys, which are written
// on all shards even though they're only maintained accurately on shard 0.
// A: because the WAL ingestion logic currently ingests some shard 0
// content on all shards, even though it's only read on shard 0. If we
// dropped it, then subsequent WAL ingest to these keys would encounter
// an error.
false
} else {
!self.is_key_local(key)

View File

@@ -25,7 +25,6 @@ hyper = { workspace = true, features = ["full"] }
fail.workspace = true
futures = { workspace = true}
jsonwebtoken.workspace = true
leaky-bucket.workspace = true
nix.workspace = true
once_cell.workspace = true
pin-project-lite.workspace = true

View File

@@ -29,9 +29,6 @@ pub enum Scope {
// Should only be used e.g. for status check.
// Currently also used for connection from any pageserver to any safekeeper.
SafekeeperData,
// The scope used by pageservers in upcalls to storage controller and cloud control plane
#[serde(rename = "generations_api")]
GenerationsApi,
}
/// JWT payload. See docs/authentication.md for the format

View File

@@ -12,7 +12,6 @@ testing = ["fail/failpoints"]
[dependencies]
anyhow.workspace = true
arc-swap.workspace = true
async-compression.workspace = true
async-stream.workspace = true
async-trait.workspace = true
@@ -22,6 +21,7 @@ camino.workspace = true
camino-tempfile.workspace = true
chrono = { workspace = true, features = ["serde"] }
clap = { workspace = true, features = ["string"] }
close_fds.workspace = true
const_format.workspace = true
consumption_metrics.workspace = true
crc32c.workspace = true
@@ -36,7 +36,6 @@ humantime.workspace = true
humantime-serde.workspace = true
hyper.workspace = true
itertools.workspace = true
leaky-bucket.workspace = true
md5.workspace = true
nix.workspace = true
# hack to get the number of worker threads tokio uses
@@ -84,7 +83,7 @@ workspace_hack.workspace = true
reqwest.workspace = true
rpds.workspace = true
enum-map.workspace = true
enumset = { workspace = true, features = ["serde"]}
enumset.workspace = true
strum.workspace = true
strum_macros.workspace = true

View File

@@ -6,28 +6,14 @@
//! There are two sets of inputs; `short` and `medium`. They were collected on postgres v14 by
//! logging what happens when a sequential scan is requested on a small table, then picking out two
//! suitable from logs.
//!
//!
//! Reference data (git blame to see commit) on an i3en.3xlarge
// ```text
//! short/short/1 time: [39.175 µs 39.348 µs 39.536 µs]
//! short/short/2 time: [51.227 µs 51.487 µs 51.755 µs]
//! short/short/4 time: [76.048 µs 76.362 µs 76.674 µs]
//! short/short/8 time: [128.94 µs 129.82 µs 130.74 µs]
//! short/short/16 time: [227.84 µs 229.00 µs 230.28 µs]
//! short/short/32 time: [455.97 µs 457.81 µs 459.90 µs]
//! short/short/64 time: [902.46 µs 904.84 µs 907.32 µs]
//! short/short/128 time: [1.7416 ms 1.7487 ms 1.7561 ms]
//! ``
use std::sync::Arc;
use std::sync::{Arc, Barrier};
use bytes::{Buf, Bytes};
use pageserver::{
config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
};
use pageserver_api::shard::TenantShardId;
use tokio::task::JoinSet;
use utils::{id::TenantId, lsn::Lsn};
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
@@ -53,11 +39,11 @@ fn redo_scenarios(c: &mut Criterion) {
.build()
.unwrap();
tracing::info!("executing first");
rt.block_on(short().execute(&manager)).unwrap();
short().execute(rt.handle(), &manager).unwrap();
tracing::info!("first executed");
}
let thread_counts = [1, 2, 4, 8, 16, 32, 64, 128];
let thread_counts = [1, 2, 4, 8, 16];
let mut group = c.benchmark_group("short");
group.sampling_mode(criterion::SamplingMode::Flat);
@@ -88,69 +74,114 @@ fn redo_scenarios(c: &mut Criterion) {
drop(group);
}
/// Sets up a multi-threaded tokio runtime with default worker thread count,
/// then, spawn `requesters` tasks that repeatedly:
/// - get input from `input_factor()`
/// - call `manager.request_redo()` with their input
///
/// This stress-tests the scalability of a single walredo manager at high tokio-level concurrency.
///
/// Using tokio's default worker thread count means the results will differ on machines
/// with different core countrs. We don't care about that, the performance will always
/// be different on different hardware. To compare performance of different software versions,
/// use the same hardware.
/// Sets up `threads` number of requesters to `request_redo`, with the given input.
fn add_multithreaded_walredo_requesters(
b: &mut criterion::Bencher,
nrequesters: usize,
threads: u32,
manager: &Arc<PostgresRedoManager>,
input_factory: fn() -> Request,
) {
assert_ne!(nrequesters, 0);
assert_ne!(threads, 0);
let rt = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.build()
.unwrap();
if threads == 1 {
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
let handle = rt.handle();
b.iter_batched_ref(
|| Some(input_factory()),
|input| execute_all(input.take(), handle, manager),
criterion::BatchSize::PerIteration,
);
} else {
let (work_tx, work_rx) = std::sync::mpsc::sync_channel(threads as usize);
let barrier = Arc::new(tokio::sync::Barrier::new(nrequesters + 1));
let work_rx = std::sync::Arc::new(std::sync::Mutex::new(work_rx));
let mut requesters = JoinSet::new();
for _ in 0..nrequesters {
let _entered = rt.enter();
let manager = manager.clone();
let barrier = barrier.clone();
requesters.spawn(async move {
loop {
let input = input_factory();
barrier.wait().await;
let page = input.execute(&manager).await.unwrap();
assert_eq!(page.remaining(), 8192);
barrier.wait().await;
}
});
let barrier = Arc::new(Barrier::new(threads as usize + 1));
let jhs = (0..threads)
.map(|_| {
std::thread::spawn({
let manager = manager.clone();
let barrier = barrier.clone();
let work_rx = work_rx.clone();
move || {
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
let handle = rt.handle();
loop {
// queue up and wait if we want to go another round
if work_rx.lock().unwrap().recv().is_err() {
break;
}
let input = Some(input_factory());
barrier.wait();
execute_all(input, handle, &manager).unwrap();
barrier.wait();
}
}
})
})
.collect::<Vec<_>>();
let _jhs = JoinOnDrop(jhs);
b.iter_batched(
|| {
for _ in 0..threads {
work_tx.send(()).unwrap()
}
},
|()| {
// start the work
barrier.wait();
// wait for work to complete
barrier.wait();
},
criterion::BatchSize::PerIteration,
);
drop(work_tx);
}
}
let do_one_iteration = || {
rt.block_on(async {
barrier.wait().await;
// wait for work to complete
barrier.wait().await;
})
};
struct JoinOnDrop(Vec<std::thread::JoinHandle<()>>);
b.iter_batched(
|| {
// warmup
do_one_iteration();
},
|()| {
// work loop
do_one_iteration();
},
criterion::BatchSize::PerIteration,
);
impl Drop for JoinOnDrop {
// it's not really needless because we want join all then check for panicks
#[allow(clippy::needless_collect)]
fn drop(&mut self) {
// first join all
let results = self.0.drain(..).map(|jh| jh.join()).collect::<Vec<_>>();
// then check the results; panicking here is not great, but it does get the message across
// to the user, and sets an exit value.
results.into_iter().try_for_each(|res| res).unwrap();
}
}
rt.block_on(requesters.shutdown());
fn execute_all<I>(
input: I,
handle: &tokio::runtime::Handle,
manager: &PostgresRedoManager,
) -> anyhow::Result<()>
where
I: IntoIterator<Item = Request>,
{
// just fire all requests as fast as possible
input.into_iter().try_for_each(|req| {
let page = req.execute(handle, manager)?;
assert_eq!(page.remaining(), 8192);
anyhow::Ok(())
})
}
criterion_group!(benches, redo_scenarios);
@@ -462,7 +493,11 @@ struct Request {
}
impl Request {
async fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
fn execute(
self,
rt: &tokio::runtime::Handle,
manager: &PostgresRedoManager,
) -> anyhow::Result<Bytes> {
let Request {
key,
lsn,
@@ -471,8 +506,6 @@ impl Request {
pg_version,
} = self;
manager
.request_redo(key, lsn, base_img, records, pg_version)
.await
rt.block_on(manager.request_redo(key, lsn, base_img, records, pg_version))
}
}

View File

@@ -4,8 +4,7 @@ use futures::SinkExt;
use pageserver_api::{
models::{
PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest,
PagestreamGetPageResponse, PagestreamGetVectoredPagesRequest,
PagestreamGetVectoredPagesResponse,
PagestreamGetPageResponse,
},
reltag::RelTag,
};
@@ -158,39 +157,7 @@ impl PagestreamClient {
PagestreamBeMessage::Exists(_)
| PagestreamBeMessage::Nblocks(_)
| PagestreamBeMessage::DbSize(_)
| PagestreamBeMessage::GetSlruSegment(_)
| PagestreamBeMessage::GetVectoredPages(_) => {
anyhow::bail!(
"unexpected be message kind in response to getpage request: {}",
msg.kind()
)
}
}
}
pub async fn getpages(
&mut self,
req: PagestreamGetVectoredPagesRequest,
) -> anyhow::Result<PagestreamGetVectoredPagesResponse> {
let req = PagestreamFeMessage::GetVectoredPages(req);
let req: bytes::Bytes = req.serialize();
// let mut req = tokio_util::io::ReaderStream::new(&req);
let mut req = tokio_stream::once(Ok(req));
self.copy_both.send_all(&mut req).await?;
let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
let next: bytes::Bytes = next.unwrap()?;
let msg = PagestreamBeMessage::deserialize(next)?;
match msg {
PagestreamBeMessage::GetVectoredPages(p) => Ok(p),
PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e),
PagestreamBeMessage::Exists(_)
| PagestreamBeMessage::Nblocks(_)
| PagestreamBeMessage::DbSize(_)
| PagestreamBeMessage::GetSlruSegment(_)
| PagestreamBeMessage::GetPage(_) => {
| PagestreamBeMessage::GetSlruSegment(_) => {
anyhow::bail!(
"unexpected be message kind in response to getpage request: {}",
msg.kind()

View File

@@ -8,7 +8,7 @@ use utils::lsn::Lsn;
use rand::prelude::*;
use tokio::sync::Barrier;
use tokio::task::JoinSet;
use tracing::{info, instrument};
use tracing::{debug, info, instrument};
use std::collections::HashMap;
use std::num::NonZeroUsize;
@@ -28,8 +28,6 @@ pub(crate) struct Args {
#[clap(long, default_value = "localhost:64000")]
page_service_host_port: String,
#[clap(long)]
page_service_connstring: Option<String>,
#[clap(long)]
pageserver_jwt: Option<String>,
#[clap(long, default_value = "1")]
num_clients: NonZeroUsize,
@@ -232,17 +230,12 @@ async fn client(
) {
start_work_barrier.wait().await;
let connstr = match &args.page_service_connstring {
Some(connstr) => connstr.clone(),
None => crate::util::connstring::connstring(
&args.page_service_host_port,
args.pageserver_jwt.as_deref(),
),
};
let client = pageserver_client::page_service::Client::new(connstr)
.await
.unwrap();
let client = pageserver_client::page_service::Client::new(crate::util::connstring::connstring(
&args.page_service_host_port,
args.pageserver_jwt.as_deref(),
))
.await
.unwrap();
while let Some(Work { lsn, gzip }) = work.recv().await {
let start = Instant::now();
@@ -270,7 +263,7 @@ async fn client(
}
})
.await;
info!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
let elapsed = start.elapsed();
live_stats.inc();
STATS.with(|stats| {

View File

@@ -1,18 +1,20 @@
use anyhow::Context;
use camino::Utf8PathBuf;
use futures::future::join_all;
use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
use pageserver_api::keyspace::KeySpaceAccum;
use pageserver_api::models::{PagestreamGetPageRequest, PagestreamGetVectoredPagesRequest};
use pageserver_api::models::PagestreamGetPageRequest;
use tokio_util::sync::CancellationToken;
use utils::id::TenantTimelineId;
use utils::lsn::Lsn;
use rand::prelude::*;
use tokio::sync::Barrier;
use tokio::task::JoinSet;
use tracing::info;
use tracing::{info, instrument};
use std::collections::HashSet;
use std::collections::{HashMap, HashSet};
use std::future::Future;
use std::num::NonZeroUsize;
use std::pin::Pin;
@@ -36,12 +38,8 @@ pub(crate) struct Args {
num_clients: NonZeroUsize,
#[clap(long)]
runtime: Option<humantime::Duration>,
/// Each client sends requests at the given rate.
///
/// If a request takes too long and we should be issuing a new request already,
/// we skip that request and account it as `MISSED`.
#[clap(long)]
per_client_rate: Option<usize>,
per_target_rate_limit: Option<usize>,
/// Probability for sending `latest=true` in the request (uniform distribution).
#[clap(long, default_value = "1")]
req_latest_probability: f64,
@@ -57,24 +55,18 @@ pub(crate) struct Args {
/// [`pageserver_api::models::virtual_file::IoEngineKind`].
#[clap(long)]
set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,
#[clap(long)]
vectored_read_size: Option<u8>,
targets: Option<Vec<TenantTimelineId>>,
}
#[derive(Debug, Default)]
struct LiveStats {
completed_requests: AtomicU64,
missed: AtomicU64,
}
impl LiveStats {
fn request_done(&self) {
fn inc(&self) {
self.completed_requests.fetch_add(1, Ordering::Relaxed);
}
fn missed(&self, n: u64) {
self.missed.fetch_add(n, Ordering::Relaxed);
}
}
#[derive(Clone, serde::Serialize, serde::Deserialize)]
@@ -228,12 +220,13 @@ async fn main_impl(
let live_stats = Arc::new(LiveStats::default());
let num_client_tasks = args.num_clients.get() * timelines.len();
let num_live_stats_dump = 1;
let num_work_sender_tasks = args.num_clients.get() * timelines.len();
let num_work_sender_tasks = 1;
let num_main_impl = 1;
let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
num_live_stats_dump + num_work_sender_tasks + num_main_impl,
num_client_tasks + num_live_stats_dump + num_work_sender_tasks + num_main_impl,
));
tokio::spawn({
@@ -245,12 +238,10 @@ async fn main_impl(
let start = std::time::Instant::now();
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
let missed = stats.missed.swap(0, Ordering::Relaxed);
let elapsed = start.elapsed();
info!(
"RPS: {:.0} MISSED: {:.0}",
completed_requests as f64 / elapsed.as_secs_f64(),
missed as f64 / elapsed.as_secs_f64()
"RPS: {:.0}",
completed_requests as f64 / elapsed.as_secs_f64()
);
}
}
@@ -258,128 +249,127 @@ async fn main_impl(
let cancel = CancellationToken::new();
let rps_period = args
.per_client_rate
.map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64)));
let make_worker: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> = &|worker_id| {
let live_stats = live_stats.clone();
let start_work_barrier = start_work_barrier.clone();
let ranges: Vec<KeyRange> = all_ranges
.iter()
.filter(|r| r.timeline == worker_id.timeline)
.cloned()
.collect();
let weights =
rand::distributions::weighted::WeightedIndex::new(ranges.iter().map(|v| v.len()))
.unwrap();
let cancel = cancel.clone();
Box::pin(async move {
let client =
pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
.await
.unwrap();
let mut client = client
.pagestream(worker_id.timeline.tenant_id, worker_id.timeline.timeline_id)
.await
.unwrap();
start_work_barrier.wait().await;
let client_start = Instant::now();
let mut ticks_processed = 0;
while !cancel.is_cancelled() {
// Detect if a request took longer than the RPS rate
if let Some(period) = &rps_period {
let periods_passed_until_now =
usize::try_from(client_start.elapsed().as_micros() / period.as_micros())
.unwrap();
if periods_passed_until_now > ticks_processed {
live_stats.missed((periods_passed_until_now - ticks_processed) as u64);
}
ticks_processed = periods_passed_until_now;
}
let start = Instant::now();
if let Some(size) = args.vectored_read_size {
assert!(size > 0);
let req = {
let mut rng = rand::thread_rng();
let r = &ranges[weights.sample(&mut rng)];
let key: i128 = rng.gen_range(r.start..r.end);
let key = Key::from_i128(key);
assert!(is_rel_block_key(&key));
let (rel_tag, block_no) =
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
PagestreamGetVectoredPagesRequest {
latest: rng.gen_bool(args.req_latest_probability),
lsn: r.timeline_lsn,
rel: rel_tag,
blkno: block_no,
count: size
}
};
client.getpages(req).await.unwrap();
} else {
let req = {
let mut rng = rand::thread_rng();
let r = &ranges[weights.sample(&mut rng)];
let key: i128 = rng.gen_range(r.start..r.end);
let key = Key::from_i128(key);
assert!(is_rel_block_key(&key));
let (rel_tag, block_no) =
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
PagestreamGetPageRequest {
latest: rng.gen_bool(args.req_latest_probability),
lsn: r.timeline_lsn,
rel: rel_tag,
blkno: block_no,
}
};
client.getpage(req).await.unwrap();
}
let end = Instant::now();
live_stats.request_done();
ticks_processed += 1;
STATS.with(|stats| {
stats
.borrow()
.lock()
.unwrap()
.observe(end.duration_since(start))
.unwrap();
});
if let Some(period) = &rps_period {
let next_at = client_start
+ Duration::from_micros(
(ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(),
);
tokio::time::sleep_until(next_at.into()).await;
}
}
})
};
info!("spawning workers");
let mut workers = JoinSet::new();
let mut work_senders: HashMap<WorkerId, _> = HashMap::new();
let mut tasks = Vec::new();
for timeline in timelines.iter().cloned() {
for num_client in 0..args.num_clients.get() {
let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
let worker_id = WorkerId {
timeline,
num_client,
};
workers.spawn(make_worker(worker_id));
work_senders.insert(worker_id, sender);
tasks.push(tokio::spawn(client(
args,
worker_id,
Arc::clone(&start_work_barrier),
receiver,
Arc::clone(&live_stats),
cancel.clone(),
)));
}
}
let workers = async move {
while let Some(res) = workers.join_next().await {
res.unwrap();
let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = {
let start_work_barrier = start_work_barrier.clone();
let cancel = cancel.clone();
match args.per_target_rate_limit {
None => Box::pin(async move {
let weights = rand::distributions::weighted::WeightedIndex::new(
all_ranges.iter().map(|v| v.len()),
)
.unwrap();
start_work_barrier.wait().await;
while !cancel.is_cancelled() {
let (timeline, req) = {
let mut rng = rand::thread_rng();
let r = &all_ranges[weights.sample(&mut rng)];
let key: i128 = rng.gen_range(r.start..r.end);
let key = Key::from_i128(key);
let (rel_tag, block_no) =
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
(
WorkerId {
timeline: r.timeline,
num_client: rng.gen_range(0..args.num_clients.get()),
},
PagestreamGetPageRequest {
latest: rng.gen_bool(args.req_latest_probability),
lsn: r.timeline_lsn,
rel: rel_tag,
blkno: block_no,
},
)
};
let sender = work_senders.get(&timeline).unwrap();
// TODO: what if this blocks?
if sender.send(req).await.is_err() {
assert!(cancel.is_cancelled(), "client has gone away unexpectedly");
}
}
}),
Some(rps_limit) => Box::pin(async move {
let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));
let make_task: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> =
&|worker_id| {
let sender = work_senders.get(&worker_id).unwrap();
let ranges: Vec<KeyRange> = all_ranges
.iter()
.filter(|r| r.timeline == worker_id.timeline)
.cloned()
.collect();
let weights = rand::distributions::weighted::WeightedIndex::new(
ranges.iter().map(|v| v.len()),
)
.unwrap();
let cancel = cancel.clone();
Box::pin(async move {
let mut ticker = tokio::time::interval(period);
ticker.set_missed_tick_behavior(
/* TODO review this choice */
tokio::time::MissedTickBehavior::Burst,
);
while !cancel.is_cancelled() {
ticker.tick().await;
let req = {
let mut rng = rand::thread_rng();
let r = &ranges[weights.sample(&mut rng)];
let key: i128 = rng.gen_range(r.start..r.end);
let key = Key::from_i128(key);
assert!(is_rel_block_key(&key));
let (rel_tag, block_no) = key_to_rel_block(key)
.expect("we filter non-rel-block keys out above");
PagestreamGetPageRequest {
latest: rng.gen_bool(args.req_latest_probability),
lsn: r.timeline_lsn,
rel: rel_tag,
blkno: block_no,
}
};
if sender.send(req).await.is_err() {
assert!(
cancel.is_cancelled(),
"client has gone away unexpectedly"
);
}
}
})
};
let tasks: Vec<_> = work_senders.keys().map(|tl| make_task(*tl)).collect();
start_work_barrier.wait().await;
join_all(tasks).await;
}),
}
};
let work_sender_task = tokio::spawn(work_sender);
info!("waiting for everything to become ready");
start_work_barrier.wait().await;
info!("work started");
@@ -387,13 +377,20 @@ async fn main_impl(
tokio::time::sleep(runtime.into()).await;
info!("runtime over, signalling cancellation");
cancel.cancel();
workers.await;
work_sender_task.await.unwrap();
info!("work sender exited");
} else {
workers.await;
work_sender_task.await.unwrap();
unreachable!("work sender never terminates");
}
info!("joining clients");
for t in tasks {
t.await.unwrap();
}
info!("all clients stopped");
let output = Output {
total: {
let mut agg_stats = request_stats::Stats::new();
@@ -410,3 +407,49 @@ async fn main_impl(
anyhow::Ok(())
}
#[instrument(skip_all)]
async fn client(
args: &'static Args,
id: WorkerId,
start_work_barrier: Arc<Barrier>,
mut work: tokio::sync::mpsc::Receiver<PagestreamGetPageRequest>,
live_stats: Arc<LiveStats>,
cancel: CancellationToken,
) {
let WorkerId {
timeline,
num_client: _,
} = id;
let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
.await
.unwrap();
let mut client = client
.pagestream(timeline.tenant_id, timeline.timeline_id)
.await
.unwrap();
let do_requests = async {
start_work_barrier.wait().await;
while let Some(req) = work.recv().await {
let start = Instant::now();
client
.getpage(req)
.await
.with_context(|| format!("getpage for {timeline}"))
.unwrap();
let elapsed = start.elapsed();
live_stats.inc();
STATS.with(|stats| {
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
});
}
};
tokio::select! {
res = do_requests => { res },
_ = cancel.cancelled() => {
// fallthrough to shutdown
}
}
client.shutdown().await;
}

View File

@@ -14,12 +14,8 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
}
(Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
(Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
(Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError(
format!(
"JWT scope '{:?}' is ineligible for Pageserver auth",
claims.scope
)
.into(),
(Scope::SafekeeperData, _) => Err(AuthError(
"SafekeeperData scope makes no sense for Pageserver".into(),
)),
}
}

View File

@@ -143,7 +143,6 @@ where
ar: &'a mut Builder<&'b mut W>,
buf: Vec<u8>,
current_segment: Option<(SlruKind, u32)>,
total_blocks: usize,
}
impl<'a, 'b, W> SlruSegmentsBuilder<'a, 'b, W>
@@ -155,7 +154,6 @@ where
ar,
buf: Vec::new(),
current_segment: None,
total_blocks: 0,
}
}
@@ -201,8 +199,7 @@ where
let header = new_tar_header(&segname, self.buf.len() as u64)?;
self.ar.append(&header, self.buf.as_slice()).await?;
self.total_blocks += nblocks;
debug!("Added to basebackup slru {} relsize {}", segname, nblocks);
trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
self.buf.clear();
@@ -210,15 +207,11 @@ where
}
async fn finish(mut self) -> anyhow::Result<()> {
let res = if self.current_segment.is_none() || self.buf.is_empty() {
Ok(())
} else {
self.flush().await
};
if self.current_segment.is_none() || self.buf.is_empty() {
return Ok(());
}
info!("Collected {} SLRU blocks", self.total_blocks);
res
self.flush().await
}
}
@@ -268,7 +261,10 @@ where
let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
for part in slru_partitions.parts {
let blocks = self.timeline.get_vectored(part, self.lsn, self.ctx).await?;
let blocks = self
.timeline
.get_vectored(&part.ranges, self.lsn, self.ctx)
.await?;
for (key, block) in blocks {
slru_builder.add_block(&key, block?).await?;

View File

@@ -33,7 +33,6 @@ use utils::{
use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
use crate::tenant::config::TenantConf;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::timeline::GetVectoredImpl;
use crate::tenant::{
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
};
@@ -85,12 +84,6 @@ pub mod defaults {
pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
pub const DEFAULT_MAX_VECTORED_READ_SIZE: usize = 128 * 1024; // 128 KiB
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
///
/// Default built-in configuration file.
///
@@ -128,12 +121,6 @@ pub mod defaults {
#virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}'
#get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}'
#max_vectored_read_size = '{DEFAULT_MAX_VECTORED_READ_SIZE}'
#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
[tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -269,12 +256,6 @@ pub struct PageServerConf {
pub ingest_batch_size: u64,
pub virtual_file_io_engine: virtual_file::IoEngineKind,
pub get_vectored_impl: GetVectoredImpl,
pub max_vectored_read_size: usize,
pub validate_vectored_get: bool,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -361,12 +342,6 @@ struct PageServerConfigBuilder {
ingest_batch_size: BuilderValue<u64>,
virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,
get_vectored_impl: BuilderValue<GetVectoredImpl>,
max_vectored_read_size: BuilderValue<usize>,
validate_vectored_get: BuilderValue<bool>,
}
impl Default for PageServerConfigBuilder {
@@ -444,10 +419,6 @@ impl Default for PageServerConfigBuilder {
ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()),
max_vectored_read_size: Set(DEFAULT_MAX_VECTORED_READ_SIZE),
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
}
}
}
@@ -608,18 +579,6 @@ impl PageServerConfigBuilder {
self.virtual_file_io_engine = BuilderValue::Set(value);
}
pub fn get_vectored_impl(&mut self, value: GetVectoredImpl) {
self.get_vectored_impl = BuilderValue::Set(value);
}
pub fn get_max_vectored_read_size(&mut self, value: usize) {
self.max_vectored_read_size = BuilderValue::Set(value);
}
pub fn get_validate_vectored_get(&mut self, value: bool) {
self.validate_vectored_get = BuilderValue::Set(value);
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let concurrent_tenant_warmup = self
.concurrent_tenant_warmup
@@ -730,15 +689,6 @@ impl PageServerConfigBuilder {
virtual_file_io_engine: self
.virtual_file_io_engine
.ok_or(anyhow!("missing virtual_file_io_engine"))?,
get_vectored_impl: self
.get_vectored_impl
.ok_or(anyhow!("missing get_vectored_impl"))?,
max_vectored_read_size: self
.max_vectored_read_size
.ok_or(anyhow!("missing max_vectored_read_size"))?,
validate_vectored_get: self
.validate_vectored_get
.ok_or(anyhow!("missing validate_vectored_get"))?,
})
}
}
@@ -993,15 +943,6 @@ impl PageServerConf {
"virtual_file_io_engine" => {
builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?)
}
"get_vectored_impl" => {
builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?)
}
"max_vectored_read_size" => {
builder.get_max_vectored_read_size(parse_toml_u64("max_vectored_read_size", item)? as usize)
}
"validate_vectored_get" => {
builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
}
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -1076,9 +1017,6 @@ impl PageServerConf {
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
max_vectored_read_size: defaults::DEFAULT_MAX_VECTORED_READ_SIZE,
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
}
}
}
@@ -1312,9 +1250,6 @@ background_task_maximum_delay = '334 s'
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
max_vectored_read_size: defaults::DEFAULT_MAX_VECTORED_READ_SIZE,
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
},
"Correct defaults should be used when no config values are provided"
);
@@ -1379,9 +1314,6 @@ background_task_maximum_delay = '334 s'
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
ingest_batch_size: 100,
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
max_vectored_read_size: defaults::DEFAULT_MAX_VECTORED_READ_SIZE,
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -83,12 +83,12 @@ use utils::{
// This is not functionally necessary (clients will retry), but avoids generating a lot of
// failed API calls while tenants are activating.
#[cfg(not(feature = "testing"))]
pub(crate) const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
// Tests run on slow/oversubscribed nodes, and may need to wait much longer for tenants to
// finish attaching, if calls to remote storage are slow.
#[cfg(feature = "testing")]
pub(crate) const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
pub struct State {
conf: &'static PageServerConf,
@@ -571,16 +571,10 @@ async fn timeline_list_handler(
parse_query_param(&request, "force-await-initial-logical-size")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let response_data = async {
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id, false)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
let timelines = tenant.list_timelines();
let mut response_data = Vec::with_capacity(timelines.len());
@@ -622,7 +616,7 @@ async fn timeline_preserve_initdb_handler(
// location where timeline recreation cand find it.
async {
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
let timeline = tenant
.get_timeline(timeline_id, false)
@@ -1142,7 +1136,7 @@ async fn tenant_shard_split_handler(
let new_shards = state
.tenant_manager
.shard_split(tenant_shard_id, ShardCount::new(req.new_shard_count), &ctx)
.shard_split(tenant_shard_id, ShardCount(req.new_shard_count), &ctx)
.await
.map_err(ApiError::InternalServerError)?;

View File

@@ -4,8 +4,8 @@ use metrics::{
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPairVec,
IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
};
use once_cell::sync::Lazy;
use pageserver_api::shard::TenantShardId;
@@ -1266,12 +1266,13 @@ pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
// remote storage metrics
static REMOTE_TIMELINE_CLIENT_CALLS: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
"pageserver_remote_timeline_client_calls_started",
"Number of started calls to remote timeline client.",
"pageserver_remote_timeline_client_calls_finished",
"Number of finshed calls to remote timeline client.",
/// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`].
static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_remote_timeline_client_calls_unfinished",
"Number of ongoing calls to remote timeline client. \
Used to populate pageserver_remote_timeline_client_calls_started. \
This metric is not useful for sampling from Prometheus, but useful in tests.",
&[
"tenant_id",
"shard_id",
@@ -1280,7 +1281,23 @@ static REMOTE_TIMELINE_CLIENT_CALLS: Lazy<IntCounterPairVec> = Lazy::new(|| {
"op_kind"
],
)
.unwrap()
.expect("failed to define a metric")
});
static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_remote_timeline_client_calls_started",
"When calling a remote timeline client method, we record the current value \
of the calls_unfinished gauge in this histogram. Plot the histogram \
over time in a heatmap to visualize how many operations were ongoing \
at a given instant. It gives you a better idea of the queue depth \
than plotting the gauge directly, since operations may complete faster \
than the sampling interval.",
&["file_kind", "op_kind"],
// The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
)
.expect("failed to define a metric")
});
static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> =
@@ -2061,7 +2078,7 @@ pub(crate) struct RemoteTimelineClientMetrics {
shard_id: String,
timeline_id: String,
remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
calls: Mutex<HashMap<(&'static str, &'static str), IntCounterPair>>,
calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
}
@@ -2072,7 +2089,7 @@ impl RemoteTimelineClientMetrics {
tenant_id: tenant_shard_id.tenant_id.to_string(),
shard_id: format!("{}", tenant_shard_id.shard_slug()),
timeline_id: timeline_id.to_string(),
calls: Mutex::new(HashMap::default()),
calls_unfinished_gauge: Mutex::new(HashMap::default()),
bytes_started_counter: Mutex::new(HashMap::default()),
bytes_finished_counter: Mutex::new(HashMap::default()),
remote_physical_size_gauge: Mutex::new(None),
@@ -2112,15 +2129,15 @@ impl RemoteTimelineClientMetrics {
.unwrap()
}
fn calls_counter_pair(
fn calls_unfinished_gauge(
&self,
file_kind: &RemoteOpFileKind,
op_kind: &RemoteOpKind,
) -> IntCounterPair {
let mut guard = self.calls.lock().unwrap();
) -> IntGauge {
let mut guard = self.calls_unfinished_gauge.lock().unwrap();
let key = (file_kind.as_str(), op_kind.as_str());
let metric = guard.entry(key).or_insert_with(move || {
REMOTE_TIMELINE_CLIENT_CALLS
REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE
.get_metric_with_label_values(&[
&self.tenant_id,
&self.shard_id,
@@ -2133,6 +2150,17 @@ impl RemoteTimelineClientMetrics {
metric.clone()
}
fn calls_started_hist(
&self,
file_kind: &RemoteOpFileKind,
op_kind: &RemoteOpKind,
) -> Histogram {
let key = (file_kind.as_str(), op_kind.as_str());
REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
.get_metric_with_label_values(&[key.0, key.1])
.unwrap()
}
fn bytes_started_counter(
&self,
file_kind: &RemoteOpFileKind,
@@ -2203,7 +2231,7 @@ impl RemoteTimelineClientMetrics {
#[must_use]
pub(crate) struct RemoteTimelineClientCallMetricGuard {
/// Decremented on drop.
calls_counter_pair: Option<IntCounterPair>,
calls_unfinished_metric: Option<IntGauge>,
/// If Some(), this references the bytes_finished metric, and we increment it by the given `u64` on drop.
bytes_finished: Option<(IntCounter, u64)>,
}
@@ -2213,10 +2241,10 @@ impl RemoteTimelineClientCallMetricGuard {
/// The caller vouches to do the metric updates manually.
pub fn will_decrement_manually(mut self) {
let RemoteTimelineClientCallMetricGuard {
calls_counter_pair,
calls_unfinished_metric,
bytes_finished,
} = &mut self;
calls_counter_pair.take();
calls_unfinished_metric.take();
bytes_finished.take();
}
}
@@ -2224,10 +2252,10 @@ impl RemoteTimelineClientCallMetricGuard {
impl Drop for RemoteTimelineClientCallMetricGuard {
fn drop(&mut self) {
let RemoteTimelineClientCallMetricGuard {
calls_counter_pair,
calls_unfinished_metric,
bytes_finished,
} = self;
if let Some(guard) = calls_counter_pair.take() {
if let Some(guard) = calls_unfinished_metric.take() {
guard.dec();
}
if let Some((bytes_finished_metric, value)) = bytes_finished {
@@ -2260,8 +2288,10 @@ impl RemoteTimelineClientMetrics {
op_kind: &RemoteOpKind,
size: RemoteTimelineClientMetricsCallTrackSize,
) -> RemoteTimelineClientCallMetricGuard {
let calls_counter_pair = self.calls_counter_pair(file_kind, op_kind);
calls_counter_pair.inc();
let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
self.calls_started_hist(file_kind, op_kind)
.observe(calls_unfinished_metric.get() as f64);
calls_unfinished_metric.inc(); // NB: inc after the histogram, see comment on underlying metric
let bytes_finished = match size {
RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {
@@ -2275,7 +2305,7 @@ impl RemoteTimelineClientMetrics {
}
};
RemoteTimelineClientCallMetricGuard {
calls_counter_pair: Some(calls_counter_pair),
calls_unfinished_metric: Some(calls_unfinished_metric),
bytes_finished,
}
}
@@ -2289,8 +2319,12 @@ impl RemoteTimelineClientMetrics {
op_kind: &RemoteOpKind,
size: RemoteTimelineClientMetricsCallTrackSize,
) {
let calls_counter_pair = self.calls_counter_pair(file_kind, op_kind);
calls_counter_pair.dec();
let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
debug_assert!(
calls_unfinished_metric.get() > 0,
"begin and end should cancel out"
);
calls_unfinished_metric.dec();
match size {
RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {}
RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
@@ -2307,15 +2341,18 @@ impl Drop for RemoteTimelineClientMetrics {
shard_id,
timeline_id,
remote_physical_size_gauge,
calls,
calls_unfinished_gauge,
bytes_started_counter,
bytes_finished_counter,
} = self;
for ((a, b), _) in calls.get_mut().unwrap().drain() {
let mut res = [Ok(()), Ok(())];
REMOTE_TIMELINE_CLIENT_CALLS
.remove_label_values(&mut res, &[tenant_id, shard_id, timeline_id, a, b]);
// don't care about results
for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
a,
b,
]);
}
for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
@@ -2459,56 +2496,6 @@ pub mod tokio_epoll_uring {
}
}
pub(crate) mod tenant_throttling {
use metrics::{register_int_counter_vec, IntCounter};
use once_cell::sync::Lazy;
use crate::tenant::{self, throttle::Metric};
pub(crate) struct TimelineGet {
wait_time: IntCounter,
count: IntCounter,
}
pub(crate) static TIMELINE_GET: Lazy<TimelineGet> = Lazy::new(|| {
static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_wait_usecs_sum_global",
"Sum of microseconds that tenants spent waiting for a tenant throttle of a given kind.",
&["kind"]
)
.unwrap()
});
static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_throttling_count_global",
"Count of tenant throttlings, by kind of throttle.",
&["kind"]
)
.unwrap()
});
let kind = "timeline_get";
TimelineGet {
wait_time: WAIT_USECS.with_label_values(&[kind]),
count: WAIT_COUNT.with_label_values(&[kind]),
}
});
impl Metric for &'static TimelineGet {
#[inline(always)]
fn observe_throttling(
&self,
tenant::throttle::Observation { wait_time }: &tenant::throttle::Observation,
) {
let val = u64::try_from(wait_time.as_micros()).unwrap();
self.wait_time.inc_by(val);
self.count.inc();
}
}
}
pub fn preinitialize_metrics() {
// Python tests need these and on some we do alerting.
//
@@ -2570,5 +2557,4 @@ pub fn preinitialize_metrics() {
// Custom
Lazy::force(&RECONSTRUCT_TIME);
Lazy::force(&tenant_throttling::TIMELINE_GET);
}

View File

@@ -17,8 +17,6 @@ use futures::stream::FuturesUnordered;
use futures::Stream;
use futures::StreamExt;
use pageserver_api::key::Key;
use pageserver_api::models::PagestreamGetVectoredPagesRequest;
use pageserver_api::models::PagestreamGetVectoredPagesResponse;
use pageserver_api::models::TenantState;
use pageserver_api::models::{
PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
@@ -28,7 +26,7 @@ use pageserver_api::models::{
PagestreamNblocksResponse,
};
use pageserver_api::shard::ShardIndex;
use pageserver_api::shard::ShardNumber;
use pageserver_api::shard::{ShardCount, ShardNumber};
use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
use pq_proto::framed::ConnectionError;
use pq_proto::FeStartupPacket;
@@ -73,7 +71,6 @@ use crate::tenant::mgr;
use crate::tenant::mgr::get_active_tenant_with_timeout;
use crate::tenant::mgr::GetActiveTenantError;
use crate::tenant::mgr::ShardSelector;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::timeline::WaitLsnError;
use crate::tenant::GetTimelineError;
use crate::tenant::PageReconstructError;
@@ -337,10 +334,6 @@ enum PageStreamError {
#[error("Read error")]
Read(#[source] PageReconstructError),
/// Something went wrong reading a page: this likely indicates a pageserver bug
#[error("Vectored read error")]
VectoredRead(#[source] GetVectoredError),
/// Ran out of time waiting for an LSN
#[error("LSN timeout: {0}")]
LsnTimeout(WaitLsnError),
@@ -364,15 +357,6 @@ impl From<PageReconstructError> for PageStreamError {
}
}
impl From<GetVectoredError> for PageStreamError {
fn from(value: GetVectoredError) -> Self {
match value {
GetVectoredError::Cancelled => Self::Shutdown,
e => Self::VectoredRead(e),
}
}
}
impl From<GetActiveTimelineError> for PageStreamError {
fn from(value: GetActiveTimelineError) -> Self {
match value {
@@ -682,15 +666,6 @@ impl PageServerHandler {
span,
)
}
PagestreamFeMessage::GetVectoredPages(req) => {
let span = tracing::info_span!("handle_get_vectored_pages_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn, req_count = %req.count);
(
self.handle_get_pages_at_lsn_request(tenant_id, timeline_id, &req, &ctx)
.instrument(span.clone())
.await,
span,
)
}
};
match response {
@@ -1023,7 +998,7 @@ impl PageServerHandler {
) -> Result<&Arc<Timeline>, Key> {
let key = if let Some((first_idx, first_timeline)) = self.shard_timelines.iter().next() {
// Fastest path: single sharded case
if first_idx.shard_count.count() == 1 {
if first_idx.shard_count < ShardCount(2) {
return Ok(&first_timeline.timeline);
}
@@ -1186,80 +1161,6 @@ impl PageServerHandler {
}))
}
#[instrument(skip_all, fields(shard_id))]
async fn handle_get_pages_at_lsn_request(
&mut self,
tenant_id: TenantId,
timeline_id: TimelineId,
req: &PagestreamGetVectoredPagesRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
// This is cheeky and relies on not using sharding :)
// A real solution has to split the requested key sequence between shards.
let get_page_request = PagestreamGetPageRequest {
latest: req.latest,
lsn: req.lsn,
rel: req.rel,
blkno: req.blkno,
};
let timeline = match self.get_cached_timeline_for_page(&get_page_request) {
Ok(tl) => tl,
Err(key) => {
match self
.load_timeline_for_page(tenant_id, timeline_id, key)
.await
{
Ok(t) => t,
Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
// We already know this tenant exists in general, because we resolved it at
// start of connection. Getting a NotFound here indicates that the shard containing
// the requested page is not present on this node: the client's knowledge of shard->pageserver
// mapping is out of date.
//
// Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
// client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
// and talk to a different pageserver.
return Err(PageStreamError::Reconnect(
"getpage@lsn request routed to wrong shard".into(),
));
}
Err(e) => return Err(e.into()),
}
}
};
// load_timeline_for_page sets shard_id, but get_cached_timeline_for_page doesn't
set_tracing_field_shard_id(timeline);
let _timer = timeline
.query_metrics
.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
.await?;
let (page_count, pages_buf) = timeline
.get_rel_pages_at_lsn(
req.rel,
req.blkno,
req.count,
Version::Lsn(lsn),
req.latest,
ctx,
)
.await?;
Ok(PagestreamBeMessage::GetVectoredPages(
PagestreamGetVectoredPagesResponse {
page_count,
pages: pages_buf,
},
))
}
#[instrument(skip_all, fields(shard_id))]
async fn handle_get_slru_segment_request(
&mut self,

View File

@@ -11,12 +11,10 @@ use crate::context::RequestContext;
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::repository::*;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
use crate::tenant::timeline::GetVectoredError;
use crate::walrecord::NeonWalRecord;
use anyhow::{anyhow, ensure, Context};
use anyhow::{ensure, Context};
use bytes::{Buf, Bytes, BytesMut};
use enum_map::Enum;
use itertools::Itertools;
use pageserver_api::key::{
dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
@@ -28,7 +26,7 @@ use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::BLCKSZ;
use postgres_ffi::{Oid, TimestampTz, TransactionId};
use serde::{Deserialize, Serialize};
use std::collections::{hash_map, BTreeMap, HashMap, HashSet};
use std::collections::{hash_map, HashMap, HashSet};
use std::ops::ControlFlow;
use std::ops::Range;
use strum::IntoEnumIterator;
@@ -199,41 +197,6 @@ impl Timeline {
version.get(self, key, ctx).await
}
pub(crate) async fn get_rel_pages_at_lsn(
&self,
tag: RelTag,
blknum: BlockNumber,
count: u8,
version: Version<'_>,
latest: bool,
ctx: &RequestContext,
) -> Result<(u8, Bytes), GetVectoredError> {
if tag.relnode == 0 {
return Err(GetVectoredError::Other(
RelationError::InvalidRelnode.into(),
));
}
let nblocks = self
.get_rel_size(tag, version, latest, ctx)
.await
.map_err(|e| GetVectoredError::Other(anyhow!(e)))?;
if blknum + (count - 1) as u32 >= nblocks {
debug!(
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
tag,
blknum,
version.get_lsn(),
nblocks
);
return Ok((1, ZERO_PAGE.clone()));
}
let start_key = rel_block_to_key(tag, blknum);
let end_key = start_key.add(count as u32);
version.get_vectored(self, start_key..end_key, ctx).await
}
// Get size of a database in blocks
pub(crate) async fn get_db_size(
&self,
@@ -1529,7 +1492,7 @@ impl<'a> DatadirModification<'a> {
return Ok(());
}
let mut writer = self.tline.writer().await;
let writer = self.tline.writer().await;
// Flush relation and SLRU data blocks, keep metadata.
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
@@ -1568,23 +1531,13 @@ impl<'a> DatadirModification<'a> {
/// All the modifications in this atomic update are stamped by the specified LSN.
///
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
let mut writer = self.tline.writer().await;
let writer = self.tline.writer().await;
let pending_nblocks = self.pending_nblocks;
self.pending_nblocks = 0;
if !self.pending_updates.is_empty() {
let prev_pending_updates = std::mem::take(&mut self.pending_updates);
// The put_batch call below expects expects the inputs to be sorted by Lsn,
// so we do that first.
let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = prev_pending_updates
.into_iter()
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val)))
.kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0)
.collect();
writer.put_batch(lsn_ordered_batch, ctx).await?;
writer.put_batch(&self.pending_updates, ctx).await?;
self.pending_updates.clear();
}
@@ -1645,55 +1598,6 @@ impl<'a> DatadirModification<'a> {
self.tline.get(key, lsn, ctx).await
}
async fn get_vectored(
&self,
key_range: Range<Key>,
ctx: &RequestContext,
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
// Have we already updated the same key? Read the latest pending updated
// version in that case.
//
// Note: we don't check pending_deletions. It is an error to request a
// value that has been removed, deletion only avoids leaking storage.
let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
let mut keys_in_modification = KeySpaceAccum::new();
let key = key_range.start;
while key != key_range.end {
if let Some(values) = self.pending_updates.get(&key) {
if let Some((_, value)) = values.last() {
keys_in_modification.add_key(key);
match value {
Value::Image(img) => {
results.insert(key, Ok(img.clone()));
}
_ => {
results.insert(
key,
Err(PageReconstructError::from(anyhow::anyhow!(
"unexpected pending WAL record"
))),
);
}
}
}
}
}
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
let mut keyspace = KeySpace {
ranges: vec![key_range],
};
keyspace.remove_overlapping_with(&keys_in_modification.to_keyspace());
let pages = self.tline.get_vectored(keyspace, lsn, ctx).await?;
results.extend(pages.into_iter());
Ok(results)
}
fn put(&mut self, key: Key, val: Value) {
let values = self.pending_updates.entry(key).or_default();
// Replace the previous value if it exists at the same lsn
@@ -1737,43 +1641,6 @@ impl<'a> Version<'a> {
}
}
async fn get_vectored(
&self,
timeline: &Timeline,
key_range: Range<Key>,
ctx: &RequestContext,
) -> Result<(u8, Bytes), GetVectoredError> {
let pages = match self {
Version::Lsn(lsn) => {
timeline
.get_vectored(
KeySpace {
ranges: vec![key_range],
},
*lsn,
ctx,
)
.await
}
Version::Modified(modification) => modification.get_vectored(key_range, ctx).await,
}?;
let mut buf = BytesMut::new();
let page_count: u8 = pages.len().try_into().expect("too many pages returned");
for page in pages {
match page {
(_key, Ok(bytes)) => {
buf.extend_from_slice(&bytes[..]);
}
(_key, Err(err)) => {
return Err(GetVectoredError::Other(anyhow!(err)));
}
}
}
Ok((page_count, buf.freeze()))
}
fn get_lsn(&self) -> Lsn {
match self {
Version::Lsn(lsn) => *lsn,

View File

@@ -188,7 +188,6 @@ task_local! {
serde::Serialize,
serde::Deserialize,
strum_macros::IntoStaticStr,
strum_macros::EnumString,
)]
pub enum TaskKind {
// Pageserver startup, i.e., `main`

File diff suppressed because it is too large Load Diff

View File

@@ -9,8 +9,8 @@
//! may lead to a data loss.
//!
use anyhow::bail;
use pageserver_api::models;
use pageserver_api::models::EvictionPolicy;
use pageserver_api::models::{self, ThrottleConfig};
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
use serde::de::IntoDeserializer;
use serde::{Deserialize, Serialize};
@@ -251,7 +251,7 @@ impl LocationConf {
} else {
ShardIdentity::new(
ShardNumber(conf.shard_number),
ShardCount::new(conf.shard_count),
ShardCount(conf.shard_count),
ShardStripeSize(conf.shard_stripe_size),
)?
};
@@ -285,7 +285,7 @@ impl Default for LocationConf {
///
/// For storing and transmitting individual tenant's configuration, see
/// TenantConfOpt.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct TenantConf {
// Flush out an inmemory layer, if it's holding WAL older than this
// This puts a backstop on how much WAL needs to be re-digested if the
@@ -348,13 +348,11 @@ pub struct TenantConf {
/// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup
pub lazy_slru_download: bool,
pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
}
/// Same as TenantConf, but this struct preserves the information about
/// which parameters are set and which are not.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
@@ -439,9 +437,6 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub lazy_slru_download: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub timeline_get_throttle: Option<pageserver_api::models::ThrottleConfig>,
}
impl TenantConfOpt {
@@ -490,10 +485,6 @@ impl TenantConfOpt {
lazy_slru_download: self
.lazy_slru_download
.unwrap_or(global_conf.lazy_slru_download),
timeline_get_throttle: self
.timeline_get_throttle
.clone()
.unwrap_or(global_conf.timeline_get_throttle),
}
}
}
@@ -533,7 +524,6 @@ impl Default for TenantConf {
gc_feedback: false,
heatmap_period: Duration::ZERO,
lazy_slru_download: false,
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
}
}
}
@@ -606,7 +596,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
gc_feedback: value.gc_feedback,
heatmap_period: value.heatmap_period.map(humantime),
lazy_slru_download: value.lazy_slru_download,
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
}
}
}

View File

@@ -246,8 +246,6 @@ async fn cleanup_remaining_fs_traces(
rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?;
rm(conf.tenant_heatmap_path(tenant_shard_id), false).await?;
fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
Err(anyhow::anyhow!(
"failpoint: tenant-delete-before-remove-tenant-dir"

View File

@@ -52,7 +52,8 @@ use crate::repository::Key;
use crate::tenant::storage_layer::InMemoryLayer;
use anyhow::Result;
use pageserver_api::keyspace::KeySpaceAccum;
use std::collections::{HashMap, VecDeque};
use std::cmp::Ordering;
use std::collections::{BTreeMap, VecDeque};
use std::iter::Peekable;
use std::ops::Range;
use std::sync::Arc;
@@ -146,28 +147,43 @@ impl Drop for BatchedUpdates<'_> {
}
/// Return value of LayerMap::search
#[derive(Eq, PartialEq, Debug, Hash)]
#[derive(Eq, PartialEq, Debug)]
pub struct SearchResult {
pub layer: Arc<PersistentLayerDesc>,
pub lsn_floor: Lsn,
}
/// Return value of [`LayerMap::range_search`]
///
/// Contains a mapping from a layer description to a keyspace
/// accumulator that contains all the keys which intersect the layer
/// from the original search space. Keys that were not found are accumulated
/// in a separate key space accumulator.
#[derive(Debug)]
pub struct OrderedSearchResult(SearchResult);
impl Ord for OrderedSearchResult {
fn cmp(&self, other: &Self) -> Ordering {
self.0.lsn_floor.cmp(&other.0.lsn_floor)
}
}
impl PartialOrd for OrderedSearchResult {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl PartialEq for OrderedSearchResult {
fn eq(&self, other: &Self) -> bool {
self.0.lsn_floor == other.0.lsn_floor
}
}
impl Eq for OrderedSearchResult {}
pub struct RangeSearchResult {
pub found: HashMap<SearchResult, KeySpaceAccum>,
pub found: BTreeMap<OrderedSearchResult, KeySpaceAccum>,
pub not_found: KeySpaceAccum,
}
impl RangeSearchResult {
fn new() -> Self {
Self {
found: HashMap::new(),
found: BTreeMap::new(),
not_found: KeySpaceAccum::new(),
}
}
@@ -298,7 +314,7 @@ where
Some(search_result) => self
.result
.found
.entry(search_result)
.entry(OrderedSearchResult(search_result))
.or_default()
.add_range(covered_range),
None => self.pad_range(covered_range),
@@ -346,35 +362,6 @@ where
}
}
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
pub enum InMemoryLayerHandle {
Open {
lsn_floor: Lsn,
end_lsn: Lsn,
},
Frozen {
idx: usize,
lsn_floor: Lsn,
end_lsn: Lsn,
},
}
impl InMemoryLayerHandle {
pub fn get_lsn_floor(&self) -> Lsn {
match self {
InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor,
InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor,
}
}
pub fn get_end_lsn(&self) -> Lsn {
match self {
InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn,
InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn,
}
}
}
impl LayerMap {
///
/// Find the latest layer (by lsn.end) that covers the given
@@ -569,43 +556,6 @@ impl LayerMap {
self.historic.iter()
}
/// Get a handle for the first in memory layer that matches the provided predicate.
/// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer.
///
/// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during
/// the same exclusive region established by holding the layer manager lock.
pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<InMemoryLayerHandle>
where
Pred: FnMut(&Arc<InMemoryLayer>) -> bool,
{
if let Some(open) = &self.open_layer {
if pred(open) {
return Some(InMemoryLayerHandle::Open {
lsn_floor: open.get_lsn_range().start,
end_lsn: open.get_lsn_range().end,
});
}
}
let pos = self.frozen_layers.iter().rev().position(pred);
pos.map(|rev_idx| {
let idx = self.frozen_layers.len() - 1 - rev_idx;
InMemoryLayerHandle::Frozen {
idx,
lsn_floor: self.frozen_layers[idx].get_lsn_range().start,
end_lsn: self.frozen_layers[idx].get_lsn_range().end,
}
})
}
/// Get the layer pointed to by the provided handle.
pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option<Arc<InMemoryLayer>> {
match handle {
InMemoryLayerHandle::Open { .. } => self.open_layer.clone(),
InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(),
}
}
///
/// Divide the whole given range of keys into sub-ranges based on the latest
/// image layer that covers each range at the specified lsn (inclusive).
@@ -919,8 +869,6 @@ impl LayerMap {
#[cfg(test)]
mod tests {
use pageserver_api::keyspace::KeySpace;
use super::*;
#[derive(Clone)]
@@ -947,15 +895,15 @@ mod tests {
fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) {
assert_eq!(lhs.not_found.to_keyspace(), rhs.not_found.to_keyspace());
let lhs: HashMap<SearchResult, KeySpace> = lhs
let lhs: Vec<_> = lhs
.found
.into_iter()
.map(|(search_result, accum)| (search_result, accum.to_keyspace()))
.map(|(search_result, accum)| (search_result.0, accum.to_keyspace()))
.collect();
let rhs: HashMap<SearchResult, KeySpace> = rhs
let rhs: Vec<_> = rhs
.found
.into_iter()
.map(|(search_result, accum)| (search_result, accum.to_keyspace()))
.map(|(search_result, accum)| (search_result.0, accum.to_keyspace()))
.collect();
assert_eq!(lhs, rhs);
@@ -975,7 +923,7 @@ mod tests {
Some(res) => {
range_search_result
.found
.entry(res)
.entry(OrderedSearchResult(res))
.or_default()
.add_key(key);
}

View File

@@ -294,6 +294,17 @@ pub enum LoadMetadataError {
Decode(#[from] anyhow::Error),
}
pub fn load_metadata(
conf: &'static PageServerConf,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
) -> Result<TimelineMetadata, LoadMetadataError> {
let metadata_path = conf.metadata_path(tenant_shard_id, timeline_id);
let metadata_bytes = std::fs::read(metadata_path)?;
Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)
}
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -32,7 +32,6 @@ use crate::control_plane_client::{
ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
};
use crate::deletion_queue::DeletionQueueClient;
use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::{
@@ -485,7 +484,7 @@ pub async fn init_tenant_mgr(
TenantSlot::Secondary(SecondaryTenant::new(
tenant_shard_id,
location_conf.shard,
location_conf.tenant_conf.clone(),
location_conf.tenant_conf,
&SecondaryLocationConfig { warm: false },
)),
);
@@ -795,7 +794,7 @@ pub(crate) async fn set_new_tenant_config(
info!("configuring tenant {tenant_id}");
let tenant = get_tenant(tenant_shard_id, true)?;
if !tenant.tenant_shard_id().shard_count.is_unsharded() {
if tenant.tenant_shard_id().shard_count > ShardCount(0) {
// Note that we use ShardParameters::default below.
return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
"This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
@@ -806,7 +805,7 @@ pub(crate) async fn set_new_tenant_config(
// API to use is the location_config/ endpoint, which lets the caller provide
// the full LocationConf.
let location_conf = LocationConf::attached_single(
new_tenant_conf.clone(),
new_tenant_conf,
tenant.generation,
&ShardParameters::default(),
);
@@ -1377,7 +1376,7 @@ impl TenantManager {
result
}
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.literal()))]
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.0))]
pub(crate) async fn shard_split(
&self,
tenant_shard_id: TenantShardId,
@@ -1387,10 +1386,11 @@ impl TenantManager {
let tenant = get_tenant(tenant_shard_id, true)?;
// Plan: identify what the new child shards will be
if new_shard_count.count() <= tenant_shard_id.shard_count.count() {
let effective_old_shard_count = std::cmp::max(tenant_shard_id.shard_count.0, 1);
if new_shard_count <= ShardCount(effective_old_shard_count) {
anyhow::bail!("Requested shard count is not an increase");
}
let expansion_factor = new_shard_count.count() / tenant_shard_id.shard_count.count();
let expansion_factor = new_shard_count.0 / effective_old_shard_count;
if !expansion_factor.is_power_of_two() {
anyhow::bail!("Requested split is not a power of two");
}
@@ -1467,7 +1467,7 @@ impl TenantManager {
attach_mode: AttachmentMode::Single,
}),
shard: child_shard_identity,
tenant_conf: parent_tenant_conf.clone(),
tenant_conf: parent_tenant_conf,
};
self.upsert_location(
@@ -1490,16 +1490,6 @@ impl TenantManager {
peek_slot.and_then(|s| s.get_attached()).cloned()
};
if let Some(t) = child_shard {
// Wait for the child shard to become active: this should be very quick because it only
// has to download the index_part that we just uploaded when creating it.
if let Err(e) = t.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await {
// This is not fatal: we have durably created the child shard. It just makes the
// split operation less seamless for clients, as we will may detach the parent
// shard before the child shards are fully ready to serve requests.
tracing::warn!("Failed to wait for shard {child_shard_id} to activate: {e}");
continue;
}
let timelines = t.timelines.lock().unwrap().clone();
for timeline in timelines.values() {
let Some(target_lsn) = target_lsns.get(&timeline.timeline_id) else {

View File

@@ -614,7 +614,7 @@ impl RemoteTimelineClient {
metadata,
);
let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
self.metric_begin(&op);
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;
@@ -654,7 +654,7 @@ impl RemoteTimelineClient {
metadata.generation, metadata.shard
);
let op = UploadOp::UploadLayer(layer, metadata);
self.metric_begin(&op);
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
}
@@ -823,14 +823,10 @@ impl RemoteTimelineClient {
}
// schedule the actual deletions
if with_metadata.is_empty() {
// avoid scheduling the op & bumping the metric
return;
}
let op = UploadOp::Delete(Delete {
layers: with_metadata,
});
self.metric_begin(&op);
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
}
@@ -1520,10 +1516,10 @@ impl RemoteTimelineClient {
.await;
}
self.metric_end(&task.op);
self.calls_unfinished_metric_end(&task.op);
}
fn metric_impl(
fn calls_unfinished_metric_impl(
&self,
op: &UploadOp,
) -> Option<(
@@ -1560,17 +1556,17 @@ impl RemoteTimelineClient {
Some(res)
}
fn metric_begin(&self, op: &UploadOp) {
let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) {
fn calls_unfinished_metric_begin(&self, op: &UploadOp) {
let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) {
Some(x) => x,
None => return,
};
let guard = self.metrics.call_begin(&file_kind, &op_kind, track_bytes);
guard.will_decrement_manually(); // in metric_end(), see right below
guard.will_decrement_manually(); // in unfinished_ops_metric_end()
}
fn metric_end(&self, op: &UploadOp) {
let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) {
fn calls_unfinished_metric_end(&self, op: &UploadOp) {
let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) {
Some(x) => x,
None => return,
};
@@ -1655,7 +1651,7 @@ impl RemoteTimelineClient {
// Tear down queued ops
for op in qi.queued_operations.into_iter() {
self.metric_end(&op);
self.calls_unfinished_metric_end(&op);
// Dropping UploadOp::Barrier() here will make wait_completion() return with an Err()
// which is exactly what we want to happen.
drop(op);

View File

@@ -81,7 +81,15 @@ pub async fn download_layer_file<'a>(
.with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
.map_err(DownloadError::Other)?;
let download = storage.download(&remote_path, cancel).await?;
let download = storage
.download(&remote_path, cancel)
.await
.with_context(|| {
format!(
"open a download stream for layer with remote storage path '{remote_path:?}'"
)
})
.map_err(DownloadError::Other)?;
let mut destination_file =
tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
@@ -90,11 +98,9 @@ pub async fn download_layer_file<'a>(
let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file)
.await
.with_context(|| {
format!(
.with_context(|| format!(
"download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
)
})
))
.map_err(DownloadError::Other);
match bytes_amount {

View File

@@ -133,7 +133,7 @@ impl SecondaryTenant {
}
pub(crate) fn set_tenant_conf(&self, config: &TenantConfOpt) {
*(self.tenant_conf.lock().unwrap()) = config.clone();
*(self.tenant_conf.lock().unwrap()) = *config;
}
/// For API access: generate a LocationConfig equivalent to the one that would be used to
@@ -144,13 +144,13 @@ impl SecondaryTenant {
let conf = models::LocationConfigSecondary { warm: conf.warm };
let tenant_conf = self.tenant_conf.lock().unwrap().clone();
let tenant_conf = *self.tenant_conf.lock().unwrap();
models::LocationConfig {
mode: models::LocationConfigMode::Secondary,
generation: None,
secondary_conf: Some(conf),
shard_number: self.tenant_shard_id.shard_number.0,
shard_count: self.tenant_shard_id.shard_count.literal(),
shard_count: self.tenant_shard_id.shard_count.0,
shard_stripe_size: self.shard_identity.stripe_size.0,
tenant_conf: tenant_conf.into(),
}

View File

@@ -8,21 +8,15 @@ pub(crate) mod layer;
mod layer_desc;
use crate::context::{AccessStatsBehavior, RequestContext};
use crate::repository::Value;
use crate::task_mgr::TaskKind;
use crate::walrecord::NeonWalRecord;
use bytes::Bytes;
use enum_map::EnumMap;
use enumset::EnumSet;
use once_cell::sync::Lazy;
use pageserver_api::key::Key;
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
use pageserver_api::models::{
LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
};
use std::cmp::{Ordering, Reverse};
use std::collections::hash_map::Entry;
use std::collections::{BinaryHeap, HashMap};
use std::ops::Range;
use std::sync::Mutex;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
@@ -40,11 +34,6 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
use super::layer_map::InMemoryLayerHandle;
use super::timeline::layer_manager::LayerManager;
use super::timeline::GetVectoredError;
use super::PageReconstructError;
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
where
T: PartialOrd<T>,
@@ -78,287 +67,6 @@ pub struct ValueReconstructState {
pub img: Option<(Lsn, Bytes)>,
}
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub(crate) enum ValueReconstructSituation {
Complete,
#[default]
Continue,
}
/// Reconstruct data accumulated for a single key during a vectored get
#[derive(Debug, Default, Clone)]
pub(crate) struct VectoredValueReconstructState {
pub(crate) records: Vec<(Lsn, NeonWalRecord)>,
pub(crate) img: Option<(Lsn, Bytes)>,
situation: ValueReconstructSituation,
}
impl VectoredValueReconstructState {
fn get_cached_lsn(&self) -> Option<Lsn> {
self.img.as_ref().map(|img| img.0)
}
}
impl From<VectoredValueReconstructState> for ValueReconstructState {
fn from(mut state: VectoredValueReconstructState) -> Self {
// walredo expects the records to be descending in terms of Lsn
state.records.sort_by_key(|(lsn, _)| Reverse(*lsn));
ValueReconstructState {
records: state.records,
img: state.img,
}
}
}
/// Bag of data accumulated during a vectored get
pub(crate) struct ValuesReconstructState {
pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
keys_done: KeySpaceRandomAccum,
}
impl ValuesReconstructState {
pub(crate) fn new() -> Self {
Self {
keys: HashMap::new(),
keys_done: KeySpaceRandomAccum::new(),
}
}
/// Associate a key with the error which it encountered and mark it as done
pub(crate) fn on_key_error(&mut self, key: Key, err: PageReconstructError) {
let previous = self.keys.insert(key, Err(err));
if let Some(Ok(state)) = previous {
if state.situation == ValueReconstructSituation::Continue {
self.keys_done.add_key(key);
}
}
}
/// Update the state collected for a given key.
/// Returns true if this was the last value needed for the key and false otherwise.
///
/// If the key is done after the update, mark it as such.
pub(crate) fn update_key(
&mut self,
key: &Key,
lsn: Lsn,
value: Value,
) -> ValueReconstructSituation {
let state = self
.keys
.entry(*key)
.or_insert(Ok(VectoredValueReconstructState::default()));
if let Ok(state) = state {
let key_done = match state.situation {
ValueReconstructSituation::Complete => unreachable!(),
ValueReconstructSituation::Continue => match value {
Value::Image(img) => {
state.img = Some((lsn, img));
true
}
Value::WalRecord(rec) => {
let reached_cache =
state.get_cached_lsn().map(|clsn| clsn + 1) == Some(lsn);
let will_init = rec.will_init();
state.records.push((lsn, rec));
will_init || reached_cache
}
},
};
if key_done && state.situation == ValueReconstructSituation::Continue {
state.situation = ValueReconstructSituation::Complete;
self.keys_done.add_key(*key);
}
state.situation
} else {
ValueReconstructSituation::Complete
}
}
/// Returns the Lsn at which this key is cached if one exists.
/// The read path should go no further than this Lsn for the given key.
pub(crate) fn get_cached_lsn(&self, key: &Key) -> Option<Lsn> {
self.keys
.get(key)
.and_then(|k| k.as_ref().ok())
.and_then(|state| state.get_cached_lsn())
}
/// Returns the key space describing the keys that have
/// been marked as completed since the last call to this function.
pub(crate) fn consume_done_keys(&mut self) -> KeySpace {
self.keys_done.consume_keyspace()
}
}
impl Default for ValuesReconstructState {
fn default() -> Self {
Self::new()
}
}
/// Description of layer to be read - the layer map can turn
/// this description into the actual layer.
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
pub(crate) enum ReadableLayerDesc {
Persistent {
desc: PersistentLayerDesc,
lsn_floor: Lsn,
lsn_ceil: Lsn,
},
InMemory {
handle: InMemoryLayerHandle,
lsn_ceil: Lsn,
},
}
/// Wraper for 'ReadableLayerDesc' sorted by Lsn
#[derive(Debug)]
struct ReadableLayerDescOrdered(ReadableLayerDesc);
/// Data structure which maintains a fringe of layers for the
/// read path. The fringe is the set of layers which intersects
/// the current keyspace that the search is descending on.
/// Each layer tracks the keyspace that intersects it.
///
/// The fringe must appear sorted by Lsn. Hence, it uses
/// a two layer indexing scheme.
#[derive(Debug)]
pub(crate) struct LayerFringe {
layers_by_lsn: BinaryHeap<ReadableLayerDescOrdered>,
layers: HashMap<ReadableLayerDesc, KeySpace>,
}
impl LayerFringe {
pub(crate) fn new() -> Self {
LayerFringe {
layers_by_lsn: BinaryHeap::new(),
layers: HashMap::new(),
}
}
pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> {
let handle = match self.layers_by_lsn.pop() {
Some(h) => h,
None => return None,
};
let removed = self.layers.remove_entry(&handle.0);
match removed {
Some((layer, keyspace)) => Some((layer, keyspace)),
None => unreachable!("fringe internals are always consistent"),
}
}
pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) {
let entry = self.layers.entry(layer.clone());
match entry {
Entry::Occupied(mut entry) => {
entry.get_mut().merge(&keyspace);
}
Entry::Vacant(entry) => {
self.layers_by_lsn
.push(ReadableLayerDescOrdered(entry.key().clone()));
entry.insert(keyspace);
}
}
}
}
impl Default for LayerFringe {
fn default() -> Self {
Self::new()
}
}
impl Ord for ReadableLayerDescOrdered {
fn cmp(&self, other: &Self) -> Ordering {
let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil());
if ord == std::cmp::Ordering::Equal {
self.0
.get_lsn_floor()
.cmp(&other.0.get_lsn_floor())
.reverse()
} else {
ord
}
}
}
impl PartialOrd for ReadableLayerDescOrdered {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl PartialEq for ReadableLayerDescOrdered {
fn eq(&self, other: &Self) -> bool {
self.0.get_lsn_floor() == other.0.get_lsn_floor()
&& self.0.get_lsn_ceil() == other.0.get_lsn_ceil()
}
}
impl Eq for ReadableLayerDescOrdered {}
impl ReadableLayerDesc {
pub(crate) fn get_lsn_floor(&self) -> Lsn {
match self {
ReadableLayerDesc::Persistent { lsn_floor, .. } => *lsn_floor,
ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(),
}
}
pub(crate) fn get_lsn_ceil(&self) -> Lsn {
match self {
ReadableLayerDesc::Persistent { lsn_ceil, .. } => *lsn_ceil,
ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil,
}
}
pub(crate) async fn get_values_reconstruct_data(
&self,
layer_manager: &LayerManager,
keyspace: KeySpace,
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
match self {
ReadableLayerDesc::Persistent {
desc,
lsn_floor,
lsn_ceil,
} => {
let layer = layer_manager.get_from_desc(desc);
layer
.get_values_reconstruct_data(
keyspace,
*lsn_floor,
*lsn_ceil,
reconstruct_state,
ctx,
)
.await
}
ReadableLayerDesc::InMemory { handle, lsn_ceil } => {
let layer = layer_manager
.layer_map()
.get_in_memory_layer(handle)
.unwrap();
layer
.get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
.await
}
}
}
}
/// Return value from [`Layer::get_value_reconstruct_data`]
#[derive(Clone, Copy, Debug)]
pub enum ValueReconstructResult {

View File

@@ -35,18 +35,12 @@ use crate::tenant::blob_io::BlobWriter;
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{
BlobFlag, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
};
use crate::tenant::{PageReconstructError, Timeline};
use crate::tenant::Timeline;
use crate::virtual_file::{self, VirtualFile};
use crate::{walrecord, TEMP_FILE_SUFFIX};
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::BytesMut;
use anyhow::{bail, ensure, Context, Result};
use camino::{Utf8Path, Utf8PathBuf};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::LayerAccessKind;
use pageserver_api::shard::TenantShardId;
use rand::{distributions::Alphanumeric, Rng};
@@ -65,9 +59,7 @@ use utils::{
lsn::Lsn,
};
use super::{
AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
};
use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer};
///
/// Header stored in the beginning of the file
@@ -215,7 +207,6 @@ pub struct DeltaLayerInner {
// values copied from summary
index_start_blk: u32,
index_root_blk: u32,
vectored_blob_reader: VectoredBlobReader,
/// Reader object for reading blocks from the file.
file: FileBlockReader,
@@ -251,7 +242,7 @@ impl DeltaLayer {
return Ok(());
}
let inner = self.load(LayerAccessKind::Dump, 0, ctx).await?;
let inner = self.load(LayerAccessKind::Dump, ctx).await?;
inner.dump(ctx).await
}
@@ -287,25 +278,20 @@ impl DeltaLayer {
async fn load(
&self,
access_kind: LayerAccessKind,
max_vectored_read_size: usize,
ctx: &RequestContext,
) -> Result<&Arc<DeltaLayerInner>> {
self.access_stats.record_access(access_kind, ctx);
// Quick exit if already loaded
self.inner
.get_or_try_init(|| self.load_inner(max_vectored_read_size, ctx))
.get_or_try_init(|| self.load_inner(ctx))
.await
.with_context(|| format!("Failed to load delta layer {}", self.path()))
}
async fn load_inner(
&self,
max_vectored_read_size: usize,
ctx: &RequestContext,
) -> Result<Arc<DeltaLayerInner>> {
async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
let path = self.path();
let loaded = DeltaLayerInner::load(&path, None, max_vectored_read_size, ctx)
let loaded = DeltaLayerInner::load(&path, None, ctx)
.await
.and_then(|res| res)?;
@@ -706,16 +692,15 @@ impl DeltaLayerInner {
pub(super) async fn load(
path: &Utf8Path,
summary: Option<Summary>,
max_vectored_read_size: usize,
ctx: &RequestContext,
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
let file = match VirtualFile::open(path).await {
Ok(file) => file,
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
};
let block_reader = FileBlockReader::new(file);
let file = FileBlockReader::new(file);
let summary_blk = match block_reader.read_blk(0, ctx).await {
let summary_blk = match file.read_blk(0, ctx).await {
Ok(blk) => blk,
Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
};
@@ -737,16 +722,8 @@ impl DeltaLayerInner {
}
}
// TODO: don't open file twice
let file = match VirtualFile::open(path).await {
Ok(file) => file,
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
};
let vectored_blob_reader = VectoredBlobReader::new(file, max_vectored_read_size);
Ok(Ok(DeltaLayerInner {
file: block_reader,
vectored_blob_reader,
file,
index_start_blk: actual_summary.index_start_blk,
index_root_blk: actual_summary.index_root_blk,
}))
@@ -841,174 +818,6 @@ impl DeltaLayerInner {
}
}
// Look up the keys in the provided keyspace and update
// the reconstruct state with whatever is found.
//
// If the key is cached, go no further than the cached Lsn.
//
// Currently, the index is visited for each range, but this
// can be further optimised to visit the index only once.
pub(super) async fn get_values_reconstruct_data(
&self,
keyspace: KeySpace,
start_lsn: Lsn,
end_lsn: Lsn,
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
let reads = self
.plan_reads(keyspace, start_lsn..end_lsn, reconstruct_state, ctx)
.await
.map_err(GetVectoredError::Other)?;
self.do_reads_and_update_state(reads, reconstruct_state)
.await;
Ok(())
}
async fn plan_reads(
&self,
keyspace: KeySpace,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<Vec<VectoredRead>> {
let mut planner = VectoredReadPlanner::new(self.vectored_blob_reader.get_max_read_size());
let file = &self.file;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
self.index_start_blk,
self.index_root_blk,
file,
);
for range in keyspace.ranges.iter() {
let mut range_end_handled = false;
let start_key = DeltaKey::from_key_lsn(&range.start, lsn_range.start);
tree_reader
.visit(
&start_key.0,
VisitDirection::Forwards,
|raw_key, value| {
let key = Key::from_slice(&raw_key[..KEY_SIZE]);
let lsn = DeltaKey::extract_lsn_from_buf(raw_key);
let blob_ref = BlobRef(value);
assert!(key >= range.start && lsn >= lsn_range.start);
let cached_lsn = reconstruct_state.get_cached_lsn(&key);
let flag = {
if cached_lsn >= Some(lsn) {
BlobFlag::Ignore
} else if blob_ref.will_init() {
BlobFlag::Replaces
} else {
BlobFlag::None
}
};
if key >= range.end || (key.next() == range.end && lsn >= lsn_range.end) {
planner.handle_range_end(blob_ref.pos());
range_end_handled = true;
false
} else {
planner.handle(key, lsn, blob_ref.pos(), flag);
true
}
},
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::DeltaLayerBtreeNode)
.build(),
)
.await
.map_err(|err| anyhow!(err))?;
if !range_end_handled {
let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64;
tracing::info!("Handling range end fallback at {}", payload_end);
planner.handle_range_end(payload_end);
}
}
Ok(planner.finish())
}
async fn do_reads_and_update_state(
&self,
reads: Vec<VectoredRead>,
reconstruct_state: &mut ValuesReconstructState,
) {
let mut ignore_key_with_err = None;
let mut buf = Some(BytesMut::with_capacity(
self.vectored_blob_reader.get_max_read_size(),
));
for read in reads.into_iter().rev() {
let res = self
.vectored_blob_reader
.read_blobs(&read, buf.take().expect("Should have a buffer"))
.await;
let blobs_buf = match res {
Ok(blobs_buf) => blobs_buf,
Err(err) => {
let kind = err.kind();
for (_, blob_meta) in read.blobs_at.as_slice() {
reconstruct_state.on_key_error(
blob_meta.key,
PageReconstructError::from(anyhow!(
"Failed to read blobs from virtual file {}: {}",
self.vectored_blob_reader.get_file_ref().path,
kind
)),
);
}
// We have "lost" the buffer since the lower level IO api
// doesn't return the buffer on error. Allocate a new one.
buf = Some(BytesMut::with_capacity(
self.vectored_blob_reader.get_max_read_size(),
));
continue;
}
};
for meta in blobs_buf.blobs.iter().rev() {
if Some(meta.meta.key) == ignore_key_with_err {
continue;
}
let value = Value::des(&blobs_buf.buf[meta.start..meta.end]);
let value = match value {
Ok(v) => v,
Err(e) => {
reconstruct_state.on_key_error(
meta.meta.key,
PageReconstructError::from(anyhow!(e).context(format!(
"Failed to deserialize blob from virtual file {}",
self.vectored_blob_reader.get_file_ref().path,
))),
);
ignore_key_with_err = Some(meta.meta.key);
continue;
}
};
// Invariant: once a key reaches [`ValueReconstructSituation::Complete`]
// state, no further updates shall be made to it. The call below will
// panic if the invariant is violated.
reconstruct_state.update_key(&meta.meta.key, meta.meta.lsn, value);
}
buf = Some(blobs_buf.buf);
}
}
pub(super) async fn load_keys<'a>(
&'a self,
ctx: &RequestContext,

View File

@@ -26,25 +26,20 @@
use crate::config::PageServerConf;
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
use crate::page_cache::PAGE_SZ;
use crate::repository::{Key, Value, KEY_SIZE};
use crate::repository::{Key, KEY_SIZE};
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::storage_layer::{
LayerAccessStats, ValueReconstructResult, ValueReconstructState,
};
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{
BlobFlag, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
};
use crate::tenant::{PageReconstructError, Timeline};
use crate::tenant::Timeline;
use crate::virtual_file::{self, VirtualFile};
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::{Bytes, BytesMut};
use anyhow::{bail, ensure, Context, Result};
use bytes::Bytes;
use camino::{Utf8Path, Utf8PathBuf};
use hex;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::LayerAccessKind;
use pageserver_api::shard::TenantShardId;
use rand::{distributions::Alphanumeric, Rng};
@@ -64,7 +59,7 @@ use utils::{
};
use super::filename::ImageFileName;
use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer, ValuesReconstructState};
use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer};
///
/// Header stored in the beginning of the file
@@ -157,7 +152,6 @@ pub struct ImageLayerInner {
/// Reader object for reading blocks from the file.
file: FileBlockReader,
vectored_blob_reader: VectoredBlobReader,
}
impl std::fmt::Debug for ImageLayerInner {
@@ -214,7 +208,7 @@ impl ImageLayer {
return Ok(());
}
let inner = self.load(LayerAccessKind::Dump, 0, ctx).await?;
let inner = self.load(LayerAccessKind::Dump, ctx).await?;
inner.dump(ctx).await?;
@@ -244,32 +238,21 @@ impl ImageLayer {
async fn load(
&self,
access_kind: LayerAccessKind,
max_vectored_read_size: usize,
ctx: &RequestContext,
) -> Result<&ImageLayerInner> {
self.access_stats.record_access(access_kind, ctx);
self.inner
.get_or_try_init(|| self.load_inner(max_vectored_read_size, ctx))
.get_or_try_init(|| self.load_inner(ctx))
.await
.with_context(|| format!("Failed to load image layer {}", self.path()))
}
async fn load_inner(
&self,
max_vectored_read_size: usize,
ctx: &RequestContext,
) -> Result<ImageLayerInner> {
async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
let path = self.path();
let loaded = ImageLayerInner::load(
&path,
self.desc.image_layer_lsn(),
None,
max_vectored_read_size,
ctx,
)
.await
.and_then(|res| res)?;
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx)
.await
.and_then(|res| res)?;
// not production code
let actual_filename = path.file_name().unwrap().to_owned();
@@ -376,15 +359,14 @@ impl ImageLayerInner {
path: &Utf8Path,
lsn: Lsn,
summary: Option<Summary>,
max_vectored_read_size: usize,
ctx: &RequestContext,
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
let file = match VirtualFile::open(path).await {
Ok(file) => file,
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
};
let block_reader = FileBlockReader::new(file);
let summary_blk = match block_reader.read_blk(0, ctx).await {
let file = FileBlockReader::new(file);
let summary_blk = match file.read_blk(0, ctx).await {
Ok(blk) => blk,
Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
};
@@ -410,19 +392,11 @@ impl ImageLayerInner {
}
}
// TODO: don't open file twice
let file = match VirtualFile::open(path).await {
Ok(file) => file,
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
};
let vectored_blob_reader = VectoredBlobReader::new(file, max_vectored_read_size);
Ok(Ok(ImageLayerInner {
index_start_blk: actual_summary.index_start_blk,
index_root_blk: actual_summary.index_root_blk,
lsn,
file: block_reader,
vectored_blob_reader,
file,
}))
}
@@ -464,124 +438,6 @@ impl ImageLayerInner {
Ok(ValueReconstructResult::Missing)
}
}
// Look up the keys in the provided keyspace and update
// the reconstruct state with whatever is found.
pub(super) async fn get_values_reconstruct_data(
&self,
keyspace: KeySpace,
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
let reads = self
.plan_reads(keyspace, ctx)
.await
.map_err(GetVectoredError::Other)?;
self.do_reads_and_update_state(reads, reconstruct_state)
.await;
Ok(())
}
async fn plan_reads(
&self,
keyspace: KeySpace,
ctx: &RequestContext,
) -> anyhow::Result<Vec<VectoredRead>> {
let mut planner = VectoredReadPlanner::new(self.vectored_blob_reader.get_max_read_size());
let file = &self.file;
let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
for range in keyspace.ranges.iter() {
let mut range_end_handled = false;
let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
range.start.write_to_byte_slice(&mut search_key);
tree_reader
.visit(
&search_key,
VisitDirection::Forwards,
|raw_key, offset| {
let key = Key::from_slice(&raw_key[..KEY_SIZE]);
assert!(key >= range.start);
if key >= range.end {
planner.handle_range_end(offset);
range_end_handled = true;
false
} else {
planner.handle(key, self.lsn, offset, BlobFlag::None);
true
}
},
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::ImageLayerBtreeNode)
.build(),
)
.await
.map_err(|err| GetVectoredError::Other(anyhow!(err)))?;
if !range_end_handled {
let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64;
planner.handle_range_end(payload_end);
}
}
Ok(planner.finish())
}
async fn do_reads_and_update_state(
&self,
reads: Vec<VectoredRead>,
reconstruct_state: &mut ValuesReconstructState,
) {
let mut buf = Some(BytesMut::with_capacity(
self.vectored_blob_reader.get_max_read_size(),
));
for read in reads.into_iter().rev() {
let res = self
.vectored_blob_reader
.read_blobs(&read, buf.take().expect("Should have a buffer"))
.await;
match res {
Ok(blobs_buf) => {
for meta in blobs_buf.blobs.iter().rev() {
let img_buf = Bytes::copy_from_slice(&blobs_buf.buf[meta.start..meta.end]);
reconstruct_state.update_key(
&meta.meta.key,
self.lsn,
Value::Image(img_buf),
);
}
buf = Some(blobs_buf.buf);
}
Err(err) => {
let kind = err.kind();
for (_, blob_meta) in read.blobs_at.as_slice() {
reconstruct_state.on_key_error(
blob_meta.key,
PageReconstructError::from(anyhow!(
"Failed to read blobs from virtual file {}: {}",
self.vectored_blob_reader.get_file_ref().path,
kind
)),
);
}
// We have "lost" the buffer since the lower level IO api
// doesn't return the buffer on error. Allocate a new one.
buf = Some(BytesMut::with_capacity(
self.vectored_blob_reader.get_max_read_size(),
));
}
};
}
}
}
/// A builder object for constructing a new image layer.

View File

@@ -9,15 +9,13 @@ use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
use crate::repository::{Key, Value};
use crate::tenant::block_io::BlockReader;
use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::storage_layer::ValueReconstructResult;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::{PageReconstructError, Timeline};
use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
use crate::tenant::Timeline;
use crate::walrecord;
use anyhow::{anyhow, ensure, Result};
use pageserver_api::keyspace::KeySpace;
use anyhow::{ensure, Result};
use pageserver_api::models::InMemoryLayerInfo;
use pageserver_api::shard::TenantShardId;
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::collections::HashMap;
use std::sync::{Arc, OnceLock};
use tracing::*;
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
@@ -27,10 +25,7 @@ use std::fmt::Write as _;
use std::ops::Range;
use tokio::sync::{RwLock, RwLockWriteGuard};
use super::{
DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValueReconstructState,
ValuesReconstructState,
};
use super::{DeltaLayerWriter, ResidentLayer};
pub struct InMemoryLayer {
conf: &'static PageServerConf,
@@ -207,91 +202,6 @@ impl InMemoryLayer {
Ok(ValueReconstructResult::Complete)
}
}
// Look up the keys in the provided keyspace and update
// the reconstruct state with whatever is found.
//
// If the key is cached, go no further than the cached Lsn.
pub(crate) async fn get_values_reconstruct_data(
&self,
keyspace: KeySpace,
end_lsn: Lsn,
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
let ctx = RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
.build();
let inner = self.inner.read().await;
let reader = inner.file.block_cursor();
#[derive(Eq, PartialEq, Ord, PartialOrd)]
struct BlockRead {
key: Key,
lsn: Lsn,
block_offset: u64,
}
let mut planned_block_reads = BinaryHeap::new();
for range in keyspace.ranges.iter() {
let mut key = range.start;
while key < range.end {
if let Some(vec_map) = inner.index.get(&key) {
let lsn_range = match reconstruct_state.get_cached_lsn(&key) {
Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
None => self.start_lsn..end_lsn,
};
let slice = vec_map.slice_range(lsn_range);
for (entry_lsn, pos) in slice.iter().rev() {
planned_block_reads.push(BlockRead {
key,
lsn: *entry_lsn,
block_offset: *pos,
});
}
}
key = key.next();
}
}
let keyspace_size = keyspace.total_size();
let mut completed_keys = HashSet::new();
while completed_keys.len() < keyspace_size && !planned_block_reads.is_empty() {
let block_read = planned_block_reads.pop().unwrap();
if completed_keys.contains(&block_read.key) {
continue;
}
let buf = reader.read_blob(block_read.block_offset, &ctx).await;
if let Err(e) = buf {
reconstruct_state
.on_key_error(block_read.key, PageReconstructError::from(anyhow!(e)));
completed_keys.insert(block_read.key);
continue;
}
let value = Value::des(&buf.unwrap());
if let Err(e) = value {
reconstruct_state
.on_key_error(block_read.key, PageReconstructError::from(anyhow!(e)));
completed_keys.insert(block_read.key);
continue;
}
let key_situation =
reconstruct_state.update_key(&block_read.key, block_read.lsn, value.unwrap());
if key_situation == ValueReconstructSituation::Complete {
completed_keys.insert(block_read.key);
}
}
Ok(())
}
}
impl std::fmt::Display for InMemoryLayer {
@@ -336,17 +246,32 @@ impl InMemoryLayer {
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
/// Adds the page version to the in-memory tree
pub(crate) async fn put_value(
&self,
key: Key,
lsn: Lsn,
buf: &[u8],
val: &Value,
ctx: &RequestContext,
) -> Result<()> {
let mut inner = self.inner.write().await;
self.assert_writable();
self.put_value_locked(&mut inner, key, lsn, buf, ctx).await
self.put_value_locked(&mut inner, key, lsn, val, ctx).await
}
pub(crate) async fn put_values(
&self,
values: &HashMap<Key, Vec<(Lsn, Value)>>,
ctx: &RequestContext,
) -> Result<()> {
let mut inner = self.inner.write().await;
self.assert_writable();
for (key, vals) in values {
for (lsn, val) in vals {
self.put_value_locked(&mut inner, *key, *lsn, val, ctx)
.await?;
}
}
Ok(())
}
async fn put_value_locked(
@@ -354,16 +279,22 @@ impl InMemoryLayer {
locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
key: Key,
lsn: Lsn,
buf: &[u8],
val: &Value,
ctx: &RequestContext,
) -> Result<()> {
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
let off = {
// Avoid doing allocations for "small" values.
// In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
// https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
buf.clear();
val.ser_into(&mut buf)?;
locked_inner
.file
.write_blob(
buf,
&buf,
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
.build(),
@@ -391,12 +322,7 @@ impl InMemoryLayer {
pub async fn freeze(&self, end_lsn: Lsn) {
let inner = self.inner.write().await;
assert!(
self.start_lsn < end_lsn,
"{} >= {}",
self.start_lsn,
end_lsn
);
assert!(self.start_lsn < end_lsn);
self.end_lsn.set(end_lsn).expect("end_lsn set only once");
for vec_map in inner.index.values() {

View File

@@ -1,6 +1,5 @@
use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::{
HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
};
@@ -17,14 +16,13 @@ use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::repository::Key;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};
use super::delta_layer::{self, DeltaEntry};
use super::image_layer;
use super::{
AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerFileName, PersistentLayerDesc,
ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
ValueReconstructResult, ValueReconstructState,
};
use utils::generation::Generation;
@@ -264,37 +262,6 @@ impl Layer {
.with_context(|| format!("get_value_reconstruct_data for layer {self}"))
}
pub(crate) async fn get_values_reconstruct_data(
&self,
keyspace: KeySpace,
start_lsn: Lsn,
end_lsn: Lsn,
reconstruct_data: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
let layer = self
.0
.get_or_maybe_download(true, Some(ctx))
.await
.map_err(|err| GetVectoredError::Other(anyhow::anyhow!(err)))?;
self.0
.access_stats
.record_access(LayerAccessKind::GetValueReconstructData, ctx);
layer
.get_values_reconstruct_data(
keyspace,
start_lsn,
end_lsn,
reconstruct_data,
&self.0,
ctx,
)
.instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self))
.await
}
/// Download the layer if evicted.
///
/// Will not error when the layer is already downloaded.
@@ -1210,7 +1177,7 @@ pub(crate) enum EvictionError {
/// Error internal to the [`LayerInner::get_or_maybe_download`]
#[derive(Debug, thiserror::Error)]
pub(crate) enum DownloadError {
enum DownloadError {
#[error("timeline has already shutdown")]
TimelineShutdown,
#[error("no remote storage configured")]
@@ -1307,14 +1274,9 @@ impl DownloadedLayer {
owner.desc.key_range.clone(),
owner.desc.lsn_range.clone(),
));
delta_layer::DeltaLayerInner::load(
&owner.path,
summary,
owner.conf.max_vectored_read_size,
ctx,
)
.await
.map(|res| res.map(LayerKind::Delta))
delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx)
.await
.map(|res| res.map(LayerKind::Delta))
} else {
let lsn = owner.desc.image_layer_lsn();
let summary = Some(image_layer::Summary::expected(
@@ -1323,15 +1285,9 @@ impl DownloadedLayer {
owner.desc.key_range.clone(),
lsn,
));
image_layer::ImageLayerInner::load(
&owner.path,
lsn,
summary,
owner.conf.max_vectored_read_size,
ctx,
)
.await
.map(|res| res.map(LayerKind::Image))
image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx)
.await
.map(|res| res.map(LayerKind::Image))
};
match res {
@@ -1381,29 +1337,6 @@ impl DownloadedLayer {
}
}
async fn get_values_reconstruct_data(
&self,
keyspace: KeySpace,
start_lsn: Lsn,
end_lsn: Lsn,
reconstruct_data: &mut ValuesReconstructState,
owner: &Arc<LayerInner>,
ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
use LayerKind::*;
match self.get(owner, ctx).await.map_err(GetVectoredError::from)? {
Delta(d) => {
d.get_values_reconstruct_data(keyspace, start_lsn, end_lsn, reconstruct_data, ctx)
.await
}
Image(i) => {
i.get_values_reconstruct_data(keyspace, reconstruct_data, ctx)
.await
}
}
}
async fn dump(&self, owner: &Arc<LayerInner>, ctx: &RequestContext) -> anyhow::Result<()> {
use LayerKind::*;
match self.get(owner, ctx).await? {

View File

@@ -15,7 +15,7 @@ use utils::id::TenantId;
/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
/// a unified way to generate layer information like file name.
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash)]
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
pub struct PersistentLayerDesc {
pub tenant_shard_id: TenantShardId,
pub timeline_id: TimelineId,

View File

@@ -9,7 +9,6 @@ use crate::context::{DownloadBehavior, RequestContext};
use crate::metrics::TENANT_TASK_EVENTS;
use crate::task_mgr;
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::throttle::Stats;
use crate::tenant::timeline::CompactionError;
use crate::tenant::{Tenant, TenantState};
use tokio_util::sync::CancellationToken;
@@ -140,8 +139,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
// How many errors we have seen consequtively
let mut error_run_count = 0;
let mut last_throttle_flag_reset_at = Instant::now();
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
@@ -206,27 +203,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
walredo_mgr.maybe_quiesce(period * 10);
}
// TODO: move this (and walredo quiesce) to a separate task that isn't affected by the back-off,
// so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
let now = Instant::now();
let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
let Stats { count_accounted, count_throttled, sum_throttled_usecs } = tenant.timeline_get_throttle.reset_stats();
if count_throttled == 0 {
return;
}
let allowed_rps = tenant.timeline_get_throttle.steady_rps();
let delta = now - prev;
warn!(
n_seconds=%format_args!("{:.3}",
delta.as_secs_f64()),
count_accounted,
count_throttled,
sum_throttled_usecs,
allowed_rps=%format_args!("{allowed_rps:.0}"),
"shard was throttled in the last n_seconds")
});
// Sleep
if tokio::time::timeout(sleep_duration, cancel.cancelled())
.await

View File

@@ -1,162 +0,0 @@
use std::{
str::FromStr,
sync::{
atomic::{AtomicU64, Ordering},
Arc,
},
time::{Duration, Instant},
};
use arc_swap::ArcSwap;
use enumset::EnumSet;
use tracing::error;
use crate::{context::RequestContext, task_mgr::TaskKind};
/// Throttle for `async` functions.
///
/// Runtime reconfigurable.
///
/// To share a throttle among multiple entities, wrap it in an [`Arc`].
///
/// The intial use case for this is tenant-wide throttling of getpage@lsn requests.
pub struct Throttle<M: Metric> {
inner: ArcSwap<Inner>,
metric: M,
/// will be turned into [`Stats::count_accounted`]
count_accounted: AtomicU64,
/// will be turned into [`Stats::count_throttled`]
count_throttled: AtomicU64,
/// will be turned into [`Stats::sum_throttled_usecs`]
sum_throttled_usecs: AtomicU64,
}
pub struct Inner {
task_kinds: EnumSet<TaskKind>,
rate_limiter: Arc<leaky_bucket::RateLimiter>,
config: Config,
}
pub type Config = pageserver_api::models::ThrottleConfig;
pub struct Observation {
pub wait_time: Duration,
}
pub trait Metric {
fn observe_throttling(&self, observation: &Observation);
}
/// See [`Throttle::reset_stats`].
pub struct Stats {
// Number of requests that were subject to throttling, i.e., requests of the configured [`Config::task_kinds`].
pub count_accounted: u64,
// Subset of the `accounted` requests that were actually throttled.
// Note that the numbers are stored as two independent atomics, so, there might be a slight drift.
pub count_throttled: u64,
// Sum of microseconds that throttled requests spent waiting for throttling.
pub sum_throttled_usecs: u64,
}
impl<M> Throttle<M>
where
M: Metric,
{
pub fn new(config: Config, metric: M) -> Self {
Self {
inner: ArcSwap::new(Arc::new(Self::new_inner(config))),
metric,
count_accounted: AtomicU64::new(0),
count_throttled: AtomicU64::new(0),
sum_throttled_usecs: AtomicU64::new(0),
}
}
fn new_inner(config: Config) -> Inner {
let Config {
task_kinds,
initial,
refill_interval,
refill_amount,
max,
fair,
} = &config;
let task_kinds: EnumSet<TaskKind> = task_kinds
.iter()
.filter_map(|s| match TaskKind::from_str(s) {
Ok(v) => Some(v),
Err(e) => {
// TODO: avoid this failure mode
error!(
"cannot parse task kind, ignoring for rate limiting {}",
utils::error::report_compact_sources(&e)
);
None
}
})
.collect();
Inner {
task_kinds,
rate_limiter: Arc::new(
leaky_bucket::RateLimiter::builder()
.initial(*initial)
.interval(*refill_interval)
.refill(refill_amount.get())
.max(*max)
.fair(*fair)
.build(),
),
config,
}
}
pub fn reconfigure(&self, config: Config) {
self.inner.store(Arc::new(Self::new_inner(config)));
}
/// The [`Throttle`] keeps an internal flag that is true if there was ever any actual throttling.
/// This method allows retrieving & resetting that flag.
/// Useful for periodic reporting.
pub fn reset_stats(&self) -> Stats {
let count_accounted = self.count_accounted.swap(0, Ordering::Relaxed);
let count_throttled = self.count_throttled.swap(0, Ordering::Relaxed);
let sum_throttled_usecs = self.sum_throttled_usecs.swap(0, Ordering::Relaxed);
Stats {
count_accounted,
count_throttled,
sum_throttled_usecs,
}
}
/// See [`Config::steady_rps`].
pub fn steady_rps(&self) -> f64 {
self.inner.load().config.steady_rps()
}
pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) {
let inner = self.inner.load_full(); // clones the `Inner` Arc
if !inner.task_kinds.contains(ctx.task_kind()) {
return;
};
let start = std::time::Instant::now();
let mut did_throttle = false;
let acquire = inner.rate_limiter.acquire(key_count);
// turn off runtime-induced preemption (aka coop) so our `did_throttle` is accurate
let acquire = tokio::task::unconstrained(acquire);
let mut acquire = std::pin::pin!(acquire);
std::future::poll_fn(|cx| {
use std::future::Future;
let poll = acquire.as_mut().poll(cx);
did_throttle = did_throttle || poll.is_pending();
poll
})
.await;
self.count_accounted.fetch_add(1, Ordering::Relaxed);
if did_throttle {
self.count_throttled.fetch_add(1, Ordering::Relaxed);
let now = Instant::now();
let wait_time = now - start;
self.sum_throttled_usecs
.fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
let observation = Observation { wait_time };
self.metric.observe_throttling(&observation);
}
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -419,7 +419,6 @@ impl DeleteTimelineFlow {
TimelineResources {
remote_client,
deletion_queue_client,
timeline_get_throttle: tenant.timeline_get_throttle.clone(),
},
// Important. We dont pass ancestor above because it can be missing.
// Thus we need to skip the validation here.

View File

@@ -343,6 +343,23 @@ pub(super) async fn handle_walreceiver_connection(
modification.commit(&ctx).await?;
uncommitted_records = 0;
filtered_records = 0;
//
// We should check checkpoint distance after appending each ingest_batch_size bytes because otherwise
// layer size can become much larger than `checkpoint_distance`.
// It can append because wal-sender is sending WAL using 125kb chucks and some WAL records can cause writing large
// amount of data to key-value storage. So performing this check only after processing
// all WAL records in the chunk, can cause huge L0 layer files.
//
timeline
.check_checkpoint_distance()
.await
.with_context(|| {
format!(
"Failed to check checkpoint distance for timeline {}",
timeline.timeline_id
)
})?;
}
}
@@ -389,6 +406,16 @@ pub(super) async fn handle_walreceiver_connection(
}
}
timeline
.check_checkpoint_distance()
.await
.with_context(|| {
format!(
"Failed to check checkpoint distance for timeline {}",
timeline.timeline_id
)
})?;
if let Some(last_lsn) = status_update {
let timeline_remote_consistent_lsn = timeline
.get_remote_consistent_lsn_visible()

View File

@@ -1,412 +0,0 @@
//!
//! Utilities for vectored reading of variable-sized "blobs".
//!
//! The "blob" api is an abstraction on top of the "block" api,
//! with the main difference being that blobs do not have a fixed
//! size (each blob is prefixed with 1 or 4 byte length field)
//!
//! The vectored apis provided in this module allow for planning
//! and executing disk IO which covers multiple blobs.
//!
//! Reads are planned with [`VectoredReadPlanner`] which will coalesce
//! adjacent blocks into a single disk IO request and exectuted by
//! [`VectoredBlobReader`] which does all the required offset juggling
//! and returns a buffer housing all the blobs and a list of offsets.
//!
//! Note that the vectored blob api does *not* go through the page cache.
use std::collections::BTreeMap;
use bytes::BytesMut;
use pageserver_api::key::Key;
use utils::lsn::Lsn;
use utils::vec_map::VecMap;
use crate::virtual_file::VirtualFile;
/// Metadata bundled with the start and end offset of a blob.
#[derive(Copy, Clone, Debug)]
pub struct BlobMeta {
pub key: Key,
pub lsn: Lsn,
}
/// Blob offsets into [`VectoredBlobsBuf::buf`]
pub struct VectoredBlob {
pub start: usize,
pub end: usize,
pub meta: BlobMeta,
}
/// Return type of [`VectoredBlobReader::read_blobs`]
pub struct VectoredBlobsBuf {
/// Buffer for all blobs in this read
pub buf: BytesMut,
/// Offsets into the buffer and metadata for all blobs in this read
pub blobs: Vec<VectoredBlob>,
}
/// Description of one disk read for multiple blobs.
/// Used as the argument form [`VectoredBlobReader::read_blobs`]
#[derive(Debug)]
pub struct VectoredRead {
pub start: u64,
pub end: u64,
/// Starting offsets and metadata for each blob in this read
pub blobs_at: VecMap<u64, BlobMeta>,
max_read_size: usize,
}
#[derive(Eq, PartialEq)]
enum VectoredReadExtended {
Yes,
No,
}
impl VectoredRead {
fn new(start_offset: u64, end_offset: u64, meta: BlobMeta, max_read_size: usize) -> Self {
let mut blobs_at = VecMap::default();
blobs_at
.append(start_offset, meta)
.expect("First insertion always succeeds");
Self {
start: start_offset,
end: end_offset,
blobs_at,
max_read_size,
}
}
/// Attempt to extend the current read with a new blob if the start
/// offset matches with the current end of the vectored read
/// and the resuting size is below the max read size
fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
let size = (end - start) as usize;
if self.end == start && self.size() + size <= self.max_read_size {
self.end = end;
self.blobs_at
.append(start, meta)
.expect("LSNs are ordered within vectored reads");
return VectoredReadExtended::Yes;
}
VectoredReadExtended::No
}
fn size(&self) -> usize {
(self.end - self.start) as usize
}
}
#[derive(Copy, Clone, Debug)]
pub enum BlobFlag {
None,
Ignore,
Replaces,
}
/// Planner for vectored blob reads.
///
/// Blob offsets are received via [`VectoredReadPlanner::handle`]
/// and coalesced into disk reads.
///
/// The implementation is very simple:
/// * Collect all blob offsets in an ordered structure
/// * Iterate over the collected blobs and coalesce them into reads at the end
pub struct VectoredReadPlanner {
// Track all the blob offsets. Start offsets must be ordered.
blobs: BTreeMap<Key, Vec<(Lsn, u64, u64)>>,
// Arguments for previous blob passed into [`VectoredReadPlanner::handle`]
prev: Option<(Key, Lsn, u64, BlobFlag)>,
max_read_size: usize,
}
impl VectoredReadPlanner {
pub fn new(max_read_size: usize) -> Self {
Self {
blobs: BTreeMap::new(),
prev: None,
max_read_size,
}
}
/// Include a new blob in the read plan.
///
/// Notes:
/// * This function should be called for each blob in the desired *inclusive* range.
/// See `DeltaLayerInner::plan_reads` and `ImageLayerInner::plan_reads`.
/// * Calls to this function should be for monotonically continuous (key, lsn) tuples.
///
/// The `flag` argument has two interesting values:
/// * [`BlobFlag::Replaces`]: The blob for this key should replace all existing blobs.
/// This is used for WAL records that `will_init`.
/// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens
/// if the blob is cached.
pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64, flag: BlobFlag) {
// Implementation note: internally lag behind by one blob such that
// we have a start and end offset when initialising [`VectoredRead`]
let (prev_key, prev_lsn, prev_offset, prev_flag) = match self.prev {
None => {
self.prev = Some((key, lsn, offset, flag));
return;
}
Some(prev) => prev,
};
self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag);
self.prev = Some((key, lsn, offset, flag));
}
pub fn handle_range_end(&mut self, offset: u64) {
if let Some((prev_key, prev_lsn, prev_offset, prev_flag)) = self.prev {
self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag);
}
self.prev = None;
}
fn add_blob(&mut self, key: Key, lsn: Lsn, start_offset: u64, end_offset: u64, flag: BlobFlag) {
match flag {
BlobFlag::None => {
let blobs_for_key = self.blobs.entry(key).or_default();
blobs_for_key.push((lsn, start_offset, end_offset));
}
BlobFlag::Replaces => {
let blobs_for_key = self.blobs.entry(key).or_default();
blobs_for_key.clear();
blobs_for_key.push((lsn, start_offset, end_offset));
}
BlobFlag::Ignore => {}
}
}
pub fn finish(self) -> Vec<VectoredRead> {
let mut current_read: Option<VectoredRead> = None;
let mut reads = Vec::new();
for (key, blobs_for_key) in self.blobs {
for (lsn, start_offset, end_offset) in blobs_for_key {
let extended = match &mut current_read {
Some(read) => read.extend(start_offset, end_offset, BlobMeta { key, lsn }),
None => VectoredReadExtended::No,
};
if extended == VectoredReadExtended::No {
let next_read = VectoredRead::new(
start_offset,
end_offset,
BlobMeta { key, lsn },
self.max_read_size,
);
let prev_read = current_read.replace(next_read);
if let Some(read) = prev_read {
reads.push(read);
}
}
}
}
if let Some(read) = current_read {
reads.push(read);
}
reads
}
}
/// Disk reader for vectored blob spans (does not go through the page cache)
pub struct VectoredBlobReader {
file: VirtualFile,
max_vectored_read_size: usize,
}
impl VectoredBlobReader {
pub fn new(file: VirtualFile, max_vectored_read_size: usize) -> Self {
Self {
file,
max_vectored_read_size,
}
}
pub fn get_max_read_size(&self) -> usize {
self.max_vectored_read_size
}
pub fn get_file_ref(&self) -> &VirtualFile {
&self.file
}
/// Read the requested blobs into the buffer.
///
/// We have to deal with the fact that blobs are not fixed size.
/// Each blob is prefixed by a size header.
///
/// The success return value is a struct which contains the buffer
/// filled from disk and a list of offsets at which each blob lies
/// in the buffer.
pub async fn read_blobs(
&self,
read: &VectoredRead,
buf: BytesMut,
) -> Result<VectoredBlobsBuf, std::io::Error> {
// tracing::info!("read_blobs(read={read:?}, read_size={})", read.size());
assert!(read.size() > 0);
assert!(
read.size() <= buf.capacity(),
"{} > {}",
read.size(),
buf.capacity()
);
let buf = self
.file
.read_exact_at_n(buf, read.start, read.size())
.await?;
let blobs_at = read.blobs_at.as_slice();
let start_offset = blobs_at.first().expect("VectoredRead is never empty").0;
let mut metas = Vec::new();
let pairs = blobs_at.iter().zip(
blobs_at
.iter()
.map(Some)
.skip(1)
.chain(std::iter::once(None)),
);
for ((offset, meta), next) in pairs {
let offset_in_buf = offset - start_offset;
let first_len_byte = buf[offset_in_buf as usize];
// Each blob is prefixed by a header containing it's size.
// Extract the size and skip that header to find the start of the data.
let (size_length, blob_size) = if first_len_byte < 0x80 {
(1, first_len_byte as u64)
} else {
let mut blob_size_buf = [0u8; 4];
let offset_in_buf = offset_in_buf as usize;
blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]);
blob_size_buf[0] &= 0x7f;
(4, u32::from_be_bytes(blob_size_buf) as u64)
};
let start = offset_in_buf + size_length;
let end = match next {
Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset,
None => start + blob_size,
};
assert_eq!(end - start, blob_size);
metas.push(VectoredBlob {
start: start as usize,
end: end as usize,
meta: *meta,
})
}
Ok(VectoredBlobsBuf { buf, blobs: metas })
}
}
#[cfg(test)]
mod tests {
use super::*;
fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) {
assert_eq!(read.start, offset_range.first().unwrap().2);
let expected_offsets_in_read: Vec<_> = offset_range.iter().map(|o| o.2).collect();
let offsets_in_read: Vec<_> = read
.blobs_at
.as_slice()
.iter()
.map(|(offset, _)| *offset)
.collect();
assert_eq!(expected_offsets_in_read, offsets_in_read);
}
#[test]
fn planner_max_read_size_test() {
let max_read_size = 128 * 1024;
let key = Key::MIN;
let lsn = Lsn(0);
let blob_descriptions = vec![
(key, lsn, 0, BlobFlag::None),
(key, lsn, 32 * 1024, BlobFlag::None),
(key, lsn, 96 * 1024, BlobFlag::None), // Last in read 1
(key, lsn, 128 * 1024, BlobFlag::None), // Last in read 2
(key, lsn, 198 * 1024, BlobFlag::None), // Last in read 3
(key, lsn, 268 * 1024, BlobFlag::None), // Last in read 4
(key, lsn, 396 * 1024, BlobFlag::None), // Last in read 5
(key, lsn, 652 * 1024, BlobFlag::None), // Last in read 6
];
let ranges = [
&blob_descriptions[0..3],
&blob_descriptions[3..4],
&blob_descriptions[4..5],
&blob_descriptions[5..6],
&blob_descriptions[6..7],
&blob_descriptions[7..],
];
let mut planner = VectoredReadPlanner::new(max_read_size);
for (key, lsn, offset, flag) in blob_descriptions.clone() {
planner.handle(key, lsn, offset, flag);
}
planner.handle_range_end(652 * 1024);
let reads = planner.finish();
assert_eq!(reads.len(), 6);
for (idx, read) in reads.iter().enumerate() {
validate_read(read, ranges[idx]);
}
}
#[test]
fn planner_replacement_test() {
let max_read_size = 128 * 1024;
let first_key = Key::MIN;
let second_key = first_key.next();
let lsn = Lsn(0);
let blob_descriptions = vec![
(first_key, lsn, 0, BlobFlag::None), // First in read 1
(first_key, lsn, 1024, BlobFlag::None), // Last in read 1
(second_key, lsn, 2 * 1024, BlobFlag::Replaces),
(second_key, lsn, 3 * 1024, BlobFlag::None),
(second_key, lsn, 4 * 1024, BlobFlag::Replaces), // First in read 2
(second_key, lsn, 5 * 1024, BlobFlag::None), // Last in read 2
];
let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]];
let mut planner = VectoredReadPlanner::new(max_read_size);
for (key, lsn, offset, flag) in blob_descriptions.clone() {
planner.handle(key, lsn, offset, flag);
}
planner.handle_range_end(6 * 1024);
let reads = planner.finish();
assert_eq!(reads.len(), 2);
for (idx, read) in reads.iter().enumerate() {
validate_read(read, ranges[idx]);
}
}
}

View File

@@ -562,18 +562,7 @@ impl VirtualFile {
B: IoBufMut + Send,
{
let (buf, res) =
read_exact_at_impl(buf, offset, None, |buf, offset| self.read_at(buf, offset)).await;
res.map(|()| buf)
}
pub async fn read_exact_at_n<B>(&self, buf: B, offset: u64, count: usize) -> Result<B, Error>
where
B: IoBufMut + Send,
{
let (buf, res) = read_exact_at_impl(buf, offset, Some(count), |buf, offset| {
self.read_at(buf, offset)
})
.await;
read_exact_at_impl(buf, offset, |buf, offset| self.read_at(buf, offset)).await;
res.map(|()| buf)
}
@@ -707,7 +696,6 @@ impl VirtualFile {
pub async fn read_exact_at_impl<B, F, Fut>(
buf: B,
mut offset: u64,
count: Option<usize>,
mut read_at: F,
) -> (B, std::io::Result<()>)
where
@@ -715,15 +703,7 @@ where
F: FnMut(tokio_epoll_uring::Slice<B>, u64) -> Fut,
Fut: std::future::Future<Output = (tokio_epoll_uring::Slice<B>, std::io::Result<usize>)>,
{
let mut buf: tokio_epoll_uring::Slice<B> = match count {
Some(count) => {
assert!(count <= buf.bytes_total());
assert!(count > 0);
buf.slice(..count) // may include uninitialized memory
}
None => buf.slice_full(), // includes all the uninitialized memory
};
let mut buf: tokio_epoll_uring::Slice<B> = buf.slice_full(); // includes all the uninitialized memory
while buf.bytes_total() != 0 {
let res;
(buf, res) = read_at(buf, offset).await;
@@ -813,7 +793,7 @@ mod test_read_exact_at_impl {
result: Ok(vec![b'a', b'b', b'c', b'd', b'e']),
}]),
}));
let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
let mock_read_at = Arc::clone(&mock_read_at);
async move { mock_read_at.lock().await.read_at(buf, offset).await }
})
@@ -822,33 +802,13 @@ mod test_read_exact_at_impl {
assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']);
}
#[tokio::test]
async fn test_with_count() {
let buf = Vec::with_capacity(5);
let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
expectations: VecDeque::from(vec![Expectation {
offset: 0,
bytes_total: 3,
result: Ok(vec![b'a', b'b', b'c']),
}]),
}));
let (buf, res) = read_exact_at_impl(buf, 0, Some(3), |buf, offset| {
let mock_read_at = Arc::clone(&mock_read_at);
async move { mock_read_at.lock().await.read_at(buf, offset).await }
})
.await;
assert!(res.is_ok());
assert_eq!(buf, vec![b'a', b'b', b'c']);
}
#[tokio::test]
async fn test_empty_buf_issues_no_syscall() {
let buf = Vec::new();
let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
expectations: VecDeque::new(),
}));
let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
let mock_read_at = Arc::clone(&mock_read_at);
async move { mock_read_at.lock().await.read_at(buf, offset).await }
})
@@ -873,7 +833,7 @@ mod test_read_exact_at_impl {
},
]),
}));
let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
let mock_read_at = Arc::clone(&mock_read_at);
async move { mock_read_at.lock().await.read_at(buf, offset).await }
})
@@ -904,7 +864,7 @@ mod test_read_exact_at_impl {
},
]),
}));
let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
let mock_read_at = Arc::clone(&mock_read_at);
async move { mock_read_at.lock().await.read_at(buf, offset).await }
})

View File

@@ -9,7 +9,7 @@ use bytes::Bytes;
use nix::poll::{PollFd, PollFlags};
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
use postgres_ffi::BLCKSZ;
use std::os::fd::AsRawFd;
use std::os::{fd::AsRawFd, unix::process::CommandExt};
#[cfg(feature = "testing")]
use std::sync::atomic::AtomicUsize;
use std::{
@@ -26,6 +26,40 @@ mod no_leak_child;
/// The IPC protocol that pageserver and walredo process speak over their shared pipe.
mod protocol;
///
/// Command with ability not to give all file descriptors to child process
///
trait CloseFileDescriptors: CommandExt {
///
/// Close file descriptors (other than stdin, stdout, stderr) in child process
///
fn close_fds(&mut self) -> &mut Command;
}
impl<C: CommandExt> CloseFileDescriptors for C {
fn close_fds(&mut self) -> &mut Command {
// SAFETY: Code executed inside pre_exec should have async-signal-safety,
// which means it should be safe to execute inside a signal handler.
// The precise meaning depends on platform. See `man signal-safety`
// for the linux definition.
//
// The set_fds_cloexec_threadsafe function is documented to be
// async-signal-safe.
//
// Aside from this function, the rest of the code is re-entrant and
// doesn't make any syscalls. We're just passing constants.
//
// NOTE: It's easy to indirectly cause a malloc or lock a mutex,
// which is not async-signal-safe. Be careful.
unsafe {
self.pre_exec(move || {
close_fds::set_fds_cloexec_threadsafe(3, &[]);
Ok(())
})
}
}
}
pub struct WalRedoProcess {
#[allow(dead_code)]
conf: &'static PageServerConf,
@@ -79,14 +113,16 @@ impl WalRedoProcess {
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
// NB: The redo process is not trusted after we sent it the first
// walredo work. Before that, it is trusted. Specifically, we trust
// it to
// 1. close all file descriptors except stdin, stdout, stderr because
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
// the files it opens, and
// 2. to use seccomp to sandbox itself before processing the first
// walredo request.
// The redo process is not trusted, and runs in seccomp mode that
// doesn't allow it to open any files. We have to also make sure it
// doesn't inherit any file descriptors from the pageserver, that
// would allow an attacker to read any files that happen to be open
// in the pageserver.
//
// The Rust standard library makes sure to mark any file descriptors with
// as close-on-exec by default, but that's not enough, since we use
// libraries that directly call libc open without setting that flag.
.close_fds()
.spawn_no_leak_child(tenant_shard_id)
.context("spawn process")?;
WAL_REDO_PROCESS_COUNTERS.started.inc();

256
poetry.lock generated
View File

@@ -158,28 +158,6 @@ files = [
attrs = ">=16.0.0"
pluggy = ">=0.4.0"
[[package]]
name = "anyio"
version = "4.3.0"
description = "High level compatibility layer for multiple asynchronous event loop implementations"
optional = false
python-versions = ">=3.8"
files = [
{file = "anyio-4.3.0-py3-none-any.whl", hash = "sha256:048e05d0f6caeed70d731f3db756d35dcc1f35747c8c403364a8332c630441b8"},
{file = "anyio-4.3.0.tar.gz", hash = "sha256:f75253795a87df48568485fd18cdd2a3fa5c4f7c5be8e5e36637733fce06fed6"},
]
[package.dependencies]
exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""}
idna = ">=2.8"
sniffio = ">=1.1"
typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""}
[package.extras]
doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"]
test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"]
trio = ["trio (>=0.23)"]
[[package]]
name = "async-timeout"
version = "4.0.3"
@@ -858,43 +836,43 @@ files = [
[[package]]
name = "cryptography"
version = "42.0.2"
version = "42.0.0"
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
optional = false
python-versions = ">=3.7"
files = [
{file = "cryptography-42.0.2-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:701171f825dcab90969596ce2af253143b93b08f1a716d4b2a9d2db5084ef7be"},
{file = "cryptography-42.0.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:61321672b3ac7aade25c40449ccedbc6db72c7f5f0fdf34def5e2f8b51ca530d"},
{file = "cryptography-42.0.2-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea2c3ffb662fec8bbbfce5602e2c159ff097a4631d96235fcf0fb00e59e3ece4"},
{file = "cryptography-42.0.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b15c678f27d66d247132cbf13df2f75255627bcc9b6a570f7d2fd08e8c081d2"},
{file = "cryptography-42.0.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:8e88bb9eafbf6a4014d55fb222e7360eef53e613215085e65a13290577394529"},
{file = "cryptography-42.0.2-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:a047682d324ba56e61b7ea7c7299d51e61fd3bca7dad2ccc39b72bd0118d60a1"},
{file = "cryptography-42.0.2-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:36d4b7c4be6411f58f60d9ce555a73df8406d484ba12a63549c88bd64f7967f1"},
{file = "cryptography-42.0.2-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:a00aee5d1b6c20620161984f8ab2ab69134466c51f58c052c11b076715e72929"},
{file = "cryptography-42.0.2-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:b97fe7d7991c25e6a31e5d5e795986b18fbbb3107b873d5f3ae6dc9a103278e9"},
{file = "cryptography-42.0.2-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5fa82a26f92871eca593b53359c12ad7949772462f887c35edaf36f87953c0e2"},
{file = "cryptography-42.0.2-cp37-abi3-win32.whl", hash = "sha256:4b063d3413f853e056161eb0c7724822a9740ad3caa24b8424d776cebf98e7ee"},
{file = "cryptography-42.0.2-cp37-abi3-win_amd64.whl", hash = "sha256:841ec8af7a8491ac76ec5a9522226e287187a3107e12b7d686ad354bb78facee"},
{file = "cryptography-42.0.2-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:55d1580e2d7e17f45d19d3b12098e352f3a37fe86d380bf45846ef257054b242"},
{file = "cryptography-42.0.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28cb2c41f131a5758d6ba6a0504150d644054fd9f3203a1e8e8d7ac3aea7f73a"},
{file = "cryptography-42.0.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9097a208875fc7bbeb1286d0125d90bdfed961f61f214d3f5be62cd4ed8a446"},
{file = "cryptography-42.0.2-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:44c95c0e96b3cb628e8452ec060413a49002a247b2b9938989e23a2c8291fc90"},
{file = "cryptography-42.0.2-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:2f9f14185962e6a04ab32d1abe34eae8a9001569ee4edb64d2304bf0d65c53f3"},
{file = "cryptography-42.0.2-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:09a77e5b2e8ca732a19a90c5bca2d124621a1edb5438c5daa2d2738bfeb02589"},
{file = "cryptography-42.0.2-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:ad28cff53f60d99a928dfcf1e861e0b2ceb2bc1f08a074fdd601b314e1cc9e0a"},
{file = "cryptography-42.0.2-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:130c0f77022b2b9c99d8cebcdd834d81705f61c68e91ddd614ce74c657f8b3ea"},
{file = "cryptography-42.0.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:fa3dec4ba8fb6e662770b74f62f1a0c7d4e37e25b58b2bf2c1be4c95372b4a33"},
{file = "cryptography-42.0.2-cp39-abi3-win32.whl", hash = "sha256:3dbd37e14ce795b4af61b89b037d4bc157f2cb23e676fa16932185a04dfbf635"},
{file = "cryptography-42.0.2-cp39-abi3-win_amd64.whl", hash = "sha256:8a06641fb07d4e8f6c7dda4fc3f8871d327803ab6542e33831c7ccfdcb4d0ad6"},
{file = "cryptography-42.0.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:087887e55e0b9c8724cf05361357875adb5c20dec27e5816b653492980d20380"},
{file = "cryptography-42.0.2-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:a7ef8dd0bf2e1d0a27042b231a3baac6883cdd5557036f5e8df7139255feaac6"},
{file = "cryptography-42.0.2-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4383b47f45b14459cab66048d384614019965ba6c1a1a141f11b5a551cace1b2"},
{file = "cryptography-42.0.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:fbeb725c9dc799a574518109336acccaf1303c30d45c075c665c0793c2f79a7f"},
{file = "cryptography-42.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:320948ab49883557a256eab46149df79435a22d2fefd6a66fe6946f1b9d9d008"},
{file = "cryptography-42.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:5ef9bc3d046ce83c4bbf4c25e1e0547b9c441c01d30922d812e887dc5f125c12"},
{file = "cryptography-42.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:52ed9ebf8ac602385126c9a2fe951db36f2cb0c2538d22971487f89d0de4065a"},
{file = "cryptography-42.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:141e2aa5ba100d3788c0ad7919b288f89d1fe015878b9659b307c9ef867d3a65"},
{file = "cryptography-42.0.2.tar.gz", hash = "sha256:e0ec52ba3c7f1b7d813cd52649a5b3ef1fc0d433219dc8c93827c57eab6cf888"},
{file = "cryptography-42.0.0-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:c640b0ef54138fde761ec99a6c7dc4ce05e80420262c20fa239e694ca371d434"},
{file = "cryptography-42.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:678cfa0d1e72ef41d48993a7be75a76b0725d29b820ff3cfd606a5b2b33fda01"},
{file = "cryptography-42.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:146e971e92a6dd042214b537a726c9750496128453146ab0ee8971a0299dc9bd"},
{file = "cryptography-42.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87086eae86a700307b544625e3ba11cc600c3c0ef8ab97b0fda0705d6db3d4e3"},
{file = "cryptography-42.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:0a68bfcf57a6887818307600c3c0ebc3f62fbb6ccad2240aa21887cda1f8df1b"},
{file = "cryptography-42.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:5a217bca51f3b91971400890905a9323ad805838ca3fa1e202a01844f485ee87"},
{file = "cryptography-42.0.0-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:ca20550bb590db16223eb9ccc5852335b48b8f597e2f6f0878bbfd9e7314eb17"},
{file = "cryptography-42.0.0-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:33588310b5c886dfb87dba5f013b8d27df7ffd31dc753775342a1e5ab139e59d"},
{file = "cryptography-42.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9515ea7f596c8092fdc9902627e51b23a75daa2c7815ed5aa8cf4f07469212ec"},
{file = "cryptography-42.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:35cf6ed4c38f054478a9df14f03c1169bb14bd98f0b1705751079b25e1cb58bc"},
{file = "cryptography-42.0.0-cp37-abi3-win32.whl", hash = "sha256:8814722cffcfd1fbd91edd9f3451b88a8f26a5fd41b28c1c9193949d1c689dc4"},
{file = "cryptography-42.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:a2a8d873667e4fd2f34aedab02ba500b824692c6542e017075a2efc38f60a4c0"},
{file = "cryptography-42.0.0-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:8fedec73d590fd30c4e3f0d0f4bc961aeca8390c72f3eaa1a0874d180e868ddf"},
{file = "cryptography-42.0.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be41b0c7366e5549265adf2145135dca107718fa44b6e418dc7499cfff6b4689"},
{file = "cryptography-42.0.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ca482ea80626048975360c8e62be3ceb0f11803180b73163acd24bf014133a0"},
{file = "cryptography-42.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c58115384bdcfe9c7f644c72f10f6f42bed7cf59f7b52fe1bf7ae0a622b3a139"},
{file = "cryptography-42.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:56ce0c106d5c3fec1038c3cca3d55ac320a5be1b44bf15116732d0bc716979a2"},
{file = "cryptography-42.0.0-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:324721d93b998cb7367f1e6897370644751e5580ff9b370c0a50dc60a2003513"},
{file = "cryptography-42.0.0-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:d97aae66b7de41cdf5b12087b5509e4e9805ed6f562406dfcf60e8481a9a28f8"},
{file = "cryptography-42.0.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:85f759ed59ffd1d0baad296e72780aa62ff8a71f94dc1ab340386a1207d0ea81"},
{file = "cryptography-42.0.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:206aaf42e031b93f86ad60f9f5d9da1b09164f25488238ac1dc488334eb5e221"},
{file = "cryptography-42.0.0-cp39-abi3-win32.whl", hash = "sha256:74f18a4c8ca04134d2052a140322002fef535c99cdbc2a6afc18a8024d5c9d5b"},
{file = "cryptography-42.0.0-cp39-abi3-win_amd64.whl", hash = "sha256:14e4b909373bc5bf1095311fa0f7fcabf2d1a160ca13f1e9e467be1ac4cbdf94"},
{file = "cryptography-42.0.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3005166a39b70c8b94455fdbe78d87a444da31ff70de3331cdec2c568cf25b7e"},
{file = "cryptography-42.0.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:be14b31eb3a293fc6e6aa2807c8a3224c71426f7c4e3639ccf1a2f3ffd6df8c3"},
{file = "cryptography-42.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:bd7cf7a8d9f34cc67220f1195884151426ce616fdc8285df9054bfa10135925f"},
{file = "cryptography-42.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c310767268d88803b653fffe6d6f2f17bb9d49ffceb8d70aed50ad45ea49ab08"},
{file = "cryptography-42.0.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:bdce70e562c69bb089523e75ef1d9625b7417c6297a76ac27b1b8b1eb51b7d0f"},
{file = "cryptography-42.0.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:e9326ca78111e4c645f7e49cbce4ed2f3f85e17b61a563328c85a5208cf34440"},
{file = "cryptography-42.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:69fd009a325cad6fbfd5b04c711a4da563c6c4854fc4c9544bff3088387c77c0"},
{file = "cryptography-42.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:988b738f56c665366b1e4bfd9045c3efae89ee366ca3839cd5af53eaa1401bce"},
{file = "cryptography-42.0.0.tar.gz", hash = "sha256:6cf9b76d6e93c62114bd19485e5cb003115c134cf9ce91f8ac924c44f8c8c3f4"},
]
[package.dependencies]
@@ -1095,100 +1073,6 @@ files = [
{file = "graphql_core-3.2.1-py3-none-any.whl", hash = "sha256:f83c658e4968998eed1923a2e3e3eddd347e005ac0315fbb7ca4d70ea9156323"},
]
[[package]]
name = "h11"
version = "0.14.0"
description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
optional = false
python-versions = ">=3.7"
files = [
{file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"},
{file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
]
[[package]]
name = "h2"
version = "4.1.0"
description = "HTTP/2 State-Machine based protocol implementation"
optional = false
python-versions = ">=3.6.1"
files = [
{file = "h2-4.1.0-py3-none-any.whl", hash = "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d"},
{file = "h2-4.1.0.tar.gz", hash = "sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb"},
]
[package.dependencies]
hpack = ">=4.0,<5"
hyperframe = ">=6.0,<7"
[[package]]
name = "hpack"
version = "4.0.0"
description = "Pure-Python HPACK header compression"
optional = false
python-versions = ">=3.6.1"
files = [
{file = "hpack-4.0.0-py3-none-any.whl", hash = "sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c"},
{file = "hpack-4.0.0.tar.gz", hash = "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095"},
]
[[package]]
name = "httpcore"
version = "1.0.3"
description = "A minimal low-level HTTP client."
optional = false
python-versions = ">=3.8"
files = [
{file = "httpcore-1.0.3-py3-none-any.whl", hash = "sha256:9a6a501c3099307d9fd76ac244e08503427679b1e81ceb1d922485e2f2462ad2"},
{file = "httpcore-1.0.3.tar.gz", hash = "sha256:5c0f9546ad17dac4d0772b0808856eb616eb8b48ce94f49ed819fd6982a8a544"},
]
[package.dependencies]
certifi = "*"
h11 = ">=0.13,<0.15"
[package.extras]
asyncio = ["anyio (>=4.0,<5.0)"]
http2 = ["h2 (>=3,<5)"]
socks = ["socksio (==1.*)"]
trio = ["trio (>=0.22.0,<0.24.0)"]
[[package]]
name = "httpx"
version = "0.26.0"
description = "The next generation HTTP client."
optional = false
python-versions = ">=3.8"
files = [
{file = "httpx-0.26.0-py3-none-any.whl", hash = "sha256:8915f5a3627c4d47b73e8202457cb28f1266982d1159bd5779d86a80c0eab1cd"},
{file = "httpx-0.26.0.tar.gz", hash = "sha256:451b55c30d5185ea6b23c2c793abf9bb237d2a7dfb901ced6ff69ad37ec1dfaf"},
]
[package.dependencies]
anyio = "*"
certifi = "*"
h2 = {version = ">=3,<5", optional = true, markers = "extra == \"http2\""}
httpcore = "==1.*"
idna = "*"
sniffio = "*"
[package.extras]
brotli = ["brotli", "brotlicffi"]
cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"]
http2 = ["h2 (>=3,<5)"]
socks = ["socksio (==1.*)"]
[[package]]
name = "hyperframe"
version = "6.0.1"
description = "HTTP/2 framing layer for Python"
optional = false
python-versions = ">=3.6.1"
files = [
{file = "hyperframe-6.0.1-py3-none-any.whl", hash = "sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15"},
{file = "hyperframe-6.0.1.tar.gz", hash = "sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914"},
]
[[package]]
name = "idna"
version = "3.3"
@@ -2025,20 +1909,6 @@ pytest = [
{version = ">=6.2.4", markers = "python_version >= \"3.10\""},
]
[[package]]
name = "pytest-repeat"
version = "0.9.3"
description = "pytest plugin for repeating tests"
optional = false
python-versions = ">=3.7"
files = [
{file = "pytest_repeat-0.9.3-py3-none-any.whl", hash = "sha256:26ab2df18226af9d5ce441c858f273121e92ff55f5bb311d25755b8d7abdd8ed"},
{file = "pytest_repeat-0.9.3.tar.gz", hash = "sha256:ffd3836dfcd67bb270bec648b330e20be37d2966448c4148c4092d1e8aba8185"},
]
[package.dependencies]
pytest = "*"
[[package]]
name = "pytest-rerunfailures"
version = "13.0"
@@ -2182,6 +2052,7 @@ files = [
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -2271,28 +2142,28 @@ pyasn1 = ">=0.1.3"
[[package]]
name = "ruff"
version = "0.2.2"
version = "0.1.11"
description = "An extremely fast Python linter and code formatter, written in Rust."
optional = false
python-versions = ">=3.7"
files = [
{file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:0a9efb032855ffb3c21f6405751d5e147b0c6b631e3ca3f6b20f917572b97eb6"},
{file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:d450b7fbff85913f866a5384d8912710936e2b96da74541c82c1b458472ddb39"},
{file = "ruff-0.2.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecd46e3106850a5c26aee114e562c329f9a1fbe9e4821b008c4404f64ff9ce73"},
{file = "ruff-0.2.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e22676a5b875bd72acd3d11d5fa9075d3a5f53b877fe7b4793e4673499318ba"},
{file = "ruff-0.2.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1695700d1e25a99d28f7a1636d85bafcc5030bba9d0578c0781ba1790dbcf51c"},
{file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:b0c232af3d0bd8f521806223723456ffebf8e323bd1e4e82b0befb20ba18388e"},
{file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f63d96494eeec2fc70d909393bcd76c69f35334cdbd9e20d089fb3f0640216ca"},
{file = "ruff-0.2.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a61ea0ff048e06de273b2e45bd72629f470f5da8f71daf09fe481278b175001"},
{file = "ruff-0.2.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e1439c8f407e4f356470e54cdecdca1bd5439a0673792dbe34a2b0a551a2fe3"},
{file = "ruff-0.2.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:940de32dc8853eba0f67f7198b3e79bc6ba95c2edbfdfac2144c8235114d6726"},
{file = "ruff-0.2.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0c126da55c38dd917621552ab430213bdb3273bb10ddb67bc4b761989210eb6e"},
{file = "ruff-0.2.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:3b65494f7e4bed2e74110dac1f0d17dc8e1f42faaa784e7c58a98e335ec83d7e"},
{file = "ruff-0.2.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1ec49be4fe6ddac0503833f3ed8930528e26d1e60ad35c2446da372d16651ce9"},
{file = "ruff-0.2.2-py3-none-win32.whl", hash = "sha256:d920499b576f6c68295bc04e7b17b6544d9d05f196bb3aac4358792ef6f34325"},
{file = "ruff-0.2.2-py3-none-win_amd64.whl", hash = "sha256:cc9a91ae137d687f43a44c900e5d95e9617cb37d4c989e462980ba27039d239d"},
{file = "ruff-0.2.2-py3-none-win_arm64.whl", hash = "sha256:c9d15fc41e6054bfc7200478720570078f0b41c9ae4f010bcc16bd6f4d1aacdd"},
{file = "ruff-0.2.2.tar.gz", hash = "sha256:e62ed7f36b3068a30ba39193a14274cd706bc486fad521276458022f7bccb31d"},
{file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:a7f772696b4cdc0a3b2e527fc3c7ccc41cdcb98f5c80fdd4f2b8c50eb1458196"},
{file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:934832f6ed9b34a7d5feea58972635c2039c7a3b434fe5ba2ce015064cb6e955"},
{file = "ruff-0.1.11-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea0d3e950e394c4b332bcdd112aa566010a9f9c95814844a7468325290aabfd9"},
{file = "ruff-0.1.11-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9bd4025b9c5b429a48280785a2b71d479798a69f5c2919e7d274c5f4b32c3607"},
{file = "ruff-0.1.11-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1ad00662305dcb1e987f5ec214d31f7d6a062cae3e74c1cbccef15afd96611d"},
{file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:4b077ce83f47dd6bea1991af08b140e8b8339f0ba8cb9b7a484c30ebab18a23f"},
{file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4a88efecec23c37b11076fe676e15c6cdb1271a38f2b415e381e87fe4517f18"},
{file = "ruff-0.1.11-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b25093dad3b055667730a9b491129c42d45e11cdb7043b702e97125bcec48a1"},
{file = "ruff-0.1.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:231d8fb11b2cc7c0366a326a66dafc6ad449d7fcdbc268497ee47e1334f66f77"},
{file = "ruff-0.1.11-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:09c415716884950080921dd6237767e52e227e397e2008e2bed410117679975b"},
{file = "ruff-0.1.11-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0f58948c6d212a6b8d41cd59e349751018797ce1727f961c2fa755ad6208ba45"},
{file = "ruff-0.1.11-py3-none-musllinux_1_2_i686.whl", hash = "sha256:190a566c8f766c37074d99640cd9ca3da11d8deae2deae7c9505e68a4a30f740"},
{file = "ruff-0.1.11-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6464289bd67b2344d2a5d9158d5eb81025258f169e69a46b741b396ffb0cda95"},
{file = "ruff-0.1.11-py3-none-win32.whl", hash = "sha256:9b8f397902f92bc2e70fb6bebfa2139008dc72ae5177e66c383fa5426cb0bf2c"},
{file = "ruff-0.1.11-py3-none-win_amd64.whl", hash = "sha256:eb85ee287b11f901037a6683b2374bb0ec82928c5cbc984f575d0437979c521a"},
{file = "ruff-0.1.11-py3-none-win_arm64.whl", hash = "sha256:97ce4d752f964ba559c7023a86e5f8e97f026d511e48013987623915431c7ea9"},
{file = "ruff-0.1.11.tar.gz", hash = "sha256:f9d4d88cb6eeb4dfe20f9f0519bd2eaba8119bde87c3d5065c541dbae2b5a2cb"},
]
[[package]]
@@ -2354,17 +2225,6 @@ files = [
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
]
[[package]]
name = "sniffio"
version = "1.3.0"
description = "Sniff out which async library your code is running under"
optional = false
python-versions = ">=3.7"
files = [
{file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"},
{file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"},
]
[[package]]
name = "sshpubkeys"
version = "3.3.1"
@@ -2571,6 +2431,16 @@ files = [
{file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
{file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
{file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
{file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
{file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -2808,4 +2678,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "af9d5b45310c12411bfe67cb9677d2236808d0780ca1bd81525d2763a928f7f9"
content-hash = "e99954cbbfef8dcc5e13cea7103c87657639a192f2372983bdb8c5d624c2e447"

View File

@@ -171,8 +171,16 @@ async fn task_main(
.context("failed to set socket option")?;
info!(%peer_addr, "serving");
let ctx = RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni");
handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await
let mut ctx =
RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni");
handle_client(
&mut ctx,
dest_suffix,
tls_config,
tls_server_end_point,
socket,
)
.await
}
.unwrap_or_else(|e| {
// Acknowledge that the task has finished with an error.
@@ -240,7 +248,7 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
}
async fn handle_client(
mut ctx: RequestMonitoring,
ctx: &mut RequestMonitoring,
dest_suffix: Arc<String>,
tls_config: Arc<rustls::ServerConfig>,
tls_server_end_point: TlsServerEndPoint,

View File

@@ -87,22 +87,6 @@ pub mod errors {
impl ReportableError for ApiError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
ApiError::Console {
status: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
..
} => crate::error::ErrorKind::User,
ApiError::Console {
status: http::StatusCode::LOCKED,
text,
} if text.contains("quota exceeded")
|| text.contains("the limit for current plan reached") =>
{
crate::error::ErrorKind::User
}
ApiError::Console {
status: http::StatusCode::TOO_MANY_REQUESTS,
..
} => crate::error::ErrorKind::ServiceRateLimit,
ApiError::Console { .. } => crate::error::ErrorKind::ControlPlane,
ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane,
}
@@ -238,7 +222,7 @@ pub mod errors {
match self {
WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
WakeComputeError::ApiError(e) => e.get_error_kind(),
WakeComputeError::TimeoutError => crate::error::ErrorKind::ServiceRateLimit,
WakeComputeError::TimeoutError => crate::error::ErrorKind::RateLimit,
}
}
}

View File

@@ -147,13 +147,15 @@ impl RequestMonitoring {
self.success = true;
}
pub fn log(self) {}
}
impl Drop for RequestMonitoring {
fn drop(&mut self) {
pub fn log(&mut self) {
if let Some(tx) = self.sender.take() {
let _: Result<(), _> = tx.send(self.clone());
}
}
}
impl Drop for RequestMonitoring {
fn drop(&mut self) {
self.log()
}
}

View File

@@ -37,12 +37,9 @@ pub enum ErrorKind {
/// Network error between user and proxy. Not necessarily user error
ClientDisconnect,
/// Proxy self-imposed user rate limits
/// Proxy self-imposed rate limits
RateLimit,
/// Proxy self-imposed service-wise rate limits
ServiceRateLimit,
/// internal errors
Service,
@@ -57,12 +54,25 @@ pub enum ErrorKind {
}
impl ErrorKind {
pub fn to_str(&self) -> &'static str {
match self {
ErrorKind::User => "request failed due to user error",
ErrorKind::ClientDisconnect => "client disconnected",
ErrorKind::RateLimit => "request cancelled due to rate limit",
ErrorKind::Service => "internal service error",
ErrorKind::ControlPlane => "non-retryable control plane error",
ErrorKind::Postgres => "postgres error",
ErrorKind::Compute => {
"non-retryable compute connection error (or exhausted retry capacity)"
}
}
}
pub fn to_metric_label(&self) -> &'static str {
match self {
ErrorKind::User => "user",
ErrorKind::ClientDisconnect => "clientdisconnect",
ErrorKind::RateLimit => "ratelimit",
ErrorKind::ServiceRateLimit => "serviceratelimit",
ErrorKind::Service => "service",
ErrorKind::ControlPlane => "controlplane",
ErrorKind::Postgres => "postgres",
@@ -75,6 +85,12 @@ pub trait ReportableError: fmt::Display + Send + 'static {
fn get_error_kind(&self) -> ErrorKind;
}
impl ReportableError for tokio::time::error::Elapsed {
fn get_error_kind(&self) -> ErrorKind {
ErrorKind::RateLimit
}
}
impl ReportableError for tokio_postgres::error::Error {
fn get_error_kind(&self) -> ErrorKind {
if self.as_db_error().is_some() {

View File

@@ -132,8 +132,9 @@ struct Scram(scram::ServerSecret);
impl Scram {
fn new(password: &str) -> anyhow::Result<Self> {
let secret =
scram::ServerSecret::build(password).context("failed to generate scram secret")?;
let salt = rand::random::<[u8; 16]>();
let secret = scram::ServerSecret::build(password, &salt, 256)
.context("failed to generate scram secret")?;
Ok(Scram(secret))
}

View File

@@ -12,6 +12,9 @@ mod messages;
mod secret;
mod signature;
#[cfg(any(test, doc))]
mod password;
pub use exchange::{exchange, Exchange};
pub use key::ScramKey;
pub use secret::ServerSecret;
@@ -56,21 +59,27 @@ fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {
#[cfg(test)]
mod tests {
use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
use crate::sasl::{Mechanism, Step};
use super::{Exchange, ServerSecret};
use super::{password::SaltedPassword, Exchange, ServerSecret};
#[test]
fn snapshot() {
fn happy_path() {
let iterations = 4096;
let salt = "QSXCR+Q6sek8bf92";
let stored_key = "FO+9jBb3MUukt6jJnzjPZOWc5ow/Pu6JtPyju0aqaE8=";
let server_key = "qxJ1SbmSAi5EcS0J5Ck/cKAm/+Ixa+Kwp63f4OHDgzo=";
let secret = format!("SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}",);
let secret = ServerSecret::parse(&secret).unwrap();
let salt_base64 = "QSXCR+Q6sek8bf92";
let pw = SaltedPassword::new(
b"pencil",
base64::decode(salt_base64).unwrap().as_slice(),
iterations,
);
let secret = ServerSecret {
iterations,
salt_base64: salt_base64.to_owned(),
stored_key: pw.client_key().sha256(),
server_key: pw.server_key(),
doomed: false,
};
const NONCE: [u8; 18] = [
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
];
@@ -112,33 +121,4 @@ mod tests {
]
);
}
fn run_round_trip_test(server_password: &str, client_password: &str) {
let scram_secret = ServerSecret::build(server_password).unwrap();
let sasl_client =
ScramSha256::new(client_password.as_bytes(), ChannelBinding::unsupported());
let outcome = super::exchange(
&scram_secret,
sasl_client,
crate::config::TlsServerEndPoint::Undefined,
)
.unwrap();
match outcome {
crate::sasl::Outcome::Success(_) => {}
crate::sasl::Outcome::Failure(r) => panic!("{r}"),
}
}
#[test]
fn round_trip() {
run_round_trip_test("pencil", "pencil")
}
#[test]
#[should_panic(expected = "password doesn't match")]
fn failure() {
run_round_trip_test("pencil", "eraser")
}
}

View File

@@ -3,7 +3,7 @@
/// Faithfully taken from PostgreSQL.
pub const SCRAM_KEY_LEN: usize = 32;
/// One of the keys derived from the user's password.
/// One of the keys derived from the [password](super::password::SaltedPassword).
/// We use the same structure for all keys, i.e.
/// `ClientKey`, `StoredKey`, and `ServerKey`.
#[derive(Clone, Default, PartialEq, Eq, Debug)]

View File

@@ -0,0 +1,74 @@
//! Password hashing routines.
use super::key::ScramKey;
pub const SALTED_PASSWORD_LEN: usize = 32;
/// Salted hashed password is essential for [key](super::key) derivation.
#[repr(transparent)]
pub struct SaltedPassword {
bytes: [u8; SALTED_PASSWORD_LEN],
}
impl SaltedPassword {
/// See `scram-common.c : scram_SaltedPassword` for details.
/// Further reading: <https://datatracker.ietf.org/doc/html/rfc2898> (see `PBKDF2`).
pub fn new(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword {
pbkdf2::pbkdf2_hmac_array::<sha2::Sha256, 32>(password, salt, iterations).into()
}
/// Derive `ClientKey` from a salted hashed password.
pub fn client_key(&self) -> ScramKey {
super::hmac_sha256(&self.bytes, [b"Client Key".as_ref()]).into()
}
/// Derive `ServerKey` from a salted hashed password.
pub fn server_key(&self) -> ScramKey {
super::hmac_sha256(&self.bytes, [b"Server Key".as_ref()]).into()
}
}
impl From<[u8; SALTED_PASSWORD_LEN]> for SaltedPassword {
#[inline(always)]
fn from(bytes: [u8; SALTED_PASSWORD_LEN]) -> Self {
Self { bytes }
}
}
#[cfg(test)]
mod tests {
use super::SaltedPassword;
fn legacy_pbkdf2_impl(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword {
let one = 1_u32.to_be_bytes(); // magic
let mut current = super::super::hmac_sha256(password, [salt, &one]);
let mut result = current;
for _ in 1..iterations {
current = super::super::hmac_sha256(password, [current.as_ref()]);
// TODO: result = current.zip(result).map(|(x, y)| x ^ y), issue #80094
for (i, x) in current.iter().enumerate() {
result[i] ^= x;
}
}
result.into()
}
#[test]
fn pbkdf2() {
let password = "a-very-secure-password";
let salt = "such-a-random-salt";
let iterations = 4096;
let output = [
203, 18, 206, 81, 4, 154, 193, 100, 147, 41, 211, 217, 177, 203, 69, 210, 194, 211,
101, 1, 248, 156, 96, 0, 8, 223, 30, 87, 158, 41, 20, 42,
];
let actual = SaltedPassword::new(password.as_bytes(), salt.as_bytes(), iterations);
let expected = legacy_pbkdf2_impl(password.as_bytes(), salt.as_bytes(), iterations);
assert_eq!(actual.bytes, output);
assert_eq!(actual.bytes, expected.bytes);
}
}

View File

@@ -3,7 +3,7 @@
use super::base64_decode_array;
use super::key::ScramKey;
/// Server secret is produced from user's password,
/// Server secret is produced from [password](super::password::SaltedPassword)
/// and is used throughout the authentication process.
#[derive(Clone, Eq, PartialEq, Debug)]
pub struct ServerSecret {
@@ -59,10 +59,21 @@ impl ServerSecret {
/// Build a new server secret from the prerequisites.
/// XXX: We only use this function in tests.
#[cfg(test)]
pub fn build(password: &str) -> Option<Self> {
Self::parse(&postgres_protocol::password::scram_sha_256(
password.as_bytes(),
))
pub fn build(password: &str, salt: &[u8], iterations: u32) -> Option<Self> {
// TODO: implement proper password normalization required by the RFC
if !password.is_ascii() {
return None;
}
let password = super::password::SaltedPassword::new(password.as_bytes(), salt, iterations);
Some(Self {
iterations,
salt_base64: base64::encode(salt),
stored_key: password.client_key().sha256(),
server_key: password.server_key(),
doomed: false,
})
}
}
@@ -92,4 +103,20 @@ mod tests {
assert_eq!(base64::encode(parsed.stored_key), stored_key);
assert_eq!(base64::encode(parsed.server_key), server_key);
}
#[test]
fn build_scram_secret() {
let salt = b"salt";
let secret = ServerSecret::build("password", salt, 4096).unwrap();
assert_eq!(secret.iterations, 4096);
assert_eq!(secret.salt_base64, base64::encode(salt));
assert_eq!(
base64::encode(secret.stored_key.as_ref()),
"lF4cRm/Jky763CN4HtxdHnjV4Q8AWTNlKvGmEFFU8IQ="
);
assert_eq!(
base64::encode(secret.server_key.as_ref()),
"ub8OgRsftnk2ccDMOt7ffHXNcikRkQkq1lh4xaAqrSw="
);
}
}

View File

@@ -88,10 +88,7 @@ pub async fn task_main(
return Ok(());
}
};
let mut tls_server_config = rustls::ServerConfig::clone(&tls_config.to_server_config());
// prefer http2, but support http/1.1
tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()];
let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into();
let tls_acceptor: tokio_rustls::TlsAcceptor = tls_config.to_server_config().into();
let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?;
let _ = addr_incoming.set_nodelay(true);

View File

@@ -12,7 +12,7 @@ use hyper::StatusCode;
use hyper::{Body, HeaderMap, Request};
use serde_json::json;
use serde_json::Value;
use tokio::try_join;
use tokio::join;
use tokio_postgres::error::DbError;
use tokio_postgres::error::ErrorPosition;
use tokio_postgres::GenericClient;
@@ -32,9 +32,11 @@ use crate::auth::ComputeUserInfoParseError;
use crate::config::ProxyConfig;
use crate::config::TlsConfig;
use crate::context::RequestMonitoring;
use crate::error::ReportableError;
use crate::metrics::HTTP_CONTENT_LENGTH;
use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
use crate::proxy::NeonOptions;
use crate::serverless::backend::HttpConnError;
use crate::DbName;
use crate::RoleName;
@@ -164,12 +166,9 @@ fn get_conn_info(
let mut options = Option::None;
for (key, value) in pairs {
match &*key {
"options" => {
options = Some(NeonOptions::parse_options_raw(&value));
}
"application_name" => ctx.set_application(Some(value.into())),
_ => {}
if key == "options" {
options = Some(NeonOptions::parse_options_raw(&value));
break;
}
}
@@ -285,10 +284,8 @@ pub async fn handle(
)?
}
},
Err(_) => {
// TODO: when http error classification is done, distinguish between
// timeout on sql vs timeout in proxy/cplane
// ctx.set_error_kind(crate::error::ErrorKind::RateLimit);
Err(e) => {
ctx.set_error_kind(e.get_error_kind());
let message = format!(
"HTTP-Connection timed out, execution time exeeded {} seconds",
@@ -402,11 +399,16 @@ async fn handle_inner(
// not strictly necessary to mark success here,
// but it's just insurance for if we forget it somewhere else
ctx.latency_timer.success();
Ok::<_, anyhow::Error>(client)
Ok::<_, HttpConnError>(client)
};
// Run both operations in parallel
let (payload, mut client) = try_join!(fetch_and_process_request, authenticate_and_connect)?;
let (payload_result, auth_and_connect_result) =
join!(fetch_and_process_request, authenticate_and_connect,);
// Handle the results
let payload = payload_result?; // Handle errors appropriately
let mut client = auth_and_connect_result?; // Handle errors appropriately
let mut response = Response::builder()
.status(StatusCode::OK)

View File

@@ -38,22 +38,17 @@ pytest-rerunfailures = "^13.0"
types-pytest-lazy-fixture = "^0.6.3.3"
pytest-split = "^0.8.1"
zstandard = "^0.21.0"
httpx = {extras = ["http2"], version = "^0.26.0"}
pytest-repeat = "^0.9.3"
[tool.poetry.group.dev.dependencies]
mypy = "==1.3.0"
ruff = "^0.2.2"
ruff = "^0.1.11"
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
[tool.mypy]
exclude = [
"^vendor/",
"^target/",
]
exclude = "^vendor/"
check_untyped_defs = true
# Help mypy find imports when running against list of individual files.
# Without this line it would behave differently when executed on the entire project.
@@ -77,13 +72,7 @@ ignore_missing_imports = true
[tool.ruff]
target-version = "py39"
extend-exclude = [
"vendor/",
"target/",
]
line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter
[tool.ruff.lint]
extend-exclude = ["vendor/"]
ignore = [
"E501", # Line too long, we don't want to be too strict about it
]
@@ -94,3 +83,4 @@ select = [
"W", # pycodestyle
"B", # bugbear
]
line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter

View File

@@ -12,12 +12,8 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
}
Ok(())
}
(Scope::PageServerApi | Scope::GenerationsApi, _) => Err(AuthError(
format!(
"JWT scope '{:?}' is ineligible for Safekeeper auth",
claims.scope
)
.into(),
(Scope::PageServerApi, _) => Err(AuthError(
"PageServerApi scope makes no sense for Safekeeper".into(),
)),
(Scope::SafekeeperData, _) => Ok(()),
}

View File

@@ -695,11 +695,9 @@ impl Collector for TimelineCollector {
// report total number of timelines
self.timelines_count.set(timelines_count as i64);
mfs.extend(self.timelines_count.collect());
self.active_timelines_count
.set(active_timelines_count as i64);
mfs.extend(self.active_timelines_count.collect());
mfs.extend(self.timelines_count.collect());
mfs
}

View File

@@ -188,7 +188,7 @@ const reportSummary = async (params) => {
}
const parseCoverageSummary = async ({ summaryJsonUrl, coverageUrl, fetch }) => {
let summary = `\n### Code coverage* ([full report](${coverageUrl}))\n`
let summary = `\n### Code coverage ([full report](${coverageUrl}))\n`
const coverage = await (await fetch(summaryJsonUrl)).json()
for (const covType of Object.keys(coverage).sort()) {
@@ -198,7 +198,7 @@ const parseCoverageSummary = async ({ summaryJsonUrl, coverageUrl, fetch }) => {
summary += `- \`${covType}s\`: \`${coverage[covType]["_summary"]}\`\n`
}
summary += "\n\\* collected from Rust tests only\n"
summary += `\n___\n`
return summary

View File

@@ -4,8 +4,6 @@ from typing import Dict, List, Optional, Tuple
from prometheus_client.parser import text_string_to_metric_families
from prometheus_client.samples import Sample
from fixtures.log_helper import log
class Metrics:
metrics: Dict[str, List[Sample]]
@@ -33,60 +31,6 @@ class Metrics:
return res[0]
class MetricsGetter:
"""
Mixin for types that implement a `get_metrics` function and would like associated
helpers for querying the metrics
"""
def get_metrics(self) -> Metrics:
raise NotImplementedError()
def get_metric_value(
self, name: str, filter: Optional[Dict[str, str]] = None
) -> Optional[float]:
metrics = self.get_metrics()
results = metrics.query_all(name, filter=filter)
if not results:
log.info(f'could not find metric "{name}"')
return None
assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
return results[0].value
def get_metrics_values(
self, names: list[str], filter: Optional[Dict[str, str]] = None, absence_ok=False
) -> Dict[str, float]:
"""
When fetching multiple named metrics, it is more efficient to use this
than to call `get_metric_value` repeatedly.
Throws RuntimeError if no metrics matching `names` are found, or if
not all of `names` are found: this method is intended for loading sets
of metrics whose existence is coupled.
If it's expected that there may be no results for some of the metrics,
specify `absence_ok=True`. The returned dict will then not contain values
for these metrics.
"""
metrics = self.get_metrics()
samples = []
for name in names:
samples.extend(metrics.query_all(name, filter=filter))
result = {}
for sample in samples:
if sample.name in result:
raise RuntimeError(f"Multiple values found for {sample.name}")
result[sample.name] = sample.value
if not absence_ok:
if len(result) != len(names):
log.info(f"Metrics found: {metrics.metrics}")
raise RuntimeError(f"could not find all metrics {' '.join(names)}")
return result
def parse_metrics(text: str, name: str = "") -> Metrics:
metrics = Metrics(name)
gen = text_string_to_metric_families(text)
@@ -103,8 +47,7 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]:
PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
"pageserver_remote_timeline_client_calls_started_total",
"pageserver_remote_timeline_client_calls_finished_total",
"pageserver_remote_timeline_client_calls_unfinished",
"pageserver_remote_physical_size",
"pageserver_remote_timeline_client_bytes_started_total",
"pageserver_remote_timeline_client_bytes_finished_total",
@@ -133,6 +76,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
*histogram("pageserver_getpage_get_reconstruct_data_seconds"),
*histogram("pageserver_wait_lsn_seconds"),
*histogram("pageserver_remote_operation_seconds"),
*histogram("pageserver_remote_timeline_client_calls_started"),
*histogram("pageserver_io_operations_seconds"),
"pageserver_tenant_states_count",
)

View File

@@ -27,7 +27,6 @@ from urllib.parse import quote, urlparse
import asyncpg
import backoff
import httpx
import jwt
import psycopg2
import pytest
@@ -47,7 +46,6 @@ from urllib3.util.retry import Retry
from fixtures import overlayfs
from fixtures.broker import NeonBroker
from fixtures.log_helper import log
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
from fixtures.pageserver.allowed_errors import (
DEFAULT_PAGESERVER_ALLOWED_ERRORS,
scan_pageserver_log_for_errors,
@@ -488,11 +486,6 @@ class NeonEnvBuilder:
self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine
self.pageserver_get_vectored_impl: Optional[str] = None
if os.getenv("PAGESERVER_GET_VECTORED_IMPL", "") == "vectored":
self.pageserver_get_vectored_impl = "vectored"
log.debug('Overriding pageserver get_vectored_impl config to "vectored"')
assert test_name.startswith(
"test_"
), "Unexpectedly instantiated from outside a test function"
@@ -1060,8 +1053,6 @@ class NeonEnv:
}
if self.pageserver_virtual_file_io_engine is not None:
ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine
if config.pageserver_get_vectored_impl is not None:
ps_cfg["get_vectored_impl"] = config.pageserver_get_vectored_impl
# Create a corresponding NeonPageserver object
self.pageservers.append(
@@ -1922,7 +1913,7 @@ class Pagectl(AbstractNeonCli):
return IndexPartDump.from_json(parsed)
class NeonAttachmentService(MetricsGetter):
class NeonAttachmentService:
def __init__(self, env: NeonEnv, auth_enabled: bool):
self.env = env
self.running = False
@@ -1960,11 +1951,6 @@ class NeonAttachmentService(MetricsGetter):
return headers
def get_metrics(self) -> Metrics:
res = self.request("GET", f"{self.env.attachment_service_api}/metrics")
res.raise_for_status()
return parse_metrics(res.text)
def ready(self) -> bool:
resp = self.request("GET", f"{self.env.attachment_service_api}/ready")
if resp.status_code == 503:
@@ -2108,17 +2094,6 @@ class NeonAttachmentService(MetricsGetter):
log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}")
assert self.env.get_tenant_pageserver(tenant_shard_id).id == dest_ps_id
def consistency_check(self):
"""
Throw an exception if the service finds any inconsistencies in its state
"""
response = self.request(
"POST",
f"{self.env.attachment_service_api}/debug/v1/consistency_check",
)
response.raise_for_status()
log.info("Attachment service passed consistency check")
def __enter__(self) -> "NeonAttachmentService":
return self
@@ -2864,34 +2839,9 @@ class NeonProxy(PgProtocol):
)
if expected_code is not None:
assert response.status_code == expected_code, f"response: {response.json()}"
assert response.status_code == kwargs["expected_code"], f"response: {response.json()}"
return response.json()
async def http2_query(self, query, args, **kwargs):
# TODO maybe use default values if not provided
user = kwargs["user"]
password = kwargs["password"]
expected_code = kwargs.get("expected_code")
connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres"
async with httpx.AsyncClient(
http2=True, verify=str(self.test_output_dir / "proxy.crt")
) as client:
response = await client.post(
f"https://{self.domain}:{self.external_http_port}/sql",
json={"query": query, "params": args},
headers={
"Content-Type": "application/sql",
"Neon-Connection-String": connstr,
"Neon-Pool-Opt-In": "true",
},
)
assert response.http_version == "HTTP/2"
if expected_code is not None:
assert response.status_code == expected_code, f"response: {response.json()}"
return response.json()
def get_metrics(self) -> str:
request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics")
request_result.raise_for_status()

View File

@@ -82,11 +82,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
# During shutdown, DownloadError::Cancelled may be logged as an error. Cleaning this
# up is tracked in https://github.com/neondatabase/neon/issues/6096
".*Cancelled, shutting down.*",
# Open layers are only rolled at Lsn boundaries to avoid name clashses.
# Hence, we can overshoot the soft limit set by checkpoint distance.
# This is especially pronounced in tests that set small checkpoint
# distances.
".*Flushed oversized open layer with size.*",
)

View File

@@ -12,7 +12,7 @@ from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from fixtures.log_helper import log
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
from fixtures.metrics import Metrics, parse_metrics
from fixtures.pg_version import PgVersion
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.utils import Fn
@@ -125,7 +125,7 @@ class TenantConfig:
)
class PageserverHttpClient(requests.Session, MetricsGetter):
class PageserverHttpClient(requests.Session):
def __init__(
self,
port: int,
@@ -694,33 +694,71 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
},
).value
def get_remote_timeline_client_queue_count(
def get_remote_timeline_client_metric(
self,
metric_name: str,
tenant_id: TenantId,
timeline_id: TimelineId,
file_kind: str,
op_kind: str,
) -> Optional[int]:
metrics = [
"pageserver_remote_timeline_client_calls_started_total",
"pageserver_remote_timeline_client_calls_finished_total",
]
res = self.get_metrics_values(
metrics,
) -> Optional[float]:
metrics = self.get_metrics()
matches = metrics.query_all(
name=metric_name,
filter={
"tenant_id": str(tenant_id),
"timeline_id": str(timeline_id),
"file_kind": str(file_kind),
"op_kind": str(op_kind),
},
absence_ok=True,
)
if len(res) != 2:
if len(matches) == 0:
value = None
elif len(matches) == 1:
value = matches[0].value
assert value is not None
else:
assert len(matches) < 2, "above filter should uniquely identify metric"
return value
def get_metric_value(
self, name: str, filter: Optional[Dict[str, str]] = None
) -> Optional[float]:
metrics = self.get_metrics()
results = metrics.query_all(name, filter=filter)
if not results:
log.info(f'could not find metric "{name}"')
return None
inc, dec = [res[metric] for metric in metrics]
queue_count = int(inc) - int(dec)
assert queue_count >= 0
return queue_count
assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
return results[0].value
def get_metrics_values(
self, names: list[str], filter: Optional[Dict[str, str]] = None
) -> Dict[str, float]:
"""
When fetching multiple named metrics, it is more efficient to use this
than to call `get_metric_value` repeatedly.
Throws RuntimeError if no metrics matching `names` are found, or if
not all of `names` are found: this method is intended for loading sets
of metrics whose existence is coupled.
"""
metrics = self.get_metrics()
samples = []
for name in names:
samples.extend(metrics.query_all(name, filter=filter))
result = {}
for sample in samples:
if sample.name in result:
raise RuntimeError(f"Multiple values found for {sample.name}")
result[sample.name] = sample.value
if len(result) != len(names):
log.info(f"Metrics found: {metrics.metrics}")
raise RuntimeError(f"could not find all metrics {' '.join(names)}")
return result
def layer_map_info(
self,

View File

@@ -1,8 +1,7 @@
import time
from typing import Any, Dict, List, Optional, Tuple, Union
from typing import Any, Dict, List, Optional, Union
from mypy_boto3_s3.type_defs import (
DeleteObjectOutputTypeDef,
EmptyResponseMetadataTypeDef,
ListObjectsV2OutputTypeDef,
ObjectTypeDef,
@@ -221,40 +220,16 @@ def wait_for_upload_queue_empty(
):
while True:
all_metrics = pageserver_http.get_metrics()
started = all_metrics.query_all(
"pageserver_remote_timeline_client_calls_started_total",
tl = all_metrics.query_all(
"pageserver_remote_timeline_client_calls_unfinished",
{
"tenant_id": str(tenant_id),
"timeline_id": str(timeline_id),
},
)
finished = all_metrics.query_all(
"pageserver_remote_timeline_client_calls_finished_total",
{
"tenant_id": str(tenant_id),
"timeline_id": str(timeline_id),
},
)
assert len(started) == len(finished)
# this is `started left join finished`; if match, subtracting start from finished, resulting in queue depth
remaining_labels = ["shard_id", "file_kind", "op_kind"]
tl: List[Tuple[Any, float]] = []
for s in started:
found = False
for f in finished:
if all([s.labels[label] == f.labels[label] for label in remaining_labels]):
assert (
not found
), "duplicate match, remaining_labels don't uniquely identify sample"
tl.append((s.labels, int(s.value) - int(f.value)))
found = True
if not found:
tl.append((s.labels, int(s.value)))
assert len(tl) == len(started), "something broken with join logic"
log.info(f"upload queue for {tenant_id}/{timeline_id}:")
for labels, queue_count in tl:
log.info(f" {labels}: {queue_count}")
if all(queue_count == 0 for (_, queue_count) in tl):
assert len(tl) > 0
log.info(f"upload queue for {tenant_id}/{timeline_id}: {tl}")
if all(m.value == 0 for m in tl):
return
time.sleep(0.2)
@@ -356,6 +331,7 @@ def list_prefix(
"""
# For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api.
assert isinstance(remote, S3Storage), "localfs is currently not supported"
assert remote.client is not None
prefix_in_bucket = remote.prefix_in_bucket or ""
if not prefix:
@@ -374,29 +350,6 @@ def list_prefix(
return response
def remote_storage_delete_key(
remote: RemoteStorage,
key: str,
) -> DeleteObjectOutputTypeDef:
"""
Note that this function takes into account prefix_in_bucket.
"""
# For local_fs we need to use a different implementation. As we don't need local_fs, just don't support it for now.
assert isinstance(remote, S3Storage), "localfs is currently not supported"
prefix_in_bucket = remote.prefix_in_bucket or ""
# real s3 tests have uniqie per test prefix
# mock_s3 tests use special pageserver prefix for pageserver stuff
key = "/".join((prefix_in_bucket, key))
response = remote.client.delete_object(
Bucket=remote.bucket_name,
Key=key,
)
return response
def enable_remote_storage_versioning(
remote: RemoteStorage,
) -> EmptyResponseMetadataTypeDef:
@@ -405,6 +358,7 @@ def enable_remote_storage_versioning(
"""
# local_fs has no support for versioning
assert isinstance(remote, S3Storage), "localfs is currently not supported"
assert remote.client is not None
# The SDK supports enabling versioning on normal S3 as well but we don't want to change
# these settings from a test in a live bucket (also, our access isn't enough nor should it be)

Some files were not shown because too many files have changed in this diff Show More