mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-03 21:40:39 +00:00
Compare commits
15 Commits
release-pr
...
release-52
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4e5724d9c3 | ||
|
|
0d3e499059 | ||
|
|
7b860b837c | ||
|
|
41fc96e20f | ||
|
|
fb2b1ce57b | ||
|
|
464717451b | ||
|
|
c6ed86d3d0 | ||
|
|
f0a9017008 | ||
|
|
bb7949ba00 | ||
|
|
1df0f69664 | ||
|
|
970066a914 | ||
|
|
1ebd3897c0 | ||
|
|
6460beffcd | ||
|
|
6f7f8958db | ||
|
|
936a00e077 |
58
.github/workflows/benchmarking.yml
vendored
58
.github/workflows/benchmarking.yml
vendored
@@ -147,16 +147,15 @@ jobs:
|
||||
"neonvm-captest-new"
|
||||
],
|
||||
"db_size": [ "10gb" ],
|
||||
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
|
||||
{ "platform": "neon-captest-new", "db_size": "50gb" },
|
||||
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" },
|
||||
{ "platform": "neonvm-captest-new", "db_size": "50gb" },
|
||||
{ "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
|
||||
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
|
||||
{ "platform": "neon-captest-new", "db_size": "50gb" },
|
||||
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" },
|
||||
{ "platform": "neonvm-captest-new", "db_size": "50gb" }]
|
||||
}'
|
||||
|
||||
if [ "$(date +%A)" = "Saturday" ]; then
|
||||
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
|
||||
{ "platform": "rds-aurora", "db_size": "50gb"}]')
|
||||
{ "platform": "rds-aurora", "db_size": "50gb"}]')
|
||||
fi
|
||||
|
||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||
@@ -172,7 +171,7 @@ jobs:
|
||||
|
||||
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
|
||||
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
|
||||
{ "platform": "rds-aurora" }]')
|
||||
{ "platform": "rds-aurora" }]')
|
||||
fi
|
||||
|
||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||
@@ -191,7 +190,7 @@ jobs:
|
||||
|
||||
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
|
||||
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
|
||||
{ "platform": "rds-aurora", "scale": "10" }]')
|
||||
{ "platform": "rds-aurora", "scale": "10" }]')
|
||||
fi
|
||||
|
||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||
@@ -254,9 +253,6 @@ jobs:
|
||||
neon-captest-reuse)
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
|
||||
;;
|
||||
neonvm-captest-sharding-reuse)
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
|
||||
;;
|
||||
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
|
||||
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
|
||||
;;
|
||||
@@ -274,15 +270,11 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERIES=("SELECT version()")
|
||||
QUERY="SELECT version();"
|
||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||
QUERIES+=("SHOW neon.tenant_id")
|
||||
QUERIES+=("SHOW neon.timeline_id")
|
||||
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
||||
fi
|
||||
|
||||
for q in "${QUERIES[@]}"; do
|
||||
psql ${CONNSTR} -c "${q}"
|
||||
done
|
||||
psql ${CONNSTR} -c "${QUERY}"
|
||||
|
||||
- name: Benchmark init
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -409,15 +401,11 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERIES=("SELECT version()")
|
||||
QUERY="SELECT version();"
|
||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||
QUERIES+=("SHOW neon.tenant_id")
|
||||
QUERIES+=("SHOW neon.timeline_id")
|
||||
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
||||
fi
|
||||
|
||||
for q in "${QUERIES[@]}"; do
|
||||
psql ${CONNSTR} -c "${q}"
|
||||
done
|
||||
psql ${CONNSTR} -c "${QUERY}"
|
||||
|
||||
- name: ClickBench benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -519,15 +507,11 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERIES=("SELECT version()")
|
||||
QUERY="SELECT version();"
|
||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||
QUERIES+=("SHOW neon.tenant_id")
|
||||
QUERIES+=("SHOW neon.timeline_id")
|
||||
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
||||
fi
|
||||
|
||||
for q in "${QUERIES[@]}"; do
|
||||
psql ${CONNSTR} -c "${q}"
|
||||
done
|
||||
psql ${CONNSTR} -c "${QUERY}"
|
||||
|
||||
- name: Run TPC-H benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -613,15 +597,11 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERIES=("SELECT version()")
|
||||
QUERY="SELECT version();"
|
||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||
QUERIES+=("SHOW neon.tenant_id")
|
||||
QUERIES+=("SHOW neon.timeline_id")
|
||||
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
||||
fi
|
||||
|
||||
for q in "${QUERIES[@]}"; do
|
||||
psql ${CONNSTR} -c "${q}"
|
||||
done
|
||||
psql ${CONNSTR} -c "${QUERY}"
|
||||
|
||||
- name: Run user examples
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
|
||||
3
.github/workflows/build_and_test.yml
vendored
3
.github/workflows/build_and_test.yml
vendored
@@ -1127,7 +1127,6 @@ jobs:
|
||||
-f deployProxy=false \
|
||||
-f deployStorage=true \
|
||||
-f deployStorageBroker=true \
|
||||
-f deployStorageController=true \
|
||||
-f branch=main \
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
||||
-f deployPreprodRegion=true
|
||||
@@ -1137,7 +1136,6 @@ jobs:
|
||||
-f deployProxy=false \
|
||||
-f deployStorage=true \
|
||||
-f deployStorageBroker=true \
|
||||
-f deployStorageController=true \
|
||||
-f branch=main \
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||
@@ -1146,7 +1144,6 @@ jobs:
|
||||
-f deployProxy=true \
|
||||
-f deployStorage=false \
|
||||
-f deployStorageBroker=false \
|
||||
-f deployStorageController=false \
|
||||
-f branch=main \
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
||||
-f deployPreprodRegion=true
|
||||
|
||||
90
.github/workflows/trigger-e2e-tests.yml
vendored
90
.github/workflows/trigger-e2e-tests.yml
vendored
@@ -62,14 +62,14 @@ jobs:
|
||||
|
||||
trigger-e2e-tests:
|
||||
needs: [ tag ]
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
env:
|
||||
TAG: ${{ needs.tag.outputs.build-tag }}
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||
options: --init
|
||||
steps:
|
||||
- name: check if ecr image are present
|
||||
env:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
run: |
|
||||
for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
|
||||
OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
|
||||
@@ -79,55 +79,41 @@ jobs:
|
||||
fi
|
||||
done
|
||||
|
||||
- name: Set e2e-platforms
|
||||
id: e2e-platforms
|
||||
env:
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
# Default set of platforms to run e2e tests on
|
||||
platforms='["docker", "k8s"]'
|
||||
|
||||
# If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
|
||||
# If the workflow run is not a pull request, add k8s-neonvm to the list.
|
||||
if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
|
||||
for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
|
||||
case "$f" in
|
||||
vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
|
||||
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
|
||||
;;
|
||||
*)
|
||||
# no-op
|
||||
;;
|
||||
esac
|
||||
done
|
||||
else
|
||||
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
|
||||
fi
|
||||
|
||||
echo "e2e-platforms=${platforms}" | tee -a $GITHUB_OUTPUT
|
||||
|
||||
- name: Set PR's status to pending and request a remote CI test
|
||||
env:
|
||||
E2E_PLATFORMS: ${{ steps.e2e-platforms.outputs.e2e-platforms }}
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
REMOTE_REPO="${GITHUB_REPOSITORY_OWNER}/cloud"
|
||||
# For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
|
||||
# but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
|
||||
# to place a job run status update later.
|
||||
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
|
||||
# For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
|
||||
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
|
||||
|
||||
gh api "/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \
|
||||
--method POST \
|
||||
--raw-field "state=pending" \
|
||||
--raw-field "description=[$REMOTE_REPO] Remote CI job is about to start" \
|
||||
--raw-field "context=neon-cloud-e2e"
|
||||
REMOTE_REPO="${{ github.repository_owner }}/cloud"
|
||||
|
||||
gh workflow --repo ${REMOTE_REPO} \
|
||||
run testing.yml \
|
||||
--ref "main" \
|
||||
--raw-field "ci_job_name=neon-cloud-e2e" \
|
||||
--raw-field "commit_hash=$COMMIT_SHA" \
|
||||
--raw-field "remote_repo=${GITHUB_REPOSITORY}" \
|
||||
--raw-field "storage_image_tag=${TAG}" \
|
||||
--raw-field "compute_image_tag=${TAG}" \
|
||||
--raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \
|
||||
--raw-field "e2e-platforms=${E2E_PLATFORMS}"
|
||||
curl -f -X POST \
|
||||
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
|
||||
-H "Accept: application/vnd.github.v3+json" \
|
||||
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
|
||||
--data \
|
||||
"{
|
||||
\"state\": \"pending\",
|
||||
\"context\": \"neon-cloud-e2e\",
|
||||
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
|
||||
}"
|
||||
|
||||
curl -f -X POST \
|
||||
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
|
||||
-H "Accept: application/vnd.github.v3+json" \
|
||||
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
|
||||
--data \
|
||||
"{
|
||||
\"ref\": \"main\",
|
||||
\"inputs\": {
|
||||
\"ci_job_name\": \"neon-cloud-e2e\",
|
||||
\"commit_hash\": \"$COMMIT_SHA\",
|
||||
\"remote_repo\": \"${{ github.repository }}\",
|
||||
\"storage_image_tag\": \"${TAG}\",
|
||||
\"compute_image_tag\": \"${TAG}\",
|
||||
\"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
|
||||
}
|
||||
}"
|
||||
|
||||
35
Cargo.lock
generated
35
Cargo.lock
generated
@@ -288,7 +288,6 @@ dependencies = [
|
||||
"hex",
|
||||
"humantime",
|
||||
"hyper",
|
||||
"itertools",
|
||||
"lasso",
|
||||
"measured",
|
||||
"metrics",
|
||||
@@ -2235,9 +2234,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "h2"
|
||||
version = "0.3.26"
|
||||
version = "0.3.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8"
|
||||
checksum = "bb2c4422095b67ee78da96fbb51a4cc413b3b25883c7717ff7ca1ab31022c9c9"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fnv",
|
||||
@@ -3436,9 +3435,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "ordered-multimap"
|
||||
version = "0.7.3"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79"
|
||||
checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f"
|
||||
dependencies = [
|
||||
"dlv-list",
|
||||
"hashbrown 0.14.0",
|
||||
@@ -3582,7 +3581,6 @@ dependencies = [
|
||||
"strum_macros",
|
||||
"svg_fmt",
|
||||
"sync_wrapper",
|
||||
"sysinfo",
|
||||
"tenant_size_model",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
@@ -4200,7 +4198,6 @@ name = "proxy"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-compression",
|
||||
"async-trait",
|
||||
"aws-config",
|
||||
"aws-sdk-iam",
|
||||
@@ -5623,26 +5620,6 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "storcon_cli"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
"comfy-table",
|
||||
"hyper",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "stringprep"
|
||||
version = "0.1.2"
|
||||
@@ -5956,9 +5933,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
|
||||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.37.0"
|
||||
version = "1.36.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787"
|
||||
checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931"
|
||||
dependencies = [
|
||||
"backtrace",
|
||||
"bytes",
|
||||
|
||||
@@ -4,7 +4,6 @@ members = [
|
||||
"compute_tools",
|
||||
"control_plane",
|
||||
"control_plane/attachment_service",
|
||||
"control_plane/storcon_cli",
|
||||
"pageserver",
|
||||
"pageserver/compaction",
|
||||
"pageserver/ctl",
|
||||
|
||||
@@ -944,9 +944,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
||||
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
|
||||
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
|
||||
|
||||
# Create remote extension download directory
|
||||
RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
|
||||
|
||||
# Install:
|
||||
# libreadline8 for psql
|
||||
# libicu67, locales for collations (including ICU and plpgsql_check)
|
||||
|
||||
@@ -1262,12 +1262,10 @@ LIMIT 100",
|
||||
.await
|
||||
.map_err(DownloadError::Other);
|
||||
|
||||
if download_size.is_ok() {
|
||||
self.ext_download_progress
|
||||
.write()
|
||||
.expect("bad lock")
|
||||
.insert(ext_archive_name.to_string(), (download_start, true));
|
||||
}
|
||||
self.ext_download_progress
|
||||
.write()
|
||||
.expect("bad lock")
|
||||
.insert(ext_archive_name.to_string(), (download_start, true));
|
||||
|
||||
download_size
|
||||
}
|
||||
|
||||
@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
RoleAction::Create => {
|
||||
// This branch only runs when roles are created through the console, so it is
|
||||
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
|
||||
// from neon_superuser.
|
||||
// from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
|
||||
let mut query: String = format!(
|
||||
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
|
||||
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
|
||||
name.pg_quote()
|
||||
);
|
||||
info!("running role create query: '{}'", &query);
|
||||
@@ -743,24 +743,21 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
|
||||
// which may happen in two cases:
|
||||
// - extension was just installed
|
||||
// - extension was already installed and is up to date
|
||||
let query = "ALTER EXTENSION neon UPDATE";
|
||||
info!("update neon extension version with query: {}", query);
|
||||
if let Err(e) = client.simple_query(query) {
|
||||
error!(
|
||||
"failed to upgrade neon extension during `handle_extension_neon`: {}",
|
||||
e
|
||||
);
|
||||
}
|
||||
// DISABLED due to compute node unpinning epic
|
||||
// let query = "ALTER EXTENSION neon UPDATE";
|
||||
// info!("update neon extension version with query: {}", query);
|
||||
// client.simple_query(query)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
|
||||
info!("handle neon extension upgrade");
|
||||
let query = "ALTER EXTENSION neon UPDATE";
|
||||
info!("update neon extension version with query: {}", query);
|
||||
client.simple_query(query)?;
|
||||
pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> {
|
||||
info!("handle neon extension upgrade (not really)");
|
||||
// DISABLED due to compute node unpinning epic
|
||||
// let query = "ALTER EXTENSION neon UPDATE";
|
||||
// info!("update neon extension version with query: {}", query);
|
||||
// client.simple_query(query)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -809,8 +806,19 @@ $$;"#,
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
// Add new migrations below.
|
||||
r#"
|
||||
DO $$
|
||||
DECLARE
|
||||
role_name TEXT;
|
||||
BEGIN
|
||||
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
|
||||
LOOP
|
||||
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
|
||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
|
||||
END LOOP;
|
||||
END
|
||||
$$;"#,
|
||||
];
|
||||
|
||||
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||
|
||||
@@ -25,7 +25,6 @@ git-version.workspace = true
|
||||
hex.workspace = true
|
||||
hyper.workspace = true
|
||||
humantime.workspace = true
|
||||
itertools.workspace = true
|
||||
lasso.workspace = true
|
||||
once_cell.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
|
||||
@@ -1,3 +0,0 @@
|
||||
-- This file should undo anything in `up.sql`
|
||||
|
||||
ALTER TABLE tenant_shards drop scheduling_policy;
|
||||
@@ -1,2 +0,0 @@
|
||||
|
||||
ALTER TABLE tenant_shards add scheduling_policy VARCHAR NOT NULL DEFAULT '"Active"';
|
||||
@@ -14,6 +14,7 @@ use utils::{
|
||||
|
||||
use crate::service::Config;
|
||||
|
||||
const BUSY_DELAY: Duration = Duration::from_secs(1);
|
||||
const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);
|
||||
|
||||
pub(crate) const API_CONCURRENCY: usize = 32;
|
||||
@@ -279,10 +280,11 @@ impl ComputeHook {
|
||||
Err(NotifyError::SlowDown)
|
||||
}
|
||||
StatusCode::LOCKED => {
|
||||
// We consider this fatal, because it's possible that the operation blocking the control one is
|
||||
// also the one that is waiting for this reconcile. We should let the reconciler calling
|
||||
// this hook fail, to give control plane a chance to un-lock.
|
||||
tracing::info!("Control plane reports tenant is locked, dropping out of notify");
|
||||
// Delay our retry if busy: the usual fast exponential backoff in backoff::retry
|
||||
// is not appropriate
|
||||
tokio::time::timeout(BUSY_DELAY, cancel.cancelled())
|
||||
.await
|
||||
.ok();
|
||||
Err(NotifyError::Busy)
|
||||
}
|
||||
StatusCode::SERVICE_UNAVAILABLE
|
||||
@@ -304,12 +306,7 @@ impl ComputeHook {
|
||||
let client = reqwest::Client::new();
|
||||
backoff::retry(
|
||||
|| self.do_notify_iteration(&client, url, &reconfigure_request, cancel),
|
||||
|e| {
|
||||
matches!(
|
||||
e,
|
||||
NotifyError::Fatal(_) | NotifyError::Unexpected(_) | NotifyError::Busy
|
||||
)
|
||||
},
|
||||
|e| matches!(e, NotifyError::Fatal(_) | NotifyError::Unexpected(_)),
|
||||
3,
|
||||
10,
|
||||
"Send compute notification",
|
||||
|
||||
@@ -34,8 +34,7 @@ use utils::{
|
||||
};
|
||||
|
||||
use pageserver_api::controller_api::{
|
||||
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest,
|
||||
TenantShardMigrateRequest,
|
||||
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
|
||||
};
|
||||
use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
|
||||
|
||||
@@ -399,15 +398,6 @@ async fn handle_tenant_describe(
|
||||
json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
|
||||
}
|
||||
|
||||
async fn handle_tenant_list(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
json_response(StatusCode::OK, service.tenant_list())
|
||||
}
|
||||
|
||||
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
@@ -421,10 +411,7 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
let nodes = state.service.node_list().await?;
|
||||
let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();
|
||||
|
||||
json_response(StatusCode::OK, api_nodes)
|
||||
json_response(StatusCode::OK, state.service.node_list().await?)
|
||||
}
|
||||
|
||||
async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
@@ -491,22 +478,6 @@ async fn handle_tenant_shard_migrate(
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
let update_req = json_request::<TenantPolicyRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state
|
||||
.service
|
||||
.tenant_update_policy(tenant_id, update_req)
|
||||
.await?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
@@ -538,14 +509,6 @@ async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>,
|
||||
json_response(StatusCode::OK, state.service.consistency_check().await?)
|
||||
}
|
||||
|
||||
async fn handle_reconcile_all(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let state = get_state(&req);
|
||||
|
||||
json_response(StatusCode::OK, state.service.reconcile_all_now().await?)
|
||||
}
|
||||
|
||||
/// Status endpoint is just used for checking that our HTTP listener is up
|
||||
async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
json_response(StatusCode::OK, ())
|
||||
@@ -763,9 +726,6 @@ pub fn make_router(
|
||||
RequestName("debug_v1_consistency_check"),
|
||||
)
|
||||
})
|
||||
.post("/debug/v1/reconcile_all", |r| {
|
||||
request_span(r, handle_reconcile_all)
|
||||
})
|
||||
.put("/debug/v1/failpoints", |r| {
|
||||
request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
|
||||
})
|
||||
@@ -805,16 +765,6 @@ pub fn make_router(
|
||||
RequestName("control_v1_tenant_describe"),
|
||||
)
|
||||
})
|
||||
.get("/control/v1/tenant", |r| {
|
||||
tenant_service_handler(r, handle_tenant_list, RequestName("control_v1_tenant_list"))
|
||||
})
|
||||
.put("/control/v1/tenant/:tenant_id/policy", |r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_tenant_update_policy,
|
||||
RequestName("control_v1_tenant_policy"),
|
||||
)
|
||||
})
|
||||
// Tenant operations
|
||||
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
|
||||
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
|
||||
|
||||
@@ -37,9 +37,6 @@ pub(crate) struct StorageControllerMetricGroup {
|
||||
pub(crate) storage_controller_reconcile_complete:
|
||||
measured::CounterVec<ReconcileCompleteLabelGroupSet>,
|
||||
|
||||
/// Count of how many times we make an optimization change to a tenant's scheduling
|
||||
pub(crate) storage_controller_schedule_optimization: measured::Counter,
|
||||
|
||||
/// HTTP request status counters for handled requests
|
||||
pub(crate) storage_controller_http_request_status:
|
||||
measured::CounterVec<HttpRequestStatusLabelGroupSet>,
|
||||
@@ -104,7 +101,6 @@ impl StorageControllerMetricGroup {
|
||||
status: StaticLabelSet::new(),
|
||||
},
|
||||
),
|
||||
storage_controller_schedule_optimization: measured::Counter::new(),
|
||||
storage_controller_http_request_status: measured::CounterVec::new(
|
||||
HttpRequestStatusLabelGroupSet {
|
||||
path: lasso::ThreadedRodeo::new(),
|
||||
|
||||
@@ -3,8 +3,7 @@ use std::{str::FromStr, time::Duration};
|
||||
use hyper::StatusCode;
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy,
|
||||
TenantLocateResponseShard,
|
||||
NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard,
|
||||
},
|
||||
shard::TenantShardId,
|
||||
};
|
||||
@@ -257,19 +256,6 @@ impl Node {
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Generate the simplified API-friendly description of a node's state
|
||||
pub(crate) fn describe(&self) -> NodeDescribeResponse {
|
||||
NodeDescribeResponse {
|
||||
id: self.id,
|
||||
availability: self.availability.into(),
|
||||
scheduling: self.scheduling,
|
||||
listen_http_addr: self.listen_http_addr.clone(),
|
||||
listen_http_port: self.listen_http_port,
|
||||
listen_pg_addr: self.listen_pg_addr.clone(),
|
||||
listen_pg_port: self.listen_pg_port,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Node {
|
||||
|
||||
@@ -9,7 +9,6 @@ use camino::Utf8PathBuf;
|
||||
use diesel::pg::PgConnection;
|
||||
use diesel::prelude::*;
|
||||
use diesel::Connection;
|
||||
use pageserver_api::controller_api::ShardSchedulingPolicy;
|
||||
use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
|
||||
use pageserver_api::models::TenantConfig;
|
||||
use pageserver_api::shard::ShardConfigError;
|
||||
@@ -108,12 +107,6 @@ pub(crate) enum AbortShardSplitStatus {
|
||||
|
||||
pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
|
||||
|
||||
/// Some methods can operate on either a whole tenant or a single shard
|
||||
pub(crate) enum TenantFilter {
|
||||
Tenant(TenantId),
|
||||
Shard(TenantShardId),
|
||||
}
|
||||
|
||||
impl Persistence {
|
||||
// The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under
|
||||
// normal circumstances. This assumes we have exclusive use of the database cluster to which we connect.
|
||||
@@ -147,7 +140,7 @@ impl Persistence {
|
||||
/// Wraps `with_conn` in order to collect latency and error metrics
|
||||
async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
|
||||
where
|
||||
F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
|
||||
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
|
||||
R: Send + 'static,
|
||||
{
|
||||
let latency = &METRICS_REGISTRY
|
||||
@@ -175,7 +168,7 @@ impl Persistence {
|
||||
/// Call the provided function in a tokio blocking thread, with a Diesel database connection.
|
||||
async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
|
||||
where
|
||||
F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
|
||||
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
|
||||
R: Send + 'static,
|
||||
{
|
||||
let mut conn = self.connection_pool.get()?;
|
||||
@@ -282,11 +275,6 @@ impl Persistence {
|
||||
// Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
|
||||
shard.placement_policy = "{\"Attached\":0}".to_string();
|
||||
}
|
||||
|
||||
if shard.scheduling_policy.is_empty() {
|
||||
shard.scheduling_policy =
|
||||
serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect();
|
||||
@@ -477,45 +465,59 @@ impl Persistence {
|
||||
/// that we only do the first time a tenant is set to an attached policy via /location_config.
|
||||
pub(crate) async fn update_tenant_shard(
|
||||
&self,
|
||||
tenant: TenantFilter,
|
||||
input_placement_policy: Option<PlacementPolicy>,
|
||||
input_config: Option<TenantConfig>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
input_placement_policy: PlacementPolicy,
|
||||
input_config: TenantConfig,
|
||||
input_generation: Option<Generation>,
|
||||
input_scheduling_policy: Option<ShardSchedulingPolicy>,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
|
||||
self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| {
|
||||
let query = match tenant {
|
||||
TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
|
||||
.into_boxed(),
|
||||
TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(input_tenant_id.to_string()))
|
||||
.into_boxed(),
|
||||
};
|
||||
let query = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));
|
||||
|
||||
#[derive(AsChangeset)]
|
||||
#[diesel(table_name = crate::schema::tenant_shards)]
|
||||
struct ShardUpdate {
|
||||
generation: Option<i32>,
|
||||
placement_policy: Option<String>,
|
||||
config: Option<String>,
|
||||
scheduling_policy: Option<String>,
|
||||
if let Some(input_generation) = input_generation {
|
||||
// Update includes generation column
|
||||
query
|
||||
.set((
|
||||
generation.eq(Some(input_generation.into().unwrap() as i32)),
|
||||
placement_policy
|
||||
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
|
||||
config.eq(serde_json::to_string(&input_config).unwrap()),
|
||||
))
|
||||
.execute(conn)?;
|
||||
} else {
|
||||
// Update does not include generation column
|
||||
query
|
||||
.set((
|
||||
placement_policy
|
||||
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
|
||||
config.eq(serde_json::to_string(&input_config).unwrap()),
|
||||
))
|
||||
.execute(conn)?;
|
||||
}
|
||||
|
||||
let update = ShardUpdate {
|
||||
generation: input_generation.map(|g| g.into().unwrap() as i32),
|
||||
placement_policy: input_placement_policy
|
||||
.map(|p| serde_json::to_string(&p).unwrap()),
|
||||
config: input_config.map(|c| serde_json::to_string(&c).unwrap()),
|
||||
scheduling_policy: input_scheduling_policy
|
||||
.map(|p| serde_json::to_string(&p).unwrap()),
|
||||
};
|
||||
Ok(())
|
||||
})
|
||||
.await?;
|
||||
|
||||
query.set(update).execute(conn)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn update_tenant_config(
|
||||
&self,
|
||||
input_tenant_id: TenantId,
|
||||
input_config: TenantConfig,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
|
||||
self.with_measured_conn(DatabaseOperation::UpdateTenantConfig, move |conn| {
|
||||
diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(input_tenant_id.to_string()))
|
||||
.set((config.eq(serde_json::to_string(&input_config).unwrap()),))
|
||||
.execute(conn)?;
|
||||
|
||||
Ok(())
|
||||
})
|
||||
@@ -726,8 +728,6 @@ pub(crate) struct TenantShardPersistence {
|
||||
pub(crate) splitting: SplitState,
|
||||
#[serde(default)]
|
||||
pub(crate) config: String,
|
||||
#[serde(default)]
|
||||
pub(crate) scheduling_policy: String,
|
||||
}
|
||||
|
||||
impl TenantShardPersistence {
|
||||
|
||||
@@ -487,7 +487,6 @@ impl Reconciler {
|
||||
while let Err(e) = self.compute_notify().await {
|
||||
match e {
|
||||
NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)),
|
||||
NotifyError::ShuttingDown => return Err(ReconcileError::Cancel),
|
||||
_ => {
|
||||
tracing::warn!(
|
||||
"Live migration blocked by compute notification error, retrying: {e}"
|
||||
|
||||
@@ -58,70 +58,6 @@ pub(crate) struct Scheduler {
|
||||
nodes: HashMap<NodeId, SchedulerNode>,
|
||||
}
|
||||
|
||||
/// Score for soft constraint scheduling: lower scores are preferred to higher scores.
|
||||
///
|
||||
/// For example, we may set an affinity score based on the number of shards from the same
|
||||
/// tenant already on a node, to implicitly prefer to balance out shards.
|
||||
#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
|
||||
pub(crate) struct AffinityScore(pub(crate) usize);
|
||||
|
||||
impl AffinityScore {
|
||||
/// If we have no anti-affinity at all toward a node, this is its score. It means
|
||||
/// the scheduler has a free choice amongst nodes with this score, and may pick a node
|
||||
/// based on other information such as total utilization.
|
||||
pub(crate) const FREE: Self = Self(0);
|
||||
|
||||
pub(crate) fn inc(&mut self) {
|
||||
self.0 += 1;
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Add for AffinityScore {
|
||||
type Output = Self;
|
||||
|
||||
fn add(self, rhs: Self) -> Self::Output {
|
||||
Self(self.0 + rhs.0)
|
||||
}
|
||||
}
|
||||
|
||||
// For carrying state between multiple calls to [`TenantState::schedule`], e.g. when calling
|
||||
// it for many shards in the same tenant.
|
||||
#[derive(Debug, Default)]
|
||||
pub(crate) struct ScheduleContext {
|
||||
/// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`]
|
||||
pub(crate) nodes: HashMap<NodeId, AffinityScore>,
|
||||
|
||||
/// Specifically how many _attached_ locations are on each node
|
||||
pub(crate) attached_nodes: HashMap<NodeId, usize>,
|
||||
}
|
||||
|
||||
impl ScheduleContext {
|
||||
/// Input is a list of nodes we would like to avoid using again within this context. The more
|
||||
/// times a node is passed into this call, the less inclined we are to use it.
|
||||
pub(crate) fn avoid(&mut self, nodes: &[NodeId]) {
|
||||
for node_id in nodes {
|
||||
let entry = self.nodes.entry(*node_id).or_insert(AffinityScore::FREE);
|
||||
entry.inc()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn push_attached(&mut self, node_id: NodeId) {
|
||||
let entry = self.attached_nodes.entry(node_id).or_default();
|
||||
*entry += 1;
|
||||
}
|
||||
|
||||
pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore {
|
||||
self.nodes
|
||||
.get(&node_id)
|
||||
.copied()
|
||||
.unwrap_or(AffinityScore::FREE)
|
||||
}
|
||||
|
||||
pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize {
|
||||
self.attached_nodes.get(&node_id).copied().unwrap_or(0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Scheduler {
|
||||
pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
|
||||
let mut scheduler_nodes = HashMap::new();
|
||||
@@ -288,47 +224,27 @@ impl Scheduler {
|
||||
node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
|
||||
}
|
||||
|
||||
/// hard_exclude: it is forbidden to use nodes in this list, typically because they
|
||||
/// are already in use by this shard -- we use this to avoid picking the same node
|
||||
/// as both attached and secondary location. This is a hard constraint: if we cannot
|
||||
/// find any nodes that aren't in this list, then we will return a [`ScheduleError::ImpossibleConstraint`].
|
||||
///
|
||||
/// context: we prefer to avoid using nodes identified in the context, according
|
||||
/// to their anti-affinity score. We use this to prefer to avoid placing shards in
|
||||
/// the same tenant on the same node. This is a soft constraint: the context will never
|
||||
/// cause us to fail to schedule a shard.
|
||||
pub(crate) fn schedule_shard(
|
||||
&self,
|
||||
hard_exclude: &[NodeId],
|
||||
context: &ScheduleContext,
|
||||
) -> Result<NodeId, ScheduleError> {
|
||||
pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result<NodeId, ScheduleError> {
|
||||
if self.nodes.is_empty() {
|
||||
return Err(ScheduleError::NoPageservers);
|
||||
}
|
||||
|
||||
let mut scores: Vec<(NodeId, AffinityScore, usize)> = self
|
||||
let mut tenant_counts: Vec<(NodeId, usize)> = self
|
||||
.nodes
|
||||
.iter()
|
||||
.filter_map(|(k, v)| {
|
||||
if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No {
|
||||
None
|
||||
} else {
|
||||
Some((
|
||||
*k,
|
||||
context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
|
||||
v.shard_count,
|
||||
))
|
||||
Some((*k, v.shard_count))
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Sort by, in order of precedence:
|
||||
// 1st: Affinity score. We should never pick a higher-score node if a lower-score node is available
|
||||
// 2nd: Utilization. Within nodes with the same affinity, use the least loaded nodes.
|
||||
// 3rd: Node ID. This is a convenience to make selection deterministic in tests and empty systems.
|
||||
scores.sort_by_key(|i| (i.1, i.2, i.0));
|
||||
// Sort by tenant count. Nodes with the same tenant count are sorted by ID.
|
||||
tenant_counts.sort_by_key(|i| (i.1, i.0));
|
||||
|
||||
if scores.is_empty() {
|
||||
if tenant_counts.is_empty() {
|
||||
// After applying constraints, no pageservers were left. We log some detail about
|
||||
// the state of nodes to help understand why this happened. This is not logged as an error because
|
||||
// it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard.
|
||||
@@ -344,11 +260,10 @@ impl Scheduler {
|
||||
return Err(ScheduleError::ImpossibleConstraint);
|
||||
}
|
||||
|
||||
// Lowest score wins
|
||||
let node_id = scores.first().unwrap().0;
|
||||
let node_id = tenant_counts.first().unwrap().0;
|
||||
tracing::info!(
|
||||
"scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
|
||||
scores.iter().map(|i| i.0 .0).collect::<Vec<_>>()
|
||||
"scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
|
||||
tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
|
||||
);
|
||||
|
||||
// Note that we do not update shard count here to reflect the scheduling: that
|
||||
@@ -356,12 +271,6 @@ impl Scheduler {
|
||||
|
||||
Ok(node_id)
|
||||
}
|
||||
|
||||
/// Unit test access to internal state
|
||||
#[cfg(test)]
|
||||
pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
|
||||
self.nodes.get(&node_id).unwrap().shard_count
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -407,17 +316,15 @@ mod tests {
|
||||
let mut t1_intent = IntentState::new();
|
||||
let mut t2_intent = IntentState::new();
|
||||
|
||||
let context = ScheduleContext::default();
|
||||
|
||||
let scheduled = scheduler.schedule_shard(&[], &context)?;
|
||||
let scheduled = scheduler.schedule_shard(&[])?;
|
||||
t1_intent.set_attached(&mut scheduler, Some(scheduled));
|
||||
let scheduled = scheduler.schedule_shard(&[], &context)?;
|
||||
let scheduled = scheduler.schedule_shard(&[])?;
|
||||
t2_intent.set_attached(&mut scheduler, Some(scheduled));
|
||||
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
|
||||
|
||||
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?;
|
||||
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?;
|
||||
t1_intent.push_secondary(&mut scheduler, scheduled);
|
||||
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
|
||||
|
||||
@@ -22,7 +22,6 @@ diesel::table! {
|
||||
placement_policy -> Varchar,
|
||||
splitting -> Int2,
|
||||
config -> Text,
|
||||
scheduling_policy -> Varchar,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -8,10 +8,7 @@ use std::{
|
||||
};
|
||||
|
||||
use crate::{
|
||||
id_lock_map::IdLockMap,
|
||||
persistence::{AbortShardSplitStatus, TenantFilter},
|
||||
reconciler::ReconcileError,
|
||||
scheduler::ScheduleContext,
|
||||
id_lock_map::IdLockMap, persistence::AbortShardSplitStatus, reconciler::ReconcileError,
|
||||
};
|
||||
use anyhow::Context;
|
||||
use control_plane::storage_controller::{
|
||||
@@ -20,14 +17,12 @@ use control_plane::storage_controller::{
|
||||
use diesel::result::DatabaseErrorKind;
|
||||
use futures::{stream::FuturesUnordered, StreamExt};
|
||||
use hyper::StatusCode;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
|
||||
ShardSchedulingPolicy, TenantCreateResponse, TenantCreateResponseShard,
|
||||
TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse,
|
||||
TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
UtilizationScore,
|
||||
TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
|
||||
TenantDescribeResponseShard, TenantLocateResponse, TenantShardMigrateRequest,
|
||||
TenantShardMigrateResponse, UtilizationScore,
|
||||
},
|
||||
models::{SecondaryProgress, TenantConfigRequest},
|
||||
};
|
||||
@@ -56,6 +51,7 @@ use utils::{
|
||||
generation::Generation,
|
||||
http::error::ApiError,
|
||||
id::{NodeId, TenantId, TimelineId},
|
||||
seqwait::SeqWait,
|
||||
sync::gate::Gate,
|
||||
};
|
||||
|
||||
@@ -70,6 +66,7 @@ use crate::{
|
||||
IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
|
||||
ReconcilerWaiter, TenantState,
|
||||
},
|
||||
Sequence,
|
||||
};
|
||||
|
||||
// For operations that should be quick, like attaching a new tenant
|
||||
@@ -347,15 +344,9 @@ impl Service {
|
||||
}
|
||||
|
||||
// Populate each tenant's intent state
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
for (tenant_shard_id, tenant_state) in tenants.iter_mut() {
|
||||
if tenant_shard_id.shard_number == ShardNumber(0) {
|
||||
// Reset scheduling context each time we advance to the next Tenant
|
||||
schedule_context = ScheduleContext::default();
|
||||
}
|
||||
|
||||
tenant_state.intent_from_observed(scheduler);
|
||||
if let Err(e) = tenant_state.schedule(scheduler, &mut schedule_context) {
|
||||
if let Err(e) = tenant_state.schedule(scheduler) {
|
||||
// Non-fatal error: we are unable to properly schedule the tenant, perhaps because
|
||||
// not enough pageservers are available. The tenant may well still be available
|
||||
// to clients.
|
||||
@@ -679,13 +670,7 @@ impl Service {
|
||||
let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD);
|
||||
while !self.cancel.is_cancelled() {
|
||||
tokio::select! {
|
||||
_ = interval.tick() => {
|
||||
let reconciles_spawned = self.reconcile_all();
|
||||
if reconciles_spawned == 0 {
|
||||
// Run optimizer only when we didn't find any other work to do
|
||||
self.optimize_all();
|
||||
}
|
||||
}
|
||||
_ = interval.tick() => { self.reconcile_all(); }
|
||||
_ = self.cancel.cancelled() => return
|
||||
}
|
||||
}
|
||||
@@ -972,14 +957,30 @@ impl Service {
|
||||
}
|
||||
for tsp in tenant_shard_persistence {
|
||||
let tenant_shard_id = tsp.get_tenant_shard_id()?;
|
||||
|
||||
let shard_identity = tsp.get_shard_identity()?;
|
||||
// We will populate intent properly later in [`Self::startup_reconcile`], initially populate
|
||||
// it with what we can infer: the node for which a generation was most recently issued.
|
||||
let mut intent = IntentState::new();
|
||||
if let Some(generation_pageserver) = tsp.generation_pageserver {
|
||||
intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64)));
|
||||
}
|
||||
let new_tenant = TenantState::from_persistent(tsp, intent)?;
|
||||
|
||||
let new_tenant = TenantState {
|
||||
tenant_shard_id,
|
||||
shard: shard_identity,
|
||||
sequence: Sequence::initial(),
|
||||
generation: tsp.generation.map(|g| Generation::new(g as u32)),
|
||||
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
|
||||
intent,
|
||||
observed: ObservedState::new(),
|
||||
config: serde_json::from_str(&tsp.config).unwrap(),
|
||||
reconciler: None,
|
||||
splitting: tsp.splitting,
|
||||
waiter: Arc::new(SeqWait::new(Sequence::initial())),
|
||||
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
|
||||
last_error: Arc::default(),
|
||||
pending_compute_notification: false,
|
||||
};
|
||||
|
||||
tenants.insert(tenant_shard_id, new_tenant);
|
||||
}
|
||||
@@ -1103,8 +1104,6 @@ impl Service {
|
||||
placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(),
|
||||
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
|
||||
splitting: SplitState::default(),
|
||||
scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
|
||||
.unwrap(),
|
||||
};
|
||||
|
||||
match self.persistence.insert_tenant_shards(vec![tsp]).await {
|
||||
@@ -1157,10 +1156,9 @@ impl Service {
|
||||
// when we are reattaching a detached tenant.
|
||||
self.persistence
|
||||
.update_tenant_shard(
|
||||
TenantFilter::Shard(attach_req.tenant_shard_id),
|
||||
Some(PlacementPolicy::Attached(0)),
|
||||
Some(conf),
|
||||
None,
|
||||
attach_req.tenant_shard_id,
|
||||
PlacementPolicy::Attached(0),
|
||||
conf,
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
@@ -1525,8 +1523,6 @@ impl Service {
|
||||
&self,
|
||||
create_req: TenantCreateRequest,
|
||||
) -> Result<TenantCreateResponse, ApiError> {
|
||||
let tenant_id = create_req.new_tenant_id.tenant_id;
|
||||
|
||||
// Exclude any concurrent attempts to create/access the same tenant ID
|
||||
let _tenant_lock = self
|
||||
.tenant_op_locks
|
||||
@@ -1535,12 +1531,7 @@ impl Service {
|
||||
|
||||
let (response, waiters) = self.do_tenant_create(create_req).await?;
|
||||
|
||||
if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
|
||||
// Avoid deadlock: reconcile may fail while notifying compute, if the cloud control plane refuses to
|
||||
// accept compute notifications while it is in the process of creating. Reconciliation will
|
||||
// be retried in the background.
|
||||
tracing::warn!(%tenant_id, "Reconcile not done yet while creating tenant ({e})");
|
||||
}
|
||||
self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?;
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
@@ -1617,31 +1608,15 @@ impl Service {
|
||||
placement_policy: serde_json::to_string(&placement_policy).unwrap(),
|
||||
config: serde_json::to_string(&create_req.config).unwrap(),
|
||||
splitting: SplitState::default(),
|
||||
scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
|
||||
.unwrap(),
|
||||
})
|
||||
.collect();
|
||||
|
||||
match self
|
||||
.persistence
|
||||
self.persistence
|
||||
.insert_tenant_shards(persist_tenant_shards)
|
||||
.await
|
||||
{
|
||||
Ok(_) => {}
|
||||
Err(DatabaseError::Query(diesel::result::Error::DatabaseError(
|
||||
DatabaseErrorKind::UniqueViolation,
|
||||
_,
|
||||
))) => {
|
||||
// Unique key violation: this is probably a retry. Because the shard count is part of the unique key,
|
||||
// if we see a unique key violation it means that the creation request's shard count matches the previous
|
||||
// creation's shard count.
|
||||
tracing::info!("Tenant shards already present in database, proceeding with idempotent creation...");
|
||||
}
|
||||
// Any other database error is unexpected and a bug.
|
||||
Err(e) => return Err(ApiError::InternalServerError(anyhow::anyhow!(e))),
|
||||
};
|
||||
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
.map_err(|e| {
|
||||
// TODO: distinguish primary key constraint (idempotent, OK), from other errors
|
||||
ApiError::InternalServerError(anyhow::anyhow!(e))
|
||||
})?;
|
||||
|
||||
let (waiters, response_shards) = {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
@@ -1664,14 +1639,11 @@ impl Service {
|
||||
// attached and secondary locations (independently) away from those
|
||||
// pageservers also holding a shard for this tenant.
|
||||
|
||||
entry
|
||||
.get_mut()
|
||||
.schedule(scheduler, &mut schedule_context)
|
||||
.map_err(|e| {
|
||||
ApiError::Conflict(format!(
|
||||
"Failed to schedule shard {tenant_shard_id}: {e}"
|
||||
))
|
||||
})?;
|
||||
entry.get_mut().schedule(scheduler).map_err(|e| {
|
||||
ApiError::Conflict(format!(
|
||||
"Failed to schedule shard {tenant_shard_id}: {e}"
|
||||
))
|
||||
})?;
|
||||
|
||||
if let Some(node_id) = entry.get().intent.get_attached() {
|
||||
let generation = entry
|
||||
@@ -1699,7 +1671,7 @@ impl Service {
|
||||
|
||||
state.generation = initial_generation;
|
||||
state.config = create_req.config.clone();
|
||||
if let Err(e) = state.schedule(scheduler, &mut schedule_context) {
|
||||
if let Err(e) = state.schedule(scheduler) {
|
||||
schcedule_error = Some(e);
|
||||
}
|
||||
|
||||
@@ -1907,7 +1879,6 @@ impl Service {
|
||||
// Persist updates
|
||||
// Ordering: write to the database before applying changes in-memory, so that
|
||||
// we will not appear to time-travel backwards on a restart.
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
for ShardUpdate {
|
||||
tenant_shard_id,
|
||||
placement_policy,
|
||||
@@ -1917,11 +1888,10 @@ impl Service {
|
||||
{
|
||||
self.persistence
|
||||
.update_tenant_shard(
|
||||
TenantFilter::Shard(*tenant_shard_id),
|
||||
Some(placement_policy.clone()),
|
||||
Some(tenant_config.clone()),
|
||||
*tenant_shard_id,
|
||||
placement_policy.clone(),
|
||||
tenant_config.clone(),
|
||||
*generation,
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
@@ -1955,7 +1925,7 @@ impl Service {
|
||||
shard.generation = Some(generation);
|
||||
}
|
||||
|
||||
shard.schedule(scheduler, &mut schedule_context)?;
|
||||
shard.schedule(scheduler)?;
|
||||
|
||||
let maybe_waiter = self.maybe_reconcile_shard(shard, nodes);
|
||||
if let Some(waiter) = maybe_waiter {
|
||||
@@ -1999,13 +1969,7 @@ impl Service {
|
||||
let config = req.config;
|
||||
|
||||
self.persistence
|
||||
.update_tenant_shard(
|
||||
TenantFilter::Tenant(req.tenant_id),
|
||||
None,
|
||||
Some(config.clone()),
|
||||
None,
|
||||
None,
|
||||
)
|
||||
.update_tenant_config(req.tenant_id, config.clone())
|
||||
.await?;
|
||||
|
||||
let waiters = {
|
||||
@@ -2115,7 +2079,7 @@ impl Service {
|
||||
let scheduler = &locked.scheduler;
|
||||
// Right now we only perform the operation on a single node without parallelization
|
||||
// TODO fan out the operation to multiple nodes for better performance
|
||||
let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?;
|
||||
let node_id = scheduler.schedule_shard(&[])?;
|
||||
let node = locked
|
||||
.nodes
|
||||
.get(&node_id)
|
||||
@@ -2358,58 +2322,6 @@ impl Service {
|
||||
Ok(StatusCode::NOT_FOUND)
|
||||
}
|
||||
|
||||
/// Naming: this configures the storage controller's policies for a tenant, whereas [`Self::tenant_config_set`] is "set the TenantConfig"
|
||||
/// for a tenant. The TenantConfig is passed through to pageservers, whereas this function modifies
|
||||
/// the tenant's policies (configuration) within the storage controller
|
||||
pub(crate) async fn tenant_update_policy(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
req: TenantPolicyRequest,
|
||||
) -> Result<(), ApiError> {
|
||||
// We require an exclusive lock, because we are updating persistent and in-memory state
|
||||
let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await;
|
||||
|
||||
let TenantPolicyRequest {
|
||||
placement,
|
||||
scheduling,
|
||||
} = req;
|
||||
|
||||
self.persistence
|
||||
.update_tenant_shard(
|
||||
TenantFilter::Tenant(tenant_id),
|
||||
placement.clone(),
|
||||
None,
|
||||
None,
|
||||
scheduling,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
|
||||
if let Some(placement) = &placement {
|
||||
shard.policy = placement.clone();
|
||||
|
||||
tracing::info!(tenant_id=%shard_id.tenant_id, shard_id=%shard_id.shard_slug(),
|
||||
"Updated placement policy to {placement:?}");
|
||||
}
|
||||
|
||||
if let Some(scheduling) = &scheduling {
|
||||
shard.set_scheduling_policy(*scheduling);
|
||||
|
||||
tracing::info!(tenant_id=%shard_id.tenant_id, shard_id=%shard_id.shard_slug(),
|
||||
"Updated scheduling policy to {scheduling:?}");
|
||||
}
|
||||
|
||||
// In case scheduling is being switched back on, try it now.
|
||||
shard.schedule(scheduler, &mut schedule_context).ok();
|
||||
self.maybe_reconcile_shard(shard, nodes);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_timeline_create(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
@@ -2736,71 +2648,45 @@ impl Service {
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns None if the input iterator of shards does not include a shard with number=0
|
||||
fn tenant_describe_impl<'a>(
|
||||
&self,
|
||||
shards: impl Iterator<Item = &'a TenantState>,
|
||||
) -> Option<TenantDescribeResponse> {
|
||||
let mut shard_zero = None;
|
||||
let mut describe_shards = Vec::new();
|
||||
|
||||
for shard in shards {
|
||||
if shard.tenant_shard_id.is_zero() {
|
||||
shard_zero = Some(shard);
|
||||
}
|
||||
|
||||
describe_shards.push(TenantDescribeResponseShard {
|
||||
tenant_shard_id: shard.tenant_shard_id,
|
||||
node_attached: *shard.intent.get_attached(),
|
||||
node_secondary: shard.intent.get_secondary().to_vec(),
|
||||
last_error: shard.last_error.lock().unwrap().clone(),
|
||||
is_reconciling: shard.reconciler.is_some(),
|
||||
is_pending_compute_notification: shard.pending_compute_notification,
|
||||
is_splitting: matches!(shard.splitting, SplitState::Splitting),
|
||||
scheduling_policy: *shard.get_scheduling_policy(),
|
||||
})
|
||||
}
|
||||
|
||||
let shard_zero = shard_zero?;
|
||||
|
||||
Some(TenantDescribeResponse {
|
||||
tenant_id: shard_zero.tenant_shard_id.tenant_id,
|
||||
shards: describe_shards,
|
||||
stripe_size: shard_zero.shard.stripe_size,
|
||||
policy: shard_zero.policy.clone(),
|
||||
config: shard_zero.config.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_describe(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<TenantDescribeResponse, ApiError> {
|
||||
let locked = self.inner.read().unwrap();
|
||||
|
||||
self.tenant_describe_impl(
|
||||
locked
|
||||
.tenants
|
||||
.range(TenantShardId::tenant_range(tenant_id))
|
||||
.map(|(_k, v)| v),
|
||||
)
|
||||
.ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into()))
|
||||
}
|
||||
let mut shard_zero = None;
|
||||
let mut shards = Vec::new();
|
||||
|
||||
pub(crate) fn tenant_list(&self) -> Vec<TenantDescribeResponse> {
|
||||
let locked = self.inner.read().unwrap();
|
||||
|
||||
let mut result = Vec::new();
|
||||
for (_tenant_id, tenant_shards) in
|
||||
&locked.tenants.iter().group_by(|(id, _shard)| id.tenant_id)
|
||||
for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id))
|
||||
{
|
||||
result.push(
|
||||
self.tenant_describe_impl(tenant_shards.map(|(_k, v)| v))
|
||||
.expect("Groups are always non-empty"),
|
||||
);
|
||||
if tenant_shard_id.is_zero() {
|
||||
shard_zero = Some(shard);
|
||||
}
|
||||
|
||||
let response_shard = TenantDescribeResponseShard {
|
||||
tenant_shard_id: *tenant_shard_id,
|
||||
node_attached: *shard.intent.get_attached(),
|
||||
node_secondary: shard.intent.get_secondary().to_vec(),
|
||||
last_error: shard.last_error.lock().unwrap().clone(),
|
||||
is_reconciling: shard.reconciler.is_some(),
|
||||
is_pending_compute_notification: shard.pending_compute_notification,
|
||||
is_splitting: matches!(shard.splitting, SplitState::Splitting),
|
||||
};
|
||||
shards.push(response_shard);
|
||||
}
|
||||
|
||||
result
|
||||
let Some(shard_zero) = shard_zero else {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Tenant {tenant_id} not found").into(),
|
||||
));
|
||||
};
|
||||
|
||||
Ok(TenantDescribeResponse {
|
||||
shards,
|
||||
stripe_size: shard_zero.shard.stripe_size,
|
||||
policy: shard_zero.policy.clone(),
|
||||
config: shard_zero.config.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%op.tenant_id))]
|
||||
@@ -2893,7 +2779,7 @@ impl Service {
|
||||
|
||||
tracing::info!("Restoring parent shard {tenant_shard_id}");
|
||||
shard.splitting = SplitState::Idle;
|
||||
if let Err(e) = shard.schedule(scheduler, &mut ScheduleContext::default()) {
|
||||
if let Err(e) = shard.schedule(scheduler) {
|
||||
// If this shard can't be scheduled now (perhaps due to offline nodes or
|
||||
// capacity issues), that must not prevent us rolling back a split. In this
|
||||
// case it should be eventually scheduled in the background.
|
||||
@@ -3017,7 +2903,6 @@ impl Service {
|
||||
)
|
||||
};
|
||||
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
for child in child_ids {
|
||||
let mut child_shard = parent_ident;
|
||||
child_shard.number = child.shard_number;
|
||||
@@ -3053,7 +2938,7 @@ impl Service {
|
||||
|
||||
child_locations.push((child, pageserver, child_shard.stripe_size));
|
||||
|
||||
if let Err(e) = child_state.schedule(scheduler, &mut schedule_context) {
|
||||
if let Err(e) = child_state.schedule(scheduler) {
|
||||
// This is not fatal, because we've implicitly already got an attached
|
||||
// location for the child shard. Failure here just means we couldn't
|
||||
// find a secondary (e.g. because cluster is overloaded).
|
||||
@@ -3346,10 +3231,6 @@ impl Service {
|
||||
placement_policy: serde_json::to_string(&policy).unwrap(),
|
||||
config: serde_json::to_string(&config).unwrap(),
|
||||
splitting: SplitState::Splitting,
|
||||
|
||||
// Scheduling policies do not carry through to children
|
||||
scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
|
||||
.unwrap(),
|
||||
});
|
||||
}
|
||||
|
||||
@@ -3917,7 +3798,6 @@ impl Service {
|
||||
AvailabilityTransition::ToOffline => {
|
||||
tracing::info!("Node {} transition to offline", node_id);
|
||||
let mut tenants_affected: usize = 0;
|
||||
|
||||
for (tenant_shard_id, tenant_state) in tenants {
|
||||
if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) {
|
||||
// When a node goes offline, we set its observed configuration to None, indicating unknown: we will
|
||||
@@ -3934,13 +3814,7 @@ impl Service {
|
||||
|
||||
if tenant_state.intent.demote_attached(node_id) {
|
||||
tenant_state.sequence = tenant_state.sequence.next();
|
||||
|
||||
// TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters
|
||||
// for tenants without secondary locations: if they have a secondary location, then this
|
||||
// schedule() call is just promoting an existing secondary)
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
|
||||
match tenant_state.schedule(scheduler, &mut schedule_context) {
|
||||
match tenant_state.schedule(scheduler) {
|
||||
Err(e) => {
|
||||
// It is possible that some tenants will become unschedulable when too many pageservers
|
||||
// go offline: in this case there isn't much we can do other than make the issue observable.
|
||||
@@ -3991,6 +3865,9 @@ impl Service {
|
||||
/// Helper for methods that will try and call pageserver APIs for
|
||||
/// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
|
||||
/// is attached somewhere.
|
||||
///
|
||||
/// TODO: this doesn't actually ensure attached unless the PlacementPolicy is
|
||||
/// an attached policy. We should error out if it isn't.
|
||||
fn ensure_attached_schedule(
|
||||
&self,
|
||||
mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,
|
||||
@@ -3999,27 +3876,10 @@ impl Service {
|
||||
let mut waiters = Vec::new();
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
for (tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
|
||||
shard.schedule(scheduler, &mut schedule_context)?;
|
||||
|
||||
// The shard's policies may not result in an attached location being scheduled: this
|
||||
// is an error because our caller needs it attached somewhere.
|
||||
if shard.intent.get_attached().is_none() {
|
||||
return Err(anyhow::anyhow!(
|
||||
"Tenant {tenant_id} not scheduled to be attached"
|
||||
));
|
||||
};
|
||||
|
||||
if shard.stably_attached().is_some() {
|
||||
// We do not require the shard to be totally up to date on reconciliation: we just require
|
||||
// that it has been attached on the intended node. Other dirty state such as unattached secondary
|
||||
// locations, or compute hook notifications can be ignored.
|
||||
continue;
|
||||
}
|
||||
for (_tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
|
||||
shard.schedule(scheduler)?;
|
||||
|
||||
if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
|
||||
tracing::info!("Waiting for shard {tenant_shard_id} to reconcile, in order to ensure it is attached");
|
||||
waiters.push(waiter);
|
||||
}
|
||||
}
|
||||
@@ -4081,144 +3941,8 @@ impl Service {
|
||||
let (nodes, tenants, _scheduler) = locked.parts_mut();
|
||||
let pageservers = nodes.clone();
|
||||
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
|
||||
let mut reconciles_spawned = 0;
|
||||
for (tenant_shard_id, shard) in tenants.iter_mut() {
|
||||
if tenant_shard_id.is_zero() {
|
||||
schedule_context = ScheduleContext::default();
|
||||
}
|
||||
|
||||
// Eventual consistency: if an earlier reconcile job failed, and the shard is still
|
||||
// dirty, spawn another one
|
||||
if self.maybe_reconcile_shard(shard, &pageservers).is_some() {
|
||||
reconciles_spawned += 1;
|
||||
}
|
||||
|
||||
schedule_context.avoid(&shard.intent.all_pageservers());
|
||||
}
|
||||
|
||||
reconciles_spawned
|
||||
}
|
||||
|
||||
/// `optimize` in this context means identifying shards which have valid scheduled locations, but
|
||||
/// could be scheduled somewhere better:
|
||||
/// - Cutting over to a secondary if the node with the secondary is more lightly loaded
|
||||
/// * e.g. after a node fails then recovers, to move some work back to it
|
||||
/// - Cutting over to a secondary if it improves the spread of shard attachments within a tenant
|
||||
/// * e.g. after a shard split, the initial attached locations will all be on the node where
|
||||
/// we did the split, but are probably better placed elsewhere.
|
||||
/// - Creating new secondary locations if it improves the spreading of a sharded tenant
|
||||
/// * e.g. after a shard split, some locations will be on the same node (where the split
|
||||
/// happened), and will probably be better placed elsewhere.
|
||||
///
|
||||
/// To put it more briefly: whereas the scheduler respects soft constraints in a ScheduleContext at
|
||||
/// the time of scheduling, this function looks for cases where a better-scoring location is available
|
||||
/// according to those same soft constraints.
|
||||
fn optimize_all(&self) -> usize {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
let pageservers = nodes.clone();
|
||||
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
|
||||
let mut reconciles_spawned = 0;
|
||||
|
||||
let mut tenant_shards: Vec<&TenantState> = Vec::new();
|
||||
|
||||
// Limit on how many shards' optimizations each call to this function will execute. Combined
|
||||
// with the frequency of background calls, this acts as an implicit rate limit that runs a small
|
||||
// trickle of optimizations in the background, rather than executing a large number in parallel
|
||||
// when a change occurs.
|
||||
const MAX_OPTIMIZATIONS_PER_PASS: usize = 2;
|
||||
|
||||
let mut work = Vec::new();
|
||||
|
||||
for (tenant_shard_id, shard) in tenants.iter() {
|
||||
if tenant_shard_id.is_zero() {
|
||||
// Reset accumulators on the first shard in a tenant
|
||||
schedule_context = ScheduleContext::default();
|
||||
tenant_shards.clear();
|
||||
}
|
||||
|
||||
if work.len() >= MAX_OPTIMIZATIONS_PER_PASS {
|
||||
break;
|
||||
}
|
||||
|
||||
match shard.get_scheduling_policy() {
|
||||
ShardSchedulingPolicy::Active => {
|
||||
// Ok to do optimization
|
||||
}
|
||||
ShardSchedulingPolicy::Essential
|
||||
| ShardSchedulingPolicy::Pause
|
||||
| ShardSchedulingPolicy::Stop => {
|
||||
// Policy prevents optimizing this shard.
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Accumulate the schedule context for all the shards in a tenant: we must have
|
||||
// the total view of all shards before we can try to optimize any of them.
|
||||
schedule_context.avoid(&shard.intent.all_pageservers());
|
||||
if let Some(attached) = shard.intent.get_attached() {
|
||||
schedule_context.push_attached(*attached);
|
||||
}
|
||||
tenant_shards.push(shard);
|
||||
|
||||
// Once we have seen the last shard in the tenant, proceed to search across all shards
|
||||
// in the tenant for optimizations
|
||||
if shard.shard.number.0 == shard.shard.count.count() - 1 {
|
||||
if tenant_shards.iter().any(|s| s.reconciler.is_some()) {
|
||||
// Do not start any optimizations while another change to the tenant is ongoing: this
|
||||
// is not necessary for correctness, but simplifies operations and implicitly throttles
|
||||
// optimization changes to happen in a "trickle" over time.
|
||||
continue;
|
||||
}
|
||||
|
||||
if tenant_shards.iter().any(|s| {
|
||||
!matches!(s.splitting, SplitState::Idle)
|
||||
|| matches!(s.policy, PlacementPolicy::Detached)
|
||||
}) {
|
||||
// Never attempt to optimize a tenant that is currently being split, or
|
||||
// a tenant that is meant to be detached
|
||||
continue;
|
||||
}
|
||||
|
||||
// TODO: optimization calculations are relatively expensive: create some fast-path for
|
||||
// the common idle case (avoiding the search on tenants that we have recently checked)
|
||||
|
||||
for shard in &tenant_shards {
|
||||
if let Some(optimization) =
|
||||
// If idle, maybe optimize attachments: if a shard has a secondary location that is preferable to
|
||||
// its primary location based on soft constraints, cut it over.
|
||||
shard.optimize_attachment(nodes, &schedule_context)
|
||||
{
|
||||
work.push((shard.tenant_shard_id, optimization));
|
||||
break;
|
||||
} else if let Some(optimization) =
|
||||
// If idle, maybe optimize secondary locations: if a shard has a secondary location that would be
|
||||
// better placed on another node, based on ScheduleContext, then adjust it. This
|
||||
// covers cases like after a shard split, where we might have too many shards
|
||||
// in the same tenant with secondary locations on the node where they originally split.
|
||||
shard.optimize_secondary(scheduler, &schedule_context)
|
||||
{
|
||||
work.push((shard.tenant_shard_id, optimization));
|
||||
break;
|
||||
}
|
||||
|
||||
// TODO: extend this mechanism to prefer attaching on nodes with fewer attached
|
||||
// tenants (i.e. extend schedule state to distinguish attached from secondary counts),
|
||||
// for the total number of attachments on a node (not just within a tenant.)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (tenant_shard_id, optimization) in work {
|
||||
let shard = tenants
|
||||
.get_mut(&tenant_shard_id)
|
||||
.expect("We held lock from place we got this ID");
|
||||
shard.apply_optimization(scheduler, optimization);
|
||||
|
||||
for (_tenant_shard_id, shard) in tenants.iter_mut() {
|
||||
if self.maybe_reconcile_shard(shard, &pageservers).is_some() {
|
||||
reconciles_spawned += 1;
|
||||
}
|
||||
@@ -4227,32 +3951,6 @@ impl Service {
|
||||
reconciles_spawned
|
||||
}
|
||||
|
||||
/// Useful for tests: run whatever work a background [`Self::reconcile_all`] would have done, but
|
||||
/// also wait for any generated Reconcilers to complete. Calling this until it returns zero should
|
||||
/// put the system into a quiescent state where future background reconciliations won't do anything.
|
||||
pub(crate) async fn reconcile_all_now(&self) -> Result<usize, ReconcileWaitError> {
|
||||
let reconciles_spawned = self.reconcile_all();
|
||||
if reconciles_spawned == 0 {
|
||||
// Only optimize when we are otherwise idle
|
||||
self.optimize_all();
|
||||
}
|
||||
|
||||
let waiters = {
|
||||
let mut waiters = Vec::new();
|
||||
let locked = self.inner.read().unwrap();
|
||||
for (_tenant_shard_id, shard) in locked.tenants.iter() {
|
||||
if let Some(waiter) = shard.get_waiter() {
|
||||
waiters.push(waiter);
|
||||
}
|
||||
}
|
||||
waiters
|
||||
};
|
||||
|
||||
let waiter_count = waiters.len();
|
||||
self.await_waiters(waiters, RECONCILE_TIMEOUT).await?;
|
||||
Ok(waiter_count)
|
||||
}
|
||||
|
||||
pub async fn shutdown(&self) {
|
||||
// Note that this already stops processing any results from reconciles: so
|
||||
// we do not expect that our [`TenantState`] objects will reach a neat
|
||||
|
||||
@@ -7,9 +7,8 @@ use std::{
|
||||
use crate::{
|
||||
metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
|
||||
persistence::TenantShardPersistence,
|
||||
scheduler::{AffinityScore, MaySchedule, ScheduleContext},
|
||||
};
|
||||
use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy};
|
||||
use pageserver_api::controller_api::PlacementPolicy;
|
||||
use pageserver_api::{
|
||||
models::{LocationConfig, LocationConfigMode, TenantConfig},
|
||||
shard::{ShardIdentity, TenantShardId},
|
||||
@@ -117,10 +116,6 @@ pub(crate) struct TenantState {
|
||||
/// sending it. This is the mechanism by which compute notifications are included in the scope
|
||||
/// of state that we publish externally in an eventually consistent way.
|
||||
pub(crate) pending_compute_notification: bool,
|
||||
|
||||
// Support/debug tool: if something is going wrong or flapping with scheduling, this may
|
||||
// be set to a non-active state to avoid making changes while the issue is fixed.
|
||||
scheduling_policy: ShardSchedulingPolicy,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Debug, Serialize)]
|
||||
@@ -251,13 +246,8 @@ impl IntentState {
|
||||
|
||||
impl Drop for IntentState {
|
||||
fn drop(&mut self) {
|
||||
// Must clear before dropping, to avoid leaving stale refcounts in the Scheduler.
|
||||
// We do not check this while panicking, to avoid polluting unit test failures or
|
||||
// other assertions with this assertion's output. It's still wrong to leak these,
|
||||
// but if we already have a panic then we don't need to independently flag this case.
|
||||
if !(std::thread::panicking()) {
|
||||
debug_assert!(self.attached.is_none() && self.secondary.is_empty());
|
||||
}
|
||||
// Must clear before dropping, to avoid leaving stale refcounts in the Scheduler
|
||||
debug_assert!(self.attached.is_none() && self.secondary.is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -302,26 +292,6 @@ pub enum ReconcileWaitError {
|
||||
Failed(TenantShardId, String),
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
pub(crate) struct ReplaceSecondary {
|
||||
old_node_id: NodeId,
|
||||
new_node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
pub(crate) struct MigrateAttachment {
|
||||
old_attached_node_id: NodeId,
|
||||
new_attached_node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
pub(crate) enum ScheduleOptimization {
|
||||
// Replace one of our secondary locations with a different node
|
||||
ReplaceSecondary(ReplaceSecondary),
|
||||
// Migrate attachment to an existing secondary location
|
||||
MigrateAttachment(MigrateAttachment),
|
||||
}
|
||||
|
||||
impl ReconcilerWaiter {
|
||||
pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> {
|
||||
tokio::select! {
|
||||
@@ -400,7 +370,6 @@ impl TenantState {
|
||||
error_waiter: Arc::new(SeqWait::new(Sequence(0))),
|
||||
last_error: Arc::default(),
|
||||
pending_compute_notification: false,
|
||||
scheduling_policy: ShardSchedulingPolicy::default(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -456,7 +425,6 @@ impl TenantState {
|
||||
fn schedule_attached(
|
||||
&mut self,
|
||||
scheduler: &mut Scheduler,
|
||||
context: &ScheduleContext,
|
||||
) -> Result<(bool, NodeId), ScheduleError> {
|
||||
// No work to do if we already have an attached tenant
|
||||
if let Some(node_id) = self.intent.attached {
|
||||
@@ -470,33 +438,14 @@ impl TenantState {
|
||||
Ok((true, promote_secondary))
|
||||
} else {
|
||||
// Pick a fresh node: either we had no secondaries or none were schedulable
|
||||
let node_id = scheduler.schedule_shard(&self.intent.secondary, context)?;
|
||||
let node_id = scheduler.schedule_shard(&self.intent.secondary)?;
|
||||
tracing::debug!("Selected {} as attached", node_id);
|
||||
self.intent.set_attached(scheduler, Some(node_id));
|
||||
Ok((true, node_id))
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn schedule(
|
||||
&mut self,
|
||||
scheduler: &mut Scheduler,
|
||||
context: &mut ScheduleContext,
|
||||
) -> Result<(), ScheduleError> {
|
||||
let r = self.do_schedule(scheduler, context);
|
||||
|
||||
context.avoid(&self.intent.all_pageservers());
|
||||
if let Some(attached) = self.intent.get_attached() {
|
||||
context.push_attached(*attached);
|
||||
}
|
||||
|
||||
r
|
||||
}
|
||||
|
||||
pub(crate) fn do_schedule(
|
||||
&mut self,
|
||||
scheduler: &mut Scheduler,
|
||||
context: &ScheduleContext,
|
||||
) -> Result<(), ScheduleError> {
|
||||
pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
|
||||
// TODO: before scheduling new nodes, check if any existing content in
|
||||
// self.intent refers to pageservers that are offline, and pick other
|
||||
// pageservers if so.
|
||||
@@ -504,16 +453,6 @@ impl TenantState {
|
||||
// TODO: respect the splitting bit on tenants: if they are currently splitting then we may not
|
||||
// change their attach location.
|
||||
|
||||
match self.scheduling_policy {
|
||||
ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => {}
|
||||
ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => {
|
||||
// Warn to make it obvious why other things aren't happening/working, if we skip scheduling
|
||||
tracing::warn!(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
|
||||
"Scheduling is disabled by policy {:?}", self.scheduling_policy);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
// Build the set of pageservers already in use by this tenant, to avoid scheduling
|
||||
// more work on the same pageservers we're already using.
|
||||
let mut modified = false;
|
||||
@@ -540,13 +479,12 @@ impl TenantState {
|
||||
}
|
||||
|
||||
// Should have exactly one attached, and N secondaries
|
||||
let (modified_attached, attached_node_id) =
|
||||
self.schedule_attached(scheduler, context)?;
|
||||
let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
|
||||
modified |= modified_attached;
|
||||
|
||||
let mut used_pageservers = vec![attached_node_id];
|
||||
while self.intent.secondary.len() < secondary_count {
|
||||
let node_id = scheduler.schedule_shard(&used_pageservers, context)?;
|
||||
let node_id = scheduler.schedule_shard(&used_pageservers)?;
|
||||
self.intent.push_secondary(scheduler, node_id);
|
||||
used_pageservers.push(node_id);
|
||||
modified = true;
|
||||
@@ -559,7 +497,7 @@ impl TenantState {
|
||||
modified = true;
|
||||
} else if self.intent.secondary.is_empty() {
|
||||
// Populate secondary by scheduling a fresh node
|
||||
let node_id = scheduler.schedule_shard(&[], context)?;
|
||||
let node_id = scheduler.schedule_shard(&[])?;
|
||||
self.intent.push_secondary(scheduler, node_id);
|
||||
modified = true;
|
||||
}
|
||||
@@ -586,167 +524,6 @@ impl TenantState {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Optimize attachments: if a shard has a secondary location that is preferable to
|
||||
/// its primary location based on soft constraints, switch that secondary location
|
||||
/// to be attached.
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
|
||||
pub(crate) fn optimize_attachment(
|
||||
&self,
|
||||
nodes: &HashMap<NodeId, Node>,
|
||||
schedule_context: &ScheduleContext,
|
||||
) -> Option<ScheduleOptimization> {
|
||||
let attached = (*self.intent.get_attached())?;
|
||||
if self.intent.secondary.is_empty() {
|
||||
// We can only do useful work if we have both attached and secondary locations: this
|
||||
// function doesn't schedule new locations, only swaps between attached and secondaries.
|
||||
return None;
|
||||
}
|
||||
|
||||
let current_affinity_score = schedule_context.get_node_affinity(attached);
|
||||
let current_attachment_count = schedule_context.get_node_attachments(attached);
|
||||
|
||||
// Generate score for each node, dropping any un-schedulable nodes.
|
||||
let all_pageservers = self.intent.all_pageservers();
|
||||
let mut scores = all_pageservers
|
||||
.iter()
|
||||
.flat_map(|node_id| {
|
||||
if matches!(
|
||||
nodes
|
||||
.get(node_id)
|
||||
.map(|n| n.may_schedule())
|
||||
.unwrap_or(MaySchedule::No),
|
||||
MaySchedule::No
|
||||
) {
|
||||
None
|
||||
} else {
|
||||
let affinity_score = schedule_context.get_node_affinity(*node_id);
|
||||
let attachment_count = schedule_context.get_node_attachments(*node_id);
|
||||
Some((*node_id, affinity_score, attachment_count))
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// Sort precedence:
|
||||
// 1st - prefer nodes with the lowest total affinity score
|
||||
// 2nd - prefer nodes with the lowest number of attachments in this context
|
||||
// 3rd - if all else is equal, sort by node ID for determinism in tests.
|
||||
scores.sort_by_key(|i| (i.1, i.2, i.0));
|
||||
|
||||
if let Some((preferred_node, preferred_affinity_score, preferred_attachment_count)) =
|
||||
scores.first()
|
||||
{
|
||||
if attached != *preferred_node {
|
||||
// The best alternative must be more than 1 better than us, otherwise we could end
|
||||
// up flapping back next time we're called (e.g. there's no point migrating from
|
||||
// a location with score 1 to a score zero, because on next location the situation
|
||||
// would be the same, but in reverse).
|
||||
if current_affinity_score > *preferred_affinity_score + AffinityScore(1)
|
||||
|| current_attachment_count > *preferred_attachment_count + 1
|
||||
{
|
||||
tracing::info!(
|
||||
"Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})",
|
||||
self.intent.get_secondary()
|
||||
);
|
||||
return Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
|
||||
old_attached_node_id: attached,
|
||||
new_attached_node_id: *preferred_node,
|
||||
}));
|
||||
}
|
||||
} else {
|
||||
tracing::debug!(
|
||||
"Node {} is already preferred (score {:?})",
|
||||
preferred_node,
|
||||
preferred_affinity_score
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Fall-through: we didn't find an optimization
|
||||
None
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
|
||||
pub(crate) fn optimize_secondary(
|
||||
&self,
|
||||
scheduler: &Scheduler,
|
||||
schedule_context: &ScheduleContext,
|
||||
) -> Option<ScheduleOptimization> {
|
||||
if self.intent.secondary.is_empty() {
|
||||
// We can only do useful work if we have both attached and secondary locations: this
|
||||
// function doesn't schedule new locations, only swaps between attached and secondaries.
|
||||
return None;
|
||||
}
|
||||
|
||||
for secondary in self.intent.get_secondary() {
|
||||
let Some(affinity_score) = schedule_context.nodes.get(secondary) else {
|
||||
// We're already on a node unaffected by any affinity constraints,
|
||||
// so we won't change it.
|
||||
continue;
|
||||
};
|
||||
|
||||
// Let the scheduler suggest a node, where it would put us if we were scheduling afresh
|
||||
// This implicitly limits the choice to nodes that are available, and prefers nodes
|
||||
// with lower utilization.
|
||||
let Ok(candidate_node) =
|
||||
scheduler.schedule_shard(&self.intent.all_pageservers(), schedule_context)
|
||||
else {
|
||||
// A scheduling error means we have no possible candidate replacements
|
||||
continue;
|
||||
};
|
||||
|
||||
let candidate_affinity_score = schedule_context
|
||||
.nodes
|
||||
.get(&candidate_node)
|
||||
.unwrap_or(&AffinityScore::FREE);
|
||||
|
||||
// The best alternative must be more than 1 better than us, otherwise we could end
|
||||
// up flapping back next time we're called.
|
||||
if *candidate_affinity_score + AffinityScore(1) < *affinity_score {
|
||||
// If some other node is available and has a lower score than this node, then
|
||||
// that other node is a good place to migrate to.
|
||||
tracing::info!(
|
||||
"Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})",
|
||||
self.intent.get_secondary()
|
||||
);
|
||||
return Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
|
||||
old_node_id: *secondary,
|
||||
new_node_id: candidate_node,
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
pub(crate) fn apply_optimization(
|
||||
&mut self,
|
||||
scheduler: &mut Scheduler,
|
||||
optimization: ScheduleOptimization,
|
||||
) {
|
||||
metrics::METRICS_REGISTRY
|
||||
.metrics_group
|
||||
.storage_controller_schedule_optimization
|
||||
.inc();
|
||||
|
||||
match optimization {
|
||||
ScheduleOptimization::MigrateAttachment(MigrateAttachment {
|
||||
old_attached_node_id,
|
||||
new_attached_node_id,
|
||||
}) => {
|
||||
self.intent.demote_attached(old_attached_node_id);
|
||||
self.intent
|
||||
.promote_attached(scheduler, new_attached_node_id);
|
||||
}
|
||||
ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
|
||||
old_node_id,
|
||||
new_node_id,
|
||||
}) => {
|
||||
self.intent.remove_secondary(scheduler, old_node_id);
|
||||
self.intent.push_secondary(scheduler, new_node_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Query whether the tenant's observed state for attached node matches its intent state, and if so,
|
||||
/// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that
|
||||
/// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there.
|
||||
@@ -891,19 +668,6 @@ impl TenantState {
|
||||
}
|
||||
}
|
||||
|
||||
// Pre-checks done: finally check whether we may actually do the work
|
||||
match self.scheduling_policy {
|
||||
ShardSchedulingPolicy::Active
|
||||
| ShardSchedulingPolicy::Essential
|
||||
| ShardSchedulingPolicy::Pause => {}
|
||||
ShardSchedulingPolicy::Stop => {
|
||||
// We only reach this point if there is work to do and we're going to skip
|
||||
// doing it: warn to make it obvious why this tenant isn't doing what it ought to.
|
||||
tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy);
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
// Build list of nodes from which the reconciler should detach
|
||||
let mut detach = Vec::new();
|
||||
for node_id in self.observed.locations.keys() {
|
||||
@@ -1040,22 +804,6 @@ impl TenantState {
|
||||
})
|
||||
}
|
||||
|
||||
/// Get a waiter for any reconciliation in flight, but do not start reconciliation
|
||||
/// if it is not already running
|
||||
pub(crate) fn get_waiter(&self) -> Option<ReconcilerWaiter> {
|
||||
if self.reconciler.is_some() {
|
||||
Some(ReconcilerWaiter {
|
||||
tenant_shard_id: self.tenant_shard_id,
|
||||
seq_wait: self.waiter.clone(),
|
||||
error_seq_wait: self.error_waiter.clone(),
|
||||
error: self.last_error.clone(),
|
||||
seq: self.sequence,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Called when a ReconcileResult has been emitted and the service is updating
|
||||
/// our state: if the result is from a sequence >= my ReconcileHandle, then drop
|
||||
/// the handle to indicate there is no longer a reconciliation in progress.
|
||||
@@ -1081,40 +829,6 @@ impl TenantState {
|
||||
debug_assert!(!self.intent.all_pageservers().contains(&node_id));
|
||||
}
|
||||
|
||||
pub(crate) fn set_scheduling_policy(&mut self, p: ShardSchedulingPolicy) {
|
||||
self.scheduling_policy = p;
|
||||
}
|
||||
|
||||
pub(crate) fn get_scheduling_policy(&self) -> &ShardSchedulingPolicy {
|
||||
&self.scheduling_policy
|
||||
}
|
||||
|
||||
pub(crate) fn from_persistent(
|
||||
tsp: TenantShardPersistence,
|
||||
intent: IntentState,
|
||||
) -> anyhow::Result<Self> {
|
||||
let tenant_shard_id = tsp.get_tenant_shard_id()?;
|
||||
let shard_identity = tsp.get_shard_identity()?;
|
||||
|
||||
Ok(Self {
|
||||
tenant_shard_id,
|
||||
shard: shard_identity,
|
||||
sequence: Sequence::initial(),
|
||||
generation: tsp.generation.map(|g| Generation::new(g as u32)),
|
||||
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
|
||||
intent,
|
||||
observed: ObservedState::new(),
|
||||
config: serde_json::from_str(&tsp.config).unwrap(),
|
||||
reconciler: None,
|
||||
splitting: tsp.splitting,
|
||||
waiter: Arc::new(SeqWait::new(Sequence::initial())),
|
||||
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
|
||||
last_error: Arc::default(),
|
||||
pending_compute_notification: false,
|
||||
scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn to_persistent(&self) -> TenantShardPersistence {
|
||||
TenantShardPersistence {
|
||||
tenant_id: self.tenant_shard_id.tenant_id.to_string(),
|
||||
@@ -1126,7 +840,6 @@ impl TenantState {
|
||||
placement_policy: serde_json::to_string(&self.policy).unwrap(),
|
||||
config: serde_json::to_string(&self.config).unwrap(),
|
||||
splitting: SplitState::default(),
|
||||
scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1165,32 +878,6 @@ pub(crate) mod tests {
|
||||
)
|
||||
}
|
||||
|
||||
fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec<TenantState> {
|
||||
let tenant_id = TenantId::generate();
|
||||
|
||||
(0..shard_count.count())
|
||||
.map(|i| {
|
||||
let shard_number = ShardNumber(i);
|
||||
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id,
|
||||
shard_number,
|
||||
shard_count,
|
||||
};
|
||||
TenantState::new(
|
||||
tenant_shard_id,
|
||||
ShardIdentity::new(
|
||||
shard_number,
|
||||
shard_count,
|
||||
pageserver_api::shard::ShardStripeSize(32768),
|
||||
)
|
||||
.unwrap(),
|
||||
policy.clone(),
|
||||
)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Test the scheduling behaviors used when a tenant configured for HA is subject
|
||||
/// to nodes being marked offline.
|
||||
#[test]
|
||||
@@ -1200,11 +887,10 @@ pub(crate) mod tests {
|
||||
let mut nodes = make_test_nodes(3);
|
||||
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
let mut context = ScheduleContext::default();
|
||||
|
||||
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
tenant_state
|
||||
.schedule(&mut scheduler, &mut context)
|
||||
.schedule(&mut scheduler)
|
||||
.expect("we have enough nodes, scheduling should work");
|
||||
|
||||
// Expect to initially be scheduled onto different nodes
|
||||
@@ -1230,7 +916,7 @@ pub(crate) mod tests {
|
||||
|
||||
// Scheduling the node should promote the still-available secondary node to attached
|
||||
tenant_state
|
||||
.schedule(&mut scheduler, &mut context)
|
||||
.schedule(&mut scheduler)
|
||||
.expect("active nodes are available");
|
||||
assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id);
|
||||
|
||||
@@ -1294,219 +980,4 @@ pub(crate) mod tests {
|
||||
tenant_state.intent.clear(&mut scheduler);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scheduling_mode() -> anyhow::Result<()> {
|
||||
let nodes = make_test_nodes(3);
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
|
||||
// In pause mode, schedule() shouldn't do anything
|
||||
tenant_state.scheduling_policy = ShardSchedulingPolicy::Pause;
|
||||
assert!(tenant_state
|
||||
.schedule(&mut scheduler, &mut ScheduleContext::default())
|
||||
.is_ok());
|
||||
assert!(tenant_state.intent.all_pageservers().is_empty());
|
||||
|
||||
// In active mode, schedule() works
|
||||
tenant_state.scheduling_policy = ShardSchedulingPolicy::Active;
|
||||
assert!(tenant_state
|
||||
.schedule(&mut scheduler, &mut ScheduleContext::default())
|
||||
.is_ok());
|
||||
assert!(!tenant_state.intent.all_pageservers().is_empty());
|
||||
|
||||
tenant_state.intent.clear(&mut scheduler);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn optimize_attachment() -> anyhow::Result<()> {
|
||||
let nodes = make_test_nodes(3);
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
|
||||
// Initially: both shards attached on node 1, and both have secondary locations
|
||||
// on different nodes.
|
||||
shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
|
||||
shard_a.intent.push_secondary(&mut scheduler, NodeId(2));
|
||||
shard_b.intent.set_attached(&mut scheduler, Some(NodeId(1)));
|
||||
shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
|
||||
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
schedule_context.avoid(&shard_a.intent.all_pageservers());
|
||||
schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
|
||||
schedule_context.avoid(&shard_b.intent.all_pageservers());
|
||||
schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
|
||||
|
||||
let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context);
|
||||
|
||||
// Either shard should recognize that it has the option to switch to a secondary location where there
|
||||
// would be no other shards from the same tenant, and request to do so.
|
||||
assert_eq!(
|
||||
optimization_a,
|
||||
Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
|
||||
old_attached_node_id: NodeId(1),
|
||||
new_attached_node_id: NodeId(2)
|
||||
}))
|
||||
);
|
||||
|
||||
// Note that optimizing two shards in the same tenant with the same ScheduleContext is
|
||||
// mutually exclusive (the optimization of one invalidates the stats) -- it is the responsibility
|
||||
// of [`Service::optimize_all`] to avoid trying
|
||||
// to do optimizations for multiple shards in the same tenant at the same time. Generating
|
||||
// both optimizations is just done for test purposes
|
||||
let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context);
|
||||
assert_eq!(
|
||||
optimization_b,
|
||||
Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
|
||||
old_attached_node_id: NodeId(1),
|
||||
new_attached_node_id: NodeId(3)
|
||||
}))
|
||||
);
|
||||
|
||||
// Applying these optimizations should result in the end state proposed
|
||||
shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
|
||||
assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(2)));
|
||||
assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]);
|
||||
shard_b.apply_optimization(&mut scheduler, optimization_b.unwrap());
|
||||
assert_eq!(shard_b.intent.get_attached(), &Some(NodeId(3)));
|
||||
assert_eq!(shard_b.intent.get_secondary(), &vec![NodeId(1)]);
|
||||
|
||||
shard_a.intent.clear(&mut scheduler);
|
||||
shard_b.intent.clear(&mut scheduler);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn optimize_secondary() -> anyhow::Result<()> {
|
||||
let nodes = make_test_nodes(4);
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
|
||||
// Initially: the shards are attached on different nodes (1 and 2), but both have their
|
||||
// secondary locations on the same node (3).
|
||||
shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
|
||||
shard_a.intent.push_secondary(&mut scheduler, NodeId(3));
|
||||
shard_b.intent.set_attached(&mut scheduler, Some(NodeId(2)));
|
||||
shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
|
||||
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
schedule_context.avoid(&shard_a.intent.all_pageservers());
|
||||
schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
|
||||
schedule_context.avoid(&shard_b.intent.all_pageservers());
|
||||
schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
|
||||
|
||||
let optimization_a = shard_a.optimize_secondary(&scheduler, &schedule_context);
|
||||
|
||||
// Since there is a node with no locations available, the node with two locations for the
|
||||
// same tenant should generate an optimization to move one away
|
||||
assert_eq!(
|
||||
optimization_a,
|
||||
Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
|
||||
old_node_id: NodeId(3),
|
||||
new_node_id: NodeId(4)
|
||||
}))
|
||||
);
|
||||
|
||||
shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
|
||||
assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1)));
|
||||
assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(4)]);
|
||||
|
||||
shard_a.intent.clear(&mut scheduler);
|
||||
shard_b.intent.clear(&mut scheduler);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Optimize til quiescent: this emulates what Service::optimize_all does, when
|
||||
// called repeatedly in the background.
|
||||
fn optimize_til_idle(
|
||||
nodes: &HashMap<NodeId, Node>,
|
||||
scheduler: &mut Scheduler,
|
||||
shards: &mut [TenantState],
|
||||
) {
|
||||
let mut loop_n = 0;
|
||||
loop {
|
||||
let mut schedule_context = ScheduleContext::default();
|
||||
let mut any_changed = false;
|
||||
|
||||
for shard in shards.iter() {
|
||||
schedule_context.avoid(&shard.intent.all_pageservers());
|
||||
if let Some(attached) = shard.intent.get_attached() {
|
||||
schedule_context.push_attached(*attached);
|
||||
}
|
||||
}
|
||||
|
||||
for shard in shards.iter_mut() {
|
||||
let optimization = shard.optimize_attachment(nodes, &schedule_context);
|
||||
if let Some(optimization) = optimization {
|
||||
shard.apply_optimization(scheduler, optimization);
|
||||
any_changed = true;
|
||||
break;
|
||||
}
|
||||
|
||||
let optimization = shard.optimize_secondary(scheduler, &schedule_context);
|
||||
if let Some(optimization) = optimization {
|
||||
shard.apply_optimization(scheduler, optimization);
|
||||
any_changed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if !any_changed {
|
||||
break;
|
||||
}
|
||||
|
||||
// Assert no infinite loop
|
||||
loop_n += 1;
|
||||
assert!(loop_n < 1000);
|
||||
}
|
||||
}
|
||||
|
||||
/// Test the balancing behavior of shard scheduling: that it achieves a balance, and
/// that it converges.
#[test]
fn optimize_add_nodes() -> anyhow::Result<()> {
    let nodes = make_test_nodes(4);

    // Only show the scheduler a couple of nodes
    let mut scheduler = Scheduler::new([].iter());
    for id in [1, 2] {
        scheduler.node_upsert(&nodes[&NodeId(id)]);
    }

    let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4));
    let mut schedule_context = ScheduleContext::default();
    for shard in shards.iter_mut() {
        let scheduled = shard.schedule(&mut scheduler, &mut schedule_context);
        assert!(scheduled.is_ok());
    }

    // We should see equal number of locations on the two nodes.
    for id in [1, 2] {
        assert_eq!(scheduler.get_node_shard_count(NodeId(id)), 4);
    }

    // Add another two nodes: we should see the shards spread out when their optimize
    // methods are called
    for id in [3, 4] {
        scheduler.node_upsert(&nodes[&NodeId(id)]);
    }
    optimize_til_idle(&nodes, &mut scheduler, &mut shards);

    for id in 1..=4 {
        assert_eq!(scheduler.get_node_shard_count(NodeId(id)), 2);
    }

    // Release the intents before the shards go out of scope.
    for shard in shards.iter_mut() {
        shard.intent.clear(&mut scheduler);
    }

    Ok(())
}
|
||||
}
|
||||
|
||||
@@ -14,7 +14,9 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
|
||||
use control_plane::safekeeper::SafekeeperNode;
|
||||
use control_plane::storage_controller::StorageController;
|
||||
use control_plane::{broker, local_env};
|
||||
use pageserver_api::controller_api::PlacementPolicy;
|
||||
use pageserver_api::controller_api::{
|
||||
NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
|
||||
};
|
||||
use pageserver_api::models::{
|
||||
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
@@ -1058,6 +1060,21 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
}
|
||||
}
|
||||
|
||||
Some(("set-state", subcommand_args)) => {
|
||||
let pageserver = get_pageserver(env, subcommand_args)?;
|
||||
let scheduling = subcommand_args.get_one("scheduling");
|
||||
let availability = subcommand_args.get_one("availability");
|
||||
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
storage_controller
|
||||
.node_configure(NodeConfigureRequest {
|
||||
node_id: pageserver.conf.id,
|
||||
scheduling: scheduling.cloned(),
|
||||
availability: availability.cloned(),
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
|
||||
Some(("status", subcommand_args)) => {
|
||||
match get_pageserver(env, subcommand_args)?.check_status().await {
|
||||
Ok(_) => println!("Page server is up and running"),
|
||||
@@ -1498,6 +1515,12 @@ fn cli() -> Command {
|
||||
.about("Restart local pageserver")
|
||||
.arg(pageserver_config_args.clone())
|
||||
)
|
||||
.subcommand(Command::new("set-state")
|
||||
.arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
|
||||
.arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
|
||||
.about("Set scheduling or availability state of pageserver node")
|
||||
.arg(pageserver_config_args.clone())
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("storage_controller")
|
||||
|
||||
@@ -389,10 +389,6 @@ impl PageServerNode {
|
||||
.remove("image_creation_threshold")
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()?,
|
||||
image_layer_creation_check_threshold: settings
|
||||
.remove("image_layer_creation_check_threshold")
|
||||
.map(|x| x.parse::<u8>())
|
||||
.transpose()?,
|
||||
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
|
||||
walreceiver_connect_timeout: settings
|
||||
.remove("walreceiver_connect_timeout")
|
||||
@@ -505,12 +501,6 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'image_creation_threshold' as non zero integer")?,
|
||||
image_layer_creation_check_threshold: settings
|
||||
.remove("image_layer_creation_check_threshold")
|
||||
.map(|x| x.parse::<u8>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'image_creation_check_threshold' as integer")?,
|
||||
|
||||
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
|
||||
walreceiver_connect_timeout: settings
|
||||
.remove("walreceiver_connect_timeout")
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
[package]
|
||||
name = "storcon_cli"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
clap.workspace = true
|
||||
comfy-table.workspace = true
|
||||
hyper.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
reqwest.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json = { workspace = true, features = ["raw_value"] }
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tracing.workspace = true
|
||||
utils.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -1,587 +0,0 @@
|
||||
use std::{collections::HashMap, str::FromStr};
|
||||
|
||||
use clap::{Parser, Subcommand};
|
||||
use hyper::Method;
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
|
||||
TenantDescribeResponse, TenantPolicyRequest,
|
||||
},
|
||||
models::{
|
||||
ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
|
||||
TenantShardSplitRequest, TenantShardSplitResponse,
|
||||
},
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
};
|
||||
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
|
||||
use reqwest::Url;
|
||||
use serde::{de::DeserializeOwned, Serialize};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
use pageserver_api::controller_api::{
|
||||
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
|
||||
TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
};
|
||||
|
||||
#[derive(Subcommand, Debug)]
|
||||
enum Command {
|
||||
/// Register a pageserver with the storage controller. This shouldn't usually be necessary,
|
||||
/// since pageservers auto-register when they start up
|
||||
NodeRegister {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
|
||||
#[arg(long)]
|
||||
listen_pg_addr: String,
|
||||
#[arg(long)]
|
||||
listen_pg_port: u16,
|
||||
|
||||
#[arg(long)]
|
||||
listen_http_addr: String,
|
||||
#[arg(long)]
|
||||
listen_http_port: u16,
|
||||
},
|
||||
|
||||
/// Modify a node's configuration in the storage controller
|
||||
NodeConfigure {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
|
||||
/// Availability is usually auto-detected based on heartbeats. Set 'offline' here to
|
||||
/// manually mark a node offline
|
||||
#[arg(long)]
|
||||
availability: Option<NodeAvailabilityArg>,
|
||||
/// Scheduling policy controls whether tenant shards may be scheduled onto this node.
|
||||
#[arg(long)]
|
||||
scheduling: Option<NodeSchedulingPolicy>,
|
||||
},
|
||||
/// Modify a tenant's policies in the storage controller
|
||||
TenantPolicy {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
/// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`),
|
||||
/// or is in the normal attached state with N secondary locations (`attached:N`)
|
||||
#[arg(long)]
|
||||
placement: Option<PlacementPolicyArg>,
|
||||
/// Scheduling policy enables pausing the controller's scheduling activity involving this tenant. `active` is normal,
|
||||
/// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents
|
||||
/// all reconciliation activity including for scheduling changes already made. `pause` and `stop` can make a tenant
|
||||
/// unavailable, and are only for use in emergencies.
|
||||
#[arg(long)]
|
||||
scheduling: Option<ShardSchedulingPolicyArg>,
|
||||
},
|
||||
/// List nodes known to the storage controller
|
||||
Nodes {},
|
||||
/// List tenants known to the storage controller
|
||||
Tenants {},
|
||||
/// Create a new tenant in the storage controller, and by extension on pageservers.
|
||||
TenantCreate {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
},
|
||||
/// Delete a tenant in the storage controller, and by extension on pageservers.
|
||||
TenantDelete {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
},
|
||||
/// Split an existing tenant into a higher number of shards than its current shard count.
|
||||
TenantShardSplit {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
#[arg(long)]
|
||||
shard_count: u8,
|
||||
/// Optional, in 8kiB pages. e.g. set 2048 for 16MB stripes.
|
||||
#[arg(long)]
|
||||
stripe_size: Option<u32>,
|
||||
},
|
||||
/// Migrate the attached location for a tenant shard to a specific pageserver.
|
||||
TenantShardMigrate {
|
||||
#[arg(long)]
|
||||
tenant_shard_id: TenantShardId,
|
||||
#[arg(long)]
|
||||
node: NodeId,
|
||||
},
|
||||
/// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
|
||||
/// that is passed through to pageservers, and does not affect storage controller behavior.
|
||||
TenantConfig {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
#[arg(long)]
|
||||
config: String,
|
||||
},
|
||||
/// Attempt to balance the locations for a tenant across pageservers. This is a client-side
|
||||
/// alternative to the storage controller's scheduling optimization behavior.
|
||||
TenantScatter {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
},
|
||||
/// Print details about a particular tenant, including all its shards' states.
|
||||
TenantDescribe {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(
|
||||
author,
|
||||
version,
|
||||
about,
|
||||
long_about = "CLI for Storage Controller Support/Debug"
|
||||
)]
|
||||
#[command(arg_required_else_help(true))]
|
||||
struct Cli {
|
||||
#[arg(long)]
|
||||
/// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local`
|
||||
api: Url,
|
||||
|
||||
#[arg(long)]
|
||||
/// JWT token for authenticating with storage controller. Depending on the API used, this
|
||||
/// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint
|
||||
/// a token with both scopes to use with this tool.
|
||||
jwt: Option<String>,
|
||||
|
||||
#[command(subcommand)]
|
||||
command: Command,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct PlacementPolicyArg(PlacementPolicy);
|
||||
|
||||
impl FromStr for PlacementPolicyArg {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"detached" => Ok(Self(PlacementPolicy::Detached)),
|
||||
"secondary" => Ok(Self(PlacementPolicy::Secondary)),
|
||||
_ if s.starts_with("attached:") => {
|
||||
let mut splitter = s.split(':');
|
||||
let _prefix = splitter.next().unwrap();
|
||||
match splitter.next().and_then(|s| s.parse::<usize>().ok()) {
|
||||
Some(n) => Ok(Self(PlacementPolicy::Attached(n))),
|
||||
None => Err(anyhow::anyhow!(
|
||||
"Invalid format '{s}', a valid example is 'attached:1'"
|
||||
)),
|
||||
}
|
||||
}
|
||||
_ => Err(anyhow::anyhow!(
|
||||
"Unknown placement policy '{s}', try detached,secondary,attached:<n>"
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);
|
||||
|
||||
impl FromStr for ShardSchedulingPolicyArg {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"active" => Ok(Self(ShardSchedulingPolicy::Active)),
|
||||
"essential" => Ok(Self(ShardSchedulingPolicy::Essential)),
|
||||
"pause" => Ok(Self(ShardSchedulingPolicy::Pause)),
|
||||
"stop" => Ok(Self(ShardSchedulingPolicy::Stop)),
|
||||
_ => Err(anyhow::anyhow!(
|
||||
"Unknown scheduling policy '{s}', try active,essential,pause,stop"
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct NodeAvailabilityArg(NodeAvailabilityWrapper);
|
||||
|
||||
impl FromStr for NodeAvailabilityArg {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"active" => Ok(Self(NodeAvailabilityWrapper::Active)),
|
||||
"offline" => Ok(Self(NodeAvailabilityWrapper::Offline)),
|
||||
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct Client {
|
||||
base_url: Url,
|
||||
jwt_token: Option<String>,
|
||||
client: reqwest::Client,
|
||||
}
|
||||
|
||||
impl Client {
|
||||
fn new(base_url: Url, jwt_token: Option<String>) -> Self {
|
||||
Self {
|
||||
base_url,
|
||||
jwt_token,
|
||||
client: reqwest::ClientBuilder::new()
|
||||
.build()
|
||||
.expect("Failed to construct http client"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Simple HTTP request wrapper for calling into attachment service
|
||||
async fn dispatch<RQ, RS>(
|
||||
&self,
|
||||
method: hyper::Method,
|
||||
path: String,
|
||||
body: Option<RQ>,
|
||||
) -> mgmt_api::Result<RS>
|
||||
where
|
||||
RQ: Serialize + Sized,
|
||||
RS: DeserializeOwned + Sized,
|
||||
{
|
||||
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
|
||||
// for general purpose API access.
|
||||
let url = Url::from_str(&format!(
|
||||
"http://{}:{}/{path}",
|
||||
self.base_url.host_str().unwrap(),
|
||||
self.base_url.port().unwrap()
|
||||
))
|
||||
.unwrap();
|
||||
|
||||
let mut builder = self.client.request(method, url);
|
||||
if let Some(body) = body {
|
||||
builder = builder.json(&body)
|
||||
}
|
||||
if let Some(jwt_token) = &self.jwt_token {
|
||||
builder = builder.header(
|
||||
reqwest::header::AUTHORIZATION,
|
||||
format!("Bearer {jwt_token}"),
|
||||
);
|
||||
}
|
||||
|
||||
let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
|
||||
let response = response.error_from_body().await?;
|
||||
|
||||
response
|
||||
.json()
|
||||
.await
|
||||
.map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let cli = Cli::parse();
|
||||
|
||||
let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());
|
||||
|
||||
let mut trimmed = cli.api.to_string();
|
||||
trimmed.pop();
|
||||
let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());
|
||||
|
||||
match cli.command {
|
||||
Command::NodeRegister {
|
||||
node_id,
|
||||
listen_pg_addr,
|
||||
listen_pg_port,
|
||||
listen_http_addr,
|
||||
listen_http_port,
|
||||
} => {
|
||||
storcon_client
|
||||
.dispatch::<_, ()>(
|
||||
Method::POST,
|
||||
"control/v1/node".to_string(),
|
||||
Some(NodeRegisterRequest {
|
||||
node_id,
|
||||
listen_pg_addr,
|
||||
listen_pg_port,
|
||||
listen_http_addr,
|
||||
listen_http_port,
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::TenantCreate { tenant_id } => {
|
||||
vps_client
|
||||
.tenant_create(&TenantCreateRequest {
|
||||
new_tenant_id: TenantShardId::unsharded(tenant_id),
|
||||
generation: None,
|
||||
shard_parameters: ShardParameters::default(),
|
||||
placement_policy: Some(PlacementPolicy::Attached(1)),
|
||||
config: TenantConfig::default(),
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
Command::TenantDelete { tenant_id } => {
|
||||
let status = vps_client
|
||||
.tenant_delete(TenantShardId::unsharded(tenant_id))
|
||||
.await?;
|
||||
tracing::info!("Delete status: {}", status);
|
||||
}
|
||||
Command::Nodes {} => {
|
||||
let resp = storcon_client
|
||||
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/node".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
|
||||
for node in resp {
|
||||
table.add_row([
|
||||
format!("{}", node.id),
|
||||
node.listen_http_addr,
|
||||
format!("{:?}", node.scheduling),
|
||||
format!("{:?}", node.availability),
|
||||
]);
|
||||
}
|
||||
println!("{table}");
|
||||
}
|
||||
Command::NodeConfigure {
|
||||
node_id,
|
||||
availability,
|
||||
scheduling,
|
||||
} => {
|
||||
let req = NodeConfigureRequest {
|
||||
node_id,
|
||||
availability: availability.map(|a| a.0),
|
||||
scheduling,
|
||||
};
|
||||
storcon_client
|
||||
.dispatch::<_, ()>(
|
||||
Method::PUT,
|
||||
format!("control/v1/node/{node_id}/config"),
|
||||
Some(req),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::Tenants {} => {
|
||||
let resp = storcon_client
|
||||
.dispatch::<(), Vec<TenantDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/tenant".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header([
|
||||
"TenantId",
|
||||
"ShardCount",
|
||||
"StripeSize",
|
||||
"Placement",
|
||||
"Scheduling",
|
||||
]);
|
||||
for tenant in resp {
|
||||
let shard_zero = tenant.shards.into_iter().next().unwrap();
|
||||
table.add_row([
|
||||
format!("{}", tenant.tenant_id),
|
||||
format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
|
||||
format!("{:?}", tenant.stripe_size),
|
||||
format!("{:?}", tenant.policy),
|
||||
format!("{:?}", shard_zero.scheduling_policy),
|
||||
]);
|
||||
}
|
||||
|
||||
println!("{table}");
|
||||
}
|
||||
Command::TenantPolicy {
|
||||
tenant_id,
|
||||
placement,
|
||||
scheduling,
|
||||
} => {
|
||||
let req = TenantPolicyRequest {
|
||||
scheduling: scheduling.map(|s| s.0),
|
||||
placement: placement.map(|p| p.0),
|
||||
};
|
||||
storcon_client
|
||||
.dispatch::<_, ()>(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{tenant_id}/policy"),
|
||||
Some(req),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::TenantShardSplit {
|
||||
tenant_id,
|
||||
shard_count,
|
||||
stripe_size,
|
||||
} => {
|
||||
let req = TenantShardSplitRequest {
|
||||
new_shard_count: shard_count,
|
||||
new_stripe_size: stripe_size.map(ShardStripeSize),
|
||||
};
|
||||
|
||||
let response = storcon_client
|
||||
.dispatch::<TenantShardSplitRequest, TenantShardSplitResponse>(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{tenant_id}/shard_split"),
|
||||
Some(req),
|
||||
)
|
||||
.await?;
|
||||
println!(
|
||||
"Split tenant {} into {} shards: {}",
|
||||
tenant_id,
|
||||
shard_count,
|
||||
response
|
||||
.new_shards
|
||||
.iter()
|
||||
.map(|s| format!("{:?}", s))
|
||||
.collect::<Vec<_>>()
|
||||
.join(",")
|
||||
);
|
||||
}
|
||||
Command::TenantShardMigrate {
|
||||
tenant_shard_id,
|
||||
node,
|
||||
} => {
|
||||
let req = TenantShardMigrateRequest {
|
||||
tenant_shard_id,
|
||||
node_id: node,
|
||||
};
|
||||
|
||||
storcon_client
|
||||
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
|
||||
Some(req),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::TenantConfig { tenant_id, config } => {
|
||||
let tenant_conf = serde_json::from_str(&config)?;
|
||||
|
||||
vps_client
|
||||
.tenant_config(&TenantConfigRequest {
|
||||
tenant_id,
|
||||
config: tenant_conf,
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
Command::TenantScatter { tenant_id } => {
|
||||
// Find the shards
|
||||
let locate_response = storcon_client
|
||||
.dispatch::<(), TenantLocateResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/tenant/{tenant_id}/locate"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
let shards = locate_response.shards;
|
||||
|
||||
let mut node_to_shards: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
|
||||
let shard_count = shards.len();
|
||||
for s in shards {
|
||||
let entry = node_to_shards.entry(s.node_id).or_default();
|
||||
entry.push(s.shard_id);
|
||||
}
|
||||
|
||||
// Load list of available nodes
|
||||
let nodes_resp = storcon_client
|
||||
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/node".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
for node in nodes_resp {
|
||||
if matches!(node.availability, NodeAvailabilityWrapper::Active) {
|
||||
node_to_shards.entry(node.id).or_default();
|
||||
}
|
||||
}
|
||||
|
||||
let max_shard_per_node = shard_count / node_to_shards.len();
|
||||
|
||||
loop {
|
||||
let mut migrate_shard = None;
|
||||
for shards in node_to_shards.values_mut() {
|
||||
if shards.len() > max_shard_per_node {
|
||||
// Pick the emptiest
|
||||
migrate_shard = Some(shards.pop().unwrap());
|
||||
}
|
||||
}
|
||||
let Some(migrate_shard) = migrate_shard else {
|
||||
break;
|
||||
};
|
||||
|
||||
// Pick the emptiest node to migrate to
|
||||
let mut destinations = node_to_shards
|
||||
.iter()
|
||||
.map(|(k, v)| (k, v.len()))
|
||||
.collect::<Vec<_>>();
|
||||
destinations.sort_by_key(|i| i.1);
|
||||
let (destination_node, destination_count) = *destinations.first().unwrap();
|
||||
if destination_count + 1 > max_shard_per_node {
|
||||
// Even the emptiest destination doesn't have space: we're done
|
||||
break;
|
||||
}
|
||||
let destination_node = *destination_node;
|
||||
|
||||
node_to_shards
|
||||
.get_mut(&destination_node)
|
||||
.unwrap()
|
||||
.push(migrate_shard);
|
||||
|
||||
println!("Migrate {} -> {} ...", migrate_shard, destination_node);
|
||||
|
||||
storcon_client
|
||||
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{migrate_shard}/migrate"),
|
||||
Some(TenantShardMigrateRequest {
|
||||
tenant_shard_id: migrate_shard,
|
||||
node_id: destination_node,
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
println!("Migrate {} -> {} OK", migrate_shard, destination_node);
|
||||
}
|
||||
|
||||
// Spread the shards across the nodes
|
||||
}
|
||||
Command::TenantDescribe { tenant_id } => {
|
||||
let describe_response = storcon_client
|
||||
.dispatch::<(), TenantDescribeResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/tenant/{tenant_id}"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
let shards = describe_response.shards;
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
|
||||
for shard in shards {
|
||||
let secondary = shard
|
||||
.node_secondary
|
||||
.iter()
|
||||
.map(|n| format!("{}", n))
|
||||
.collect::<Vec<_>>()
|
||||
.join(",");
|
||||
|
||||
let mut status_parts = Vec::new();
|
||||
if shard.is_reconciling {
|
||||
status_parts.push("reconciling");
|
||||
}
|
||||
|
||||
if shard.is_pending_compute_notification {
|
||||
status_parts.push("pending_compute");
|
||||
}
|
||||
|
||||
if shard.is_splitting {
|
||||
status_parts.push("splitting");
|
||||
}
|
||||
let status = status_parts.join(",");
|
||||
|
||||
table.add_row([
|
||||
format!("{}", shard.tenant_shard_id),
|
||||
shard
|
||||
.node_attached
|
||||
.map(|n| format!("{}", n))
|
||||
.unwrap_or(String::new()),
|
||||
secondary,
|
||||
shard.last_error,
|
||||
status,
|
||||
]);
|
||||
}
|
||||
println!("{table}");
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -40,7 +40,7 @@ macro_rules! register_hll {
|
||||
}};
|
||||
|
||||
($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
|
||||
$crate::register_hll!($N, $crate::opts!($NAME, $HELP))
|
||||
$crate::register_hll!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
|
||||
}};
|
||||
}
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ use std::str::FromStr;
|
||||
/// API (`/control/v1` prefix). Implemented by the server
|
||||
/// in [`attachment_service::http`]
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::{
|
||||
models::{ShardParameters, TenantConfig},
|
||||
@@ -42,12 +42,6 @@ pub struct NodeConfigureRequest {
|
||||
pub scheduling: Option<NodeSchedulingPolicy>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantPolicyRequest {
|
||||
pub placement: Option<PlacementPolicy>,
|
||||
pub scheduling: Option<ShardSchedulingPolicy>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantLocateResponseShard {
|
||||
pub shard_id: TenantShardId,
|
||||
@@ -68,27 +62,12 @@ pub struct TenantLocateResponse {
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantDescribeResponse {
|
||||
pub tenant_id: TenantId,
|
||||
pub shards: Vec<TenantDescribeResponseShard>,
|
||||
pub stripe_size: ShardStripeSize,
|
||||
pub policy: PlacementPolicy,
|
||||
pub config: TenantConfig,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct NodeDescribeResponse {
|
||||
pub id: NodeId,
|
||||
|
||||
pub availability: NodeAvailabilityWrapper,
|
||||
pub scheduling: NodeSchedulingPolicy,
|
||||
|
||||
pub listen_http_addr: String,
|
||||
pub listen_http_port: u16,
|
||||
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_pg_port: u16,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantDescribeResponseShard {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
@@ -104,8 +83,6 @@ pub struct TenantDescribeResponseShard {
|
||||
pub is_pending_compute_notification: bool,
|
||||
/// A shard split is currently underway
|
||||
pub is_splitting: bool,
|
||||
|
||||
pub scheduling_policy: ShardSchedulingPolicy,
|
||||
}
|
||||
|
||||
/// Explicitly migrating a particular shard is a low level operation
|
||||
@@ -120,7 +97,7 @@ pub struct TenantShardMigrateRequest {
|
||||
/// Utilisation score indicating how good a candidate a pageserver
|
||||
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
|
||||
/// Lower values are better.
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
|
||||
pub struct UtilizationScore(pub u64);
|
||||
|
||||
impl UtilizationScore {
|
||||
@@ -129,7 +106,7 @@ impl UtilizationScore {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
|
||||
#[derive(Serialize, Clone, Copy)]
|
||||
#[serde(into = "NodeAvailabilityWrapper")]
|
||||
pub enum NodeAvailability {
|
||||
// Normal, happy state
|
||||
@@ -152,7 +129,7 @@ impl Eq for NodeAvailability {}
|
||||
// This wrapper provides serde functionality and it should only be used to
|
||||
// communicate with external callers which don't know or care about the
|
||||
// utilisation score of the pageserver it is targeting.
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub enum NodeAvailabilityWrapper {
|
||||
Active,
|
||||
Offline,
|
||||
@@ -178,33 +155,22 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
|
||||
pub enum ShardSchedulingPolicy {
|
||||
// Normal mode: the tenant's scheduled locations may be updated at will, including
|
||||
// for non-essential optimization.
|
||||
Active,
|
||||
impl FromStr for NodeAvailability {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
// Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy.
|
||||
// For example, this still permits a node's attachment location to change to a secondary in
|
||||
// response to a node failure, or to assign a new secondary if a node was removed.
|
||||
Essential,
|
||||
|
||||
// No scheduling: leave the shard running wherever it currently is. Even if the shard is
|
||||
// unavailable, it will not be rescheduled to another node.
|
||||
Pause,
|
||||
|
||||
// No reconciling: we will make no location_conf API calls to pageservers at all. If the
|
||||
// shard is unavailable, it stays that way. If a node fails, this shard doesn't get failed over.
|
||||
Stop,
|
||||
}
|
||||
|
||||
impl Default for ShardSchedulingPolicy {
|
||||
fn default() -> Self {
|
||||
Self::Active
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
// This is used when parsing node configuration requests from neon-local.
|
||||
// Assume the worst possible utilisation score
|
||||
// and let it get updated via the heartbeats.
|
||||
"active" => Ok(Self::Active(UtilizationScore::worst())),
|
||||
"offline" => Ok(Self::Offline),
|
||||
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
||||
pub enum NodeSchedulingPolicy {
|
||||
Active,
|
||||
Filling,
|
||||
|
||||
@@ -301,7 +301,6 @@ pub struct TenantConfig {
|
||||
pub heatmap_period: Option<String>,
|
||||
pub lazy_slru_download: Option<bool>,
|
||||
pub timeline_get_throttle: Option<ThrottleConfig>,
|
||||
pub image_layer_creation_check_threshold: Option<u8>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
|
||||
@@ -565,16 +565,6 @@ impl GenericRemoteStorage {
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct StorageMetadata(HashMap<String, String>);
|
||||
|
||||
impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
|
||||
fn from(arr: [(&str, &str); N]) -> Self {
|
||||
let map: HashMap<String, String> = arr
|
||||
.iter()
|
||||
.map(|(k, v)| (k.to_string(), v.to_string()))
|
||||
.collect();
|
||||
Self(map)
|
||||
}
|
||||
}
|
||||
|
||||
/// External backup storage configuration, enough for creating a client for that storage.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct RemoteStorageConfig {
|
||||
|
||||
@@ -182,18 +182,6 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`.
|
||||
pub fn would_wait_for(&self, num: V) -> Result<(), V> {
|
||||
let internal = self.internal.lock().unwrap();
|
||||
let cnt = internal.current.cnt_value();
|
||||
drop(internal);
|
||||
if cnt >= num {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(cnt)
|
||||
}
|
||||
}
|
||||
|
||||
/// Register and return a channel that will be notified when a number arrives,
|
||||
/// or None, if it has already arrived.
|
||||
fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
|
||||
|
||||
@@ -59,7 +59,6 @@ signal-hook.workspace = true
|
||||
smallvec = { workspace = true, features = ["write"] }
|
||||
svg_fmt.workspace = true
|
||||
sync_wrapper.workspace = true
|
||||
sysinfo.workspace = true
|
||||
tokio-tar.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
|
||||
|
||||
@@ -43,8 +43,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
|
||||
fanout: u64,
|
||||
ctx: &E::RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
assert!(fanout >= 1, "fanout needs to be at least 1 but is {fanout}");
|
||||
let exp_base = fanout.max(2);
|
||||
assert!(fanout >= 2);
|
||||
// Start at L0
|
||||
let mut current_level_no = 0;
|
||||
let mut current_level_target_height = target_file_size;
|
||||
@@ -107,7 +106,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
|
||||
break;
|
||||
}
|
||||
current_level_no += 1;
|
||||
current_level_target_height = current_level_target_height.saturating_mul(exp_base);
|
||||
current_level_target_height = current_level_target_height.saturating_mul(fanout);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -600,37 +600,33 @@ fn start_pageserver(
|
||||
None,
|
||||
"consumption metrics collection",
|
||||
true,
|
||||
{
|
||||
let tenant_manager = tenant_manager.clone();
|
||||
async move {
|
||||
// first wait until background jobs are cleared to launch.
|
||||
//
|
||||
// this is because we only process active tenants and timelines, and the
|
||||
// Timeline::get_current_logical_size will spawn the logical size calculation,
|
||||
// which will not be rate-limited.
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
async move {
|
||||
// first wait until background jobs are cleared to launch.
|
||||
//
|
||||
// this is because we only process active tenants and timelines, and the
|
||||
// Timeline::get_current_logical_size will spawn the logical size calculation,
|
||||
// which will not be rate-limited.
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()); },
|
||||
_ = background_jobs_barrier.wait() => {}
|
||||
};
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()); },
|
||||
_ = background_jobs_barrier.wait() => {}
|
||||
};
|
||||
|
||||
pageserver::consumption_metrics::collect_metrics(
|
||||
tenant_manager,
|
||||
metric_collection_endpoint,
|
||||
&conf.metric_collection_bucket,
|
||||
conf.metric_collection_interval,
|
||||
conf.cached_metric_collection_interval,
|
||||
conf.synthetic_size_calculation_interval,
|
||||
conf.id,
|
||||
local_disk_storage,
|
||||
cancel,
|
||||
metrics_ctx,
|
||||
)
|
||||
.instrument(info_span!("metrics_collection"))
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
pageserver::consumption_metrics::collect_metrics(
|
||||
metric_collection_endpoint,
|
||||
&conf.metric_collection_bucket,
|
||||
conf.metric_collection_interval,
|
||||
conf.cached_metric_collection_interval,
|
||||
conf.synthetic_size_calculation_interval,
|
||||
conf.id,
|
||||
local_disk_storage,
|
||||
cancel,
|
||||
metrics_ctx,
|
||||
)
|
||||
.instrument(info_span!("metrics_collection"))
|
||||
.await?;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
@@ -95,8 +95,6 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
|
||||
|
||||
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
///
|
||||
@@ -158,8 +156,6 @@ pub mod defaults {
|
||||
#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
|
||||
#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}
|
||||
|
||||
#ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB}
|
||||
|
||||
[remote_storage]
|
||||
|
||||
"#
|
||||
@@ -283,13 +279,6 @@ pub struct PageServerConf {
|
||||
pub max_vectored_read_bytes: MaxVectoredReadBytes,
|
||||
|
||||
pub validate_vectored_get: bool,
|
||||
|
||||
/// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this
|
||||
/// is exceeded, we start proactively closing ephemeral layers to limit the total amount
|
||||
/// of ephemeral data.
|
||||
///
|
||||
/// Setting this to zero disables limits on total ephemeral layer size.
|
||||
pub ephemeral_bytes_per_memory_kb: usize,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -411,8 +400,6 @@ struct PageServerConfigBuilder {
|
||||
max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
|
||||
|
||||
validate_vectored_get: BuilderValue<bool>,
|
||||
|
||||
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
|
||||
}
|
||||
|
||||
impl PageServerConfigBuilder {
|
||||
@@ -499,7 +486,6 @@ impl PageServerConfigBuilder {
|
||||
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
|
||||
)),
|
||||
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
|
||||
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -679,10 +665,6 @@ impl PageServerConfigBuilder {
|
||||
self.validate_vectored_get = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) {
|
||||
self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let default = Self::default_values();
|
||||
|
||||
@@ -738,7 +720,6 @@ impl PageServerConfigBuilder {
|
||||
get_vectored_impl,
|
||||
max_vectored_read_bytes,
|
||||
validate_vectored_get,
|
||||
ephemeral_bytes_per_memory_kb,
|
||||
}
|
||||
CUSTOM LOGIC
|
||||
{
|
||||
@@ -1029,9 +1010,6 @@ impl PageServerConf {
|
||||
"validate_vectored_get" => {
|
||||
builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
|
||||
}
|
||||
"ephemeral_bytes_per_memory_kb" => {
|
||||
builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
|
||||
}
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -1113,7 +1091,6 @@ impl PageServerConf {
|
||||
.expect("Invalid default constant"),
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1351,7 +1328,6 @@ background_task_maximum_delay = '334 s'
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1423,7 +1399,6 @@ background_task_maximum_delay = '334 s'
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
|
||||
@@ -3,9 +3,7 @@
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::tasks::BackgroundLoopKind;
|
||||
use crate::tenant::{
|
||||
mgr::TenantManager, LogicalSizeCalculationCause, PageReconstructError, Tenant,
|
||||
};
|
||||
use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant};
|
||||
use camino::Utf8PathBuf;
|
||||
use consumption_metrics::EventType;
|
||||
use pageserver_api::models::TenantState;
|
||||
@@ -43,7 +41,6 @@ type Cache = HashMap<MetricsKey, (EventType, u64)>;
|
||||
/// Main thread that serves metrics collection
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn collect_metrics(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
metric_collection_endpoint: &Url,
|
||||
metric_collection_bucket: &Option<RemoteStorageConfig>,
|
||||
metric_collection_interval: Duration,
|
||||
@@ -70,19 +67,15 @@ pub async fn collect_metrics(
|
||||
None,
|
||||
"synthetic size calculation",
|
||||
false,
|
||||
{
|
||||
let tenant_manager = tenant_manager.clone();
|
||||
async move {
|
||||
calculate_synthetic_size_worker(
|
||||
tenant_manager,
|
||||
synthetic_size_calculation_interval,
|
||||
&cancel,
|
||||
&worker_ctx,
|
||||
)
|
||||
.instrument(info_span!("synthetic_size_worker"))
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
async move {
|
||||
calculate_synthetic_size_worker(
|
||||
synthetic_size_calculation_interval,
|
||||
&cancel,
|
||||
&worker_ctx,
|
||||
)
|
||||
.instrument(info_span!("synthetic_size_worker"))
|
||||
.await?;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
|
||||
@@ -123,7 +116,7 @@ pub async fn collect_metrics(
|
||||
let started_at = Instant::now();
|
||||
|
||||
// these are point in time, with variable "now"
|
||||
let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await;
|
||||
let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;
|
||||
|
||||
let metrics = Arc::new(metrics);
|
||||
|
||||
@@ -278,7 +271,6 @@ async fn reschedule(
|
||||
|
||||
/// Calculate synthetic size for each active tenant
|
||||
async fn calculate_synthetic_size_worker(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
synthetic_size_calculation_interval: Duration,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
@@ -291,7 +283,7 @@ async fn calculate_synthetic_size_worker(
|
||||
loop {
|
||||
let started_at = Instant::now();
|
||||
|
||||
let tenants = match tenant_manager.list_tenants() {
|
||||
let tenants = match mgr::list_tenants().await {
|
||||
Ok(tenants) => tenants,
|
||||
Err(e) => {
|
||||
warn!("cannot get tenant list: {e:#}");
|
||||
@@ -310,14 +302,10 @@ async fn calculate_synthetic_size_worker(
|
||||
continue;
|
||||
}
|
||||
|
||||
let Ok(tenant) = tenant_manager.get_attached_tenant_shard(tenant_shard_id) else {
|
||||
let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else {
|
||||
continue;
|
||||
};
|
||||
|
||||
if !tenant.is_active() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// there is never any reason to exit calculate_synthetic_size_worker following any
|
||||
// return value -- we don't need to care about shutdown because no tenant is found when
|
||||
// pageserver is shut down.
|
||||
@@ -355,7 +343,9 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
|
||||
};
|
||||
|
||||
// this error can be returned if timeline is shutting down, but it does not
|
||||
// mean the synthetic size worker should terminate.
|
||||
// mean the synthetic size worker should terminate. we do not need any checks
|
||||
// in this function because `mgr::get_tenant` will error out after shutdown has
|
||||
// progressed to shutting down tenants.
|
||||
let shutting_down = matches!(
|
||||
e.downcast_ref::<PageReconstructError>(),
|
||||
Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use crate::tenant::mgr::TenantManager;
|
||||
use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize};
|
||||
use chrono::{DateTime, Utc};
|
||||
use consumption_metrics::EventType;
|
||||
@@ -182,7 +181,6 @@ impl MetricsKey {
|
||||
}
|
||||
|
||||
pub(super) async fn collect_all_metrics(
|
||||
tenant_manager: &Arc<TenantManager>,
|
||||
cached_metrics: &Cache,
|
||||
ctx: &RequestContext,
|
||||
) -> Vec<RawMetric> {
|
||||
@@ -190,7 +188,7 @@ pub(super) async fn collect_all_metrics(
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
let tenants = match tenant_manager.list_tenants() {
|
||||
let tenants = match crate::tenant::mgr::list_tenants().await {
|
||||
Ok(tenants) => tenants,
|
||||
Err(err) => {
|
||||
tracing::error!("failed to list tenants: {:?}", err);
|
||||
@@ -202,8 +200,7 @@ pub(super) async fn collect_all_metrics(
|
||||
if state != TenantState::Active || !id.is_zero() {
|
||||
None
|
||||
} else {
|
||||
tenant_manager
|
||||
.get_attached_tenant_shard(id)
|
||||
crate::tenant::mgr::get_tenant(id, true)
|
||||
.ok()
|
||||
.map(|tenant| (id.tenant_id, tenant))
|
||||
}
|
||||
|
||||
@@ -61,6 +61,7 @@ use crate::{
|
||||
metrics::disk_usage_based_eviction::METRICS,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
tenant::{
|
||||
self,
|
||||
mgr::TenantManager,
|
||||
remote_timeline_client::LayerFileMetadata,
|
||||
secondary::SecondaryTenant,
|
||||
@@ -813,8 +814,8 @@ async fn collect_eviction_candidates(
|
||||
const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);
|
||||
|
||||
// get a snapshot of the list of tenants
|
||||
let tenants = tenant_manager
|
||||
.list_tenants()
|
||||
let tenants = tenant::mgr::list_tenants()
|
||||
.await
|
||||
.context("get list of tenants")?;
|
||||
|
||||
// TODO: avoid listing every layer in every tenant: this loop can block the executor,
|
||||
@@ -826,12 +827,8 @@ async fn collect_eviction_candidates(
|
||||
if cancel.is_cancelled() {
|
||||
return Ok(EvictionCandidates::Cancelled);
|
||||
}
|
||||
let tenant = match tenant_manager.get_attached_tenant_shard(tenant_id) {
|
||||
Ok(tenant) if tenant.is_active() => tenant,
|
||||
Ok(_) => {
|
||||
debug!(tenant_id=%tenant_id.tenant_id, shard_id=%tenant_id.shard_slug(), "Tenant shard is not active");
|
||||
continue;
|
||||
}
|
||||
let tenant = match tenant::mgr::get_tenant(tenant_id, true) {
|
||||
Ok(tenant) => tenant,
|
||||
Err(e) => {
|
||||
// this can happen if tenant has lifecycle transition after we fetched it
|
||||
debug!("failed to get tenant: {e:#}");
|
||||
|
||||
@@ -1038,7 +1038,7 @@ paths:
|
||||
format: hex
|
||||
responses:
|
||||
"201":
|
||||
description: Timeline was created, or already existed with matching parameters
|
||||
description: TimelineInfo
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
@@ -1068,17 +1068,11 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"409":
|
||||
description: Timeline already exists, with different parameters. Creation cannot proceed.
|
||||
description: Timeline already exists, creation skipped
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
"429":
|
||||
description: A creation request was sent for the same Timeline Id while a creation was already in progress. Back off and retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
|
||||
@@ -49,8 +49,8 @@ use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::{LocationConf, TenantConfOpt};
|
||||
use crate::tenant::mgr::GetActiveTenantError;
|
||||
use crate::tenant::mgr::{
|
||||
GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlotError,
|
||||
TenantSlotUpsertError, TenantStateError,
|
||||
GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
|
||||
TenantSlotError, TenantSlotUpsertError, TenantStateError,
|
||||
};
|
||||
use crate::tenant::mgr::{TenantSlot, UpsertLocationError};
|
||||
use crate::tenant::remote_timeline_client;
|
||||
@@ -249,11 +249,16 @@ impl From<GetTenantError> for ApiError {
|
||||
fn from(tse: GetTenantError) -> ApiError {
|
||||
match tse {
|
||||
GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
|
||||
GetTenantError::Broken(reason) => {
|
||||
ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
|
||||
}
|
||||
GetTenantError::NotActive(_) => {
|
||||
// Why is this not `ApiError::NotFound`?
|
||||
// Because we must be careful to never return 404 for a tenant if it does
|
||||
// in fact exist locally. If we did, the caller could draw the conclusion
|
||||
// that it can attach the tenant to another PS and we'd be in split-brain.
|
||||
//
|
||||
// (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
|
||||
ApiError::ResourceUnavailable("Tenant not yet active".into())
|
||||
}
|
||||
GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()),
|
||||
@@ -264,9 +269,6 @@ impl From<GetTenantError> for ApiError {
|
||||
impl From<GetActiveTenantError> for ApiError {
|
||||
fn from(e: GetActiveTenantError) -> ApiError {
|
||||
match e {
|
||||
GetActiveTenantError::Broken(reason) => {
|
||||
ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
|
||||
}
|
||||
GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)),
|
||||
GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
|
||||
GetActiveTenantError::NotFound(gte) => gte.into(),
|
||||
@@ -277,6 +279,19 @@ impl From<GetActiveTenantError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<SetNewTenantConfigError> for ApiError {
|
||||
fn from(e: SetNewTenantConfigError) -> ApiError {
|
||||
match e {
|
||||
SetNewTenantConfigError::GetTenant(tid) => {
|
||||
ApiError::NotFound(anyhow!("tenant {}", tid).into())
|
||||
}
|
||||
e @ (SetNewTenantConfigError::Persist(_) | SetNewTenantConfigError::Other(_)) => {
|
||||
ApiError::InternalServerError(anyhow::Error::new(e))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::tenant::DeleteTimelineError> for ApiError {
|
||||
fn from(value: crate::tenant::DeleteTimelineError) -> Self {
|
||||
use crate::tenant::DeleteTimelineError::*;
|
||||
@@ -480,7 +495,7 @@ async fn timeline_create_handler(
|
||||
async {
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
.get_attached_tenant_shard(tenant_shard_id, false)?;
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
@@ -520,13 +535,10 @@ async fn timeline_create_handler(
|
||||
HttpErrorBody::from_msg("Tenant shutting down".to_string()),
|
||||
)
|
||||
}
|
||||
Err(e @ tenant::CreateTimelineError::Conflict) => {
|
||||
json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string()))
|
||||
}
|
||||
Err(e @ tenant::CreateTimelineError::AlreadyCreating) => json_response(
|
||||
StatusCode::TOO_MANY_REQUESTS,
|
||||
HttpErrorBody::from_msg(e.to_string()),
|
||||
),
|
||||
Err(
|
||||
e @ tenant::CreateTimelineError::Conflict
|
||||
| e @ tenant::CreateTimelineError::AlreadyCreating,
|
||||
) => json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string())),
|
||||
Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response(
|
||||
StatusCode::NOT_ACCEPTABLE,
|
||||
HttpErrorBody::from_msg(format!("{err:#}")),
|
||||
@@ -569,7 +581,7 @@ async fn timeline_list_handler(
|
||||
let response_data = async {
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
.get_attached_tenant_shard(tenant_shard_id, false)?;
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
@@ -607,7 +619,6 @@ async fn timeline_preserve_initdb_handler(
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
// Part of the process for disaster recovery from safekeeper-stored WAL:
|
||||
// If we don't recover into a new timeline but want to keep the timeline ID,
|
||||
@@ -615,9 +626,7 @@ async fn timeline_preserve_initdb_handler(
|
||||
// location where timeline recreation can find it.
|
||||
|
||||
async {
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
@@ -659,7 +668,7 @@ async fn timeline_detail_handler(
|
||||
let timeline_info = async {
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
.get_attached_tenant_shard(tenant_shard_id, false)?;
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
@@ -846,7 +855,7 @@ async fn timeline_delete_handler(
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)
|
||||
.get_attached_tenant_shard(tenant_shard_id, false)
|
||||
.map_err(|e| {
|
||||
match e {
|
||||
// GetTenantError has a built-in conversion to ApiError, but in this context we don't
|
||||
@@ -964,11 +973,10 @@ async fn tenant_list_handler(
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
let state = get_state(&request);
|
||||
|
||||
let response_data = state
|
||||
.tenant_manager
|
||||
.list_tenants()
|
||||
let response_data = mgr::list_tenants()
|
||||
.instrument(info_span!("tenant_list"))
|
||||
.await
|
||||
.map_err(|_| {
|
||||
ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
|
||||
})?
|
||||
@@ -991,27 +999,9 @@ async fn tenant_status(
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
// In tests, sometimes we want to query the state of a tenant without auto-activating it if it's currently waiting.
|
||||
let activate = true;
|
||||
#[cfg(feature = "testing")]
|
||||
let activate = parse_query_param(&request, "activate")?.unwrap_or(activate);
|
||||
|
||||
let tenant_info = async {
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
|
||||
if activate {
|
||||
// This is advisory: we prefer to let the tenant activate on-demand when this function is
|
||||
// called, but it is still valid to return 200 and describe the current state of the tenant
|
||||
// if it doesn't make it into an active state.
|
||||
tenant
|
||||
.wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
|
||||
.await
|
||||
.ok();
|
||||
}
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
|
||||
|
||||
// Calculate total physical size of all timelines
|
||||
let mut current_physical_size = 0;
|
||||
@@ -1084,7 +1074,9 @@ async fn tenant_size_handler(
|
||||
let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
|
||||
let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
|
||||
let headers = request.headers();
|
||||
let state = get_state(&request);
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
if !tenant_shard_id.is_zero() {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
@@ -1092,12 +1084,6 @@ async fn tenant_size_handler(
|
||||
)));
|
||||
}
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
// this can be long operation
|
||||
let inputs = tenant
|
||||
.gather_size_inputs(
|
||||
@@ -1166,15 +1152,10 @@ async fn tenant_shard_split_handler(
|
||||
let state = get_state(&request);
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
let new_shards = state
|
||||
.tenant_manager
|
||||
.shard_split(
|
||||
tenant,
|
||||
tenant_shard_id,
|
||||
ShardCount::new(req.new_shard_count),
|
||||
req.new_stripe_size,
|
||||
&ctx,
|
||||
@@ -1392,11 +1373,8 @@ async fn get_tenant_config_handler(
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
|
||||
|
||||
let response = HashMap::from([
|
||||
(
|
||||
@@ -1424,31 +1402,15 @@ async fn update_tenant_config_handler(
|
||||
let tenant_id = request_data.tenant_id;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let new_tenant_conf =
|
||||
let tenant_conf =
|
||||
TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
|
||||
let tenant = state
|
||||
state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
// This is a legacy API that only operates on attached tenants: the preferred
|
||||
// API to use is the location_config/ endpoint, which lets the caller provide
|
||||
// the full LocationConf.
|
||||
let location_conf = LocationConf::attached_single(
|
||||
new_tenant_conf.clone(),
|
||||
tenant.get_generation(),
|
||||
&ShardParameters::default(),
|
||||
);
|
||||
|
||||
crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
tenant.set_new_tenant_config(new_tenant_conf);
|
||||
.set_new_tenant_config(tenant_conf, tenant_id)
|
||||
.instrument(info_span!("tenant_config", %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -1672,12 +1634,10 @@ async fn handle_tenant_break(
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
|
||||
|
||||
let state = get_state(&r);
|
||||
state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?
|
||||
.set_broken("broken from test".to_owned())
|
||||
.await;
|
||||
let tenant = crate::tenant::mgr::get_tenant(tenant_shard_id, true)
|
||||
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
|
||||
|
||||
tenant.set_broken("broken from test".to_owned()).await;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -1921,7 +1881,7 @@ async fn active_timeline_of_active_tenant(
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<Arc<Timeline>, ApiError> {
|
||||
let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
|
||||
@@ -1483,18 +1483,12 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
|
||||
});
|
||||
|
||||
pub(crate) struct WalIngestMetrics {
|
||||
pub(crate) bytes_received: IntCounter,
|
||||
pub(crate) records_received: IntCounter,
|
||||
pub(crate) records_committed: IntCounter,
|
||||
pub(crate) records_filtered: IntCounter,
|
||||
}
|
||||
|
||||
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
|
||||
bytes_received: register_int_counter!(
|
||||
"pageserver_wal_ingest_bytes_received",
|
||||
"Bytes of WAL ingested from safekeepers",
|
||||
)
|
||||
.unwrap(),
|
||||
records_received: register_int_counter!(
|
||||
"pageserver_wal_ingest_records_received",
|
||||
"Number of WAL records received from safekeepers"
|
||||
|
||||
@@ -760,7 +760,6 @@ impl PageServerHandler {
|
||||
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
|
||||
timeline
|
||||
.import_basebackup_from_tar(
|
||||
tenant.clone(),
|
||||
&mut copyin_reader,
|
||||
base_lsn,
|
||||
self.broker_client.clone(),
|
||||
@@ -876,13 +875,7 @@ impl PageServerHandler {
|
||||
if lsn <= last_record_lsn {
|
||||
lsn = last_record_lsn;
|
||||
} else {
|
||||
timeline
|
||||
.wait_lsn(
|
||||
lsn,
|
||||
crate::tenant::timeline::WaitLsnWaiter::PageService,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
timeline.wait_lsn(lsn, ctx).await?;
|
||||
// Since we waited for 'lsn' to arrive, that is now the last
|
||||
// record LSN. (Or close enough for our purposes; the
|
||||
// last-record LSN can advance immediately after we return
|
||||
@@ -894,13 +887,7 @@ impl PageServerHandler {
|
||||
"invalid LSN(0) in request".into(),
|
||||
));
|
||||
}
|
||||
timeline
|
||||
.wait_lsn(
|
||||
lsn,
|
||||
crate::tenant::timeline::WaitLsnWaiter::PageService,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
timeline.wait_lsn(lsn, ctx).await?;
|
||||
}
|
||||
|
||||
if lsn < **latest_gc_cutoff_lsn {
|
||||
@@ -1227,13 +1214,7 @@ impl PageServerHandler {
|
||||
if let Some(lsn) = lsn {
|
||||
// Backup was requested at a particular LSN. Wait for it to arrive.
|
||||
info!("waiting for {}", lsn);
|
||||
timeline
|
||||
.wait_lsn(
|
||||
lsn,
|
||||
crate::tenant::timeline::WaitLsnWaiter::PageService,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
timeline.wait_lsn(lsn, ctx).await?;
|
||||
timeline
|
||||
.check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
|
||||
.context("invalid basebackup lsn")?;
|
||||
|
||||
@@ -214,12 +214,13 @@ pub enum TaskKind {
|
||||
/// Internally, `Client` hands over requests to the `Connection` object.
|
||||
/// The `Connection` object is responsible for speaking the wire protocol.
|
||||
///
|
||||
/// Walreceiver uses a legacy abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
|
||||
/// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
|
||||
/// That abstraction doesn't use `task_mgr`.
|
||||
/// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task.
|
||||
/// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind.
|
||||
///
|
||||
/// Once the connection is established, the `TaskHandle` task spawns a
|
||||
/// [`WalReceiverConnectionPoller`] task that is responsible for polling
|
||||
/// Once the connection is established, the `TaskHandle` task creates a
|
||||
/// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling
|
||||
/// the `Connection` object.
|
||||
/// A `CancellationToken` created by the `TaskHandle` task ensures
|
||||
/// that the [`WalReceiverConnectionPoller`] task will cancel soon after the `TaskHandle` is dropped.
|
||||
@@ -229,6 +230,7 @@ pub enum TaskKind {
|
||||
WalReceiverManager,
|
||||
|
||||
/// The `TaskHandle` task that executes `handle_walreceiver_connection`.
|
||||
/// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`.
|
||||
/// See the comment on [`WalReceiverManager`].
|
||||
///
|
||||
/// [`WalReceiverManager`]: Self::WalReceiverManager
|
||||
|
||||
@@ -12,7 +12,6 @@
|
||||
//!
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use arc_swap::ArcSwap;
|
||||
use camino::Utf8Path;
|
||||
use camino::Utf8PathBuf;
|
||||
use enumset::EnumSet;
|
||||
@@ -99,7 +98,7 @@ use std::ops::Bound::Included;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
use std::sync::{Mutex, RwLock};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use crate::span;
|
||||
@@ -261,7 +260,7 @@ pub struct Tenant {
|
||||
// We keep TenantConfOpt struct here to preserve the information
|
||||
// about parameters that are not set.
|
||||
// This is necessary to allow global config updates.
|
||||
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
|
||||
tenant_conf: Arc<RwLock<AttachedTenantConf>>,
|
||||
|
||||
tenant_shard_id: TenantShardId,
|
||||
|
||||
@@ -1412,7 +1411,7 @@ impl Tenant {
|
||||
/// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) async fn create_timeline(
|
||||
self: &Arc<Tenant>,
|
||||
&self,
|
||||
new_timeline_id: TimelineId,
|
||||
ancestor_timeline_id: Option<TimelineId>,
|
||||
mut ancestor_start_lsn: Option<Lsn>,
|
||||
@@ -1516,7 +1515,7 @@ impl Tenant {
|
||||
// sizes etc. and that would get confused if the previous page versions
|
||||
// are not in the repository yet.
|
||||
ancestor_timeline
|
||||
.wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx)
|
||||
.wait_lsn(*lsn, ctx)
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
|
||||
@@ -1560,7 +1559,7 @@ impl Tenant {
|
||||
})?;
|
||||
}
|
||||
|
||||
loaded_timeline.activate(self.clone(), broker_client, None, ctx);
|
||||
loaded_timeline.activate(broker_client, None, ctx);
|
||||
|
||||
Ok(loaded_timeline)
|
||||
}
|
||||
@@ -1607,7 +1606,7 @@ impl Tenant {
|
||||
);
|
||||
|
||||
{
|
||||
let conf = self.tenant_conf.load();
|
||||
let conf = self.tenant_conf.read().unwrap();
|
||||
|
||||
if !conf.location.may_delete_layers_hint() {
|
||||
info!("Skipping GC in location state {:?}", conf.location);
|
||||
@@ -1634,7 +1633,7 @@ impl Tenant {
|
||||
}
|
||||
|
||||
{
|
||||
let conf = self.tenant_conf.load();
|
||||
let conf = self.tenant_conf.read().unwrap();
|
||||
if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
|
||||
info!("Skipping compaction in location state {:?}", conf.location);
|
||||
return Ok(());
|
||||
@@ -1732,12 +1731,7 @@ impl Tenant {
|
||||
let mut activated_timelines = 0;
|
||||
|
||||
for timeline in timelines_to_activate {
|
||||
timeline.activate(
|
||||
self.clone(),
|
||||
broker_client.clone(),
|
||||
background_jobs_can_start,
|
||||
ctx,
|
||||
);
|
||||
timeline.activate(broker_client.clone(), background_jobs_can_start, ctx);
|
||||
activated_timelines += 1;
|
||||
}
|
||||
|
||||
@@ -1783,7 +1777,7 @@ impl Tenant {
|
||||
async fn shutdown(
|
||||
&self,
|
||||
shutdown_progress: completion::Barrier,
|
||||
shutdown_mode: timeline::ShutdownMode,
|
||||
freeze_and_flush: bool,
|
||||
) -> Result<(), completion::Barrier> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
@@ -1830,8 +1824,16 @@ impl Tenant {
|
||||
timelines.values().for_each(|timeline| {
|
||||
let timeline = Arc::clone(timeline);
|
||||
let timeline_id = timeline.timeline_id;
|
||||
let span = tracing::info_span!("timeline_shutdown", %timeline_id, ?shutdown_mode);
|
||||
js.spawn(async move { timeline.shutdown(shutdown_mode).instrument(span).await });
|
||||
|
||||
let span =
|
||||
tracing::info_span!("timeline_shutdown", %timeline_id, ?freeze_and_flush);
|
||||
js.spawn(async move {
|
||||
if freeze_and_flush {
|
||||
timeline.flush_and_shutdown().instrument(span).await
|
||||
} else {
|
||||
timeline.shutdown().instrument(span).await
|
||||
}
|
||||
});
|
||||
})
|
||||
};
|
||||
// test_long_timeline_create_then_tenant_delete is leaning on this message
|
||||
@@ -2061,12 +2063,7 @@ impl Tenant {
|
||||
TenantState::Active { .. } => {
|
||||
return Ok(());
|
||||
}
|
||||
TenantState::Broken { reason, .. } => {
|
||||
// This is fatal, and reported distinctly from the general case of "will never be active" because
|
||||
// it's logically a 500 to external API users (broken is always a bug).
|
||||
return Err(GetActiveTenantError::Broken(reason));
|
||||
}
|
||||
TenantState::Stopping { .. } => {
|
||||
TenantState::Broken { .. } | TenantState::Stopping { .. } => {
|
||||
// There's no chance the tenant can transition back into ::Active
|
||||
return Err(GetActiveTenantError::WillNotBecomeActive(current_state));
|
||||
}
|
||||
@@ -2075,14 +2072,14 @@ impl Tenant {
|
||||
}
|
||||
|
||||
pub(crate) fn get_attach_mode(&self) -> AttachmentMode {
|
||||
self.tenant_conf.load().location.attach_mode
|
||||
self.tenant_conf.read().unwrap().location.attach_mode
|
||||
}
|
||||
|
||||
/// For API access: generate a LocationConfig equivalent to the one that would be used to
|
||||
/// create a Tenant in the same state. Do not use this in hot paths: it's for relatively
|
||||
/// rare external API calls, like a reconciliation at startup.
|
||||
pub(crate) fn get_location_conf(&self) -> models::LocationConfig {
|
||||
let conf = self.tenant_conf.load();
|
||||
let conf = self.tenant_conf.read().unwrap();
|
||||
|
||||
let location_config_mode = match conf.location.attach_mode {
|
||||
AttachmentMode::Single => models::LocationConfigMode::AttachedSingle,
|
||||
@@ -2229,7 +2226,7 @@ where
|
||||
|
||||
impl Tenant {
|
||||
pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
|
||||
self.tenant_conf.load().tenant_conf.clone()
|
||||
self.tenant_conf.read().unwrap().tenant_conf.clone()
|
||||
}
|
||||
|
||||
pub fn effective_config(&self) -> TenantConf {
|
||||
@@ -2238,84 +2235,84 @@ impl Tenant {
|
||||
}
|
||||
|
||||
pub fn get_checkpoint_distance(&self) -> u64 {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.checkpoint_distance
|
||||
.unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
|
||||
}
|
||||
|
||||
pub fn get_checkpoint_timeout(&self) -> Duration {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.checkpoint_timeout
|
||||
.unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
|
||||
}
|
||||
|
||||
pub fn get_compaction_target_size(&self) -> u64 {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.compaction_target_size
|
||||
.unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
|
||||
}
|
||||
|
||||
pub fn get_compaction_period(&self) -> Duration {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.compaction_period
|
||||
.unwrap_or(self.conf.default_tenant_conf.compaction_period)
|
||||
}
|
||||
|
||||
pub fn get_compaction_threshold(&self) -> usize {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.compaction_threshold
|
||||
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
|
||||
}
|
||||
|
||||
pub fn get_gc_horizon(&self) -> u64 {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.gc_horizon
|
||||
.unwrap_or(self.conf.default_tenant_conf.gc_horizon)
|
||||
}
|
||||
|
||||
pub fn get_gc_period(&self) -> Duration {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.gc_period
|
||||
.unwrap_or(self.conf.default_tenant_conf.gc_period)
|
||||
}
|
||||
|
||||
pub fn get_image_creation_threshold(&self) -> usize {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.image_creation_threshold
|
||||
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
|
||||
}
|
||||
|
||||
pub fn get_pitr_interval(&self) -> Duration {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.pitr_interval
|
||||
.unwrap_or(self.conf.default_tenant_conf.pitr_interval)
|
||||
}
|
||||
|
||||
pub fn get_trace_read_requests(&self) -> bool {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.trace_read_requests
|
||||
.unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
|
||||
}
|
||||
|
||||
pub fn get_min_resident_size_override(&self) -> Option<u64> {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.min_resident_size_override
|
||||
.or(self.conf.default_tenant_conf.min_resident_size_override)
|
||||
}
|
||||
|
||||
pub fn get_heatmap_period(&self) -> Option<Duration> {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
let heatmap_period = tenant_conf
|
||||
.heatmap_period
|
||||
.unwrap_or(self.conf.default_tenant_conf.heatmap_period);
|
||||
@@ -2327,40 +2324,26 @@ impl Tenant {
|
||||
}
|
||||
|
||||
pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
|
||||
// Use read-copy-update in order to avoid overwriting the location config
|
||||
// state if this races with [`Tenant::set_new_location_config`]. Note that
|
||||
// this race is not possible if both request types come from the storage
|
||||
// controller (as they should!) because an exclusive op lock is required
|
||||
// on the storage controller side.
|
||||
self.tenant_conf.rcu(|inner| {
|
||||
Arc::new(AttachedTenantConf {
|
||||
tenant_conf: new_tenant_conf.clone(),
|
||||
location: inner.location,
|
||||
})
|
||||
});
|
||||
|
||||
self.tenant_conf_updated(&new_tenant_conf);
|
||||
self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf;
|
||||
self.tenant_conf_updated();
|
||||
// Don't hold self.timelines.lock() during the notifies.
|
||||
// There's no risk of deadlock right now, but there could be if we consolidate
|
||||
// mutexes in struct Timeline in the future.
|
||||
let timelines = self.list_timelines();
|
||||
for timeline in timelines {
|
||||
timeline.tenant_conf_updated(&new_tenant_conf);
|
||||
timeline.tenant_conf_updated();
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
|
||||
let new_tenant_conf = new_conf.tenant_conf.clone();
|
||||
|
||||
self.tenant_conf.store(Arc::new(new_conf));
|
||||
|
||||
self.tenant_conf_updated(&new_tenant_conf);
|
||||
*self.tenant_conf.write().unwrap() = new_conf;
|
||||
self.tenant_conf_updated();
|
||||
// Don't hold self.timelines.lock() during the notifies.
|
||||
// There's no risk of deadlock right now, but there could be if we consolidate
|
||||
// mutexes in struct Timeline in the future.
|
||||
let timelines = self.list_timelines();
|
||||
for timeline in timelines {
|
||||
timeline.tenant_conf_updated(&new_tenant_conf);
|
||||
timeline.tenant_conf_updated();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2374,8 +2357,11 @@ impl Tenant {
|
||||
.unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone())
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
|
||||
let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf);
|
||||
pub(crate) fn tenant_conf_updated(&self) {
|
||||
let conf = {
|
||||
let guard = self.tenant_conf.read().unwrap();
|
||||
Self::get_timeline_get_throttle_config(self.conf, &guard.tenant_conf)
|
||||
};
|
||||
self.timeline_get_throttle.reconfigure(conf)
|
||||
}
|
||||
|
||||
@@ -2523,7 +2509,7 @@ impl Tenant {
|
||||
Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
|
||||
&crate::metrics::tenant_throttling::TIMELINE_GET,
|
||||
)),
|
||||
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
|
||||
tenant_conf: Arc::new(RwLock::new(attached_conf)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3509,7 +3495,7 @@ impl Tenant {
|
||||
}
|
||||
|
||||
pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
|
||||
self.tenant_conf.load().tenant_conf.clone()
|
||||
self.tenant_conf.read().unwrap().tenant_conf.clone()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3657,9 +3643,6 @@ pub(crate) mod harness {
|
||||
heatmap_period: Some(tenant_conf.heatmap_period),
|
||||
lazy_slru_download: Some(tenant_conf.lazy_slru_download),
|
||||
timeline_get_throttle: Some(tenant_conf.timeline_get_throttle),
|
||||
image_layer_creation_check_threshold: Some(
|
||||
tenant_conf.image_layer_creation_check_threshold,
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3858,7 +3841,6 @@ mod tests {
|
||||
use hex_literal::hex;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use rand::{thread_rng, Rng};
|
||||
use tests::timeline::ShutdownMode;
|
||||
|
||||
static TEST_KEY: Lazy<Key> =
|
||||
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
|
||||
@@ -4304,7 +4286,7 @@ mod tests {
|
||||
make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?;
|
||||
// so that all uploads finish & we can call harness.load() below again
|
||||
tenant
|
||||
.shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
|
||||
.shutdown(Default::default(), true)
|
||||
.instrument(harness.span())
|
||||
.await
|
||||
.ok()
|
||||
@@ -4345,7 +4327,7 @@ mod tests {
|
||||
|
||||
// so that all uploads finish & we can call harness.load() below again
|
||||
tenant
|
||||
.shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
|
||||
.shutdown(Default::default(), true)
|
||||
.instrument(harness.span())
|
||||
.await
|
||||
.ok()
|
||||
@@ -5126,7 +5108,7 @@ mod tests {
|
||||
// Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
|
||||
let raw_tline = tline.raw_timeline().unwrap();
|
||||
raw_tline
|
||||
.shutdown(super::timeline::ShutdownMode::Hard)
|
||||
.shutdown()
|
||||
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID))
|
||||
.await;
|
||||
std::mem::forget(tline);
|
||||
|
||||
@@ -57,9 +57,6 @@ pub mod defaults {
|
||||
// throughputs up to 1GiB/s per timeline.
|
||||
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
|
||||
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
|
||||
// By default ingest enough WAL for two new L0 layers before checking if new image
|
||||
// layers should be created.
|
||||
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
|
||||
|
||||
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
||||
}
|
||||
@@ -365,10 +362,6 @@ pub struct TenantConf {
|
||||
pub lazy_slru_download: bool,
|
||||
|
||||
pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
|
||||
|
||||
// How much WAL must be ingested before checking again whether a new image layer is required.
|
||||
// Expressed in multiples of checkpoint distance.
|
||||
pub image_layer_creation_check_threshold: u8,
|
||||
}
|
||||
|
||||
/// Same as TenantConf, but this struct preserves the information about
|
||||
@@ -461,9 +454,6 @@ pub struct TenantConfOpt {
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub timeline_get_throttle: Option<pageserver_api::models::ThrottleConfig>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub image_layer_creation_check_threshold: Option<u8>,
|
||||
}
|
||||
|
||||
impl TenantConfOpt {
|
||||
@@ -518,9 +508,6 @@ impl TenantConfOpt {
|
||||
.timeline_get_throttle
|
||||
.clone()
|
||||
.unwrap_or(global_conf.timeline_get_throttle),
|
||||
image_layer_creation_check_threshold: self
|
||||
.image_layer_creation_check_threshold
|
||||
.unwrap_or(global_conf.image_layer_creation_check_threshold),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -561,7 +548,6 @@ impl Default for TenantConf {
|
||||
heatmap_period: Duration::ZERO,
|
||||
lazy_slru_download: false,
|
||||
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
|
||||
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -635,7 +621,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
|
||||
heatmap_period: value.heatmap_period.map(humantime),
|
||||
lazy_slru_download: value.lazy_slru_download,
|
||||
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
|
||||
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,10 +14,7 @@ use crate::{
|
||||
config::PageServerConf,
|
||||
context::RequestContext,
|
||||
task_mgr::{self, TaskKind},
|
||||
tenant::{
|
||||
mgr::{TenantSlot, TenantsMapRemoveResult},
|
||||
timeline::ShutdownMode,
|
||||
},
|
||||
tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
|
||||
};
|
||||
|
||||
use super::{
|
||||
@@ -466,7 +463,7 @@ impl DeleteTenantFlow {
|
||||
// tenant.shutdown
|
||||
// It's also bad that we're holding tenants.read here.
|
||||
// TODO relax set_stopping to be idempotent?
|
||||
if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() {
|
||||
if tenant.shutdown(progress, false).await.is_err() {
|
||||
return Err(DeleteTenantError::Other(anyhow::anyhow!(
|
||||
"tenant shutdown is already in progress"
|
||||
)));
|
||||
|
||||
@@ -72,10 +72,6 @@ impl EphemeralFile {
|
||||
self.len
|
||||
}
|
||||
|
||||
/// Returns the `page_cache::FileId` stored in this file's `page_cache_file_id` field.
pub(crate) fn id(&self) -> page_cache::FileId {
|
||||
self.page_cache_file_id
|
||||
}
|
||||
|
||||
pub(crate) async fn read_blk(
|
||||
&self,
|
||||
blknum: u32,
|
||||
|
||||
@@ -346,6 +346,35 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle describing an in-memory layer of a `LayerMap` without holding a
/// reference to the layer itself.
///
/// Produced by `LayerMap::find_in_memory_layer` and resolved back into the
/// actual layer by `LayerMap::get_in_memory_layer`.
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
|
||||
pub enum InMemoryLayerHandle {
|
||||
/// Refers to the layer map's open layer.
Open {
|
||||
/// Start of the layer's LSN range (`get_lsn_range().start`) when the handle was created.
lsn_floor: Lsn,
|
||||
/// End of the layer's LSN range (`get_lsn_range().end`) when the handle was created.
end_lsn: Lsn,
|
||||
},
|
||||
/// Refers to one of the layer map's frozen layers.
Frozen {
|
||||
/// Index of the layer within the map's `frozen_layers` collection.
idx: usize,
|
||||
/// Start of the layer's LSN range (`get_lsn_range().start`) when the handle was created.
lsn_floor: Lsn,
|
||||
/// End of the layer's LSN range (`get_lsn_range().end`) when the handle was created.
end_lsn: Lsn,
|
||||
},
|
||||
}
|
||||
|
||||
// Accessors for the LSN bounds that both handle variants carry.
impl InMemoryLayerHandle {
|
||||
/// Returns the `lsn_floor` recorded in the handle, whichever variant it is.
pub fn get_lsn_floor(&self) -> Lsn {
|
||||
match self {
|
||||
InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor,
|
||||
InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the `end_lsn` recorded in the handle, whichever variant it is.
pub fn get_end_lsn(&self) -> Lsn {
|
||||
match self {
|
||||
InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn,
|
||||
InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl LayerMap {
|
||||
///
|
||||
/// Find the latest layer (by lsn.end) that covers the given
|
||||
@@ -547,18 +576,41 @@ impl LayerMap {
|
||||
self.historic.iter()
|
||||
}
|
||||
|
||||
/// Get a ref counted pointer for the first in memory layer that matches the provided predicate.
|
||||
pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<Arc<InMemoryLayer>>
|
||||
/// Get a handle for the first in memory layer that matches the provided predicate.
|
||||
/// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer.
|
||||
///
|
||||
/// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during
|
||||
/// the same exclusive region established by holding the layer manager lock.
|
||||
pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<InMemoryLayerHandle>
|
||||
where
|
||||
Pred: FnMut(&Arc<InMemoryLayer>) -> bool,
|
||||
{
|
||||
if let Some(open) = &self.open_layer {
|
||||
if pred(open) {
|
||||
return Some(open.clone());
|
||||
return Some(InMemoryLayerHandle::Open {
|
||||
lsn_floor: open.get_lsn_range().start,
|
||||
end_lsn: open.get_lsn_range().end,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
self.frozen_layers.iter().rfind(|l| pred(l)).cloned()
|
||||
let pos = self.frozen_layers.iter().rev().position(pred);
|
||||
pos.map(|rev_idx| {
|
||||
let idx = self.frozen_layers.len() - 1 - rev_idx;
|
||||
InMemoryLayerHandle::Frozen {
|
||||
idx,
|
||||
lsn_floor: self.frozen_layers[idx].get_lsn_range().start,
|
||||
end_lsn: self.frozen_layers[idx].get_lsn_range().end,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Get the layer pointed to by the provided handle.
|
||||
///
/// An `Open` handle resolves to whatever `self.open_layer` currently holds and a
/// `Frozen` handle to the entry at `idx` in `self.frozen_layers`; the LSN fields
/// stored in the handle are not compared against the layer that is returned.
///
/// Returns `None` when there is no open layer (for `Open`) or when `idx` is out
/// of bounds (for `Frozen`).
pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option<Arc<InMemoryLayer>> {
|
||||
match handle {
|
||||
InMemoryLayerHandle::Open { .. } => self.open_layer.clone(),
|
||||
InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(),
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::LocationConfigMode;
|
||||
use pageserver_api::models::{LocationConfigMode, ShardParameters};
|
||||
use pageserver_api::shard::{
|
||||
ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
|
||||
};
|
||||
@@ -16,7 +16,6 @@ use std::collections::{BTreeMap, HashMap};
|
||||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use sysinfo::SystemExt;
|
||||
use tokio::fs;
|
||||
use utils::timeout::{timeout_cancellable, TimeoutCancellableError};
|
||||
|
||||
@@ -40,11 +39,10 @@ use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::config::{
|
||||
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
|
||||
TenantConfOpt,
|
||||
};
|
||||
use crate::tenant::delete::DeleteTenantFlow;
|
||||
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
|
||||
use crate::tenant::storage_layer::inmemory_layer;
|
||||
use crate::tenant::timeline::ShutdownMode;
|
||||
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
|
||||
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
|
||||
|
||||
@@ -545,18 +543,6 @@ pub async fn init_tenant_mgr(
|
||||
|
||||
let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn);
|
||||
|
||||
// Initialize dynamic limits that depend on system resources
|
||||
let system_memory =
|
||||
sysinfo::System::new_with_specifics(sysinfo::RefreshKind::new().with_memory())
|
||||
.total_memory();
|
||||
let max_ephemeral_layer_bytes =
|
||||
conf.ephemeral_bytes_per_memory_kb as u64 * (system_memory / 1024);
|
||||
tracing::info!("Initialized ephemeral layer size limit to {max_ephemeral_layer_bytes}, for {system_memory} bytes of memory");
|
||||
inmemory_layer::GLOBAL_RESOURCES.max_dirty_bytes.store(
|
||||
max_ephemeral_layer_bytes,
|
||||
std::sync::atomic::Ordering::Relaxed,
|
||||
);
|
||||
|
||||
// Scan local filesystem for attached tenants
|
||||
let tenant_configs = init_load_tenant_configs(conf).await?;
|
||||
|
||||
@@ -784,9 +770,11 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
|
||||
shutdown_state.insert(tenant_shard_id, TenantSlot::Attached(t.clone()));
|
||||
join_set.spawn(
|
||||
async move {
|
||||
let freeze_and_flush = true;
|
||||
|
||||
let res = {
|
||||
let (_guard, shutdown_progress) = completion::channel();
|
||||
t.shutdown(shutdown_progress, ShutdownMode::FreezeAndFlush).await
|
||||
t.shutdown(shutdown_progress, freeze_and_flush).await
|
||||
};
|
||||
|
||||
if let Err(other_progress) = res {
|
||||
@@ -887,6 +875,16 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
|
||||
// caller will log how long we took
|
||||
}
|
||||
|
||||
/// Error type returned by `TenantManager::set_new_tenant_config`.
///
/// Every variant is `#[error(transparent)]`, so the wrapped error is displayed as-is.
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum SetNewTenantConfigError {
|
||||
/// Looking up the tenant failed; converted automatically from `GetTenantError` via `#[from]`.
#[error(transparent)]
|
||||
GetTenant(#[from] GetTenantError),
|
||||
/// Persisting the new location config (`Tenant::persist_tenant_config`) failed.
#[error(transparent)]
|
||||
Persist(anyhow::Error),
|
||||
/// Any other failure, e.g. the legacy API being used on a sharded tenant.
#[error(transparent)]
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum UpsertLocationError {
|
||||
#[error("Bad config request: {0}")]
|
||||
@@ -912,21 +910,32 @@ impl TenantManager {
|
||||
self.conf
|
||||
}
|
||||
|
||||
/// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or currently
|
||||
/// undergoing a state change (i.e. slot is InProgress).
|
||||
///
|
||||
/// The returned Tenant is not guaranteed to be active: check its status after obtaining it, or
|
||||
/// use [`Tenant::wait_to_become_active`] before using it if you will do I/O on it.
|
||||
/// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or is not fitting to the query.
|
||||
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
|
||||
pub(crate) fn get_attached_tenant_shard(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
active_only: bool,
|
||||
) -> Result<Arc<Tenant>, GetTenantError> {
|
||||
let locked = self.tenants.read().unwrap();
|
||||
|
||||
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;
|
||||
|
||||
match peek_slot {
|
||||
Some(TenantSlot::Attached(tenant)) => Ok(Arc::clone(tenant)),
|
||||
Some(TenantSlot::Attached(tenant)) => match tenant.current_state() {
|
||||
TenantState::Broken {
|
||||
reason,
|
||||
backtrace: _,
|
||||
} if active_only => Err(GetTenantError::Broken(reason)),
|
||||
TenantState::Active => Ok(Arc::clone(tenant)),
|
||||
_ => {
|
||||
if active_only {
|
||||
Err(GetTenantError::NotActive(tenant_shard_id))
|
||||
} else {
|
||||
Ok(Arc::clone(tenant))
|
||||
}
|
||||
}
|
||||
},
|
||||
Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
|
||||
None | Some(TenantSlot::Secondary(_)) => {
|
||||
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
|
||||
@@ -1106,7 +1115,7 @@ impl TenantManager {
|
||||
};
|
||||
|
||||
info!("Shutting down attached tenant");
|
||||
match tenant.shutdown(progress, ShutdownMode::Hard).await {
|
||||
match tenant.shutdown(progress, false).await {
|
||||
Ok(()) => {}
|
||||
Err(barrier) => {
|
||||
info!("Shutdown already in progress, waiting for it to complete");
|
||||
@@ -1222,7 +1231,7 @@ impl TenantManager {
|
||||
TenantSlot::Attached(tenant) => {
|
||||
let (_guard, progress) = utils::completion::channel();
|
||||
info!("Shutting down just-spawned tenant, because tenant manager is shut down");
|
||||
match tenant.shutdown(progress, ShutdownMode::Hard).await {
|
||||
match tenant.shutdown(progress, false).await {
|
||||
Ok(()) => {
|
||||
info!("Finished shutting down just-spawned tenant");
|
||||
}
|
||||
@@ -1272,7 +1281,7 @@ impl TenantManager {
|
||||
};
|
||||
|
||||
let (_guard, progress) = utils::completion::channel();
|
||||
match tenant.shutdown(progress, ShutdownMode::Hard).await {
|
||||
match tenant.shutdown(progress, false).await {
|
||||
Ok(()) => {
|
||||
slot_guard.drop_old_value()?;
|
||||
}
|
||||
@@ -1419,8 +1428,7 @@ impl TenantManager {
|
||||
.wait_to_become_active(activation_timeout)
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
GetActiveTenantError::WillNotBecomeActive(_)
|
||||
| GetActiveTenantError::Broken(_) => {
|
||||
GetActiveTenantError::WillNotBecomeActive(_) => {
|
||||
DeleteTenantError::InvalidState(tenant.current_state())
|
||||
}
|
||||
GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled,
|
||||
@@ -1447,30 +1455,29 @@ impl TenantManager {
|
||||
result
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.literal()))]
|
||||
pub(crate) async fn shard_split(
|
||||
&self,
|
||||
tenant: Arc<Tenant>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
new_shard_count: ShardCount,
|
||||
new_stripe_size: Option<ShardStripeSize>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<TenantShardId>> {
|
||||
let tenant_shard_id = *tenant.get_tenant_shard_id();
|
||||
let r = self
|
||||
.do_shard_split(tenant, new_shard_count, new_stripe_size, ctx)
|
||||
.do_shard_split(tenant_shard_id, new_shard_count, new_stripe_size, ctx)
|
||||
.await;
|
||||
if r.is_err() {
|
||||
// Shard splitting might have left the original shard in a partially shut down state (it
|
||||
// stops the shard's remote timeline client). Reset it to ensure we leave things in
|
||||
// a working state.
|
||||
if self.get(tenant_shard_id).is_some() {
|
||||
tracing::warn!("Resetting after shard split failure");
|
||||
tracing::warn!("Resetting {tenant_shard_id} after shard split failure");
|
||||
if let Err(e) = self.reset_tenant(tenant_shard_id, false, ctx).await {
|
||||
// Log this error because our return value will still be the original error, not this one. This is
|
||||
// a severe error: if this happens, we might be leaving behind a tenant that is not fully functional
|
||||
// (e.g. has uploads disabled). We can't do anything else: if reset fails then shutting the tenant down or
|
||||
// setting it broken probably won't help either.
|
||||
tracing::error!("Failed to reset: {e}");
|
||||
tracing::error!("Failed to reset {tenant_shard_id}: {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1480,12 +1487,12 @@ impl TenantManager {
|
||||
|
||||
pub(crate) async fn do_shard_split(
|
||||
&self,
|
||||
tenant: Arc<Tenant>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
new_shard_count: ShardCount,
|
||||
new_stripe_size: Option<ShardStripeSize>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<TenantShardId>> {
|
||||
let tenant_shard_id = *tenant.get_tenant_shard_id();
|
||||
let tenant = get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
// Validate the incoming request
|
||||
if new_shard_count.count() <= tenant_shard_id.shard_count.count() {
|
||||
@@ -1531,6 +1538,7 @@ impl TenantManager {
|
||||
// If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might
|
||||
// have been left in a partially-shut-down state.
|
||||
tracing::warn!("Failed to prepare for split: {e}, reloading Tenant before returning");
|
||||
self.reset_tenant(tenant_shard_id, false, ctx).await?;
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
@@ -1648,14 +1656,7 @@ impl TenantManager {
|
||||
fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
if let Err(e) = timeline
|
||||
.wait_lsn(
|
||||
*target_lsn,
|
||||
crate::tenant::timeline::WaitLsnWaiter::Tenant,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
|
||||
// Failure here might mean shutdown, in any case this part is an optimization
|
||||
// and we shouldn't hold up the split operation.
|
||||
tracing::warn!(
|
||||
@@ -1676,7 +1677,7 @@ impl TenantManager {
|
||||
|
||||
// Phase 5: Shut down the parent shard, and erase it from disk
|
||||
let (_guard, progress) = completion::channel();
|
||||
match parent.shutdown(progress, ShutdownMode::Hard).await {
|
||||
match parent.shutdown(progress, false).await {
|
||||
Ok(()) => {}
|
||||
Err(other) => {
|
||||
other.wait().await;
|
||||
@@ -1935,23 +1936,38 @@ impl TenantManager {
|
||||
removal_result
|
||||
}
|
||||
|
||||
pub(crate) fn list_tenants(
|
||||
pub(crate) async fn set_new_tenant_config(
|
||||
&self,
|
||||
) -> Result<Vec<(TenantShardId, TenantState, Generation)>, TenantMapListError> {
|
||||
let tenants = TENANTS.read().unwrap();
|
||||
let m = match &*tenants {
|
||||
TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
|
||||
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
|
||||
};
|
||||
Ok(m.iter()
|
||||
.filter_map(|(id, tenant)| match tenant {
|
||||
TenantSlot::Attached(tenant) => {
|
||||
Some((*id, tenant.current_state(), tenant.generation()))
|
||||
}
|
||||
TenantSlot::Secondary(_) => None,
|
||||
TenantSlot::InProgress(_) => None,
|
||||
})
|
||||
.collect())
|
||||
new_tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(), SetNewTenantConfigError> {
|
||||
// Legacy API: does not support sharding
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
|
||||
info!("configuring tenant {tenant_id}");
|
||||
let tenant = get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
if !tenant.tenant_shard_id().shard_count.is_unsharded() {
|
||||
// Note that we use ShardParameters::default below.
|
||||
return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
|
||||
"This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
|
||||
)));
|
||||
}
|
||||
|
||||
// This is a legacy API that only operates on attached tenants: the preferred
|
||||
// API to use is the location_config/ endpoint, which lets the caller provide
|
||||
// the full LocationConf.
|
||||
let location_conf = LocationConf::attached_single(
|
||||
new_tenant_conf.clone(),
|
||||
tenant.generation,
|
||||
&ShardParameters::default(),
|
||||
);
|
||||
|
||||
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &location_conf)
|
||||
.await
|
||||
.map_err(SetNewTenantConfigError::Persist)?;
|
||||
tenant.set_new_tenant_config(new_tenant_conf);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1964,12 +1980,51 @@ pub(crate) enum GetTenantError {
|
||||
|
||||
#[error("Tenant {0} is not active")]
|
||||
NotActive(TenantShardId),
|
||||
/// Broken is logically a subset of NotActive, but a distinct error is useful as
|
||||
/// NotActive is usually a retryable state for API purposes, whereas Broken
|
||||
/// is a stuck error state
|
||||
#[error("Tenant is broken: {0}")]
|
||||
Broken(String),
|
||||
|
||||
// Initializing or shutting down: cannot authoritatively say whether we have this tenant
|
||||
#[error("Tenant map is not available: {0}")]
|
||||
MapState(#[from] TenantMapError),
|
||||
}
|
||||
|
||||
/// Gets the tenant from the in-memory data, erroring if it's absent or does not fit the query.
|
||||
/// `active_only = true` restricts the query to tenants that are ready for operations, erroring on other kinds of tenants.
|
||||
///
|
||||
/// This method is cancel-safe.
|
||||
pub(crate) fn get_tenant(
|
||||
tenant_shard_id: TenantShardId,
|
||||
active_only: bool,
|
||||
) -> Result<Arc<Tenant>, GetTenantError> {
|
||||
let locked = TENANTS.read().unwrap();
|
||||
|
||||
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;
|
||||
|
||||
match peek_slot {
|
||||
Some(TenantSlot::Attached(tenant)) => match tenant.current_state() {
|
||||
TenantState::Broken {
|
||||
reason,
|
||||
backtrace: _,
|
||||
} if active_only => Err(GetTenantError::Broken(reason)),
|
||||
TenantState::Active => Ok(Arc::clone(tenant)),
|
||||
_ => {
|
||||
if active_only {
|
||||
Err(GetTenantError::NotActive(tenant_shard_id))
|
||||
} else {
|
||||
Ok(Arc::clone(tenant))
|
||||
}
|
||||
}
|
||||
},
|
||||
Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
|
||||
None | Some(TenantSlot::Secondary(_)) => {
|
||||
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum GetActiveTenantError {
|
||||
/// We may time out either while TenantSlot is InProgress, or while the Tenant
|
||||
@@ -1993,12 +2048,6 @@ pub(crate) enum GetActiveTenantError {
|
||||
/// Tenant exists, but is in a state that cannot become active (e.g. Stopping, Broken)
|
||||
#[error("will not become active. Current state: {0}")]
|
||||
WillNotBecomeActive(TenantState),
|
||||
|
||||
/// Broken is logically a subset of WillNotBecomeActive, but a distinct error is useful as
|
||||
/// WillNotBecomeActive is a permitted error under some circumstances, whereas broken should
|
||||
/// never happen.
|
||||
#[error("Tenant is broken: {0}")]
|
||||
Broken(String),
|
||||
}
|
||||
|
||||
/// Get a [`Tenant`] in its active state. If the tenant_id is currently in [`TenantSlot::InProgress`]
|
||||
@@ -2218,6 +2267,27 @@ pub(crate) enum TenantMapListError {
|
||||
Initializing,
|
||||
}
|
||||
|
||||
///
|
||||
/// Get list of tenants, for the mgmt API
|
||||
///
|
||||
pub(crate) async fn list_tenants(
|
||||
) -> Result<Vec<(TenantShardId, TenantState, Generation)>, TenantMapListError> {
|
||||
let tenants = TENANTS.read().unwrap();
|
||||
let m = match &*tenants {
|
||||
TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
|
||||
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
|
||||
};
|
||||
Ok(m.iter()
|
||||
.filter_map(|(id, tenant)| match tenant {
|
||||
TenantSlot::Attached(tenant) => {
|
||||
Some((*id, tenant.current_state(), tenant.generation()))
|
||||
}
|
||||
TenantSlot::Secondary(_) => None,
|
||||
TenantSlot::InProgress(_) => None,
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum TenantMapInsertError {
|
||||
#[error(transparent)]
|
||||
@@ -2663,11 +2733,11 @@ where
|
||||
let attached_tenant = match slot_guard.get_old_value() {
|
||||
Some(TenantSlot::Attached(tenant)) => {
|
||||
// whenever we remove a tenant from memory, we don't want to flush and wait for upload
|
||||
let shutdown_mode = ShutdownMode::Hard;
|
||||
let freeze_and_flush = false;
|
||||
|
||||
// shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
|
||||
// that we can continue safely to cleanup.
|
||||
match tenant.shutdown(progress, shutdown_mode).await {
|
||||
match tenant.shutdown(progress, freeze_and_flush).await {
|
||||
Ok(()) => {}
|
||||
Err(_other) => {
|
||||
// if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
|
||||
|
||||
@@ -1569,7 +1569,7 @@ impl RemoteTimelineClient {
|
||||
/// Use [`RemoteTimelineClient::shutdown`] for graceful stop.
|
||||
///
|
||||
/// In-progress operations will still be running after this function returns.
|
||||
/// Use `task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), Some(self.tenant_shard_id), Some(timeline_id))`
|
||||
/// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
|
||||
/// to wait for them to complete, after calling this function.
|
||||
pub(crate) fn stop(&self) {
|
||||
// Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
|
||||
|
||||
@@ -786,35 +786,6 @@ impl<'a> TenantDownloader<'a> {
|
||||
// Existing on-disk layers: just update their access time.
|
||||
if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
|
||||
tracing::debug!("Layer {} is already on disk", layer.name);
|
||||
|
||||
if cfg!(debug_assertions) {
|
||||
// Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
|
||||
// are already present on disk are really there.
|
||||
let local_path = self
|
||||
.conf
|
||||
.timeline_path(tenant_shard_id, &timeline.timeline_id)
|
||||
.join(layer.name.file_name());
|
||||
match tokio::fs::metadata(&local_path).await {
|
||||
Ok(meta) => {
|
||||
tracing::debug!(
|
||||
"Layer {} present at {}, size {}",
|
||||
layer.name,
|
||||
local_path,
|
||||
meta.len(),
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
"Layer {} not found at {} ({})",
|
||||
layer.name,
|
||||
local_path,
|
||||
e
|
||||
);
|
||||
debug_assert!(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
|
||||
|| on_disk.access_time != layer.access_time
|
||||
{
|
||||
|
||||
@@ -9,7 +9,6 @@ use crate::{
|
||||
metrics::SECONDARY_MODE,
|
||||
tenant::{
|
||||
config::AttachmentMode,
|
||||
mgr::GetTenantError,
|
||||
mgr::TenantManager,
|
||||
remote_timeline_client::remote_heatmap_path,
|
||||
span::debug_assert_current_span_has_tenant_id,
|
||||
@@ -293,11 +292,8 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
|
||||
"Starting heatmap write on command");
|
||||
let tenant = self
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(*tenant_shard_id)
|
||||
.get_attached_tenant_shard(*tenant_shard_id, true)
|
||||
.map_err(|e| anyhow::anyhow!(e))?;
|
||||
if !tenant.is_active() {
|
||||
return Err(GetTenantError::NotActive(*tenant_shard_id).into());
|
||||
}
|
||||
|
||||
Ok(UploadPending {
|
||||
// Ignore our state for last digest: this forces an upload even if nothing has changed
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
pub mod delta_layer;
|
||||
mod filename;
|
||||
pub mod image_layer;
|
||||
pub(crate) mod inmemory_layer;
|
||||
mod inmemory_layer;
|
||||
pub(crate) mod layer;
|
||||
mod layer_desc;
|
||||
|
||||
@@ -25,7 +25,7 @@ use std::cmp::{Ordering, Reverse};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
use std::ops::Range;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::sync::Mutex;
|
||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||
use tracing::warn;
|
||||
use utils::history_buffer::HistoryBufferWithDropCounter;
|
||||
@@ -41,8 +41,8 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
|
||||
|
||||
pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
|
||||
|
||||
use self::inmemory_layer::InMemoryLayerFileId;
|
||||
|
||||
use super::layer_map::InMemoryLayerHandle;
|
||||
use super::timeline::layer_manager::LayerManager;
|
||||
use super::timeline::GetVectoredError;
|
||||
use super::PageReconstructError;
|
||||
|
||||
@@ -204,30 +204,23 @@ impl Default for ValuesReconstructState {
|
||||
}
|
||||
}
|
||||
|
||||
/// A key that uniquely identifies a layer in a timeline
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
|
||||
pub(crate) enum LayerId {
|
||||
PersitentLayerId(PersistentLayerKey),
|
||||
InMemoryLayerId(InMemoryLayerFileId),
|
||||
/// Description of layer to be read - the layer map can turn
|
||||
/// this description into the actual layer.
|
||||
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
|
||||
pub(crate) enum ReadableLayerDesc {
|
||||
Persistent {
|
||||
desc: PersistentLayerDesc,
|
||||
lsn_range: Range<Lsn>,
|
||||
},
|
||||
InMemory {
|
||||
handle: InMemoryLayerHandle,
|
||||
lsn_ceil: Lsn,
|
||||
},
|
||||
}
|
||||
|
||||
/// Layer wrapper for the read path. Note that it is valid
|
||||
/// to use these layers even after external operations have
|
||||
/// been performed on them (compaction, freeze, etc.).
|
||||
/// Wrapper for 'ReadableLayerDesc' sorted by Lsn
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum ReadableLayer {
|
||||
PersistentLayer(Layer),
|
||||
InMemoryLayer(Arc<InMemoryLayer>),
|
||||
}
|
||||
|
||||
/// A partial description of a read to be done.
|
||||
#[derive(Debug, Clone)]
|
||||
struct ReadDesc {
|
||||
/// An id used to resolve the readable layer within the fringe
|
||||
layer_id: LayerId,
|
||||
/// Lsn range for the read, used for selecting the next read
|
||||
lsn_range: Range<Lsn>,
|
||||
}
|
||||
struct ReadableLayerDescOrdered(ReadableLayerDesc);
|
||||
|
||||
/// Data structure which maintains a fringe of layers for the
|
||||
/// read path. The fringe is the set of layers which intersects
|
||||
@@ -238,64 +231,41 @@ struct ReadDesc {
|
||||
/// a two layer indexing scheme.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct LayerFringe {
|
||||
planned_reads_by_lsn: BinaryHeap<ReadDesc>,
|
||||
layers: HashMap<LayerId, LayerKeyspace>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct LayerKeyspace {
|
||||
layer: ReadableLayer,
|
||||
target_keyspace: KeySpace,
|
||||
layers_by_lsn: BinaryHeap<ReadableLayerDescOrdered>,
|
||||
layers: HashMap<ReadableLayerDesc, KeySpace>,
|
||||
}
|
||||
|
||||
impl LayerFringe {
|
||||
pub(crate) fn new() -> Self {
|
||||
LayerFringe {
|
||||
planned_reads_by_lsn: BinaryHeap::new(),
|
||||
layers_by_lsn: BinaryHeap::new(),
|
||||
layers: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
|
||||
let read_desc = match self.planned_reads_by_lsn.pop() {
|
||||
Some(desc) => desc,
|
||||
pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> {
|
||||
let handle = match self.layers_by_lsn.pop() {
|
||||
Some(h) => h,
|
||||
None => return None,
|
||||
};
|
||||
|
||||
let removed = self.layers.remove_entry(&read_desc.layer_id);
|
||||
let removed = self.layers.remove_entry(&handle.0);
|
||||
match removed {
|
||||
Some((
|
||||
_,
|
||||
LayerKeyspace {
|
||||
layer,
|
||||
target_keyspace,
|
||||
},
|
||||
)) => Some((layer, target_keyspace, read_desc.lsn_range)),
|
||||
Some((layer, keyspace)) => Some((layer, keyspace)),
|
||||
None => unreachable!("fringe internals are always consistent"),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn update(
|
||||
&mut self,
|
||||
layer: ReadableLayer,
|
||||
keyspace: KeySpace,
|
||||
lsn_range: Range<Lsn>,
|
||||
) {
|
||||
let layer_id = layer.id();
|
||||
let entry = self.layers.entry(layer_id.clone());
|
||||
pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) {
|
||||
let entry = self.layers.entry(layer.clone());
|
||||
match entry {
|
||||
Entry::Occupied(mut entry) => {
|
||||
entry.get_mut().target_keyspace.merge(&keyspace);
|
||||
entry.get_mut().merge(&keyspace);
|
||||
}
|
||||
Entry::Vacant(entry) => {
|
||||
self.planned_reads_by_lsn.push(ReadDesc {
|
||||
lsn_range,
|
||||
layer_id: layer_id.clone(),
|
||||
});
|
||||
entry.insert(LayerKeyspace {
|
||||
layer,
|
||||
target_keyspace: keyspace,
|
||||
});
|
||||
self.layers_by_lsn
|
||||
.push(ReadableLayerDescOrdered(entry.key().clone()));
|
||||
entry.insert(keyspace);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -307,55 +277,77 @@ impl Default for LayerFringe {
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for ReadDesc {
|
||||
impl Ord for ReadableLayerDescOrdered {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
let ord = self.lsn_range.end.cmp(&other.lsn_range.end);
|
||||
let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil());
|
||||
if ord == std::cmp::Ordering::Equal {
|
||||
self.lsn_range.start.cmp(&other.lsn_range.start).reverse()
|
||||
self.0
|
||||
.get_lsn_floor()
|
||||
.cmp(&other.0.get_lsn_floor())
|
||||
.reverse()
|
||||
} else {
|
||||
ord
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for ReadDesc {
|
||||
impl PartialOrd for ReadableLayerDescOrdered {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for ReadDesc {
|
||||
impl PartialEq for ReadableLayerDescOrdered {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.lsn_range == other.lsn_range
|
||||
self.0.get_lsn_floor() == other.0.get_lsn_floor()
|
||||
&& self.0.get_lsn_ceil() == other.0.get_lsn_ceil()
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for ReadDesc {}
|
||||
impl Eq for ReadableLayerDescOrdered {}
|
||||
|
||||
impl ReadableLayer {
|
||||
pub(crate) fn id(&self) -> LayerId {
|
||||
impl ReadableLayerDesc {
|
||||
pub(crate) fn get_lsn_floor(&self) -> Lsn {
|
||||
match self {
|
||||
Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()),
|
||||
Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.file_id()),
|
||||
ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start,
|
||||
ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_lsn_ceil(&self) -> Lsn {
|
||||
match self {
|
||||
ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end,
|
||||
ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
layer_manager: &LayerManager,
|
||||
keyspace: KeySpace,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
match self {
|
||||
ReadableLayer::PersistentLayer(layer) => {
|
||||
ReadableLayerDesc::Persistent { desc, lsn_range } => {
|
||||
let layer = layer_manager.get_from_desc(desc);
|
||||
layer
|
||||
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
|
||||
.get_values_reconstruct_data(
|
||||
keyspace,
|
||||
lsn_range.clone(),
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
}
|
||||
ReadableLayer::InMemoryLayer(layer) => {
|
||||
ReadableLayerDesc::InMemory { handle, lsn_ceil } => {
|
||||
let layer = layer_manager
|
||||
.layer_map()
|
||||
.get_in_memory_layer(handle)
|
||||
.unwrap();
|
||||
|
||||
layer
|
||||
.get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
|
||||
.get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -47,7 +47,6 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use bytes::BytesMut;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use futures::StreamExt;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::LayerAccessKind;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -947,34 +946,6 @@ impl DeltaLayerInner {
|
||||
Ok(planner.finish())
|
||||
}
|
||||
|
||||
fn get_min_read_buffer_size(
|
||||
planned_reads: &[VectoredRead],
|
||||
read_size_soft_max: usize,
|
||||
) -> usize {
|
||||
let Some(largest_read) = planned_reads.iter().max_by_key(|read| read.size()) else {
|
||||
return read_size_soft_max;
|
||||
};
|
||||
|
||||
let largest_read_size = largest_read.size();
|
||||
if largest_read_size > read_size_soft_max {
|
||||
// If the read is oversized, it should only contain one key.
|
||||
let offenders = largest_read
|
||||
.blobs_at
|
||||
.as_slice()
|
||||
.iter()
|
||||
.map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
|
||||
.join(", ");
|
||||
tracing::warn!(
|
||||
"Oversized vectored read ({} > {}) for keys {}",
|
||||
largest_read_size,
|
||||
read_size_soft_max,
|
||||
offenders
|
||||
);
|
||||
}
|
||||
|
||||
largest_read_size
|
||||
}
|
||||
|
||||
async fn do_reads_and_update_state(
|
||||
&self,
|
||||
reads: Vec<VectoredRead>,
|
||||
@@ -988,8 +959,7 @@ impl DeltaLayerInner {
|
||||
.expect("Layer is loaded with max vectored bytes config")
|
||||
.0
|
||||
.into();
|
||||
let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
|
||||
let mut buf = Some(BytesMut::with_capacity(buf_size));
|
||||
let mut buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
|
||||
|
||||
// Note that reads are processed in reverse order (from highest key+lsn).
|
||||
// This is the order that `ReconstructState` requires such that it can
|
||||
@@ -1016,7 +986,7 @@ impl DeltaLayerInner {
|
||||
|
||||
// We have "lost" the buffer since the lower level IO api
|
||||
// doesn't return the buffer on error. Allocate a new one.
|
||||
buf = Some(BytesMut::with_capacity(buf_size));
|
||||
buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
|
||||
|
||||
continue;
|
||||
}
|
||||
@@ -1240,16 +1210,9 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del
|
||||
mod test {
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use itertools::MinMaxResult;
|
||||
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
|
||||
use rand::RngCore;
|
||||
|
||||
use super::*;
|
||||
use crate::{
|
||||
context::DownloadBehavior,
|
||||
task_mgr::TaskKind,
|
||||
tenant::{disk_btree::tests::TestDisk, harness::TenantHarness},
|
||||
DEFAULT_PG_VERSION,
|
||||
context::DownloadBehavior, task_mgr::TaskKind, tenant::disk_btree::tests::TestDisk,
|
||||
};
|
||||
|
||||
/// Construct an index for a fictional delta layer and then
|
||||
@@ -1369,229 +1332,4 @@ mod test {
|
||||
|
||||
assert_eq!(planned_blobs, expected_blobs);
|
||||
}
|
||||
|
||||
mod constants {
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// Offset used by all lsns in this test
|
||||
pub(super) const LSN_OFFSET: Lsn = Lsn(0x08);
|
||||
/// Number of unique keys included in the test data
|
||||
pub(super) const KEY_COUNT: u8 = 60;
|
||||
/// Max number of different lsns for each key
|
||||
pub(super) const MAX_ENTRIES_PER_KEY: u8 = 20;
|
||||
/// Possible value sizes for each key along with a probability weight
|
||||
pub(super) const VALUE_SIZES: [(usize, u8); 3] = [(100, 2), (1024, 2), (1024 * 1024, 1)];
|
||||
/// Probability that there will be a gap between the current key and the next one (33.3%)
|
||||
pub(super) const KEY_GAP_CHANGES: [(bool, u8); 2] = [(true, 1), (false, 2)];
|
||||
/// The minimum size of a key range in all the generated reads
|
||||
pub(super) const MIN_RANGE_SIZE: i128 = 10;
|
||||
/// The number of ranges included in each vectored read
|
||||
pub(super) const RANGES_COUNT: u8 = 2;
|
||||
/// The number of vectored reads performed
|
||||
pub(super) const READS_COUNT: u8 = 100;
|
||||
/// Soft max size of a vectored read. Will be violated if we have to read keys
|
||||
/// with values larger than the limit
|
||||
pub(super) const MAX_VECTORED_READ_BYTES: usize = 64 * 1024;
|
||||
}
|
||||
|
||||
struct Entry {
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
value: Vec<u8>,
|
||||
}
|
||||
|
||||
fn generate_entries(rng: &mut StdRng) -> Vec<Entry> {
|
||||
let mut current_key = Key::MIN;
|
||||
|
||||
let mut entries = Vec::new();
|
||||
for _ in 0..constants::KEY_COUNT {
|
||||
let count = rng.gen_range(1..constants::MAX_ENTRIES_PER_KEY);
|
||||
let mut lsns_iter =
|
||||
std::iter::successors(Some(Lsn(constants::LSN_OFFSET.0 + 0x08)), |lsn| {
|
||||
Some(Lsn(lsn.0 + 0x08))
|
||||
});
|
||||
let mut lsns = Vec::new();
|
||||
while lsns.len() < count as usize {
|
||||
let take = rng.gen_bool(0.5);
|
||||
let lsn = lsns_iter.next().unwrap();
|
||||
if take {
|
||||
lsns.push(lsn);
|
||||
}
|
||||
}
|
||||
|
||||
for lsn in lsns {
|
||||
let size = constants::VALUE_SIZES
|
||||
.choose_weighted(rng, |item| item.1)
|
||||
.unwrap()
|
||||
.0;
|
||||
let mut buf = vec![0; size];
|
||||
rng.fill_bytes(&mut buf);
|
||||
|
||||
entries.push(Entry {
|
||||
key: current_key,
|
||||
lsn,
|
||||
value: buf,
|
||||
})
|
||||
}
|
||||
|
||||
let gap = constants::KEY_GAP_CHANGES
|
||||
.choose_weighted(rng, |item| item.1)
|
||||
.unwrap()
|
||||
.0;
|
||||
if gap {
|
||||
current_key = current_key.add(2);
|
||||
} else {
|
||||
current_key = current_key.add(1);
|
||||
}
|
||||
}
|
||||
|
||||
entries
|
||||
}
|
||||
|
||||
struct EntriesMeta {
|
||||
key_range: Range<Key>,
|
||||
lsn_range: Range<Lsn>,
|
||||
index: BTreeMap<(Key, Lsn), Vec<u8>>,
|
||||
}
|
||||
|
||||
fn get_entries_meta(entries: &[Entry]) -> EntriesMeta {
|
||||
let key_range = match entries.iter().minmax_by_key(|e| e.key) {
|
||||
MinMaxResult::MinMax(min, max) => min.key..max.key.next(),
|
||||
_ => panic!("More than one entry is always expected"),
|
||||
};
|
||||
|
||||
let lsn_range = match entries.iter().minmax_by_key(|e| e.lsn) {
|
||||
MinMaxResult::MinMax(min, max) => min.lsn..Lsn(max.lsn.0 + 1),
|
||||
_ => panic!("More than one entry is always expected"),
|
||||
};
|
||||
|
||||
let mut index = BTreeMap::new();
|
||||
for entry in entries.iter() {
|
||||
index.insert((entry.key, entry.lsn), entry.value.clone());
|
||||
}
|
||||
|
||||
EntriesMeta {
|
||||
key_range,
|
||||
lsn_range,
|
||||
index,
|
||||
}
|
||||
}
|
||||
|
||||
fn pick_random_keyspace(rng: &mut StdRng, key_range: &Range<Key>) -> KeySpace {
|
||||
let start = key_range.start.to_i128();
|
||||
let end = key_range.end.to_i128();
|
||||
|
||||
let mut keyspace = KeySpace::default();
|
||||
|
||||
for _ in 0..constants::RANGES_COUNT {
|
||||
let mut range: Option<Range<Key>> = Option::default();
|
||||
while range.is_none() || keyspace.overlaps(range.as_ref().unwrap()) {
|
||||
let range_start = rng.gen_range(start..end);
|
||||
let range_end_offset = range_start + constants::MIN_RANGE_SIZE;
|
||||
if range_end_offset >= end {
|
||||
range = Some(Key::from_i128(range_start)..Key::from_i128(end));
|
||||
} else {
|
||||
let range_end = rng.gen_range((range_start + constants::MIN_RANGE_SIZE)..end);
|
||||
range = Some(Key::from_i128(range_start)..Key::from_i128(range_end));
|
||||
}
|
||||
}
|
||||
keyspace.ranges.push(range.unwrap());
|
||||
}
|
||||
|
||||
keyspace
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let timeline_id = TimelineId::generate();
|
||||
let timeline = tenant
|
||||
.create_test_timeline(timeline_id, constants::LSN_OFFSET, DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
tracing::info!("Generating test data ...");
|
||||
|
||||
let rng = &mut StdRng::seed_from_u64(0);
|
||||
let entries = generate_entries(rng);
|
||||
let entries_meta = get_entries_meta(&entries);
|
||||
|
||||
tracing::info!("Done generating {} entries", entries.len());
|
||||
|
||||
tracing::info!("Writing test data to delta layer ...");
|
||||
let mut writer = DeltaLayerWriter::new(
|
||||
harness.conf,
|
||||
timeline_id,
|
||||
harness.tenant_shard_id,
|
||||
entries_meta.key_range.start,
|
||||
entries_meta.lsn_range.clone(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
for entry in entries {
|
||||
let (_, res) = writer
|
||||
.put_value_bytes(entry.key, entry.lsn, entry.value, false)
|
||||
.await;
|
||||
res?;
|
||||
}
|
||||
|
||||
let resident = writer.finish(entries_meta.key_range.end, &timeline).await?;
|
||||
|
||||
let inner = resident.get_inner_delta(&ctx).await?;
|
||||
|
||||
let file_size = inner.file.metadata().await?.len();
|
||||
tracing::info!(
|
||||
"Done writing test data to delta layer. Resulting file size is: {}",
|
||||
file_size
|
||||
);
|
||||
|
||||
for i in 0..constants::READS_COUNT {
|
||||
tracing::info!("Doing vectored read {}/{}", i + 1, constants::READS_COUNT);
|
||||
|
||||
let block_reader = FileBlockReader::new(&inner.file, inner.file_id);
|
||||
let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
block_reader,
|
||||
);
|
||||
|
||||
let planner = VectoredReadPlanner::new(constants::MAX_VECTORED_READ_BYTES);
|
||||
let mut reconstruct_state = ValuesReconstructState::new();
|
||||
let keyspace = pick_random_keyspace(rng, &entries_meta.key_range);
|
||||
let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64;
|
||||
|
||||
let vectored_reads = DeltaLayerInner::plan_reads(
|
||||
keyspace.clone(),
|
||||
entries_meta.lsn_range.clone(),
|
||||
data_end_offset,
|
||||
index_reader,
|
||||
planner,
|
||||
&mut reconstruct_state,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&inner.file);
|
||||
let buf_size = DeltaLayerInner::get_min_read_buffer_size(
|
||||
&vectored_reads,
|
||||
constants::MAX_VECTORED_READ_BYTES,
|
||||
);
|
||||
let mut buf = Some(BytesMut::with_capacity(buf_size));
|
||||
|
||||
for read in vectored_reads {
|
||||
let blobs_buf = vectored_blob_reader
|
||||
.read_blobs(&read, buf.take().expect("Should have a buffer"))
|
||||
.await?;
|
||||
for meta in blobs_buf.blobs.iter() {
|
||||
let value = &blobs_buf.buf[meta.start..meta.end];
|
||||
assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]);
|
||||
}
|
||||
|
||||
buf = Some(blobs_buf.buf);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,7 +44,6 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use hex;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::LayerAccessKind;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -541,25 +540,7 @@ impl ImageLayerInner {
|
||||
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
|
||||
for read in reads.into_iter() {
|
||||
let buf_size = read.size();
|
||||
|
||||
if buf_size > max_vectored_read_bytes {
|
||||
// If the read is oversized, it should only contain one key.
|
||||
let offenders = read
|
||||
.blobs_at
|
||||
.as_slice()
|
||||
.iter()
|
||||
.map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
|
||||
.join(", ");
|
||||
tracing::warn!(
|
||||
"Oversized vectored read ({} > {}) for keys {}",
|
||||
buf_size,
|
||||
max_vectored_read_bytes,
|
||||
offenders
|
||||
);
|
||||
}
|
||||
|
||||
let buf = BytesMut::with_capacity(buf_size);
|
||||
let buf = BytesMut::with_capacity(max_vectored_read_bytes);
|
||||
let res = vectored_blob_reader.read_blobs(&read, buf).await;
|
||||
|
||||
match res {
|
||||
|
||||
@@ -12,7 +12,7 @@ use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::storage_layer::ValueReconstructResult;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::{page_cache, walrecord};
|
||||
use crate::walrecord;
|
||||
use anyhow::{anyhow, ensure, Result};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
@@ -36,14 +36,10 @@ use super::{
|
||||
ValuesReconstructState,
|
||||
};
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
|
||||
pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
|
||||
|
||||
pub struct InMemoryLayer {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
file_id: InMemoryLayerFileId,
|
||||
|
||||
/// This layer contains all the changes from 'start_lsn'. The
|
||||
/// start is inclusive.
|
||||
@@ -93,10 +89,7 @@ impl std::fmt::Debug for InMemoryLayerInner {
|
||||
///
|
||||
/// This global state is used to implement behaviors that require a global view of the system, e.g.
|
||||
/// rolling layers proactively to limit the total amount of dirty data.
|
||||
pub(crate) struct GlobalResources {
|
||||
// Limit on how high dirty_bytes may grow before we start freezing layers to reduce it.
|
||||
// Zero means unlimited.
|
||||
pub(crate) max_dirty_bytes: AtomicU64,
|
||||
struct GlobalResources {
|
||||
// How many bytes are in all EphemeralFile objects
|
||||
dirty_bytes: AtomicU64,
|
||||
// How many layers are contributing to dirty_bytes
|
||||
@@ -125,12 +118,11 @@ impl GlobalResourceUnits {
|
||||
|
||||
/// Do not call this frequently: all timelines will write to these same global atomics,
|
||||
/// so this is a relatively expensive operation. Wait at least a few seconds between calls.
|
||||
///
|
||||
/// Returns the effective layer size limit that should be applied, if any, to keep
|
||||
/// the total number of dirty bytes below the configured maximum.
|
||||
fn publish_size(&mut self, size: u64) -> Option<u64> {
|
||||
fn publish_size(&mut self, size: u64) {
|
||||
let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) {
|
||||
Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed),
|
||||
Ordering::Equal => {
|
||||
return;
|
||||
}
|
||||
Ordering::Greater => {
|
||||
let delta = size - self.dirty_bytes;
|
||||
let old = GLOBAL_RESOURCES
|
||||
@@ -154,21 +146,6 @@ impl GlobalResourceUnits {
|
||||
TIMELINE_EPHEMERAL_BYTES.set(new_global_dirty_bytes);
|
||||
|
||||
self.dirty_bytes = size;
|
||||
|
||||
let max_dirty_bytes = GLOBAL_RESOURCES
|
||||
.max_dirty_bytes
|
||||
.load(AtomicOrdering::Relaxed);
|
||||
if max_dirty_bytes > 0 && new_global_dirty_bytes > max_dirty_bytes {
|
||||
// Set the layer file limit to the average layer size: this implies that all above-average
|
||||
// sized layers will be eligible for freezing. They will be frozen in the order they
|
||||
// next enter publish_size.
|
||||
Some(
|
||||
new_global_dirty_bytes
|
||||
/ GLOBAL_RESOURCES.dirty_layers.load(AtomicOrdering::Relaxed) as u64,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
// Call publish_size if the input size differs from last published size by more than
|
||||
@@ -197,17 +174,12 @@ impl Drop for GlobalResourceUnits {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
|
||||
max_dirty_bytes: AtomicU64::new(0),
|
||||
static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
|
||||
dirty_bytes: AtomicU64::new(0),
|
||||
dirty_layers: AtomicUsize::new(0),
|
||||
};
|
||||
|
||||
impl InMemoryLayer {
|
||||
pub(crate) fn file_id(&self) -> InMemoryLayerFileId {
|
||||
self.file_id
|
||||
}
|
||||
|
||||
pub(crate) fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timeline_id
|
||||
}
|
||||
@@ -222,10 +194,6 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn try_len(&self) -> Option<u64> {
|
||||
self.inner.try_read().map(|i| i.file.len()).ok()
|
||||
}
|
||||
|
||||
pub(crate) fn assert_writable(&self) {
|
||||
assert!(self.end_lsn.get().is_none());
|
||||
}
|
||||
@@ -451,10 +419,8 @@ impl InMemoryLayer {
|
||||
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
|
||||
|
||||
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
|
||||
let key = InMemoryLayerFileId(file.id());
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
file_id: key,
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
@@ -520,10 +486,10 @@ impl InMemoryLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn tick(&self) -> Option<u64> {
|
||||
pub(crate) async fn tick(&self) {
|
||||
let mut inner = self.inner.write().await;
|
||||
let size = inner.file.len();
|
||||
inner.resource_units.publish_size(size)
|
||||
inner.resource_units.publish_size(size);
|
||||
}
|
||||
|
||||
pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
|
||||
|
||||
@@ -1759,18 +1759,6 @@ impl ResidentLayer {
|
||||
pub(crate) fn metadata(&self) -> LayerFileMetadata {
|
||||
self.owner.metadata()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) async fn get_inner_delta<'a>(
|
||||
&'a self,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<&'a delta_layer::DeltaLayerInner> {
|
||||
let owner = &self.owner.0;
|
||||
match self.downloaded.get(owner, ctx).await? {
|
||||
LayerKind::Delta(d) => Ok(d),
|
||||
LayerKind::Image(_) => Err(anyhow::anyhow!("Expected a delta layer")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AsLayerDesc for ResidentLayer {
|
||||
|
||||
@@ -9,7 +9,6 @@ pub mod uninit;
|
||||
mod walreceiver;
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use arc_swap::ArcSwap;
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use enumset::EnumSet;
|
||||
@@ -20,7 +19,7 @@ use pageserver_api::{
|
||||
keyspace::KeySpaceAccum,
|
||||
models::{
|
||||
CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
|
||||
EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState,
|
||||
EvictionPolicy, LayerMapInfo, TimelineState,
|
||||
},
|
||||
reltag::BlockNumber,
|
||||
shard::{ShardIdentity, TenantShardId},
|
||||
@@ -119,11 +118,11 @@ use self::layer_manager::LayerManager;
|
||||
use self::logical_size::LogicalSize;
|
||||
use self::walreceiver::{WalReceiver, WalReceiverConf};
|
||||
|
||||
use super::config::TenantConf;
|
||||
use super::remote_timeline_client::RemoteTimelineClient;
|
||||
use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
|
||||
use super::{config::TenantConf, storage_layer::ReadableLayerDesc};
|
||||
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
|
||||
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
|
||||
use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
pub(super) enum FlushLoopState {
|
||||
@@ -184,7 +183,7 @@ pub(crate) struct AuxFilesState {
|
||||
|
||||
pub struct Timeline {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
|
||||
tenant_conf: Arc<RwLock<AttachedTenantConf>>,
|
||||
|
||||
myself: Weak<Self>,
|
||||
|
||||
@@ -310,8 +309,6 @@ pub struct Timeline {
|
||||
/// Configuration: how often should the partitioning be recalculated.
|
||||
repartition_threshold: u64,
|
||||
|
||||
last_image_layer_creation_check_at: AtomicLsn,
|
||||
|
||||
/// Current logical size of the "datadir", at the last LSN.
|
||||
current_logical_size: LogicalSize,
|
||||
|
||||
@@ -613,25 +610,6 @@ pub enum GetVectoredImpl {
|
||||
Vectored,
|
||||
}
|
||||
|
||||
pub(crate) enum WaitLsnWaiter<'a> {
|
||||
Timeline(&'a Timeline),
|
||||
Tenant,
|
||||
PageService,
|
||||
}
|
||||
|
||||
/// Argument to [`Timeline::shutdown`].
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub(crate) enum ShutdownMode {
|
||||
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
|
||||
/// also to remote storage. This method can easily take multiple seconds for a busy timeline.
|
||||
///
|
||||
/// While we are flushing, we continue to accept read I/O for LSNs ingested before
|
||||
/// the call to [`Timeline::shutdown`].
|
||||
FreezeAndFlush,
|
||||
/// Shut down immediately, without waiting for any open layers to flush.
|
||||
Hard,
|
||||
}
|
||||
|
||||
/// Public interface functions
|
||||
impl Timeline {
|
||||
/// Get the LSN where this branch was created
|
||||
@@ -1080,8 +1058,7 @@ impl Timeline {
|
||||
pub(crate) async fn wait_lsn(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
who_is_waiting: WaitLsnWaiter<'_>,
|
||||
ctx: &RequestContext, /* Prepare for use by cancellation */
|
||||
_ctx: &RequestContext, /* Prepare for use by cancellation */
|
||||
) -> Result<(), WaitLsnError> {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(WaitLsnError::Shutdown);
|
||||
@@ -1089,28 +1066,20 @@ impl Timeline {
|
||||
return Err(WaitLsnError::BadState);
|
||||
}
|
||||
|
||||
if cfg!(debug_assertions) {
|
||||
match ctx.task_kind() {
|
||||
TaskKind::WalReceiverManager
|
||||
| TaskKind::WalReceiverConnectionHandler
|
||||
| TaskKind::WalReceiverConnectionPoller => {
|
||||
let is_myself = match who_is_waiting {
|
||||
WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself),
|
||||
WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()),
|
||||
};
|
||||
if is_myself {
|
||||
if let Err(current) = self.last_record_lsn.would_wait_for(lsn) {
|
||||
// walingest is the only one that can advance last_record_lsn; it should make sure to never reach here
|
||||
panic!("this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock");
|
||||
}
|
||||
} else {
|
||||
// if another timeline is waiting for us, there's no deadlock risk because
|
||||
// our walreceiver task can make progress independent of theirs
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
// This should never be called from the WAL receiver, because that could lead
|
||||
// to a deadlock.
|
||||
debug_assert!(
|
||||
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager),
|
||||
"wait_lsn cannot be called in WAL receiver"
|
||||
);
|
||||
debug_assert!(
|
||||
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler),
|
||||
"wait_lsn cannot be called in WAL receiver"
|
||||
);
|
||||
debug_assert!(
|
||||
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller),
|
||||
"wait_lsn cannot be called in WAL receiver"
|
||||
);
|
||||
|
||||
let _timer = crate::metrics::WAIT_LSN_TIME.start_timer();
|
||||
|
||||
@@ -1173,79 +1142,6 @@ impl Timeline {
|
||||
self.flush_frozen_layers_and_wait().await
|
||||
}
|
||||
|
||||
/// If there is no writer, and conditions for rolling the latest layer are met, then freeze it.
|
||||
///
|
||||
/// This is for use in background housekeeping, to provide guarantees of layers closing eventually
|
||||
/// even if there are no ongoing writes to drive that.
|
||||
async fn maybe_freeze_ephemeral_layer(&self) {
|
||||
let Ok(_write_guard) = self.write_lock.try_lock() else {
|
||||
// If the write lock is held, there is an active wal receiver: rolling open layers
|
||||
// is their responsibility while they hold this lock.
|
||||
return;
|
||||
};
|
||||
|
||||
let Ok(layers_guard) = self.layers.try_read() else {
|
||||
// Don't block if the layer lock is busy
|
||||
return;
|
||||
};
|
||||
|
||||
let Some(open_layer) = &layers_guard.layer_map().open_layer else {
|
||||
// No open layer, no work to do.
|
||||
return;
|
||||
};
|
||||
|
||||
let Some(current_size) = open_layer.try_len() else {
|
||||
// Unexpected: since we hold the write guard, nobody else should be writing to this layer, so
|
||||
// read lock to get size should always succeed.
|
||||
tracing::warn!("Lock conflict while reading size of open layer");
|
||||
return;
|
||||
};
|
||||
|
||||
let current_lsn = self.get_last_record_lsn();
|
||||
|
||||
let checkpoint_distance_override = open_layer.tick().await;
|
||||
|
||||
if let Some(size_override) = checkpoint_distance_override {
|
||||
if current_size > size_override {
|
||||
// This is not harmful, but it only happens in relatively rare cases where
|
||||
// time-based checkpoints are not happening fast enough to keep the amount of
|
||||
// ephemeral data within configured limits. It's a sign of stress on the system.
|
||||
tracing::info!("Early-rolling open layer at size {current_size} (limit {size_override}) due to dirty data pressure");
|
||||
}
|
||||
}
|
||||
|
||||
let checkpoint_distance =
|
||||
checkpoint_distance_override.unwrap_or(self.get_checkpoint_distance());
|
||||
|
||||
if self.should_roll(
|
||||
current_size,
|
||||
current_size,
|
||||
checkpoint_distance,
|
||||
self.get_last_record_lsn(),
|
||||
self.last_freeze_at.load(),
|
||||
*self.last_freeze_ts.read().unwrap(),
|
||||
) {
|
||||
match open_layer.info() {
|
||||
InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => {
|
||||
// We may reach this point if the layer was already frozen but not yet flushed: flushing
|
||||
// happens asynchronously in the background.
|
||||
tracing::debug!(
|
||||
"Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})"
|
||||
);
|
||||
}
|
||||
InMemoryLayerInfo::Open { .. } => {
|
||||
// Upgrade to a write lock and freeze the layer
|
||||
drop(layers_guard);
|
||||
let mut layers_guard = self.layers.write().await;
|
||||
layers_guard
|
||||
.try_freeze_in_memory_layer(current_lsn, &self.last_freeze_at)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
self.flush_frozen_layers();
|
||||
}
|
||||
}
|
||||
|
||||
/// Outermost timeline compaction operation; downloads needed layers.
|
||||
pub(crate) async fn compact(
|
||||
self: &Arc<Self>,
|
||||
@@ -1268,11 +1164,6 @@ impl Timeline {
|
||||
(guard, permit)
|
||||
};
|
||||
|
||||
// Prior to compaction, check if an open ephemeral layer should be closed: this provides
|
||||
// background enforcement of checkpoint interval if there is no active WAL receiver, to avoid keeping
|
||||
// an ephemeral layer open forever when idle.
|
||||
self.maybe_freeze_ephemeral_layer().await;
|
||||
|
||||
// this wait probably never needs any "long time spent" logging, because we already nag if
|
||||
// compaction task goes over its period (20s) which is quite often in production.
|
||||
let (_guard, _permit) = tokio::select! {
|
||||
@@ -1305,7 +1196,6 @@ impl Timeline {
|
||||
|
||||
pub(crate) fn activate(
|
||||
self: &Arc<Self>,
|
||||
parent: Arc<crate::tenant::Tenant>,
|
||||
broker_client: BrokerClientChannel,
|
||||
background_jobs_can_start: Option<&completion::Barrier>,
|
||||
ctx: &RequestContext,
|
||||
@@ -1316,122 +1206,86 @@ impl Timeline {
|
||||
}
|
||||
self.launch_wal_receiver(ctx, broker_client);
|
||||
self.set_state(TimelineState::Active);
|
||||
self.launch_eviction_task(parent, background_jobs_can_start);
|
||||
self.launch_eviction_task(background_jobs_can_start);
|
||||
}
|
||||
|
||||
/// After this function returns, there are no timeline-scoped tasks left running.
|
||||
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
|
||||
/// also to remote storage. This method can easily take multiple seconds for a busy timeline.
|
||||
///
|
||||
/// The preferred pattern is:
|
||||
/// - in any spawned tasks, keep Timeline::guard open + Timeline::cancel / child token
|
||||
/// - if early shutdown (not just cancellation) of a sub-tree of tasks is required,
|
||||
/// go the extra mile and keep track of JoinHandles
|
||||
/// - Keep track of JoinHandles using a passed-down `Arc<Mutex<Option<JoinSet>>>` or similar,
|
||||
/// instead of spawning directly on a runtime. It is a more composable / testable pattern.
|
||||
///
|
||||
/// For legacy reasons, we still have multiple tasks spawned using
|
||||
/// `task_mgr::spawn(X, Some(tenant_id), Some(timeline_id))`.
|
||||
/// We refer to these as "timeline-scoped task_mgr tasks".
|
||||
/// Some of these tasks are already sensitive to Timeline::cancel while others are
|
||||
/// not sensitive to Timeline::cancel and instead respect [`task_mgr::shutdown_token`]
|
||||
/// or [`task_mgr::shutdown_watcher`].
|
||||
/// We want to gradually convert the code base away from these.
|
||||
///
|
||||
/// Here is an inventory of timeline-scoped task_mgr tasks that are still sensitive to
|
||||
/// `task_mgr::shutdown_{token,watcher}` (there are also tenant-scoped and global-scoped
|
||||
/// ones that aren't mentioned here):
|
||||
/// - [`TaskKind::TimelineDeletionWorker`]
|
||||
/// - NB: also used for tenant deletion
|
||||
/// - [`TaskKind::RemoteUploadTask`]
|
||||
/// - [`TaskKind::InitialLogicalSizeCalculation`]
|
||||
/// - [`TaskKind::DownloadAllRemoteLayers`] (can we get rid of it?)
|
||||
// Inventory of timeline-scoped task_mgr tasks that use spawn but aren't sensitive:
|
||||
/// - [`TaskKind::Eviction`]
|
||||
/// - [`TaskKind::LayerFlushTask`]
|
||||
/// - [`TaskKind::OndemandLogicalSizeCalculation`]
|
||||
/// - [`TaskKind::GarbageCollector`] (immediate_gc is timeline-scoped)
|
||||
pub(crate) async fn shutdown(&self, mode: ShutdownMode) {
|
||||
/// While we are flushing, we continue to accept read I/O.
|
||||
pub(crate) async fn flush_and_shutdown(&self) {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let try_freeze_and_flush = match mode {
|
||||
ShutdownMode::FreezeAndFlush => true,
|
||||
ShutdownMode::Hard => false,
|
||||
};
|
||||
// Stop ingesting data, so that we are not still writing to an InMemoryLayer while
|
||||
// trying to flush
|
||||
tracing::debug!("Waiting for WalReceiverManager...");
|
||||
task_mgr::shutdown_tasks(
|
||||
Some(TaskKind::WalReceiverManager),
|
||||
Some(self.tenant_shard_id),
|
||||
Some(self.timeline_id),
|
||||
)
|
||||
.await;
|
||||
|
||||
// Regardless of whether we're going to try_freeze_and_flush
|
||||
// or not, stop ingesting any more data. Walreceiver only provides
|
||||
// cancellation but no "wait until gone", because it uses the Timeline::gate.
|
||||
// So, only after the self.gate.close() below will we know for sure that
|
||||
// no walreceiver tasks are left.
|
||||
// For `try_freeze_and_flush=true`, this means that we might still be ingesting
|
||||
// data during the call to `self.freeze_and_flush()` below.
|
||||
// That's not ideal, but, we don't have the concept of a ChildGuard,
|
||||
// which is what we'd need to properly model early shutdown of the walreceiver
|
||||
// task sub-tree before the other Timeline task sub-trees.
|
||||
let walreceiver = self.walreceiver.lock().unwrap().take();
|
||||
tracing::debug!(
|
||||
is_some = walreceiver.is_some(),
|
||||
"Waiting for WalReceiverManager..."
|
||||
);
|
||||
if let Some(walreceiver) = walreceiver {
|
||||
walreceiver.cancel();
|
||||
}
|
||||
// ... and inform any waiters for newer LSNs that there won't be any.
|
||||
// Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance
|
||||
self.last_record_lsn.shutdown();
|
||||
|
||||
if try_freeze_and_flush {
|
||||
// we shut down walreceiver above, so, we won't add anything more
|
||||
// to the InMemoryLayer; freeze it and wait for all frozen layers
|
||||
// to reach the disk & upload queue, then shut the upload queue and
|
||||
// wait for it to drain.
|
||||
match self.freeze_and_flush().await {
|
||||
Ok(_) => {
|
||||
// drain the upload queue
|
||||
if let Some(client) = self.remote_client.as_ref() {
|
||||
// if we did not wait for completion here, it might be our shutdown process
|
||||
// didn't wait for remote uploads to complete at all, as new tasks can forever
|
||||
// be spawned.
|
||||
//
|
||||
// what is problematic is the shutting down of RemoteTimelineClient, because
|
||||
// obviously it does not make sense to stop while we wait for it, but what
|
||||
// about corner cases like s3 suddenly hanging up?
|
||||
client.shutdown().await;
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
|
||||
// we have some extra WAL replay to do next time the timeline starts.
|
||||
warn!("failed to freeze and flush: {e:#}");
|
||||
// now all writers to InMemory layer are gone, do the final flush if requested
|
||||
match self.freeze_and_flush().await {
|
||||
Ok(_) => {
|
||||
// drain the upload queue
|
||||
if let Some(client) = self.remote_client.as_ref() {
|
||||
// if we did not wait for completion here, it might be our shutdown process
|
||||
// didn't wait for remote uploads to complete at all, as new tasks can forever
|
||||
// be spawned.
|
||||
//
|
||||
// what is problematic is the shutting down of RemoteTimelineClient, because
|
||||
// obviously it does not make sense to stop while we wait for it, but what
|
||||
// about corner cases like s3 suddenly hanging up?
|
||||
client.shutdown().await;
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
|
||||
// we have some extra WAL replay to do next time the timeline starts.
|
||||
warn!("failed to freeze and flush: {e:#}");
|
||||
}
|
||||
}
|
||||
|
||||
self.shutdown().await;
|
||||
}
|
||||
|
||||
/// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of
|
||||
/// the graceful [`Timeline::flush_and_shutdown`] function.
|
||||
pub(crate) async fn shutdown(&self) {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
// Signal any subscribers to our cancellation token to drop out
|
||||
tracing::debug!("Cancelling CancellationToken");
|
||||
self.cancel.cancel();
|
||||
|
||||
// Transition the remote_client into a state where it's only useful for timeline deletion.
|
||||
// (The deletion use case is why we can't just hook up remote_client to Self::cancel).)
|
||||
// Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel
|
||||
// while doing so.
|
||||
self.last_record_lsn.shutdown();
|
||||
|
||||
// Shut down the layer flush task before the remote client, as one depends on the other
|
||||
task_mgr::shutdown_tasks(
|
||||
Some(TaskKind::LayerFlushTask),
|
||||
Some(self.tenant_shard_id),
|
||||
Some(self.timeline_id),
|
||||
)
|
||||
.await;
|
||||
|
||||
// Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in
|
||||
// case our caller wants to use that for a deletion
|
||||
if let Some(remote_client) = self.remote_client.as_ref() {
|
||||
remote_client.stop();
|
||||
// As documented in remote_client.stop()'s doc comment, it's our responsibility
|
||||
// to shut down the upload queue tasks.
|
||||
// TODO: fix that, task management should be encapsulated inside remote_client.
|
||||
task_mgr::shutdown_tasks(
|
||||
Some(TaskKind::RemoteUploadTask),
|
||||
Some(self.tenant_shard_id),
|
||||
Some(self.timeline_id),
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
// TODO: work toward making this a no-op. See this function's doc comment for more context.
|
||||
tracing::debug!("Waiting for tasks...");
|
||||
|
||||
task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await;
|
||||
|
||||
// Finally wait until any gate-holders are complete.
|
||||
//
|
||||
// TODO: once above shutdown_tasks is a no-op, we can close the gate before calling shutdown_tasks
|
||||
// and use a TBD variant of shutdown_tasks that asserts that there were no tasks left.
|
||||
// Finally wait until any gate-holders are complete
|
||||
self.gate.close().await;
|
||||
|
||||
self.metrics.shutdown();
|
||||
@@ -1580,53 +1434,6 @@ impl Timeline {
|
||||
Err(EvictionError::Timeout) => Ok(Some(false)),
|
||||
}
|
||||
}
|
||||
|
||||
fn should_roll(
|
||||
&self,
|
||||
layer_size: u64,
|
||||
projected_layer_size: u64,
|
||||
checkpoint_distance: u64,
|
||||
projected_lsn: Lsn,
|
||||
last_freeze_at: Lsn,
|
||||
last_freeze_ts: Instant,
|
||||
) -> bool {
|
||||
let distance = projected_lsn.widening_sub(last_freeze_at);
|
||||
|
||||
// Rolling the open layer can be triggered by:
|
||||
// 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that
|
||||
// the safekeepers need to store. For sharded tenants, we multiply by shard count to
|
||||
// account for how writes are distributed across shards: we expect each node to consume
|
||||
// 1/count of the LSN on average.
|
||||
// 2. The size of the currently open layer.
|
||||
// 3. The time since the last roll. It helps safekeepers to regard pageserver as caught
|
||||
// up and suspend activity.
|
||||
if distance >= checkpoint_distance as i128 * self.shard_identity.count.count() as i128 {
|
||||
info!(
|
||||
"Will roll layer at {} with layer size {} due to LSN distance ({})",
|
||||
projected_lsn, layer_size, distance
|
||||
);
|
||||
|
||||
true
|
||||
} else if projected_layer_size >= checkpoint_distance {
|
||||
info!(
|
||||
"Will roll layer at {} with layer size {} due to layer size ({})",
|
||||
projected_lsn, layer_size, projected_layer_size
|
||||
);
|
||||
|
||||
true
|
||||
} else if distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() {
|
||||
info!(
|
||||
"Will roll layer at {} with layer size {} due to time since last flush ({:?})",
|
||||
projected_lsn,
|
||||
layer_size,
|
||||
last_freeze_ts.elapsed()
|
||||
);
|
||||
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Number of times we will compute partition within a checkpoint distance.
|
||||
@@ -1635,65 +1442,57 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
|
||||
// Private functions
|
||||
impl Timeline {
|
||||
pub(crate) fn get_lazy_slru_download(&self) -> bool {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.lazy_slru_download
|
||||
.unwrap_or(self.conf.default_tenant_conf.lazy_slru_download)
|
||||
}
|
||||
|
||||
fn get_checkpoint_distance(&self) -> u64 {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.checkpoint_distance
|
||||
.unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
|
||||
}
|
||||
|
||||
fn get_checkpoint_timeout(&self) -> Duration {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.checkpoint_timeout
|
||||
.unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
|
||||
}
|
||||
|
||||
fn get_compaction_target_size(&self) -> u64 {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.compaction_target_size
|
||||
.unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
|
||||
}
|
||||
|
||||
fn get_compaction_threshold(&self) -> usize {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.compaction_threshold
|
||||
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
|
||||
}
|
||||
|
||||
fn get_image_creation_threshold(&self) -> usize {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.image_creation_threshold
|
||||
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
|
||||
}
|
||||
|
||||
fn get_compaction_algorithm(&self) -> CompactionAlgorithm {
|
||||
let tenant_conf = &self.tenant_conf.load();
|
||||
let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf;
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.compaction_algorithm
|
||||
.unwrap_or(self.conf.default_tenant_conf.compaction_algorithm)
|
||||
}
|
||||
|
||||
fn get_eviction_policy(&self) -> EvictionPolicy {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.eviction_policy
|
||||
.unwrap_or(self.conf.default_tenant_conf.eviction_policy)
|
||||
}
|
||||
@@ -1707,26 +1506,14 @@ impl Timeline {
|
||||
.unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
|
||||
}
|
||||
|
||||
fn get_image_layer_creation_check_threshold(&self) -> u8 {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.image_layer_creation_check_threshold
|
||||
.unwrap_or(
|
||||
self.conf
|
||||
.default_tenant_conf
|
||||
.image_layer_creation_check_threshold,
|
||||
)
|
||||
}
|
||||
|
||||
pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
|
||||
pub(super) fn tenant_conf_updated(&self) {
|
||||
// NB: Most tenant conf options are read by background loops, so,
|
||||
// changes will automatically be picked up.
|
||||
|
||||
// The threshold is embedded in the metric. So, we need to update it.
|
||||
{
|
||||
let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold(
|
||||
new_conf,
|
||||
&self.tenant_conf.read().unwrap().tenant_conf,
|
||||
&self.conf.default_tenant_conf,
|
||||
);
|
||||
|
||||
@@ -1753,7 +1540,7 @@ impl Timeline {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(super) fn new(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
|
||||
tenant_conf: Arc<RwLock<AttachedTenantConf>>,
|
||||
metadata: &TimelineMetadata,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
timeline_id: TimelineId,
|
||||
@@ -1772,13 +1559,14 @@ impl Timeline {
|
||||
let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
|
||||
let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
|
||||
|
||||
let evictions_low_residence_duration_metric_threshold = {
|
||||
let loaded_tenant_conf = tenant_conf.load();
|
||||
let tenant_conf_guard = tenant_conf.read().unwrap();
|
||||
|
||||
let evictions_low_residence_duration_metric_threshold =
|
||||
Self::get_evictions_low_residence_duration_metric_threshold(
|
||||
&loaded_tenant_conf.tenant_conf,
|
||||
&tenant_conf_guard.tenant_conf,
|
||||
&conf.default_tenant_conf,
|
||||
)
|
||||
};
|
||||
);
|
||||
drop(tenant_conf_guard);
|
||||
|
||||
Arc::new_cyclic(|myself| {
|
||||
let mut result = Timeline {
|
||||
@@ -1855,7 +1643,6 @@ impl Timeline {
|
||||
},
|
||||
partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))),
|
||||
repartition_threshold: 0,
|
||||
last_image_layer_creation_check_at: AtomicLsn::new(0),
|
||||
|
||||
last_received_wal: Mutex::new(None),
|
||||
rel_size_cache: RwLock::new(HashMap::new()),
|
||||
@@ -1884,7 +1671,6 @@ impl Timeline {
|
||||
};
|
||||
result.repartition_threshold =
|
||||
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
||||
|
||||
result
|
||||
.metrics
|
||||
.last_record_gauge
|
||||
@@ -1961,19 +1747,20 @@ impl Timeline {
|
||||
self.timeline_id, self.tenant_shard_id
|
||||
);
|
||||
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
let wal_connect_timeout = tenant_conf
|
||||
let tenant_conf_guard = self.tenant_conf.read().unwrap();
|
||||
let wal_connect_timeout = tenant_conf_guard
|
||||
.tenant_conf
|
||||
.walreceiver_connect_timeout
|
||||
.unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout);
|
||||
let lagging_wal_timeout = tenant_conf
|
||||
let lagging_wal_timeout = tenant_conf_guard
|
||||
.tenant_conf
|
||||
.lagging_wal_timeout
|
||||
.unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout);
|
||||
let max_lsn_wal_lag = tenant_conf
|
||||
let max_lsn_wal_lag = tenant_conf_guard
|
||||
.tenant_conf
|
||||
.max_lsn_wal_lag
|
||||
.unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag);
|
||||
drop(tenant_conf_guard);
|
||||
|
||||
let mut guard = self.walreceiver.lock().unwrap();
|
||||
assert!(
|
||||
@@ -2521,6 +2308,10 @@ impl Timeline {
|
||||
debug!("cancelling logical size calculation for timeline shutdown");
|
||||
calculation.await
|
||||
}
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
debug!("cancelling logical size calculation for task shutdown");
|
||||
calculation.await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2796,10 +2587,6 @@ impl Timeline {
|
||||
// Get all the data needed to reconstruct the page version from this layer.
|
||||
// But if we have an older cached page image, no need to go past that.
|
||||
let lsn_floor = max(cached_lsn + 1, start_lsn);
|
||||
|
||||
let open_layer = open_layer.clone();
|
||||
drop(guard);
|
||||
|
||||
result = match open_layer
|
||||
.get_value_reconstruct_data(
|
||||
key,
|
||||
@@ -2817,7 +2604,10 @@ impl Timeline {
|
||||
traversal_path.push((
|
||||
result,
|
||||
cont_lsn,
|
||||
Box::new(move || open_layer.traversal_id()),
|
||||
Box::new({
|
||||
let open_layer = Arc::clone(open_layer);
|
||||
move || open_layer.traversal_id()
|
||||
}),
|
||||
));
|
||||
continue 'outer;
|
||||
}
|
||||
@@ -2827,10 +2617,6 @@ impl Timeline {
|
||||
if cont_lsn > start_lsn {
|
||||
//info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
|
||||
let lsn_floor = max(cached_lsn + 1, start_lsn);
|
||||
|
||||
let frozen_layer = frozen_layer.clone();
|
||||
drop(guard);
|
||||
|
||||
result = match frozen_layer
|
||||
.get_value_reconstruct_data(
|
||||
key,
|
||||
@@ -2848,7 +2634,10 @@ impl Timeline {
|
||||
traversal_path.push((
|
||||
result,
|
||||
cont_lsn,
|
||||
Box::new(move || frozen_layer.traversal_id()),
|
||||
Box::new({
|
||||
let frozen_layer = Arc::clone(frozen_layer);
|
||||
move || frozen_layer.traversal_id()
|
||||
}),
|
||||
));
|
||||
continue 'outer;
|
||||
}
|
||||
@@ -2856,8 +2645,6 @@ impl Timeline {
|
||||
|
||||
if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
|
||||
let layer = guard.get_from_desc(&layer);
|
||||
drop(guard);
|
||||
|
||||
// Get all the data needed to reconstruct the page version from this layer.
|
||||
// But if we have an older cached page image, no need to go past that.
|
||||
let lsn_floor = max(cached_lsn + 1, lsn_floor);
|
||||
@@ -2975,6 +2762,16 @@ impl Timeline {
|
||||
|
||||
let mut completed_keyspace = KeySpace::default();
|
||||
|
||||
// Hold the layer map whilst visiting the timeline to prevent
|
||||
// compaction, eviction and flushes from rendering the layers unreadable.
|
||||
//
|
||||
// TODO: Do we actually need to do this? In theory holding on
|
||||
// to [`tenant::storage_layer::Layer`] should be enough. However,
|
||||
// [`Timeline::get`] also holds the lock during IO, so more investigation
|
||||
// is needed.
|
||||
let guard = timeline.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
|
||||
loop {
|
||||
if cancel.is_cancelled() {
|
||||
return Err(GetVectoredError::Cancelled);
|
||||
@@ -2984,9 +2781,6 @@ impl Timeline {
|
||||
unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
|
||||
completed_keyspace.merge(&keys_done_last_step);
|
||||
|
||||
let guard = timeline.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
|
||||
let in_memory_layer = layers.find_in_memory_layer(|l| {
|
||||
let start_lsn = l.get_lsn_range().start;
|
||||
cont_lsn > start_lsn
|
||||
@@ -2994,11 +2788,12 @@ impl Timeline {
|
||||
|
||||
match in_memory_layer {
|
||||
Some(l) => {
|
||||
let lsn_range = l.get_lsn_range().start..cont_lsn;
|
||||
fringe.update(
|
||||
ReadableLayer::InMemoryLayer(l),
|
||||
ReadableLayerDesc::InMemory {
|
||||
handle: l,
|
||||
lsn_ceil: cont_lsn,
|
||||
},
|
||||
unmapped_keyspace.clone(),
|
||||
lsn_range,
|
||||
);
|
||||
}
|
||||
None => {
|
||||
@@ -3010,43 +2805,30 @@ impl Timeline {
|
||||
.into_iter()
|
||||
.map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
|
||||
(
|
||||
ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
|
||||
ReadableLayerDesc::Persistent {
|
||||
desc: (*layer).clone(),
|
||||
lsn_range: lsn_floor..cont_lsn,
|
||||
},
|
||||
keyspace_accum.to_keyspace(),
|
||||
lsn_floor..cont_lsn,
|
||||
)
|
||||
})
|
||||
.for_each(|(layer, keyspace, lsn_range)| {
|
||||
fringe.update(layer, keyspace, lsn_range)
|
||||
});
|
||||
.for_each(|(layer, keyspace)| fringe.update(layer, keyspace));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// It's safe to drop the layer map lock after planning the next round of reads.
|
||||
// The fringe keeps readable handles for the layers which are safe to read even
|
||||
// if layers were compacted or flushed.
|
||||
//
|
||||
// The more interesting consideration is: "Why is the read algorithm still correct
|
||||
// if the layer map changes while it is operating?". Doing a vectored read on a
|
||||
// timeline boils down to pushing an imaginary lsn boundary downwards for each range
|
||||
// covered by the read. The layer map tells us how to move the lsn downwards for a
|
||||
// range at *a particular point in time*. It is fine for the answer to be different
|
||||
// at two different time points.
|
||||
drop(guard);
|
||||
|
||||
if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
|
||||
let next_cont_lsn = lsn_range.start;
|
||||
if let Some((layer_to_read, keyspace_to_read)) = fringe.next_layer() {
|
||||
layer_to_read
|
||||
.get_values_reconstruct_data(
|
||||
&guard,
|
||||
keyspace_to_read.clone(),
|
||||
lsn_range,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
unmapped_keyspace = keyspace_to_read;
|
||||
cont_lsn = next_cont_lsn;
|
||||
cont_lsn = layer_to_read.get_lsn_floor();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
@@ -3124,7 +2906,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
ancestor
|
||||
.wait_lsn(self.ancestor_lsn, WaitLsnWaiter::Timeline(self), ctx)
|
||||
.wait_lsn(self.ancestor_lsn, ctx)
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
|
||||
@@ -3204,11 +2986,16 @@ impl Timeline {
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = self.cancel.cancelled() => {
|
||||
info!("shutting down layer flush task due to Timeline::cancel");
|
||||
info!("shutting down layer flush task");
|
||||
break;
|
||||
},
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
info!("shutting down layer flush task");
|
||||
break;
|
||||
},
|
||||
_ = layer_flush_start_rx.changed() => {}
|
||||
}
|
||||
|
||||
trace!("waking up");
|
||||
let flush_counter = *layer_flush_start_rx.borrow();
|
||||
let result = loop {
|
||||
@@ -3584,24 +3371,6 @@ impl Timeline {
|
||||
|
||||
// Is it time to create a new image layer for the given partition?
|
||||
async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
|
||||
let last = self.last_image_layer_creation_check_at.load();
|
||||
if lsn != Lsn(0) {
|
||||
let distance = lsn
|
||||
.checked_sub(last)
|
||||
.expect("Attempt to compact with LSN going backwards");
|
||||
|
||||
let min_distance = self.get_image_layer_creation_check_threshold() as u64
|
||||
* self.get_checkpoint_distance();
|
||||
|
||||
// Skip the expensive delta layer counting below if we've not ingested
|
||||
// sufficient WAL since the last check.
|
||||
if distance.0 < min_distance {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
self.last_image_layer_creation_check_at.store(lsn);
|
||||
|
||||
let threshold = self.get_image_creation_threshold();
|
||||
|
||||
let guard = self.layers.read().await;
|
||||
@@ -3943,24 +3712,6 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Schedules the uploads of the given image layers
|
||||
fn upload_new_image_layers(
|
||||
self: &Arc<Self>,
|
||||
new_images: impl IntoIterator<Item = ResidentLayer>,
|
||||
) -> anyhow::Result<()> {
|
||||
let Some(remote_client) = &self.remote_client else {
|
||||
return Ok(());
|
||||
};
|
||||
for layer in new_images {
|
||||
remote_client.schedule_layer_file_upload(layer)?;
|
||||
}
|
||||
// should any new image layer been created, not uploading index_part will
|
||||
// result in a mismatch between remote_physical_size and layermap calculated
|
||||
// size, which will fail some tests, but should not be an issue otherwise.
|
||||
remote_client.schedule_index_upload_for_file_changes()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Update information about which layer files need to be retained on
|
||||
/// garbage collection. This is separate from actually performing the GC,
|
||||
/// and is updated more frequently, so that compaction can remove obsolete
|
||||
@@ -4700,6 +4451,52 @@ impl<'a> TimelineWriter<'a> {
|
||||
res
|
||||
}
|
||||
|
||||
/// "Tick" the timeline writer: it will roll the open layer if required
|
||||
/// and do nothing else.
|
||||
pub(crate) async fn tick(&mut self) -> anyhow::Result<()> {
|
||||
self.open_layer_if_present().await?;
|
||||
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
let action = self.get_open_layer_action(last_record_lsn, 0);
|
||||
if action == OpenLayerAction::Roll {
|
||||
self.roll_layer(last_record_lsn).await?;
|
||||
} else if let Some(writer_state) = &mut *self.write_guard {
|
||||
// Periodic update of statistics
|
||||
writer_state.open_layer.tick().await;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Populate the timeline writer state only if an in-memory layer
|
||||
/// is already open.
|
||||
async fn open_layer_if_present(&mut self) -> anyhow::Result<()> {
|
||||
assert!(self.write_guard.is_none());
|
||||
|
||||
let open_layer = {
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
match layers.open_layer {
|
||||
Some(ref open_layer) => open_layer.clone(),
|
||||
None => {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let initial_size = open_layer.size().await?;
|
||||
let last_freeze_at = self.last_freeze_at.load();
|
||||
let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
|
||||
self.write_guard.replace(TimelineWriterState::new(
|
||||
open_layer,
|
||||
initial_size,
|
||||
last_freeze_at,
|
||||
last_freeze_ts,
|
||||
));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_open_layer_action(
|
||||
&mut self,
|
||||
at: Lsn,
|
||||
@@ -4771,14 +4568,43 @@ impl<'a> TimelineWriter<'a> {
|
||||
return OpenLayerAction::None;
|
||||
}
|
||||
|
||||
if self.tl.should_roll(
|
||||
state.current_size,
|
||||
state.current_size + new_value_size,
|
||||
self.get_checkpoint_distance(),
|
||||
lsn,
|
||||
state.cached_last_freeze_at,
|
||||
state.cached_last_freeze_ts,
|
||||
) {
|
||||
let distance = lsn.widening_sub(state.cached_last_freeze_at);
|
||||
let proposed_open_layer_size = state.current_size + new_value_size;
|
||||
|
||||
// Rolling the open layer can be triggered by:
|
||||
// 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that
|
||||
// the safekeepers need to store. For sharded tenants, we multiply by shard count to
|
||||
// account for how writes are distributed across shards: we expect each node to consume
|
||||
// 1/count of the LSN on average.
|
||||
// 2. The size of the currently open layer.
|
||||
// 3. The time since the last roll. It helps safekeepers to regard pageserver as caught
|
||||
// up and suspend activity.
|
||||
if distance
|
||||
>= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128
|
||||
{
|
||||
info!(
|
||||
"Will roll layer at {} with layer size {} due to LSN distance ({})",
|
||||
lsn, state.current_size, distance
|
||||
);
|
||||
|
||||
OpenLayerAction::Roll
|
||||
} else if proposed_open_layer_size >= self.get_checkpoint_distance() {
|
||||
info!(
|
||||
"Will roll layer at {} with layer size {} due to layer size ({})",
|
||||
lsn, state.current_size, proposed_open_layer_size
|
||||
);
|
||||
|
||||
OpenLayerAction::Roll
|
||||
} else if distance > 0
|
||||
&& state.cached_last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()
|
||||
{
|
||||
info!(
|
||||
"Will roll layer at {} with layer size {} due to time since last flush ({:?})",
|
||||
lsn,
|
||||
state.current_size,
|
||||
state.cached_last_freeze_ts.elapsed()
|
||||
);
|
||||
|
||||
OpenLayerAction::Roll
|
||||
} else {
|
||||
OpenLayerAction::None
|
||||
|
||||
@@ -125,8 +125,18 @@ impl Timeline {
|
||||
)
|
||||
.await
|
||||
.map_err(anyhow::Error::from)?;
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
for layer in layers {
|
||||
remote_client.schedule_layer_file_upload(layer)?;
|
||||
}
|
||||
}
|
||||
|
||||
self.upload_new_image_layers(layers)?;
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
// should any new image layer been created, not uploading index_part will
|
||||
// result in a mismatch between remote_physical_size and layermap calculated
|
||||
// size, which will fail some tests, but should not be an issue otherwise.
|
||||
remote_client.schedule_index_upload_for_file_changes()?;
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
// no partitioning? This is normal, if the timeline was just created
|
||||
@@ -808,10 +818,7 @@ impl TimelineAdaptor {
|
||||
self.timeline
|
||||
.finish_compact_batch(&self.new_deltas, &self.new_images, &layers_to_delete)
|
||||
.await?;
|
||||
|
||||
self.timeline
|
||||
.upload_new_image_layers(std::mem::take(&mut self.new_images))?;
|
||||
|
||||
self.new_images.clear();
|
||||
self.new_deltas.clear();
|
||||
self.layers_to_delete.clear();
|
||||
Ok(())
|
||||
|
||||
@@ -6,7 +6,7 @@ use std::{
|
||||
use anyhow::Context;
|
||||
use pageserver_api::{models::TimelineState, shard::TenantShardId};
|
||||
use tokio::sync::OwnedMutexGuard;
|
||||
use tracing::{error, info, instrument, Instrument};
|
||||
use tracing::{debug, error, info, instrument, Instrument};
|
||||
use utils::{crashsafe, fs_ext, id::TimelineId};
|
||||
|
||||
use crate::{
|
||||
@@ -14,6 +14,7 @@ use crate::{
|
||||
deletion_queue::DeletionQueueClient,
|
||||
task_mgr::{self, TaskKind},
|
||||
tenant::{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id,
|
||||
metadata::TimelineMetadata,
|
||||
remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
|
||||
CreateTimelineCause, DeleteTimelineError, Tenant,
|
||||
@@ -22,6 +23,58 @@ use crate::{
|
||||
|
||||
use super::{Timeline, TimelineResources};
|
||||
|
||||
/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
|
||||
async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
// Notify any timeline work to drop out of loops/requests
|
||||
tracing::debug!("Cancelling CancellationToken");
|
||||
timeline.cancel.cancel();
|
||||
|
||||
// Stop the walreceiver first.
|
||||
debug!("waiting for wal receiver to shutdown");
|
||||
let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
|
||||
if let Some(walreceiver) = maybe_started_walreceiver {
|
||||
walreceiver.stop().await;
|
||||
}
|
||||
debug!("wal receiver shutdown confirmed");
|
||||
|
||||
// Shut down the layer flush task before the remote client, as one depends on the other
|
||||
task_mgr::shutdown_tasks(
|
||||
Some(TaskKind::LayerFlushTask),
|
||||
Some(timeline.tenant_shard_id),
|
||||
Some(timeline.timeline_id),
|
||||
)
|
||||
.await;
|
||||
|
||||
// Prevent new uploads from starting.
|
||||
if let Some(remote_client) = timeline.remote_client.as_ref() {
|
||||
remote_client.stop();
|
||||
}
|
||||
|
||||
// Stop & wait for the remaining timeline tasks, including upload tasks.
|
||||
// NB: This and other delete_timeline calls do not run as a task_mgr task,
|
||||
// so, they are not affected by this shutdown_tasks() call.
|
||||
info!("waiting for timeline tasks to shutdown");
|
||||
task_mgr::shutdown_tasks(
|
||||
None,
|
||||
Some(timeline.tenant_shard_id),
|
||||
Some(timeline.timeline_id),
|
||||
)
|
||||
.await;
|
||||
|
||||
fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: timeline-delete-before-index-deleted-at"
|
||||
))?
|
||||
});
|
||||
|
||||
tracing::debug!("Waiting for gate...");
|
||||
timeline.gate.close().await;
|
||||
tracing::debug!("Shutdown complete");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Mark timeline as deleted in S3 so we won't pick it up next time
|
||||
/// during attach or pageserver restart.
|
||||
/// See comment in persist_index_part_with_deleted_flag.
|
||||
@@ -215,14 +268,7 @@ impl DeleteTimelineFlow {
|
||||
|
||||
guard.mark_in_progress()?;
|
||||
|
||||
// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
|
||||
timeline.shutdown(super::ShutdownMode::Hard).await;
|
||||
|
||||
fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: timeline-delete-before-index-deleted-at"
|
||||
))?
|
||||
});
|
||||
stop_tasks(&timeline).await?;
|
||||
|
||||
set_deleted_in_remote_index(&timeline).await?;
|
||||
|
||||
|
||||
@@ -51,7 +51,6 @@ pub struct EvictionTaskTenantState {
|
||||
impl Timeline {
|
||||
pub(super) fn launch_eviction_task(
|
||||
self: &Arc<Self>,
|
||||
parent: Arc<Tenant>,
|
||||
background_tasks_can_start: Option<&completion::Barrier>,
|
||||
) {
|
||||
let self_clone = Arc::clone(self);
|
||||
@@ -67,19 +66,20 @@ impl Timeline {
|
||||
),
|
||||
false,
|
||||
async move {
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
tokio::select! {
|
||||
_ = self_clone.cancel.cancelled() => { return Ok(()); }
|
||||
_ = cancel.cancelled() => { return Ok(()); }
|
||||
_ = completion::Barrier::maybe_wait(background_tasks_can_start) => {}
|
||||
};
|
||||
|
||||
self_clone.eviction_task(parent).await;
|
||||
self_clone.eviction_task(cancel).await;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
|
||||
async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>) {
|
||||
async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
|
||||
use crate::tenant::tasks::random_init_delay;
|
||||
|
||||
// acquire the gate guard only once within a useful span
|
||||
@@ -94,7 +94,7 @@ impl Timeline {
|
||||
EvictionPolicy::OnlyImitiate(lat) => lat.period,
|
||||
EvictionPolicy::NoEviction => Duration::from_secs(10),
|
||||
};
|
||||
if random_init_delay(period, &self.cancel).await.is_err() {
|
||||
if random_init_delay(period, &cancel).await.is_err() {
|
||||
return;
|
||||
}
|
||||
}
|
||||
@@ -103,13 +103,13 @@ impl Timeline {
|
||||
loop {
|
||||
let policy = self.get_eviction_policy();
|
||||
let cf = self
|
||||
.eviction_iteration(&tenant, &policy, &self.cancel, &guard, &ctx)
|
||||
.eviction_iteration(&policy, &cancel, &guard, &ctx)
|
||||
.await;
|
||||
|
||||
match cf {
|
||||
ControlFlow::Break(()) => break,
|
||||
ControlFlow::Continue(sleep_until) => {
|
||||
if tokio::time::timeout_at(sleep_until, self.cancel.cancelled())
|
||||
if tokio::time::timeout_at(sleep_until, cancel.cancelled())
|
||||
.await
|
||||
.is_ok()
|
||||
{
|
||||
@@ -123,7 +123,6 @@ impl Timeline {
|
||||
#[instrument(skip_all, fields(policy_kind = policy.discriminant_str()))]
|
||||
async fn eviction_iteration(
|
||||
self: &Arc<Self>,
|
||||
tenant: &Tenant,
|
||||
policy: &EvictionPolicy,
|
||||
cancel: &CancellationToken,
|
||||
gate: &GateGuard,
|
||||
@@ -138,7 +137,7 @@ impl Timeline {
|
||||
}
|
||||
EvictionPolicy::LayerAccessThreshold(p) => {
|
||||
match self
|
||||
.eviction_iteration_threshold(tenant, p, cancel, gate, ctx)
|
||||
.eviction_iteration_threshold(p, cancel, gate, ctx)
|
||||
.await
|
||||
{
|
||||
ControlFlow::Break(()) => return ControlFlow::Break(()),
|
||||
@@ -147,11 +146,7 @@ impl Timeline {
|
||||
(p.period, p.threshold)
|
||||
}
|
||||
EvictionPolicy::OnlyImitiate(p) => {
|
||||
if self
|
||||
.imitiate_only(tenant, p, cancel, gate, ctx)
|
||||
.await
|
||||
.is_break()
|
||||
{
|
||||
if self.imitiate_only(p, cancel, gate, ctx).await.is_break() {
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
(p.period, p.threshold)
|
||||
@@ -180,7 +175,6 @@ impl Timeline {
|
||||
|
||||
async fn eviction_iteration_threshold(
|
||||
self: &Arc<Self>,
|
||||
tenant: &Tenant,
|
||||
p: &EvictionPolicyLayerAccessThreshold,
|
||||
cancel: &CancellationToken,
|
||||
gate: &GateGuard,
|
||||
@@ -199,10 +193,7 @@ impl Timeline {
|
||||
_ = self.cancel.cancelled() => return ControlFlow::Break(()),
|
||||
};
|
||||
|
||||
match self
|
||||
.imitate_layer_accesses(tenant, p, cancel, gate, ctx)
|
||||
.await
|
||||
{
|
||||
match self.imitate_layer_accesses(p, cancel, gate, ctx).await {
|
||||
ControlFlow::Break(()) => return ControlFlow::Break(()),
|
||||
ControlFlow::Continue(()) => (),
|
||||
}
|
||||
@@ -324,7 +315,6 @@ impl Timeline {
|
||||
/// disk usage based eviction task.
|
||||
async fn imitiate_only(
|
||||
self: &Arc<Self>,
|
||||
tenant: &Tenant,
|
||||
p: &EvictionPolicyLayerAccessThreshold,
|
||||
cancel: &CancellationToken,
|
||||
gate: &GateGuard,
|
||||
@@ -341,8 +331,7 @@ impl Timeline {
|
||||
_ = self.cancel.cancelled() => return ControlFlow::Break(()),
|
||||
};
|
||||
|
||||
self.imitate_layer_accesses(tenant, p, cancel, gate, ctx)
|
||||
.await
|
||||
self.imitate_layer_accesses(p, cancel, gate, ctx).await
|
||||
}
|
||||
|
||||
/// If we evict layers but keep cached values derived from those layers, then
|
||||
@@ -372,7 +361,6 @@ impl Timeline {
|
||||
#[instrument(skip_all)]
|
||||
async fn imitate_layer_accesses(
|
||||
&self,
|
||||
tenant: &Tenant,
|
||||
p: &EvictionPolicyLayerAccessThreshold,
|
||||
cancel: &CancellationToken,
|
||||
gate: &GateGuard,
|
||||
@@ -408,11 +396,17 @@ impl Timeline {
|
||||
// Make one of the tenant's timelines draw the short straw and run the calculation.
|
||||
// The others wait until the calculation is done so that they take into account the
|
||||
// imitated accesses that the winner made.
|
||||
let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id, true) {
|
||||
Ok(t) => t,
|
||||
Err(_) => {
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
};
|
||||
let mut state = tenant.eviction_task_tenant_state.lock().await;
|
||||
match state.last_layer_access_imitation {
|
||||
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
|
||||
_ => {
|
||||
self.imitate_synthetic_size_calculation_worker(tenant, cancel, ctx)
|
||||
self.imitate_synthetic_size_calculation_worker(&tenant, cancel, ctx)
|
||||
.await;
|
||||
state.last_layer_access_imitation = Some(tokio::time::Instant::now());
|
||||
}
|
||||
@@ -486,7 +480,7 @@ impl Timeline {
|
||||
#[instrument(skip_all)]
|
||||
async fn imitate_synthetic_size_calculation_worker(
|
||||
&self,
|
||||
tenant: &Tenant,
|
||||
tenant: &Arc<Tenant>,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
|
||||
@@ -86,7 +86,6 @@ impl<'t> UninitializedTimeline<'t> {
|
||||
/// Prepares timeline data by loading it from the basebackup archive.
|
||||
pub(crate) async fn import_basebackup_from_tar(
|
||||
self,
|
||||
tenant: Arc<Tenant>,
|
||||
copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
|
||||
base_lsn: Lsn,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
@@ -115,7 +114,7 @@ impl<'t> UninitializedTimeline<'t> {
|
||||
|
||||
// All the data has been imported. Insert the Timeline into the tenant's timelines map
|
||||
let tl = self.finish_creation()?;
|
||||
tl.activate(tenant, broker_client, None, ctx);
|
||||
tl.activate(broker_client, None, ctx);
|
||||
Ok(tl)
|
||||
}
|
||||
|
||||
|
||||
@@ -24,21 +24,26 @@ mod connection_manager;
|
||||
mod walreceiver_connection;
|
||||
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME};
|
||||
use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME};
|
||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::timeline::walreceiver::connection_manager::{
|
||||
connection_manager_loop_step, ConnectionManagerState,
|
||||
};
|
||||
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::future::Future;
|
||||
use std::num::NonZeroU64;
|
||||
use std::ops::ControlFlow;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::select;
|
||||
use tokio::sync::watch;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use self::connection_manager::ConnectionManagerStatus;
|
||||
|
||||
use super::Timeline;
|
||||
@@ -57,10 +62,9 @@ pub struct WalReceiverConf {
|
||||
}
|
||||
|
||||
pub struct WalReceiver {
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
manager_status: Arc<std::sync::RwLock<Option<ConnectionManagerStatus>>>,
|
||||
/// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token.
|
||||
/// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`.
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
impl WalReceiver {
|
||||
@@ -74,58 +78,65 @@ impl WalReceiver {
|
||||
let timeline_id = timeline.timeline_id;
|
||||
let walreceiver_ctx =
|
||||
ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
|
||||
|
||||
let loop_status = Arc::new(std::sync::RwLock::new(None));
|
||||
let manager_status = Arc::clone(&loop_status);
|
||||
let cancel = timeline.cancel.child_token();
|
||||
WALRECEIVER_RUNTIME.spawn({
|
||||
let cancel = cancel.clone();
|
||||
task_mgr::spawn(
|
||||
WALRECEIVER_RUNTIME.handle(),
|
||||
TaskKind::WalReceiverManager,
|
||||
Some(timeline.tenant_shard_id),
|
||||
Some(timeline_id),
|
||||
&format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"),
|
||||
false,
|
||||
async move {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
// acquire timeline gate so we know the task doesn't outlive the Timeline
|
||||
let Ok(_guard) = timeline.gate.enter() else {
|
||||
debug!("WAL receiver manager could not enter the gate timeline gate, it's closed already");
|
||||
return;
|
||||
};
|
||||
debug!("WAL receiver manager started, connecting to broker");
|
||||
let mut connection_manager_state = ConnectionManagerState::new(
|
||||
timeline,
|
||||
conf,
|
||||
cancel.clone(),
|
||||
);
|
||||
while !cancel.is_cancelled() {
|
||||
let loop_step_result = connection_manager_loop_step(
|
||||
&mut broker_client,
|
||||
&mut connection_manager_state,
|
||||
&walreceiver_ctx,
|
||||
&cancel,
|
||||
&loop_status,
|
||||
).await;
|
||||
match loop_step_result {
|
||||
Ok(()) => continue,
|
||||
Err(_cancelled) => {
|
||||
trace!("Connection manager loop ended, shutting down");
|
||||
loop {
|
||||
select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
trace!("WAL receiver shutdown requested, shutting down");
|
||||
break;
|
||||
}
|
||||
},
|
||||
loop_step_result = connection_manager_loop_step(
|
||||
&mut broker_client,
|
||||
&mut connection_manager_state,
|
||||
&walreceiver_ctx,
|
||||
&loop_status,
|
||||
) => match loop_step_result {
|
||||
ControlFlow::Continue(()) => continue,
|
||||
ControlFlow::Break(()) => {
|
||||
trace!("Connection manager loop ended, shutting down");
|
||||
break;
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
connection_manager_state.shutdown().await;
|
||||
*loop_status.write().unwrap() = None;
|
||||
debug!("task exits");
|
||||
Ok(())
|
||||
}
|
||||
.instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id))
|
||||
});
|
||||
);
|
||||
|
||||
Self {
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
manager_status,
|
||||
cancel,
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(skip_all, level = tracing::Level::DEBUG)]
|
||||
pub fn cancel(&self) {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
debug!("cancelling walreceiver tasks");
|
||||
self.cancel.cancel();
|
||||
pub async fn stop(self) {
|
||||
task_mgr::shutdown_tasks(
|
||||
Some(TaskKind::WalReceiverManager),
|
||||
Some(self.tenant_shard_id),
|
||||
Some(self.timeline_id),
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
pub(crate) fn status(&self) -> Option<ConnectionManagerStatus> {
|
||||
@@ -159,18 +170,14 @@ enum TaskStateUpdate<E> {
|
||||
|
||||
impl<E: Clone> TaskHandle<E> {
|
||||
/// Initializes the task, starting it immediately after the creation.
|
||||
///
|
||||
/// The second argument to `task` is a child token of `cancel_parent` ([`CancellationToken::child_token`]).
|
||||
/// It being a child token enables us to provide a [`Self::shutdown`] method.
|
||||
fn spawn<Fut>(
|
||||
cancel_parent: &CancellationToken,
|
||||
task: impl FnOnce(watch::Sender<TaskStateUpdate<E>>, CancellationToken) -> Fut + Send + 'static,
|
||||
) -> Self
|
||||
where
|
||||
Fut: Future<Output = anyhow::Result<()>> + Send,
|
||||
E: Send + Sync + 'static,
|
||||
{
|
||||
let cancellation = cancel_parent.child_token();
|
||||
let cancellation = CancellationToken::new();
|
||||
let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started);
|
||||
|
||||
let cancellation_clone = cancellation.clone();
|
||||
@@ -190,9 +197,6 @@ impl<E: Clone> TaskHandle<E> {
|
||||
}
|
||||
}
|
||||
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// Cancellation-safe.
|
||||
async fn next_task_event(&mut self) -> TaskEvent<E> {
|
||||
match self.events_receiver.changed().await {
|
||||
Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()),
|
||||
|
||||
@@ -17,7 +17,7 @@ use crate::metrics::{
|
||||
WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
|
||||
WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
|
||||
};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::task_mgr::{shutdown_token, TaskKind};
|
||||
use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline};
|
||||
use anyhow::Context;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
@@ -27,7 +27,7 @@ use storage_broker::proto::SafekeeperTimelineInfo;
|
||||
use storage_broker::proto::SubscribeSafekeeperInfoRequest;
|
||||
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
|
||||
use storage_broker::{BrokerClientChannel, Code, Streaming};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tokio::select;
|
||||
use tracing::*;
|
||||
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
@@ -45,33 +45,27 @@ use super::{
|
||||
TaskEvent, TaskHandle,
|
||||
};
|
||||
|
||||
pub(crate) struct Cancelled;
|
||||
|
||||
/// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker.
|
||||
/// Based on the updates, desides whether to start, keep or stop a WAL receiver task.
|
||||
/// If storage broker subscription is cancelled, exits.
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// Not cancellation-safe. Use `cancel` token to request cancellation.
|
||||
pub(super) async fn connection_manager_loop_step(
|
||||
broker_client: &mut BrokerClientChannel,
|
||||
connection_manager_state: &mut ConnectionManagerState,
|
||||
ctx: &RequestContext,
|
||||
cancel: &CancellationToken,
|
||||
manager_status: &std::sync::RwLock<Option<ConnectionManagerStatus>>,
|
||||
) -> Result<(), Cancelled> {
|
||||
match tokio::select! {
|
||||
_ = cancel.cancelled() => { return Err(Cancelled); },
|
||||
st = connection_manager_state.timeline.wait_to_become_active(ctx) => { st }
|
||||
} {
|
||||
) -> ControlFlow<(), ()> {
|
||||
match connection_manager_state
|
||||
.timeline
|
||||
.wait_to_become_active(ctx)
|
||||
.await
|
||||
{
|
||||
Ok(()) => {}
|
||||
Err(new_state) => {
|
||||
debug!(
|
||||
?new_state,
|
||||
"state changed, stopping wal connection manager loop"
|
||||
);
|
||||
return Err(Cancelled);
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -92,7 +86,7 @@ pub(super) async fn connection_manager_loop_step(
|
||||
// Subscribe to the broker updates. Stream shares underlying TCP connection
|
||||
// with other streams on this client (other connection managers). When
|
||||
// object goes out of scope, stream finishes in drop() automatically.
|
||||
let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
|
||||
let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await;
|
||||
debug!("Subscribed for broker timeline updates");
|
||||
|
||||
loop {
|
||||
@@ -100,7 +94,6 @@ pub(super) async fn connection_manager_loop_step(
|
||||
|
||||
// These things are happening concurrently:
|
||||
//
|
||||
// - cancellation request
|
||||
// - keep receiving WAL on the current connection
|
||||
// - if the shared state says we need to change connection, disconnect and return
|
||||
// - this runs in a separate task and we receive updates via a watch channel
|
||||
@@ -108,11 +101,7 @@ pub(super) async fn connection_manager_loop_step(
|
||||
// - receive updates from broker
|
||||
// - this might change the current desired connection
|
||||
// - timeline state changes to something that does not allow walreceiver to run concurrently
|
||||
|
||||
// NB: make sure each of the select expressions are cancellation-safe
|
||||
// (no need for arms to be cancellation-safe).
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Err(Cancelled); }
|
||||
select! {
|
||||
Some(wal_connection_update) = async {
|
||||
match connection_manager_state.wal_connection.as_mut() {
|
||||
Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await),
|
||||
@@ -144,7 +133,7 @@ pub(super) async fn connection_manager_loop_step(
|
||||
},
|
||||
|
||||
// Got a new update from the broker
|
||||
broker_update = broker_subscription.message() /* TODO: review cancellation-safety */ => {
|
||||
broker_update = broker_subscription.message() => {
|
||||
match broker_update {
|
||||
Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
|
||||
Err(status) => {
|
||||
@@ -158,17 +147,16 @@ pub(super) async fn connection_manager_loop_step(
|
||||
warn!("broker subscription failed: {status}");
|
||||
}
|
||||
}
|
||||
return Ok(());
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
Ok(None) => {
|
||||
error!("broker subscription stream ended"); // can't happen
|
||||
return Ok(());
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
new_event = async {
|
||||
// Reminder: this match arm needs to be cancellation-safe.
|
||||
loop {
|
||||
if connection_manager_state.timeline.current_state() == TimelineState::Loading {
|
||||
warn!("wal connection manager should only be launched after timeline has become active");
|
||||
@@ -194,11 +182,11 @@ pub(super) async fn connection_manager_loop_step(
|
||||
}
|
||||
} => match new_event {
|
||||
ControlFlow::Continue(()) => {
|
||||
return Ok(());
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
ControlFlow::Break(()) => {
|
||||
debug!("Timeline is no longer active, stopping wal connection manager loop");
|
||||
return Err(Cancelled);
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
},
|
||||
|
||||
@@ -230,15 +218,16 @@ pub(super) async fn connection_manager_loop_step(
|
||||
async fn subscribe_for_timeline_updates(
|
||||
broker_client: &mut BrokerClientChannel,
|
||||
id: TenantTimelineId,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Streaming<SafekeeperTimelineInfo>, Cancelled> {
|
||||
) -> Streaming<SafekeeperTimelineInfo> {
|
||||
let mut attempt = 0;
|
||||
let cancel = shutdown_token();
|
||||
|
||||
loop {
|
||||
exponential_backoff(
|
||||
attempt,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
cancel,
|
||||
&cancel,
|
||||
)
|
||||
.await;
|
||||
attempt += 1;
|
||||
@@ -252,14 +241,9 @@ async fn subscribe_for_timeline_updates(
|
||||
subscription_key: Some(key),
|
||||
};
|
||||
|
||||
match {
|
||||
tokio::select! {
|
||||
r = broker_client.subscribe_safekeeper_info(request) => { r }
|
||||
_ = cancel.cancelled() => { return Err(Cancelled); }
|
||||
}
|
||||
} {
|
||||
match broker_client.subscribe_safekeeper_info(request).await {
|
||||
Ok(resp) => {
|
||||
return Ok(resp.into_inner());
|
||||
return resp.into_inner();
|
||||
}
|
||||
Err(e) => {
|
||||
// Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and
|
||||
@@ -280,8 +264,6 @@ pub(super) struct ConnectionManagerState {
|
||||
id: TenantTimelineId,
|
||||
/// Use pageserver data about the timeline to filter out some of the safekeepers.
|
||||
timeline: Arc<Timeline>,
|
||||
/// Child token of [`super::WalReceiver::cancel`], inherited to all tasks we spawn.
|
||||
cancel: CancellationToken,
|
||||
conf: WalReceiverConf,
|
||||
/// Current connection to safekeeper for WAL streaming.
|
||||
wal_connection: Option<WalConnection>,
|
||||
@@ -404,11 +386,7 @@ struct BrokerSkTimeline {
|
||||
}
|
||||
|
||||
impl ConnectionManagerState {
|
||||
pub(super) fn new(
|
||||
timeline: Arc<Timeline>,
|
||||
conf: WalReceiverConf,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
pub(super) fn new(timeline: Arc<Timeline>, conf: WalReceiverConf) -> Self {
|
||||
let id = TenantTimelineId {
|
||||
tenant_id: timeline.tenant_shard_id.tenant_id,
|
||||
timeline_id: timeline.timeline_id,
|
||||
@@ -416,7 +394,6 @@ impl ConnectionManagerState {
|
||||
Self {
|
||||
id,
|
||||
timeline,
|
||||
cancel,
|
||||
conf,
|
||||
wal_connection: None,
|
||||
wal_stream_candidates: HashMap::new(),
|
||||
@@ -424,22 +401,6 @@ impl ConnectionManagerState {
|
||||
}
|
||||
}
|
||||
|
||||
fn spawn<Fut>(
|
||||
&self,
|
||||
task: impl FnOnce(
|
||||
tokio::sync::watch::Sender<TaskStateUpdate<WalConnectionStatus>>,
|
||||
CancellationToken,
|
||||
) -> Fut
|
||||
+ Send
|
||||
+ 'static,
|
||||
) -> TaskHandle<WalConnectionStatus>
|
||||
where
|
||||
Fut: std::future::Future<Output = anyhow::Result<()>> + Send,
|
||||
{
|
||||
// TODO: get rid of TaskHandle
|
||||
super::TaskHandle::spawn(&self.cancel, task)
|
||||
}
|
||||
|
||||
/// Shuts down the current connection (if any) and immediately starts another one with the given connection string.
|
||||
async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) {
|
||||
WALRECEIVER_SWITCHES
|
||||
@@ -458,7 +419,7 @@ impl ConnectionManagerState {
|
||||
);
|
||||
|
||||
let span = info_span!("connection", %node_id);
|
||||
let connection_handle = self.spawn(move |events_sender, cancellation| {
|
||||
let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| {
|
||||
async move {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
@@ -486,12 +447,6 @@ impl ConnectionManagerState {
|
||||
info!("walreceiver connection handling ended: {e}");
|
||||
Ok(())
|
||||
}
|
||||
WalReceiverError::ClosedGate => {
|
||||
info!(
|
||||
"walreceiver connection handling ended because of closed gate"
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
WalReceiverError::Other(e) => {
|
||||
// give out an error to have task_mgr give it a really verbose logging
|
||||
if cancellation.is_cancelled() {
|
||||
@@ -531,10 +486,6 @@ impl ConnectionManagerState {
|
||||
|
||||
/// Drops the current connection (if any) and updates retry timeout for the next
|
||||
/// connection attempt to the same safekeeper.
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// Not cancellation-safe.
|
||||
async fn drop_old_connection(&mut self, needs_shutdown: bool) {
|
||||
let wal_connection = match self.wal_connection.take() {
|
||||
Some(wal_connection) => wal_connection,
|
||||
@@ -542,14 +493,7 @@ impl ConnectionManagerState {
|
||||
};
|
||||
|
||||
if needs_shutdown {
|
||||
wal_connection
|
||||
.connection_task
|
||||
.shutdown()
|
||||
// This here is why this function isn't cancellation-safe.
|
||||
// If we got cancelled here, then self.wal_connection is already None and we lose track of the task.
|
||||
// Even if our caller diligently calls Self::shutdown(), it will find a self.wal_connection=None
|
||||
// and thus be ineffective.
|
||||
.await;
|
||||
wal_connection.connection_task.shutdown().await;
|
||||
}
|
||||
|
||||
let retry = self
|
||||
@@ -894,9 +838,6 @@ impl ConnectionManagerState {
|
||||
}
|
||||
}
|
||||
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// Not cancellation-safe.
|
||||
pub(super) async fn shutdown(mut self) {
|
||||
if let Some(wal_connection) = self.wal_connection.take() {
|
||||
wal_connection.connection_task.shutdown().await;
|
||||
@@ -1045,7 +986,7 @@ mod tests {
|
||||
sk_id: connected_sk_id,
|
||||
availability_zone: None,
|
||||
status: connection_status,
|
||||
connection_task: state.spawn(move |sender, _| async move {
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
sender
|
||||
.send(TaskStateUpdate::Progress(connection_status))
|
||||
.ok();
|
||||
@@ -1213,7 +1154,7 @@ mod tests {
|
||||
sk_id: connected_sk_id,
|
||||
availability_zone: None,
|
||||
status: connection_status,
|
||||
connection_task: state.spawn(move |sender, _| async move {
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
sender
|
||||
.send(TaskStateUpdate::Progress(connection_status))
|
||||
.ok();
|
||||
@@ -1280,7 +1221,7 @@ mod tests {
|
||||
sk_id: NodeId(1),
|
||||
availability_zone: None,
|
||||
status: connection_status,
|
||||
connection_task: state.spawn(move |sender, _| async move {
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
sender
|
||||
.send(TaskStateUpdate::Progress(connection_status))
|
||||
.ok();
|
||||
@@ -1344,7 +1285,7 @@ mod tests {
|
||||
sk_id: NodeId(1),
|
||||
availability_zone: None,
|
||||
status: connection_status,
|
||||
connection_task: state.spawn(move |_, _| async move { Ok(()) }),
|
||||
connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }),
|
||||
discovered_new_wal: Some(NewCommittedWAL {
|
||||
discovered_at: time_over_threshold,
|
||||
lsn: new_lsn,
|
||||
@@ -1400,7 +1341,6 @@ mod tests {
|
||||
timeline_id: TIMELINE_ID,
|
||||
},
|
||||
timeline,
|
||||
cancel: CancellationToken::new(),
|
||||
conf: WalReceiverConf {
|
||||
wal_connect_timeout: Duration::from_secs(1),
|
||||
lagging_wal_timeout: Duration::from_secs(1),
|
||||
@@ -1444,7 +1384,7 @@ mod tests {
|
||||
sk_id: connected_sk_id,
|
||||
availability_zone: None,
|
||||
status: connection_status,
|
||||
connection_task: state.spawn(move |sender, _| async move {
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
sender
|
||||
.send(TaskStateUpdate::Progress(connection_status))
|
||||
.ok();
|
||||
|
||||
@@ -27,6 +27,7 @@ use super::TaskStateUpdate;
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
|
||||
task_mgr,
|
||||
task_mgr::TaskKind,
|
||||
task_mgr::WALRECEIVER_RUNTIME,
|
||||
tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
|
||||
@@ -36,8 +37,8 @@ use crate::{
|
||||
use postgres_backend::is_expected_io_error;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
use utils::pageserver_feedback::PageserverFeedback;
|
||||
use utils::{id::NodeId, lsn::Lsn};
|
||||
use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError};
|
||||
|
||||
/// Status of the connection.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
@@ -67,7 +68,6 @@ pub(super) enum WalReceiverError {
|
||||
SuccessfulCompletion(String),
|
||||
/// Generic error
|
||||
Other(anyhow::Error),
|
||||
ClosedGate,
|
||||
}
|
||||
|
||||
impl From<tokio_postgres::Error> for WalReceiverError {
|
||||
@@ -119,16 +119,6 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
) -> Result<(), WalReceiverError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
// prevent timeline shutdown from finishing until we have exited
|
||||
let _guard = timeline.gate.enter().map_err(|e| match e {
|
||||
GateError::GateClosed => WalReceiverError::ClosedGate,
|
||||
})?;
|
||||
// This function spawns a side-car task (WalReceiverConnectionPoller).
|
||||
// Get its gate guard now as well.
|
||||
let poller_guard = timeline.gate.enter().map_err(|e| match e {
|
||||
GateError::GateClosed => WalReceiverError::ClosedGate,
|
||||
})?;
|
||||
|
||||
WALRECEIVER_STARTED_CONNECTIONS.inc();
|
||||
|
||||
// Connect to the database in replication mode.
|
||||
@@ -166,19 +156,22 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
}
|
||||
|
||||
// The connection object performs the actual communication with the database,
|
||||
// so spawn it off to run on its own. It shouldn't outlive this function, but,
|
||||
// due to lack of async drop, we can't enforce that. However, we ensure that
|
||||
// 1. it is sensitive to `cancellation` and
|
||||
// 2. holds the Timeline gate open so that after timeline shutdown,
|
||||
// we know this task is gone.
|
||||
// so spawn it off to run on its own.
|
||||
let _connection_ctx = ctx.detached_child(
|
||||
TaskKind::WalReceiverConnectionPoller,
|
||||
ctx.download_behavior(),
|
||||
);
|
||||
let connection_cancellation = cancellation.clone();
|
||||
WALRECEIVER_RUNTIME.spawn(
|
||||
task_mgr::spawn(
|
||||
WALRECEIVER_RUNTIME.handle(),
|
||||
TaskKind::WalReceiverConnectionPoller,
|
||||
Some(timeline.tenant_shard_id),
|
||||
Some(timeline.timeline_id),
|
||||
"walreceiver connection",
|
||||
false,
|
||||
async move {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
select! {
|
||||
connection_result = connection => match connection_result {
|
||||
Ok(()) => debug!("Walreceiver db connection closed"),
|
||||
@@ -189,9 +182,6 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
// with a similar error.
|
||||
},
|
||||
WalReceiverError::SuccessfulCompletion(_) => {}
|
||||
WalReceiverError::ClosedGate => {
|
||||
// doesn't happen at runtime
|
||||
}
|
||||
WalReceiverError::Other(err) => {
|
||||
warn!("Connection aborted: {err:#}")
|
||||
}
|
||||
@@ -200,7 +190,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
},
|
||||
_ = connection_cancellation.cancelled() => debug!("Connection cancelled"),
|
||||
}
|
||||
drop(poller_guard);
|
||||
Ok(())
|
||||
}
|
||||
// Enrich the log lines emitted by this closure with meaningful context.
|
||||
// TODO: technically, this task outlives the surrounding function, so, the
|
||||
@@ -313,7 +303,6 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
|
||||
trace!("received XLogData between {startlsn} and {endlsn}");
|
||||
|
||||
WAL_INGEST.bytes_received.inc_by(data.len() as u64);
|
||||
waldecoder.feed_bytes(data);
|
||||
|
||||
{
|
||||
@@ -400,6 +389,17 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
// This is a hack. It piggybacks on the keepalive messages sent by the
|
||||
// safekeeper in order to enforce `checkpoint_timeout` on the currently
|
||||
// open layer. This hack doesn't provide a bound on the total size of
|
||||
// in-memory layers on a pageserver. See https://github.com/neondatabase/neon/issues/6916.
|
||||
let mut writer = timeline.writer().await;
|
||||
if let Err(err) = writer.tick().await {
|
||||
warn!("Timeline writer tick failed: {err}");
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(last_lsn) = status_update {
|
||||
let timeline_remote_consistent_lsn = timeline
|
||||
.get_remote_consistent_lsn_visible()
|
||||
|
||||
@@ -61,7 +61,7 @@ pub struct VectoredRead {
|
||||
}
|
||||
|
||||
impl VectoredRead {
|
||||
pub fn size(&self) -> usize {
|
||||
fn size(&self) -> usize {
|
||||
(self.end - self.start) as usize
|
||||
}
|
||||
}
|
||||
|
||||
@@ -111,7 +111,6 @@ static PageServer page_servers[MAX_SHARDS];
|
||||
|
||||
static bool pageserver_flush(shardno_t shard_no);
|
||||
static void pageserver_disconnect(shardno_t shard_no);
|
||||
static void pageserver_disconnect_shard(shardno_t shard_no);
|
||||
|
||||
static bool
|
||||
PagestoreShmemIsValid(void)
|
||||
@@ -488,32 +487,9 @@ retry:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Reset prefetch and drop connection to the shard.
|
||||
* It also drops connection to all other shards involved in prefetch.
|
||||
*/
|
||||
|
||||
static void
|
||||
pageserver_disconnect(shardno_t shard_no)
|
||||
{
|
||||
/*
|
||||
* If the connection to any pageserver is lost, we throw away the
|
||||
* whole prefetch queue, even for other pageservers. It should not
|
||||
* cause big problems, because connection loss is supposed to be a
|
||||
* rare event.
|
||||
*
|
||||
* Prefetch state should be reset even if page_servers[shard_no].conn == NULL,
|
||||
* because prefetch request may be registered before connection is established.
|
||||
*/
|
||||
prefetch_on_ps_disconnect();
|
||||
|
||||
pageserver_disconnect_shard(shard_no);
|
||||
}
|
||||
|
||||
/*
|
||||
* Disconnect from specified shard
|
||||
*/
|
||||
static void
|
||||
pageserver_disconnect_shard(shardno_t shard_no)
|
||||
{
|
||||
/*
|
||||
* If anything goes wrong while we were sending a request, it's not clear
|
||||
@@ -527,6 +503,14 @@ pageserver_disconnect_shard(shardno_t shard_no)
|
||||
neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
|
||||
PQfinish(page_servers[shard_no].conn);
|
||||
page_servers[shard_no].conn = NULL;
|
||||
|
||||
/*
|
||||
* If the connection to any pageserver is lost, we throw away the
|
||||
* whole prefetch queue, even for other pageservers. It should not
|
||||
* cause big problems, because connection loss is supposed to be a
|
||||
* rare event.
|
||||
*/
|
||||
prefetch_on_ps_disconnect();
|
||||
}
|
||||
if (page_servers[shard_no].wes != NULL)
|
||||
{
|
||||
@@ -692,8 +676,7 @@ page_server_api api =
|
||||
{
|
||||
.send = pageserver_send,
|
||||
.flush = pageserver_flush,
|
||||
.receive = pageserver_receive,
|
||||
.disconnect = pageserver_disconnect_shard
|
||||
.receive = pageserver_receive
|
||||
};
|
||||
|
||||
static bool
|
||||
|
||||
@@ -180,7 +180,6 @@ typedef struct
|
||||
bool (*send) (shardno_t shard_no, NeonRequest * request);
|
||||
NeonResponse *(*receive) (shardno_t shard_no);
|
||||
bool (*flush) (shardno_t shard_no);
|
||||
void (*disconnect) (shardno_t shard_no);
|
||||
} page_server_api;
|
||||
|
||||
extern void prefetch_on_ps_disconnect(void);
|
||||
|
||||
@@ -613,14 +613,6 @@ prefetch_on_ps_disconnect(void)
|
||||
Assert(slot->status == PRFS_REQUESTED);
|
||||
Assert(slot->my_ring_index == ring_index);
|
||||
|
||||
/*
|
||||
* Drop connection to all shards which have prefetch requests.
|
||||
* It is not a problem to call disconnect multiple times on the same connection
|
||||
* because disconnect implementation in libpagestore.c will check if connection
|
||||
* is alive and do nothing of connection was already dropped.
|
||||
*/
|
||||
page_server->disconnect(slot->shard_no);
|
||||
|
||||
/* clean up the request */
|
||||
slot->status = PRFS_TAG_REMAINS;
|
||||
MyPState->n_requests_inflight -= 1;
|
||||
@@ -641,12 +633,13 @@ prefetch_on_ps_disconnect(void)
|
||||
static inline void
|
||||
prefetch_set_unused(uint64 ring_index)
|
||||
{
|
||||
PrefetchRequest *slot;
|
||||
PrefetchRequest *slot = GetPrfSlot(ring_index);
|
||||
|
||||
if (ring_index < MyPState->ring_last)
|
||||
return; /* Should already be unused */
|
||||
|
||||
slot = GetPrfSlot(ring_index);
|
||||
Assert(MyPState->ring_unused > ring_index);
|
||||
|
||||
if (slot->status == PRFS_UNUSED)
|
||||
return;
|
||||
|
||||
@@ -805,8 +798,7 @@ Retry:
|
||||
{
|
||||
if (*force_lsn > slot->effective_request_lsn)
|
||||
{
|
||||
if (!prefetch_wait_for(ring_index))
|
||||
goto Retry;
|
||||
prefetch_wait_for(ring_index);
|
||||
prefetch_set_unused(ring_index);
|
||||
entry = NULL;
|
||||
}
|
||||
@@ -821,8 +813,7 @@ Retry:
|
||||
{
|
||||
if (*force_lsn != slot->effective_request_lsn)
|
||||
{
|
||||
if (!prefetch_wait_for(ring_index))
|
||||
goto Retry;
|
||||
prefetch_wait_for(ring_index);
|
||||
prefetch_set_unused(ring_index);
|
||||
entry = NULL;
|
||||
}
|
||||
@@ -888,8 +879,7 @@ Retry:
|
||||
{
|
||||
case PRFS_REQUESTED:
|
||||
Assert(MyPState->ring_receive == cleanup_index);
|
||||
if (!prefetch_wait_for(cleanup_index))
|
||||
goto Retry;
|
||||
prefetch_wait_for(cleanup_index);
|
||||
prefetch_set_unused(cleanup_index);
|
||||
break;
|
||||
case PRFS_RECEIVED:
|
||||
@@ -1690,7 +1680,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
||||
break;
|
||||
|
||||
default:
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_exists", resp->tag);
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
||||
}
|
||||
pfree(resp);
|
||||
return exists;
|
||||
@@ -2142,7 +2132,6 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
/*
|
||||
* Try to find prefetched page in the list of received pages.
|
||||
*/
|
||||
Retry:
|
||||
entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);
|
||||
|
||||
if (entry != NULL)
|
||||
@@ -2164,8 +2153,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
*/
|
||||
if (slot->status == PRFS_REQUESTED)
|
||||
{
|
||||
if (!prefetch_wait_for(slot->my_ring_index))
|
||||
goto Retry;
|
||||
prefetch_wait_for(slot->my_ring_index);
|
||||
}
|
||||
/* drop caches */
|
||||
prefetch_set_unused(slot->my_ring_index);
|
||||
@@ -2228,7 +2216,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
((NeonErrorResponse *) resp)->message)));
|
||||
break;
|
||||
default:
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_at_lsn", resp->tag);
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
||||
}
|
||||
|
||||
/* buffer was used, clean up for later reuse */
|
||||
@@ -2501,7 +2489,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
break;
|
||||
|
||||
default:
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_nblocks", resp->tag);
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
||||
}
|
||||
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
|
||||
|
||||
@@ -2556,7 +2544,7 @@ neon_dbsize(Oid dbNode)
|
||||
break;
|
||||
|
||||
default:
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_dbsize", resp->tag);
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
||||
}
|
||||
|
||||
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
|
||||
@@ -2861,7 +2849,7 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
|
||||
break;
|
||||
|
||||
default:
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_slru_segment", resp->tag);
|
||||
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
||||
}
|
||||
pfree(resp);
|
||||
|
||||
|
||||
@@ -10,7 +10,6 @@ testing = []
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-trait.workspace = true
|
||||
aws-config.workspace = true
|
||||
aws-sdk-iam.workspace = true
|
||||
|
||||
@@ -12,8 +12,6 @@ use crate::console::errors::GetAuthInfoError;
|
||||
use crate::console::provider::{CachedRoleSecret, ConsoleBackend};
|
||||
use crate::console::{AuthSecret, NodeInfo};
|
||||
use crate::context::RequestMonitoring;
|
||||
use crate::intern::EndpointIdInt;
|
||||
use crate::metrics::{AUTH_RATE_LIMIT_HITS, ENDPOINTS_AUTH_RATE_LIMITED};
|
||||
use crate::proxy::connect_compute::ComputeConnectBackend;
|
||||
use crate::proxy::NeonOptions;
|
||||
use crate::stream::Stream;
|
||||
@@ -30,7 +28,7 @@ use crate::{
|
||||
use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
|
||||
use std::sync::Arc;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{info, warn};
|
||||
use tracing::info;
|
||||
|
||||
/// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality
|
||||
pub enum MaybeOwned<'a, T> {
|
||||
@@ -176,52 +174,6 @@ impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
|
||||
}
|
||||
}
|
||||
|
||||
impl AuthenticationConfig {
|
||||
pub fn check_rate_limit(
|
||||
&self,
|
||||
|
||||
ctx: &mut RequestMonitoring,
|
||||
secret: AuthSecret,
|
||||
endpoint: &EndpointId,
|
||||
is_cleartext: bool,
|
||||
) -> auth::Result<AuthSecret> {
|
||||
// we have validated the endpoint exists, so let's intern it.
|
||||
let endpoint_int = EndpointIdInt::from(endpoint);
|
||||
|
||||
// only count the full hash count if password hack or websocket flow.
|
||||
// in other words, if proxy needs to run the hashing
|
||||
let password_weight = if is_cleartext {
|
||||
match &secret {
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
AuthSecret::Md5(_) => 1,
|
||||
AuthSecret::Scram(s) => s.iterations + 1,
|
||||
}
|
||||
} else {
|
||||
// validating scram takes just 1 hmac_sha_256 operation.
|
||||
1
|
||||
};
|
||||
|
||||
let limit_not_exceeded = self
|
||||
.rate_limiter
|
||||
.check((endpoint_int, ctx.peer_addr), password_weight);
|
||||
|
||||
if !limit_not_exceeded {
|
||||
warn!(
|
||||
enabled = self.rate_limiter_enabled,
|
||||
"rate limiting authentication"
|
||||
);
|
||||
AUTH_RATE_LIMIT_HITS.inc();
|
||||
ENDPOINTS_AUTH_RATE_LIMITED.measure(endpoint);
|
||||
|
||||
if self.rate_limiter_enabled {
|
||||
return Err(auth::AuthError::too_many_connections());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(secret)
|
||||
}
|
||||
}
|
||||
|
||||
/// True to its name, this function encapsulates our current auth trade-offs.
|
||||
/// Here, we choose the appropriate auth flow based on circumstances.
|
||||
///
|
||||
@@ -262,24 +214,14 @@ async fn auth_quirks(
|
||||
Some(secret) => secret,
|
||||
None => api.get_role_secret(ctx, &info).await?,
|
||||
};
|
||||
let (cached_entry, secret) = cached_secret.take_value();
|
||||
|
||||
let secret = match secret {
|
||||
Some(secret) => config.check_rate_limit(
|
||||
ctx,
|
||||
secret,
|
||||
&info.endpoint,
|
||||
unauthenticated_password.is_some() || allow_cleartext,
|
||||
)?,
|
||||
None => {
|
||||
// If we don't have an authentication secret, we mock one to
|
||||
// prevent malicious probing (possible due to missing protocol steps).
|
||||
// This mocked secret will never lead to successful authentication.
|
||||
info!("authentication info not found, mocking it");
|
||||
AuthSecret::Scram(scram::ServerSecret::mock(rand::random()))
|
||||
}
|
||||
};
|
||||
|
||||
let secret = cached_secret.value.clone().unwrap_or_else(|| {
|
||||
// If we don't have an authentication secret, we mock one to
|
||||
// prevent malicious probing (possible due to missing protocol steps).
|
||||
// This mocked secret will never lead to successful authentication.
|
||||
info!("authentication info not found, mocking it");
|
||||
AuthSecret::Scram(scram::ServerSecret::mock(&info.user, rand::random()))
|
||||
});
|
||||
match authenticate_with_secret(
|
||||
ctx,
|
||||
secret,
|
||||
@@ -295,7 +237,7 @@ async fn auth_quirks(
|
||||
Err(e) => {
|
||||
if e.is_auth_failed() {
|
||||
// The password could have been changed, so we invalidate the cache.
|
||||
cached_entry.invalidate();
|
||||
cached_secret.invalidate();
|
||||
}
|
||||
Err(e)
|
||||
}
|
||||
@@ -473,7 +415,6 @@ mod tests {
|
||||
|
||||
use bytes::BytesMut;
|
||||
use fallible_iterator::FallibleIterator;
|
||||
use once_cell::sync::Lazy;
|
||||
use postgres_protocol::{
|
||||
authentication::sasl::{ChannelBinding, ScramSha256},
|
||||
message::{backend::Message as PgMessage, frontend},
|
||||
@@ -491,7 +432,6 @@ mod tests {
|
||||
},
|
||||
context::RequestMonitoring,
|
||||
proxy::NeonOptions,
|
||||
rate_limiter::{AuthRateLimiter, RateBucketInfo},
|
||||
scram::ServerSecret,
|
||||
stream::{PqStream, Stream},
|
||||
};
|
||||
@@ -533,11 +473,9 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
static CONFIG: Lazy<AuthenticationConfig> = Lazy::new(|| AuthenticationConfig {
|
||||
static CONFIG: &AuthenticationConfig = &AuthenticationConfig {
|
||||
scram_protocol_timeout: std::time::Duration::from_secs(5),
|
||||
rate_limiter_enabled: true,
|
||||
rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET),
|
||||
});
|
||||
};
|
||||
|
||||
async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage {
|
||||
loop {
|
||||
@@ -606,7 +544,7 @@ mod tests {
|
||||
}
|
||||
});
|
||||
|
||||
let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, false, &CONFIG)
|
||||
let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, false, CONFIG)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
@@ -646,7 +584,7 @@ mod tests {
|
||||
client.write_all(&write).await.unwrap();
|
||||
});
|
||||
|
||||
let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG)
|
||||
let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, CONFIG)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
@@ -686,7 +624,7 @@ mod tests {
|
||||
client.write_all(&write).await.unwrap();
|
||||
});
|
||||
|
||||
let creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG)
|
||||
let creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, CONFIG)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
|
||||
@@ -10,7 +10,6 @@ use proxy::auth;
|
||||
use proxy::auth::backend::MaybeOwned;
|
||||
use proxy::cancellation::CancelMap;
|
||||
use proxy::cancellation::CancellationHandler;
|
||||
use proxy::config::remote_storage_from_toml;
|
||||
use proxy::config::AuthenticationConfig;
|
||||
use proxy::config::CacheOptions;
|
||||
use proxy::config::HttpConfig;
|
||||
@@ -19,7 +18,6 @@ use proxy::console;
|
||||
use proxy::context::parquet::ParquetUploadArgs;
|
||||
use proxy::http;
|
||||
use proxy::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT;
|
||||
use proxy::rate_limiter::AuthRateLimiter;
|
||||
use proxy::rate_limiter::EndpointRateLimiter;
|
||||
use proxy::rate_limiter::RateBucketInfo;
|
||||
use proxy::rate_limiter::RateLimiterConfig;
|
||||
@@ -143,16 +141,10 @@ struct ProxyCliArgs {
|
||||
///
|
||||
/// Provided in the form '<Requests Per Second>@<Bucket Duration Size>'.
|
||||
/// Can be given multiple times for different bucket sizes.
|
||||
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
|
||||
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
|
||||
endpoint_rps_limit: Vec<RateBucketInfo>,
|
||||
/// Whether the auth rate limiter actually takes effect (for testing)
|
||||
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
auth_rate_limit_enabled: bool,
|
||||
/// Authentication rate limiter max number of hashes per second.
|
||||
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)]
|
||||
auth_rate_limit: Vec<RateBucketInfo>,
|
||||
/// Redis rate limiter max number of requests per second.
|
||||
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
|
||||
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
|
||||
redis_rps_limit: Vec<RateBucketInfo>,
|
||||
/// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`.
|
||||
#[clap(long, default_value_t = 100)]
|
||||
@@ -192,19 +184,6 @@ struct ProxyCliArgs {
|
||||
|
||||
#[clap(flatten)]
|
||||
parquet_upload: ParquetUploadArgs,
|
||||
|
||||
/// interval for backup metric collection
|
||||
#[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
|
||||
metric_backup_collection_interval: std::time::Duration,
|
||||
/// remote storage configuration for backup metric collection
|
||||
/// Encoded as toml (same format as pageservers), eg
|
||||
/// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}`
|
||||
#[clap(long, default_value = "{}")]
|
||||
metric_backup_collection_remote_storage: String,
|
||||
/// chunk size for backup metric collection
|
||||
/// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression.
|
||||
#[clap(long, default_value = "4194304")]
|
||||
metric_backup_collection_chunk_size: usize,
|
||||
}
|
||||
|
||||
#[derive(clap::Args, Clone, Copy, Debug)]
|
||||
@@ -386,17 +365,12 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
// maintenance tasks. these never return unless there's an error
|
||||
let mut maintenance_tasks = JoinSet::new();
|
||||
maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone()));
|
||||
maintenance_tasks.spawn(proxy::handle_signals(cancellation_token));
|
||||
maintenance_tasks.spawn(http::health_server::task_main(http_listener));
|
||||
maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener));
|
||||
|
||||
if let Some(metrics_config) = &config.metric_collection {
|
||||
// TODO: Add gc regardles of the metric collection being enabled.
|
||||
maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
|
||||
client_tasks.spawn(usage_metrics::task_backup(
|
||||
&metrics_config.backup_metric_collection_config,
|
||||
cancellation_token,
|
||||
));
|
||||
}
|
||||
|
||||
if let auth::BackendType::Console(api, _) = &config.auth_backend {
|
||||
@@ -453,13 +427,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
if args.allow_self_signed_compute {
|
||||
warn!("allowing self-signed compute certificates");
|
||||
}
|
||||
let backup_metric_collection_config = config::MetricBackupCollectionConfig {
|
||||
interval: args.metric_backup_collection_interval,
|
||||
remote_storage_config: remote_storage_from_toml(
|
||||
&args.metric_backup_collection_remote_storage,
|
||||
)?,
|
||||
chunk_size: args.metric_backup_collection_chunk_size,
|
||||
};
|
||||
|
||||
let metric_collection = match (
|
||||
&args.metric_collection_endpoint,
|
||||
@@ -468,7 +435,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
(Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig {
|
||||
endpoint: endpoint.parse()?,
|
||||
interval: humantime::parse_duration(interval)?,
|
||||
backup_metric_collection_config,
|
||||
}),
|
||||
(None, None) => None,
|
||||
_ => bail!(
|
||||
@@ -544,8 +510,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
};
|
||||
let authentication_config = AuthenticationConfig {
|
||||
scram_protocol_timeout: args.scram_protocol_timeout,
|
||||
rate_limiter_enabled: args.auth_rate_limit_enabled,
|
||||
rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()),
|
||||
};
|
||||
|
||||
let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
|
||||
|
||||
10
proxy/src/cache/common.rs
vendored
10
proxy/src/cache/common.rs
vendored
@@ -43,16 +43,6 @@ impl<C: Cache, V> Cached<C, V> {
|
||||
Self { token: None, value }
|
||||
}
|
||||
|
||||
pub fn take_value(self) -> (Cached<C, ()>, V) {
|
||||
(
|
||||
Cached {
|
||||
token: self.token,
|
||||
value: (),
|
||||
},
|
||||
self.value,
|
||||
)
|
||||
}
|
||||
|
||||
/// Drop this entry from a cache if it's still there.
|
||||
pub fn invalidate(self) -> V {
|
||||
if let Some((cache, info)) = &self.token {
|
||||
|
||||
30
proxy/src/cache/project_info.rs
vendored
30
proxy/src/cache/project_info.rs
vendored
@@ -373,7 +373,10 @@ mod tests {
|
||||
let endpoint_id = "endpoint".into();
|
||||
let user1: RoleName = "user1".into();
|
||||
let user2: RoleName = "user2".into();
|
||||
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
|
||||
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
|
||||
user1.as_str(),
|
||||
[1; 32],
|
||||
)));
|
||||
let secret2 = None;
|
||||
let allowed_ips = Arc::new(vec![
|
||||
"127.0.0.1".parse().unwrap(),
|
||||
@@ -392,7 +395,10 @@ mod tests {
|
||||
|
||||
// Shouldn't add more than 2 roles.
|
||||
let user3: RoleName = "user3".into();
|
||||
let secret3 = Some(AuthSecret::Scram(ServerSecret::mock([3; 32])));
|
||||
let secret3 = Some(AuthSecret::Scram(ServerSecret::mock(
|
||||
user3.as_str(),
|
||||
[3; 32],
|
||||
)));
|
||||
cache.insert_role_secret(&project_id, &endpoint_id, &user3, secret3.clone());
|
||||
assert!(cache.get_role_secret(&endpoint_id, &user3).is_none());
|
||||
|
||||
@@ -425,8 +431,14 @@ mod tests {
|
||||
let endpoint_id = "endpoint".into();
|
||||
let user1: RoleName = "user1".into();
|
||||
let user2: RoleName = "user2".into();
|
||||
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
|
||||
let secret2 = Some(AuthSecret::Scram(ServerSecret::mock([2; 32])));
|
||||
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
|
||||
user1.as_str(),
|
||||
[1; 32],
|
||||
)));
|
||||
let secret2 = Some(AuthSecret::Scram(ServerSecret::mock(
|
||||
user2.as_str(),
|
||||
[2; 32],
|
||||
)));
|
||||
let allowed_ips = Arc::new(vec![
|
||||
"127.0.0.1".parse().unwrap(),
|
||||
"127.0.0.2".parse().unwrap(),
|
||||
@@ -474,8 +486,14 @@ mod tests {
|
||||
let endpoint_id = "endpoint".into();
|
||||
let user1: RoleName = "user1".into();
|
||||
let user2: RoleName = "user2".into();
|
||||
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
|
||||
let secret2 = Some(AuthSecret::Scram(ServerSecret::mock([2; 32])));
|
||||
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
|
||||
user1.as_str(),
|
||||
[1; 32],
|
||||
)));
|
||||
let secret2 = Some(AuthSecret::Scram(ServerSecret::mock(
|
||||
user2.as_str(),
|
||||
[2; 32],
|
||||
)));
|
||||
let allowed_ips = Arc::new(vec![
|
||||
"127.0.0.1".parse().unwrap(),
|
||||
"127.0.0.2".parse().unwrap(),
|
||||
|
||||
@@ -1,11 +1,6 @@
|
||||
use crate::{
|
||||
auth,
|
||||
rate_limiter::{AuthRateLimiter, RateBucketInfo},
|
||||
serverless::GlobalConnPoolOptions,
|
||||
};
|
||||
use crate::{auth, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions};
|
||||
use anyhow::{bail, ensure, Context, Ok};
|
||||
use itertools::Itertools;
|
||||
use remote_storage::RemoteStorageConfig;
|
||||
use rustls::{
|
||||
crypto::ring::sign,
|
||||
pki_types::{CertificateDer, PrivateKeyDer},
|
||||
@@ -40,7 +35,6 @@ pub struct ProxyConfig {
|
||||
pub struct MetricCollectionConfig {
|
||||
pub endpoint: reqwest::Url,
|
||||
pub interval: Duration,
|
||||
pub backup_metric_collection_config: MetricBackupCollectionConfig,
|
||||
}
|
||||
|
||||
pub struct TlsConfig {
|
||||
@@ -56,8 +50,6 @@ pub struct HttpConfig {
|
||||
|
||||
pub struct AuthenticationConfig {
|
||||
pub scram_protocol_timeout: tokio::time::Duration,
|
||||
pub rate_limiter_enabled: bool,
|
||||
pub rate_limiter: AuthRateLimiter,
|
||||
}
|
||||
|
||||
impl TlsConfig {
|
||||
@@ -313,21 +305,6 @@ impl CertResolver {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct MetricBackupCollectionConfig {
|
||||
pub interval: Duration,
|
||||
pub remote_storage_config: OptRemoteStorageConfig,
|
||||
pub chunk_size: usize,
|
||||
}
|
||||
|
||||
/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get
|
||||
/// runtime type errors from the value parser we use.
|
||||
pub type OptRemoteStorageConfig = Option<RemoteStorageConfig>;
|
||||
|
||||
pub fn remote_storage_from_toml(s: &str) -> anyhow::Result<OptRemoteStorageConfig> {
|
||||
RemoteStorageConfig::from_toml(&s.parse()?)
|
||||
}
|
||||
|
||||
/// Helper for cmdline cache options parsing.
|
||||
#[derive(Debug)]
|
||||
pub struct CacheOptions {
|
||||
|
||||
@@ -13,14 +13,12 @@ use parquet::{
|
||||
},
|
||||
record::RecordWriter,
|
||||
};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel};
|
||||
use tokio::{sync::mpsc, time};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, info, Span};
|
||||
use utils::backoff;
|
||||
|
||||
use crate::config::{remote_storage_from_toml, OptRemoteStorageConfig};
|
||||
|
||||
use super::{RequestMonitoring, LOG_CHAN};
|
||||
|
||||
#[derive(clap::Args, Clone, Debug)]
|
||||
@@ -52,13 +50,21 @@ pub struct ParquetUploadArgs {
|
||||
parquet_upload_compression: Compression,
|
||||
}
|
||||
|
||||
/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get
|
||||
/// runtime type errors from the value parser we use.
|
||||
type OptRemoteStorageConfig = Option<RemoteStorageConfig>;
|
||||
|
||||
fn remote_storage_from_toml(s: &str) -> anyhow::Result<OptRemoteStorageConfig> {
|
||||
RemoteStorageConfig::from_toml(&s.parse()?)
|
||||
}
|
||||
|
||||
// Occasional network issues and such can cause remote operations to fail, and
|
||||
// that's expected. If an upload fails, we log it at info-level, and retry.
|
||||
// But after FAILED_UPLOAD_WARN_THRESHOLD retries, we start to log it at WARN
|
||||
// level instead, as repeated failures can mean a more serious problem. If it
|
||||
// fails more than FAILED_UPLOAD_RETRIES times, we give up
|
||||
pub const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
|
||||
pub const FAILED_UPLOAD_MAX_RETRIES: u32 = 10;
|
||||
pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
|
||||
pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10;
|
||||
|
||||
// the parquet crate leaves a lot to be desired...
|
||||
// what follows is an attempt to write parquet files with minimal allocs.
|
||||
|
||||
@@ -4,10 +4,7 @@ use ::metrics::{
|
||||
register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec,
|
||||
IntCounterVec, IntGauge, IntGaugeVec,
|
||||
};
|
||||
use metrics::{
|
||||
register_hll, register_int_counter, register_int_counter_pair, HyperLogLog, IntCounter,
|
||||
IntCounterPair,
|
||||
};
|
||||
use metrics::{register_int_counter, register_int_counter_pair, IntCounter, IntCounterPair};
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use tokio::time::{self, Instant};
|
||||
@@ -117,15 +114,12 @@ pub static ALLOWED_IPS_NUMBER: Lazy<Histogram> = Lazy::new(|| {
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub static HTTP_CONTENT_LENGTH: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
pub static HTTP_CONTENT_LENGTH: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"proxy_http_conn_content_length_bytes",
|
||||
"Number of bytes the HTTP response content consumes",
|
||||
// request/response
|
||||
&["direction"],
|
||||
// smallest bucket = 16 bytes
|
||||
// largest bucket = 4^12 * 16 bytes = 256MB
|
||||
exponential_buckets(16.0, 4.0, 12).unwrap()
|
||||
"Time it took for proxy to establish a connection to the compute endpoint",
|
||||
// largest bucket = 3^16 * 0.05ms = 2.15s
|
||||
exponential_buckets(8.0, 2.0, 20).unwrap()
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
@@ -364,20 +358,3 @@ pub static TLS_HANDSHAKE_FAILURES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub static ENDPOINTS_AUTH_RATE_LIMITED: Lazy<HyperLogLog<32>> = Lazy::new(|| {
|
||||
register_hll!(
|
||||
32,
|
||||
"proxy_endpoints_auth_rate_limits",
|
||||
"Number of endpoints affected by authentication rate limits",
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub static AUTH_RATE_LIMIT_HITS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"proxy_requests_auth_rate_limits_total",
|
||||
"Number of connection requests affected by authentication rate limits",
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
@@ -280,7 +280,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
|
||||
// check rate limit
|
||||
if let Some(ep) = user_info.get_endpoint() {
|
||||
if !endpoint_rate_limiter.check(ep, 1) {
|
||||
if !endpoint_rate_limiter.check(ep) {
|
||||
return stream
|
||||
.throw_error(auth::AuthError::too_many_connections())
|
||||
.await?;
|
||||
|
||||
@@ -4,7 +4,7 @@ use crate::{
|
||||
console::messages::MetricsAuxInfo,
|
||||
metrics::NUM_BYTES_PROXIED_COUNTER,
|
||||
stream::Stream,
|
||||
usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS},
|
||||
usage_metrics::{Ids, USAGE_METRICS},
|
||||
};
|
||||
use metrics::IntCounterPairGuard;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
|
||||
@@ -142,8 +142,8 @@ impl Scram {
|
||||
Ok(Scram(secret))
|
||||
}
|
||||
|
||||
fn mock() -> Self {
|
||||
Scram(scram::ServerSecret::mock(rand::random()))
|
||||
fn mock(user: &str) -> Self {
|
||||
Scram(scram::ServerSecret::mock(user, rand::random()))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -330,7 +330,11 @@ async fn scram_auth_mock() -> anyhow::Result<()> {
|
||||
|
||||
let (client_config, server_config) =
|
||||
generate_tls_config("generic-project-name.localhost", "localhost")?;
|
||||
let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), Scram::mock()));
|
||||
let proxy = tokio::spawn(dummy_proxy(
|
||||
client,
|
||||
Some(server_config),
|
||||
Scram::mock("user"),
|
||||
));
|
||||
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
let password: String = rand::thread_rng()
|
||||
|
||||
@@ -4,4 +4,4 @@ mod limiter;
|
||||
pub use aimd::Aimd;
|
||||
pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig};
|
||||
pub use limiter::Limiter;
|
||||
pub use limiter::{AuthRateLimiter, EndpointRateLimiter, RateBucketInfo, RedisRateLimiter};
|
||||
pub use limiter::{EndpointRateLimiter, RateBucketInfo, RedisRateLimiter};
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
collections::hash_map::RandomState,
|
||||
hash::{BuildHasher, Hash},
|
||||
net::IpAddr,
|
||||
hash::BuildHasher,
|
||||
sync::{
|
||||
atomic::{AtomicUsize, Ordering},
|
||||
Arc, Mutex,
|
||||
@@ -17,7 +15,7 @@ use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit};
|
||||
use tokio::time::{timeout, Duration, Instant};
|
||||
use tracing::info;
|
||||
|
||||
use crate::{intern::EndpointIdInt, EndpointId};
|
||||
use crate::EndpointId;
|
||||
|
||||
use super::{
|
||||
limit_algorithm::{LimitAlgorithm, Sample},
|
||||
@@ -51,11 +49,11 @@ impl RedisRateLimiter {
|
||||
.data
|
||||
.iter_mut()
|
||||
.zip(self.info)
|
||||
.all(|(bucket, info)| bucket.should_allow_request(info, now, 1));
|
||||
.all(|(bucket, info)| bucket.should_allow_request(info, now));
|
||||
|
||||
if should_allow_request {
|
||||
// only increment the bucket counts if the request will actually be accepted
|
||||
self.data.iter_mut().for_each(|b| b.inc(1));
|
||||
self.data.iter_mut().for_each(RateBucket::inc);
|
||||
}
|
||||
|
||||
should_allow_request
|
||||
@@ -73,14 +71,9 @@ impl RedisRateLimiter {
|
||||
// saw SNI, before doing TLS handshake. User-side error messages in that case
|
||||
// do not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now
|
||||
// I went with a more expensive way that yields user-friendlier error messages.
|
||||
pub type EndpointRateLimiter = BucketRateLimiter<EndpointId, StdRng, RandomState>;
|
||||
|
||||
// This can't be just per IP because that would limit some PaaS that share IP addresses
|
||||
pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, IpAddr), StdRng, RandomState>;
|
||||
|
||||
pub struct BucketRateLimiter<Key, Rand = StdRng, Hasher = RandomState> {
|
||||
map: DashMap<Key, Vec<RateBucket>, Hasher>,
|
||||
info: Cow<'static, [RateBucketInfo]>,
|
||||
pub struct EndpointRateLimiter<Rand = StdRng, Hasher = RandomState> {
|
||||
map: DashMap<EndpointId, Vec<RateBucket>, Hasher>,
|
||||
info: &'static [RateBucketInfo],
|
||||
access_count: AtomicUsize,
|
||||
rand: Mutex<Rand>,
|
||||
}
|
||||
@@ -92,9 +85,9 @@ struct RateBucket {
|
||||
}
|
||||
|
||||
impl RateBucket {
|
||||
fn should_allow_request(&mut self, info: &RateBucketInfo, now: Instant, n: u32) -> bool {
|
||||
fn should_allow_request(&mut self, info: &RateBucketInfo, now: Instant) -> bool {
|
||||
if now - self.start < info.interval {
|
||||
self.count + n <= info.max_rpi
|
||||
self.count < info.max_rpi
|
||||
} else {
|
||||
// bucket expired, reset
|
||||
self.count = 0;
|
||||
@@ -104,8 +97,8 @@ impl RateBucket {
|
||||
}
|
||||
}
|
||||
|
||||
fn inc(&mut self, n: u32) {
|
||||
self.count += n;
|
||||
fn inc(&mut self) {
|
||||
self.count += 1;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -118,7 +111,7 @@ pub struct RateBucketInfo {
|
||||
|
||||
impl std::fmt::Display for RateBucketInfo {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let rps = (self.max_rpi as u64) * 1000 / self.interval.as_millis() as u64;
|
||||
let rps = self.max_rpi * 1000 / self.interval.as_millis() as u32;
|
||||
write!(f, "{rps}@{}", humantime::format_duration(self.interval))
|
||||
}
|
||||
}
|
||||
@@ -143,25 +136,12 @@ impl std::str::FromStr for RateBucketInfo {
|
||||
}
|
||||
|
||||
impl RateBucketInfo {
|
||||
pub const DEFAULT_ENDPOINT_SET: [Self; 3] = [
|
||||
pub const DEFAULT_SET: [Self; 3] = [
|
||||
Self::new(300, Duration::from_secs(1)),
|
||||
Self::new(200, Duration::from_secs(60)),
|
||||
Self::new(100, Duration::from_secs(600)),
|
||||
];
|
||||
|
||||
/// All of these are per endpoint-ip pair.
|
||||
/// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus).
|
||||
///
|
||||
/// First bucket: 300mcpus total per endpoint-ip pair
|
||||
/// * 1228800 requests per second with 1 hash rounds. (endpoint rate limiter will catch this first)
|
||||
/// * 300 requests per second with 4096 hash rounds.
|
||||
/// * 2 requests per second with 600000 hash rounds.
|
||||
pub const DEFAULT_AUTH_SET: [Self; 3] = [
|
||||
Self::new(300 * 4096, Duration::from_secs(1)),
|
||||
Self::new(200 * 4096, Duration::from_secs(60)),
|
||||
Self::new(100 * 4096, Duration::from_secs(600)),
|
||||
];
|
||||
|
||||
pub fn validate(info: &mut [Self]) -> anyhow::Result<()> {
|
||||
info.sort_unstable_by_key(|info| info.interval);
|
||||
let invalid = info
|
||||
@@ -170,7 +150,7 @@ impl RateBucketInfo {
|
||||
.find(|(a, b)| a.max_rpi > b.max_rpi);
|
||||
if let Some((a, b)) = invalid {
|
||||
bail!(
|
||||
"invalid bucket RPS limits. {b} allows fewer requests per bucket than {a} ({} vs {})",
|
||||
"invalid endpoint RPS limits. {b} allows fewer requests per bucket than {a} ({} vs {})",
|
||||
b.max_rpi,
|
||||
a.max_rpi,
|
||||
);
|
||||
@@ -182,24 +162,19 @@ impl RateBucketInfo {
|
||||
pub const fn new(max_rps: u32, interval: Duration) -> Self {
|
||||
Self {
|
||||
interval,
|
||||
max_rpi: ((max_rps as u64) * (interval.as_millis() as u64) / 1000) as u32,
|
||||
max_rpi: max_rps * interval.as_millis() as u32 / 1000,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Hash + Eq> BucketRateLimiter<K> {
|
||||
pub fn new(info: impl Into<Cow<'static, [RateBucketInfo]>>) -> Self {
|
||||
impl EndpointRateLimiter {
|
||||
pub fn new(info: &'static [RateBucketInfo]) -> Self {
|
||||
Self::new_with_rand_and_hasher(info, StdRng::from_entropy(), RandomState::new())
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Hash + Eq, R: Rng, S: BuildHasher + Clone> BucketRateLimiter<K, R, S> {
|
||||
fn new_with_rand_and_hasher(
|
||||
info: impl Into<Cow<'static, [RateBucketInfo]>>,
|
||||
rand: R,
|
||||
hasher: S,
|
||||
) -> Self {
|
||||
let info = info.into();
|
||||
impl<R: Rng, S: BuildHasher + Clone> EndpointRateLimiter<R, S> {
|
||||
fn new_with_rand_and_hasher(info: &'static [RateBucketInfo], rand: R, hasher: S) -> Self {
|
||||
info!(buckets = ?info, "endpoint rate limiter");
|
||||
Self {
|
||||
info,
|
||||
@@ -210,7 +185,7 @@ impl<K: Hash + Eq, R: Rng, S: BuildHasher + Clone> BucketRateLimiter<K, R, S> {
|
||||
}
|
||||
|
||||
/// Check that number of connections to the endpoint is below `max_rps` rps.
|
||||
pub fn check(&self, key: K, n: u32) -> bool {
|
||||
pub fn check(&self, endpoint: EndpointId) -> bool {
|
||||
// do a partial GC every 2k requests. This cleans up ~ 1/64th of the map.
|
||||
// worst case memory usage is about:
|
||||
// = 2 * 2048 * 64 * (48B + 72B)
|
||||
@@ -220,7 +195,7 @@ impl<K: Hash + Eq, R: Rng, S: BuildHasher + Clone> BucketRateLimiter<K, R, S> {
|
||||
}
|
||||
|
||||
let now = Instant::now();
|
||||
let mut entry = self.map.entry(key).or_insert_with(|| {
|
||||
let mut entry = self.map.entry(endpoint).or_insert_with(|| {
|
||||
vec![
|
||||
RateBucket {
|
||||
start: now,
|
||||
@@ -232,12 +207,12 @@ impl<K: Hash + Eq, R: Rng, S: BuildHasher + Clone> BucketRateLimiter<K, R, S> {
|
||||
|
||||
let should_allow_request = entry
|
||||
.iter_mut()
|
||||
.zip(&*self.info)
|
||||
.all(|(bucket, info)| bucket.should_allow_request(info, now, n));
|
||||
.zip(self.info)
|
||||
.all(|(bucket, info)| bucket.should_allow_request(info, now));
|
||||
|
||||
if should_allow_request {
|
||||
// only increment the bucket counts if the request will actually be accepted
|
||||
entry.iter_mut().for_each(|b| b.inc(n));
|
||||
entry.iter_mut().for_each(RateBucket::inc);
|
||||
}
|
||||
|
||||
should_allow_request
|
||||
@@ -248,7 +223,7 @@ impl<K: Hash + Eq, R: Rng, S: BuildHasher + Clone> BucketRateLimiter<K, R, S> {
|
||||
/// But that way deletion does not acquire mutex on each entry access.
|
||||
pub fn do_gc(&self) {
|
||||
info!(
|
||||
"cleaning up bucket rate limiter, current size = {}",
|
||||
"cleaning up endpoint rate limiter, current size = {}",
|
||||
self.map.len()
|
||||
);
|
||||
let n = self.map.shards().len();
|
||||
@@ -559,7 +534,7 @@ mod tests {
|
||||
use rustc_hash::FxHasher;
|
||||
use tokio::time;
|
||||
|
||||
use super::{BucketRateLimiter, EndpointRateLimiter, Limiter, Outcome};
|
||||
use super::{EndpointRateLimiter, Limiter, Outcome};
|
||||
use crate::{
|
||||
rate_limiter::{RateBucketInfo, RateLimitAlgorithm},
|
||||
EndpointId,
|
||||
@@ -697,12 +672,12 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn default_rate_buckets() {
|
||||
let mut defaults = RateBucketInfo::DEFAULT_ENDPOINT_SET;
|
||||
let mut defaults = RateBucketInfo::DEFAULT_SET;
|
||||
RateBucketInfo::validate(&mut defaults[..]).unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic = "invalid bucket RPS limits. 10@10s allows fewer requests per bucket than 300@1s (100 vs 300)"]
|
||||
#[should_panic = "invalid endpoint RPS limits. 10@10s allows fewer requests per bucket than 300@1s (100 vs 300)"]
|
||||
fn rate_buckets_validate() {
|
||||
let mut rates: Vec<RateBucketInfo> = ["300@1s", "10@10s"]
|
||||
.into_iter()
|
||||
@@ -718,42 +693,42 @@ mod tests {
|
||||
.map(|s| s.parse().unwrap())
|
||||
.collect();
|
||||
RateBucketInfo::validate(&mut rates).unwrap();
|
||||
let limiter = EndpointRateLimiter::new(rates);
|
||||
let limiter = EndpointRateLimiter::new(Vec::leak(rates));
|
||||
|
||||
let endpoint = EndpointId::from("ep-my-endpoint-1234");
|
||||
|
||||
time::pause();
|
||||
|
||||
for _ in 0..100 {
|
||||
assert!(limiter.check(endpoint.clone(), 1));
|
||||
assert!(limiter.check(endpoint.clone()));
|
||||
}
|
||||
// more connections fail
|
||||
assert!(!limiter.check(endpoint.clone(), 1));
|
||||
assert!(!limiter.check(endpoint.clone()));
|
||||
|
||||
// fail even after 500ms as it's in the same bucket
|
||||
time::advance(time::Duration::from_millis(500)).await;
|
||||
assert!(!limiter.check(endpoint.clone(), 1));
|
||||
assert!(!limiter.check(endpoint.clone()));
|
||||
|
||||
// after a full 1s, 100 requests are allowed again
|
||||
time::advance(time::Duration::from_millis(500)).await;
|
||||
for _ in 1..6 {
|
||||
for _ in 0..50 {
|
||||
assert!(limiter.check(endpoint.clone(), 2));
|
||||
for _ in 0..100 {
|
||||
assert!(limiter.check(endpoint.clone()));
|
||||
}
|
||||
time::advance(time::Duration::from_millis(1000)).await;
|
||||
}
|
||||
|
||||
// more connections after 600 will exceed the 20rps@30s limit
|
||||
assert!(!limiter.check(endpoint.clone(), 1));
|
||||
assert!(!limiter.check(endpoint.clone()));
|
||||
|
||||
// will still fail before the 30 second limit
|
||||
time::advance(time::Duration::from_millis(30_000 - 6_000 - 1)).await;
|
||||
assert!(!limiter.check(endpoint.clone(), 1));
|
||||
assert!(!limiter.check(endpoint.clone()));
|
||||
|
||||
// after the full 30 seconds, 100 requests are allowed again
|
||||
time::advance(time::Duration::from_millis(1)).await;
|
||||
for _ in 0..100 {
|
||||
assert!(limiter.check(endpoint.clone(), 1));
|
||||
assert!(limiter.check(endpoint.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -763,41 +738,14 @@ mod tests {
|
||||
let rand = rand::rngs::StdRng::from_seed([1; 32]);
|
||||
let hasher = BuildHasherDefault::<FxHasher>::default();
|
||||
|
||||
let limiter = BucketRateLimiter::new_with_rand_and_hasher(
|
||||
&RateBucketInfo::DEFAULT_ENDPOINT_SET,
|
||||
let limiter = EndpointRateLimiter::new_with_rand_and_hasher(
|
||||
&RateBucketInfo::DEFAULT_SET,
|
||||
rand,
|
||||
hasher,
|
||||
);
|
||||
for i in 0..1_000_000 {
|
||||
limiter.check(i, 1);
|
||||
limiter.check(format!("{i}").into());
|
||||
}
|
||||
assert!(limiter.map.len() < 150_000);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_default_auth_set() {
|
||||
// these values used to exceed u32::MAX
|
||||
assert_eq!(
|
||||
RateBucketInfo::DEFAULT_AUTH_SET,
|
||||
[
|
||||
RateBucketInfo {
|
||||
interval: Duration::from_secs(1),
|
||||
max_rpi: 300 * 4096,
|
||||
},
|
||||
RateBucketInfo {
|
||||
interval: Duration::from_secs(60),
|
||||
max_rpi: 200 * 4096 * 60,
|
||||
},
|
||||
RateBucketInfo {
|
||||
interval: Duration::from_secs(600),
|
||||
max_rpi: 100 * 4096 * 600,
|
||||
}
|
||||
]
|
||||
);
|
||||
|
||||
for x in RateBucketInfo::DEFAULT_AUTH_SET {
|
||||
let y = x.to_string().parse().unwrap();
|
||||
assert_eq!(x, y);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -50,13 +50,13 @@ impl ServerSecret {
|
||||
/// To avoid revealing information to an attacker, we use a
|
||||
/// mocked server secret even if the user doesn't exist.
|
||||
/// See `auth-scram.c : mock_scram_secret` for details.
|
||||
pub fn mock(nonce: [u8; 32]) -> Self {
|
||||
pub fn mock(user: &str, nonce: [u8; 32]) -> Self {
|
||||
// Refer to `auth-scram.c : scram_mock_salt`.
|
||||
let mocked_salt = super::sha256([user.as_bytes(), &nonce]);
|
||||
|
||||
Self {
|
||||
// this doesn't reveal much information as we're going to use
|
||||
// iteration count 1 for our generated passwords going forward.
|
||||
// PG16 users can set iteration count=1 already today.
|
||||
iterations: 1,
|
||||
salt_base64: base64::encode(nonce),
|
||||
iterations: 4096,
|
||||
salt_base64: base64::encode(mocked_salt),
|
||||
stored_key: ScramKey::default(),
|
||||
server_key: ScramKey::default(),
|
||||
doomed: true,
|
||||
|
||||
@@ -42,12 +42,7 @@ impl PoolingBackend {
|
||||
};
|
||||
|
||||
let secret = match cached_secret.value.clone() {
|
||||
Some(secret) => self.config.authentication_config.check_rate_limit(
|
||||
ctx,
|
||||
secret,
|
||||
&user_info.endpoint,
|
||||
true,
|
||||
)?,
|
||||
Some(secret) => secret,
|
||||
None => {
|
||||
// If we don't have an authentication secret, for the http flow we can just return an error.
|
||||
info!("authentication info not found");
|
||||
|
||||
@@ -42,15 +42,12 @@ use crate::error::ReportableError;
|
||||
use crate::error::UserFacingError;
|
||||
use crate::metrics::HTTP_CONTENT_LENGTH;
|
||||
use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
|
||||
use crate::proxy::run_until_cancelled;
|
||||
use crate::proxy::NeonOptions;
|
||||
use crate::serverless::backend::HttpConnError;
|
||||
use crate::usage_metrics::MetricCounterRecorder;
|
||||
use crate::DbName;
|
||||
use crate::RoleName;
|
||||
|
||||
use super::backend::PoolingBackend;
|
||||
use super::conn_pool::Client;
|
||||
use super::conn_pool::ConnInfo;
|
||||
use super::json::json_to_pg_text;
|
||||
use super::json::pg_text_row_to_json;
|
||||
@@ -222,7 +219,14 @@ pub async fn handle(
|
||||
backend: Arc<PoolingBackend>,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let cancel2 = cancel.clone();
|
||||
let handle = tokio::spawn(async move {
|
||||
time::sleep(config.http_config.request_timeout).await;
|
||||
cancel2.cancel();
|
||||
});
|
||||
|
||||
let result = handle_inner(cancel, config, &mut ctx, request, backend).await;
|
||||
handle.abort();
|
||||
|
||||
let mut response = match result {
|
||||
Ok(r) => {
|
||||
@@ -233,7 +237,10 @@ pub async fn handle(
|
||||
let error_kind = e.get_error_kind();
|
||||
ctx.set_error_kind(error_kind);
|
||||
|
||||
let message = "Query cancelled, connection was terminated";
|
||||
let message = format!(
|
||||
"Query cancelled, runtime exceeded. SQL queries over HTTP must not exceed {} seconds of runtime. Please consider using our websocket based connections",
|
||||
config.http_config.request_timeout.as_secs_f64()
|
||||
);
|
||||
|
||||
tracing::info!(
|
||||
kind=error_kind.to_metric_label(),
|
||||
@@ -427,63 +434,6 @@ impl ReportableError for SqlOverHttpCancel {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
struct HttpHeaders {
|
||||
raw_output: bool,
|
||||
default_array_mode: bool,
|
||||
txn_isolation_level: Option<IsolationLevel>,
|
||||
txn_read_only: bool,
|
||||
txn_deferrable: bool,
|
||||
}
|
||||
|
||||
impl HttpHeaders {
|
||||
fn try_parse(headers: &hyper::http::HeaderMap) -> Result<Self, SqlOverHttpError> {
|
||||
// Determine the output options. Default behaviour is 'false'. Anything that is not
|
||||
// strictly 'true' assumed to be false.
|
||||
let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE);
|
||||
let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);
|
||||
|
||||
// isolation level, read only and deferrable
|
||||
let txn_isolation_level = match headers.get(&TXN_ISOLATION_LEVEL) {
|
||||
Some(x) => Some(
|
||||
map_header_to_isolation_level(x).ok_or(SqlOverHttpError::InvalidIsolationLevel)?,
|
||||
),
|
||||
None => None,
|
||||
};
|
||||
|
||||
let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE);
|
||||
let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE);
|
||||
|
||||
Ok(Self {
|
||||
raw_output,
|
||||
default_array_mode,
|
||||
txn_isolation_level,
|
||||
txn_read_only,
|
||||
txn_deferrable,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn map_header_to_isolation_level(level: &HeaderValue) -> Option<IsolationLevel> {
|
||||
match level.as_bytes() {
|
||||
b"Serializable" => Some(IsolationLevel::Serializable),
|
||||
b"ReadUncommitted" => Some(IsolationLevel::ReadUncommitted),
|
||||
b"ReadCommitted" => Some(IsolationLevel::ReadCommitted),
|
||||
b"RepeatableRead" => Some(IsolationLevel::RepeatableRead),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn map_isolation_level_to_headers(level: IsolationLevel) -> Option<HeaderValue> {
|
||||
match level {
|
||||
IsolationLevel::ReadUncommitted => Some(HeaderValue::from_static("ReadUncommitted")),
|
||||
IsolationLevel::ReadCommitted => Some(HeaderValue::from_static("ReadCommitted")),
|
||||
IsolationLevel::RepeatableRead => Some(HeaderValue::from_static("RepeatableRead")),
|
||||
IsolationLevel::Serializable => Some(HeaderValue::from_static("Serializable")),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_inner(
|
||||
cancel: CancellationToken,
|
||||
config: &'static ProxyConfig,
|
||||
@@ -500,26 +450,43 @@ async fn handle_inner(
|
||||
// Determine the destination and connection params
|
||||
//
|
||||
let headers = request.headers();
|
||||
|
||||
// TLS config should be there.
|
||||
let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?;
|
||||
info!(user = conn_info.user_info.user.as_str(), "credentials");
|
||||
|
||||
// Determine the output options. Default behaviour is 'false'. Anything that is not
|
||||
// strictly 'true' assumed to be false.
|
||||
let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE);
|
||||
let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);
|
||||
|
||||
// Allow connection pooling only if explicitly requested
|
||||
// or if we have decided that http pool is no longer opt-in
|
||||
let allow_pool = !config.http_config.pool_options.opt_in
|
||||
|| headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
|
||||
|
||||
let parsed_headers = HttpHeaders::try_parse(headers)?;
|
||||
// isolation level, read only and deferrable
|
||||
|
||||
let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned();
|
||||
let txn_isolation_level = match txn_isolation_level_raw {
|
||||
Some(ref x) => Some(match x.as_bytes() {
|
||||
b"Serializable" => IsolationLevel::Serializable,
|
||||
b"ReadUncommitted" => IsolationLevel::ReadUncommitted,
|
||||
b"ReadCommitted" => IsolationLevel::ReadCommitted,
|
||||
b"RepeatableRead" => IsolationLevel::RepeatableRead,
|
||||
_ => return Err(SqlOverHttpError::InvalidIsolationLevel),
|
||||
}),
|
||||
None => None,
|
||||
};
|
||||
|
||||
let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE);
|
||||
let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE);
|
||||
|
||||
let request_content_length = match request.body().size_hint().upper() {
|
||||
Some(v) => v,
|
||||
None => MAX_REQUEST_SIZE + 1,
|
||||
};
|
||||
info!(request_content_length, "request size in bytes");
|
||||
HTTP_CONTENT_LENGTH
|
||||
.with_label_values(&["request"])
|
||||
.observe(request_content_length as f64);
|
||||
HTTP_CONTENT_LENGTH.observe(request_content_length as f64);
|
||||
|
||||
// we don't have a streaming request support yet so this is to prevent OOM
|
||||
// from a malicious user sending an extremely large request body
|
||||
@@ -547,18 +514,20 @@ async fn handle_inner(
|
||||
}
|
||||
.map_err(SqlOverHttpError::from);
|
||||
|
||||
let (payload, mut client) = match run_until_cancelled(
|
||||
// Run both operations in parallel
|
||||
// Run both operations in parallel
|
||||
let (payload, mut client) = match select(
|
||||
try_join(
|
||||
pin!(fetch_and_process_request),
|
||||
pin!(authenticate_and_connect),
|
||||
),
|
||||
&cancel,
|
||||
pin!(cancel.cancelled()),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Some(result) => result?,
|
||||
None => return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect)),
|
||||
Either::Left((result, _cancelled)) => result?,
|
||||
Either::Right((_cancelled, _)) => {
|
||||
return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect))
|
||||
}
|
||||
};
|
||||
|
||||
let mut response = Response::builder()
|
||||
@@ -568,143 +537,95 @@ async fn handle_inner(
|
||||
//
|
||||
// Now execute the query and return the result
|
||||
//
|
||||
let mut size = 0;
|
||||
let result = match payload {
|
||||
Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?,
|
||||
Payload::Batch(statements) => {
|
||||
if parsed_headers.txn_read_only {
|
||||
response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE);
|
||||
}
|
||||
if parsed_headers.txn_deferrable {
|
||||
response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE);
|
||||
}
|
||||
if let Some(txn_isolation_level) = parsed_headers
|
||||
.txn_isolation_level
|
||||
.and_then(map_isolation_level_to_headers)
|
||||
{
|
||||
response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
|
||||
}
|
||||
|
||||
statements
|
||||
.process(cancel, &mut client, parsed_headers)
|
||||
.await?
|
||||
}
|
||||
};
|
||||
|
||||
let metrics = client.metrics();
|
||||
|
||||
// how could this possibly fail
|
||||
let body = serde_json::to_string(&result).expect("json serialization should not fail");
|
||||
let len = body.len();
|
||||
let response = response
|
||||
.body(Body::from(body))
|
||||
// only fails if invalid status code or invalid header/values are given.
|
||||
// these are not user configurable so it cannot fail dynamically
|
||||
.expect("building response payload should not fail");
|
||||
|
||||
// count the egress bytes - we miss the TLS and header overhead but oh well...
|
||||
// moving this later in the stack is going to be a lot of effort and ehhhh
|
||||
metrics.record_egress(len as u64);
|
||||
HTTP_CONTENT_LENGTH
|
||||
.with_label_values(&["response"])
|
||||
.observe(len as f64);
|
||||
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
impl QueryData {
|
||||
async fn process(
|
||||
self,
|
||||
cancel: CancellationToken,
|
||||
client: &mut Client<tokio_postgres::Client>,
|
||||
parsed_headers: HttpHeaders,
|
||||
) -> Result<Value, SqlOverHttpError> {
|
||||
let (inner, mut discard) = client.inner();
|
||||
let cancel_token = inner.cancel_token();
|
||||
|
||||
let res = match select(
|
||||
pin!(query_to_json(&*inner, self, &mut 0, parsed_headers)),
|
||||
pin!(cancel.cancelled()),
|
||||
)
|
||||
.await
|
||||
{
|
||||
// The query successfully completed.
|
||||
Either::Left((Ok((status, results)), __not_yet_cancelled)) => {
|
||||
discard.check_idle(status);
|
||||
Ok(results)
|
||||
}
|
||||
// The query failed with an error
|
||||
Either::Left((Err(e), __not_yet_cancelled)) => {
|
||||
discard.discard();
|
||||
return Err(e);
|
||||
}
|
||||
// The query was cancelled.
|
||||
Either::Right((_cancelled, query)) => {
|
||||
if let Err(err) = cancel_token.cancel_query(NoTls).await {
|
||||
tracing::error!(?err, "could not cancel query");
|
||||
Payload::Single(stmt) => {
|
||||
let mut size = 0;
|
||||
let (inner, mut discard) = client.inner();
|
||||
let cancel_token = inner.cancel_token();
|
||||
let query = pin!(query_to_json(
|
||||
&*inner,
|
||||
stmt,
|
||||
&mut size,
|
||||
raw_output,
|
||||
default_array_mode
|
||||
));
|
||||
let cancelled = pin!(cancel.cancelled());
|
||||
let res = select(query, cancelled).await;
|
||||
match res {
|
||||
Either::Left((Ok((status, results)), _cancelled)) => {
|
||||
discard.check_idle(status);
|
||||
results
|
||||
}
|
||||
// wait for the query cancellation
|
||||
match time::timeout(time::Duration::from_millis(100), query).await {
|
||||
// query successed before it was cancelled.
|
||||
Ok(Ok((status, results))) => {
|
||||
discard.check_idle(status);
|
||||
Ok(results)
|
||||
Either::Left((Err(e), _cancelled)) => {
|
||||
discard.discard();
|
||||
return Err(e);
|
||||
}
|
||||
Either::Right((_cancelled, query)) => {
|
||||
if let Err(err) = cancel_token.cancel_query(NoTls).await {
|
||||
tracing::error!(?err, "could not cancel query");
|
||||
}
|
||||
// query failed or was cancelled.
|
||||
Ok(Err(error)) => {
|
||||
let db_error = match &error {
|
||||
SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e))
|
||||
| SqlOverHttpError::Postgres(e) => e.as_db_error(),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
// if errored for some other reason, it might not be safe to return
|
||||
if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) {
|
||||
discard.discard();
|
||||
match time::timeout(time::Duration::from_millis(100), query).await {
|
||||
Ok(Ok((status, results))) => {
|
||||
discard.check_idle(status);
|
||||
results
|
||||
}
|
||||
Ok(Err(error)) => {
|
||||
let db_error = match &error {
|
||||
SqlOverHttpError::ConnectCompute(
|
||||
HttpConnError::ConnectionError(e),
|
||||
)
|
||||
| SqlOverHttpError::Postgres(e) => e.as_db_error(),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres))
|
||||
}
|
||||
Err(_timeout) => {
|
||||
discard.discard();
|
||||
Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres))
|
||||
// if errored for some other reason, it might not be safe to return
|
||||
if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) {
|
||||
discard.discard();
|
||||
}
|
||||
|
||||
return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
|
||||
}
|
||||
Err(_timeout) => {
|
||||
discard.discard();
|
||||
return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
res
|
||||
}
|
||||
}
|
||||
|
||||
impl BatchQueryData {
|
||||
async fn process(
|
||||
self,
|
||||
cancel: CancellationToken,
|
||||
client: &mut Client<tokio_postgres::Client>,
|
||||
parsed_headers: HttpHeaders,
|
||||
) -> Result<Value, SqlOverHttpError> {
|
||||
info!("starting transaction");
|
||||
let (inner, mut discard) = client.inner();
|
||||
let cancel_token = inner.cancel_token();
|
||||
let mut builder = inner.build_transaction();
|
||||
if let Some(isolation_level) = parsed_headers.txn_isolation_level {
|
||||
builder = builder.isolation_level(isolation_level);
|
||||
}
|
||||
if parsed_headers.txn_read_only {
|
||||
builder = builder.read_only(true);
|
||||
}
|
||||
if parsed_headers.txn_deferrable {
|
||||
builder = builder.deferrable(true);
|
||||
}
|
||||
Payload::Batch(statements) => {
|
||||
info!("starting transaction");
|
||||
let (inner, mut discard) = client.inner();
|
||||
let cancel_token = inner.cancel_token();
|
||||
let mut builder = inner.build_transaction();
|
||||
if let Some(isolation_level) = txn_isolation_level {
|
||||
builder = builder.isolation_level(isolation_level);
|
||||
}
|
||||
if txn_read_only {
|
||||
builder = builder.read_only(true);
|
||||
}
|
||||
if txn_deferrable {
|
||||
builder = builder.deferrable(true);
|
||||
}
|
||||
|
||||
let transaction = builder.start().await.map_err(|e| {
|
||||
// if we cannot start a transaction, we should return immediately
|
||||
// and not return to the pool. connection is clearly broken
|
||||
discard.discard();
|
||||
e
|
||||
})?;
|
||||
let transaction = builder.start().await.map_err(|e| {
|
||||
// if we cannot start a transaction, we should return immediately
|
||||
// and not return to the pool. connection is clearly broken
|
||||
discard.discard();
|
||||
e
|
||||
})?;
|
||||
|
||||
let results =
|
||||
match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await {
|
||||
let results = match query_batch(
|
||||
cancel.child_token(),
|
||||
&transaction,
|
||||
statements,
|
||||
&mut size,
|
||||
raw_output,
|
||||
default_array_mode,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(results) => {
|
||||
info!("commit");
|
||||
let status = transaction.commit().await.map_err(|e| {
|
||||
@@ -738,15 +659,44 @@ impl BatchQueryData {
|
||||
}
|
||||
};
|
||||
|
||||
Ok(json!({ "results": results }))
|
||||
}
|
||||
if txn_read_only {
|
||||
response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE);
|
||||
}
|
||||
if txn_deferrable {
|
||||
response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE);
|
||||
}
|
||||
if let Some(txn_isolation_level) = txn_isolation_level_raw {
|
||||
response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
|
||||
}
|
||||
json!({ "results": results })
|
||||
}
|
||||
};
|
||||
|
||||
let metrics = client.metrics();
|
||||
|
||||
// how could this possibly fail
|
||||
let body = serde_json::to_string(&result).expect("json serialization should not fail");
|
||||
let len = body.len();
|
||||
let response = response
|
||||
.body(Body::from(body))
|
||||
// only fails if invalid status code or invalid header/values are given.
|
||||
// these are not user configurable so it cannot fail dynamically
|
||||
.expect("building response payload should not fail");
|
||||
|
||||
// count the egress bytes - we miss the TLS and header overhead but oh well...
|
||||
// moving this later in the stack is going to be a lot of effort and ehhhh
|
||||
metrics.record_egress(len as u64);
|
||||
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
async fn query_batch(
|
||||
cancel: CancellationToken,
|
||||
transaction: &Transaction<'_>,
|
||||
queries: BatchQueryData,
|
||||
parsed_headers: HttpHeaders,
|
||||
total_size: &mut usize,
|
||||
raw_output: bool,
|
||||
array_mode: bool,
|
||||
) -> Result<Vec<Value>, SqlOverHttpError> {
|
||||
let mut results = Vec::with_capacity(queries.queries.len());
|
||||
let mut current_size = 0;
|
||||
@@ -755,7 +705,8 @@ async fn query_batch(
|
||||
transaction,
|
||||
stmt,
|
||||
&mut current_size,
|
||||
parsed_headers,
|
||||
raw_output,
|
||||
array_mode
|
||||
));
|
||||
let cancelled = pin!(cancel.cancelled());
|
||||
let res = select(query, cancelled).await;
|
||||
@@ -772,6 +723,7 @@ async fn query_batch(
|
||||
}
|
||||
}
|
||||
}
|
||||
*total_size += current_size;
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
@@ -779,7 +731,8 @@ async fn query_to_json<T: GenericClient>(
|
||||
client: &T,
|
||||
data: QueryData,
|
||||
current_size: &mut usize,
|
||||
parsed_headers: HttpHeaders,
|
||||
raw_output: bool,
|
||||
default_array_mode: bool,
|
||||
) -> Result<(ReadyForQueryStatus, Value), SqlOverHttpError> {
|
||||
info!("executing query");
|
||||
let query_params = data.params;
|
||||
@@ -839,12 +792,12 @@ async fn query_to_json<T: GenericClient>(
|
||||
columns.push(client.get_type(c.type_oid()).await?);
|
||||
}
|
||||
|
||||
let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode);
|
||||
let array_mode = data.array_mode.unwrap_or(default_array_mode);
|
||||
|
||||
// convert rows to JSON
|
||||
let rows = rows
|
||||
.iter()
|
||||
.map(|row| pg_text_row_to_json(row, &columns, parsed_headers.raw_output, array_mode))
|
||||
.map(|row| pg_text_row_to_json(row, &columns, raw_output, array_mode))
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
|
||||
// resulting JSON format is based on the format of node-postgres result
|
||||
|
||||
@@ -1,34 +1,20 @@
|
||||
//! Periodically collect proxy consumption metrics
|
||||
//! and push them to a HTTP endpoint.
|
||||
use crate::{
|
||||
config::{MetricBackupCollectionConfig, MetricCollectionConfig},
|
||||
context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
|
||||
http, BranchId, EndpointId,
|
||||
};
|
||||
use anyhow::Context;
|
||||
use async_compression::tokio::write::GzipEncoder;
|
||||
use bytes::Bytes;
|
||||
use chrono::{DateTime, Datelike, Timelike, Utc};
|
||||
use crate::{config::MetricCollectionConfig, http, BranchId, EndpointId};
|
||||
use chrono::{DateTime, Utc};
|
||||
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
|
||||
use dashmap::{mapref::entry::Entry, DashMap};
|
||||
use futures::future::select;
|
||||
use once_cell::sync::Lazy;
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
convert::Infallible,
|
||||
pin::pin,
|
||||
sync::{
|
||||
atomic::{AtomicU64, AtomicUsize, Ordering},
|
||||
Arc,
|
||||
},
|
||||
time::Duration,
|
||||
};
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info, instrument, trace};
|
||||
use utils::backoff;
|
||||
use uuid::{NoContext, Timestamp};
|
||||
|
||||
const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
|
||||
|
||||
@@ -47,93 +33,19 @@ pub struct Ids {
|
||||
pub branch_id: BranchId,
|
||||
}
|
||||
|
||||
pub trait MetricCounterRecorder {
|
||||
/// Record that some bytes were sent from the proxy to the client
|
||||
fn record_egress(&self, bytes: u64);
|
||||
/// Record that some connections were opened
|
||||
fn record_connection(&self, count: usize);
|
||||
}
|
||||
|
||||
trait MetricCounterReporter {
|
||||
fn get_metrics(&mut self) -> (u64, usize);
|
||||
fn move_metrics(&self) -> (u64, usize);
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct MetricBackupCounter {
|
||||
transmitted: AtomicU64,
|
||||
opened_connections: AtomicUsize,
|
||||
}
|
||||
|
||||
impl MetricCounterRecorder for MetricBackupCounter {
|
||||
fn record_egress(&self, bytes: u64) {
|
||||
self.transmitted.fetch_add(bytes, Ordering::AcqRel);
|
||||
}
|
||||
|
||||
fn record_connection(&self, count: usize) {
|
||||
self.opened_connections.fetch_add(count, Ordering::AcqRel);
|
||||
}
|
||||
}
|
||||
|
||||
impl MetricCounterReporter for MetricBackupCounter {
|
||||
fn get_metrics(&mut self) -> (u64, usize) {
|
||||
(
|
||||
*self.transmitted.get_mut(),
|
||||
*self.opened_connections.get_mut(),
|
||||
)
|
||||
}
|
||||
fn move_metrics(&self) -> (u64, usize) {
|
||||
(
|
||||
self.transmitted.swap(0, Ordering::AcqRel),
|
||||
self.opened_connections.swap(0, Ordering::AcqRel),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct MetricCounter {
|
||||
transmitted: AtomicU64,
|
||||
opened_connections: AtomicUsize,
|
||||
backup: Arc<MetricBackupCounter>,
|
||||
}
|
||||
|
||||
impl MetricCounterRecorder for MetricCounter {
|
||||
impl MetricCounter {
|
||||
/// Record that some bytes were sent from the proxy to the client
|
||||
fn record_egress(&self, bytes: u64) {
|
||||
pub fn record_egress(&self, bytes: u64) {
|
||||
self.transmitted.fetch_add(bytes, Ordering::AcqRel);
|
||||
self.backup.record_egress(bytes);
|
||||
}
|
||||
|
||||
/// Record that some connections were opened
|
||||
fn record_connection(&self, count: usize) {
|
||||
self.opened_connections.fetch_add(count, Ordering::AcqRel);
|
||||
self.backup.record_connection(count);
|
||||
}
|
||||
}
|
||||
|
||||
impl MetricCounterReporter for MetricCounter {
|
||||
fn get_metrics(&mut self) -> (u64, usize) {
|
||||
(
|
||||
*self.transmitted.get_mut(),
|
||||
*self.opened_connections.get_mut(),
|
||||
)
|
||||
}
|
||||
fn move_metrics(&self) -> (u64, usize) {
|
||||
(
|
||||
self.transmitted.swap(0, Ordering::AcqRel),
|
||||
self.opened_connections.swap(0, Ordering::AcqRel),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
trait Clearable {
|
||||
/// extract the value that should be reported
|
||||
fn should_report(self: &Arc<Self>) -> Option<u64>;
|
||||
/// Determine whether the counter should be cleared from the global map.
|
||||
fn should_clear(self: &mut Arc<Self>) -> bool;
|
||||
}
|
||||
|
||||
impl<C: MetricCounterReporter> Clearable for C {
|
||||
fn should_report(self: &Arc<Self>) -> Option<u64> {
|
||||
// heuristic to see if the branch is still open
|
||||
// if a clone happens while we are observing, the heuristic will be incorrect.
|
||||
@@ -142,12 +54,13 @@ impl<C: MetricCounterReporter> Clearable for C {
|
||||
// However, for the strong count to be 1 it must have occured that at one instant
|
||||
// all the endpoints were closed, so missing a report because the endpoints are closed is valid.
|
||||
let is_open = Arc::strong_count(self) > 1;
|
||||
let opened = self.opened_connections.swap(0, Ordering::AcqRel);
|
||||
|
||||
// update cached metrics eagerly, even if they can't get sent
|
||||
// (to avoid sending the same metrics twice)
|
||||
// see the relevant discussion on why to do so even if the status is not success:
|
||||
// https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956
|
||||
let (value, opened) = self.move_metrics();
|
||||
let value = self.transmitted.swap(0, Ordering::AcqRel);
|
||||
|
||||
// Our only requirement is that we report in every interval if there was an open connection
|
||||
// if there were no opened connections since, then we don't need to report
|
||||
@@ -157,12 +70,15 @@ impl<C: MetricCounterReporter> Clearable for C {
|
||||
Some(value)
|
||||
}
|
||||
}
|
||||
|
||||
/// Determine whether the counter should be cleared from the global map.
|
||||
fn should_clear(self: &mut Arc<Self>) -> bool {
|
||||
// we can't clear this entry if it's acquired elsewhere
|
||||
let Some(counter) = Arc::get_mut(self) else {
|
||||
return false;
|
||||
};
|
||||
let (opened, value) = counter.get_metrics();
|
||||
let opened = *counter.opened_connections.get_mut();
|
||||
let value = *counter.transmitted.get_mut();
|
||||
// clear if there's no data to report
|
||||
value == 0 && opened == 0
|
||||
}
|
||||
@@ -174,26 +90,11 @@ type FastHasher = std::hash::BuildHasherDefault<rustc_hash::FxHasher>;
|
||||
#[derive(Default)]
|
||||
pub struct Metrics {
|
||||
endpoints: DashMap<Ids, Arc<MetricCounter>, FastHasher>,
|
||||
backup_endpoints: DashMap<Ids, Arc<MetricBackupCounter>, FastHasher>,
|
||||
}
|
||||
|
||||
impl Metrics {
|
||||
/// Register a new byte metrics counter for this endpoint
|
||||
pub fn register(&self, ids: Ids) -> Arc<MetricCounter> {
|
||||
let backup = if let Some(entry) = self.backup_endpoints.get(&ids) {
|
||||
entry.clone()
|
||||
} else {
|
||||
self.backup_endpoints
|
||||
.entry(ids.clone())
|
||||
.or_insert_with(|| {
|
||||
Arc::new(MetricBackupCounter {
|
||||
transmitted: AtomicU64::new(0),
|
||||
opened_connections: AtomicUsize::new(0),
|
||||
})
|
||||
})
|
||||
.clone()
|
||||
};
|
||||
|
||||
let entry = if let Some(entry) = self.endpoints.get(&ids) {
|
||||
entry.clone()
|
||||
} else {
|
||||
@@ -203,13 +104,12 @@ impl Metrics {
|
||||
Arc::new(MetricCounter {
|
||||
transmitted: AtomicU64::new(0),
|
||||
opened_connections: AtomicUsize::new(0),
|
||||
backup: backup.clone(),
|
||||
})
|
||||
})
|
||||
.clone()
|
||||
};
|
||||
|
||||
entry.record_connection(1);
|
||||
entry.opened_connections.fetch_add(1, Ordering::AcqRel);
|
||||
entry
|
||||
}
|
||||
}
|
||||
@@ -232,7 +132,7 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infall
|
||||
|
||||
let now = Utc::now();
|
||||
collect_metrics_iteration(
|
||||
&USAGE_METRICS.endpoints,
|
||||
&USAGE_METRICS,
|
||||
&http_client,
|
||||
&config.endpoint,
|
||||
&hostname,
|
||||
@@ -244,66 +144,9 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infall
|
||||
}
|
||||
}
|
||||
|
||||
fn collect_and_clear_metrics<C: Clearable>(
|
||||
endpoints: &DashMap<Ids, Arc<C>, FastHasher>,
|
||||
) -> Vec<(Ids, u64)> {
|
||||
let mut metrics_to_clear = Vec::new();
|
||||
|
||||
let metrics_to_send: Vec<(Ids, u64)> = endpoints
|
||||
.iter()
|
||||
.filter_map(|counter| {
|
||||
let key = counter.key().clone();
|
||||
let Some(value) = counter.should_report() else {
|
||||
metrics_to_clear.push(key);
|
||||
return None;
|
||||
};
|
||||
Some((key, value))
|
||||
})
|
||||
.collect();
|
||||
|
||||
for metric in metrics_to_clear {
|
||||
match endpoints.entry(metric) {
|
||||
Entry::Occupied(mut counter) => {
|
||||
if counter.get_mut().should_clear() {
|
||||
counter.remove_entry();
|
||||
}
|
||||
}
|
||||
Entry::Vacant(_) => {}
|
||||
}
|
||||
}
|
||||
metrics_to_send
|
||||
}
|
||||
|
||||
fn create_event_chunks<'a>(
|
||||
metrics_to_send: &'a [(Ids, u64)],
|
||||
hostname: &'a str,
|
||||
prev: DateTime<Utc>,
|
||||
now: DateTime<Utc>,
|
||||
chunk_size: usize,
|
||||
) -> impl Iterator<Item = EventChunk<'a, Event<Ids, &'static str>>> + 'a {
|
||||
// Split into chunks of 1000 metrics to avoid exceeding the max request size
|
||||
metrics_to_send
|
||||
.chunks(chunk_size)
|
||||
.map(move |chunk| EventChunk {
|
||||
events: chunk
|
||||
.iter()
|
||||
.map(|(ids, value)| Event {
|
||||
kind: EventType::Incremental {
|
||||
start_time: prev,
|
||||
stop_time: now,
|
||||
},
|
||||
metric: PROXY_IO_BYTES_PER_CLIENT,
|
||||
idempotency_key: idempotency_key(hostname),
|
||||
value: *value,
|
||||
extra: ids.clone(),
|
||||
})
|
||||
.collect(),
|
||||
})
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn collect_metrics_iteration(
|
||||
endpoints: &DashMap<Ids, Arc<MetricCounter>, FastHasher>,
|
||||
metrics: &Metrics,
|
||||
client: &http::ClientWithMiddleware,
|
||||
metric_collection_endpoint: &reqwest::Url,
|
||||
hostname: &str,
|
||||
@@ -315,17 +158,48 @@ async fn collect_metrics_iteration(
|
||||
metric_collection_endpoint
|
||||
);
|
||||
|
||||
let metrics_to_send = collect_and_clear_metrics(endpoints);
|
||||
let mut metrics_to_clear = Vec::new();
|
||||
|
||||
let metrics_to_send: Vec<(Ids, u64)> = metrics
|
||||
.endpoints
|
||||
.iter()
|
||||
.filter_map(|counter| {
|
||||
let key = counter.key().clone();
|
||||
let Some(value) = counter.should_report() else {
|
||||
metrics_to_clear.push(key);
|
||||
return None;
|
||||
};
|
||||
Some((key, value))
|
||||
})
|
||||
.collect();
|
||||
|
||||
if metrics_to_send.is_empty() {
|
||||
trace!("no new metrics to send");
|
||||
}
|
||||
|
||||
// Send metrics.
|
||||
for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, CHUNK_SIZE) {
|
||||
// Split into chunks of 1000 metrics to avoid exceeding the max request size
|
||||
for chunk in metrics_to_send.chunks(CHUNK_SIZE) {
|
||||
let events = chunk
|
||||
.iter()
|
||||
.map(|(ids, value)| Event {
|
||||
kind: EventType::Incremental {
|
||||
start_time: prev,
|
||||
stop_time: now,
|
||||
},
|
||||
metric: PROXY_IO_BYTES_PER_CLIENT,
|
||||
idempotency_key: idempotency_key(hostname),
|
||||
value: *value,
|
||||
extra: Ids {
|
||||
endpoint_id: ids.endpoint_id.clone(),
|
||||
branch_id: ids.branch_id.clone(),
|
||||
},
|
||||
})
|
||||
.collect();
|
||||
|
||||
let res = client
|
||||
.post(metric_collection_endpoint.clone())
|
||||
.json(&chunk)
|
||||
.json(&EventChunk { events })
|
||||
.send()
|
||||
.await;
|
||||
|
||||
@@ -339,144 +213,25 @@ async fn collect_metrics_iteration(
|
||||
|
||||
if !res.status().is_success() {
|
||||
error!("metrics endpoint refused the sent metrics: {:?}", res);
|
||||
for metric in chunk.events.iter().filter(|e| e.value > (1u64 << 40)) {
|
||||
for metric in chunk.iter().filter(|(_, value)| *value > (1u64 << 40)) {
|
||||
// Report if the metric value is suspiciously large
|
||||
error!("potentially abnormal metric value: {:?}", metric);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn task_backup(
|
||||
backup_config: &MetricBackupCollectionConfig,
|
||||
cancellation_token: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
info!("metrics backup config: {backup_config:?}");
|
||||
scopeguard::defer! {
|
||||
info!("metrics backup has shut down");
|
||||
}
|
||||
// Even if the remote storage is not configured, we still want to clear the metrics.
|
||||
let storage = backup_config
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.map(|config| GenericRemoteStorage::from_config(config).context("remote storage init"))
|
||||
.transpose()?;
|
||||
let mut ticker = tokio::time::interval(backup_config.interval);
|
||||
let mut prev = Utc::now();
|
||||
let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();
|
||||
loop {
|
||||
select(pin!(ticker.tick()), pin!(cancellation_token.cancelled())).await;
|
||||
let now = Utc::now();
|
||||
collect_metrics_backup_iteration(
|
||||
&USAGE_METRICS.backup_endpoints,
|
||||
&storage,
|
||||
&hostname,
|
||||
prev,
|
||||
now,
|
||||
backup_config.chunk_size,
|
||||
)
|
||||
.await;
|
||||
|
||||
prev = now;
|
||||
if cancellation_token.is_cancelled() {
|
||||
info!("metrics backup has been cancelled");
|
||||
break;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn collect_metrics_backup_iteration(
|
||||
endpoints: &DashMap<Ids, Arc<MetricBackupCounter>, FastHasher>,
|
||||
storage: &Option<GenericRemoteStorage>,
|
||||
hostname: &str,
|
||||
prev: DateTime<Utc>,
|
||||
now: DateTime<Utc>,
|
||||
chunk_size: usize,
|
||||
) {
|
||||
let year = now.year();
|
||||
let month = now.month();
|
||||
let day = now.day();
|
||||
let hour = now.hour();
|
||||
let minute = now.minute();
|
||||
let second = now.second();
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
info!("starting collect_metrics_backup_iteration");
|
||||
|
||||
let metrics_to_send = collect_and_clear_metrics(endpoints);
|
||||
|
||||
if metrics_to_send.is_empty() {
|
||||
trace!("no new metrics to send");
|
||||
}
|
||||
|
||||
// Send metrics.
|
||||
for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, chunk_size) {
|
||||
let real_now = Utc::now();
|
||||
let id = uuid::Uuid::new_v7(Timestamp::from_unix(
|
||||
NoContext,
|
||||
real_now.second().into(),
|
||||
real_now.nanosecond(),
|
||||
));
|
||||
let path = format!("year={year:04}/month={month:02}/day={day:02}/{hour:02}:{minute:02}:{second:02}Z_{id}.json.gz");
|
||||
let remote_path = match RemotePath::from_string(&path) {
|
||||
Ok(remote_path) => remote_path,
|
||||
Err(e) => {
|
||||
error!("failed to create remote path from str {path}: {:?}", e);
|
||||
continue;
|
||||
for metric in metrics_to_clear {
|
||||
match metrics.endpoints.entry(metric) {
|
||||
Entry::Occupied(mut counter) => {
|
||||
if counter.get_mut().should_clear() {
|
||||
counter.remove_entry();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let res = upload_events_chunk(storage, chunk, &remote_path, &cancel).await;
|
||||
|
||||
if let Err(e) = res {
|
||||
error!(
|
||||
"failed to upload consumption events to remote storage: {:?}",
|
||||
e
|
||||
);
|
||||
Entry::Vacant(_) => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn upload_events_chunk(
|
||||
storage: &Option<GenericRemoteStorage>,
|
||||
chunk: EventChunk<'_, Event<Ids, &'static str>>,
|
||||
remote_path: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let storage = match storage {
|
||||
Some(storage) => storage,
|
||||
None => {
|
||||
error!("no remote storage configured");
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
let data = serde_json::to_vec(&chunk).context("serialize metrics")?;
|
||||
let mut encoder = GzipEncoder::new(Vec::new());
|
||||
encoder.write_all(&data).await.context("compress metrics")?;
|
||||
encoder.shutdown().await.context("compress metrics")?;
|
||||
let compressed_data: Bytes = encoder.get_ref().clone().into();
|
||||
backoff::retry(
|
||||
|| async {
|
||||
let stream = futures::stream::once(futures::future::ready(Ok(compressed_data.clone())));
|
||||
storage
|
||||
.upload(stream, compressed_data.len(), remote_path, None, cancel)
|
||||
.await
|
||||
},
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_UPLOAD_MAX_RETRIES,
|
||||
"request_data_upload",
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||
.and_then(|x| x)
|
||||
.context("request_data_upload")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{
|
||||
@@ -493,7 +248,7 @@ mod tests {
|
||||
};
|
||||
use url::Url;
|
||||
|
||||
use super::*;
|
||||
use super::{collect_metrics_iteration, Ids, Metrics};
|
||||
use crate::{http, rate_limiter::RateLimiterConfig};
|
||||
|
||||
#[tokio::test]
|
||||
@@ -529,19 +284,18 @@ mod tests {
|
||||
let now = Utc::now();
|
||||
|
||||
// no counters have been registered
|
||||
collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
|
||||
collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
|
||||
let r = std::mem::take(&mut *reports2.lock().unwrap());
|
||||
assert!(r.is_empty());
|
||||
|
||||
// register a new counter
|
||||
|
||||
let counter = metrics.register(Ids {
|
||||
endpoint_id: "e1".into(),
|
||||
branch_id: "b1".into(),
|
||||
});
|
||||
|
||||
// the counter should be observed despite 0 egress
|
||||
collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
|
||||
collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
|
||||
let r = std::mem::take(&mut *reports2.lock().unwrap());
|
||||
assert_eq!(r.len(), 1);
|
||||
assert_eq!(r[0].events.len(), 1);
|
||||
@@ -551,7 +305,7 @@ mod tests {
|
||||
counter.record_egress(1);
|
||||
|
||||
// egress should be observered
|
||||
collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
|
||||
collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
|
||||
let r = std::mem::take(&mut *reports2.lock().unwrap());
|
||||
assert_eq!(r.len(), 1);
|
||||
assert_eq!(r[0].events.len(), 1);
|
||||
@@ -561,19 +315,11 @@ mod tests {
|
||||
drop(counter);
|
||||
|
||||
// we do not observe the counter
|
||||
collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
|
||||
collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await;
|
||||
let r = std::mem::take(&mut *reports2.lock().unwrap());
|
||||
assert!(r.is_empty());
|
||||
|
||||
// counter is unregistered
|
||||
assert!(metrics.endpoints.is_empty());
|
||||
|
||||
collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000)
|
||||
.await;
|
||||
assert!(!metrics.backup_endpoints.is_empty());
|
||||
collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000)
|
||||
.await;
|
||||
// backup counter is unregistered after the second iteration
|
||||
assert!(metrics.backup_endpoints.is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -94,5 +94,4 @@ select = [
|
||||
"I", # isort
|
||||
"W", # pycodestyle
|
||||
"B", # bugbear
|
||||
"UP032", # f-string
|
||||
]
|
||||
|
||||
@@ -33,7 +33,6 @@ once_cell.workspace = true
|
||||
parking_lot.workspace = true
|
||||
postgres.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
rand.workspace = true
|
||||
regex.workspace = true
|
||||
scopeguard.workspace = true
|
||||
reqwest = { workspace = true, features = ["json"] }
|
||||
|
||||
@@ -28,7 +28,7 @@ use utils::pid_file;
|
||||
use metrics::set_build_info_metric;
|
||||
use safekeeper::defaults::{
|
||||
DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
|
||||
DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
|
||||
DEFAULT_PG_LISTEN_ADDR,
|
||||
};
|
||||
use safekeeper::wal_service;
|
||||
use safekeeper::GlobalTimelines;
|
||||
@@ -170,13 +170,6 @@ struct Args {
|
||||
/// still needed for existing replication connection.
|
||||
#[arg(long)]
|
||||
walsenders_keep_horizon: bool,
|
||||
/// Enable partial backup. If disabled, safekeeper will not upload partial
|
||||
/// segments to remote storage.
|
||||
#[arg(long)]
|
||||
partial_backup_enabled: bool,
|
||||
/// Controls how long backup will wait until uploading the partial segment.
|
||||
#[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)]
|
||||
partial_backup_timeout: Duration,
|
||||
}
|
||||
|
||||
// Like PathBufValueParser, but allows empty string.
|
||||
@@ -307,8 +300,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
http_auth,
|
||||
current_thread_runtime: args.current_thread_runtime,
|
||||
walsenders_keep_horizon: args.walsenders_keep_horizon,
|
||||
partial_backup_enabled: args.partial_backup_enabled,
|
||||
partial_backup_timeout: args.partial_backup_timeout,
|
||||
};
|
||||
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
@@ -374,8 +365,6 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
|
||||
let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
|
||||
|
||||
wal_backup::init_remote_storage(&conf);
|
||||
|
||||
// Keep handles to main tasks to die if any of them disappears.
|
||||
let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =
|
||||
FuturesUnordered::new();
|
||||
|
||||
@@ -20,7 +20,7 @@ use utils::{bin_ser::LeSer, id::TenantTimelineId};
|
||||
use crate::SafeKeeperConf;
|
||||
|
||||
pub const SK_MAGIC: u32 = 0xcafeceefu32;
|
||||
pub const SK_FORMAT_VERSION: u32 = 8;
|
||||
pub const SK_FORMAT_VERSION: u32 = 7;
|
||||
|
||||
// contains persistent metadata for safekeeper
|
||||
const CONTROL_FILE_NAME: &str = "safekeeper.control";
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
use crate::{
|
||||
safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn},
|
||||
state::{PersistedPeers, TimelinePersistentState},
|
||||
wal_backup_partial,
|
||||
};
|
||||
use anyhow::{bail, Result};
|
||||
use pq_proto::SystemId;
|
||||
@@ -139,50 +138,6 @@ pub struct SafeKeeperStateV4 {
|
||||
pub peers: PersistedPeers,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct SafeKeeperStateV7 {
|
||||
#[serde(with = "hex")]
|
||||
pub tenant_id: TenantId,
|
||||
#[serde(with = "hex")]
|
||||
pub timeline_id: TimelineId,
|
||||
/// persistent acceptor state
|
||||
pub acceptor_state: AcceptorState,
|
||||
/// information about server
|
||||
pub server: ServerInfo,
|
||||
/// Unique id of the last *elected* proposer we dealt with. Not needed
|
||||
/// for correctness, exists for monitoring purposes.
|
||||
#[serde(with = "hex")]
|
||||
pub proposer_uuid: PgUuid,
|
||||
/// Since which LSN this timeline generally starts. Safekeeper might have
|
||||
/// joined later.
|
||||
pub timeline_start_lsn: Lsn,
|
||||
/// Since which LSN safekeeper has (had) WAL for this timeline.
|
||||
/// All WAL segments next to one containing local_start_lsn are
|
||||
/// filled with data from the beginning.
|
||||
pub local_start_lsn: Lsn,
|
||||
/// Part of WAL acknowledged by quorum *and available locally*. Always points
|
||||
/// to record boundary.
|
||||
pub commit_lsn: Lsn,
|
||||
/// LSN that points to the end of the last backed up segment. Useful to
|
||||
/// persist to avoid finding out offloading progress on boot.
|
||||
pub backup_lsn: Lsn,
|
||||
/// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn
|
||||
/// of last record streamed to everyone). Persisting it helps skipping
|
||||
/// recovery in walproposer, generally we compute it from peers. In
|
||||
/// walproposer proto called 'truncate_lsn'. Updates are currently drived
|
||||
/// only by walproposer.
|
||||
pub peer_horizon_lsn: Lsn,
|
||||
/// LSN of the oldest known checkpoint made by pageserver and successfully
|
||||
/// pushed to s3. We don't remove WAL beyond it. Persisted only for
|
||||
/// informational purposes, we receive it from pageserver (or broker).
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
// Peers and their state as we remember it. Knowing peers themselves is
|
||||
// fundamental; but state is saved here only for informational purposes and
|
||||
// obviously can be stale. (Currently not saved at all, but let's provision
|
||||
// place to have less file version upgrades).
|
||||
pub peers: PersistedPeers,
|
||||
}
|
||||
|
||||
pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersistentState> {
|
||||
// migrate to storing full term history
|
||||
if version == 1 {
|
||||
@@ -212,7 +167,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
peer_horizon_lsn: oldstate.truncate_lsn,
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: PersistedPeers(vec![]),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
});
|
||||
// migrate to hexing some ids
|
||||
} else if version == 2 {
|
||||
@@ -236,7 +190,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
peer_horizon_lsn: oldstate.truncate_lsn,
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: PersistedPeers(vec![]),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
});
|
||||
// migrate to moving tenant_id/timeline_id to the top and adding some lsns
|
||||
} else if version == 3 {
|
||||
@@ -260,7 +213,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
peer_horizon_lsn: oldstate.truncate_lsn,
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: PersistedPeers(vec![]),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
});
|
||||
// migrate to having timeline_start_lsn
|
||||
} else if version == 4 {
|
||||
@@ -284,7 +236,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
peer_horizon_lsn: oldstate.peer_horizon_lsn,
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: PersistedPeers(vec![]),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
});
|
||||
} else if version == 5 {
|
||||
info!("reading safekeeper control file version {}", version);
|
||||
@@ -311,30 +262,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
oldstate.server.pg_version = 140005;
|
||||
|
||||
return Ok(oldstate);
|
||||
} else if version == 7 {
|
||||
info!("reading safekeeper control file version {}", version);
|
||||
let oldstate = SafeKeeperStateV7::des(&buf[..buf.len()])?;
|
||||
|
||||
return Ok(TimelinePersistentState {
|
||||
tenant_id: oldstate.tenant_id,
|
||||
timeline_id: oldstate.timeline_id,
|
||||
acceptor_state: oldstate.acceptor_state,
|
||||
server: oldstate.server,
|
||||
proposer_uuid: oldstate.proposer_uuid,
|
||||
timeline_start_lsn: oldstate.timeline_start_lsn,
|
||||
local_start_lsn: oldstate.local_start_lsn,
|
||||
commit_lsn: oldstate.commit_lsn,
|
||||
backup_lsn: oldstate.backup_lsn,
|
||||
peer_horizon_lsn: oldstate.peer_horizon_lsn,
|
||||
remote_consistent_lsn: oldstate.remote_consistent_lsn,
|
||||
peers: oldstate.peers,
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
});
|
||||
}
|
||||
|
||||
// TODO: persist the file back to the disk after upgrade
|
||||
// TODO: think about backward compatibility and rollbacks
|
||||
|
||||
bail!("unsupported safekeeper control file version {}", version)
|
||||
}
|
||||
|
||||
|
||||
@@ -32,7 +32,6 @@ pub mod send_wal;
|
||||
pub mod state;
|
||||
pub mod timeline;
|
||||
pub mod wal_backup;
|
||||
pub mod wal_backup_partial;
|
||||
pub mod wal_service;
|
||||
pub mod wal_storage;
|
||||
|
||||
@@ -49,7 +48,6 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_HEARTBEAT_TIMEOUT: &str = "5000ms";
|
||||
pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20);
|
||||
pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m";
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -81,8 +79,6 @@ pub struct SafeKeeperConf {
|
||||
pub http_auth: Option<Arc<SwappableJwtAuth>>,
|
||||
pub current_thread_runtime: bool,
|
||||
pub walsenders_keep_horizon: bool,
|
||||
pub partial_backup_enabled: bool,
|
||||
pub partial_backup_timeout: Duration,
|
||||
}
|
||||
|
||||
impl SafeKeeperConf {
|
||||
@@ -127,8 +123,6 @@ impl SafeKeeperConf {
|
||||
max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
|
||||
current_thread_runtime: false,
|
||||
walsenders_keep_horizon: false,
|
||||
partial_backup_enabled: false,
|
||||
partial_backup_timeout: Duration::from_secs(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -147,21 +147,6 @@ pub static RECEIVED_PS_FEEDBACKS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
)
|
||||
.expect("Failed to register safekeeper_received_ps_feedbacks_total counter")
|
||||
});
|
||||
pub static PARTIAL_BACKUP_UPLOADS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"safekeeper_partial_backup_uploads_total",
|
||||
"Number of partial backup uploads to the S3",
|
||||
&["result"]
|
||||
)
|
||||
.expect("Failed to register safekeeper_partial_backup_uploads_total counter")
|
||||
});
|
||||
pub static PARTIAL_BACKUP_UPLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"safekeeper_partial_backup_uploaded_bytes_total",
|
||||
"Number of bytes uploaded to the S3 during partial backup"
|
||||
)
|
||||
.expect("Failed to register safekeeper_partial_backup_uploaded_bytes_total counter")
|
||||
});
|
||||
|
||||
pub const LABEL_UNKNOWN: &str = "unknown";
|
||||
|
||||
|
||||
@@ -1221,7 +1221,6 @@ mod tests {
|
||||
commit_lsn: Lsn(1234567600),
|
||||
},
|
||||
)]),
|
||||
partial_backup: crate::wal_backup_partial::State::default(),
|
||||
};
|
||||
|
||||
let ser = state.ser().unwrap();
|
||||
@@ -1267,8 +1266,6 @@ mod tests {
|
||||
0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
|
||||
0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
|
||||
// partial_backup
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
];
|
||||
|
||||
assert_eq!(Hex(&ser), Hex(&expected));
|
||||
|
||||
@@ -13,7 +13,6 @@ use utils::{
|
||||
use crate::{
|
||||
control_file,
|
||||
safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, TermHistory},
|
||||
wal_backup_partial::{self},
|
||||
};
|
||||
|
||||
/// Persistent information stored on safekeeper node about timeline.
|
||||
@@ -55,14 +54,11 @@ pub struct TimelinePersistentState {
|
||||
/// pushed to s3. We don't remove WAL beyond it. Persisted only for
|
||||
/// informational purposes, we receive it from pageserver (or broker).
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
/// Peers and their state as we remember it. Knowing peers themselves is
|
||||
/// fundamental; but state is saved here only for informational purposes and
|
||||
/// obviously can be stale. (Currently not saved at all, but let's provision
|
||||
/// place to have less file version upgrades).
|
||||
// Peers and their state as we remember it. Knowing peers themselves is
|
||||
// fundamental; but state is saved here only for informational purposes and
|
||||
// obviously can be stale. (Currently not saved at all, but let's provision
|
||||
// place to have less file version upgrades).
|
||||
pub peers: PersistedPeers,
|
||||
/// Holds names of partial segments uploaded to remote storage. Used to
|
||||
/// clean up old objects without leaving garbage in remote storage.
|
||||
pub partial_backup: wal_backup_partial::State,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
@@ -97,7 +93,6 @@ impl TimelinePersistentState {
|
||||
.map(|p| (*p, PersistedPeerInfo::new()))
|
||||
.collect(),
|
||||
),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -38,7 +38,7 @@ use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
|
||||
|
||||
use crate::metrics::FullTimelineInfo;
|
||||
use crate::wal_storage::Storage as wal_storage_iface;
|
||||
use crate::{debug_dump, wal_backup_partial, wal_storage};
|
||||
use crate::{debug_dump, wal_storage};
|
||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
|
||||
/// Things safekeeper should know about timeline state on peers.
|
||||
@@ -503,9 +503,6 @@ impl Timeline {
|
||||
if conf.peer_recovery_enabled {
|
||||
tokio::spawn(recovery_main(self.clone(), conf.clone()));
|
||||
}
|
||||
if conf.is_wal_backup_enabled() && conf.partial_backup_enabled {
|
||||
tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
/// Delete timeline from disk completely, by removing timeline directory.
|
||||
@@ -670,8 +667,8 @@ impl Timeline {
|
||||
term_flush_lsn =
|
||||
TermLsn::from((shared_state.sk.get_term(), shared_state.sk.flush_lsn()));
|
||||
}
|
||||
self.term_flush_lsn_watch_tx.send(term_flush_lsn)?;
|
||||
self.commit_lsn_watch_tx.send(commit_lsn)?;
|
||||
self.term_flush_lsn_watch_tx.send(term_flush_lsn)?;
|
||||
Ok(rmsg)
|
||||
}
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ use std::time::Duration;
|
||||
use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr;
|
||||
use postgres_ffi::XLogFileName;
|
||||
use postgres_ffi::{XLogSegNo, PG_TLI};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath, StorageMetadata};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||
use tokio::fs::File;
|
||||
|
||||
use tokio::select;
|
||||
@@ -180,16 +180,6 @@ fn get_configured_remote_storage() -> &'static GenericRemoteStorage {
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
pub fn init_remote_storage(conf: &SafeKeeperConf) {
|
||||
// TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide
|
||||
// dependencies to all tasks instead.
|
||||
REMOTE_STORAGE.get_or_init(|| {
|
||||
conf.remote_storage
|
||||
.as_ref()
|
||||
.map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage"))
|
||||
});
|
||||
}
|
||||
|
||||
const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000;
|
||||
|
||||
/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup
|
||||
@@ -204,6 +194,14 @@ pub async fn wal_backup_launcher_task_main(
|
||||
conf.remote_storage
|
||||
);
|
||||
|
||||
let conf_ = conf.clone();
|
||||
REMOTE_STORAGE.get_or_init(|| {
|
||||
conf_
|
||||
.remote_storage
|
||||
.as_ref()
|
||||
.map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage"))
|
||||
});
|
||||
|
||||
// Presence in this map means launcher is aware s3 offloading is needed for
|
||||
// the timeline, but task is started only if it makes sense for to offload
|
||||
// from this safekeeper.
|
||||
@@ -520,35 +518,6 @@ async fn backup_object(
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn backup_partial_segment(
|
||||
source_file: &Utf8Path,
|
||||
target_file: &RemotePath,
|
||||
size: usize,
|
||||
) -> Result<()> {
|
||||
let storage = get_configured_remote_storage();
|
||||
|
||||
let file = File::open(&source_file)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?;
|
||||
|
||||
// limiting the file to read only the first `size` bytes
|
||||
let limited_file = tokio::io::AsyncReadExt::take(file, size as u64);
|
||||
|
||||
let file = tokio_util::io::ReaderStream::with_capacity(limited_file, BUFFER_SIZE);
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
storage
|
||||
.upload(
|
||||
file,
|
||||
size,
|
||||
target_file,
|
||||
Some(StorageMetadata::from([("sk_type", "partial_segment")])),
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn read_object(
|
||||
file_path: &RemotePath,
|
||||
offset: u64,
|
||||
@@ -635,13 +604,6 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Used by wal_backup_partial.
|
||||
pub async fn delete_objects(paths: &[RemotePath]) -> Result<()> {
|
||||
let cancel = CancellationToken::new(); // not really used
|
||||
let storage = get_configured_remote_storage();
|
||||
storage.delete_objects(paths, &cancel).await
|
||||
}
|
||||
|
||||
/// Copy segments from one timeline to another. Used in copy_timeline.
|
||||
pub async fn copy_s3_segments(
|
||||
wal_seg_size: usize,
|
||||
|
||||
@@ -1,396 +0,0 @@
|
||||
//! Safekeeper timeline has a background task which is subscribed to `commit_lsn`
|
||||
//! and `flush_lsn` updates. After the partial segment was updated (`flush_lsn`
|
||||
//! was changed), the segment will be uploaded to S3 in about 15 minutes.
|
||||
//!
|
||||
//! The filename format for partial segments is
|
||||
//! `Segment_Term_Flush_Commit_skNN.partial`, where:
|
||||
//! - `Segment` – the segment name, like `000000010000000000000001`
|
||||
//! - `Term` – current term
|
||||
//! - `Flush` – flush_lsn in hex format `{:016X}`, e.g. `00000000346BC568`
|
||||
//! - `Commit` – commit_lsn in the same hex format
|
||||
//! - `NN` – safekeeper_id, like `1`
|
||||
//!
|
||||
//! The full object name example:
|
||||
//! `000000010000000000000002_2_0000000002534868_0000000002534410_sk1.partial`
|
||||
//!
|
||||
//! Each safekeeper will keep info about remote partial segments in its control
|
||||
//! file. Code updates state in the control file before doing any S3 operations.
|
||||
//! This way control file stores information about all potentially existing
|
||||
//! remote partial segments and can clean them up after uploading a newer version.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use camino::Utf8PathBuf;
|
||||
use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
|
||||
use rand::Rng;
|
||||
use remote_storage::RemotePath;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use tracing::{debug, error, info, instrument};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
metrics::{PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS},
|
||||
safekeeper::Term,
|
||||
timeline::Timeline,
|
||||
wal_backup, SafeKeeperConf,
|
||||
};
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub enum UploadStatus {
|
||||
/// Upload is in progress
|
||||
InProgress,
|
||||
/// Upload is finished
|
||||
Uploaded,
|
||||
/// Deletion is in progress
|
||||
Deleting,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct PartialRemoteSegment {
|
||||
pub status: UploadStatus,
|
||||
pub name: String,
|
||||
pub commit_lsn: Lsn,
|
||||
pub flush_lsn: Lsn,
|
||||
pub term: Term,
|
||||
}
|
||||
|
||||
impl PartialRemoteSegment {
|
||||
fn eq_without_status(&self, other: &Self) -> bool {
|
||||
self.name == other.name
|
||||
&& self.commit_lsn == other.commit_lsn
|
||||
&& self.flush_lsn == other.flush_lsn
|
||||
&& self.term == other.term
|
||||
}
|
||||
}
|
||||
|
||||
// NB: these structures are a part of a control_file, you can't change them without
|
||||
// changing the control file format version.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
|
||||
pub struct State {
|
||||
pub segments: Vec<PartialRemoteSegment>,
|
||||
}
|
||||
|
||||
impl State {
|
||||
/// Find an Uploaded segment. There should be only one Uploaded segment at a time.
|
||||
fn uploaded_segment(&self) -> Option<PartialRemoteSegment> {
|
||||
self.segments
|
||||
.iter()
|
||||
.find(|seg| seg.status == UploadStatus::Uploaded)
|
||||
.cloned()
|
||||
}
|
||||
}
|
||||
|
||||
struct PartialBackup {
|
||||
wal_seg_size: usize,
|
||||
tli: Arc<Timeline>,
|
||||
conf: SafeKeeperConf,
|
||||
local_prefix: Utf8PathBuf,
|
||||
remote_prefix: Utf8PathBuf,
|
||||
|
||||
state: State,
|
||||
}
|
||||
|
||||
// Read-only methods for getting segment names
|
||||
impl PartialBackup {
|
||||
fn segno(&self, lsn: Lsn) -> XLogSegNo {
|
||||
lsn.segment_number(self.wal_seg_size)
|
||||
}
|
||||
|
||||
fn segment_name(&self, segno: u64) -> String {
|
||||
XLogFileName(PG_TLI, segno, self.wal_seg_size)
|
||||
}
|
||||
|
||||
fn remote_segment_name(
|
||||
&self,
|
||||
segno: u64,
|
||||
term: u64,
|
||||
commit_lsn: Lsn,
|
||||
flush_lsn: Lsn,
|
||||
) -> String {
|
||||
format!(
|
||||
"{}_{}_{:016X}_{:016X}_sk{}.partial",
|
||||
self.segment_name(segno),
|
||||
term,
|
||||
flush_lsn.0,
|
||||
commit_lsn.0,
|
||||
self.conf.my_id.0,
|
||||
)
|
||||
}
|
||||
|
||||
fn local_segment_name(&self, segno: u64) -> String {
|
||||
format!("{}.partial", self.segment_name(segno))
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialBackup {
|
||||
/// Takes a lock to read actual safekeeper state and returns a segment that should be uploaded.
|
||||
async fn prepare_upload(&self) -> PartialRemoteSegment {
|
||||
// this operation takes a lock to get the actual state
|
||||
let sk_info = self.tli.get_safekeeper_info(&self.conf).await;
|
||||
let flush_lsn = Lsn(sk_info.flush_lsn);
|
||||
let commit_lsn = Lsn(sk_info.commit_lsn);
|
||||
let term = sk_info.term;
|
||||
let segno = self.segno(flush_lsn);
|
||||
|
||||
let name = self.remote_segment_name(segno, term, commit_lsn, flush_lsn);
|
||||
|
||||
PartialRemoteSegment {
|
||||
status: UploadStatus::InProgress,
|
||||
name,
|
||||
commit_lsn,
|
||||
flush_lsn,
|
||||
term,
|
||||
}
|
||||
}
|
||||
|
||||
/// Reads segment from disk and uploads it to the remote storage.
|
||||
async fn upload_segment(&mut self, prepared: PartialRemoteSegment) -> anyhow::Result<()> {
|
||||
let flush_lsn = prepared.flush_lsn;
|
||||
let segno = self.segno(flush_lsn);
|
||||
|
||||
// We're going to backup bytes from the start of the segment up to flush_lsn.
|
||||
let backup_bytes = flush_lsn.segment_offset(self.wal_seg_size);
|
||||
|
||||
let local_path = self.local_prefix.join(self.local_segment_name(segno));
|
||||
let remote_path = RemotePath::new(self.remote_prefix.join(&prepared.name).as_ref())?;
|
||||
|
||||
// Upload first `backup_bytes` bytes of the segment to the remote storage.
|
||||
wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?;
|
||||
PARTIAL_BACKUP_UPLOADED_BYTES.inc_by(backup_bytes as u64);
|
||||
|
||||
// We uploaded the segment, now let's verify that the data is still actual.
|
||||
// If the term changed, we cannot guarantee the validity of the uploaded data.
|
||||
// If the term is the same, we know the data is not corrupted.
|
||||
let sk_info = self.tli.get_safekeeper_info(&self.conf).await;
|
||||
if sk_info.term != prepared.term {
|
||||
anyhow::bail!("term changed during upload");
|
||||
}
|
||||
assert!(prepared.commit_lsn <= Lsn(sk_info.commit_lsn));
|
||||
assert!(prepared.flush_lsn <= Lsn(sk_info.flush_lsn));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Write new state to disk. If in-memory and on-disk states diverged, returns an error.
|
||||
async fn commit_state(&mut self, new_state: State) -> anyhow::Result<()> {
|
||||
self.tli
|
||||
.map_control_file(|cf| {
|
||||
if cf.partial_backup != self.state {
|
||||
let memory = self.state.clone();
|
||||
self.state = cf.partial_backup.clone();
|
||||
anyhow::bail!(
|
||||
"partial backup state diverged, memory={:?}, disk={:?}",
|
||||
memory,
|
||||
cf.partial_backup
|
||||
);
|
||||
}
|
||||
|
||||
cf.partial_backup = new_state.clone();
|
||||
Ok(())
|
||||
})
|
||||
.await?;
|
||||
// update in-memory state
|
||||
self.state = new_state;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Upload the latest version of the partial segment and garbage collect older versions.
|
||||
#[instrument(name = "upload", skip_all, fields(name = %prepared.name))]
|
||||
async fn do_upload(&mut self, prepared: &PartialRemoteSegment) -> anyhow::Result<()> {
|
||||
info!("starting upload {:?}", prepared);
|
||||
|
||||
let state_0 = self.state.clone();
|
||||
let state_1 = {
|
||||
let mut state = state_0.clone();
|
||||
state.segments.push(prepared.clone());
|
||||
state
|
||||
};
|
||||
|
||||
// we're going to upload a new segment, let's write it to disk to make GC later
|
||||
self.commit_state(state_1).await?;
|
||||
|
||||
self.upload_segment(prepared.clone()).await?;
|
||||
|
||||
let state_2 = {
|
||||
let mut state = state_0.clone();
|
||||
for seg in state.segments.iter_mut() {
|
||||
seg.status = UploadStatus::Deleting;
|
||||
}
|
||||
let mut actual_remote_segment = prepared.clone();
|
||||
actual_remote_segment.status = UploadStatus::Uploaded;
|
||||
state.segments.push(actual_remote_segment);
|
||||
state
|
||||
};
|
||||
|
||||
// we've uploaded new segment, it's actual, all other segments should be GCed
|
||||
self.commit_state(state_2).await?;
|
||||
self.gc().await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Delete all non-Uploaded segments from the remote storage. There should be only one
|
||||
/// Uploaded segment at a time.
|
||||
#[instrument(name = "gc", skip_all)]
|
||||
async fn gc(&mut self) -> anyhow::Result<()> {
|
||||
let mut segments_to_delete = vec![];
|
||||
|
||||
let new_segments: Vec<PartialRemoteSegment> = self
|
||||
.state
|
||||
.segments
|
||||
.iter()
|
||||
.filter_map(|seg| {
|
||||
if seg.status == UploadStatus::Uploaded {
|
||||
Some(seg.clone())
|
||||
} else {
|
||||
segments_to_delete.push(seg.name.clone());
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
info!("deleting objects: {:?}", segments_to_delete);
|
||||
let mut objects_to_delete = vec![];
|
||||
for seg in segments_to_delete.iter() {
|
||||
let remote_path = RemotePath::new(self.remote_prefix.join(seg).as_ref())?;
|
||||
objects_to_delete.push(remote_path);
|
||||
}
|
||||
|
||||
// removing segments from remote storage
|
||||
wal_backup::delete_objects(&objects_to_delete).await?;
|
||||
|
||||
// now we can update the state on disk
|
||||
let new_state = {
|
||||
let mut state = self.state.clone();
|
||||
state.segments = new_segments;
|
||||
state
|
||||
};
|
||||
self.commit_state(new_state).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))]
|
||||
pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
|
||||
debug!("started");
|
||||
let await_duration = conf.partial_backup_timeout;
|
||||
|
||||
let mut cancellation_rx = match tli.get_cancellation_rx() {
|
||||
Ok(rx) => rx,
|
||||
Err(_) => {
|
||||
info!("timeline canceled during task start");
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
// sleep for random time to avoid thundering herd
|
||||
{
|
||||
let randf64 = rand::thread_rng().gen_range(0.0..1.0);
|
||||
let sleep_duration = await_duration.mul_f64(randf64);
|
||||
tokio::time::sleep(sleep_duration).await;
|
||||
}
|
||||
|
||||
let (_, persistent_state) = tli.get_state().await;
|
||||
let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
|
||||
let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx();
|
||||
let wal_seg_size = tli.get_wal_seg_size().await;
|
||||
|
||||
let local_prefix = tli.timeline_dir.clone();
|
||||
let remote_prefix = match tli.timeline_dir.strip_prefix(&conf.workdir) {
|
||||
Ok(path) => path.to_owned(),
|
||||
Err(e) => {
|
||||
error!("failed to strip workspace dir prefix: {:?}", e);
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
let mut backup = PartialBackup {
|
||||
wal_seg_size,
|
||||
tli,
|
||||
state: persistent_state.partial_backup,
|
||||
conf,
|
||||
local_prefix,
|
||||
remote_prefix,
|
||||
};
|
||||
|
||||
debug!("state: {:?}", backup.state);
|
||||
|
||||
'outer: loop {
|
||||
// wait until we have something to upload
|
||||
let uploaded_segment = backup.state.uploaded_segment();
|
||||
if let Some(seg) = &uploaded_segment {
|
||||
// if we already uploaded something, wait until we have something new
|
||||
while flush_lsn_rx.borrow().lsn == seg.flush_lsn
|
||||
&& *commit_lsn_rx.borrow() == seg.commit_lsn
|
||||
&& flush_lsn_rx.borrow().term == seg.term
|
||||
{
|
||||
tokio::select! {
|
||||
_ = cancellation_rx.changed() => {
|
||||
info!("timeline canceled");
|
||||
return;
|
||||
}
|
||||
_ = commit_lsn_rx.changed() => {}
|
||||
_ = flush_lsn_rx.changed() => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// fixing the segno and waiting some time to prevent reuploading the same segment too often
|
||||
let pending_segno = backup.segno(flush_lsn_rx.borrow().lsn);
|
||||
let timeout = tokio::time::sleep(await_duration);
|
||||
tokio::pin!(timeout);
|
||||
let mut timeout_expired = false;
|
||||
|
||||
// waiting until timeout expires OR segno changes
|
||||
'inner: loop {
|
||||
tokio::select! {
|
||||
_ = cancellation_rx.changed() => {
|
||||
info!("timeline canceled");
|
||||
return;
|
||||
}
|
||||
_ = commit_lsn_rx.changed() => {}
|
||||
_ = flush_lsn_rx.changed() => {
|
||||
let segno = backup.segno(flush_lsn_rx.borrow().lsn);
|
||||
if segno != pending_segno {
|
||||
// previous segment is no longer partial, aborting the wait
|
||||
break 'inner;
|
||||
}
|
||||
}
|
||||
_ = &mut timeout => {
|
||||
// timeout expired, now we are ready for upload
|
||||
timeout_expired = true;
|
||||
break 'inner;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !timeout_expired {
|
||||
// likely segno has changed, let's try again in the next iteration
|
||||
continue 'outer;
|
||||
}
|
||||
|
||||
let prepared = backup.prepare_upload().await;
|
||||
if let Some(seg) = &uploaded_segment {
|
||||
if seg.eq_without_status(&prepared) {
|
||||
// we already uploaded this segment, nothing to do
|
||||
continue 'outer;
|
||||
}
|
||||
}
|
||||
|
||||
match backup.do_upload(&prepared).await {
|
||||
Ok(()) => {
|
||||
debug!(
|
||||
"uploaded {} up to flush_lsn {}",
|
||||
prepared.name, prepared.flush_lsn
|
||||
);
|
||||
PARTIAL_BACKUP_UPLOADS.with_label_values(&["ok"]).inc();
|
||||
}
|
||||
Err(e) => {
|
||||
info!("failed to upload {}: {:#}", prepared.name, e);
|
||||
PARTIAL_BACKUP_UPLOADS.with_label_values(&["error"]).inc();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -176,8 +176,6 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
|
||||
http_auth: None,
|
||||
current_thread_runtime: false,
|
||||
walsenders_keep_horizon: false,
|
||||
partial_backup_enabled: false,
|
||||
partial_backup_timeout: Duration::from_secs(0),
|
||||
};
|
||||
|
||||
let mut global = GlobalMap::new(disk, conf.clone())?;
|
||||
|
||||
@@ -64,14 +64,14 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
|
||||
Returns basepath for files with captured output.
|
||||
"""
|
||||
assert isinstance(cmd, list)
|
||||
base = f"{os.path.basename(cmd[0])}_{global_counter()}"
|
||||
base = os.path.basename(cmd[0]) + "_{}".format(global_counter())
|
||||
basepath = os.path.join(capture_dir, base)
|
||||
stdout_filename = basepath + ".stdout"
|
||||
stderr_filename = basepath + ".stderr"
|
||||
|
||||
with open(stdout_filename, "w") as stdout_f:
|
||||
with open(stderr_filename, "w") as stderr_f:
|
||||
print(f'(capturing output to "{base}.stdout")')
|
||||
print('(capturing output to "{}.stdout")'.format(base))
|
||||
subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)
|
||||
|
||||
return basepath
|
||||
@@ -82,9 +82,11 @@ class PgBin:
|
||||
|
||||
def __init__(self, log_dir: Path, pg_distrib_dir, pg_version):
|
||||
self.log_dir = log_dir
|
||||
self.pg_bin_path = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "bin")
|
||||
self.pg_bin_path = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "bin")
|
||||
self.env = os.environ.copy()
|
||||
self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "lib")
|
||||
self.env["LD_LIBRARY_PATH"] = os.path.join(
|
||||
str(pg_distrib_dir), "v{}".format(pg_version), "lib"
|
||||
)
|
||||
|
||||
def _fixpath(self, command: List[str]):
|
||||
if "/" not in command[0]:
|
||||
@@ -108,7 +110,7 @@ class PgBin:
|
||||
"""
|
||||
|
||||
self._fixpath(command)
|
||||
print(f'Running command "{" ".join(command)}"')
|
||||
print('Running command "{}"'.format(" ".join(command)))
|
||||
env = self._build_env(env)
|
||||
subprocess.run(command, env=env, cwd=cwd, check=True)
|
||||
|
||||
@@ -126,7 +128,7 @@ class PgBin:
|
||||
"""
|
||||
|
||||
self._fixpath(command)
|
||||
print(f'Running command "{" ".join(command)}"')
|
||||
print('Running command "{}"'.format(" ".join(command)))
|
||||
env = self._build_env(env)
|
||||
return subprocess_capture(
|
||||
str(self.log_dir), command, env=env, cwd=cwd, check=True, **kwargs
|
||||
@@ -298,7 +300,7 @@ class NeonPageserverHttpClient(requests.Session):
|
||||
|
||||
def lsn_to_hex(num: int) -> str:
|
||||
"""Convert lsn from int to standard hex notation."""
|
||||
return f"{num >> 32:X}/{num & 0xFFFFFFFF:X}"
|
||||
return "{:X}/{:X}".format(num >> 32, num & 0xFFFFFFFF)
|
||||
|
||||
|
||||
def lsn_from_hex(lsn_hex: str) -> int:
|
||||
@@ -329,12 +331,16 @@ def wait_for_upload(
|
||||
if current_lsn >= lsn:
|
||||
return
|
||||
print(
|
||||
f"waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, now {lsn_to_hex(current_lsn)}, iteration {i + 1}"
|
||||
"waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format(
|
||||
lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1
|
||||
)
|
||||
)
|
||||
time.sleep(1)
|
||||
|
||||
raise Exception(
|
||||
f"timed out while waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, was {lsn_to_hex(current_lsn)}"
|
||||
"timed out while waiting for remote_consistent_lsn to reach {}, was {}".format(
|
||||
lsn_to_hex(lsn), lsn_to_hex(current_lsn)
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -482,18 +482,20 @@ def pytest_terminal_summary(
|
||||
terminalreporter.section("Benchmark results", "-")
|
||||
is_header_printed = True
|
||||
|
||||
terminalreporter.write(f"{test_report.head_line}.{recorded_property['name']}: ")
|
||||
terminalreporter.write(
|
||||
"{}.{}: ".format(test_report.head_line, recorded_property["name"])
|
||||
)
|
||||
unit = recorded_property["unit"]
|
||||
value = recorded_property["value"]
|
||||
if unit == "MB":
|
||||
terminalreporter.write(f"{value:,.0f}", green=True)
|
||||
terminalreporter.write("{0:,.0f}".format(value), green=True)
|
||||
elif unit in ("s", "ms") and isinstance(value, float):
|
||||
terminalreporter.write(f"{value:,.3f}", green=True)
|
||||
terminalreporter.write("{0:,.3f}".format(value), green=True)
|
||||
elif isinstance(value, float):
|
||||
terminalreporter.write(f"{value:,.4f}", green=True)
|
||||
terminalreporter.write("{0:,.4f}".format(value), green=True)
|
||||
else:
|
||||
terminalreporter.write(str(value), green=True)
|
||||
terminalreporter.line(f" {unit}")
|
||||
terminalreporter.line(" {}".format(unit))
|
||||
|
||||
result_entry.append(recorded_property)
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user