Compare commits

..

1 Commits

Author SHA1 Message Date
Anna Khanova
136ed19387 Test 2024-03-27 13:42:33 +01:00
161 changed files with 1867 additions and 5816 deletions

View File

@@ -22,7 +22,6 @@
!s3_scrubber/
!safekeeper/
!storage_broker/
!storage_controller/
!trace/
!vendor/postgres-*/
!workspace_hack/

View File

@@ -147,16 +147,15 @@ jobs:
"neonvm-captest-new"
],
"db_size": [ "10gb" ],
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "platform": "neon-captest-new", "db_size": "50gb" },
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" },
{ "platform": "neonvm-captest-new", "db_size": "50gb" },
{ "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "platform": "neon-captest-new", "db_size": "50gb" },
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" },
{ "platform": "neonvm-captest-new", "db_size": "50gb" }]
}'
if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
{ "platform": "rds-aurora", "db_size": "50gb"}]')
{ "platform": "rds-aurora", "db_size": "50gb"}]')
fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -172,7 +171,7 @@ jobs:
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
{ "platform": "rds-aurora" }]')
{ "platform": "rds-aurora" }]')
fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -191,7 +190,7 @@ jobs:
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
{ "platform": "rds-aurora", "scale": "10" }]')
{ "platform": "rds-aurora", "scale": "10" }]')
fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -254,9 +253,6 @@ jobs:
neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
;;
neonvm-captest-sharding-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
;;
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
;;
@@ -274,15 +270,11 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()")
QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id")
QUERIES+=("SHOW neon.timeline_id")
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
fi
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
psql ${CONNSTR} -c "${QUERY}"
- name: Benchmark init
uses: ./.github/actions/run-python-test-set
@@ -409,15 +401,11 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()")
QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id")
QUERIES+=("SHOW neon.timeline_id")
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
fi
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
psql ${CONNSTR} -c "${QUERY}"
- name: ClickBench benchmark
uses: ./.github/actions/run-python-test-set
@@ -519,15 +507,11 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()")
QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id")
QUERIES+=("SHOW neon.timeline_id")
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
fi
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
psql ${CONNSTR} -c "${QUERY}"
- name: Run TPC-H benchmark
uses: ./.github/actions/run-python-test-set
@@ -613,15 +597,11 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()")
QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id")
QUERIES+=("SHOW neon.timeline_id")
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
fi
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
psql ${CONNSTR} -c "${QUERY}"
- name: Run user examples
uses: ./.github/actions/run-python-test-set

View File

@@ -1127,7 +1127,6 @@ jobs:
-f deployProxy=false \
-f deployStorage=true \
-f deployStorageBroker=true \
-f deployStorageController=true \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}} \
-f deployPreprodRegion=true
@@ -1137,7 +1136,6 @@ jobs:
-f deployProxy=false \
-f deployStorage=true \
-f deployStorageBroker=true \
-f deployStorageController=true \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}}
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
@@ -1146,7 +1144,6 @@ jobs:
-f deployProxy=true \
-f deployStorage=false \
-f deployStorageBroker=false \
-f deployStorageController=false \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}} \
-f deployPreprodRegion=true

View File

@@ -62,14 +62,14 @@ jobs:
trigger-e2e-tests:
needs: [ tag ]
runs-on: ubuntu-latest
runs-on: [ self-hosted, gen3, small ]
env:
TAG: ${{ needs.tag.outputs.build-tag }}
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
steps:
- name: check if ecr image are present
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
run: |
for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
@@ -79,55 +79,41 @@ jobs:
fi
done
- name: Set e2e-platforms
id: e2e-platforms
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
# Default set of platforms to run e2e tests on
platforms='["docker", "k8s"]'
# If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
# If the workflow run is not a pull request, add k8s-neonvm to the list.
if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
case "$f" in
vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
;;
*)
# no-op
;;
esac
done
else
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
fi
echo "e2e-platforms=${platforms}" | tee -a $GITHUB_OUTPUT
- name: Set PR's status to pending and request a remote CI test
env:
E2E_PLATFORMS: ${{ steps.e2e-platforms.outputs.e2e-platforms }}
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
REMOTE_REPO="${GITHUB_REPOSITORY_OWNER}/cloud"
# For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
# but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
# to place a job run status update later.
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
# For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
gh api "/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \
--method POST \
--raw-field "state=pending" \
--raw-field "description=[$REMOTE_REPO] Remote CI job is about to start" \
--raw-field "context=neon-cloud-e2e"
REMOTE_REPO="${{ github.repository_owner }}/cloud"
gh workflow --repo ${REMOTE_REPO} \
run testing.yml \
--ref "main" \
--raw-field "ci_job_name=neon-cloud-e2e" \
--raw-field "commit_hash=$COMMIT_SHA" \
--raw-field "remote_repo=${GITHUB_REPOSITORY}" \
--raw-field "storage_image_tag=${TAG}" \
--raw-field "compute_image_tag=${TAG}" \
--raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \
--raw-field "e2e-platforms=${E2E_PLATFORMS}"
curl -f -X POST \
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"state\": \"pending\",
\"context\": \"neon-cloud-e2e\",
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
}"
curl -f -X POST \
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"ref\": \"main\",
\"inputs\": {
\"ci_job_name\": \"neon-cloud-e2e\",
\"commit_hash\": \"$COMMIT_SHA\",
\"remote_repo\": \"${{ github.repository }}\",
\"storage_image_tag\": \"${TAG}\",
\"compute_image_tag\": \"${TAG}\",
\"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
}
}"

View File

@@ -1,5 +1,5 @@
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
/storage_controller @neondatabase/storage
/control_plane/attachment_service @neondatabase/storage
/libs/pageserver_api/ @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
/libs/remote_storage/ @neondatabase/storage

122
Cargo.lock generated
View File

@@ -270,6 +270,44 @@ dependencies = [
"critical-section",
]
[[package]]
name = "attachment_service"
version = "0.1.0"
dependencies = [
"anyhow",
"aws-config",
"bytes",
"camino",
"clap",
"control_plane",
"diesel",
"diesel_migrations",
"fail",
"futures",
"git-version",
"hex",
"humantime",
"hyper",
"lasso",
"measured",
"metrics",
"once_cell",
"pageserver_api",
"pageserver_client",
"postgres_connection",
"r2d2",
"reqwest",
"routerify",
"serde",
"serde_json",
"thiserror",
"tokio",
"tokio-util",
"tracing",
"utils",
"workspace_hack",
]
[[package]]
name = "autocfg"
version = "1.1.0"
@@ -2196,9 +2234,9 @@ dependencies = [
[[package]]
name = "h2"
version = "0.3.26"
version = "0.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8"
checksum = "bb2c4422095b67ee78da96fbb51a4cc413b3b25883c7717ff7ca1ab31022c9c9"
dependencies = [
"bytes",
"fnv",
@@ -3397,9 +3435,9 @@ dependencies = [
[[package]]
name = "ordered-multimap"
version = "0.7.3"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79"
checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f"
dependencies = [
"dlv-list",
"hashbrown 0.14.0",
@@ -4161,7 +4199,6 @@ name = "proxy"
version = "0.1.0"
dependencies = [
"anyhow",
"async-compression",
"async-trait",
"aws-config",
"aws-sdk-iam",
@@ -5584,65 +5621,6 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "storage_controller"
version = "0.1.0"
dependencies = [
"anyhow",
"aws-config",
"bytes",
"camino",
"clap",
"control_plane",
"diesel",
"diesel_migrations",
"fail",
"futures",
"git-version",
"hex",
"humantime",
"hyper",
"itertools",
"lasso",
"measured",
"metrics",
"once_cell",
"pageserver_api",
"pageserver_client",
"postgres_connection",
"r2d2",
"reqwest",
"routerify",
"serde",
"serde_json",
"thiserror",
"tokio",
"tokio-util",
"tracing",
"utils",
"workspace_hack",
]
[[package]]
name = "storcon_cli"
version = "0.1.0"
dependencies = [
"anyhow",
"clap",
"comfy-table",
"hyper",
"pageserver_api",
"pageserver_client",
"reqwest",
"serde",
"serde_json",
"thiserror",
"tokio",
"tracing",
"utils",
"workspace_hack",
]
[[package]]
name = "stringprep"
version = "0.1.2"
@@ -5799,23 +5777,23 @@ dependencies = [
[[package]]
name = "test-context"
version = "0.3.0"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6676ab8513edfd2601a108621103fdb45cac9098305ca25ec93f7023b06b05d9"
checksum = "055831a02a4f5aa28fede67f2902014273eb8c21b958ac5ebbd59b71ef30dbc3"
dependencies = [
"async-trait",
"futures",
"test-context-macros",
]
[[package]]
name = "test-context-macros"
version = "0.3.0"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ea17a2dc368aeca6f554343ced1b1e31f76d63683fa8016e5844bd7a5144a1"
checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.52",
"syn 1.0.109",
]
[[package]]
@@ -5956,9 +5934,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tokio"
version = "1.37.0"
version = "1.36.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787"
checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931"
dependencies = [
"backtrace",
"bytes",

View File

@@ -3,7 +3,7 @@ resolver = "2"
members = [
"compute_tools",
"control_plane",
"control_plane/storcon_cli",
"control_plane/attachment_service",
"pageserver",
"pageserver/compaction",
"pageserver/ctl",
@@ -12,7 +12,6 @@ members = [
"proxy",
"safekeeper",
"storage_broker",
"storage_controller",
"s3_scrubber",
"workspace_hack",
"trace",
@@ -159,7 +158,7 @@ svg_fmt = "0.4.1"
sync_wrapper = "0.1.2"
tar = "0.4"
task-local-extensions = "0.1.4"
test-context = "0.3"
test-context = "0.1"
thiserror = "1.0"
tikv-jemallocator = "0.5"
tikv-jemalloc-ctl = "0.5"

View File

@@ -944,9 +944,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
# Create remote extension download directory
RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
# Install:
# libreadline8 for psql
# libicu67, locales for collations (including ICU and plpgsql_check)

View File

@@ -1262,12 +1262,10 @@ LIMIT 100",
.await
.map_err(DownloadError::Other);
if download_size.is_ok() {
self.ext_download_progress
.write()
.expect("bad lock")
.insert(ext_archive_name.to_string(), (download_start, true));
}
self.ext_download_progress
.write()
.expect("bad lock")
.insert(ext_archive_name.to_string(), (download_start, true));
download_size
}

View File

@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
RoleAction::Create => {
// This branch only runs when roles are created through the console, so it is
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
// from neon_superuser.
// from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
let mut query: String = format!(
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
name.pg_quote()
);
info!("running role create query: '{}'", &query);
@@ -743,24 +743,21 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
// which may happen in two cases:
// - extension was just installed
// - extension was already installed and is up to date
let query = "ALTER EXTENSION neon UPDATE";
info!("update neon extension version with query: {}", query);
if let Err(e) = client.simple_query(query) {
error!(
"failed to upgrade neon extension during `handle_extension_neon`: {}",
e
);
}
// DISABLED due to compute node unpinning epic
// let query = "ALTER EXTENSION neon UPDATE";
// info!("update neon extension version with query: {}", query);
// client.simple_query(query)?;
Ok(())
}
#[instrument(skip_all)]
pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
info!("handle neon extension upgrade");
let query = "ALTER EXTENSION neon UPDATE";
info!("update neon extension version with query: {}", query);
client.simple_query(query)?;
pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> {
info!("handle neon extension upgrade (not really)");
// DISABLED due to compute node unpinning epic
// let query = "ALTER EXTENSION neon UPDATE";
// info!("update neon extension version with query: {}", query);
// client.simple_query(query)?;
Ok(())
}
@@ -809,8 +806,19 @@ $$;"#,
"",
"",
"",
"",
// Add new migrations below.
r#"
DO $$
DECLARE
role_name TEXT;
BEGIN
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
END LOOP;
END
$$;"#,
];
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";

View File

@@ -1,5 +1,5 @@
[package]
name = "storage_controller"
name = "attachment_service"
version = "0.1.0"
edition.workspace = true
license.workspace = true
@@ -25,7 +25,6 @@ git-version.workspace = true
hex.workspace = true
hyper.workspace = true
humantime.workspace = true
itertools.workspace = true
lasso.workspace = true
once_cell.workspace = true
pageserver_api.workspace = true
@@ -45,8 +44,8 @@ diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
diesel_migrations = { version = "2.1.0" }
r2d2 = { version = "0.8.10" }
utils = { path = "../libs/utils/" }
metrics = { path = "../libs/metrics/" }
control_plane = { path = "../control_plane" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }
utils = { path = "../../libs/utils/" }
metrics = { path = "../../libs/metrics/" }
control_plane = { path = ".." }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -1,4 +1,3 @@
use std::sync::Arc;
use std::{collections::HashMap, time::Duration};
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
@@ -15,30 +14,19 @@ use utils::{
use crate::service::Config;
const BUSY_DELAY: Duration = Duration::from_secs(1);
const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);
pub(crate) const API_CONCURRENCY: usize = 32;
struct UnshardedComputeHookTenant {
// Which node is this tenant attached to
node_id: NodeId,
// Must hold this lock to send a notification.
send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
}
struct ShardedComputeHookTenant {
stripe_size: ShardStripeSize,
shard_count: ShardCount,
shards: Vec<(ShardNumber, NodeId)>,
// Must hold this lock to send a notification. The contents represent
// the last successfully sent notification, and are used to coalesce multiple
// updates by only sending when there is a chance since our last successful send.
send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
}
enum ComputeHookTenant {
Unsharded(UnshardedComputeHookTenant),
Unsharded(NodeId),
Sharded(ShardedComputeHookTenant),
}
@@ -50,20 +38,9 @@ impl ComputeHookTenant {
shards: vec![(tenant_shard_id.shard_number, node_id)],
stripe_size,
shard_count: tenant_shard_id.shard_count,
send_lock: Arc::default(),
})
} else {
Self::Unsharded(UnshardedComputeHookTenant {
node_id,
send_lock: Arc::default(),
})
}
}
fn get_send_lock(&self) -> &Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>> {
match self {
Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock,
Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock,
Self::Unsharded(node_id)
}
}
@@ -76,8 +53,8 @@ impl ComputeHookTenant {
node_id: NodeId,
) {
match self {
Self::Unsharded(unsharded_tenant) if tenant_shard_id.shard_count.count() == 1 => {
unsharded_tenant.node_id = node_id
Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => {
*existing_node_id = node_id
}
Self::Sharded(sharded_tenant)
if sharded_tenant.stripe_size == stripe_size
@@ -104,14 +81,14 @@ impl ComputeHookTenant {
}
}
#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
#[derive(Serialize, Deserialize, Debug)]
struct ComputeHookNotifyRequestShard {
node_id: NodeId,
shard_number: ShardNumber,
}
/// Request body that we send to the control plane to notify it of where a tenant is attached
#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
#[derive(Serialize, Deserialize, Debug)]
struct ComputeHookNotifyRequest {
tenant_id: TenantId,
stripe_size: Option<ShardStripeSize>,
@@ -144,44 +121,14 @@ pub(crate) enum NotifyError {
Fatal(StatusCode),
}
enum MaybeSendResult {
// Please send this request while holding the lock, and if you succeed then write
// the request into the lock.
Transmit(
(
ComputeHookNotifyRequest,
tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>,
),
),
// Something requires sending, but you must wait for a current sender then call again
AwaitLock(Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>),
// Nothing requires sending
Noop,
}
impl ComputeHookTenant {
fn maybe_send(
&self,
tenant_id: TenantId,
lock: Option<tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>>,
) -> MaybeSendResult {
let locked = match lock {
Some(already_locked) => already_locked,
None => {
// Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::state`] lock.
let Ok(locked) = self.get_send_lock().clone().try_lock_owned() else {
return MaybeSendResult::AwaitLock(self.get_send_lock().clone());
};
locked
}
};
let request = match self {
Self::Unsharded(unsharded_tenant) => Some(ComputeHookNotifyRequest {
fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
match self {
Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest {
tenant_id,
shards: vec![ComputeHookNotifyRequestShard {
shard_number: ShardNumber(0),
node_id: unsharded_tenant.node_id,
node_id: *node_id,
}],
stripe_size: None,
}),
@@ -205,25 +152,12 @@ impl ComputeHookTenant {
// Sharded tenant doesn't yet have information for all its shards
tracing::info!(
"ComputeHookTenant::maybe_send: not enough shards ({}/{})",
"ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
sharded_tenant.shards.len(),
sharded_tenant.shard_count.count()
);
None
}
};
match request {
None => {
// Not yet ready to emit a notification
tracing::info!("Tenant isn't yet ready to emit a notification");
MaybeSendResult::Noop
}
Some(request) if Some(&request) == locked.as_ref() => {
// No change from the last value successfully sent
MaybeSendResult::Noop
}
Some(request) => MaybeSendResult::Transmit((request, locked)),
}
}
}
@@ -233,15 +167,8 @@ impl ComputeHookTenant {
/// the compute connection string.
pub(super) struct ComputeHook {
config: Config,
state: std::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
authorization_header: Option<String>,
// Concurrency limiter, so that we do not overload the cloud control plane when updating
// large numbers of tenants (e.g. when failing over after a node failure)
api_concurrency: tokio::sync::Semaphore,
// This lock is only used in testing enviroments, to serialize calls into neon_lock
neon_local_lock: tokio::sync::Mutex<()>,
}
impl ComputeHook {
@@ -255,20 +182,14 @@ impl ComputeHook {
state: Default::default(),
config,
authorization_header,
neon_local_lock: Default::default(),
api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY),
}
}
/// For test environments: use neon_local's LocalEnv to update compute
async fn do_notify_local(
&self,
reconfigure_request: &ComputeHookNotifyRequest,
reconfigure_request: ComputeHookNotifyRequest,
) -> anyhow::Result<()> {
// neon_local updates are not safe to call concurrently, use a lock to serialize
// all calls to this function
let _locked = self.neon_local_lock.lock().await;
let env = match LocalEnv::load_config() {
Ok(e) => e,
Err(e) => {
@@ -285,7 +206,7 @@ impl ComputeHook {
} = reconfigure_request;
let compute_pageservers = shards
.iter()
.into_iter()
.map(|shard| {
let ps_conf = env
.get_pageserver_conf(shard.node_id)
@@ -297,10 +218,10 @@ impl ComputeHook {
.collect::<Vec<_>>();
for (endpoint_name, endpoint) in &cplane.endpoints {
if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running {
if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
endpoint
.reconfigure(compute_pageservers.clone(), *stripe_size)
.reconfigure(compute_pageservers.clone(), stripe_size)
.await?;
}
}
@@ -359,10 +280,11 @@ impl ComputeHook {
Err(NotifyError::SlowDown)
}
StatusCode::LOCKED => {
// We consider this fatal, because it's possible that the operation blocking the control one is
// also the one that is waiting for this reconcile. We should let the reconciler calling
// this hook fail, to give control plane a chance to un-lock.
tracing::info!("Control plane reports tenant is locked, dropping out of notify");
// Delay our retry if busy: the usual fast exponential backoff in backoff::retry
// is not appropriate
tokio::time::timeout(BUSY_DELAY, cancel.cancelled())
.await
.ok();
Err(NotifyError::Busy)
}
StatusCode::SERVICE_UNAVAILABLE
@@ -378,29 +300,13 @@ impl ComputeHook {
async fn do_notify(
&self,
url: &String,
reconfigure_request: &ComputeHookNotifyRequest,
reconfigure_request: ComputeHookNotifyRequest,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
let client = reqwest::Client::new();
// We hold these semaphore units across all retries, rather than only across each
// HTTP request: this is to preserve fairness and avoid a situation where a retry might
// time out waiting for a semaphore.
let _units = self
.api_concurrency
.acquire()
.await
// Interpret closed semaphore as shutdown
.map_err(|_| NotifyError::ShuttingDown)?;
backoff::retry(
|| self.do_notify_iteration(&client, url, reconfigure_request, cancel),
|e| {
matches!(
e,
NotifyError::Fatal(_) | NotifyError::Unexpected(_) | NotifyError::Busy
)
},
|| self.do_notify_iteration(&client, url, &reconfigure_request, cancel),
|e| matches!(e, NotifyError::Fatal(_) | NotifyError::Unexpected(_)),
3,
10,
"Send compute notification",
@@ -434,70 +340,42 @@ impl ComputeHook {
stripe_size: ShardStripeSize,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
let maybe_send_result = {
let mut state_locked = self.state.lock().unwrap();
let mut locked = self.state.lock().await;
use std::collections::hash_map::Entry;
let tenant = match state_locked.entry(tenant_shard_id.tenant_id) {
Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
tenant_shard_id,
stripe_size,
node_id,
)),
Entry::Occupied(e) => {
let tenant = e.into_mut();
tenant.update(tenant_shard_id, stripe_size, node_id);
tenant
}
};
tenant.maybe_send(tenant_shard_id.tenant_id, None)
use std::collections::hash_map::Entry;
let tenant = match locked.entry(tenant_shard_id.tenant_id) {
Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
tenant_shard_id,
stripe_size,
node_id,
)),
Entry::Occupied(e) => {
let tenant = e.into_mut();
tenant.update(tenant_shard_id, stripe_size, node_id);
tenant
}
};
// Process result: we may get an update to send, or we may have to wait for a lock
// before trying again.
let (request, mut send_lock_guard) = match maybe_send_result {
MaybeSendResult::Noop => {
return Ok(());
}
MaybeSendResult::AwaitLock(send_lock) => {
let send_locked = send_lock.lock_owned().await;
// Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here
// we have acquired the send lock and take `[Self::state]` lock. This is safe because maybe_send only uses
// try_lock.
let state_locked = self.state.lock().unwrap();
let Some(tenant) = state_locked.get(&tenant_shard_id.tenant_id) else {
return Ok(());
};
match tenant.maybe_send(tenant_shard_id.tenant_id, Some(send_locked)) {
MaybeSendResult::AwaitLock(_) => {
unreachable!("We supplied lock guard")
}
MaybeSendResult::Noop => {
return Ok(());
}
MaybeSendResult::Transmit((request, lock)) => (request, lock),
}
}
MaybeSendResult::Transmit((request, lock)) => (request, lock),
let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id);
let Some(reconfigure_request) = reconfigure_request else {
// The tenant doesn't yet have pageservers for all its shards: we won't notify anything
// until it does.
tracing::info!("Tenant isn't yet ready to emit a notification");
return Ok(());
};
let result = if let Some(notify_url) = &self.config.compute_hook_url {
self.do_notify(notify_url, &request, cancel).await
if let Some(notify_url) = &self.config.compute_hook_url {
self.do_notify(notify_url, reconfigure_request, cancel)
.await
} else {
self.do_notify_local(&request).await.map_err(|e| {
// This path is for testing only, so munge the error into our prod-style error type.
tracing::error!("Local notification hook failed: {e}");
NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
})
};
if result.is_ok() {
// Before dropping the send lock, stash the request we just sent so that
// subsequent callers can avoid redundantly re-sending the same thing.
*send_lock_guard = Some(request);
self.do_notify_local(reconfigure_request)
.await
.map_err(|e| {
// This path is for testing only, so munge the error into our prod-style error type.
tracing::error!("Local notification hook failed: {e}");
NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
})
}
result
}
}
@@ -521,22 +399,21 @@ pub(crate) mod tests {
NodeId(1),
);
// An unsharded tenant is always ready to emit a notification, but won't
// send the same one twice
let send_result = tenant_state.maybe_send(tenant_id, None);
let MaybeSendResult::Transmit((request, mut guard)) = send_result else {
anyhow::bail!("Wrong send result");
};
assert_eq!(request.shards.len(), 1);
assert!(request.stripe_size.is_none());
// Simulate successful send
*guard = Some(request);
drop(guard);
// Try asking again: this should be a no-op
let send_result = tenant_state.maybe_send(tenant_id, None);
assert!(matches!(send_result, MaybeSendResult::Noop));
// An unsharded tenant is always ready to emit a notification
assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
assert_eq!(
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.shards
.len(),
1
);
assert!(tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.stripe_size
.is_none());
// Writing the first shard of a multi-sharded situation (i.e. in a split)
// resets the tenant state and puts it in an non-notifying state (need to
@@ -550,10 +427,7 @@ pub(crate) mod tests {
ShardStripeSize(32768),
NodeId(1),
);
assert!(matches!(
tenant_state.maybe_send(tenant_id, None),
MaybeSendResult::Noop
));
assert!(tenant_state.maybe_reconfigure(tenant_id).is_none());
// Writing the second shard makes it ready to notify
tenant_state.update(
@@ -566,16 +440,22 @@ pub(crate) mod tests {
NodeId(1),
);
let send_result = tenant_state.maybe_send(tenant_id, None);
let MaybeSendResult::Transmit((request, mut guard)) = send_result else {
anyhow::bail!("Wrong send result");
};
assert_eq!(request.shards.len(), 2);
assert_eq!(request.stripe_size, Some(ShardStripeSize(32768)));
// Simulate successful send
*guard = Some(request);
drop(guard);
assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
assert_eq!(
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.shards
.len(),
2
);
assert_eq!(
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.stripe_size,
Some(ShardStripeSize(32768))
);
Ok(())
}

View File

@@ -34,8 +34,7 @@ use utils::{
};
use pageserver_api::controller_api::{
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest,
TenantShardMigrateRequest,
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
};
use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
@@ -399,15 +398,6 @@ async fn handle_tenant_describe(
json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
}
async fn handle_tenant_list(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
json_response(StatusCode::OK, service.tenant_list())
}
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
@@ -421,10 +411,7 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
let nodes = state.service.node_list().await?;
let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();
json_response(StatusCode::OK, api_nodes)
json_response(StatusCode::OK, state.service.node_list().await?)
}
async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -491,22 +478,6 @@ async fn handle_tenant_shard_migrate(
)
}
async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let update_req = json_request::<TenantPolicyRequest>(&mut req).await?;
let state = get_state(&req);
json_response(
StatusCode::OK,
state
.service
.tenant_update_policy(tenant_id, update_req)
.await?,
)
}
async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
@@ -538,14 +509,6 @@ async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>,
json_response(StatusCode::OK, state.service.consistency_check().await?)
}
async fn handle_reconcile_all(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.reconcile_all_now().await?)
}
/// Status endpoint is just used for checking that our HTTP listener is up
async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
json_response(StatusCode::OK, ())
@@ -602,17 +565,9 @@ where
.await
}
/// Check if the required scope is held in the request's token, or if the request has
/// a token with 'admin' scope then always permit it.
fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> {
check_permission_with(request, |claims| {
match crate::auth::check_permission(claims, required_scope) {
Err(e) => match crate::auth::check_permission(claims, Scope::Admin) {
Ok(()) => Ok(()),
Err(_) => Err(e),
},
Ok(()) => Ok(()),
}
crate::auth::check_permission(claims, required_scope)
})
}
@@ -771,9 +726,6 @@ pub fn make_router(
RequestName("debug_v1_consistency_check"),
)
})
.post("/debug/v1/reconcile_all", |r| {
request_span(r, handle_reconcile_all)
})
.put("/debug/v1/failpoints", |r| {
request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
})
@@ -813,16 +765,6 @@ pub fn make_router(
RequestName("control_v1_tenant_describe"),
)
})
.get("/control/v1/tenant", |r| {
tenant_service_handler(r, handle_tenant_list, RequestName("control_v1_tenant_list"))
})
.put("/control/v1/tenant/:tenant_id/policy", |r| {
named_request_span(
r,
handle_tenant_update_policy,
RequestName("control_v1_tenant_policy"),
)
})
// Tenant operations
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.

View File

@@ -14,7 +14,7 @@ mod reconciler;
mod scheduler;
mod schema;
pub mod service;
mod tenant_shard;
mod tenant_state;
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
struct Sequence(u64);

View File

@@ -1,19 +1,18 @@
use anyhow::{anyhow, Context};
use attachment_service::http::make_router;
use attachment_service::metrics::preinitialize_metrics;
use attachment_service::persistence::Persistence;
use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
use camino::Utf8PathBuf;
use clap::Parser;
use diesel::Connection;
use metrics::launch_timestamp::LaunchTimestamp;
use std::sync::Arc;
use storage_controller::http::make_router;
use storage_controller::metrics::preinitialize_metrics;
use storage_controller::persistence::Persistence;
use storage_controller::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
use tokio::signal::unix::SignalKind;
use tokio_util::sync::CancellationToken;
use utils::auth::{JwtAuth, SwappableJwtAuth};
use utils::logging::{self, LogFormat};
use utils::sentry_init::init_sentry;
use utils::{project_build_tag, project_git_version, tcp_listener};
project_git_version!(GIT_VERSION);
@@ -51,7 +50,7 @@ struct Cli {
#[arg(short, long)]
path: Option<Utf8PathBuf>,
/// URL to connect to postgres, like postgresql://localhost:1234/storage_controller
/// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
#[arg(long)]
database_url: Option<String>,
@@ -159,8 +158,6 @@ fn main() -> anyhow::Result<()> {
std::process::exit(1);
}));
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
tokio::runtime::Builder::new_current_thread()
// We use spawn_blocking for database operations, so require approximately
// as many blocking threads as we will open database connections.

View File

@@ -37,9 +37,6 @@ pub(crate) struct StorageControllerMetricGroup {
pub(crate) storage_controller_reconcile_complete:
measured::CounterVec<ReconcileCompleteLabelGroupSet>,
/// Count of how many times we make an optimization change to a tenant's scheduling
pub(crate) storage_controller_schedule_optimization: measured::Counter,
/// HTTP request status counters for handled requests
pub(crate) storage_controller_http_request_status:
measured::CounterVec<HttpRequestStatusLabelGroupSet>,
@@ -104,7 +101,6 @@ impl StorageControllerMetricGroup {
status: StaticLabelSet::new(),
},
),
storage_controller_schedule_optimization: measured::Counter::new(),
storage_controller_http_request_status: measured::CounterVec::new(
HttpRequestStatusLabelGroupSet {
path: lasso::ThreadedRodeo::new(),

View File

@@ -3,8 +3,7 @@ use std::{str::FromStr, time::Duration};
use hyper::StatusCode;
use pageserver_api::{
controller_api::{
NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy,
TenantLocateResponseShard,
NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard,
},
shard::TenantShardId,
};
@@ -257,19 +256,6 @@ impl Node {
)
.await
}
/// Generate the simplified API-friendly description of a node's state
pub(crate) fn describe(&self) -> NodeDescribeResponse {
NodeDescribeResponse {
id: self.id,
availability: self.availability.into(),
scheduling: self.scheduling,
listen_http_addr: self.listen_http_addr.clone(),
listen_http_port: self.listen_http_port,
listen_pg_addr: self.listen_pg_addr.clone(),
listen_pg_port: self.listen_pg_port,
}
}
}
impl std::fmt::Display for Node {

View File

@@ -9,7 +9,6 @@ use camino::Utf8PathBuf;
use diesel::pg::PgConnection;
use diesel::prelude::*;
use diesel::Connection;
use pageserver_api::controller_api::ShardSchedulingPolicy;
use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
use pageserver_api::models::TenantConfig;
use pageserver_api::shard::ShardConfigError;
@@ -108,12 +107,6 @@ pub(crate) enum AbortShardSplitStatus {
pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
/// Some methods can operate on either a whole tenant or a single shard
pub(crate) enum TenantFilter {
Tenant(TenantId),
Shard(TenantShardId),
}
impl Persistence {
// The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under
// normal circumstances. This assumes we have exclusive use of the database cluster to which we connect.
@@ -147,7 +140,7 @@ impl Persistence {
/// Wraps `with_conn` in order to collect latency and error metrics
async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
where
F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
R: Send + 'static,
{
let latency = &METRICS_REGISTRY
@@ -175,7 +168,7 @@ impl Persistence {
/// Call the provided function in a tokio blocking thread, with a Diesel database connection.
async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
where
F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
R: Send + 'static,
{
let mut conn = self.connection_pool.get()?;
@@ -282,11 +275,6 @@ impl Persistence {
// Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
shard.placement_policy = "{\"Attached\":0}".to_string();
}
if shard.scheduling_policy.is_empty() {
shard.scheduling_policy =
serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap();
}
}
let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect();
@@ -477,45 +465,59 @@ impl Persistence {
/// that we only do the first time a tenant is set to an attached policy via /location_config.
pub(crate) async fn update_tenant_shard(
&self,
tenant: TenantFilter,
input_placement_policy: Option<PlacementPolicy>,
input_config: Option<TenantConfig>,
tenant_shard_id: TenantShardId,
input_placement_policy: PlacementPolicy,
input_config: TenantConfig,
input_generation: Option<Generation>,
input_scheduling_policy: Option<ShardSchedulingPolicy>,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| {
let query = match tenant {
TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
.into_boxed(),
TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards)
.filter(tenant_id.eq(input_tenant_id.to_string()))
.into_boxed(),
};
let query = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));
#[derive(AsChangeset)]
#[diesel(table_name = crate::schema::tenant_shards)]
struct ShardUpdate {
generation: Option<i32>,
placement_policy: Option<String>,
config: Option<String>,
scheduling_policy: Option<String>,
if let Some(input_generation) = input_generation {
// Update includes generation column
query
.set((
generation.eq(Some(input_generation.into().unwrap() as i32)),
placement_policy
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
config.eq(serde_json::to_string(&input_config).unwrap()),
))
.execute(conn)?;
} else {
// Update does not include generation column
query
.set((
placement_policy
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
config.eq(serde_json::to_string(&input_config).unwrap()),
))
.execute(conn)?;
}
let update = ShardUpdate {
generation: input_generation.map(|g| g.into().unwrap() as i32),
placement_policy: input_placement_policy
.map(|p| serde_json::to_string(&p).unwrap()),
config: input_config.map(|c| serde_json::to_string(&c).unwrap()),
scheduling_policy: input_scheduling_policy
.map(|p| serde_json::to_string(&p).unwrap()),
};
Ok(())
})
.await?;
query.set(update).execute(conn)?;
Ok(())
}
pub(crate) async fn update_tenant_config(
&self,
input_tenant_id: TenantId,
input_config: TenantConfig,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(DatabaseOperation::UpdateTenantConfig, move |conn| {
diesel::update(tenant_shards)
.filter(tenant_id.eq(input_tenant_id.to_string()))
.set((config.eq(serde_json::to_string(&input_config).unwrap()),))
.execute(conn)?;
Ok(())
})
@@ -696,7 +698,7 @@ impl Persistence {
}
}
/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
#[diesel(table_name = crate::schema::tenant_shards)]
pub(crate) struct TenantShardPersistence {
@@ -726,8 +728,6 @@ pub(crate) struct TenantShardPersistence {
pub(crate) splitting: SplitState,
#[serde(default)]
pub(crate) config: String,
#[serde(default)]
pub(crate) scheduling_policy: String,
}
impl TenantShardPersistence {

View File

@@ -18,14 +18,14 @@ use utils::sync::gate::GateGuard;
use crate::compute_hook::{ComputeHook, NotifyError};
use crate::node::Node;
use crate::tenant_shard::{IntentState, ObservedState, ObservedStateLocation};
use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};
const DEFAULT_HEATMAP_PERIOD: &str = "60s";
/// Object with the lifetime of the background reconcile task that is created
/// for tenants which have a difference between their intent and observed states.
pub(super) struct Reconciler {
/// See [`crate::tenant_shard::TenantShard`] for the meanings of these fields: they are a snapshot
/// See [`crate::tenant_state::TenantState`] for the meanings of these fields: they are a snapshot
/// of a tenant's state from when we spawned a reconcile task.
pub(super) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity,
@@ -48,11 +48,11 @@ pub(super) struct Reconciler {
/// To avoid stalling if the cloud control plane is unavailable, we may proceed
/// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed
/// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry.
/// so that we can set [`crate::tenant_state::TenantState::pending_compute_notification`] to ensure a later retry.
pub(crate) compute_notify_failure: bool,
/// A means to abort background reconciliation: it is essential to
/// call this when something changes in the original TenantShard that
/// call this when something changes in the original TenantState that
/// will make this reconciliation impossible or unnecessary, for
/// example when a pageserver node goes offline, or the PlacementPolicy for
/// the tenant is changed.
@@ -66,7 +66,7 @@ pub(super) struct Reconciler {
pub(crate) persistence: Arc<Persistence>,
}
/// This is a snapshot of [`crate::tenant_shard::IntentState`], but it does not do any
/// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any
/// reference counting for Scheduler. The IntentState is what the scheduler works with,
/// and the TargetState is just the instruction for a particular Reconciler run.
#[derive(Debug)]
@@ -487,7 +487,6 @@ impl Reconciler {
while let Err(e) = self.compute_notify().await {
match e {
NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)),
NotifyError::ShuttingDown => return Err(ReconcileError::Cancel),
_ => {
tracing::warn!(
"Live migration blocked by compute notification error, retrying: {e}"

View File

@@ -1,4 +1,4 @@
use crate::{node::Node, tenant_shard::TenantShard};
use crate::{node::Node, tenant_state::TenantState};
use pageserver_api::controller_api::UtilizationScore;
use serde::Serialize;
use std::collections::HashMap;
@@ -27,7 +27,7 @@ pub enum MaySchedule {
#[derive(Serialize)]
struct SchedulerNode {
/// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`].
/// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
shard_count: usize,
/// Whether this node is currently elegible to have new shards scheduled (this is derived
@@ -58,70 +58,6 @@ pub(crate) struct Scheduler {
nodes: HashMap<NodeId, SchedulerNode>,
}
/// Score for soft constraint scheduling: lower scores are preferred to higher scores.
///
/// For example, we may set an affinity score based on the number of shards from the same
/// tenant already on a node, to implicitly prefer to balance out shards.
#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
pub(crate) struct AffinityScore(pub(crate) usize);
impl AffinityScore {
/// If we have no anti-affinity at all toward a node, this is its score. It means
/// the scheduler has a free choice amongst nodes with this score, and may pick a node
/// based on other information such as total utilization.
pub(crate) const FREE: Self = Self(0);
pub(crate) fn inc(&mut self) {
self.0 += 1;
}
}
impl std::ops::Add for AffinityScore {
type Output = Self;
fn add(self, rhs: Self) -> Self::Output {
Self(self.0 + rhs.0)
}
}
// For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling
// it for many shards in the same tenant.
#[derive(Debug, Default)]
pub(crate) struct ScheduleContext {
/// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`]
pub(crate) nodes: HashMap<NodeId, AffinityScore>,
/// Specifically how many _attached_ locations are on each node
pub(crate) attached_nodes: HashMap<NodeId, usize>,
}
impl ScheduleContext {
/// Input is a list of nodes we would like to avoid using again within this context. The more
/// times a node is passed into this call, the less inclined we are to use it.
pub(crate) fn avoid(&mut self, nodes: &[NodeId]) {
for node_id in nodes {
let entry = self.nodes.entry(*node_id).or_insert(AffinityScore::FREE);
entry.inc()
}
}
pub(crate) fn push_attached(&mut self, node_id: NodeId) {
let entry = self.attached_nodes.entry(node_id).or_default();
*entry += 1;
}
pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore {
self.nodes
.get(&node_id)
.copied()
.unwrap_or(AffinityScore::FREE)
}
pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize {
self.attached_nodes.get(&node_id).copied().unwrap_or(0)
}
}
impl Scheduler {
pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
let mut scheduler_nodes = HashMap::new();
@@ -147,7 +83,7 @@ impl Scheduler {
pub(crate) fn consistency_check<'a>(
&self,
nodes: impl Iterator<Item = &'a Node>,
shards: impl Iterator<Item = &'a TenantShard>,
shards: impl Iterator<Item = &'a TenantState>,
) -> anyhow::Result<()> {
let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
for node in nodes {
@@ -288,47 +224,27 @@ impl Scheduler {
node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
}
/// hard_exclude: it is forbidden to use nodes in this list, typically becacuse they
/// are already in use by this shard -- we use this to avoid picking the same node
/// as both attached and secondary location. This is a hard constraint: if we cannot
/// find any nodes that aren't in this list, then we will return a [`ScheduleError::ImpossibleConstraint`].
///
/// context: we prefer to avoid using nodes identified in the context, according
/// to their anti-affinity score. We use this to prefeer to avoid placing shards in
/// the same tenant on the same node. This is a soft constraint: the context will never
/// cause us to fail to schedule a shard.
pub(crate) fn schedule_shard(
&self,
hard_exclude: &[NodeId],
context: &ScheduleContext,
) -> Result<NodeId, ScheduleError> {
pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result<NodeId, ScheduleError> {
if self.nodes.is_empty() {
return Err(ScheduleError::NoPageservers);
}
let mut scores: Vec<(NodeId, AffinityScore, usize)> = self
let mut tenant_counts: Vec<(NodeId, usize)> = self
.nodes
.iter()
.filter_map(|(k, v)| {
if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No {
None
} else {
Some((
*k,
context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
v.shard_count,
))
Some((*k, v.shard_count))
}
})
.collect();
// Sort by, in order of precedence:
// 1st: Affinity score. We should never pick a higher-score node if a lower-score node is available
// 2nd: Utilization. Within nodes with the same affinity, use the least loaded nodes.
// 3rd: Node ID. This is a convenience to make selection deterministic in tests and empty systems.
scores.sort_by_key(|i| (i.1, i.2, i.0));
// Sort by tenant count. Nodes with the same tenant count are sorted by ID.
tenant_counts.sort_by_key(|i| (i.1, i.0));
if scores.is_empty() {
if tenant_counts.is_empty() {
// After applying constraints, no pageservers were left. We log some detail about
// the state of nodes to help understand why this happened. This is not logged as an error because
// it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard.
@@ -344,11 +260,10 @@ impl Scheduler {
return Err(ScheduleError::ImpossibleConstraint);
}
// Lowest score wins
let node_id = scores.first().unwrap().0;
let node_id = tenant_counts.first().unwrap().0;
tracing::info!(
"scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
scores.iter().map(|i| i.0 .0).collect::<Vec<_>>()
"scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
);
// Note that we do not update shard count here to reflect the scheduling: that
@@ -356,12 +271,6 @@ impl Scheduler {
Ok(node_id)
}
/// Unit test access to internal state
#[cfg(test)]
pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
self.nodes.get(&node_id).unwrap().shard_count
}
}
#[cfg(test)]
@@ -398,7 +307,7 @@ pub(crate) mod test_utils {
mod tests {
use super::*;
use crate::tenant_shard::IntentState;
use crate::tenant_state::IntentState;
#[test]
fn scheduler_basic() -> anyhow::Result<()> {
let nodes = test_utils::make_test_nodes(2);
@@ -407,17 +316,15 @@ mod tests {
let mut t1_intent = IntentState::new();
let mut t2_intent = IntentState::new();
let context = ScheduleContext::default();
let scheduled = scheduler.schedule_shard(&[], &context)?;
let scheduled = scheduler.schedule_shard(&[])?;
t1_intent.set_attached(&mut scheduler, Some(scheduled));
let scheduled = scheduler.schedule_shard(&[], &context)?;
let scheduled = scheduler.schedule_shard(&[])?;
t2_intent.set_attached(&mut scheduler, Some(scheduled));
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?;
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?;
t1_intent.push_secondary(&mut scheduler, scheduled);
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);

View File

@@ -22,7 +22,6 @@ diesel::table! {
placement_policy -> Varchar,
splitting -> Int2,
config -> Text,
scheduling_policy -> Varchar,
}
}

View File

@@ -8,10 +8,7 @@ use std::{
};
use crate::{
id_lock_map::IdLockMap,
persistence::{AbortShardSplitStatus, TenantFilter},
reconciler::ReconcileError,
scheduler::ScheduleContext,
id_lock_map::IdLockMap, persistence::AbortShardSplitStatus, reconciler::ReconcileError,
};
use anyhow::Context;
use control_plane::storage_controller::{
@@ -20,14 +17,12 @@ use control_plane::storage_controller::{
use diesel::result::DatabaseErrorKind;
use futures::{stream::FuturesUnordered, StreamExt};
use hyper::StatusCode;
use itertools::Itertools;
use pageserver_api::{
controller_api::{
NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
ShardSchedulingPolicy, TenantCreateResponse, TenantCreateResponseShard,
TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse,
TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse,
UtilizationScore,
TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
TenantDescribeResponseShard, TenantLocateResponse, TenantShardMigrateRequest,
TenantShardMigrateResponse, UtilizationScore,
},
models::{SecondaryProgress, TenantConfigRequest},
};
@@ -56,6 +51,7 @@ use utils::{
generation::Generation,
http::error::ApiError,
id::{NodeId, TenantId, TimelineId},
seqwait::SeqWait,
sync::gate::Gate,
};
@@ -66,10 +62,11 @@ use crate::{
persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence},
reconciler::attached_location_conf,
scheduler::Scheduler,
tenant_shard::{
tenant_state::{
IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
ReconcilerWaiter, TenantShard,
ReconcilerWaiter, TenantState,
},
Sequence,
};
// For operations that should be quick, like attaching a new tenant
@@ -92,7 +89,7 @@ pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30);
// Top level state available to all HTTP handlers
struct ServiceState {
tenants: BTreeMap<TenantShardId, TenantShard>,
tenants: BTreeMap<TenantShardId, TenantState>,
nodes: Arc<HashMap<NodeId, Node>>,
@@ -102,7 +99,7 @@ struct ServiceState {
impl ServiceState {
fn new(
nodes: HashMap<NodeId, Node>,
tenants: BTreeMap<TenantShardId, TenantShard>,
tenants: BTreeMap<TenantShardId, TenantState>,
scheduler: Scheduler,
) -> Self {
Self {
@@ -116,7 +113,7 @@ impl ServiceState {
&mut self,
) -> (
&mut Arc<HashMap<NodeId, Node>>,
&mut BTreeMap<TenantShardId, TenantShard>,
&mut BTreeMap<TenantShardId, TenantState>,
&mut Scheduler,
) {
(&mut self.nodes, &mut self.tenants, &mut self.scheduler)
@@ -335,11 +332,11 @@ impl Service {
for (tenant_shard_id, shard_observations) in observed {
for (node_id, observed_loc) in shard_observations {
let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else {
let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else {
cleanup.push((tenant_shard_id, node_id));
continue;
};
tenant_shard
tenant_state
.observed
.locations
.insert(node_id, ObservedStateLocation { conf: observed_loc });
@@ -347,15 +344,9 @@ impl Service {
}
// Populate each tenant's intent state
let mut schedule_context = ScheduleContext::default();
for (tenant_shard_id, tenant_shard) in tenants.iter_mut() {
if tenant_shard_id.shard_number == ShardNumber(0) {
// Reset scheduling context each time we advance to the next Tenant
schedule_context = ScheduleContext::default();
}
tenant_shard.intent_from_observed(scheduler);
if let Err(e) = tenant_shard.schedule(scheduler, &mut schedule_context) {
for (tenant_shard_id, tenant_state) in tenants.iter_mut() {
tenant_state.intent_from_observed(scheduler);
if let Err(e) = tenant_state.schedule(scheduler) {
// Non-fatal error: we are unable to properly schedule the tenant, perhaps because
// not enough pageservers are available. The tenant may well still be available
// to clients.
@@ -364,11 +355,11 @@ impl Service {
// If we're both intending and observed to be attached at a particular node, we will
// emit a compute notification for this. In the case where our observed state does not
// yet match our intent, we will eventually reconcile, and that will emit a compute notification.
if let Some(attached_at) = tenant_shard.stably_attached() {
if let Some(attached_at) = tenant_state.stably_attached() {
compute_notifications.push((
*tenant_shard_id,
attached_at,
tenant_shard.shard.stripe_size,
tenant_state.shard.stripe_size,
));
}
}
@@ -679,13 +670,7 @@ impl Service {
let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD);
while !self.cancel.is_cancelled() {
tokio::select! {
_ = interval.tick() => {
let reconciles_spawned = self.reconcile_all();
if reconciles_spawned == 0 {
// Run optimizer only when we didn't find any other work to do
self.optimize_all();
}
}
_ = interval.tick() => { self.reconcile_all(); }
_ = self.cancel.cancelled() => return
}
}
@@ -743,7 +728,7 @@ impl Service {
/// Apply the contents of a [`ReconcileResult`] to our in-memory state: if the reconciliation
/// was successful, this will update the observed state of the tenant such that subsequent
/// calls to [`TenantShard::maybe_reconcile`] will do nothing.
/// calls to [`TenantState::maybe_reconcile`] will do nothing.
#[instrument(skip_all, fields(
tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(),
sequence=%result.sequence
@@ -761,10 +746,10 @@ impl Service {
tenant.generation = std::cmp::max(tenant.generation, result.generation);
// If the reconciler signals that it failed to notify compute, set this state on
// the shard so that a future [`TenantShard::maybe_reconcile`] will try again.
// the shard so that a future [`TenantState::maybe_reconcile`] will try again.
tenant.pending_compute_notification = result.pending_compute_notification;
// Let the TenantShard know it is idle.
// Let the TenantState know it is idle.
tenant.reconcile_complete(result.sequence);
match result.result {
@@ -972,14 +957,30 @@ impl Service {
}
for tsp in tenant_shard_persistence {
let tenant_shard_id = tsp.get_tenant_shard_id()?;
let shard_identity = tsp.get_shard_identity()?;
// We will populate intent properly later in [`Self::startup_reconcile`], initially populate
// it with what we can infer: the node for which a generation was most recently issued.
let mut intent = IntentState::new();
if let Some(generation_pageserver) = tsp.generation_pageserver {
intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64)));
}
let new_tenant = TenantShard::from_persistent(tsp, intent)?;
let new_tenant = TenantState {
tenant_shard_id,
shard: shard_identity,
sequence: Sequence::initial(),
generation: tsp.generation.map(|g| Generation::new(g as u32)),
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
intent,
observed: ObservedState::new(),
config: serde_json::from_str(&tsp.config).unwrap(),
reconciler: None,
splitting: tsp.splitting,
waiter: Arc::new(SeqWait::new(Sequence::initial())),
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
last_error: Arc::default(),
pending_compute_notification: false,
};
tenants.insert(tenant_shard_id, new_tenant);
}
@@ -1103,8 +1104,6 @@ impl Service {
placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(),
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
splitting: SplitState::default(),
scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
.unwrap(),
};
match self.persistence.insert_tenant_shards(vec![tsp]).await {
@@ -1126,7 +1125,7 @@ impl Service {
let mut locked = self.inner.write().unwrap();
locked.tenants.insert(
attach_req.tenant_shard_id,
TenantShard::new(
TenantState::new(
attach_req.tenant_shard_id,
ShardIdentity::unsharded(),
PlacementPolicy::Attached(0),
@@ -1157,10 +1156,9 @@ impl Service {
// when we reattaching a detached tenant.
self.persistence
.update_tenant_shard(
TenantFilter::Shard(attach_req.tenant_shard_id),
Some(PlacementPolicy::Attached(0)),
Some(conf),
None,
attach_req.tenant_shard_id,
PlacementPolicy::Attached(0),
conf,
None,
)
.await?;
@@ -1178,32 +1176,32 @@ impl Service {
let mut locked = self.inner.write().unwrap();
let (_nodes, tenants, scheduler) = locked.parts_mut();
let tenant_shard = tenants
let tenant_state = tenants
.get_mut(&attach_req.tenant_shard_id)
.expect("Checked for existence above");
if let Some(new_generation) = new_generation {
tenant_shard.generation = Some(new_generation);
tenant_shard.policy = PlacementPolicy::Attached(0);
tenant_state.generation = Some(new_generation);
tenant_state.policy = PlacementPolicy::Attached(0);
} else {
// This is a detach notification. We must update placement policy to avoid re-attaching
// during background scheduling/reconciliation, or during storage controller restart.
assert!(attach_req.node_id.is_none());
tenant_shard.policy = PlacementPolicy::Detached;
tenant_state.policy = PlacementPolicy::Detached;
}
if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
tracing::info!(
tenant_id = %attach_req.tenant_shard_id,
ps_id = %attaching_pageserver,
generation = ?tenant_shard.generation,
generation = ?tenant_state.generation,
"issuing",
);
} else if let Some(ps_id) = tenant_shard.intent.get_attached() {
} else if let Some(ps_id) = tenant_state.intent.get_attached() {
tracing::info!(
tenant_id = %attach_req.tenant_shard_id,
%ps_id,
generation = ?tenant_shard.generation,
generation = ?tenant_state.generation,
"dropping",
);
} else {
@@ -1211,14 +1209,14 @@ impl Service {
tenant_id = %attach_req.tenant_shard_id,
"no-op: tenant already has no pageserver");
}
tenant_shard
tenant_state
.intent
.set_attached(scheduler, attach_req.node_id);
tracing::info!(
"attach_hook: tenant {} set generation {:?}, pageserver {}",
attach_req.tenant_shard_id,
tenant_shard.generation,
tenant_state.generation,
// TODO: this is an odd number of 0xf's
attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
);
@@ -1230,36 +1228,36 @@ impl Service {
#[cfg(feature = "testing")]
{
if let Some(node_id) = attach_req.node_id {
tenant_shard.observed.locations = HashMap::from([(
tenant_state.observed.locations = HashMap::from([(
node_id,
ObservedStateLocation {
conf: Some(attached_location_conf(
tenant_shard.generation.unwrap(),
&tenant_shard.shard,
&tenant_shard.config,
tenant_state.generation.unwrap(),
&tenant_state.shard,
&tenant_state.config,
false,
)),
},
)]);
} else {
tenant_shard.observed.locations.clear();
tenant_state.observed.locations.clear();
}
}
Ok(AttachHookResponse {
gen: attach_req
.node_id
.map(|_| tenant_shard.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()),
.map(|_| tenant_state.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()),
})
}
pub(crate) fn inspect(&self, inspect_req: InspectRequest) -> InspectResponse {
let locked = self.inner.read().unwrap();
let tenant_shard = locked.tenants.get(&inspect_req.tenant_shard_id);
let tenant_state = locked.tenants.get(&inspect_req.tenant_shard_id);
InspectResponse {
attachment: tenant_shard.and_then(|s| {
attachment: tenant_state.and_then(|s| {
s.intent
.get_attached()
.map(|ps| (s.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap(), ps))
@@ -1321,11 +1319,11 @@ impl Service {
let mut locked = self.inner.write().unwrap();
for (tenant_shard_id, observed_loc) in configs.tenant_shards {
let Some(tenant_shard) = locked.tenants.get_mut(&tenant_shard_id) else {
let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else {
cleanup.push(tenant_shard_id);
continue;
};
tenant_shard
tenant_state
.observed
.locations
.insert(node.get_id(), ObservedStateLocation { conf: observed_loc });
@@ -1496,13 +1494,13 @@ impl Service {
};
for req_tenant in validate_req.tenants {
if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) {
let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen));
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
let valid = tenant_state.generation == Some(Generation::new(req_tenant.gen));
tracing::info!(
"handle_validate: {}(gen {}): valid={valid} (latest {:?})",
req_tenant.id,
req_tenant.gen,
tenant_shard.generation
tenant_state.generation
);
response.tenants.push(ValidateResponseTenant {
id: req_tenant.id,
@@ -1617,8 +1615,6 @@ impl Service {
placement_policy: serde_json::to_string(&placement_policy).unwrap(),
config: serde_json::to_string(&create_req.config).unwrap(),
splitting: SplitState::default(),
scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
.unwrap(),
})
.collect();
@@ -1641,8 +1637,6 @@ impl Service {
Err(e) => return Err(ApiError::InternalServerError(anyhow::anyhow!(e))),
};
let mut schedule_context = ScheduleContext::default();
let (waiters, response_shards) = {
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, scheduler) = locked.parts_mut();
@@ -1664,14 +1658,11 @@ impl Service {
// attached and secondary locations (independently) away frorm those
// pageservers also holding a shard for this tenant.
entry
.get_mut()
.schedule(scheduler, &mut schedule_context)
.map_err(|e| {
ApiError::Conflict(format!(
"Failed to schedule shard {tenant_shard_id}: {e}"
))
})?;
entry.get_mut().schedule(scheduler).map_err(|e| {
ApiError::Conflict(format!(
"Failed to schedule shard {tenant_shard_id}: {e}"
))
})?;
if let Some(node_id) = entry.get().intent.get_attached() {
let generation = entry
@@ -1688,7 +1679,7 @@ impl Service {
continue;
}
Entry::Vacant(entry) => {
let state = entry.insert(TenantShard::new(
let state = entry.insert(TenantState::new(
tenant_shard_id,
ShardIdentity::from_params(
tenant_shard_id.shard_number,
@@ -1699,7 +1690,7 @@ impl Service {
state.generation = initial_generation;
state.config = create_req.config.clone();
if let Err(e) = state.schedule(scheduler, &mut schedule_context) {
if let Err(e) = state.schedule(scheduler) {
schcedule_error = Some(e);
}
@@ -1763,9 +1754,6 @@ impl Service {
/// Part of [`Self::tenant_location_config`]: dissect an incoming location config request,
/// and transform it into either a tenant creation of a series of shard updates.
///
/// If the incoming request makes no changes, a [`TenantCreateOrUpdate::Update`] result will
/// still be returned.
fn tenant_location_config_prepare(
&self,
tenant_id: TenantId,
@@ -1813,12 +1801,17 @@ impl Service {
_ => None,
};
updates.push(ShardUpdate {
tenant_shard_id: *shard_id,
placement_policy: placement_policy.clone(),
tenant_config: req.config.tenant_conf.clone(),
generation: set_generation,
});
if shard.policy != placement_policy
|| shard.config != req.config.tenant_conf
|| set_generation.is_some()
{
updates.push(ShardUpdate {
tenant_shard_id: *shard_id,
placement_policy: placement_policy.clone(),
tenant_config: req.config.tenant_conf.clone(),
generation: set_generation,
});
}
}
if create {
@@ -1847,7 +1840,6 @@ impl Service {
},
)
} else {
assert!(!updates.is_empty());
TenantCreateOrUpdate::Update(updates)
}
}
@@ -1906,7 +1898,6 @@ impl Service {
// Persist updates
// Ordering: write to the database before applying changes in-memory, so that
// we will not appear time-travel backwards on a restart.
let mut schedule_context = ScheduleContext::default();
for ShardUpdate {
tenant_shard_id,
placement_policy,
@@ -1916,11 +1907,10 @@ impl Service {
{
self.persistence
.update_tenant_shard(
TenantFilter::Shard(*tenant_shard_id),
Some(placement_policy.clone()),
Some(tenant_config.clone()),
*tenant_shard_id,
placement_policy.clone(),
tenant_config.clone(),
*generation,
None,
)
.await?;
}
@@ -1954,7 +1944,7 @@ impl Service {
shard.generation = Some(generation);
}
shard.schedule(scheduler, &mut schedule_context)?;
shard.schedule(scheduler)?;
let maybe_waiter = self.maybe_reconcile_shard(shard, nodes);
if let Some(waiter) = maybe_waiter {
@@ -1998,13 +1988,7 @@ impl Service {
let config = req.config;
self.persistence
.update_tenant_shard(
TenantFilter::Tenant(req.tenant_id),
None,
Some(config.clone()),
None,
None,
)
.update_tenant_config(req.tenant_id, config.clone())
.await?;
let waiters = {
@@ -2114,7 +2098,7 @@ impl Service {
let scheduler = &locked.scheduler;
// Right now we only perform the operation on a single node without parallelization
// TODO fan out the operation to multiple nodes for better performance
let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?;
let node_id = scheduler.schedule_shard(&[])?;
let node = locked
.nodes
.get(&node_id)
@@ -2357,58 +2341,6 @@ impl Service {
Ok(StatusCode::NOT_FOUND)
}
/// Naming: this configures the storage controller's policies for a tenant, whereas [`Self::tenant_config_set`] is "set the TenantConfig"
/// for a tenant. The TenantConfig is passed through to pageservers, whereas this function modifies
/// the tenant's policies (configuration) within the storage controller
pub(crate) async fn tenant_update_policy(
&self,
tenant_id: TenantId,
req: TenantPolicyRequest,
) -> Result<(), ApiError> {
// We require an exclusive lock, because we are updating persistent and in-memory state
let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await;
let TenantPolicyRequest {
placement,
scheduling,
} = req;
self.persistence
.update_tenant_shard(
TenantFilter::Tenant(tenant_id),
placement.clone(),
None,
None,
scheduling,
)
.await?;
let mut schedule_context = ScheduleContext::default();
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, scheduler) = locked.parts_mut();
for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
if let Some(placement) = &placement {
shard.policy = placement.clone();
tracing::info!(tenant_id=%shard_id.tenant_id, shard_id=%shard_id.shard_slug(),
"Updated placement policy to {placement:?}");
}
if let Some(scheduling) = &scheduling {
shard.set_scheduling_policy(*scheduling);
tracing::info!(tenant_id=%shard_id.tenant_id, shard_id=%shard_id.shard_slug(),
"Updated scheduling policy to {scheduling:?}");
}
// In case scheduling is being switched back on, try it now.
shard.schedule(scheduler, &mut schedule_context).ok();
self.maybe_reconcile_shard(shard, nodes);
}
Ok(())
}
pub(crate) async fn tenant_timeline_create(
&self,
tenant_id: TenantId,
@@ -2735,71 +2667,45 @@ impl Service {
})
}
/// Returns None if the input iterator of shards does not include a shard with number=0
fn tenant_describe_impl<'a>(
&self,
shards: impl Iterator<Item = &'a TenantShard>,
) -> Option<TenantDescribeResponse> {
let mut shard_zero = None;
let mut describe_shards = Vec::new();
for shard in shards {
if shard.tenant_shard_id.is_zero() {
shard_zero = Some(shard);
}
describe_shards.push(TenantDescribeResponseShard {
tenant_shard_id: shard.tenant_shard_id,
node_attached: *shard.intent.get_attached(),
node_secondary: shard.intent.get_secondary().to_vec(),
last_error: shard.last_error.lock().unwrap().clone(),
is_reconciling: shard.reconciler.is_some(),
is_pending_compute_notification: shard.pending_compute_notification,
is_splitting: matches!(shard.splitting, SplitState::Splitting),
scheduling_policy: *shard.get_scheduling_policy(),
})
}
let shard_zero = shard_zero?;
Some(TenantDescribeResponse {
tenant_id: shard_zero.tenant_shard_id.tenant_id,
shards: describe_shards,
stripe_size: shard_zero.shard.stripe_size,
policy: shard_zero.policy.clone(),
config: shard_zero.config.clone(),
})
}
pub(crate) fn tenant_describe(
&self,
tenant_id: TenantId,
) -> Result<TenantDescribeResponse, ApiError> {
let locked = self.inner.read().unwrap();
self.tenant_describe_impl(
locked
.tenants
.range(TenantShardId::tenant_range(tenant_id))
.map(|(_k, v)| v),
)
.ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into()))
}
let mut shard_zero = None;
let mut shards = Vec::new();
pub(crate) fn tenant_list(&self) -> Vec<TenantDescribeResponse> {
let locked = self.inner.read().unwrap();
let mut result = Vec::new();
for (_tenant_id, tenant_shards) in
&locked.tenants.iter().group_by(|(id, _shard)| id.tenant_id)
for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id))
{
result.push(
self.tenant_describe_impl(tenant_shards.map(|(_k, v)| v))
.expect("Groups are always non-empty"),
);
if tenant_shard_id.is_zero() {
shard_zero = Some(shard);
}
let response_shard = TenantDescribeResponseShard {
tenant_shard_id: *tenant_shard_id,
node_attached: *shard.intent.get_attached(),
node_secondary: shard.intent.get_secondary().to_vec(),
last_error: shard.last_error.lock().unwrap().clone(),
is_reconciling: shard.reconciler.is_some(),
is_pending_compute_notification: shard.pending_compute_notification,
is_splitting: matches!(shard.splitting, SplitState::Splitting),
};
shards.push(response_shard);
}
result
let Some(shard_zero) = shard_zero else {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant {tenant_id} not found").into(),
));
};
Ok(TenantDescribeResponse {
shards,
stripe_size: shard_zero.shard.stripe_size,
policy: shard_zero.policy.clone(),
config: shard_zero.config.clone(),
})
}
#[instrument(skip_all, fields(tenant_id=%op.tenant_id))]
@@ -2892,7 +2798,7 @@ impl Service {
tracing::info!("Restoring parent shard {tenant_shard_id}");
shard.splitting = SplitState::Idle;
if let Err(e) = shard.schedule(scheduler, &mut ScheduleContext::default()) {
if let Err(e) = shard.schedule(scheduler) {
// If this shard can't be scheduled now (perhaps due to offline nodes or
// capacity issues), that must not prevent us rolling back a split. In this
// case it should be eventually scheduled in the background.
@@ -3016,7 +2922,6 @@ impl Service {
)
};
let mut schedule_context = ScheduleContext::default();
for child in child_ids {
let mut child_shard = parent_ident;
child_shard.number = child.shard_number;
@@ -3038,7 +2943,7 @@ impl Service {
},
);
let mut child_state = TenantShard::new(child, child_shard, policy.clone());
let mut child_state = TenantState::new(child, child_shard, policy.clone());
child_state.intent = IntentState::single(scheduler, Some(pageserver));
child_state.observed = ObservedState {
locations: child_observed,
@@ -3046,13 +2951,13 @@ impl Service {
child_state.generation = Some(generation);
child_state.config = config.clone();
// The child's TenantShard::splitting is intentionally left at the default value of Idle,
// The child's TenantState::splitting is intentionally left at the default value of Idle,
// as at this point in the split process we have succeeded and this part is infallible:
// we will never need to do any special recovery from this state.
child_locations.push((child, pageserver, child_shard.stripe_size));
if let Err(e) = child_state.schedule(scheduler, &mut schedule_context) {
if let Err(e) = child_state.schedule(scheduler) {
// This is not fatal, because we've implicitly already got an attached
// location for the child shard. Failure here just means we couldn't
// find a secondary (e.g. because cluster is overloaded).
@@ -3345,10 +3250,6 @@ impl Service {
placement_policy: serde_json::to_string(&policy).unwrap(),
config: serde_json::to_string(&config).unwrap(),
splitting: SplitState::Splitting,
// Scheduling policies do not carry through to children
scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
.unwrap(),
});
}
@@ -3595,8 +3496,8 @@ impl Service {
Ok(())
}
/// For debug/support: a full JSON dump of TenantShards. Returns a response so that
/// we don't have to make TenantShard clonable in the return path.
/// For debug/support: a full JSON dump of TenantStates. Returns a response so that
/// we don't have to make TenantState clonable in the return path.
pub(crate) fn tenants_dump(&self) -> Result<hyper::Response<hyper::Body>, ApiError> {
let serialized = {
let locked = self.inner.read().unwrap();
@@ -3700,7 +3601,7 @@ impl Service {
}
/// For debug/support: a JSON dump of the [`Scheduler`]. Returns a response so that
/// we don't have to make TenantShard clonable in the return path.
/// we don't have to make TenantState clonable in the return path.
pub(crate) fn scheduler_dump(&self) -> Result<hyper::Response<hyper::Body>, ApiError> {
let serialized = {
let locked = self.inner.read().unwrap();
@@ -3916,9 +3817,8 @@ impl Service {
AvailabilityTransition::ToOffline => {
tracing::info!("Node {} transition to offline", node_id);
let mut tenants_affected: usize = 0;
for (tenant_shard_id, tenant_shard) in tenants {
if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) {
for (tenant_shard_id, tenant_state) in tenants {
if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) {
// When a node goes offline, we set its observed configuration to None, indicating unknown: we will
// not assume our knowledge of the node's configuration is accurate until it comes back online
observed_loc.conf = None;
@@ -3931,24 +3831,18 @@ impl Service {
continue;
}
if tenant_shard.intent.demote_attached(node_id) {
tenant_shard.sequence = tenant_shard.sequence.next();
// TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters
// for tenants without secondary locations: if they have a secondary location, then this
// schedule() call is just promoting an existing secondary)
let mut schedule_context = ScheduleContext::default();
match tenant_shard.schedule(scheduler, &mut schedule_context) {
if tenant_state.intent.demote_attached(node_id) {
tenant_state.sequence = tenant_state.sequence.next();
match tenant_state.schedule(scheduler) {
Err(e) => {
// It is possible that some tenants will become unschedulable when too many pageservers
// go offline: in this case there isn't much we can do other than make the issue observable.
// TODO: give TenantShard a scheduling error attribute to be queried later.
// TODO: give TenantState a scheduling error attribute to be queried later.
tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id);
}
Ok(()) => {
if self
.maybe_reconcile_shard(tenant_shard, &new_nodes)
.maybe_reconcile_shard(tenant_state, &new_nodes)
.is_some()
{
tenants_affected += 1;
@@ -3967,10 +3861,10 @@ impl Service {
tracing::info!("Node {} transition to active", node_id);
// When a node comes back online, we must reconcile any tenant that has a None observed
// location on the node.
for tenant_shard in locked.tenants.values_mut() {
if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) {
for tenant_state in locked.tenants.values_mut() {
if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) {
if observed_loc.conf.is_none() {
self.maybe_reconcile_shard(tenant_shard, &new_nodes);
self.maybe_reconcile_shard(tenant_state, &new_nodes);
}
}
}
@@ -3990,6 +3884,9 @@ impl Service {
/// Helper for methods that will try and call pageserver APIs for
/// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
/// is attached somewhere.
///
/// TODO: this doesn't actually ensure attached unless the PlacementPolicy is
/// an attached policy. We should error out if it isn't.
fn ensure_attached_schedule(
&self,
mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,
@@ -3998,27 +3895,10 @@ impl Service {
let mut waiters = Vec::new();
let (nodes, tenants, scheduler) = locked.parts_mut();
let mut schedule_context = ScheduleContext::default();
for (tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
shard.schedule(scheduler, &mut schedule_context)?;
// The shard's policies may not result in an attached location being scheduled: this
// is an error because our caller needs it attached somewhere.
if shard.intent.get_attached().is_none() {
return Err(anyhow::anyhow!(
"Tenant {tenant_id} not scheduled to be attached"
));
};
if shard.stably_attached().is_some() {
// We do not require the shard to be totally up to date on reconciliation: we just require
// that it has been attached on the intended node. Other dirty state such as unattached secondary
// locations, or compute hook notifications can be ignored.
continue;
}
for (_tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
shard.schedule(scheduler)?;
if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
tracing::info!("Waiting for shard {tenant_shard_id} to reconcile, in order to ensure it is attached");
waiters.push(waiter);
}
}
@@ -4053,11 +3933,11 @@ impl Service {
Ok(())
}
/// Convenience wrapper around [`TenantShard::maybe_reconcile`] that provides
/// Convenience wrapper around [`TenantState::maybe_reconcile`] that provides
/// all the references to parts of Self that are needed
fn maybe_reconcile_shard(
&self,
shard: &mut TenantShard,
shard: &mut TenantState,
nodes: &Arc<HashMap<NodeId, Node>>,
) -> Option<ReconcilerWaiter> {
shard.maybe_reconcile(
@@ -4080,144 +3960,8 @@ impl Service {
let (nodes, tenants, _scheduler) = locked.parts_mut();
let pageservers = nodes.clone();
let mut schedule_context = ScheduleContext::default();
let mut reconciles_spawned = 0;
for (tenant_shard_id, shard) in tenants.iter_mut() {
if tenant_shard_id.is_zero() {
schedule_context = ScheduleContext::default();
}
// Eventual consistency: if an earlier reconcile job failed, and the shard is still
// dirty, spawn another rone
if self.maybe_reconcile_shard(shard, &pageservers).is_some() {
reconciles_spawned += 1;
}
schedule_context.avoid(&shard.intent.all_pageservers());
}
reconciles_spawned
}
/// `optimize` in this context means identifying shards which have valid scheduled locations, but
/// could be scheduled somewhere better:
/// - Cutting over to a secondary if the node with the secondary is more lightly loaded
/// * e.g. after a node fails then recovers, to move some work back to it
/// - Cutting over to a secondary if it improves the spread of shard attachments within a tenant
/// * e.g. after a shard split, the initial attached locations will all be on the node where
/// we did the split, but are probably better placed elsewhere.
/// - Creating new secondary locations if it improves the spreading of a sharded tenant
/// * e.g. after a shard split, some locations will be on the same node (where the split
/// happened), and will probably be better placed elsewhere.
///
/// To put it more briefly: whereas the scheduler respects soft constraints in a ScheduleContext at
/// the time of scheduling, this function looks for cases where a better-scoring location is available
/// according to those same soft constraints.
fn optimize_all(&self) -> usize {
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, scheduler) = locked.parts_mut();
let pageservers = nodes.clone();
let mut schedule_context = ScheduleContext::default();
let mut reconciles_spawned = 0;
let mut tenant_shards: Vec<&TenantShard> = Vec::new();
// Limit on how many shards' optmizations each call to this function will execute. Combined
// with the frequency of background calls, this acts as an implicit rate limit that runs a small
// trickle of optimizations in the background, rather than executing a large number in parallel
// when a change occurs.
const MAX_OPTIMIZATIONS_PER_PASS: usize = 2;
let mut work = Vec::new();
for (tenant_shard_id, shard) in tenants.iter() {
if tenant_shard_id.is_zero() {
// Reset accumulators on the first shard in a tenant
schedule_context = ScheduleContext::default();
tenant_shards.clear();
}
if work.len() >= MAX_OPTIMIZATIONS_PER_PASS {
break;
}
match shard.get_scheduling_policy() {
ShardSchedulingPolicy::Active => {
// Ok to do optimization
}
ShardSchedulingPolicy::Essential
| ShardSchedulingPolicy::Pause
| ShardSchedulingPolicy::Stop => {
// Policy prevents optimizing this shard.
continue;
}
}
// Accumulate the schedule context for all the shards in a tenant: we must have
// the total view of all shards before we can try to optimize any of them.
schedule_context.avoid(&shard.intent.all_pageservers());
if let Some(attached) = shard.intent.get_attached() {
schedule_context.push_attached(*attached);
}
tenant_shards.push(shard);
// Once we have seen the last shard in the tenant, proceed to search across all shards
// in the tenant for optimizations
if shard.shard.number.0 == shard.shard.count.count() - 1 {
if tenant_shards.iter().any(|s| s.reconciler.is_some()) {
// Do not start any optimizations while another change to the tenant is ongoing: this
// is not necessary for correctness, but simplifies operations and implicitly throttles
// optimization changes to happen in a "trickle" over time.
continue;
}
if tenant_shards.iter().any(|s| {
!matches!(s.splitting, SplitState::Idle)
|| matches!(s.policy, PlacementPolicy::Detached)
}) {
// Never attempt to optimize a tenant that is currently being split, or
// a tenant that is meant to be detached
continue;
}
// TODO: optimization calculations are relatively expensive: create some fast-path for
// the common idle case (avoiding the search on tenants that we have recently checked)
for shard in &tenant_shards {
if let Some(optimization) =
// If idle, maybe ptimize attachments: if a shard has a secondary location that is preferable to
// its primary location based on soft constraints, cut it over.
shard.optimize_attachment(nodes, &schedule_context)
{
work.push((shard.tenant_shard_id, optimization));
break;
} else if let Some(optimization) =
// If idle, maybe optimize secondary locations: if a shard has a secondary location that would be
// better placed on another node, based on ScheduleContext, then adjust it. This
// covers cases like after a shard split, where we might have too many shards
// in the same tenant with secondary locations on the node where they originally split.
shard.optimize_secondary(scheduler, &schedule_context)
{
work.push((shard.tenant_shard_id, optimization));
break;
}
// TODO: extend this mechanism to prefer attaching on nodes with fewer attached
// tenants (i.e. extend schedule state to distinguish attached from secondary counts),
// for the total number of attachments on a node (not just within a tenant.)
}
}
}
for (tenant_shard_id, optimization) in work {
let shard = tenants
.get_mut(&tenant_shard_id)
.expect("We held lock from place we got this ID");
shard.apply_optimization(scheduler, optimization);
for (_tenant_shard_id, shard) in tenants.iter_mut() {
if self.maybe_reconcile_shard(shard, &pageservers).is_some() {
reconciles_spawned += 1;
}
@@ -4226,35 +3970,9 @@ impl Service {
reconciles_spawned
}
/// Useful for tests: run whatever work a background [`Self::reconcile_all`] would have done, but
/// also wait for any generated Reconcilers to complete. Calling this until it returns zero should
/// put the system into a quiescent state where future background reconciliations won't do anything.
pub(crate) async fn reconcile_all_now(&self) -> Result<usize, ReconcileWaitError> {
let reconciles_spawned = self.reconcile_all();
if reconciles_spawned == 0 {
// Only optimize when we are otherwise idle
self.optimize_all();
}
let waiters = {
let mut waiters = Vec::new();
let locked = self.inner.read().unwrap();
for (_tenant_shard_id, shard) in locked.tenants.iter() {
if let Some(waiter) = shard.get_waiter() {
waiters.push(waiter);
}
}
waiters
};
let waiter_count = waiters.len();
self.await_waiters(waiters, RECONCILE_TIMEOUT).await?;
Ok(waiter_count)
}
pub async fn shutdown(&self) {
// Note that this already stops processing any results from reconciles: so
// we do not expect that our [`TenantShard`] objects will reach a neat
// we do not expect that our [`TenantState`] objects will reach a neat
// final state.
self.cancel.cancel();

View File

@@ -7,9 +7,8 @@ use std::{
use crate::{
metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
persistence::TenantShardPersistence,
scheduler::{AffinityScore, MaySchedule, ScheduleContext},
};
use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy};
use pageserver_api::controller_api::PlacementPolicy;
use pageserver_api::{
models::{LocationConfig, LocationConfigMode, TenantConfig},
shard::{ShardIdentity, TenantShardId},
@@ -50,7 +49,7 @@ where
/// This struct implement Serialize for debugging purposes, but is _not_ persisted
/// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted.
#[derive(Serialize)]
pub(crate) struct TenantShard {
pub(crate) struct TenantState {
pub(crate) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity,
@@ -117,10 +116,6 @@ pub(crate) struct TenantShard {
/// sending it. This is the mechanism by which compute notifications are included in the scope
/// of state that we publish externally in an eventually consistent way.
pub(crate) pending_compute_notification: bool,
// Support/debug tool: if something is going wrong or flapping with scheduling, this may
// be set to a non-active state to avoid making changes while the issue is fixed.
scheduling_policy: ShardSchedulingPolicy,
}
#[derive(Default, Clone, Debug, Serialize)]
@@ -251,13 +246,8 @@ impl IntentState {
impl Drop for IntentState {
fn drop(&mut self) {
// Must clear before dropping, to avoid leaving stale refcounts in the Scheduler.
// We do not check this while panicking, to avoid polluting unit test failures or
// other assertions with this assertion's output. It's still wrong to leak these,
// but if we already have a panic then we don't need to independently flag this case.
if !(std::thread::panicking()) {
debug_assert!(self.attached.is_none() && self.secondary.is_empty());
}
// Must clear before dropping, to avoid leaving stale refcounts in the Scheduler
debug_assert!(self.attached.is_none() && self.secondary.is_empty());
}
}
@@ -302,26 +292,6 @@ pub enum ReconcileWaitError {
Failed(TenantShardId, String),
}
#[derive(Eq, PartialEq, Debug)]
pub(crate) struct ReplaceSecondary {
old_node_id: NodeId,
new_node_id: NodeId,
}
#[derive(Eq, PartialEq, Debug)]
pub(crate) struct MigrateAttachment {
old_attached_node_id: NodeId,
new_attached_node_id: NodeId,
}
#[derive(Eq, PartialEq, Debug)]
pub(crate) enum ScheduleOptimization {
// Replace one of our secondary locations with a different node
ReplaceSecondary(ReplaceSecondary),
// Migrate attachment to an existing secondary location
MigrateAttachment(MigrateAttachment),
}
impl ReconcilerWaiter {
pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> {
tokio::select! {
@@ -354,7 +324,7 @@ pub(crate) struct ReconcilerHandle {
}
/// When a reconcile task completes, it sends this result object
/// to be applied to the primary TenantShard.
/// to be applied to the primary TenantState.
pub(crate) struct ReconcileResult {
pub(crate) sequence: Sequence,
/// On errors, `observed` should be treated as an incompleted description
@@ -367,7 +337,7 @@ pub(crate) struct ReconcileResult {
pub(crate) generation: Option<Generation>,
pub(crate) observed: ObservedState,
/// Set [`TenantShard::pending_compute_notification`] from this flag
/// Set [`TenantState::pending_compute_notification`] from this flag
pub(crate) pending_compute_notification: bool,
}
@@ -379,7 +349,7 @@ impl ObservedState {
}
}
impl TenantShard {
impl TenantState {
pub(crate) fn new(
tenant_shard_id: TenantShardId,
shard: ShardIdentity,
@@ -400,7 +370,6 @@ impl TenantShard {
error_waiter: Arc::new(SeqWait::new(Sequence(0))),
last_error: Arc::default(),
pending_compute_notification: false,
scheduling_policy: ShardSchedulingPolicy::default(),
}
}
@@ -456,7 +425,6 @@ impl TenantShard {
fn schedule_attached(
&mut self,
scheduler: &mut Scheduler,
context: &ScheduleContext,
) -> Result<(bool, NodeId), ScheduleError> {
// No work to do if we already have an attached tenant
if let Some(node_id) = self.intent.attached {
@@ -470,33 +438,14 @@ impl TenantShard {
Ok((true, promote_secondary))
} else {
// Pick a fresh node: either we had no secondaries or none were schedulable
let node_id = scheduler.schedule_shard(&self.intent.secondary, context)?;
let node_id = scheduler.schedule_shard(&self.intent.secondary)?;
tracing::debug!("Selected {} as attached", node_id);
self.intent.set_attached(scheduler, Some(node_id));
Ok((true, node_id))
}
}
pub(crate) fn schedule(
&mut self,
scheduler: &mut Scheduler,
context: &mut ScheduleContext,
) -> Result<(), ScheduleError> {
let r = self.do_schedule(scheduler, context);
context.avoid(&self.intent.all_pageservers());
if let Some(attached) = self.intent.get_attached() {
context.push_attached(*attached);
}
r
}
pub(crate) fn do_schedule(
&mut self,
scheduler: &mut Scheduler,
context: &ScheduleContext,
) -> Result<(), ScheduleError> {
pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
// TODO: before scheduling new nodes, check if any existing content in
// self.intent refers to pageservers that are offline, and pick other
// pageservers if so.
@@ -504,16 +453,6 @@ impl TenantShard {
// TODO: respect the splitting bit on tenants: if they are currently splitting then we may not
// change their attach location.
match self.scheduling_policy {
ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => {}
ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => {
// Warn to make it obvious why other things aren't happening/working, if we skip scheduling
tracing::warn!(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
"Scheduling is disabled by policy {:?}", self.scheduling_policy);
return Ok(());
}
}
// Build the set of pageservers already in use by this tenant, to avoid scheduling
// more work on the same pageservers we're already using.
let mut modified = false;
@@ -540,13 +479,12 @@ impl TenantShard {
}
// Should have exactly one attached, and N secondaries
let (modified_attached, attached_node_id) =
self.schedule_attached(scheduler, context)?;
let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
modified |= modified_attached;
let mut used_pageservers = vec![attached_node_id];
while self.intent.secondary.len() < secondary_count {
let node_id = scheduler.schedule_shard(&used_pageservers, context)?;
let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.push_secondary(scheduler, node_id);
used_pageservers.push(node_id);
modified = true;
@@ -559,7 +497,7 @@ impl TenantShard {
modified = true;
} else if self.intent.secondary.is_empty() {
// Populate secondary by scheduling a fresh node
let node_id = scheduler.schedule_shard(&[], context)?;
let node_id = scheduler.schedule_shard(&[])?;
self.intent.push_secondary(scheduler, node_id);
modified = true;
}
@@ -586,167 +524,6 @@ impl TenantShard {
Ok(())
}
/// Optimize attachments: if a shard has a secondary location that is preferable to
/// its primary location based on soft constraints, switch that secondary location
/// to be attached.
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
pub(crate) fn optimize_attachment(
&self,
nodes: &HashMap<NodeId, Node>,
schedule_context: &ScheduleContext,
) -> Option<ScheduleOptimization> {
let attached = (*self.intent.get_attached())?;
if self.intent.secondary.is_empty() {
// We can only do useful work if we have both attached and secondary locations: this
// function doesn't schedule new locations, only swaps between attached and secondaries.
return None;
}
let current_affinity_score = schedule_context.get_node_affinity(attached);
let current_attachment_count = schedule_context.get_node_attachments(attached);
// Generate score for each node, dropping any un-schedulable nodes.
let all_pageservers = self.intent.all_pageservers();
let mut scores = all_pageservers
.iter()
.flat_map(|node_id| {
if matches!(
nodes
.get(node_id)
.map(|n| n.may_schedule())
.unwrap_or(MaySchedule::No),
MaySchedule::No
) {
None
} else {
let affinity_score = schedule_context.get_node_affinity(*node_id);
let attachment_count = schedule_context.get_node_attachments(*node_id);
Some((*node_id, affinity_score, attachment_count))
}
})
.collect::<Vec<_>>();
// Sort precedence:
// 1st - prefer nodes with the lowest total affinity score
// 2nd - prefer nodes with the lowest number of attachments in this context
// 3rd - if all else is equal, sort by node ID for determinism in tests.
scores.sort_by_key(|i| (i.1, i.2, i.0));
if let Some((preferred_node, preferred_affinity_score, preferred_attachment_count)) =
scores.first()
{
if attached != *preferred_node {
// The best alternative must be more than 1 better than us, otherwise we could end
// up flapping back next time we're called (e.g. there's no point migrating from
// a location with score 1 to a score zero, because on next location the situation
// would be the same, but in reverse).
if current_affinity_score > *preferred_affinity_score + AffinityScore(1)
|| current_attachment_count > *preferred_attachment_count + 1
{
tracing::info!(
"Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})",
self.intent.get_secondary()
);
return Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
old_attached_node_id: attached,
new_attached_node_id: *preferred_node,
}));
}
} else {
tracing::debug!(
"Node {} is already preferred (score {:?})",
preferred_node,
preferred_affinity_score
);
}
}
// Fall-through: we didn't find an optimization
None
}
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
pub(crate) fn optimize_secondary(
&self,
scheduler: &Scheduler,
schedule_context: &ScheduleContext,
) -> Option<ScheduleOptimization> {
if self.intent.secondary.is_empty() {
// We can only do useful work if we have both attached and secondary locations: this
// function doesn't schedule new locations, only swaps between attached and secondaries.
return None;
}
for secondary in self.intent.get_secondary() {
let Some(affinity_score) = schedule_context.nodes.get(secondary) else {
// We're already on a node unaffected any affinity constraints,
// so we won't change it.
continue;
};
// Let the scheduler suggest a node, where it would put us if we were scheduling afresh
// This implicitly limits the choice to nodes that are available, and prefers nodes
// with lower utilization.
let Ok(candidate_node) =
scheduler.schedule_shard(&self.intent.all_pageservers(), schedule_context)
else {
// A scheduling error means we have no possible candidate replacements
continue;
};
let candidate_affinity_score = schedule_context
.nodes
.get(&candidate_node)
.unwrap_or(&AffinityScore::FREE);
// The best alternative must be more than 1 better than us, otherwise we could end
// up flapping back next time we're called.
if *candidate_affinity_score + AffinityScore(1) < *affinity_score {
// If some other node is available and has a lower score than this node, then
// that other node is a good place to migrate to.
tracing::info!(
"Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})",
self.intent.get_secondary()
);
return Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
old_node_id: *secondary,
new_node_id: candidate_node,
}));
}
}
None
}
pub(crate) fn apply_optimization(
&mut self,
scheduler: &mut Scheduler,
optimization: ScheduleOptimization,
) {
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_schedule_optimization
.inc();
match optimization {
ScheduleOptimization::MigrateAttachment(MigrateAttachment {
old_attached_node_id,
new_attached_node_id,
}) => {
self.intent.demote_attached(old_attached_node_id);
self.intent
.promote_attached(scheduler, new_attached_node_id);
}
ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
old_node_id,
new_node_id,
}) => {
self.intent.remove_secondary(scheduler, old_node_id);
self.intent.push_secondary(scheduler, new_node_id);
}
}
}
/// Query whether the tenant's observed state for attached node matches its intent state, and if so,
/// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that
/// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there.
@@ -891,19 +668,6 @@ impl TenantShard {
}
}
// Pre-checks done: finally check whether we may actually do the work
match self.scheduling_policy {
ShardSchedulingPolicy::Active
| ShardSchedulingPolicy::Essential
| ShardSchedulingPolicy::Pause => {}
ShardSchedulingPolicy::Stop => {
// We only reach this point if there is work to do and we're going to skip
// doing it: warn it obvious why this tenant isn't doing what it ought to.
tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy);
return None;
}
}
// Build list of nodes from which the reconciler should detach
let mut detach = Vec::new();
for node_id in self.observed.locations.keys() {
@@ -1040,22 +804,6 @@ impl TenantShard {
})
}
/// Get a waiter for any reconciliation in flight, but do not start reconciliation
/// if it is not already running
pub(crate) fn get_waiter(&self) -> Option<ReconcilerWaiter> {
if self.reconciler.is_some() {
Some(ReconcilerWaiter {
tenant_shard_id: self.tenant_shard_id,
seq_wait: self.waiter.clone(),
error_seq_wait: self.error_waiter.clone(),
error: self.last_error.clone(),
seq: self.sequence,
})
} else {
None
}
}
/// Called when a ReconcileResult has been emitted and the service is updating
/// our state: if the result is from a sequence >= my ReconcileHandle, then drop
/// the handle to indicate there is no longer a reconciliation in progress.
@@ -1081,40 +829,6 @@ impl TenantShard {
debug_assert!(!self.intent.all_pageservers().contains(&node_id));
}
pub(crate) fn set_scheduling_policy(&mut self, p: ShardSchedulingPolicy) {
self.scheduling_policy = p;
}
pub(crate) fn get_scheduling_policy(&self) -> &ShardSchedulingPolicy {
&self.scheduling_policy
}
pub(crate) fn from_persistent(
tsp: TenantShardPersistence,
intent: IntentState,
) -> anyhow::Result<Self> {
let tenant_shard_id = tsp.get_tenant_shard_id()?;
let shard_identity = tsp.get_shard_identity()?;
Ok(Self {
tenant_shard_id,
shard: shard_identity,
sequence: Sequence::initial(),
generation: tsp.generation.map(|g| Generation::new(g as u32)),
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
intent,
observed: ObservedState::new(),
config: serde_json::from_str(&tsp.config).unwrap(),
reconciler: None,
splitting: tsp.splitting,
waiter: Arc::new(SeqWait::new(Sequence::initial())),
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
last_error: Arc::default(),
pending_compute_notification: false,
scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
})
}
pub(crate) fn to_persistent(&self) -> TenantShardPersistence {
TenantShardPersistence {
tenant_id: self.tenant_shard_id.tenant_id.to_string(),
@@ -1126,7 +840,6 @@ impl TenantShard {
placement_policy: serde_json::to_string(&self.policy).unwrap(),
config: serde_json::to_string(&self.config).unwrap(),
splitting: SplitState::default(),
scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(),
}
}
}
@@ -1143,7 +856,7 @@ pub(crate) mod tests {
use super::*;
fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantShard {
fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState {
let tenant_id = TenantId::generate();
let shard_number = ShardNumber(0);
let shard_count = ShardCount::new(1);
@@ -1153,7 +866,7 @@ pub(crate) mod tests {
shard_number,
shard_count,
};
TenantShard::new(
TenantState::new(
tenant_shard_id,
ShardIdentity::new(
shard_number,
@@ -1165,32 +878,6 @@ pub(crate) mod tests {
)
}
fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec<TenantShard> {
let tenant_id = TenantId::generate();
(0..shard_count.count())
.map(|i| {
let shard_number = ShardNumber(i);
let tenant_shard_id = TenantShardId {
tenant_id,
shard_number,
shard_count,
};
TenantShard::new(
tenant_shard_id,
ShardIdentity::new(
shard_number,
shard_count,
pageserver_api::shard::ShardStripeSize(32768),
)
.unwrap(),
policy.clone(),
)
})
.collect()
}
/// Test the scheduling behaviors used when a tenant configured for HA is subject
/// to nodes being marked offline.
#[test]
@@ -1200,26 +887,25 @@ pub(crate) mod tests {
let mut nodes = make_test_nodes(3);
let mut scheduler = Scheduler::new(nodes.values());
let mut context = ScheduleContext::default();
let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
tenant_shard
.schedule(&mut scheduler, &mut context)
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
tenant_state
.schedule(&mut scheduler)
.expect("we have enough nodes, scheduling should work");
// Expect to initially be schedule on to different nodes
assert_eq!(tenant_shard.intent.secondary.len(), 1);
assert!(tenant_shard.intent.attached.is_some());
assert_eq!(tenant_state.intent.secondary.len(), 1);
assert!(tenant_state.intent.attached.is_some());
let attached_node_id = tenant_shard.intent.attached.unwrap();
let secondary_node_id = *tenant_shard.intent.secondary.iter().last().unwrap();
let attached_node_id = tenant_state.intent.attached.unwrap();
let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap();
assert_ne!(attached_node_id, secondary_node_id);
// Notifying the attached node is offline should demote it to a secondary
let changed = tenant_shard.intent.demote_attached(attached_node_id);
let changed = tenant_state.intent.demote_attached(attached_node_id);
assert!(changed);
assert!(tenant_shard.intent.attached.is_none());
assert_eq!(tenant_shard.intent.secondary.len(), 2);
assert!(tenant_state.intent.attached.is_none());
assert_eq!(tenant_state.intent.secondary.len(), 2);
// Update the scheduler state to indicate the node is offline
nodes
@@ -1229,18 +915,18 @@ pub(crate) mod tests {
scheduler.node_upsert(nodes.get(&attached_node_id).unwrap());
// Scheduling the node should promote the still-available secondary node to attached
tenant_shard
.schedule(&mut scheduler, &mut context)
tenant_state
.schedule(&mut scheduler)
.expect("active nodes are available");
assert_eq!(tenant_shard.intent.attached.unwrap(), secondary_node_id);
assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id);
// The original attached node should have been retained as a secondary
assert_eq!(
*tenant_shard.intent.secondary.iter().last().unwrap(),
*tenant_state.intent.secondary.iter().last().unwrap(),
attached_node_id
);
tenant_shard.intent.clear(&mut scheduler);
tenant_state.intent.clear(&mut scheduler);
Ok(())
}
@@ -1250,263 +936,48 @@ pub(crate) mod tests {
let nodes = make_test_nodes(3);
let mut scheduler = Scheduler::new(nodes.values());
let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
tenant_shard.observed.locations.insert(
tenant_state.observed.locations.insert(
NodeId(3),
ObservedStateLocation {
conf: Some(LocationConfig {
mode: LocationConfigMode::AttachedMulti,
generation: Some(2),
secondary_conf: None,
shard_number: tenant_shard.shard.number.0,
shard_count: tenant_shard.shard.count.literal(),
shard_stripe_size: tenant_shard.shard.stripe_size.0,
shard_number: tenant_state.shard.number.0,
shard_count: tenant_state.shard.count.literal(),
shard_stripe_size: tenant_state.shard.stripe_size.0,
tenant_conf: TenantConfig::default(),
}),
},
);
tenant_shard.observed.locations.insert(
tenant_state.observed.locations.insert(
NodeId(2),
ObservedStateLocation {
conf: Some(LocationConfig {
mode: LocationConfigMode::AttachedStale,
generation: Some(1),
secondary_conf: None,
shard_number: tenant_shard.shard.number.0,
shard_count: tenant_shard.shard.count.literal(),
shard_stripe_size: tenant_shard.shard.stripe_size.0,
shard_number: tenant_state.shard.number.0,
shard_count: tenant_state.shard.count.literal(),
shard_stripe_size: tenant_state.shard.stripe_size.0,
tenant_conf: TenantConfig::default(),
}),
},
);
tenant_shard.intent_from_observed(&mut scheduler);
tenant_state.intent_from_observed(&mut scheduler);
// The highest generationed attached location gets used as attached
assert_eq!(tenant_shard.intent.attached, Some(NodeId(3)));
assert_eq!(tenant_state.intent.attached, Some(NodeId(3)));
// Other locations get used as secondary
assert_eq!(tenant_shard.intent.secondary, vec![NodeId(2)]);
assert_eq!(tenant_state.intent.secondary, vec![NodeId(2)]);
scheduler.consistency_check(nodes.values(), [&tenant_shard].into_iter())?;
tenant_shard.intent.clear(&mut scheduler);
Ok(())
}
#[test]
fn scheduling_mode() -> anyhow::Result<()> {
let nodes = make_test_nodes(3);
let mut scheduler = Scheduler::new(nodes.values());
let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
// In pause mode, schedule() shouldn't do anything
tenant_shard.scheduling_policy = ShardSchedulingPolicy::Pause;
assert!(tenant_shard
.schedule(&mut scheduler, &mut ScheduleContext::default())
.is_ok());
assert!(tenant_shard.intent.all_pageservers().is_empty());
// In active mode, schedule() works
tenant_shard.scheduling_policy = ShardSchedulingPolicy::Active;
assert!(tenant_shard
.schedule(&mut scheduler, &mut ScheduleContext::default())
.is_ok());
assert!(!tenant_shard.intent.all_pageservers().is_empty());
tenant_shard.intent.clear(&mut scheduler);
Ok(())
}
#[test]
fn optimize_attachment() -> anyhow::Result<()> {
let nodes = make_test_nodes(3);
let mut scheduler = Scheduler::new(nodes.values());
let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
// Initially: both nodes attached on shard 1, and both have secondary locations
// on different nodes.
shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
shard_a.intent.push_secondary(&mut scheduler, NodeId(2));
shard_b.intent.set_attached(&mut scheduler, Some(NodeId(1)));
shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
let mut schedule_context = ScheduleContext::default();
schedule_context.avoid(&shard_a.intent.all_pageservers());
schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
schedule_context.avoid(&shard_b.intent.all_pageservers());
schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context);
// Either shard should recognize that it has the option to switch to a secondary location where there
// would be no other shards from the same tenant, and request to do so.
assert_eq!(
optimization_a,
Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
old_attached_node_id: NodeId(1),
new_attached_node_id: NodeId(2)
}))
);
// Note that these optimizing two shards in the same tenant with the same ScheduleContext is
// mutually exclusive (the optimization of one invalidates the stats) -- it is the responsibility
// of [`Service::optimize_all`] to avoid trying
// to do optimizations for multiple shards in the same tenant at the same time. Generating
// both optimizations is just done for test purposes
let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context);
assert_eq!(
optimization_b,
Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
old_attached_node_id: NodeId(1),
new_attached_node_id: NodeId(3)
}))
);
// Applying these optimizations should result in the end state proposed
shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(2)));
assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]);
shard_b.apply_optimization(&mut scheduler, optimization_b.unwrap());
assert_eq!(shard_b.intent.get_attached(), &Some(NodeId(3)));
assert_eq!(shard_b.intent.get_secondary(), &vec![NodeId(1)]);
shard_a.intent.clear(&mut scheduler);
shard_b.intent.clear(&mut scheduler);
Ok(())
}
#[test]
fn optimize_secondary() -> anyhow::Result<()> {
let nodes = make_test_nodes(4);
let mut scheduler = Scheduler::new(nodes.values());
let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
// Initially: both nodes attached on shard 1, and both have secondary locations
// on different nodes.
shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
shard_a.intent.push_secondary(&mut scheduler, NodeId(3));
shard_b.intent.set_attached(&mut scheduler, Some(NodeId(2)));
shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
let mut schedule_context = ScheduleContext::default();
schedule_context.avoid(&shard_a.intent.all_pageservers());
schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
schedule_context.avoid(&shard_b.intent.all_pageservers());
schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
let optimization_a = shard_a.optimize_secondary(&scheduler, &schedule_context);
// Since there is a node with no locations available, the node with two locations for the
// same tenant should generate an optimization to move one away
assert_eq!(
optimization_a,
Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
old_node_id: NodeId(3),
new_node_id: NodeId(4)
}))
);
shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1)));
assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(4)]);
shard_a.intent.clear(&mut scheduler);
shard_b.intent.clear(&mut scheduler);
Ok(())
}
// Optimize til quiescent: this emulates what Service::optimize_all does, when
// called repeatedly in the background.
fn optimize_til_idle(
nodes: &HashMap<NodeId, Node>,
scheduler: &mut Scheduler,
shards: &mut [TenantShard],
) {
let mut loop_n = 0;
loop {
let mut schedule_context = ScheduleContext::default();
let mut any_changed = false;
for shard in shards.iter() {
schedule_context.avoid(&shard.intent.all_pageservers());
if let Some(attached) = shard.intent.get_attached() {
schedule_context.push_attached(*attached);
}
}
for shard in shards.iter_mut() {
let optimization = shard.optimize_attachment(nodes, &schedule_context);
if let Some(optimization) = optimization {
shard.apply_optimization(scheduler, optimization);
any_changed = true;
break;
}
let optimization = shard.optimize_secondary(scheduler, &schedule_context);
if let Some(optimization) = optimization {
shard.apply_optimization(scheduler, optimization);
any_changed = true;
break;
}
}
if !any_changed {
break;
}
// Assert no infinite loop
loop_n += 1;
assert!(loop_n < 1000);
}
}
/// Test the balancing behavior of shard scheduling: that it achieves a balance, and
/// that it converges.
#[test]
fn optimize_add_nodes() -> anyhow::Result<()> {
let nodes = make_test_nodes(4);
// Only show the scheduler a couple of nodes
let mut scheduler = Scheduler::new([].iter());
scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap());
let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4));
let mut schedule_context = ScheduleContext::default();
for shard in &mut shards {
assert!(shard
.schedule(&mut scheduler, &mut schedule_context)
.is_ok());
}
// We should see equal number of locations on the two nodes.
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
// Add another two nodes: we should see the shards spread out when their optimize
// methods are called
scheduler.node_upsert(nodes.get(&NodeId(3)).unwrap());
scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap());
optimize_til_idle(&nodes, &mut scheduler, &mut shards);
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2);
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2);
assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2);
for shard in shards.iter_mut() {
shard.intent.clear(&mut scheduler);
}
scheduler.consistency_check(nodes.values(), [&tenant_state].into_iter())?;
tenant_state.intent.clear(&mut scheduler);
Ok(())
}
}

View File

@@ -14,7 +14,9 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
use control_plane::safekeeper::SafekeeperNode;
use control_plane::storage_controller::StorageController;
use control_plane::{broker, local_env};
use pageserver_api::controller_api::PlacementPolicy;
use pageserver_api::controller_api::{
NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
};
use pageserver_api::models::{
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
};
@@ -1058,6 +1060,21 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
}
}
Some(("set-state", subcommand_args)) => {
let pageserver = get_pageserver(env, subcommand_args)?;
let scheduling = subcommand_args.get_one("scheduling");
let availability = subcommand_args.get_one("availability");
let storage_controller = StorageController::from_env(env);
storage_controller
.node_configure(NodeConfigureRequest {
node_id: pageserver.conf.id,
scheduling: scheduling.cloned(),
availability: availability.cloned(),
})
.await?;
}
Some(("status", subcommand_args)) => {
match get_pageserver(env, subcommand_args)?.check_status().await {
Ok(_) => println!("Page server is up and running"),
@@ -1498,6 +1515,12 @@ fn cli() -> Command {
.about("Restart local pageserver")
.arg(pageserver_config_args.clone())
)
.subcommand(Command::new("set-state")
.arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
.arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
.about("Set scheduling or availability state of pageserver node")
.arg(pageserver_config_args.clone())
)
)
.subcommand(
Command::new("storage_controller")

View File

@@ -389,10 +389,6 @@ impl PageServerNode {
.remove("image_creation_threshold")
.map(|x| x.parse::<usize>())
.transpose()?,
image_layer_creation_check_threshold: settings
.remove("image_layer_creation_check_threshold")
.map(|x| x.parse::<u8>())
.transpose()?,
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
walreceiver_connect_timeout: settings
.remove("walreceiver_connect_timeout")
@@ -505,12 +501,6 @@ impl PageServerNode {
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'image_creation_threshold' as non zero integer")?,
image_layer_creation_check_threshold: settings
.remove("image_layer_creation_check_threshold")
.map(|x| x.parse::<u8>())
.transpose()
.context("Failed to parse 'image_creation_check_threshold' as integer")?,
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
walreceiver_connect_timeout: settings
.remove("walreceiver_connect_timeout")

View File

@@ -1,23 +0,0 @@
[package]
name = "storcon_cli"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
anyhow.workspace = true
clap.workspace = true
comfy-table.workspace = true
hyper.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json = { workspace = true, features = ["raw_value"] }
thiserror.workspace = true
tokio.workspace = true
tracing.workspace = true
utils.workspace = true
workspace_hack.workspace = true

View File

@@ -1,587 +0,0 @@
use std::{collections::HashMap, str::FromStr};
use clap::{Parser, Subcommand};
use hyper::Method;
use pageserver_api::{
controller_api::{
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
TenantDescribeResponse, TenantPolicyRequest,
},
models::{
ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
TenantShardSplitRequest, TenantShardSplitResponse,
},
shard::{ShardStripeSize, TenantShardId},
};
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
use reqwest::Url;
use serde::{de::DeserializeOwned, Serialize};
use utils::id::{NodeId, TenantId};
use pageserver_api::controller_api::{
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
};
#[derive(Subcommand, Debug)]
enum Command {
/// Register a pageserver with the storage controller. This shouldn't usually be necessary,
/// since pageservers auto-register when they start up
NodeRegister {
#[arg(long)]
node_id: NodeId,
#[arg(long)]
listen_pg_addr: String,
#[arg(long)]
listen_pg_port: u16,
#[arg(long)]
listen_http_addr: String,
#[arg(long)]
listen_http_port: u16,
},
/// Modify a node's configuration in the storage controller
NodeConfigure {
#[arg(long)]
node_id: NodeId,
/// Availability is usually auto-detected based on heartbeats. Set 'offline' here to
/// manually mark a node offline
#[arg(long)]
availability: Option<NodeAvailabilityArg>,
/// Scheduling policy controls whether tenant shards may be scheduled onto this node.
#[arg(long)]
scheduling: Option<NodeSchedulingPolicy>,
},
/// Modify a tenant's policies in the storage controller
TenantPolicy {
#[arg(long)]
tenant_id: TenantId,
/// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`),
/// or is in the normal attached state with N secondary locations (`attached:N`)
#[arg(long)]
placement: Option<PlacementPolicyArg>,
/// Scheduling policy enables pausing the controller's scheduling activity involving this tenant. `active` is normal,
/// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents
/// all reconciliation activity including for scheduling changes already made. `pause` and `stop` can make a tenant
/// unavailable, and are only for use in emergencies.
#[arg(long)]
scheduling: Option<ShardSchedulingPolicyArg>,
},
/// List nodes known to the storage controller
Nodes {},
/// List tenants known to the storage controller
Tenants {},
/// Create a new tenant in the storage controller, and by extension on pageservers.
TenantCreate {
#[arg(long)]
tenant_id: TenantId,
},
/// Delete a tenant in the storage controller, and by extension on pageservers.
TenantDelete {
#[arg(long)]
tenant_id: TenantId,
},
/// Split an existing tenant into a higher number of shards than its current shard count.
TenantShardSplit {
#[arg(long)]
tenant_id: TenantId,
#[arg(long)]
shard_count: u8,
/// Optional, in 8kiB pages. e.g. set 2048 for 16MB stripes.
#[arg(long)]
stripe_size: Option<u32>,
},
/// Migrate the attached location for a tenant shard to a specific pageserver.
TenantShardMigrate {
#[arg(long)]
tenant_shard_id: TenantShardId,
#[arg(long)]
node: NodeId,
},
/// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
/// that is passed through to pageservers, and does not affect storage controller behavior.
TenantConfig {
#[arg(long)]
tenant_id: TenantId,
#[arg(long)]
config: String,
},
/// Attempt to balance the locations for a tenant across pageservers. This is a client-side
/// alternative to the storage controller's scheduling optimization behavior.
TenantScatter {
#[arg(long)]
tenant_id: TenantId,
},
/// Print details about a particular tenant, including all its shards' states.
TenantDescribe {
#[arg(long)]
tenant_id: TenantId,
},
}
#[derive(Parser)]
#[command(
author,
version,
about,
long_about = "CLI for Storage Controller Support/Debug"
)]
#[command(arg_required_else_help(true))]
struct Cli {
#[arg(long)]
/// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local`
api: Url,
#[arg(long)]
/// JWT token for authenticating with storage controller. Depending on the API used, this
/// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint
/// a token with both scopes to use with this tool.
jwt: Option<String>,
#[command(subcommand)]
command: Command,
}
#[derive(Debug, Clone)]
struct PlacementPolicyArg(PlacementPolicy);
impl FromStr for PlacementPolicyArg {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"detached" => Ok(Self(PlacementPolicy::Detached)),
"secondary" => Ok(Self(PlacementPolicy::Secondary)),
_ if s.starts_with("attached:") => {
let mut splitter = s.split(':');
let _prefix = splitter.next().unwrap();
match splitter.next().and_then(|s| s.parse::<usize>().ok()) {
Some(n) => Ok(Self(PlacementPolicy::Attached(n))),
None => Err(anyhow::anyhow!(
"Invalid format '{s}', a valid example is 'attached:1'"
)),
}
}
_ => Err(anyhow::anyhow!(
"Unknown placement policy '{s}', try detached,secondary,attached:<n>"
)),
}
}
}
#[derive(Debug, Clone)]
struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);
impl FromStr for ShardSchedulingPolicyArg {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self(ShardSchedulingPolicy::Active)),
"essential" => Ok(Self(ShardSchedulingPolicy::Essential)),
"pause" => Ok(Self(ShardSchedulingPolicy::Pause)),
"stop" => Ok(Self(ShardSchedulingPolicy::Stop)),
_ => Err(anyhow::anyhow!(
"Unknown scheduling policy '{s}', try active,essential,pause,stop"
)),
}
}
}
#[derive(Debug, Clone)]
struct NodeAvailabilityArg(NodeAvailabilityWrapper);
impl FromStr for NodeAvailabilityArg {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self(NodeAvailabilityWrapper::Active)),
"offline" => Ok(Self(NodeAvailabilityWrapper::Offline)),
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
}
}
}
struct Client {
base_url: Url,
jwt_token: Option<String>,
client: reqwest::Client,
}
impl Client {
fn new(base_url: Url, jwt_token: Option<String>) -> Self {
Self {
base_url,
jwt_token,
client: reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client"),
}
}
/// Simple HTTP request wrapper for calling into storage controller
async fn dispatch<RQ, RS>(
&self,
method: hyper::Method,
path: String,
body: Option<RQ>,
) -> mgmt_api::Result<RS>
where
RQ: Serialize + Sized,
RS: DeserializeOwned + Sized,
{
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
// for general purpose API access.
let url = Url::from_str(&format!(
"http://{}:{}/{path}",
self.base_url.host_str().unwrap(),
self.base_url.port().unwrap()
))
.unwrap();
let mut builder = self.client.request(method, url);
if let Some(body) = body {
builder = builder.json(&body)
}
if let Some(jwt_token) = &self.jwt_token {
builder = builder.header(
reqwest::header::AUTHORIZATION,
format!("Bearer {jwt_token}"),
);
}
let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
let response = response.error_from_body().await?;
response
.json()
.await
.map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
}
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let cli = Cli::parse();
let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());
let mut trimmed = cli.api.to_string();
trimmed.pop();
let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());
match cli.command {
Command::NodeRegister {
node_id,
listen_pg_addr,
listen_pg_port,
listen_http_addr,
listen_http_port,
} => {
storcon_client
.dispatch::<_, ()>(
Method::POST,
"control/v1/node".to_string(),
Some(NodeRegisterRequest {
node_id,
listen_pg_addr,
listen_pg_port,
listen_http_addr,
listen_http_port,
}),
)
.await?;
}
Command::TenantCreate { tenant_id } => {
vps_client
.tenant_create(&TenantCreateRequest {
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation: None,
shard_parameters: ShardParameters::default(),
placement_policy: Some(PlacementPolicy::Attached(1)),
config: TenantConfig::default(),
})
.await?;
}
Command::TenantDelete { tenant_id } => {
let status = vps_client
.tenant_delete(TenantShardId::unsharded(tenant_id))
.await?;
tracing::info!("Delete status: {}", status);
}
Command::Nodes {} => {
let resp = storcon_client
.dispatch::<(), Vec<NodeDescribeResponse>>(
Method::GET,
"control/v1/node".to_string(),
None,
)
.await?;
let mut table = comfy_table::Table::new();
table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
for node in resp {
table.add_row([
format!("{}", node.id),
node.listen_http_addr,
format!("{:?}", node.scheduling),
format!("{:?}", node.availability),
]);
}
println!("{table}");
}
Command::NodeConfigure {
node_id,
availability,
scheduling,
} => {
let req = NodeConfigureRequest {
node_id,
availability: availability.map(|a| a.0),
scheduling,
};
storcon_client
.dispatch::<_, ()>(
Method::PUT,
format!("control/v1/node/{node_id}/config"),
Some(req),
)
.await?;
}
Command::Tenants {} => {
let resp = storcon_client
.dispatch::<(), Vec<TenantDescribeResponse>>(
Method::GET,
"control/v1/tenant".to_string(),
None,
)
.await?;
let mut table = comfy_table::Table::new();
table.set_header([
"TenantId",
"ShardCount",
"StripeSize",
"Placement",
"Scheduling",
]);
for tenant in resp {
let shard_zero = tenant.shards.into_iter().next().unwrap();
table.add_row([
format!("{}", tenant.tenant_id),
format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
format!("{:?}", tenant.stripe_size),
format!("{:?}", tenant.policy),
format!("{:?}", shard_zero.scheduling_policy),
]);
}
println!("{table}");
}
Command::TenantPolicy {
tenant_id,
placement,
scheduling,
} => {
let req = TenantPolicyRequest {
scheduling: scheduling.map(|s| s.0),
placement: placement.map(|p| p.0),
};
storcon_client
.dispatch::<_, ()>(
Method::PUT,
format!("control/v1/tenant/{tenant_id}/policy"),
Some(req),
)
.await?;
}
Command::TenantShardSplit {
tenant_id,
shard_count,
stripe_size,
} => {
let req = TenantShardSplitRequest {
new_shard_count: shard_count,
new_stripe_size: stripe_size.map(ShardStripeSize),
};
let response = storcon_client
.dispatch::<TenantShardSplitRequest, TenantShardSplitResponse>(
Method::PUT,
format!("control/v1/tenant/{tenant_id}/shard_split"),
Some(req),
)
.await?;
println!(
"Split tenant {} into {} shards: {}",
tenant_id,
shard_count,
response
.new_shards
.iter()
.map(|s| format!("{:?}", s))
.collect::<Vec<_>>()
.join(",")
);
}
Command::TenantShardMigrate {
tenant_shard_id,
node,
} => {
let req = TenantShardMigrateRequest {
tenant_shard_id,
node_id: node,
};
storcon_client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
Some(req),
)
.await?;
}
Command::TenantConfig { tenant_id, config } => {
let tenant_conf = serde_json::from_str(&config)?;
vps_client
.tenant_config(&TenantConfigRequest {
tenant_id,
config: tenant_conf,
})
.await?;
}
Command::TenantScatter { tenant_id } => {
// Find the shards
let locate_response = storcon_client
.dispatch::<(), TenantLocateResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}/locate"),
None,
)
.await?;
let shards = locate_response.shards;
let mut node_to_shards: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
let shard_count = shards.len();
for s in shards {
let entry = node_to_shards.entry(s.node_id).or_default();
entry.push(s.shard_id);
}
// Load list of available nodes
let nodes_resp = storcon_client
.dispatch::<(), Vec<NodeDescribeResponse>>(
Method::GET,
"control/v1/node".to_string(),
None,
)
.await?;
for node in nodes_resp {
if matches!(node.availability, NodeAvailabilityWrapper::Active) {
node_to_shards.entry(node.id).or_default();
}
}
let max_shard_per_node = shard_count / node_to_shards.len();
loop {
let mut migrate_shard = None;
for shards in node_to_shards.values_mut() {
if shards.len() > max_shard_per_node {
// Pick the emptiest
migrate_shard = Some(shards.pop().unwrap());
}
}
let Some(migrate_shard) = migrate_shard else {
break;
};
// Pick the emptiest node to migrate to
let mut destinations = node_to_shards
.iter()
.map(|(k, v)| (k, v.len()))
.collect::<Vec<_>>();
destinations.sort_by_key(|i| i.1);
let (destination_node, destination_count) = *destinations.first().unwrap();
if destination_count + 1 > max_shard_per_node {
// Even the emptiest destination doesn't have space: we're done
break;
}
let destination_node = *destination_node;
node_to_shards
.get_mut(&destination_node)
.unwrap()
.push(migrate_shard);
println!("Migrate {} -> {} ...", migrate_shard, destination_node);
storcon_client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
Method::PUT,
format!("control/v1/tenant/{migrate_shard}/migrate"),
Some(TenantShardMigrateRequest {
tenant_shard_id: migrate_shard,
node_id: destination_node,
}),
)
.await?;
println!("Migrate {} -> {} OK", migrate_shard, destination_node);
}
// Spread the shards across the nodes
}
Command::TenantDescribe { tenant_id } => {
let describe_response = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}"),
None,
)
.await?;
let shards = describe_response.shards;
let mut table = comfy_table::Table::new();
table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
for shard in shards {
let secondary = shard
.node_secondary
.iter()
.map(|n| format!("{}", n))
.collect::<Vec<_>>()
.join(",");
let mut status_parts = Vec::new();
if shard.is_reconciling {
status_parts.push("reconciling");
}
if shard.is_pending_compute_notification {
status_parts.push("pending_compute");
}
if shard.is_splitting {
status_parts.push("splitting");
}
let status = status_parts.join(",");
table.add_row([
format!("{}", shard.tenant_shard_id),
shard
.node_attached
.map(|n| format!("{}", n))
.unwrap_or(String::new()),
secondary,
shard.last_error,
status,
]);
}
println!("{table}");
}
}
Ok(())
}

View File

@@ -2,8 +2,8 @@
# see https://diesel.rs/guides/configuring-diesel-cli
[print_schema]
file = "storage_controller/src/schema.rs"
file = "control_plane/attachment_service/src/schema.rs"
custom_type_derives = ["diesel::query_builder::QueryId"]
[migrations_directory]
dir = "storage_controller/migrations"
dir = "control_plane/attachment_service/migrations"

View File

@@ -7,11 +7,6 @@ Below you will find a brief overview of each subdir in the source tree in alphab
Neon storage broker, providing messaging between safekeepers and pageservers.
[storage_broker.md](./storage_broker.md)
`storage_controller`:
Neon storage controller, manages a cluster of pageservers and exposes an API that enables
managing a many-sharded tenant as a single entity.
`/control_plane`:
Local control plane.

View File

@@ -2,9 +2,9 @@ use std::str::FromStr;
/// Request/response types for the storage controller
/// API (`/control/v1` prefix). Implemented by the server
/// in [`storage_controller::http`]
/// in [`attachment_service::http`]
use serde::{Deserialize, Serialize};
use utils::id::{NodeId, TenantId};
use utils::id::NodeId;
use crate::{
models::{ShardParameters, TenantConfig},
@@ -42,12 +42,6 @@ pub struct NodeConfigureRequest {
pub scheduling: Option<NodeSchedulingPolicy>,
}
#[derive(Serialize, Deserialize)]
pub struct TenantPolicyRequest {
pub placement: Option<PlacementPolicy>,
pub scheduling: Option<ShardSchedulingPolicy>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantLocateResponseShard {
pub shard_id: TenantShardId,
@@ -68,27 +62,12 @@ pub struct TenantLocateResponse {
#[derive(Serialize, Deserialize)]
pub struct TenantDescribeResponse {
pub tenant_id: TenantId,
pub shards: Vec<TenantDescribeResponseShard>,
pub stripe_size: ShardStripeSize,
pub policy: PlacementPolicy,
pub config: TenantConfig,
}
#[derive(Serialize, Deserialize)]
pub struct NodeDescribeResponse {
pub id: NodeId,
pub availability: NodeAvailabilityWrapper,
pub scheduling: NodeSchedulingPolicy,
pub listen_http_addr: String,
pub listen_http_port: u16,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct TenantDescribeResponseShard {
pub tenant_shard_id: TenantShardId,
@@ -104,8 +83,6 @@ pub struct TenantDescribeResponseShard {
pub is_pending_compute_notification: bool,
/// A shard split is currently underway
pub is_splitting: bool,
pub scheduling_policy: ShardSchedulingPolicy,
}
/// Explicitly migrating a particular shard is a low level operation
@@ -120,7 +97,7 @@ pub struct TenantShardMigrateRequest {
/// Utilisation score indicating how good a candidate a pageserver
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
/// Lower values are better.
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
pub struct UtilizationScore(pub u64);
impl UtilizationScore {
@@ -129,7 +106,7 @@ impl UtilizationScore {
}
}
#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
#[derive(Serialize, Clone, Copy)]
#[serde(into = "NodeAvailabilityWrapper")]
pub enum NodeAvailability {
// Normal, happy state
@@ -152,7 +129,7 @@ impl Eq for NodeAvailability {}
// This wrapper provides serde functionality and it should only be used to
// communicate with external callers which don't know or care about the
// utilisation score of the pageserver it is targeting.
#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
#[derive(Serialize, Deserialize, Clone)]
pub enum NodeAvailabilityWrapper {
Active,
Offline,
@@ -178,33 +155,22 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
}
}
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
pub enum ShardSchedulingPolicy {
// Normal mode: the tenant's scheduled locations may be updated at will, including
// for non-essential optimization.
Active,
impl FromStr for NodeAvailability {
type Err = anyhow::Error;
// Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy.
// For example, this still permits a node's attachment location to change to a secondary in
// response to a node failure, or to assign a new secondary if a node was removed.
Essential,
// No scheduling: leave the shard running wherever it currently is. Even if the shard is
// unavailable, it will not be rescheduled to another node.
Pause,
// No reconciling: we will make no location_conf API calls to pageservers at all. If the
// shard is unavailable, it stays that way. If a node fails, this shard doesn't get failed over.
Stop,
}
impl Default for ShardSchedulingPolicy {
fn default() -> Self {
Self::Active
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
// This is used when parsing node configuration requests from neon-local.
// Assume the worst possible utilisation score
// and let it get updated via the heartbeats.
"active" => Ok(Self::Active(UtilizationScore::worst())),
"offline" => Ok(Self::Offline),
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
}
}
}
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
pub enum NodeSchedulingPolicy {
Active,
Filling,

View File

@@ -301,7 +301,6 @@ pub struct TenantConfig {
pub heatmap_period: Option<String>,
pub lazy_slru_download: Option<bool>,
pub timeline_get_throttle: Option<ThrottleConfig>,
pub image_layer_creation_check_threshold: Option<u8>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]

View File

@@ -565,16 +565,6 @@ impl GenericRemoteStorage {
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StorageMetadata(HashMap<String, String>);
impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
fn from(arr: [(&str, &str); N]) -> Self {
let map: HashMap<String, String> = arr
.iter()
.map(|(k, v)| (k.to_string(), v.to_string()))
.collect();
Self(map)
}
}
/// External backup storage configuration, enough for creating a client for that storage.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteStorageConfig {

View File

@@ -57,6 +57,7 @@ enum MaybeEnabledStorage {
Disabled,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorage {
async fn setup() -> Self {
ensure_logging_ready();
@@ -85,6 +86,7 @@ struct AzureWithTestBlobs {
remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();
@@ -146,6 +148,7 @@ struct AzureWithSimpleTestBlobs {
remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();

View File

@@ -219,6 +219,7 @@ enum MaybeEnabledStorage {
Disabled,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorage {
async fn setup() -> Self {
ensure_logging_ready();
@@ -247,6 +248,7 @@ struct S3WithTestBlobs {
remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();
@@ -308,6 +310,7 @@ struct S3WithSimpleTestBlobs {
remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();

View File

@@ -182,18 +182,6 @@ where
}
}
/// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`.
pub fn would_wait_for(&self, num: V) -> Result<(), V> {
let internal = self.internal.lock().unwrap();
let cnt = internal.current.cnt_value();
drop(internal);
if cnt >= num {
Ok(())
} else {
Err(cnt)
}
}
/// Register and return a channel that will be notified when a number arrives,
/// or None, if it has already arrived.
fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {

View File

@@ -27,25 +27,25 @@
//!
//! # Reference Numbers
//!
//! 2024-04-04 on i3en.3xlarge
//! 2024-03-20 on i3en.3xlarge
//!
//! ```text
//! short/1 time: [25.925 µs 26.060 µs 26.209 µs]
//! short/2 time: [31.277 µs 31.483 µs 31.722 µs]
//! short/4 time: [45.496 µs 45.831 µs 46.182 µs]
//! short/8 time: [84.298 µs 84.920 µs 85.566 µs]
//! short/16 time: [185.04 µs 186.41 µs 187.88 µs]
//! short/32 time: [385.01 µs 386.77 µs 388.70 µs]
//! short/64 time: [770.24 µs 773.04 µs 776.04 µs]
//! short/128 time: [1.5017 ms 1.5064 ms 1.5113 ms]
//! medium/1 time: [106.65 µs 107.20 µs 107.85 µs]
//! medium/2 time: [153.28 µs 154.24 µs 155.56 µs]
//! medium/4 time: [325.67 µs 327.01 µs 328.71 µs]
//! medium/8 time: [646.82 µs 650.17 µs 653.91 µs]
//! medium/16 time: [1.2645 ms 1.2701 ms 1.2762 ms]
//! medium/32 time: [2.4409 ms 2.4550 ms 2.4692 ms]
//! medium/64 time: [4.6814 ms 4.7114 ms 4.7408 ms]
//! medium/128 time: [8.7790 ms 8.9037 ms 9.0282 ms]
//! short/1 time: [26.483 µs 26.614 µs 26.767 µs]
//! short/2 time: [32.223 µs 32.465 µs 32.767 µs]
//! short/4 time: [47.203 µs 47.583 µs 47.984 µs]
//! short/8 time: [89.135 µs 89.612 µs 90.139 µs]
//! short/16 time: [190.12 µs 191.52 µs 192.88 µs]
//! short/32 time: [380.96 µs 382.63 µs 384.20 µs]
//! short/64 time: [736.86 µs 741.07 µs 745.03 µs]
//! short/128 time: [1.4106 ms 1.4206 ms 1.4294 ms]
//! medium/1 time: [111.81 µs 112.25 µs 112.79 µs]
//! medium/2 time: [158.26 µs 159.13 µs 160.21 µs]
//! medium/4 time: [334.65 µs 337.14 µs 340.07 µs]
//! medium/8 time: [675.32 µs 679.91 µs 685.25 µs]
//! medium/16 time: [1.2929 ms 1.2996 ms 1.3067 ms]
//! medium/32 time: [2.4295 ms 2.4461 ms 2.4623 ms]
//! medium/64 time: [4.3973 ms 4.4458 ms 4.4875 ms]
//! medium/128 time: [7.5955 ms 7.7847 ms 7.9481 ms]
//! ```
use bytes::{Buf, Bytes};

View File

@@ -128,12 +128,12 @@ impl Client {
pub async fn timeline_info(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
force_await_logical_size: ForceAwaitLogicalSize,
) -> Result<pageserver_api::models::TimelineInfo> {
let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
self.mgmt_api_endpoint
);
@@ -151,11 +151,11 @@ impl Client {
pub async fn keyspace(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<pageserver_api::models::partitioning::Partitioning> {
let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/keyspace",
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace",
self.mgmt_api_endpoint
);
self.get(&uri)

View File

@@ -43,8 +43,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
fanout: u64,
ctx: &E::RequestContext,
) -> anyhow::Result<()> {
assert!(fanout >= 1, "fanout needs to be at least 1 but is {fanout}");
let exp_base = fanout.max(2);
assert!(fanout >= 2);
// Start at L0
let mut current_level_no = 0;
let mut current_level_target_height = target_file_size;
@@ -107,7 +106,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
break;
}
current_level_no += 1;
current_level_target_height = current_level_target_height.saturating_mul(exp_base);
current_level_target_height = current_level_target_height.saturating_mul(fanout);
}
Ok(())
}

View File

@@ -1,5 +1,4 @@
use anyhow::Context;
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
use pageserver_client::page_service::BasebackupRequest;
@@ -96,7 +95,7 @@ async fn main_impl(
let timeline = *timeline;
let info = mgmt_api_client
.timeline_info(
TenantShardId::unsharded(timeline.tenant_id),
timeline.tenant_id,
timeline.timeline_id,
ForceAwaitLogicalSize::No,
)

View File

@@ -4,7 +4,6 @@ use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
use pageserver_api::keyspace::KeySpaceAccum;
use pageserver_api::models::PagestreamGetPageRequest;
use pageserver_api::shard::TenantShardId;
use tokio_util::sync::CancellationToken;
use utils::id::TenantTimelineId;
use utils::lsn::Lsn;
@@ -174,10 +173,7 @@ async fn main_impl(
let timeline = *timeline;
async move {
let partitioning = mgmt_api_client
.keyspace(
TenantShardId::unsharded(timeline.tenant_id),
timeline.timeline_id,
)
.keyspace(timeline.tenant_id, timeline.timeline_id)
.await?;
let lsn = partitioning.at_lsn;
let start = Instant::now();

View File

@@ -1,7 +1,6 @@
use std::sync::Arc;
use humantime::Duration;
use pageserver_api::shard::TenantShardId;
use tokio::task::JoinSet;
use utils::id::TenantTimelineId;
@@ -60,11 +59,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
let mgmt_api_client = Arc::clone(&mgmt_api_client);
js.spawn(async move {
let info = mgmt_api_client
.timeline_info(
TenantShardId::unsharded(tl.tenant_id),
tl.timeline_id,
ForceAwaitLogicalSize::Yes,
)
.timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
.await
.unwrap();
@@ -79,11 +74,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
while !info.current_logical_size_is_accurate {
ticker.tick().await;
info = mgmt_api_client
.timeline_info(
TenantShardId::unsharded(tl.tenant_id),
tl.timeline_id,
ForceAwaitLogicalSize::Yes,
)
.timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
.await
.unwrap();
}

View File

@@ -18,7 +18,6 @@ use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
use pageserver::tenant::{secondary, TenantSharedResources};
use remote_storage::GenericRemoteStorage;
use tokio::signal::unix::SignalKind;
use tokio::time::Instant;
use tracing::*;
@@ -672,37 +671,42 @@ fn start_pageserver(
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
// All started up! Now just sit and wait for shutdown signal.
{
BACKGROUND_RUNTIME.block_on(async move {
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap();
let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap();
let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap();
let signal = tokio::select! {
_ = sigquit.recv() => {
info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",);
std::process::exit(111);
}
_ = sigint.recv() => { "SIGINT" },
_ = sigterm.recv() => { "SIGTERM" },
};
use signal_hook::consts::*;
let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || {
let mut signals =
signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap();
return signals
.forever()
.next()
.expect("forever() never returns None unless explicitly closed");
});
let signal = BACKGROUND_RUNTIME
.block_on(signal_handler)
.expect("join error");
match signal {
SIGQUIT => {
info!("Got signal {signal}. Terminating in immediate shutdown mode",);
std::process::exit(111);
}
SIGINT | SIGTERM => {
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
// This cancels the `shutdown_pageserver` cancellation tree.
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
let bg_remote_storage = remote_storage.clone();
let bg_deletion_queue = deletion_queue.clone();
pageserver::shutdown_pageserver(
&tenant_manager,
bg_remote_storage.map(|_| bg_deletion_queue),
0,
)
.await;
unreachable!()
})
// This cancels the `shutdown_pageserver` cancellation tree.
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
let bg_remote_storage = remote_storage.clone();
let bg_deletion_queue = deletion_queue.clone();
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
&tenant_manager,
bg_remote_storage.map(|_| bg_deletion_queue),
0,
));
unreachable!()
}
_ => unreachable!(),
}
}
}

View File

@@ -12,7 +12,7 @@ use pageserver_api::{
use serde::{de::DeserializeOwned, Serialize};
use tokio_util::sync::CancellationToken;
use url::Url;
use utils::{backoff, failpoint_support, generation::Generation, id::NodeId};
use utils::{backoff, generation::Generation, id::NodeId};
use crate::{
config::{NodeMetadata, PageServerConf},
@@ -210,10 +210,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
.collect(),
};
failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel);
if self.cancel.is_cancelled() {
return Err(RetryForeverError::ShuttingDown);
}
fail::fail_point!("control-plane-client-validate");
let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;

View File

@@ -1629,7 +1629,7 @@ components:
type: integer
format: int64
minimum: 0
description: The amount of disk space currently used.
description: The amount of disk space currently utilized by layer files.
free_space_bytes:
type: integer
format: int64

View File

@@ -993,26 +993,11 @@ async fn tenant_status(
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
// In tests, sometimes we want to query the state of a tenant without auto-activating it if it's currently waiting.
let activate = true;
#[cfg(feature = "testing")]
let activate = parse_query_param(&request, "activate")?.unwrap_or(activate);
let tenant_info = async {
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
if activate {
// This is advisory: we prefer to let the tenant activate on-demand when this function is
// called, but it is still valid to return 200 and describe the current state of the tenant
// if it doesn't make it into an active state.
tenant
.wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
.await
.ok();
}
// Calculate total physical size of all timelines
let mut current_physical_size = 0;
for timeline in tenant.list_timelines().iter() {

View File

@@ -8,7 +8,6 @@ use anyhow::{bail, ensure, Context, Result};
use bytes::Bytes;
use camino::Utf8Path;
use futures::StreamExt;
use pageserver_api::key::rel_block_to_key;
use tokio::io::{AsyncRead, AsyncReadExt};
use tokio_tar::Archive;
use tracing::*;
@@ -171,10 +170,7 @@ async fn import_rel(
let r = reader.read_exact(&mut buf).await;
match r {
Ok(_) => {
let key = rel_block_to_key(rel, blknum);
if modification.tline.get_shard_identity().is_key_local(&key) {
modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
}
modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
}
// TODO: UnexpectedEof is expected

View File

@@ -1483,18 +1483,12 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
});
pub(crate) struct WalIngestMetrics {
pub(crate) bytes_received: IntCounter,
pub(crate) records_received: IntCounter,
pub(crate) records_committed: IntCounter,
pub(crate) records_filtered: IntCounter,
}
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
bytes_received: register_int_counter!(
"pageserver_wal_ingest_bytes_received",
"Bytes of WAL ingested from safekeepers",
)
.unwrap(),
records_received: register_int_counter!(
"pageserver_wal_ingest_records_received",
"Number of WAL records received from safekeepers"

View File

@@ -876,13 +876,7 @@ impl PageServerHandler {
if lsn <= last_record_lsn {
lsn = last_record_lsn;
} else {
timeline
.wait_lsn(
lsn,
crate::tenant::timeline::WaitLsnWaiter::PageService,
ctx,
)
.await?;
timeline.wait_lsn(lsn, ctx).await?;
// Since we waited for 'lsn' to arrive, that is now the last
// record LSN. (Or close enough for our purposes; the
// last-record LSN can advance immediately after we return
@@ -894,13 +888,7 @@ impl PageServerHandler {
"invalid LSN(0) in request".into(),
));
}
timeline
.wait_lsn(
lsn,
crate::tenant::timeline::WaitLsnWaiter::PageService,
ctx,
)
.await?;
timeline.wait_lsn(lsn, ctx).await?;
}
if lsn < **latest_gc_cutoff_lsn {
@@ -1227,13 +1215,7 @@ impl PageServerHandler {
if let Some(lsn) = lsn {
// Backup was requested at a particular LSN. Wait for it to arrive.
info!("waiting for {}", lsn);
timeline
.wait_lsn(
lsn,
crate::tenant::timeline::WaitLsnWaiter::PageService,
ctx,
)
.await?;
timeline.wait_lsn(lsn, ctx).await?;
timeline
.check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
.context("invalid basebackup lsn")?;

View File

@@ -214,12 +214,13 @@ pub enum TaskKind {
/// Internally, `Client` hands over requests to the `Connection` object.
/// The `Connection` object is responsible for speaking the wire protocol.
///
/// Walreceiver uses a legacy abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
/// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
/// That abstraction doesn't use `task_mgr`.
/// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task.
/// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind.
///
/// Once the connection is established, the `TaskHandle` task spawns a
/// [`WalReceiverConnectionPoller`] task that is responsible for polling
/// Once the connection is established, the `TaskHandle` task creates a
/// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling
/// the `Connection` object.
/// A `CancellationToken` created by the `TaskHandle` task ensures
/// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped.
@@ -229,6 +230,7 @@ pub enum TaskKind {
WalReceiverManager,
/// The `TaskHandle` task that executes `handle_walreceiver_connection`.
/// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`.
/// See the comment on [`WalReceiverManager`].
///
/// [`WalReceiverManager`]: Self::WalReceiverManager

View File

@@ -12,7 +12,6 @@
//!
use anyhow::{bail, Context};
use arc_swap::ArcSwap;
use camino::Utf8Path;
use camino::Utf8PathBuf;
use enumset::EnumSet;
@@ -99,7 +98,7 @@ use std::ops::Bound::Included;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::sync::Mutex;
use std::sync::{Mutex, RwLock};
use std::time::{Duration, Instant};
use crate::span;
@@ -261,7 +260,7 @@ pub struct Tenant {
// We keep TenantConfOpt sturct here to preserve the information
// about parameters that are not set.
// This is necessary to allow global config updates.
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
tenant_conf: Arc<RwLock<AttachedTenantConf>>,
tenant_shard_id: TenantShardId,
@@ -1516,7 +1515,7 @@ impl Tenant {
// sizes etc. and that would get confused if the previous page versions
// are not in the repository yet.
ancestor_timeline
.wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx)
.wait_lsn(*lsn, ctx)
.await
.map_err(|e| match e {
e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
@@ -1607,7 +1606,7 @@ impl Tenant {
);
{
let conf = self.tenant_conf.load();
let conf = self.tenant_conf.read().unwrap();
if !conf.location.may_delete_layers_hint() {
info!("Skipping GC in location state {:?}", conf.location);
@@ -1634,7 +1633,7 @@ impl Tenant {
}
{
let conf = self.tenant_conf.load();
let conf = self.tenant_conf.read().unwrap();
if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
info!("Skipping compaction in location state {:?}", conf.location);
return Ok(());
@@ -1783,7 +1782,7 @@ impl Tenant {
async fn shutdown(
&self,
shutdown_progress: completion::Barrier,
shutdown_mode: timeline::ShutdownMode,
freeze_and_flush: bool,
) -> Result<(), completion::Barrier> {
span::debug_assert_current_span_has_tenant_id();
@@ -1830,8 +1829,16 @@ impl Tenant {
timelines.values().for_each(|timeline| {
let timeline = Arc::clone(timeline);
let timeline_id = timeline.timeline_id;
let span = tracing::info_span!("timeline_shutdown", %timeline_id, ?shutdown_mode);
js.spawn(async move { timeline.shutdown(shutdown_mode).instrument(span).await });
let span =
tracing::info_span!("timeline_shutdown", %timeline_id, ?freeze_and_flush);
js.spawn(async move {
if freeze_and_flush {
timeline.flush_and_shutdown().instrument(span).await
} else {
timeline.shutdown().instrument(span).await
}
});
})
};
// test_long_timeline_create_then_tenant_delete is leaning on this message
@@ -2075,14 +2082,14 @@ impl Tenant {
}
pub(crate) fn get_attach_mode(&self) -> AttachmentMode {
self.tenant_conf.load().location.attach_mode
self.tenant_conf.read().unwrap().location.attach_mode
}
/// For API access: generate a LocationConfig equivalent to the one that would be used to
/// create a Tenant in the same state. Do not use this in hot paths: it's for relatively
/// rare external API calls, like a reconciliation at startup.
pub(crate) fn get_location_conf(&self) -> models::LocationConfig {
let conf = self.tenant_conf.load();
let conf = self.tenant_conf.read().unwrap();
let location_config_mode = match conf.location.attach_mode {
AttachmentMode::Single => models::LocationConfigMode::AttachedSingle,
@@ -2229,7 +2236,7 @@ where
impl Tenant {
pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
self.tenant_conf.load().tenant_conf.clone()
self.tenant_conf.read().unwrap().tenant_conf.clone()
}
pub fn effective_config(&self) -> TenantConf {
@@ -2238,84 +2245,84 @@ impl Tenant {
}
pub fn get_checkpoint_distance(&self) -> u64 {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.checkpoint_distance
.unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
}
pub fn get_checkpoint_timeout(&self) -> Duration {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.checkpoint_timeout
.unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
}
pub fn get_compaction_target_size(&self) -> u64 {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.compaction_target_size
.unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
}
pub fn get_compaction_period(&self) -> Duration {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.compaction_period
.unwrap_or(self.conf.default_tenant_conf.compaction_period)
}
pub fn get_compaction_threshold(&self) -> usize {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.compaction_threshold
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
}
pub fn get_gc_horizon(&self) -> u64 {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.gc_horizon
.unwrap_or(self.conf.default_tenant_conf.gc_horizon)
}
pub fn get_gc_period(&self) -> Duration {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.gc_period
.unwrap_or(self.conf.default_tenant_conf.gc_period)
}
pub fn get_image_creation_threshold(&self) -> usize {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.image_creation_threshold
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
}
pub fn get_pitr_interval(&self) -> Duration {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.pitr_interval
.unwrap_or(self.conf.default_tenant_conf.pitr_interval)
}
pub fn get_trace_read_requests(&self) -> bool {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.trace_read_requests
.unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
}
pub fn get_min_resident_size_override(&self) -> Option<u64> {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.min_resident_size_override
.or(self.conf.default_tenant_conf.min_resident_size_override)
}
pub fn get_heatmap_period(&self) -> Option<Duration> {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
let heatmap_period = tenant_conf
.heatmap_period
.unwrap_or(self.conf.default_tenant_conf.heatmap_period);
@@ -2327,40 +2334,26 @@ impl Tenant {
}
pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
// Use read-copy-update in order to avoid overwriting the location config
// state if this races with [`Tenant::set_new_location_config`]. Note that
// this race is not possible if both request types come from the storage
// controller (as they should!) because an exclusive op lock is required
// on the storage controller side.
self.tenant_conf.rcu(|inner| {
Arc::new(AttachedTenantConf {
tenant_conf: new_tenant_conf.clone(),
location: inner.location,
})
});
self.tenant_conf_updated(&new_tenant_conf);
self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf;
self.tenant_conf_updated();
// Don't hold self.timelines.lock() during the notifies.
// There's no risk of deadlock right now, but there could be if we consolidate
// mutexes in struct Timeline in the future.
let timelines = self.list_timelines();
for timeline in timelines {
timeline.tenant_conf_updated(&new_tenant_conf);
timeline.tenant_conf_updated();
}
}
pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
let new_tenant_conf = new_conf.tenant_conf.clone();
self.tenant_conf.store(Arc::new(new_conf));
self.tenant_conf_updated(&new_tenant_conf);
*self.tenant_conf.write().unwrap() = new_conf;
self.tenant_conf_updated();
// Don't hold self.timelines.lock() during the notifies.
// There's no risk of deadlock right now, but there could be if we consolidate
// mutexes in struct Timeline in the future.
let timelines = self.list_timelines();
for timeline in timelines {
timeline.tenant_conf_updated(&new_tenant_conf);
timeline.tenant_conf_updated();
}
}
@@ -2374,8 +2367,11 @@ impl Tenant {
.unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone())
}
pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf);
pub(crate) fn tenant_conf_updated(&self) {
let conf = {
let guard = self.tenant_conf.read().unwrap();
Self::get_timeline_get_throttle_config(self.conf, &guard.tenant_conf)
};
self.timeline_get_throttle.reconfigure(conf)
}
@@ -2523,7 +2519,7 @@ impl Tenant {
Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
&crate::metrics::tenant_throttling::TIMELINE_GET,
)),
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
tenant_conf: Arc::new(RwLock::new(attached_conf)),
}
}
@@ -3509,7 +3505,7 @@ impl Tenant {
}
pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
self.tenant_conf.load().tenant_conf.clone()
self.tenant_conf.read().unwrap().tenant_conf.clone()
}
}
@@ -3657,9 +3653,6 @@ pub(crate) mod harness {
heatmap_period: Some(tenant_conf.heatmap_period),
lazy_slru_download: Some(tenant_conf.lazy_slru_download),
timeline_get_throttle: Some(tenant_conf.timeline_get_throttle),
image_layer_creation_check_threshold: Some(
tenant_conf.image_layer_creation_check_threshold,
),
}
}
}
@@ -3858,7 +3851,6 @@ mod tests {
use hex_literal::hex;
use pageserver_api::keyspace::KeySpace;
use rand::{thread_rng, Rng};
use tests::timeline::ShutdownMode;
static TEST_KEY: Lazy<Key> =
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
@@ -4304,7 +4296,7 @@ mod tests {
make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?;
// so that all uploads finish & we can call harness.load() below again
tenant
.shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
.shutdown(Default::default(), true)
.instrument(harness.span())
.await
.ok()
@@ -4345,7 +4337,7 @@ mod tests {
// so that all uploads finish & we can call harness.load() below again
tenant
.shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
.shutdown(Default::default(), true)
.instrument(harness.span())
.await
.ok()
@@ -5126,7 +5118,7 @@ mod tests {
// Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
let raw_tline = tline.raw_timeline().unwrap();
raw_tline
.shutdown(super::timeline::ShutdownMode::Hard)
.shutdown()
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID))
.await;
std::mem::forget(tline);

View File

@@ -57,9 +57,6 @@ pub mod defaults {
// throughputs up to 1GiB/s per timeline.
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
// By default ingest enough WAL for two new L0 layers before checking if new image
// image layers should be created.
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
}
@@ -365,10 +362,6 @@ pub struct TenantConf {
pub lazy_slru_download: bool,
pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
// How much WAL must be ingested before checking again whether a new image layer is required.
// Expresed in multiples of checkpoint distance.
pub image_layer_creation_check_threshold: u8,
}
/// Same as TenantConf, but this struct preserves the information about
@@ -461,9 +454,6 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
pub timeline_get_throttle: Option<pageserver_api::models::ThrottleConfig>,
#[serde(skip_serializing_if = "Option::is_none")]
pub image_layer_creation_check_threshold: Option<u8>,
}
impl TenantConfOpt {
@@ -518,9 +508,6 @@ impl TenantConfOpt {
.timeline_get_throttle
.clone()
.unwrap_or(global_conf.timeline_get_throttle),
image_layer_creation_check_threshold: self
.image_layer_creation_check_threshold
.unwrap_or(global_conf.image_layer_creation_check_threshold),
}
}
}
@@ -561,7 +548,6 @@ impl Default for TenantConf {
heatmap_period: Duration::ZERO,
lazy_slru_download: false,
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
}
}
}
@@ -635,7 +621,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
heatmap_period: value.heatmap_period.map(humantime),
lazy_slru_download: value.lazy_slru_download,
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
}
}
}

View File

@@ -14,10 +14,7 @@ use crate::{
config::PageServerConf,
context::RequestContext,
task_mgr::{self, TaskKind},
tenant::{
mgr::{TenantSlot, TenantsMapRemoveResult},
timeline::ShutdownMode,
},
tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
};
use super::{
@@ -466,7 +463,7 @@ impl DeleteTenantFlow {
// tenant.shutdown
// Its also bad that we're holding tenants.read here.
// TODO relax set_stopping to be idempotent?
if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() {
if tenant.shutdown(progress, false).await.is_err() {
return Err(DeleteTenantError::Other(anyhow::anyhow!(
"tenant shutdown is already in progress"
)));

View File

@@ -72,10 +72,6 @@ impl EphemeralFile {
self.len
}
pub(crate) fn id(&self) -> page_cache::FileId {
self.page_cache_file_id
}
pub(crate) async fn read_blk(
&self,
blknum: u32,

View File

@@ -346,6 +346,35 @@ where
}
}
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
pub enum InMemoryLayerHandle {
Open {
lsn_floor: Lsn,
end_lsn: Lsn,
},
Frozen {
idx: usize,
lsn_floor: Lsn,
end_lsn: Lsn,
},
}
impl InMemoryLayerHandle {
pub fn get_lsn_floor(&self) -> Lsn {
match self {
InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor,
InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor,
}
}
pub fn get_end_lsn(&self) -> Lsn {
match self {
InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn,
InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn,
}
}
}
impl LayerMap {
///
/// Find the latest layer (by lsn.end) that covers the given
@@ -547,18 +576,41 @@ impl LayerMap {
self.historic.iter()
}
/// Get a ref counted pointer for the first in memory layer that matches the provided predicate.
pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<Arc<InMemoryLayer>>
/// Get a handle for the first in memory layer that matches the provided predicate.
/// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer.
///
/// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during
/// the same exclusive region established by holding the layer manager lock.
pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<InMemoryLayerHandle>
where
Pred: FnMut(&Arc<InMemoryLayer>) -> bool,
{
if let Some(open) = &self.open_layer {
if pred(open) {
return Some(open.clone());
return Some(InMemoryLayerHandle::Open {
lsn_floor: open.get_lsn_range().start,
end_lsn: open.get_lsn_range().end,
});
}
}
self.frozen_layers.iter().rfind(|l| pred(l)).cloned()
let pos = self.frozen_layers.iter().rev().position(pred);
pos.map(|rev_idx| {
let idx = self.frozen_layers.len() - 1 - rev_idx;
InMemoryLayerHandle::Frozen {
idx,
lsn_floor: self.frozen_layers[idx].get_lsn_range().start,
end_lsn: self.frozen_layers[idx].get_lsn_range().end,
}
})
}
/// Get the layer pointed to by the provided handle.
pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option<Arc<InMemoryLayer>> {
match handle {
InMemoryLayerHandle::Open { .. } => self.open_layer.clone(),
InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(),
}
}
///

View File

@@ -44,7 +44,6 @@ use crate::tenant::config::{
use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
use crate::tenant::storage_layer::inmemory_layer;
use crate::tenant::timeline::ShutdownMode;
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
@@ -784,9 +783,11 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
shutdown_state.insert(tenant_shard_id, TenantSlot::Attached(t.clone()));
join_set.spawn(
async move {
let freeze_and_flush = true;
let res = {
let (_guard, shutdown_progress) = completion::channel();
t.shutdown(shutdown_progress, ShutdownMode::FreezeAndFlush).await
t.shutdown(shutdown_progress, freeze_and_flush).await
};
if let Err(other_progress) = res {
@@ -1106,7 +1107,7 @@ impl TenantManager {
};
info!("Shutting down attached tenant");
match tenant.shutdown(progress, ShutdownMode::Hard).await {
match tenant.shutdown(progress, false).await {
Ok(()) => {}
Err(barrier) => {
info!("Shutdown already in progress, waiting for it to complete");
@@ -1222,7 +1223,7 @@ impl TenantManager {
TenantSlot::Attached(tenant) => {
let (_guard, progress) = utils::completion::channel();
info!("Shutting down just-spawned tenant, because tenant manager is shut down");
match tenant.shutdown(progress, ShutdownMode::Hard).await {
match tenant.shutdown(progress, false).await {
Ok(()) => {
info!("Finished shutting down just-spawned tenant");
}
@@ -1272,7 +1273,7 @@ impl TenantManager {
};
let (_guard, progress) = utils::completion::channel();
match tenant.shutdown(progress, ShutdownMode::Hard).await {
match tenant.shutdown(progress, false).await {
Ok(()) => {
slot_guard.drop_old_value()?;
}
@@ -1648,14 +1649,7 @@ impl TenantManager {
fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!(
"failpoint"
)));
if let Err(e) = timeline
.wait_lsn(
*target_lsn,
crate::tenant::timeline::WaitLsnWaiter::Tenant,
ctx,
)
.await
{
if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
// Failure here might mean shutdown, in any case this part is an optimization
// and we shouldn't hold up the split operation.
tracing::warn!(
@@ -1676,7 +1670,7 @@ impl TenantManager {
// Phase 5: Shut down the parent shard, and erase it from disk
let (_guard, progress) = completion::channel();
match parent.shutdown(progress, ShutdownMode::Hard).await {
match parent.shutdown(progress, false).await {
Ok(()) => {}
Err(other) => {
other.wait().await;
@@ -2663,11 +2657,11 @@ where
let attached_tenant = match slot_guard.get_old_value() {
Some(TenantSlot::Attached(tenant)) => {
// whenever we remove a tenant from memory, we don't want to flush and wait for upload
let shutdown_mode = ShutdownMode::Hard;
let freeze_and_flush = false;
// shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
// that we can continue safely to cleanup.
match tenant.shutdown(progress, shutdown_mode).await {
match tenant.shutdown(progress, freeze_and_flush).await {
Ok(()) => {}
Err(_other) => {
// if pageserver shutdown or other detach/ignore is already ongoing, we don't want to

View File

@@ -200,7 +200,6 @@ use utils::backoff::{
use std::collections::{HashMap, VecDeque};
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::{Arc, Mutex};
use std::time::Duration;
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel};
use std::ops::DerefMut;
@@ -208,7 +207,7 @@ use tracing::{debug, error, info, instrument, warn};
use tracing::{info_span, Instrument};
use utils::lsn::Lsn;
use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
use crate::deletion_queue::DeletionQueueClient;
use crate::metrics::{
MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
@@ -262,10 +261,6 @@ pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";
/// Default buffer size when interfacing with [`tokio::fs::File`].
pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
/// Doing non-essential flushes of deletion queue is subject to this timeout, after
/// which we warn and skip.
const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10);
pub enum MaybeDeletedIndexPart {
IndexPart(IndexPart),
Deleted(IndexPart),
@@ -593,14 +588,14 @@ impl RemoteTimelineClient {
upload_queue: &mut UploadQueueInitialized,
metadata: TimelineMetadata,
) {
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
info!(
"scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)",
"scheduling metadata upload with {} files ({} changed)",
upload_queue.latest_files.len(),
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
);
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
let index_part = IndexPart::new(
upload_queue.latest_files.clone(),
disk_consistent_lsn,
@@ -1055,26 +1050,6 @@ impl RemoteTimelineClient {
Ok(())
}
async fn flush_deletion_queue(&self) -> Result<(), DeletionQueueError> {
match tokio::time::timeout(
DELETION_QUEUE_FLUSH_TIMEOUT,
self.deletion_queue_client.flush_immediate(),
)
.await
{
Ok(result) => result,
Err(_timeout) => {
// Flushing remote deletions is not mandatory: we flush here to make the system easier to test, and
// to ensure that _usually_ objects are really gone after a DELETE is acked. However, in case of deletion
// queue issues (https://github.com/neondatabase/neon/issues/6440), we don't want to wait indefinitely here.
tracing::warn!(
"Timed out waiting for deletion queue flush, acking deletion anyway"
);
Ok(())
}
}
}
/// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set.
/// The function deletes layer files one by one, then lists the prefix to see if we leaked something
/// deletes leaked files if any and proceeds with deletion of index file at the end.
@@ -1124,7 +1099,7 @@ impl RemoteTimelineClient {
// Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
// taking the burden of listing all the layers that we already know we should delete.
self.flush_deletion_queue().await?;
self.deletion_queue_client.flush_immediate().await?;
let cancel = shutdown_token();
@@ -1198,7 +1173,7 @@ impl RemoteTimelineClient {
// Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait
// for a flush to a persistent deletion list so that we may be sure deletion will occur.
self.flush_deletion_queue().await?;
self.deletion_queue_client.flush_immediate().await?;
fail::fail_point!("timeline-delete-after-index-delete", |_| {
Err(anyhow::anyhow!(
@@ -1594,7 +1569,7 @@ impl RemoteTimelineClient {
/// Use [`RemoteTimelineClient::shutdown`] for graceful stop.
///
/// In-progress operations will still be running after this function returns.
/// Use `task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), Some(self.tenant_shard_id), Some(timeline_id))`
/// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
/// to wait for them to complete, after calling this function.
pub(crate) fn stop(&self) {
// Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue

View File

@@ -786,35 +786,6 @@ impl<'a> TenantDownloader<'a> {
// Existing on-disk layers: just update their access time.
if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
tracing::debug!("Layer {} is already on disk", layer.name);
if cfg!(debug_assertions) {
// Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
// are already present on disk are really there.
let local_path = self
.conf
.timeline_path(tenant_shard_id, &timeline.timeline_id)
.join(layer.name.file_name());
match tokio::fs::metadata(&local_path).await {
Ok(meta) => {
tracing::debug!(
"Layer {} present at {}, size {}",
layer.name,
local_path,
meta.len(),
);
}
Err(e) => {
tracing::warn!(
"Layer {} not found at {} ({})",
layer.name,
local_path,
e
);
debug_assert!(false);
}
}
}
if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
|| on_disk.access_time != layer.access_time
{

View File

@@ -25,7 +25,7 @@ use std::cmp::{Ordering, Reverse};
use std::collections::hash_map::Entry;
use std::collections::{BinaryHeap, HashMap};
use std::ops::Range;
use std::sync::{Arc, Mutex};
use std::sync::Mutex;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use tracing::warn;
use utils::history_buffer::HistoryBufferWithDropCounter;
@@ -41,8 +41,8 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
use self::inmemory_layer::InMemoryLayerFileId;
use super::layer_map::InMemoryLayerHandle;
use super::timeline::layer_manager::LayerManager;
use super::timeline::GetVectoredError;
use super::PageReconstructError;
@@ -204,30 +204,23 @@ impl Default for ValuesReconstructState {
}
}
/// A key that uniquely identifies a layer in a timeline
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
pub(crate) enum LayerId {
PersitentLayerId(PersistentLayerKey),
InMemoryLayerId(InMemoryLayerFileId),
/// Description of layer to be read - the layer map can turn
/// this description into the actual layer.
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
pub(crate) enum ReadableLayerDesc {
Persistent {
desc: PersistentLayerDesc,
lsn_range: Range<Lsn>,
},
InMemory {
handle: InMemoryLayerHandle,
lsn_ceil: Lsn,
},
}
/// Layer wrapper for the read path. Note that it is valid
/// to use these layers even after external operations have
/// been performed on them (compaction, freeze, etc.).
/// Wraper for 'ReadableLayerDesc' sorted by Lsn
#[derive(Debug)]
pub(crate) enum ReadableLayer {
PersistentLayer(Layer),
InMemoryLayer(Arc<InMemoryLayer>),
}
/// A partial description of a read to be done.
#[derive(Debug, Clone)]
struct ReadDesc {
/// An id used to resolve the readable layer within the fringe
layer_id: LayerId,
/// Lsn range for the read, used for selecting the next read
lsn_range: Range<Lsn>,
}
struct ReadableLayerDescOrdered(ReadableLayerDesc);
/// Data structure which maintains a fringe of layers for the
/// read path. The fringe is the set of layers which intersects
@@ -238,64 +231,41 @@ struct ReadDesc {
/// a two layer indexing scheme.
#[derive(Debug)]
pub(crate) struct LayerFringe {
planned_reads_by_lsn: BinaryHeap<ReadDesc>,
layers: HashMap<LayerId, LayerKeyspace>,
}
#[derive(Debug)]
struct LayerKeyspace {
layer: ReadableLayer,
target_keyspace: KeySpace,
layers_by_lsn: BinaryHeap<ReadableLayerDescOrdered>,
layers: HashMap<ReadableLayerDesc, KeySpace>,
}
impl LayerFringe {
pub(crate) fn new() -> Self {
LayerFringe {
planned_reads_by_lsn: BinaryHeap::new(),
layers_by_lsn: BinaryHeap::new(),
layers: HashMap::new(),
}
}
pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
let read_desc = match self.planned_reads_by_lsn.pop() {
Some(desc) => desc,
pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> {
let handle = match self.layers_by_lsn.pop() {
Some(h) => h,
None => return None,
};
let removed = self.layers.remove_entry(&read_desc.layer_id);
let removed = self.layers.remove_entry(&handle.0);
match removed {
Some((
_,
LayerKeyspace {
layer,
target_keyspace,
},
)) => Some((layer, target_keyspace, read_desc.lsn_range)),
Some((layer, keyspace)) => Some((layer, keyspace)),
None => unreachable!("fringe internals are always consistent"),
}
}
pub(crate) fn update(
&mut self,
layer: ReadableLayer,
keyspace: KeySpace,
lsn_range: Range<Lsn>,
) {
let layer_id = layer.id();
let entry = self.layers.entry(layer_id.clone());
pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) {
let entry = self.layers.entry(layer.clone());
match entry {
Entry::Occupied(mut entry) => {
entry.get_mut().target_keyspace.merge(&keyspace);
entry.get_mut().merge(&keyspace);
}
Entry::Vacant(entry) => {
self.planned_reads_by_lsn.push(ReadDesc {
lsn_range,
layer_id: layer_id.clone(),
});
entry.insert(LayerKeyspace {
layer,
target_keyspace: keyspace,
});
self.layers_by_lsn
.push(ReadableLayerDescOrdered(entry.key().clone()));
entry.insert(keyspace);
}
}
}
@@ -307,55 +277,77 @@ impl Default for LayerFringe {
}
}
impl Ord for ReadDesc {
impl Ord for ReadableLayerDescOrdered {
fn cmp(&self, other: &Self) -> Ordering {
let ord = self.lsn_range.end.cmp(&other.lsn_range.end);
let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil());
if ord == std::cmp::Ordering::Equal {
self.lsn_range.start.cmp(&other.lsn_range.start).reverse()
self.0
.get_lsn_floor()
.cmp(&other.0.get_lsn_floor())
.reverse()
} else {
ord
}
}
}
impl PartialOrd for ReadDesc {
impl PartialOrd for ReadableLayerDescOrdered {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl PartialEq for ReadDesc {
impl PartialEq for ReadableLayerDescOrdered {
fn eq(&self, other: &Self) -> bool {
self.lsn_range == other.lsn_range
self.0.get_lsn_floor() == other.0.get_lsn_floor()
&& self.0.get_lsn_ceil() == other.0.get_lsn_ceil()
}
}
impl Eq for ReadDesc {}
impl Eq for ReadableLayerDescOrdered {}
impl ReadableLayer {
pub(crate) fn id(&self) -> LayerId {
impl ReadableLayerDesc {
pub(crate) fn get_lsn_floor(&self) -> Lsn {
match self {
Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()),
Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.file_id()),
ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start,
ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(),
}
}
pub(crate) fn get_lsn_ceil(&self) -> Lsn {
match self {
ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end,
ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil,
}
}
pub(crate) async fn get_values_reconstruct_data(
&self,
layer_manager: &LayerManager,
keyspace: KeySpace,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
match self {
ReadableLayer::PersistentLayer(layer) => {
ReadableLayerDesc::Persistent { desc, lsn_range } => {
let layer = layer_manager.get_from_desc(desc);
layer
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
.get_values_reconstruct_data(
keyspace,
lsn_range.clone(),
reconstruct_state,
ctx,
)
.await
}
ReadableLayer::InMemoryLayer(layer) => {
ReadableLayerDesc::InMemory { handle, lsn_ceil } => {
let layer = layer_manager
.layer_map()
.get_in_memory_layer(handle)
.unwrap();
layer
.get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
.get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
.await
}
}

View File

@@ -47,7 +47,6 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::BytesMut;
use camino::{Utf8Path, Utf8PathBuf};
use futures::StreamExt;
use itertools::Itertools;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::LayerAccessKind;
use pageserver_api::shard::TenantShardId;
@@ -947,34 +946,6 @@ impl DeltaLayerInner {
Ok(planner.finish())
}
fn get_min_read_buffer_size(
planned_reads: &[VectoredRead],
read_size_soft_max: usize,
) -> usize {
let Some(largest_read) = planned_reads.iter().max_by_key(|read| read.size()) else {
return read_size_soft_max;
};
let largest_read_size = largest_read.size();
if largest_read_size > read_size_soft_max {
// If the read is oversized, it should only contain one key.
let offenders = largest_read
.blobs_at
.as_slice()
.iter()
.map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
.join(", ");
tracing::warn!(
"Oversized vectored read ({} > {}) for keys {}",
largest_read_size,
read_size_soft_max,
offenders
);
}
largest_read_size
}
async fn do_reads_and_update_state(
&self,
reads: Vec<VectoredRead>,
@@ -988,8 +959,7 @@ impl DeltaLayerInner {
.expect("Layer is loaded with max vectored bytes config")
.0
.into();
let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
let mut buf = Some(BytesMut::with_capacity(buf_size));
let mut buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
// Note that reads are processed in reverse order (from highest key+lsn).
// This is the order that `ReconstructState` requires such that it can
@@ -1016,7 +986,7 @@ impl DeltaLayerInner {
// We have "lost" the buffer since the lower level IO api
// doesn't return the buffer on error. Allocate a new one.
buf = Some(BytesMut::with_capacity(buf_size));
buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
continue;
}
@@ -1240,16 +1210,9 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del
mod test {
use std::collections::BTreeMap;
use itertools::MinMaxResult;
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
use rand::RngCore;
use super::*;
use crate::{
context::DownloadBehavior,
task_mgr::TaskKind,
tenant::{disk_btree::tests::TestDisk, harness::TenantHarness},
DEFAULT_PG_VERSION,
context::DownloadBehavior, task_mgr::TaskKind, tenant::disk_btree::tests::TestDisk,
};
/// Construct an index for a fictional delta layer and and then
@@ -1369,229 +1332,4 @@ mod test {
assert_eq!(planned_blobs, expected_blobs);
}
mod constants {
use utils::lsn::Lsn;
/// Offset used by all lsns in this test
pub(super) const LSN_OFFSET: Lsn = Lsn(0x08);
/// Number of unique keys including in the test data
pub(super) const KEY_COUNT: u8 = 60;
/// Max number of different lsns for each key
pub(super) const MAX_ENTRIES_PER_KEY: u8 = 20;
/// Possible value sizes for each key along with a probability weight
pub(super) const VALUE_SIZES: [(usize, u8); 3] = [(100, 2), (1024, 2), (1024 * 1024, 1)];
/// Probability that there will be a gap between the current key and the next one (33.3%)
pub(super) const KEY_GAP_CHANGES: [(bool, u8); 2] = [(true, 1), (false, 2)];
/// The minimum size of a key range in all the generated reads
pub(super) const MIN_RANGE_SIZE: i128 = 10;
/// The number of ranges included in each vectored read
pub(super) const RANGES_COUNT: u8 = 2;
/// The number of vectored reads performed
pub(super) const READS_COUNT: u8 = 100;
/// Soft max size of a vectored read. Will be violated if we have to read keys
/// with values larger than the limit
pub(super) const MAX_VECTORED_READ_BYTES: usize = 64 * 1024;
}
struct Entry {
key: Key,
lsn: Lsn,
value: Vec<u8>,
}
fn generate_entries(rng: &mut StdRng) -> Vec<Entry> {
let mut current_key = Key::MIN;
let mut entries = Vec::new();
for _ in 0..constants::KEY_COUNT {
let count = rng.gen_range(1..constants::MAX_ENTRIES_PER_KEY);
let mut lsns_iter =
std::iter::successors(Some(Lsn(constants::LSN_OFFSET.0 + 0x08)), |lsn| {
Some(Lsn(lsn.0 + 0x08))
});
let mut lsns = Vec::new();
while lsns.len() < count as usize {
let take = rng.gen_bool(0.5);
let lsn = lsns_iter.next().unwrap();
if take {
lsns.push(lsn);
}
}
for lsn in lsns {
let size = constants::VALUE_SIZES
.choose_weighted(rng, |item| item.1)
.unwrap()
.0;
let mut buf = vec![0; size];
rng.fill_bytes(&mut buf);
entries.push(Entry {
key: current_key,
lsn,
value: buf,
})
}
let gap = constants::KEY_GAP_CHANGES
.choose_weighted(rng, |item| item.1)
.unwrap()
.0;
if gap {
current_key = current_key.add(2);
} else {
current_key = current_key.add(1);
}
}
entries
}
struct EntriesMeta {
key_range: Range<Key>,
lsn_range: Range<Lsn>,
index: BTreeMap<(Key, Lsn), Vec<u8>>,
}
fn get_entries_meta(entries: &[Entry]) -> EntriesMeta {
let key_range = match entries.iter().minmax_by_key(|e| e.key) {
MinMaxResult::MinMax(min, max) => min.key..max.key.next(),
_ => panic!("More than one entry is always expected"),
};
let lsn_range = match entries.iter().minmax_by_key(|e| e.lsn) {
MinMaxResult::MinMax(min, max) => min.lsn..Lsn(max.lsn.0 + 1),
_ => panic!("More than one entry is always expected"),
};
let mut index = BTreeMap::new();
for entry in entries.iter() {
index.insert((entry.key, entry.lsn), entry.value.clone());
}
EntriesMeta {
key_range,
lsn_range,
index,
}
}
fn pick_random_keyspace(rng: &mut StdRng, key_range: &Range<Key>) -> KeySpace {
let start = key_range.start.to_i128();
let end = key_range.end.to_i128();
let mut keyspace = KeySpace::default();
for _ in 0..constants::RANGES_COUNT {
let mut range: Option<Range<Key>> = Option::default();
while range.is_none() || keyspace.overlaps(range.as_ref().unwrap()) {
let range_start = rng.gen_range(start..end);
let range_end_offset = range_start + constants::MIN_RANGE_SIZE;
if range_end_offset >= end {
range = Some(Key::from_i128(range_start)..Key::from_i128(end));
} else {
let range_end = rng.gen_range((range_start + constants::MIN_RANGE_SIZE)..end);
range = Some(Key::from_i128(range_start)..Key::from_i128(range_end));
}
}
keyspace.ranges.push(range.unwrap());
}
keyspace
}
#[tokio::test]
async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?;
let (tenant, ctx) = harness.load().await;
let timeline_id = TimelineId::generate();
let timeline = tenant
.create_test_timeline(timeline_id, constants::LSN_OFFSET, DEFAULT_PG_VERSION, &ctx)
.await?;
tracing::info!("Generating test data ...");
let rng = &mut StdRng::seed_from_u64(0);
let entries = generate_entries(rng);
let entries_meta = get_entries_meta(&entries);
tracing::info!("Done generating {} entries", entries.len());
tracing::info!("Writing test data to delta layer ...");
let mut writer = DeltaLayerWriter::new(
harness.conf,
timeline_id,
harness.tenant_shard_id,
entries_meta.key_range.start,
entries_meta.lsn_range.clone(),
)
.await?;
for entry in entries {
let (_, res) = writer
.put_value_bytes(entry.key, entry.lsn, entry.value, false)
.await;
res?;
}
let resident = writer.finish(entries_meta.key_range.end, &timeline).await?;
let inner = resident.get_inner_delta(&ctx).await?;
let file_size = inner.file.metadata().await?.len();
tracing::info!(
"Done writing test data to delta layer. Resulting file size is: {}",
file_size
);
for i in 0..constants::READS_COUNT {
tracing::info!("Doing vectored read {}/{}", i + 1, constants::READS_COUNT);
let block_reader = FileBlockReader::new(&inner.file, inner.file_id);
let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
inner.index_start_blk,
inner.index_root_blk,
block_reader,
);
let planner = VectoredReadPlanner::new(constants::MAX_VECTORED_READ_BYTES);
let mut reconstruct_state = ValuesReconstructState::new();
let keyspace = pick_random_keyspace(rng, &entries_meta.key_range);
let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64;
let vectored_reads = DeltaLayerInner::plan_reads(
keyspace.clone(),
entries_meta.lsn_range.clone(),
data_end_offset,
index_reader,
planner,
&mut reconstruct_state,
&ctx,
)
.await?;
let vectored_blob_reader = VectoredBlobReader::new(&inner.file);
let buf_size = DeltaLayerInner::get_min_read_buffer_size(
&vectored_reads,
constants::MAX_VECTORED_READ_BYTES,
);
let mut buf = Some(BytesMut::with_capacity(buf_size));
for read in vectored_reads {
let blobs_buf = vectored_blob_reader
.read_blobs(&read, buf.take().expect("Should have a buffer"))
.await?;
for meta in blobs_buf.blobs.iter() {
let value = &blobs_buf.buf[meta.start..meta.end];
assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]);
}
buf = Some(blobs_buf.buf);
}
}
Ok(())
}
}

View File

@@ -44,7 +44,6 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::{Bytes, BytesMut};
use camino::{Utf8Path, Utf8PathBuf};
use hex;
use itertools::Itertools;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::LayerAccessKind;
use pageserver_api::shard::TenantShardId;
@@ -541,25 +540,7 @@ impl ImageLayerInner {
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
for read in reads.into_iter() {
let buf_size = read.size();
if buf_size > max_vectored_read_bytes {
// If the read is oversized, it should only contain one key.
let offenders = read
.blobs_at
.as_slice()
.iter()
.map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
.join(", ");
tracing::warn!(
"Oversized vectored read ({} > {}) for keys {}",
buf_size,
max_vectored_read_bytes,
offenders
);
}
let buf = BytesMut::with_capacity(buf_size);
let buf = BytesMut::with_capacity(max_vectored_read_bytes);
let res = vectored_blob_reader.read_blobs(&read, buf).await;
match res {

View File

@@ -12,7 +12,7 @@ use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::storage_layer::ValueReconstructResult;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::{PageReconstructError, Timeline};
use crate::{page_cache, walrecord};
use crate::walrecord;
use anyhow::{anyhow, ensure, Result};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::InMemoryLayerInfo;
@@ -36,14 +36,10 @@ use super::{
ValuesReconstructState,
};
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
pub struct InMemoryLayer {
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
file_id: InMemoryLayerFileId,
/// This layer contains all the changes from 'start_lsn'. The
/// start is inclusive.
@@ -204,10 +200,6 @@ pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources {
};
impl InMemoryLayer {
pub(crate) fn file_id(&self) -> InMemoryLayerFileId {
self.file_id
}
pub(crate) fn get_timeline_id(&self) -> TimelineId {
self.timeline_id
}
@@ -451,10 +443,8 @@ impl InMemoryLayer {
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
let key = InMemoryLayerFileId(file.id());
Ok(InMemoryLayer {
file_id: key,
conf,
timeline_id,
tenant_shard_id,

View File

@@ -1759,18 +1759,6 @@ impl ResidentLayer {
pub(crate) fn metadata(&self) -> LayerFileMetadata {
self.owner.metadata()
}
#[cfg(test)]
pub(crate) async fn get_inner_delta<'a>(
&'a self,
ctx: &RequestContext,
) -> anyhow::Result<&'a delta_layer::DeltaLayerInner> {
let owner = &self.owner.0;
match self.downloaded.get(owner, ctx).await? {
LayerKind::Delta(d) => Ok(d),
LayerKind::Image(_) => Err(anyhow::anyhow!("Expected a delta layer")),
}
}
}
impl AsLayerDesc for ResidentLayer {

View File

@@ -9,7 +9,6 @@ pub mod uninit;
mod walreceiver;
use anyhow::{anyhow, bail, ensure, Context, Result};
use arc_swap::ArcSwap;
use bytes::Bytes;
use camino::Utf8Path;
use enumset::EnumSet;
@@ -119,11 +118,11 @@ use self::layer_manager::LayerManager;
use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::config::TenantConf;
use super::remote_timeline_client::RemoteTimelineClient;
use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
use super::{config::TenantConf, storage_layer::ReadableLayerDesc};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub(super) enum FlushLoopState {
@@ -184,7 +183,7 @@ pub(crate) struct AuxFilesState {
pub struct Timeline {
conf: &'static PageServerConf,
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
tenant_conf: Arc<RwLock<AttachedTenantConf>>,
myself: Weak<Self>,
@@ -282,12 +281,10 @@ pub struct Timeline {
pub(super) flush_loop_state: Mutex<FlushLoopState>,
/// layer_flush_start_tx can be used to wake up the layer-flushing task.
/// - The u64 value is a counter, incremented every time a new flush cycle is requested.
/// The flush cycle counter is sent back on the layer_flush_done channel when
/// the flush finishes. You can use that to wait for the flush to finish.
/// - The LSN is updated to max() of its current value and the latest disk_consistent_lsn
/// read by whoever sends an update
layer_flush_start_tx: tokio::sync::watch::Sender<(u64, Lsn)>,
/// The value is a counter, incremented every time a new flush cycle is requested.
/// The flush cycle counter is sent back on the layer_flush_done channel when
/// the flush finishes. You can use that to wait for the flush to finish.
layer_flush_start_tx: tokio::sync::watch::Sender<u64>,
/// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>,
@@ -312,8 +309,6 @@ pub struct Timeline {
/// Configuration: how often should the partitioning be recalculated.
repartition_threshold: u64,
last_image_layer_creation_check_at: AtomicLsn,
/// Current logical size of the "datadir", at the last LSN.
current_logical_size: LogicalSize,
@@ -615,25 +610,6 @@ pub enum GetVectoredImpl {
Vectored,
}
pub(crate) enum WaitLsnWaiter<'a> {
Timeline(&'a Timeline),
Tenant,
PageService,
}
/// Argument to [`Timeline::shutdown`].
#[derive(Debug, Clone, Copy)]
pub(crate) enum ShutdownMode {
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
/// also to remote storage. This method can easily take multiple seconds for a busy timeline.
///
/// While we are flushing, we continue to accept read I/O for LSNs ingested before
/// the call to [`Timeline::shutdown`].
FreezeAndFlush,
/// Shut down immediately, without waiting for any open layers to flush.
Hard,
}
/// Public interface functions
impl Timeline {
/// Get the LSN where this branch was created
@@ -1082,8 +1058,7 @@ impl Timeline {
pub(crate) async fn wait_lsn(
&self,
lsn: Lsn,
who_is_waiting: WaitLsnWaiter<'_>,
ctx: &RequestContext, /* Prepare for use by cancellation */
_ctx: &RequestContext, /* Prepare for use by cancellation */
) -> Result<(), WaitLsnError> {
if self.cancel.is_cancelled() {
return Err(WaitLsnError::Shutdown);
@@ -1091,28 +1066,20 @@ impl Timeline {
return Err(WaitLsnError::BadState);
}
if cfg!(debug_assertions) {
match ctx.task_kind() {
TaskKind::WalReceiverManager
| TaskKind::WalReceiverConnectionHandler
| TaskKind::WalReceiverConnectionPoller => {
let is_myself = match who_is_waiting {
WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself),
WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()),
};
if is_myself {
if let Err(current) = self.last_record_lsn.would_wait_for(lsn) {
// walingest is the only one that can advance last_record_lsn; it should make sure to never reach here
panic!("this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock");
}
} else {
// if another timeline's is waiting for us, there's no deadlock risk because
// our walreceiver task can make progress independent of theirs
}
}
_ => {}
}
}
// This should never be called from the WAL receiver, because that could lead
// to a deadlock.
debug_assert!(
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager),
"wait_lsn cannot be called in WAL receiver"
);
debug_assert!(
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler),
"wait_lsn cannot be called in WAL receiver"
);
debug_assert!(
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller),
"wait_lsn cannot be called in WAL receiver"
);
let _timer = crate::metrics::WAIT_LSN_TIME.start_timer();
@@ -1171,8 +1138,8 @@ impl Timeline {
/// Flush to disk all data that was written with the put_* functions
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
let to_lsn = self.freeze_inmem_layer(false).await;
self.flush_frozen_layers_and_wait(to_lsn).await
self.freeze_inmem_layer(false).await;
self.flush_frozen_layers_and_wait().await
}
/// If there is no writer, and conditions for rolling the latest layer are met, then freeze it.
@@ -1192,39 +1159,7 @@ impl Timeline {
};
let Some(open_layer) = &layers_guard.layer_map().open_layer else {
// If there is no open layer, we have no layer freezing to do. However, we might need to generate
// some updates to disk_consistent_lsn and remote_consistent_lsn, in case we ingested some WAL regions
// that didn't result in writes to this shard.
// Must not hold the layers lock while waiting for a flush.
drop(layers_guard);
let last_record_lsn = self.get_last_record_lsn();
let disk_consistent_lsn = self.get_disk_consistent_lsn();
if last_record_lsn > disk_consistent_lsn {
// We have no open layer, but disk_consistent_lsn is behind the last record: this indicates
// we are a sharded tenant and have skipped some WAL
let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
if last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() {
// This should be somewhat rare, so we log it at INFO level.
//
// We checked for checkpoint timeout so that a shard without any
// data ingested (yet) doesn't write a remote index as soon as it
// sees its LSN advance: we only do this if we've been layer-less
// for some time.
tracing::info!(
"Advancing disk_consistent_lsn past WAL ingest gap {} -> {}",
disk_consistent_lsn,
last_record_lsn
);
// The flush loop will update remote consistent LSN as well as disk consistent LSN.
self.flush_frozen_layers_and_wait(last_record_lsn)
.await
.ok();
}
}
// No open layer, no work to do.
return;
};
@@ -1353,119 +1288,83 @@ impl Timeline {
self.launch_eviction_task(parent, background_jobs_can_start);
}
/// After this function returns, there are no timeline-scoped tasks are left running.
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
/// also to remote storage. This method can easily take multiple seconds for a busy timeline.
///
/// The preferred pattern for is:
/// - in any spawned tasks, keep Timeline::guard open + Timeline::cancel / child token
/// - if early shutdown (not just cancellation) of a sub-tree of tasks is required,
/// go the extra mile and keep track of JoinHandles
/// - Keep track of JoinHandles using a passed-down `Arc<Mutex<Option<JoinSet>>>` or similar,
/// instead of spawning directly on a runtime. It is a more composable / testable pattern.
///
/// For legacy reasons, we still have multiple tasks spawned using
/// `task_mgr::spawn(X, Some(tenant_id), Some(timeline_id))`.
/// We refer to these as "timeline-scoped task_mgr tasks".
/// Some of these tasks are already sensitive to Timeline::cancel while others are
/// not sensitive to Timeline::cancel and instead respect [`task_mgr::shutdown_token`]
/// or [`task_mgr::shutdown_watcher`].
/// We want to gradually convert the code base away from these.
///
/// Here is an inventory of timeline-scoped task_mgr tasks that are still sensitive to
/// `task_mgr::shutdown_{token,watcher}` (there are also tenant-scoped and global-scoped
/// ones that aren't mentioned here):
/// - [`TaskKind::TimelineDeletionWorker`]
/// - NB: also used for tenant deletion
/// - [`TaskKind::RemoteUploadTask`]`
/// - [`TaskKind::InitialLogicalSizeCalculation`]
/// - [`TaskKind::DownloadAllRemoteLayers`] (can we get rid of it?)
// Inventory of timeline-scoped task_mgr tasks that use spawn but aren't sensitive:
/// - [`TaskKind::Eviction`]
/// - [`TaskKind::LayerFlushTask`]
/// - [`TaskKind::OndemandLogicalSizeCalculation`]
/// - [`TaskKind::GarbageCollector`] (immediate_gc is timeline-scoped)
pub(crate) async fn shutdown(&self, mode: ShutdownMode) {
/// While we are flushing, we continue to accept read I/O.
pub(crate) async fn flush_and_shutdown(&self) {
debug_assert_current_span_has_tenant_and_timeline_id();
let try_freeze_and_flush = match mode {
ShutdownMode::FreezeAndFlush => true,
ShutdownMode::Hard => false,
};
// Stop ingesting data, so that we are not still writing to an InMemoryLayer while
// trying to flush
tracing::debug!("Waiting for WalReceiverManager...");
task_mgr::shutdown_tasks(
Some(TaskKind::WalReceiverManager),
Some(self.tenant_shard_id),
Some(self.timeline_id),
)
.await;
// Regardless of whether we're going to try_freeze_and_flush
// or not, stop ingesting any more data. Walreceiver only provides
// cancellation but no "wait until gone", because it uses the Timeline::gate.
// So, only after the self.gate.close() below will we know for sure that
// no walreceiver tasks are left.
// For `try_freeze_and_flush=true`, this means that we might still be ingesting
// data during the call to `self.freeze_and_flush()` below.
// That's not ideal, but, we don't have the concept of a ChildGuard,
// which is what we'd need to properly model early shutdown of the walreceiver
// task sub-tree before the other Timeline task sub-trees.
let walreceiver = self.walreceiver.lock().unwrap().take();
tracing::debug!(
is_some = walreceiver.is_some(),
"Waiting for WalReceiverManager..."
);
if let Some(walreceiver) = walreceiver {
walreceiver.cancel();
}
// ... and inform any waiters for newer LSNs that there won't be any.
// Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance
self.last_record_lsn.shutdown();
if try_freeze_and_flush {
// we shut down walreceiver above, so, we won't add anything more
// to the InMemoryLayer; freeze it and wait for all frozen layers
// to reach the disk & upload queue, then shut the upload queue and
// wait for it to drain.
match self.freeze_and_flush().await {
Ok(_) => {
// drain the upload queue
if let Some(client) = self.remote_client.as_ref() {
// if we did not wait for completion here, it might be our shutdown process
// didn't wait for remote uploads to complete at all, as new tasks can forever
// be spawned.
//
// what is problematic is the shutting down of RemoteTimelineClient, because
// obviously it does not make sense to stop while we wait for it, but what
// about corner cases like s3 suddenly hanging up?
client.shutdown().await;
}
}
Err(e) => {
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
// we have some extra WAL replay to do next time the timeline starts.
warn!("failed to freeze and flush: {e:#}");
// now all writers to InMemory layer are gone, do the final flush if requested
match self.freeze_and_flush().await {
Ok(_) => {
// drain the upload queue
if let Some(client) = self.remote_client.as_ref() {
// if we did not wait for completion here, it might be our shutdown process
// didn't wait for remote uploads to complete at all, as new tasks can forever
// be spawned.
//
// what is problematic is the shutting down of RemoteTimelineClient, because
// obviously it does not make sense to stop while we wait for it, but what
// about corner cases like s3 suddenly hanging up?
client.shutdown().await;
}
}
Err(e) => {
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
// we have some extra WAL replay to do next time the timeline starts.
warn!("failed to freeze and flush: {e:#}");
}
}
self.shutdown().await;
}
/// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of
/// the graceful [`Timeline::flush_and_shutdown`] function.
pub(crate) async fn shutdown(&self) {
debug_assert_current_span_has_tenant_and_timeline_id();
// Signal any subscribers to our cancellation token to drop out
tracing::debug!("Cancelling CancellationToken");
self.cancel.cancel();
// Transition the remote_client into a state where it's only useful for timeline deletion.
// (The deletion use case is why we can't just hook up remote_client to Self::cancel).)
// Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel
// while doing so.
self.last_record_lsn.shutdown();
// Shut down the layer flush task before the remote client, as one depends on the other
task_mgr::shutdown_tasks(
Some(TaskKind::LayerFlushTask),
Some(self.tenant_shard_id),
Some(self.timeline_id),
)
.await;
// Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in
// case our caller wants to use that for a deletion
if let Some(remote_client) = self.remote_client.as_ref() {
remote_client.stop();
// As documented in remote_client.stop()'s doc comment, it's our responsibility
// to shut down the upload queue tasks.
// TODO: fix that, task management should be encapsulated inside remote_client.
task_mgr::shutdown_tasks(
Some(TaskKind::RemoteUploadTask),
Some(self.tenant_shard_id),
Some(self.timeline_id),
)
.await;
}
// TODO: work toward making this a no-op. See this funciton's doc comment for more context.
tracing::debug!("Waiting for tasks...");
task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await;
// Finally wait until any gate-holders are complete.
//
// TODO: once above shutdown_tasks is a no-op, we can close the gate before calling shutdown_tasks
// and use a TBD variant of shutdown_tasks that asserts that there were no tasks left.
// Finally wait until any gate-holders are complete
self.gate.close().await;
self.metrics.shutdown();
@@ -1669,65 +1568,57 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
// Private functions
impl Timeline {
pub(crate) fn get_lazy_slru_download(&self) -> bool {
let tenant_conf = self.tenant_conf.load();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.tenant_conf
.lazy_slru_download
.unwrap_or(self.conf.default_tenant_conf.lazy_slru_download)
}
fn get_checkpoint_distance(&self) -> u64 {
let tenant_conf = self.tenant_conf.load();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.tenant_conf
.checkpoint_distance
.unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
}
fn get_checkpoint_timeout(&self) -> Duration {
let tenant_conf = self.tenant_conf.load();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.tenant_conf
.checkpoint_timeout
.unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
}
fn get_compaction_target_size(&self) -> u64 {
let tenant_conf = self.tenant_conf.load();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.tenant_conf
.compaction_target_size
.unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
}
fn get_compaction_threshold(&self) -> usize {
let tenant_conf = self.tenant_conf.load();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.tenant_conf
.compaction_threshold
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
}
fn get_image_creation_threshold(&self) -> usize {
let tenant_conf = self.tenant_conf.load();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.tenant_conf
.image_creation_threshold
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
}
fn get_compaction_algorithm(&self) -> CompactionAlgorithm {
let tenant_conf = &self.tenant_conf.load();
let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf;
tenant_conf
.tenant_conf
.compaction_algorithm
.unwrap_or(self.conf.default_tenant_conf.compaction_algorithm)
}
fn get_eviction_policy(&self) -> EvictionPolicy {
let tenant_conf = self.tenant_conf.load();
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.tenant_conf
.eviction_policy
.unwrap_or(self.conf.default_tenant_conf.eviction_policy)
}
@@ -1741,26 +1632,14 @@ impl Timeline {
.unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
}
fn get_image_layer_creation_check_threshold(&self) -> u8 {
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.image_layer_creation_check_threshold
.unwrap_or(
self.conf
.default_tenant_conf
.image_layer_creation_check_threshold,
)
}
pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
pub(super) fn tenant_conf_updated(&self) {
// NB: Most tenant conf options are read by background loops, so,
// changes will automatically be picked up.
// The threshold is embedded in the metric. So, we need to update it.
{
let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold(
new_conf,
&self.tenant_conf.read().unwrap().tenant_conf,
&self.conf.default_tenant_conf,
);
@@ -1787,7 +1666,7 @@ impl Timeline {
#[allow(clippy::too_many_arguments)]
pub(super) fn new(
conf: &'static PageServerConf,
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
tenant_conf: Arc<RwLock<AttachedTenantConf>>,
metadata: &TimelineMetadata,
ancestor: Option<Arc<Timeline>>,
timeline_id: TimelineId,
@@ -1803,16 +1682,17 @@ impl Timeline {
let disk_consistent_lsn = metadata.disk_consistent_lsn();
let (state, _) = watch::channel(state);
let (layer_flush_start_tx, _) = tokio::sync::watch::channel((0, disk_consistent_lsn));
let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
let evictions_low_residence_duration_metric_threshold = {
let loaded_tenant_conf = tenant_conf.load();
let tenant_conf_guard = tenant_conf.read().unwrap();
let evictions_low_residence_duration_metric_threshold =
Self::get_evictions_low_residence_duration_metric_threshold(
&loaded_tenant_conf.tenant_conf,
&tenant_conf_guard.tenant_conf,
&conf.default_tenant_conf,
)
};
);
drop(tenant_conf_guard);
Arc::new_cyclic(|myself| {
let mut result = Timeline {
@@ -1889,7 +1769,6 @@ impl Timeline {
},
partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))),
repartition_threshold: 0,
last_image_layer_creation_check_at: AtomicLsn::new(0),
last_received_wal: Mutex::new(None),
rel_size_cache: RwLock::new(HashMap::new()),
@@ -1918,7 +1797,6 @@ impl Timeline {
};
result.repartition_threshold =
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
result
.metrics
.last_record_gauge
@@ -1995,19 +1873,20 @@ impl Timeline {
self.timeline_id, self.tenant_shard_id
);
let tenant_conf = self.tenant_conf.load();
let wal_connect_timeout = tenant_conf
let tenant_conf_guard = self.tenant_conf.read().unwrap();
let wal_connect_timeout = tenant_conf_guard
.tenant_conf
.walreceiver_connect_timeout
.unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout);
let lagging_wal_timeout = tenant_conf
let lagging_wal_timeout = tenant_conf_guard
.tenant_conf
.lagging_wal_timeout
.unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout);
let max_lsn_wal_lag = tenant_conf
let max_lsn_wal_lag = tenant_conf_guard
.tenant_conf
.max_lsn_wal_lag
.unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag);
drop(tenant_conf_guard);
let mut guard = self.walreceiver.lock().unwrap();
assert!(
@@ -2555,6 +2434,10 @@ impl Timeline {
debug!("cancelling logical size calculation for timeline shutdown");
calculation.await
}
_ = task_mgr::shutdown_watcher() => {
debug!("cancelling logical size calculation for task shutdown");
calculation.await
}
}
}
@@ -3009,6 +2892,16 @@ impl Timeline {
let mut completed_keyspace = KeySpace::default();
// Hold the layer map whilst visiting the timeline to prevent
// compaction, eviction and flushes from rendering the layers unreadable.
//
// TODO: Do we actually need to do this? In theory holding on
// to [`tenant::storage_layer::Layer`] should be enough. However,
// [`Timeline::get`] also holds the lock during IO, so more investigation
// is needed.
let guard = timeline.layers.read().await;
let layers = guard.layer_map();
loop {
if cancel.is_cancelled() {
return Err(GetVectoredError::Cancelled);
@@ -3018,9 +2911,6 @@ impl Timeline {
unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
completed_keyspace.merge(&keys_done_last_step);
let guard = timeline.layers.read().await;
let layers = guard.layer_map();
let in_memory_layer = layers.find_in_memory_layer(|l| {
let start_lsn = l.get_lsn_range().start;
cont_lsn > start_lsn
@@ -3028,11 +2918,12 @@ impl Timeline {
match in_memory_layer {
Some(l) => {
let lsn_range = l.get_lsn_range().start..cont_lsn;
fringe.update(
ReadableLayer::InMemoryLayer(l),
ReadableLayerDesc::InMemory {
handle: l,
lsn_ceil: cont_lsn,
},
unmapped_keyspace.clone(),
lsn_range,
);
}
None => {
@@ -3044,43 +2935,30 @@ impl Timeline {
.into_iter()
.map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
(
ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
ReadableLayerDesc::Persistent {
desc: (*layer).clone(),
lsn_range: lsn_floor..cont_lsn,
},
keyspace_accum.to_keyspace(),
lsn_floor..cont_lsn,
)
})
.for_each(|(layer, keyspace, lsn_range)| {
fringe.update(layer, keyspace, lsn_range)
});
.for_each(|(layer, keyspace)| fringe.update(layer, keyspace));
}
}
}
// It's safe to drop the layer map lock after planning the next round of reads.
// The fringe keeps readable handles for the layers which are safe to read even
// if layers were compacted or flushed.
//
// The more interesting consideration is: "Why is the read algorithm still correct
// if the layer map changes while it is operating?". Doing a vectored read on a
// timeline boils down to pushing an imaginary lsn boundary downwards for each range
// covered by the read. The layer map tells us how to move the lsn downwards for a
// range at *a particular point in time*. It is fine for the answer to be different
// at two different time points.
drop(guard);
if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
let next_cont_lsn = lsn_range.start;
if let Some((layer_to_read, keyspace_to_read)) = fringe.next_layer() {
layer_to_read
.get_values_reconstruct_data(
&guard,
keyspace_to_read.clone(),
lsn_range,
reconstruct_state,
ctx,
)
.await?;
unmapped_keyspace = keyspace_to_read;
cont_lsn = next_cont_lsn;
cont_lsn = layer_to_read.get_lsn_floor();
} else {
break;
}
@@ -3158,7 +3036,7 @@ impl Timeline {
}
}
ancestor
.wait_lsn(self.ancestor_lsn, WaitLsnWaiter::Timeline(self), ctx)
.wait_lsn(self.ancestor_lsn, ctx)
.await
.map_err(|e| match e {
e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
@@ -3208,9 +3086,7 @@ impl Timeline {
self.last_record_lsn.advance(new_lsn);
}
/// Whether there was a layer to freeze or not, return the value of get_last_record_lsn
/// before we attempted the freeze: this guarantees that ingested data is frozen up to this lsn (inclusive).
async fn freeze_inmem_layer(&self, write_lock_held: bool) -> Lsn {
async fn freeze_inmem_layer(&self, write_lock_held: bool) {
// Freeze the current open in-memory layer. It will be written to disk on next
// iteration.
@@ -3220,9 +3096,7 @@ impl Timeline {
Some(self.write_lock.lock().await)
};
let to_lsn = self.get_last_record_lsn();
self.freeze_inmem_layer_at(to_lsn).await;
to_lsn
self.freeze_inmem_layer_at(self.get_last_record_lsn()).await;
}
async fn freeze_inmem_layer_at(&self, at: Lsn) {
@@ -3235,24 +3109,25 @@ impl Timeline {
/// Layer flusher task's main loop.
async fn flush_loop(
self: &Arc<Self>,
mut layer_flush_start_rx: tokio::sync::watch::Receiver<(u64, Lsn)>,
mut layer_flush_start_rx: tokio::sync::watch::Receiver<u64>,
ctx: &RequestContext,
) {
info!("started flush loop");
loop {
tokio::select! {
_ = self.cancel.cancelled() => {
info!("shutting down layer flush task due to Timeline::cancel");
info!("shutting down layer flush task");
break;
},
_ = task_mgr::shutdown_watcher() => {
info!("shutting down layer flush task");
break;
},
_ = layer_flush_start_rx.changed() => {}
}
trace!("waking up");
let (flush_counter, frozen_to_lsn) = *layer_flush_start_rx.borrow();
// The highest LSN to which we flushed in the loop over frozen layers
let mut flushed_to_lsn = Lsn(0);
let flush_counter = *layer_flush_start_rx.borrow();
let result = loop {
if self.cancel.is_cancelled() {
info!("dropping out of flush loop for timeline shutdown");
@@ -3273,9 +3148,7 @@ impl Timeline {
break Ok(());
};
match self.flush_frozen_layer(layer_to_flush, ctx).await {
Ok(this_layer_to_lsn) => {
flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn);
}
Ok(()) => {}
Err(FlushLayerError::Cancelled) => {
info!("dropping out of flush loop for timeline shutdown");
return;
@@ -3284,36 +3157,11 @@ impl Timeline {
FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_),
) => {
error!("could not flush frozen layer: {err:?}");
break err.map(|_| ());
break err;
}
}
timer.stop_and_record();
};
// Unsharded tenants should never advance their LSN beyond the end of the
// highest layer they write: such gaps between layer data and the frozen LSN
// are only legal on sharded tenants.
debug_assert!(
self.shard_identity.count.count() > 1
|| flushed_to_lsn >= frozen_to_lsn
|| !flushed_to_lsn.is_valid()
);
if flushed_to_lsn < frozen_to_lsn && self.shard_identity.count.count() > 1 {
// If our layer flushes didn't carry disk_consistent_lsn up to the `to_lsn` advertised
// to us via layer_flush_start_rx, then advance it here.
//
// This path is only taken for tenants with multiple shards: single sharded tenants should
// never encounter a gap in the wal.
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
tracing::debug!("Advancing disk_consistent_lsn across layer gap {old_disk_consistent_lsn}->{frozen_to_lsn}");
if self.set_disk_consistent_lsn(frozen_to_lsn) {
if let Err(e) = self.schedule_uploads(frozen_to_lsn, vec![]) {
tracing::warn!("Failed to schedule metadata upload after updating disk_consistent_lsn: {e}");
}
}
}
// Notify any listeners that we're done
let _ = self
.layer_flush_done_tx
@@ -3321,13 +3169,7 @@ impl Timeline {
}
}
/// Request the flush loop to write out all frozen layers up to `to_lsn` as Delta L0 files to disk.
/// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer`].
///
/// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case,
/// it means no data will be written between the top of the highest frozen layer and to_lsn,
/// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL.
async fn flush_frozen_layers_and_wait(&self, last_record_lsn: Lsn) -> anyhow::Result<()> {
async fn flush_frozen_layers_and_wait(&self) -> anyhow::Result<()> {
let mut rx = self.layer_flush_done_tx.subscribe();
// Increment the flush cycle counter and wake up the flush task.
@@ -3341,10 +3183,9 @@ impl Timeline {
anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
}
self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
self.layer_flush_start_tx.send_modify(|counter| {
my_flush_request = *counter + 1;
*counter = my_flush_request;
*lsn = std::cmp::max(last_record_lsn, *lsn);
});
loop {
@@ -3381,22 +3222,16 @@ impl Timeline {
}
fn flush_frozen_layers(&self) {
self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
*counter += 1;
*lsn = std::cmp::max(*lsn, Lsn(self.last_freeze_at.load().0 - 1));
});
self.layer_flush_start_tx.send_modify(|val| *val += 1);
}
/// Flush one frozen in-memory layer to disk, as a new delta layer.
///
/// Return value is the last lsn (inclusive) of the layer that was frozen.
#[instrument(skip_all, fields(layer=%frozen_layer))]
async fn flush_frozen_layer(
self: &Arc<Self>,
frozen_layer: Arc<InMemoryLayer>,
ctx: &RequestContext,
) -> Result<Lsn, FlushLayerError> {
) -> Result<(), FlushLayerError> {
debug_assert_current_span_has_tenant_and_timeline_id();
// As a special case, when we have just imported an image into the repository,
@@ -3471,6 +3306,7 @@ impl Timeline {
}
let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
// The new on-disk layers are now in the layer map. We can remove the
// in-memory layer from the map now. The flushed layer is stored in
@@ -3484,7 +3320,10 @@ impl Timeline {
guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics);
if self.set_disk_consistent_lsn(disk_consistent_lsn) {
if disk_consistent_lsn != old_disk_consistent_lsn {
assert!(disk_consistent_lsn > old_disk_consistent_lsn);
self.disk_consistent_lsn.store(disk_consistent_lsn);
// Schedule remote uploads that will reflect our new disk_consistent_lsn
self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?;
}
@@ -3501,22 +3340,7 @@ impl Timeline {
// This failpoint is used by another test case `test_pageserver_recovery`.
fail_point!("flush-frozen-exit");
Ok(Lsn(lsn_range.end.0 - 1))
}
/// Return true if the value changed
///
/// This function must only be used from the layer flush task, and may not be called concurrently.
fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool {
// We do a simple load/store cycle: that's why this function isn't safe for concurrent use.
let old_value = self.disk_consistent_lsn.load();
if new_value != old_value {
assert!(new_value >= old_value);
self.disk_consistent_lsn.store(new_value);
true
} else {
false
}
Ok(())
}
/// Update metadata file
@@ -3677,24 +3501,6 @@ impl Timeline {
// Is it time to create a new image layer for the given partition?
async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
let last = self.last_image_layer_creation_check_at.load();
if lsn != Lsn(0) {
let distance = lsn
.checked_sub(last)
.expect("Attempt to compact with LSN going backwards");
let min_distance = self.get_image_layer_creation_check_threshold() as u64
* self.get_checkpoint_distance();
// Skip the expensive delta layer counting below if we've not ingested
// sufficient WAL since the last check.
if distance.0 < min_distance {
return false;
}
}
self.last_image_layer_creation_check_at.store(lsn);
let threshold = self.get_image_creation_threshold();
let guard = self.layers.read().await;
@@ -4036,24 +3842,6 @@ impl Timeline {
Ok(())
}
/// Schedules the uploads of the given image layers
fn upload_new_image_layers(
self: &Arc<Self>,
new_images: impl IntoIterator<Item = ResidentLayer>,
) -> anyhow::Result<()> {
let Some(remote_client) = &self.remote_client else {
return Ok(());
};
for layer in new_images {
remote_client.schedule_layer_file_upload(layer)?;
}
// should any new image layer been created, not uploading index_part will
// result in a mismatch between remote_physical_size and layermap calculated
// size, which will fail some tests, but should not be an issue otherwise.
remote_client.schedule_index_upload_for_file_changes()?;
Ok(())
}
/// Update information about which layer files need to be retained on
/// garbage collection. This is separate from actually performing the GC,
/// and is updated more frequently, so that compaction can remove obsolete

View File

@@ -125,8 +125,18 @@ impl Timeline {
)
.await
.map_err(anyhow::Error::from)?;
if let Some(remote_client) = &self.remote_client {
for layer in layers {
remote_client.schedule_layer_file_upload(layer)?;
}
}
self.upload_new_image_layers(layers)?;
if let Some(remote_client) = &self.remote_client {
// should any new image layer been created, not uploading index_part will
// result in a mismatch between remote_physical_size and layermap calculated
// size, which will fail some tests, but should not be an issue otherwise.
remote_client.schedule_index_upload_for_file_changes()?;
}
}
Err(err) => {
// no partitioning? This is normal, if the timeline was just created
@@ -808,10 +818,7 @@ impl TimelineAdaptor {
self.timeline
.finish_compact_batch(&self.new_deltas, &self.new_images, &layers_to_delete)
.await?;
self.timeline
.upload_new_image_layers(std::mem::take(&mut self.new_images))?;
self.new_images.clear();
self.new_deltas.clear();
self.layers_to_delete.clear();
Ok(())

View File

@@ -6,7 +6,7 @@ use std::{
use anyhow::Context;
use pageserver_api::{models::TimelineState, shard::TenantShardId};
use tokio::sync::OwnedMutexGuard;
use tracing::{error, info, instrument, Instrument};
use tracing::{debug, error, info, instrument, Instrument};
use utils::{crashsafe, fs_ext, id::TimelineId};
use crate::{
@@ -14,6 +14,7 @@ use crate::{
deletion_queue::DeletionQueueClient,
task_mgr::{self, TaskKind},
tenant::{
debug_assert_current_span_has_tenant_and_timeline_id,
metadata::TimelineMetadata,
remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
CreateTimelineCause, DeleteTimelineError, Tenant,
@@ -22,6 +23,58 @@ use crate::{
use super::{Timeline, TimelineResources};
/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
debug_assert_current_span_has_tenant_and_timeline_id();
// Notify any timeline work to drop out of loops/requests
tracing::debug!("Cancelling CancellationToken");
timeline.cancel.cancel();
// Stop the walreceiver first.
debug!("waiting for wal receiver to shutdown");
let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
if let Some(walreceiver) = maybe_started_walreceiver {
walreceiver.stop().await;
}
debug!("wal receiver shutdown confirmed");
// Shut down the layer flush task before the remote client, as one depends on the other
task_mgr::shutdown_tasks(
Some(TaskKind::LayerFlushTask),
Some(timeline.tenant_shard_id),
Some(timeline.timeline_id),
)
.await;
// Prevent new uploads from starting.
if let Some(remote_client) = timeline.remote_client.as_ref() {
remote_client.stop();
}
// Stop & wait for the remaining timeline tasks, including upload tasks.
// NB: This and other delete_timeline calls do not run as a task_mgr task,
// so, they are not affected by this shutdown_tasks() call.
info!("waiting for timeline tasks to shutdown");
task_mgr::shutdown_tasks(
None,
Some(timeline.tenant_shard_id),
Some(timeline.timeline_id),
)
.await;
fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-before-index-deleted-at"
))?
});
tracing::debug!("Waiting for gate...");
timeline.gate.close().await;
tracing::debug!("Shutdown complete");
Ok(())
}
/// Mark timeline as deleted in S3 so we won't pick it up next time
/// during attach or pageserver restart.
/// See comment in persist_index_part_with_deleted_flag.
@@ -215,14 +268,7 @@ impl DeleteTimelineFlow {
guard.mark_in_progress()?;
// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
timeline.shutdown(super::ShutdownMode::Hard).await;
fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-before-index-deleted-at"
))?
});
stop_tasks(&timeline).await?;
set_deleted_in_remote_index(&timeline).await?;

View File

@@ -67,19 +67,20 @@ impl Timeline {
),
false,
async move {
let cancel = task_mgr::shutdown_token();
tokio::select! {
_ = self_clone.cancel.cancelled() => { return Ok(()); }
_ = cancel.cancelled() => { return Ok(()); }
_ = completion::Barrier::maybe_wait(background_tasks_can_start) => {}
};
self_clone.eviction_task(parent).await;
self_clone.eviction_task(parent, cancel).await;
Ok(())
},
);
}
#[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>) {
async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>, cancel: CancellationToken) {
use crate::tenant::tasks::random_init_delay;
// acquire the gate guard only once within a useful span
@@ -94,7 +95,7 @@ impl Timeline {
EvictionPolicy::OnlyImitiate(lat) => lat.period,
EvictionPolicy::NoEviction => Duration::from_secs(10),
};
if random_init_delay(period, &self.cancel).await.is_err() {
if random_init_delay(period, &cancel).await.is_err() {
return;
}
}
@@ -103,13 +104,13 @@ impl Timeline {
loop {
let policy = self.get_eviction_policy();
let cf = self
.eviction_iteration(&tenant, &policy, &self.cancel, &guard, &ctx)
.eviction_iteration(&tenant, &policy, &cancel, &guard, &ctx)
.await;
match cf {
ControlFlow::Break(()) => break,
ControlFlow::Continue(sleep_until) => {
if tokio::time::timeout_at(sleep_until, self.cancel.cancelled())
if tokio::time::timeout_at(sleep_until, cancel.cancelled())
.await
.is_ok()
{

View File

@@ -120,10 +120,9 @@ impl LayerManager {
/// Called from `freeze_inmem_layer`, returns true if successfully frozen.
pub(crate) async fn try_freeze_in_memory_layer(
&mut self,
lsn: Lsn,
Lsn(last_record_lsn): Lsn,
last_freeze_at: &AtomicLsn,
) {
let Lsn(last_record_lsn) = lsn;
let end_lsn = Lsn(last_record_lsn + 1);
if let Some(open_layer) = &self.layer_map.open_layer {
@@ -136,11 +135,8 @@ impl LayerManager {
self.layer_map.frozen_layers.push_back(open_layer_rc);
self.layer_map.open_layer = None;
self.layer_map.next_open_layer_at = Some(end_lsn);
last_freeze_at.store(end_lsn);
}
// Even if there was no layer to freeze, advance last_freeze_at to last_record_lsn+1: this
// accounts for regions in the LSN range where we might have ingested no data due to sharding.
last_freeze_at.store(end_lsn);
}
/// Add image layers to the layer map, called from `create_image_layers`.

View File

@@ -24,12 +24,13 @@ mod connection_manager;
mod walreceiver_connection;
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME};
use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME};
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::timeline::walreceiver::connection_manager::{
connection_manager_loop_step, ConnectionManagerState,
};
use pageserver_api::shard::TenantShardId;
use std::future::Future;
use std::num::NonZeroU64;
use std::sync::Arc;
@@ -39,6 +40,8 @@ use tokio::sync::watch;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::id::TimelineId;
use self::connection_manager::ConnectionManagerStatus;
use super::Timeline;
@@ -57,10 +60,9 @@ pub struct WalReceiverConf {
}
pub struct WalReceiver {
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
manager_status: Arc<std::sync::RwLock<Option<ConnectionManagerStatus>>>,
/// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token.
/// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`.
cancel: CancellationToken,
}
impl WalReceiver {
@@ -74,23 +76,23 @@ impl WalReceiver {
let timeline_id = timeline.timeline_id;
let walreceiver_ctx =
ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
let loop_status = Arc::new(std::sync::RwLock::new(None));
let manager_status = Arc::clone(&loop_status);
let cancel = timeline.cancel.child_token();
WALRECEIVER_RUNTIME.spawn({
let cancel = cancel.clone();
task_mgr::spawn(
WALRECEIVER_RUNTIME.handle(),
TaskKind::WalReceiverManager,
Some(timeline.tenant_shard_id),
Some(timeline_id),
&format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"),
false,
async move {
debug_assert_current_span_has_tenant_and_timeline_id();
// acquire timeline gate so we know the task doesn't outlive the Timeline
let Ok(_guard) = timeline.gate.enter() else {
debug!("WAL receiver manager could not enter the gate timeline gate, it's closed already");
return;
};
debug!("WAL receiver manager started, connecting to broker");
let cancel = task_mgr::shutdown_token();
let mut connection_manager_state = ConnectionManagerState::new(
timeline,
conf,
cancel.clone(),
);
while !cancel.is_cancelled() {
let loop_step_result = connection_manager_loop_step(
@@ -110,22 +112,25 @@ impl WalReceiver {
}
connection_manager_state.shutdown().await;
*loop_status.write().unwrap() = None;
debug!("task exits");
Ok(())
}
.instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id))
});
);
Self {
tenant_shard_id,
timeline_id,
manager_status,
cancel,
}
}
#[instrument(skip_all, level = tracing::Level::DEBUG)]
pub fn cancel(&self) {
debug_assert_current_span_has_tenant_and_timeline_id();
debug!("cancelling walreceiver tasks");
self.cancel.cancel();
pub async fn stop(self) {
task_mgr::shutdown_tasks(
Some(TaskKind::WalReceiverManager),
Some(self.tenant_shard_id),
Some(self.timeline_id),
)
.await;
}
pub(crate) fn status(&self) -> Option<ConnectionManagerStatus> {
@@ -159,18 +164,14 @@ enum TaskStateUpdate<E> {
impl<E: Clone> TaskHandle<E> {
/// Initializes the task, starting it immediately after the creation.
///
/// The second argument to `task` is a child token of `cancel_parent` ([`CancellationToken::child_token`]).
/// It being a child token enables us to provide a [`Self::shutdown`] method.
fn spawn<Fut>(
cancel_parent: &CancellationToken,
task: impl FnOnce(watch::Sender<TaskStateUpdate<E>>, CancellationToken) -> Fut + Send + 'static,
) -> Self
where
Fut: Future<Output = anyhow::Result<()>> + Send,
E: Send + Sync + 'static,
{
let cancellation = cancel_parent.child_token();
let cancellation = CancellationToken::new();
let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started);
let cancellation_clone = cancellation.clone();

View File

@@ -280,8 +280,6 @@ pub(super) struct ConnectionManagerState {
id: TenantTimelineId,
/// Use pageserver data about the timeline to filter out some of the safekeepers.
timeline: Arc<Timeline>,
/// Child token of [`super::WalReceiver::cancel`], inherited to all tasks we spawn.
cancel: CancellationToken,
conf: WalReceiverConf,
/// Current connection to safekeeper for WAL streaming.
wal_connection: Option<WalConnection>,
@@ -404,11 +402,7 @@ struct BrokerSkTimeline {
}
impl ConnectionManagerState {
pub(super) fn new(
timeline: Arc<Timeline>,
conf: WalReceiverConf,
cancel: CancellationToken,
) -> Self {
pub(super) fn new(timeline: Arc<Timeline>, conf: WalReceiverConf) -> Self {
let id = TenantTimelineId {
tenant_id: timeline.tenant_shard_id.tenant_id,
timeline_id: timeline.timeline_id,
@@ -416,7 +410,6 @@ impl ConnectionManagerState {
Self {
id,
timeline,
cancel,
conf,
wal_connection: None,
wal_stream_candidates: HashMap::new(),
@@ -424,22 +417,6 @@ impl ConnectionManagerState {
}
}
fn spawn<Fut>(
&self,
task: impl FnOnce(
tokio::sync::watch::Sender<TaskStateUpdate<WalConnectionStatus>>,
CancellationToken,
) -> Fut
+ Send
+ 'static,
) -> TaskHandle<WalConnectionStatus>
where
Fut: std::future::Future<Output = anyhow::Result<()>> + Send,
{
// TODO: get rid of TaskHandle
super::TaskHandle::spawn(&self.cancel, task)
}
/// Shuts down the current connection (if any) and immediately starts another one with the given connection string.
async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) {
WALRECEIVER_SWITCHES
@@ -458,7 +435,7 @@ impl ConnectionManagerState {
);
let span = info_span!("connection", %node_id);
let connection_handle = self.spawn(move |events_sender, cancellation| {
let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| {
async move {
debug_assert_current_span_has_tenant_and_timeline_id();
@@ -486,12 +463,6 @@ impl ConnectionManagerState {
info!("walreceiver connection handling ended: {e}");
Ok(())
}
WalReceiverError::ClosedGate => {
info!(
"walreceiver connection handling ended because of closed gate"
);
Ok(())
}
WalReceiverError::Other(e) => {
// give out an error to have task_mgr give it a really verbose logging
if cancellation.is_cancelled() {
@@ -1045,7 +1016,7 @@ mod tests {
sk_id: connected_sk_id,
availability_zone: None,
status: connection_status,
connection_task: state.spawn(move |sender, _| async move {
connection_task: TaskHandle::spawn(move |sender, _| async move {
sender
.send(TaskStateUpdate::Progress(connection_status))
.ok();
@@ -1213,7 +1184,7 @@ mod tests {
sk_id: connected_sk_id,
availability_zone: None,
status: connection_status,
connection_task: state.spawn(move |sender, _| async move {
connection_task: TaskHandle::spawn(move |sender, _| async move {
sender
.send(TaskStateUpdate::Progress(connection_status))
.ok();
@@ -1280,7 +1251,7 @@ mod tests {
sk_id: NodeId(1),
availability_zone: None,
status: connection_status,
connection_task: state.spawn(move |sender, _| async move {
connection_task: TaskHandle::spawn(move |sender, _| async move {
sender
.send(TaskStateUpdate::Progress(connection_status))
.ok();
@@ -1344,7 +1315,7 @@ mod tests {
sk_id: NodeId(1),
availability_zone: None,
status: connection_status,
connection_task: state.spawn(move |_, _| async move { Ok(()) }),
connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }),
discovered_new_wal: Some(NewCommittedWAL {
discovered_at: time_over_threshold,
lsn: new_lsn,
@@ -1400,7 +1371,6 @@ mod tests {
timeline_id: TIMELINE_ID,
},
timeline,
cancel: CancellationToken::new(),
conf: WalReceiverConf {
wal_connect_timeout: Duration::from_secs(1),
lagging_wal_timeout: Duration::from_secs(1),
@@ -1444,7 +1414,7 @@ mod tests {
sk_id: connected_sk_id,
availability_zone: None,
status: connection_status,
connection_task: state.spawn(move |sender, _| async move {
connection_task: TaskHandle::spawn(move |sender, _| async move {
sender
.send(TaskStateUpdate::Progress(connection_status))
.ok();

View File

@@ -27,6 +27,7 @@ use super::TaskStateUpdate;
use crate::{
context::RequestContext,
metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
task_mgr,
task_mgr::TaskKind,
task_mgr::WALRECEIVER_RUNTIME,
tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
@@ -36,8 +37,8 @@ use crate::{
use postgres_backend::is_expected_io_error;
use postgres_connection::PgConnectionConfig;
use postgres_ffi::waldecoder::WalStreamDecoder;
use utils::pageserver_feedback::PageserverFeedback;
use utils::{id::NodeId, lsn::Lsn};
use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError};
/// Status of the connection.
#[derive(Debug, Clone, Copy)]
@@ -67,7 +68,6 @@ pub(super) enum WalReceiverError {
SuccessfulCompletion(String),
/// Generic error
Other(anyhow::Error),
ClosedGate,
}
impl From<tokio_postgres::Error> for WalReceiverError {
@@ -119,16 +119,6 @@ pub(super) async fn handle_walreceiver_connection(
) -> Result<(), WalReceiverError> {
debug_assert_current_span_has_tenant_and_timeline_id();
// prevent timeline shutdown from finishing until we have exited
let _guard = timeline.gate.enter().map_err(|e| match e {
GateError::GateClosed => WalReceiverError::ClosedGate,
})?;
// This function spawns a side-car task (WalReceiverConnectionPoller).
// Get its gate guard now as well.
let poller_guard = timeline.gate.enter().map_err(|e| match e {
GateError::GateClosed => WalReceiverError::ClosedGate,
})?;
WALRECEIVER_STARTED_CONNECTIONS.inc();
// Connect to the database in replication mode.
@@ -166,19 +156,22 @@ pub(super) async fn handle_walreceiver_connection(
}
// The connection object performs the actual communication with the database,
// so spawn it off to run on its own. It shouldn't outlive this function, but,
// due to lack of async drop, we can't enforce that. However, we ensure that
// 1. it is sensitive to `cancellation` and
// 2. holds the Timeline gate open so that after timeline shutdown,
// we know this task is gone.
// so spawn it off to run on its own.
let _connection_ctx = ctx.detached_child(
TaskKind::WalReceiverConnectionPoller,
ctx.download_behavior(),
);
let connection_cancellation = cancellation.clone();
WALRECEIVER_RUNTIME.spawn(
task_mgr::spawn(
WALRECEIVER_RUNTIME.handle(),
TaskKind::WalReceiverConnectionPoller,
Some(timeline.tenant_shard_id),
Some(timeline.timeline_id),
"walreceiver connection",
false,
async move {
debug_assert_current_span_has_tenant_and_timeline_id();
select! {
connection_result = connection => match connection_result {
Ok(()) => debug!("Walreceiver db connection closed"),
@@ -189,9 +182,6 @@ pub(super) async fn handle_walreceiver_connection(
// with a similar error.
},
WalReceiverError::SuccessfulCompletion(_) => {}
WalReceiverError::ClosedGate => {
// doesn't happen at runtime
}
WalReceiverError::Other(err) => {
warn!("Connection aborted: {err:#}")
}
@@ -200,7 +190,7 @@ pub(super) async fn handle_walreceiver_connection(
},
_ = connection_cancellation.cancelled() => debug!("Connection cancelled"),
}
drop(poller_guard);
Ok(())
}
// Enrich the log lines emitted by this closure with meaningful context.
// TODO: technically, this task outlives the surrounding function, so, the
@@ -313,7 +303,6 @@ pub(super) async fn handle_walreceiver_connection(
trace!("received XLogData between {startlsn} and {endlsn}");
WAL_INGEST.bytes_received.inc_by(data.len() as u64);
waldecoder.feed_bytes(data);
{

View File

@@ -61,7 +61,7 @@ pub struct VectoredRead {
}
impl VectoredRead {
pub fn size(&self) -> usize {
fn size(&self) -> usize {
(self.end - self.start) as usize
}
}

View File

@@ -15,23 +15,11 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result<PageserverUtiliz
.map_err(std::io::Error::from)
.context("statvfs tenants directory")?;
// https://unix.stackexchange.com/a/703650
let blocksz = if statvfs.fragment_size() > 0 {
statvfs.fragment_size()
} else {
statvfs.block_size()
};
let blocksz = statvfs.block_size();
#[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))]
let free = statvfs.blocks_available() as u64 * blocksz;
#[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))]
let used = statvfs
.blocks()
// use blocks_free instead of available here to match df in case someone compares
.saturating_sub(statvfs.blocks_free()) as u64
* blocksz;
let used = crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.get();
let captured_at = std::time::SystemTime::now();
let doc = PageserverUtilization {

View File

@@ -36,12 +36,11 @@ use bytes::{Bytes, BytesMut};
use pageserver_api::key::key_to_rel_block;
use pageserver_api::models::WalRedoManagerStatus;
use pageserver_api::shard::TenantShardId;
use std::sync::Arc;
use std::sync::{Arc, RwLock};
use std::time::Duration;
use std::time::Instant;
use tracing::*;
use utils::lsn::Lsn;
use utils::sync::heavier_once_cell;
///
/// This is the real implementation that uses a Postgres process to
@@ -54,19 +53,7 @@ pub struct PostgresRedoManager {
tenant_shard_id: TenantShardId,
conf: &'static PageServerConf,
last_redo_at: std::sync::Mutex<Option<Instant>>,
/// The current [`process::WalRedoProcess`] that is used by new redo requests.
/// We use [`heavier_once_cell`] for coalescing the spawning, but the redo
/// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the
/// their process object; we use [`Arc::clone`] for that.
/// This is primarily because earlier implementations that didn't use [`heavier_once_cell`]
/// had that behavior; it's probably unnecessary.
/// The only merit of it is that if one walredo process encounters an error,
/// it can take it out of rotation (= using [`heavier_once_cell::Guard::take_and_deinit`].
/// and retry redo, thereby starting the new process, while other redo tasks might
/// still be using the old redo process. But, those other tasks will most likely
/// encounter an error as well, and errors are an unexpected condition anyway.
/// So, probably we could get rid of the `Arc` in the future.
redo_process: heavier_once_cell::OnceCell<Arc<process::WalRedoProcess>>,
redo_process: RwLock<Option<Arc<process::WalRedoProcess>>>,
}
///
@@ -114,7 +101,6 @@ impl PostgresRedoManager {
self.conf.wal_redo_timeout,
pg_version,
)
.await
};
img = Some(result?);
@@ -135,7 +121,6 @@ impl PostgresRedoManager {
self.conf.wal_redo_timeout,
pg_version,
)
.await
}
}
@@ -149,7 +134,7 @@ impl PostgresRedoManager {
chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?)
})
},
pid: self.redo_process.get().map(|p| p.id()),
pid: self.redo_process.read().unwrap().as_ref().map(|p| p.id()),
})
}
}
@@ -167,7 +152,7 @@ impl PostgresRedoManager {
tenant_shard_id,
conf,
last_redo_at: std::sync::Mutex::default(),
redo_process: heavier_once_cell::OnceCell::default(),
redo_process: RwLock::new(None),
}
}
@@ -179,7 +164,8 @@ impl PostgresRedoManager {
if let Some(last_redo_at) = *g {
if last_redo_at.elapsed() >= idle_timeout {
drop(g);
drop(self.redo_process.get().map(|guard| guard.take_and_deinit()));
let mut guard = self.redo_process.write().unwrap();
*guard = None;
}
}
}
@@ -188,11 +174,8 @@ impl PostgresRedoManager {
///
/// Process one request for WAL redo using wal-redo postgres
///
/// # Cancel-Safety
///
/// Cancellation safe.
#[allow(clippy::too_many_arguments)]
async fn apply_batch_postgres(
fn apply_batch_postgres(
&self,
key: Key,
lsn: Lsn,
@@ -208,31 +191,42 @@ impl PostgresRedoManager {
const MAX_RETRY_ATTEMPTS: u32 = 1;
let mut n_attempts = 0u32;
loop {
let proc: Arc<process::WalRedoProcess> =
match self.redo_process.get_or_init_detached().await {
Ok(guard) => Arc::clone(&guard),
Err(permit) => {
// don't hold poison_guard, the launch code can bail
let start = Instant::now();
let proc = Arc::new(
process::WalRedoProcess::launch(
self.conf,
self.tenant_shard_id,
pg_version,
)
.context("launch walredo process")?,
);
let duration = start.elapsed();
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
info!(
duration_ms = duration.as_millis(),
pid = proc.id(),
"launched walredo process"
);
self.redo_process.set(Arc::clone(&proc), permit);
proc
// launch the WAL redo process on first use
let proc: Arc<process::WalRedoProcess> = {
let proc_guard = self.redo_process.read().unwrap();
match &*proc_guard {
None => {
// "upgrade" to write lock to launch the process
drop(proc_guard);
let mut proc_guard = self.redo_process.write().unwrap();
match &*proc_guard {
None => {
let start = Instant::now();
let proc = Arc::new(
process::WalRedoProcess::launch(
self.conf,
self.tenant_shard_id,
pg_version,
)
.context("launch walredo process")?,
);
let duration = start.elapsed();
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM
.observe(duration.as_secs_f64());
info!(
duration_ms = duration.as_millis(),
pid = proc.id(),
"launched walredo process"
);
*proc_guard = Some(Arc::clone(&proc));
proc
}
Some(proc) => Arc::clone(proc),
}
}
};
Some(proc) => Arc::clone(proc),
}
};
let started_at = std::time::Instant::now();
@@ -278,34 +272,34 @@ impl PostgresRedoManager {
n_attempts,
e,
);
// Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation.
// Note that there may be other tasks concurrent with us that also hold `proc`.
// We have to deal with that here.
// Also read the doc comment on field `self.redo_process`.
//
// NB: there may still be other concurrent threads using `proc`.
// The last one will send SIGKILL when the underlying Arc reaches refcount 0.
//
// NB: the drop impl blocks the dropping thread with a wait() system call for
// the child process. In some ways the blocking is actually good: if we
// deferred the waiting into the background / to tokio if we used `tokio::process`,
// it could happen that if walredo always fails immediately, we spawn processes faster
// than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
// we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
// This probably needs revisiting at some later point.
match self.redo_process.get() {
None => (),
Some(guard) => {
if Arc::ptr_eq(&proc, &*guard) {
// We're the first to observe an error from `proc`, it's our job to take it out of rotation.
guard.take_and_deinit();
} else {
// Another task already spawned another redo process (further up in this method)
// and put it into `redo_process`. Do nothing, our view of the world is behind.
// Avoid concurrent callers hitting the same issue.
// We can't prevent it from happening because we want to enable parallelism.
{
let mut guard = self.redo_process.write().unwrap();
match &*guard {
Some(current_field_value) => {
if Arc::ptr_eq(current_field_value, &proc) {
// We're the first to observe an error from `proc`, it's our job to take it out of rotation.
*guard = None;
}
}
None => {
// Another thread was faster to observe the error, and already took the process out of rotation.
}
}
}
// The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall.
// NB: there may still be other concurrent threads using `proc`.
// The last one will send SIGKILL when the underlying Arc reaches refcount 0.
// NB: it's important to drop(proc) after drop(guard). Otherwise we'd keep
// holding the lock while waiting for the process to exit.
// NB: the drop impl blocks the current threads with a wait() system call for
// the child process. We dropped the `guard` above so that other threads aren't
// affected. But, it's good that the current thread _does_ block to wait.
// If we instead deferred the waiting into the background / to tokio, it could
// happen that if walredo always fails immediately, we spawn processes faster
// than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
// we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
// This probably needs revisiting at some later point.
drop(proc);
} else if n_attempts != 0 {
info!(n_attempts, "retried walredo succeeded");

View File

@@ -111,7 +111,6 @@ static PageServer page_servers[MAX_SHARDS];
static bool pageserver_flush(shardno_t shard_no);
static void pageserver_disconnect(shardno_t shard_no);
static void pageserver_disconnect_shard(shardno_t shard_no);
static bool
PagestoreShmemIsValid(void)
@@ -488,32 +487,9 @@ retry:
return ret;
}
/*
* Reset prefetch and drop connection to the shard.
* It also drops connection to all other shards involved in prefetch.
*/
static void
pageserver_disconnect(shardno_t shard_no)
{
/*
* If the connection to any pageserver is lost, we throw away the
* whole prefetch queue, even for other pageservers. It should not
* cause big problems, because connection loss is supposed to be a
* rare event.
*
* Prefetch state should be reset even if page_servers[shard_no].conn == NULL,
* because prefetch request may be registered before connection is established.
*/
prefetch_on_ps_disconnect();
pageserver_disconnect_shard(shard_no);
}
/*
* Disconnect from specified shard
*/
static void
pageserver_disconnect_shard(shardno_t shard_no)
{
/*
* If anything goes wrong while we were sending a request, it's not clear
@@ -527,6 +503,14 @@ pageserver_disconnect_shard(shardno_t shard_no)
neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
PQfinish(page_servers[shard_no].conn);
page_servers[shard_no].conn = NULL;
/*
* If the connection to any pageserver is lost, we throw away the
* whole prefetch queue, even for other pageservers. It should not
* cause big problems, because connection loss is supposed to be a
* rare event.
*/
prefetch_on_ps_disconnect();
}
if (page_servers[shard_no].wes != NULL)
{
@@ -692,8 +676,7 @@ page_server_api api =
{
.send = pageserver_send,
.flush = pageserver_flush,
.receive = pageserver_receive,
.disconnect = pageserver_disconnect_shard
.receive = pageserver_receive
};
static bool

View File

@@ -180,7 +180,6 @@ typedef struct
bool (*send) (shardno_t shard_no, NeonRequest * request);
NeonResponse *(*receive) (shardno_t shard_no);
bool (*flush) (shardno_t shard_no);
void (*disconnect) (shardno_t shard_no);
} page_server_api;
extern void prefetch_on_ps_disconnect(void);

View File

@@ -613,14 +613,6 @@ prefetch_on_ps_disconnect(void)
Assert(slot->status == PRFS_REQUESTED);
Assert(slot->my_ring_index == ring_index);
/*
* Drop connection to all shards which have prefetch requests.
* It is not a problem to call disconnect multiple times on the same connection
* because disconnect implementation in libpagestore.c will check if connection
* is alive and do nothing of connection was already dropped.
*/
page_server->disconnect(slot->shard_no);
/* clean up the request */
slot->status = PRFS_TAG_REMAINS;
MyPState->n_requests_inflight -= 1;
@@ -641,12 +633,13 @@ prefetch_on_ps_disconnect(void)
static inline void
prefetch_set_unused(uint64 ring_index)
{
PrefetchRequest *slot;
PrefetchRequest *slot = GetPrfSlot(ring_index);
if (ring_index < MyPState->ring_last)
return; /* Should already be unused */
slot = GetPrfSlot(ring_index);
Assert(MyPState->ring_unused > ring_index);
if (slot->status == PRFS_UNUSED)
return;
@@ -805,8 +798,7 @@ Retry:
{
if (*force_lsn > slot->effective_request_lsn)
{
if (!prefetch_wait_for(ring_index))
goto Retry;
prefetch_wait_for(ring_index);
prefetch_set_unused(ring_index);
entry = NULL;
}
@@ -821,8 +813,7 @@ Retry:
{
if (*force_lsn != slot->effective_request_lsn)
{
if (!prefetch_wait_for(ring_index))
goto Retry;
prefetch_wait_for(ring_index);
prefetch_set_unused(ring_index);
entry = NULL;
}
@@ -888,8 +879,7 @@ Retry:
{
case PRFS_REQUESTED:
Assert(MyPState->ring_receive == cleanup_index);
if (!prefetch_wait_for(cleanup_index))
goto Retry;
prefetch_wait_for(cleanup_index);
prefetch_set_unused(cleanup_index);
break;
case PRFS_RECEIVED:
@@ -1690,7 +1680,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
break;
default:
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_exists", resp->tag);
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
}
pfree(resp);
return exists;
@@ -2142,7 +2132,6 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
/*
* Try to find prefetched page in the list of received pages.
*/
Retry:
entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);
if (entry != NULL)
@@ -2164,8 +2153,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
*/
if (slot->status == PRFS_REQUESTED)
{
if (!prefetch_wait_for(slot->my_ring_index))
goto Retry;
prefetch_wait_for(slot->my_ring_index);
}
/* drop caches */
prefetch_set_unused(slot->my_ring_index);
@@ -2228,7 +2216,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
((NeonErrorResponse *) resp)->message)));
break;
default:
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_at_lsn", resp->tag);
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
}
/* buffer was used, clean up for later reuse */
@@ -2501,7 +2489,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
break;
default:
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_nblocks", resp->tag);
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
}
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
@@ -2556,7 +2544,7 @@ neon_dbsize(Oid dbNode)
break;
default:
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_dbsize", resp->tag);
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
}
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
@@ -2861,7 +2849,7 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
break;
default:
neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_slru_segment", resp->tag);
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
}
pfree(resp);

View File

@@ -10,7 +10,6 @@ testing = []
[dependencies]
anyhow.workspace = true
async-compression.workspace = true
async-trait.workspace = true
aws-config.workspace = true
aws-sdk-iam.workspace = true

View File

@@ -102,7 +102,8 @@ pub(super) async fn authenticate(
ctx.set_user(db_info.user.into());
ctx.set_project(db_info.aux.clone());
info!("woken up a compute node");
let cold_start_info = db_info.aux.cold_start_info.clone().unwrap_or_default();
info!(?cold_start_info, "woken up a compute node");
// Backwards compatibility. pg_sni_proxy uses "--" in domain names
// while direct connections do not. Once we migrate to pg_sni_proxy

View File

@@ -10,7 +10,6 @@ use itertools::Itertools;
use proxy::config::TlsServerEndPoint;
use proxy::context::RequestMonitoring;
use proxy::proxy::run_until_cancelled;
use proxy::{BranchId, EndpointId, ProjectId};
use rustls::pki_types::PrivateKeyDer;
use tokio::net::TcpListener;
@@ -270,12 +269,7 @@ async fn handle_client(
let client = tokio::net::TcpStream::connect(destination).await?;
let metrics_aux: MetricsAuxInfo = MetricsAuxInfo {
endpoint_id: (&EndpointId::from("")).into(),
project_id: (&ProjectId::from("")).into(),
branch_id: (&BranchId::from("")).into(),
cold_start_info: proxy::console::messages::ColdStartInfo::Unknown,
};
let metrics_aux: MetricsAuxInfo = Default::default();
// doesn't yet matter as pg-sni-router doesn't report analytics logs
ctx.set_success();

View File

@@ -10,7 +10,6 @@ use proxy::auth;
use proxy::auth::backend::MaybeOwned;
use proxy::cancellation::CancelMap;
use proxy::cancellation::CancellationHandler;
use proxy::config::remote_storage_from_toml;
use proxy::config::AuthenticationConfig;
use proxy::config::CacheOptions;
use proxy::config::HttpConfig;
@@ -192,19 +191,6 @@ struct ProxyCliArgs {
#[clap(flatten)]
parquet_upload: ParquetUploadArgs,
/// interval for backup metric collection
#[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
metric_backup_collection_interval: std::time::Duration,
/// remote storage configuration for backup metric collection
/// Encoded as toml (same format as pageservers), eg
/// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}`
#[clap(long, default_value = "{}")]
metric_backup_collection_remote_storage: String,
/// chunk size for backup metric collection
/// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression.
#[clap(long, default_value = "4194304")]
metric_backup_collection_chunk_size: usize,
}
#[derive(clap::Args, Clone, Copy, Debug)]
@@ -386,17 +372,12 @@ async fn main() -> anyhow::Result<()> {
// maintenance tasks. these never return unless there's an error
let mut maintenance_tasks = JoinSet::new();
maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone()));
maintenance_tasks.spawn(proxy::handle_signals(cancellation_token));
maintenance_tasks.spawn(http::health_server::task_main(http_listener));
maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener));
if let Some(metrics_config) = &config.metric_collection {
// TODO: Add gc regardles of the metric collection being enabled.
maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
client_tasks.spawn(usage_metrics::task_backup(
&metrics_config.backup_metric_collection_config,
cancellation_token,
));
}
if let auth::BackendType::Console(api, _) = &config.auth_backend {
@@ -453,13 +434,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
if args.allow_self_signed_compute {
warn!("allowing self-signed compute certificates");
}
let backup_metric_collection_config = config::MetricBackupCollectionConfig {
interval: args.metric_backup_collection_interval,
remote_storage_config: remote_storage_from_toml(
&args.metric_backup_collection_remote_storage,
)?,
chunk_size: args.metric_backup_collection_chunk_size,
};
let metric_collection = match (
&args.metric_collection_endpoint,
@@ -468,7 +442,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
(Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig {
endpoint: endpoint.parse()?,
interval: humantime::parse_duration(interval)?,
backup_metric_collection_config,
}),
(None, None) => None,
_ => bail!(

View File

@@ -16,7 +16,7 @@ use crate::{
config::ProjectInfoCacheOptions,
console::AuthSecret,
intern::{EndpointIdInt, ProjectIdInt, RoleNameInt},
EndpointId, RoleName,
EndpointId, ProjectId, RoleName,
};
use super::{Cache, Cached};
@@ -214,11 +214,14 @@ impl ProjectInfoCacheImpl {
}
pub fn insert_role_secret(
&self,
project_id: ProjectIdInt,
endpoint_id: EndpointIdInt,
role_name: RoleNameInt,
project_id: &ProjectId,
endpoint_id: &EndpointId,
role_name: &RoleName,
secret: Option<AuthSecret>,
) {
let project_id = ProjectIdInt::from(project_id);
let endpoint_id = EndpointIdInt::from(endpoint_id);
let role_name = RoleNameInt::from(role_name);
if self.cache.len() >= self.config.size {
// If there are too many entries, wait until the next gc cycle.
return;
@@ -231,10 +234,12 @@ impl ProjectInfoCacheImpl {
}
pub fn insert_allowed_ips(
&self,
project_id: ProjectIdInt,
endpoint_id: EndpointIdInt,
project_id: &ProjectId,
endpoint_id: &EndpointId,
allowed_ips: Arc<Vec<IpPattern>>,
) {
let project_id = ProjectIdInt::from(project_id);
let endpoint_id = EndpointIdInt::from(endpoint_id);
if self.cache.len() >= self.config.size {
// If there are too many entries, wait until the next gc cycle.
return;
@@ -353,7 +358,7 @@ impl Cache for ProjectInfoCacheImpl {
#[cfg(test)]
mod tests {
use super::*;
use crate::{scram::ServerSecret, ProjectId};
use crate::scram::ServerSecret;
#[tokio::test]
async fn test_project_info_cache_settings() {
@@ -364,8 +369,8 @@ mod tests {
ttl: Duration::from_secs(1),
gc_interval: Duration::from_secs(600),
});
let project_id: ProjectId = "project".into();
let endpoint_id: EndpointId = "endpoint".into();
let project_id = "project".into();
let endpoint_id = "endpoint".into();
let user1: RoleName = "user1".into();
let user2: RoleName = "user2".into();
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
@@ -374,23 +379,9 @@ mod tests {
"127.0.0.1".parse().unwrap(),
"127.0.0.2".parse().unwrap(),
]);
cache.insert_role_secret(
(&project_id).into(),
(&endpoint_id).into(),
(&user1).into(),
secret1.clone(),
);
cache.insert_role_secret(
(&project_id).into(),
(&endpoint_id).into(),
(&user2).into(),
secret2.clone(),
);
cache.insert_allowed_ips(
(&project_id).into(),
(&endpoint_id).into(),
allowed_ips.clone(),
);
cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
assert!(cached.cached());
@@ -402,12 +393,7 @@ mod tests {
// Shouldn't add more than 2 roles.
let user3: RoleName = "user3".into();
let secret3 = Some(AuthSecret::Scram(ServerSecret::mock([3; 32])));
cache.insert_role_secret(
(&project_id).into(),
(&endpoint_id).into(),
(&user3).into(),
secret3.clone(),
);
cache.insert_role_secret(&project_id, &endpoint_id, &user3, secret3.clone());
assert!(cache.get_role_secret(&endpoint_id, &user3).is_none());
let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
@@ -435,8 +421,8 @@ mod tests {
cache.clone().disable_ttl();
tokio::time::advance(Duration::from_secs(2)).await;
let project_id: ProjectId = "project".into();
let endpoint_id: EndpointId = "endpoint".into();
let project_id = "project".into();
let endpoint_id = "endpoint".into();
let user1: RoleName = "user1".into();
let user2: RoleName = "user2".into();
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
@@ -445,23 +431,9 @@ mod tests {
"127.0.0.1".parse().unwrap(),
"127.0.0.2".parse().unwrap(),
]);
cache.insert_role_secret(
(&project_id).into(),
(&endpoint_id).into(),
(&user1).into(),
secret1.clone(),
);
cache.insert_role_secret(
(&project_id).into(),
(&endpoint_id).into(),
(&user2).into(),
secret2.clone(),
);
cache.insert_allowed_ips(
(&project_id).into(),
(&endpoint_id).into(),
allowed_ips.clone(),
);
cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
tokio::time::advance(Duration::from_secs(2)).await;
// Nothing should be invalidated.
@@ -498,8 +470,8 @@ mod tests {
gc_interval: Duration::from_secs(600),
}));
let project_id: ProjectId = "project".into();
let endpoint_id: EndpointId = "endpoint".into();
let project_id = "project".into();
let endpoint_id = "endpoint".into();
let user1: RoleName = "user1".into();
let user2: RoleName = "user2".into();
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
@@ -508,20 +480,10 @@ mod tests {
"127.0.0.1".parse().unwrap(),
"127.0.0.2".parse().unwrap(),
]);
cache.insert_role_secret(
(&project_id).into(),
(&endpoint_id).into(),
(&user1).into(),
secret1.clone(),
);
cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
cache.clone().disable_ttl();
tokio::time::advance(Duration::from_millis(100)).await;
cache.insert_role_secret(
(&project_id).into(),
(&endpoint_id).into(),
(&user2).into(),
secret2.clone(),
);
cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
// Added before ttl was disabled + ttl should be still cached.
let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
@@ -535,11 +497,7 @@ mod tests {
assert!(cache.get_role_secret(&endpoint_id, &user2).is_none());
// Added after ttl was disabled + ttl should not be cached.
cache.insert_allowed_ips(
(&project_id).into(),
(&endpoint_id).into(),
allowed_ips.clone(),
);
cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
assert!(!cached.cached());

View File

@@ -276,7 +276,6 @@ impl ConnCfg {
let stream = connection.stream.into_inner();
info!(
cold_start_info = ctx.cold_start_info.as_str(),
"connected to compute node at {host} ({socket_addr}) sslmode={:?}",
self.0.get_ssl_mode()
);

View File

@@ -5,7 +5,6 @@ use crate::{
};
use anyhow::{bail, ensure, Context, Ok};
use itertools::Itertools;
use remote_storage::RemoteStorageConfig;
use rustls::{
crypto::ring::sign,
pki_types::{CertificateDer, PrivateKeyDer},
@@ -40,7 +39,6 @@ pub struct ProxyConfig {
pub struct MetricCollectionConfig {
pub endpoint: reqwest::Url,
pub interval: Duration,
pub backup_metric_collection_config: MetricBackupCollectionConfig,
}
pub struct TlsConfig {
@@ -313,21 +311,6 @@ impl CertResolver {
}
}
#[derive(Debug)]
pub struct MetricBackupCollectionConfig {
pub interval: Duration,
pub remote_storage_config: OptRemoteStorageConfig,
pub chunk_size: usize,
}
/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get
/// runtime type errors from the value parser we use.
pub type OptRemoteStorageConfig = Option<RemoteStorageConfig>;
pub fn remote_storage_from_toml(s: &str) -> anyhow::Result<OptRemoteStorageConfig> {
RemoteStorageConfig::from_toml(&s.parse()?)
}
/// Helper for cmdline cache options parsing.
#[derive(Debug)]
pub struct CacheOptions {

View File

@@ -3,7 +3,7 @@ use std::fmt;
use crate::auth::IpPattern;
use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
use crate::{BranchId, EndpointId, ProjectId};
/// Generic error response with human-readable description.
/// Note that we can't always present it to user as is.
@@ -18,7 +18,7 @@ pub struct ConsoleError {
pub struct GetRoleSecret {
pub role_secret: Box<str>,
pub allowed_ips: Option<Vec<IpPattern>>,
pub project_id: Option<ProjectIdInt>,
pub project_id: Option<ProjectId>,
}
// Manually implement debug to omit sensitive info.
@@ -93,47 +93,22 @@ impl fmt::Debug for DatabaseInfo {
/// Various labels for prometheus metrics.
/// Also known as `ProxyMetricsAuxInfo` in the console.
#[derive(Debug, Deserialize, Clone)]
#[derive(Debug, Deserialize, Clone, Default)]
pub struct MetricsAuxInfo {
pub endpoint_id: EndpointIdInt,
pub project_id: ProjectIdInt,
pub branch_id: BranchIdInt,
#[serde(default)]
pub cold_start_info: ColdStartInfo,
pub endpoint_id: EndpointId,
pub project_id: ProjectId,
pub branch_id: BranchId,
pub cold_start_info: Option<ColdStartInfo>,
}
#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy)]
#[derive(Debug, Default, Serialize, Deserialize, Clone)]
#[serde(rename_all = "snake_case")]
pub enum ColdStartInfo {
#[default]
Unknown,
/// Compute was already running
Warm,
#[serde(rename = "pool_hit")]
/// Compute was not running but there was an available VM
VmPoolHit,
#[serde(rename = "pool_miss")]
/// Compute was not running and there were no VMs available
VmPoolMiss,
// not provided by control plane
/// Connection available from HTTP pool
HttpPoolHit,
/// Cached connection info
WarmCached,
}
impl ColdStartInfo {
pub fn as_str(&self) -> &'static str {
match self {
ColdStartInfo::Unknown => "unknown",
ColdStartInfo::Warm => "warm",
ColdStartInfo::VmPoolHit => "pool_hit",
ColdStartInfo::VmPoolMiss => "pool_miss",
ColdStartInfo::HttpPoolHit => "http_pool_hit",
ColdStartInfo::WarmCached => "warm_cached",
}
}
Unknown = 0,
Warm = 1,
PoolHit = 2,
PoolMiss = 3,
}
#[cfg(test)]

Some files were not shown because too many files have changed in this diff Show More