Compare commits

..

3 Commits

Author SHA1 Message Date
Alexander Bayandin
7d175a92d6 DO NOT MERGE: run only 300gb benchmark 2023-11-07 17:48:30 +00:00
Alexander Bayandin
c1748cfe19 Add 300gb benchmark 2023-11-07 17:45:07 +00:00
Alexander Bayandin
e19b543ea2 Nightly Benchmark: simplify matrix 2023-11-07 17:44:38 +00:00
51 changed files with 499 additions and 1392 deletions

View File

@@ -22,11 +22,5 @@ platforms = [
# "x86_64-pc-windows-msvc",
]
[final-excludes]
# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
# from depending on workspace-hack because most of the dependencies are not used.
workspace-members = ["vm_monitor"]
# Write out exact versions rather than a semver range. (Defaults to false.)
# exact-versions = true

View File

@@ -17,9 +17,8 @@ assignees: ''
## Implementation ideas
```[tasklist]
### Tasks
```
## Tasks
- [ ]
## Other related tasks and Epics

View File

@@ -34,76 +34,6 @@ concurrency:
cancel-in-progress: true
jobs:
bench:
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: "neon-staging"
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
- uses: actions/checkout@v3
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-release-artifact
path: /tmp/neon/
prefix: latest
- name: Create Neon Project
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Run benchmark
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
# Set --sparse-ordering option of pytest-order plugin
# to ensure tests are running in order of appears in the file.
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py
env:
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
- name: Delete Neon Project
if: ${{ always() }}
uses: ./.github/actions/neon-project-delete
with:
project_id: ${{ steps.create-neon-project.outputs.project_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Create Allure report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
generate-matrices:
# Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
#
@@ -116,66 +46,18 @@ jobs:
runs-on: ubuntu-latest
outputs:
pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }}
tpch-compare-matrix: ${{ steps.tpch-compare-matrix.outputs.matrix }}
steps:
- name: Generate matrix for pgbench benchmark
id: pgbench-compare-matrix
run: |
matrix='{
"platform": [
"neon-captest-new",
"neon-captest-reuse",
"neonvm-captest-new"
],
"db_size": [ "10gb" ],
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "platform": "neon-captest-new", "db_size": "50gb" },
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" },
{ "platform": "neonvm-captest-new", "db_size": "50gb" }]
}'
if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
{ "platform": "rds-aurora", "db_size": "50gb"}]')
fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
- name: Generate matrix for OLAP benchmarks
id: olap-compare-matrix
run: |
matrix='{
"platform": [
"neon-captest-reuse"
"include": [
{ "platform": "neon-captest-new", "db_size": "300gb", "compute_units": "[7, 7]", "provisioner": "k8s-pod" },
{ "platform": "neonvm-captest-new", "db_size": "300gb", "compute_units": "[7, 7]", "provisioner": "k8s-neonvm" }
]
}'
if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
{ "platform": "rds-aurora" }]')
fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
- name: Generate matrix for TPC-H benchmarks
id: tpch-compare-matrix
run: |
matrix='{
"platform": [
"neon-captest-reuse"
],
"scale": [
"10"
]
}'
if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
{ "platform": "rds-aurora", "scale": "10" }]')
fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
pgbench-compare:
@@ -226,8 +108,8 @@ jobs:
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }}
provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}
compute_units: ${{ matrix.compute_units }}
provisioner: ${{ matrix.provisioner }}
- name: Set up Connection String
id: set-up-connstr
@@ -317,293 +199,3 @@ jobs:
slack-message: "Periodic perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
clickbench-compare:
# ClichBench DB for rds-aurora and rds-Postgres deployed to the same clusters
# we use for performance testing in pgbench-compare.
# Run this job only when pgbench-compare is finished to avoid the intersection.
# We might change it after https://github.com/neondatabase/neon/issues/2900.
#
# *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
# *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
if: ${{ !cancelled() }}
needs: [ generate-matrices, pgbench-compare ]
strategy:
fail-fast: false
matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }}
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: ${{ matrix.platform }}
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
- uses: actions/checkout@v3
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-release-artifact
path: /tmp/neon/
prefix: latest
- name: Add Postgres binaries to PATH
run: |
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
- name: Set up Connection String
id: set-up-connstr
run: |
case "${PLATFORM}" in
neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
;;
rds-aurora)
CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CLICKBENCH_10M_CONNSTR }}
;;
rds-postgres)
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }}
;;
*)
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
fi
psql ${CONNSTR} -c "${QUERY}"
- name: ClickBench benchmark
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance/test_perf_olap.py
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_clickbench
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
TEST_OLAP_SCALE: 10
- name: Create Allure report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic OLAP perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
tpch-compare:
# TCP-H DB for rds-aurora and rds-Postgres deployed to the same clusters
# we use for performance testing in pgbench-compare & clickbench-compare.
# Run this job only when clickbench-compare is finished to avoid the intersection.
# We might change it after https://github.com/neondatabase/neon/issues/2900.
#
# *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
if: ${{ !cancelled() }}
needs: [ generate-matrices, clickbench-compare ]
strategy:
fail-fast: false
matrix: ${{ fromJson(needs.generate-matrices.outputs.tpch-compare-matrix) }}
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: ${{ matrix.platform }}
TEST_OLAP_SCALE: ${{ matrix.scale }}
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
- uses: actions/checkout@v3
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-release-artifact
path: /tmp/neon/
prefix: latest
- name: Add Postgres binaries to PATH
run: |
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
- name: Get Connstring Secret Name
run: |
case "${PLATFORM}" in
neon-captest-reuse)
ENV_PLATFORM=CAPTEST_TPCH
;;
rds-aurora)
ENV_PLATFORM=RDS_AURORA_TPCH
;;
rds-postgres)
ENV_PLATFORM=RDS_AURORA_TPCH
;;
*)
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac
CONNSTR_SECRET_NAME="BENCHMARK_${ENV_PLATFORM}_S${TEST_OLAP_SCALE}_CONNSTR"
echo "CONNSTR_SECRET_NAME=${CONNSTR_SECRET_NAME}" >> $GITHUB_ENV
- name: Set up Connection String
id: set-up-connstr
run: |
CONNSTR=${{ secrets[env.CONNSTR_SECRET_NAME] }}
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
fi
psql ${CONNSTR} -c "${QUERY}"
- name: Run TPC-H benchmark
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance/test_perf_olap.py
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_tpch
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
TEST_OLAP_SCALE: ${{ matrix.scale }}
- name: Create Allure report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
user-examples-compare:
if: ${{ !cancelled() }}
needs: [ generate-matrices, tpch-compare ]
strategy:
fail-fast: false
matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }}
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: ${{ matrix.platform }}
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
- uses: actions/checkout@v3
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-release-artifact
path: /tmp/neon/
prefix: latest
- name: Add Postgres binaries to PATH
run: |
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
- name: Set up Connection String
id: set-up-connstr
run: |
case "${PLATFORM}" in
neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
;;
rds-aurora)
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_AURORA_CONNSTR }}
;;
rds-postgres)
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }}
;;
*)
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
fi
psql ${CONNSTR} -c "${QUERY}"
- name: Run user examples
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance/test_perf_olap.py
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_user_examples
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
- name: Create Allure report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic User example perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

3
Cargo.lock generated
View File

@@ -4064,7 +4064,6 @@ dependencies = [
"aws-config",
"aws-credential-types",
"aws-sdk-s3",
"aws-smithy-async",
"aws-smithy-http",
"aws-types",
"azure_core",
@@ -6056,6 +6055,7 @@ dependencies = [
"tokio-util",
"tracing",
"tracing-subscriber",
"workspace_hack",
]
[[package]]
@@ -6483,7 +6483,6 @@ dependencies = [
"clap",
"clap_builder",
"crossbeam-utils",
"dashmap",
"either",
"fail",
"futures",

View File

@@ -48,7 +48,6 @@ async-trait = "0.1"
aws-config = { version = "0.56", default-features = false, features=["rustls"] }
aws-sdk-s3 = "0.29"
aws-smithy-http = "0.56"
aws-smithy-async = { version = "0.56", default-features = false, features=["rt-tokio"] }
aws-credential-types = "0.56"
aws-types = "0.56"
axum = { version = "0.6.20", features = ["ws"] }
@@ -67,7 +66,7 @@ comfy-table = "6.1"
const_format = "0.2"
crc32c = "0.6"
crossbeam-utils = "0.8.5"
dashmap = { version = "5.5.0", features = ["raw-api"] }
dashmap = "5.5.0"
either = "1.8"
enum-map = "2.4.2"
enumset = "1.0.12"

View File

@@ -710,12 +710,8 @@ impl ComputeNode {
// `pg_ctl` for start / stop, so this just seems much easier to do as we already
// have opened connection to Postgres and superuser access.
#[instrument(skip_all)]
fn pg_reload_conf(&self) -> Result<()> {
let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl");
Command::new(pgctl_bin)
.args(["reload", "-D", &self.pgdata])
.output()
.expect("cannot run pg_ctl process");
fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
client.simple_query("SELECT pg_reload_conf()")?;
Ok(())
}
@@ -728,9 +724,9 @@ impl ComputeNode {
// Write new config
let pgdata_path = Path::new(&self.pgdata);
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
self.pg_reload_conf()?;
let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
self.pg_reload_conf(&mut client)?;
// Proceed with post-startup configuration. Note, that order of operations is important.
// Disable DDL forwarding because control plane already knows about these roles/databases.

View File

@@ -78,7 +78,7 @@ use regex::Regex;
use remote_storage::*;
use serde_json;
use std::io::Read;
use std::num::NonZeroUsize;
use std::num::{NonZeroU32, NonZeroUsize};
use std::path::Path;
use std::str;
use tar::Archive;
@@ -281,6 +281,8 @@ pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRem
max_keys_per_list_response: None,
};
let config = RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
storage: RemoteStorageKind::AwsS3(config),
};
GenericRemoteStorage::from_config(&config)

View File

@@ -17,7 +17,7 @@ use std::{fmt, io};
use std::{future::Future, str::FromStr};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_rustls::TlsAcceptor;
use tracing::{debug, error, info, trace, warn};
use tracing::{debug, error, info, trace};
use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
use pq_proto::{
@@ -35,11 +35,6 @@ pub enum QueryError {
/// We were instructed to shutdown while processing the query
#[error("Shutting down")]
Shutdown,
/// Authentication failure
#[error("Unauthorized: {0}")]
Unauthorized(std::borrow::Cow<'static, str>),
#[error("Simulated Connection Error")]
SimulatedConnectionError,
/// Some other error
#[error(transparent)]
Other(#[from] anyhow::Error),
@@ -54,9 +49,8 @@ impl From<io::Error> for QueryError {
impl QueryError {
pub fn pg_error_code(&self) -> &'static [u8; 5] {
match self {
Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
Self::Disconnected(_) => b"08006", // connection failure
Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
}
}
@@ -616,7 +610,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
self.write_message_noflush(&BeMessage::ErrorResponse(
&short_error(&e),
&e.to_string(),
Some(e.pg_error_code()),
))?;
return Err(e);
@@ -738,9 +732,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
if let Err(e) = handler.process_query(self, query_string).await {
match e {
QueryError::Shutdown => return Ok(ProcessMsgResult::Break),
QueryError::SimulatedConnectionError => {
return Err(QueryError::SimulatedConnectionError)
}
e => {
log_query_error(query_string, &e);
let short_error = short_error(&e);
@@ -975,8 +966,6 @@ pub fn short_error(e: &QueryError) -> String {
match e {
QueryError::Disconnected(connection_error) => connection_error.to_string(),
QueryError::Shutdown => "shutdown".to_string(),
QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
QueryError::Other(e) => format!("{e:#}"),
}
}
@@ -993,15 +982,9 @@ fn log_query_error(query: &str, e: &QueryError) {
QueryError::Disconnected(other_connection_error) => {
error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
}
QueryError::SimulatedConnectionError => {
error!("query handler for query '{query}' failed due to a simulated connection error")
}
QueryError::Shutdown => {
info!("query handler for '{query}' cancelled during tenant shutdown")
}
QueryError::Unauthorized(e) => {
warn!("query handler for '{query}' failed with authentication error: {e}");
}
QueryError::Other(e) => {
error!("query handler for '{query}' failed: {e:?}");
}

View File

@@ -8,7 +8,6 @@ license.workspace = true
anyhow.workspace = true
async-trait.workspace = true
once_cell.workspace = true
aws-smithy-async.workspace = true
aws-smithy-http.workspace = true
aws-types.workspace = true
aws-config.workspace = true

View File

@@ -14,7 +14,13 @@ mod local_fs;
mod s3_bucket;
mod simulate_failures;
use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc};
use std::{
collections::HashMap,
fmt::Debug,
num::{NonZeroU32, NonZeroUsize},
pin::Pin,
sync::Arc,
};
use anyhow::{bail, Context};
use camino::{Utf8Path, Utf8PathBuf};
@@ -30,6 +36,12 @@ pub use self::{
};
use s3_bucket::RequestKind;
/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach.
/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed.
pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
/// ~200 RPS for IAM services
/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
@@ -431,6 +443,10 @@ pub struct StorageMetadata(HashMap<String, String>);
/// External backup storage configuration, enough for creating a client for that storage.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteStorageConfig {
/// Max allowed number of concurrent sync operations between the API user and the remote storage.
pub max_concurrent_syncs: NonZeroUsize,
/// Max allowed errors before the sync task is considered failed and evicted.
pub max_sync_errors: NonZeroU32,
/// The storage connection configuration.
pub storage: RemoteStorageKind,
}
@@ -526,6 +542,18 @@ impl RemoteStorageConfig {
let use_azure = container_name.is_some() && container_region.is_some();
let max_concurrent_syncs = NonZeroUsize::new(
parse_optional_integer("max_concurrent_syncs", toml)?
.unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS),
)
.context("Failed to parse 'max_concurrent_syncs' as a positive integer")?;
let max_sync_errors = NonZeroU32::new(
parse_optional_integer("max_sync_errors", toml)?
.unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
)
.context("Failed to parse 'max_sync_errors' as a positive integer")?;
let default_concurrency_limit = if use_azure {
DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
} else {
@@ -607,7 +635,11 @@ impl RemoteStorageConfig {
}
};
Ok(Some(RemoteStorageConfig { storage }))
Ok(Some(RemoteStorageConfig {
max_concurrent_syncs,
max_sync_errors,
storage,
}))
}
}

View File

@@ -4,27 +4,23 @@
//! allowing multiple api users to independently work with the same S3 bucket, if
//! their bucket prefixes are both specified and different.
use std::{borrow::Cow, sync::Arc};
use std::borrow::Cow;
use anyhow::Context;
use aws_config::{
environment::credentials::EnvironmentVariableCredentialsProvider,
imds::credentials::ImdsCredentialsProvider,
meta::credentials::CredentialsProviderChain,
provider_config::ProviderConfig,
retry::{RetryConfigBuilder, RetryMode},
web_identity_token::WebIdentityTokenCredentialsProvider,
imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider,
};
use aws_credential_types::cache::CredentialsCache;
use aws_sdk_s3::{
config::{AsyncSleep, Config, Region, SharedAsyncSleep},
config::{Config, Region},
error::SdkError,
operation::get_object::GetObjectError,
primitives::ByteStream,
types::{Delete, ObjectIdentifier},
Client,
};
use aws_smithy_async::rt::sleep::TokioSleep;
use aws_smithy_http::body::SdkBody;
use hyper::Body;
use scopeguard::ScopeGuard;
@@ -87,23 +83,10 @@ impl S3Bucket {
.or_else("imds", ImdsCredentialsProvider::builder().build())
};
// AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
// We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling
// responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config. We set it to use at most one
// attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
let mut retry_config = RetryConfigBuilder::new();
retry_config
.set_max_attempts(Some(1))
.set_mode(Some(RetryMode::Adaptive));
let mut config_builder = Config::builder()
.region(region)
.credentials_cache(CredentialsCache::lazy())
.credentials_provider(credentials_provider)
.sleep_impl(SharedAsyncSleep::from(sleep_impl))
.retry_config(retry_config.build());
.credentials_provider(credentials_provider);
if let Some(custom_endpoint) = aws_config.endpoint.clone() {
config_builder = config_builder

View File

@@ -1,6 +1,6 @@
use std::collections::HashSet;
use std::env;
use std::num::NonZeroUsize;
use std::num::{NonZeroU32, NonZeroUsize};
use std::ops::ControlFlow;
use std::path::PathBuf;
use std::sync::Arc;
@@ -469,6 +469,8 @@ fn create_azure_client(
let random = rand::thread_rng().gen::<u32>();
let remote_storage_config = RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
max_sync_errors: NonZeroU32::new(5).unwrap(),
storage: RemoteStorageKind::AzureContainer(AzureConfig {
container_name: remote_storage_azure_container,
container_region: remote_storage_azure_region,

View File

@@ -1,6 +1,6 @@
use std::collections::HashSet;
use std::env;
use std::num::NonZeroUsize;
use std::num::{NonZeroU32, NonZeroUsize};
use std::ops::ControlFlow;
use std::path::PathBuf;
use std::sync::Arc;
@@ -396,6 +396,8 @@ fn create_s3_client(
let random = rand::thread_rng().gen::<u32>();
let remote_storage_config = RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
max_sync_errors: NonZeroU32::new(5).unwrap(),
storage: RemoteStorageKind::AwsS3(S3Config {
bucket_name: remote_storage_s3_bucket,
bucket_region: remote_storage_s3_region,

View File

@@ -2,7 +2,7 @@
use arc_swap::ArcSwap;
use serde;
use std::{borrow::Cow, fmt::Display, fs, sync::Arc};
use std::{fs, sync::Arc};
use anyhow::Result;
use camino::Utf8Path;
@@ -11,7 +11,7 @@ use jsonwebtoken::{
};
use serde::{Deserialize, Serialize};
use crate::{http::error::ApiError, id::TenantId};
use crate::id::TenantId;
/// Algorithm to use. We require EdDSA.
const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
@@ -54,7 +54,7 @@ impl SwappableJwtAuth {
pub fn swap(&self, jwt_auth: JwtAuth) {
self.0.swap(Arc::new(jwt_auth));
}
pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
pub fn decode(&self, token: &str) -> Result<TokenData<Claims>> {
self.0.load().decode(token)
}
}
@@ -65,24 +65,6 @@ impl std::fmt::Debug for SwappableJwtAuth {
}
}
#[derive(Clone, PartialEq, Eq, Hash, Debug)]
pub struct AuthError(pub Cow<'static, str>);
impl Display for AuthError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
impl From<AuthError> for ApiError {
fn from(_value: AuthError) -> Self {
// Don't pass on the value of the AuthError as a precautionary measure.
// Being intentionally vague in public error communication hurts debugability
// but it is more secure.
ApiError::Forbidden("JWT authentication error".to_string())
}
}
pub struct JwtAuth {
decoding_keys: Vec<DecodingKey>,
validation: Validation,
@@ -132,7 +114,7 @@ impl JwtAuth {
/// The function tries the stored decoding keys in succession,
/// and returns the first yielding a successful result.
/// If there is no working decoding key, it returns the last error.
pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
pub fn decode(&self, token: &str) -> Result<TokenData<Claims>> {
let mut res = None;
for decoding_key in &self.decoding_keys {
res = Some(decode(token, decoding_key, &self.validation));
@@ -141,9 +123,9 @@ impl JwtAuth {
}
}
if let Some(res) = res {
res.map_err(|e| AuthError(Cow::Owned(e.to_string())))
res.map_err(anyhow::Error::new)
} else {
Err(AuthError(Cow::Borrowed("no JWT decoding keys configured")))
anyhow::bail!("no JWT decoding keys configured")
}
}
}
@@ -184,9 +166,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
"#;
#[test]
fn test_decode() {
fn test_decode() -> Result<(), anyhow::Error> {
let expected_claims = Claims {
tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
scope: Scope::Tenant,
};
@@ -205,24 +187,28 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";
// Check it can be validated with the public key
let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
let claims_from_token = auth.decode(encoded_eddsa).unwrap().claims;
let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?]);
let claims_from_token = auth.decode(encoded_eddsa)?.claims;
assert_eq!(claims_from_token, expected_claims);
Ok(())
}
#[test]
fn test_encode() {
fn test_encode() -> Result<(), anyhow::Error> {
let claims = Claims {
tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
scope: Scope::Tenant,
};
let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519).unwrap();
let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519)?;
// decode it back
let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
let decoded = auth.decode(&encoded).unwrap();
let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?]);
let decoded = auth.decode(&encoded)?;
assert_eq!(decoded.claims, claims);
Ok(())
}
}

View File

@@ -1,4 +1,4 @@
use crate::auth::{AuthError, Claims, SwappableJwtAuth};
use crate::auth::{Claims, SwappableJwtAuth};
use crate::http::error::{api_error_handler, route_error_handler, ApiError};
use anyhow::Context;
use hyper::header::{HeaderName, AUTHORIZATION};
@@ -400,11 +400,9 @@ pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
})?;
let token = parse_token(header_value)?;
let data = auth.decode(token).map_err(|err| {
warn!("Authentication error: {err}");
// Rely on From<AuthError> for ApiError impl
err
})?;
let data = auth
.decode(token)
.map_err(|_| ApiError::Unauthorized("malformed jwt token".to_string()))?;
req.set_context(data.claims);
}
None => {
@@ -452,11 +450,12 @@ where
pub fn check_permission_with(
req: &Request<Body>,
check_permission: impl Fn(&Claims) -> Result<(), AuthError>,
check_permission: impl Fn(&Claims) -> Result<(), anyhow::Error>,
) -> Result<(), ApiError> {
match req.context::<Claims>() {
Some(claims) => Ok(check_permission(&claims)
.map_err(|_err| ApiError::Forbidden("JWT authentication error".to_string()))?),
Some(claims) => {
Ok(check_permission(&claims).map_err(|err| ApiError::Forbidden(err.to_string()))?)
}
None => Ok(()), // claims is None because auth is disabled
}
}

View File

@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use std::error::Error as StdError;
use thiserror::Error;
use tracing::{error, info, warn};
use tracing::{error, info};
#[derive(Debug, Error)]
pub enum ApiError {
@@ -118,9 +118,6 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
// Print a stack trace for Internal Server errors
match api_error {
ApiError::Forbidden(_) | ApiError::Unauthorized(_) => {
warn!("Error processing HTTP request: {api_error:#}")
}
ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),

View File

@@ -125,9 +125,6 @@ where
// Wake everyone with an error.
let mut internal = self.internal.lock().unwrap();
// Block any future waiters from starting
internal.shutdown = true;
// This will steal the entire waiters map.
// When we drop it all waiters will be woken.
mem::take(&mut internal.waiters)

View File

@@ -85,13 +85,6 @@ impl Gate {
warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await
}
/// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish. This
/// is usually analoguous for "Did shutdown finish?" for types that include a Gate, whereas checking
/// the CancellationToken on such types is analogous to "Did shutdown start?"
pub fn close_complete(&self) -> bool {
self.sem.is_closed()
}
async fn do_close(&self) {
tracing::debug!(gate = self.name, "Closing Gate...");
match self.sem.acquire_many(Self::MAX_UNITS).await {

View File

@@ -19,12 +19,13 @@ inotify.workspace = true
serde.workspace = true
serde_json.workspace = true
sysinfo.workspace = true
tokio = { workspace = true, features = ["rt-multi-thread"] }
tokio.workspace = true
tokio-postgres.workspace = true
tokio-stream.workspace = true
tokio-util.workspace = true
tracing.workspace = true
tracing-subscriber.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
[target.'cfg(target_os = "linux")'.dependencies]
cgroups-rs = "0.3.3"

View File

@@ -188,7 +188,6 @@ extern "C" fn recovery_download(
}
}
#[allow(clippy::unnecessary_cast)]
extern "C" fn wal_read(
sk: *mut Safekeeper,
buf: *mut ::std::os::raw::c_char,
@@ -422,7 +421,6 @@ impl std::fmt::Display for Level {
}
/// Take ownership of `Vec<u8>` from StringInfoData.
#[allow(clippy::unnecessary_cast)]
pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> {
if pg.data.is_null() {
return None;

View File

@@ -186,7 +186,7 @@ impl Wrapper {
.unwrap()
.into_bytes_with_nul();
assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut std::ffi::c_char;
let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut i8;
let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;

View File

@@ -1,21 +1,22 @@
use utils::auth::{AuthError, Claims, Scope};
use anyhow::{bail, Result};
use utils::auth::{Claims, Scope};
use utils::id::TenantId;
pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<()> {
match (&claims.scope, tenant_id) {
(Scope::Tenant, None) => Err(AuthError(
"Attempt to access management api with tenant scope. Permission denied".into(),
)),
(Scope::Tenant, None) => {
bail!("Attempt to access management api with tenant scope. Permission denied")
}
(Scope::Tenant, Some(tenant_id)) => {
if claims.tenant_id.unwrap() != tenant_id {
return Err(AuthError("Tenant id mismatch. Permission denied".into()));
bail!("Tenant id mismatch. Permission denied")
}
Ok(())
}
(Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
(Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
(Scope::SafekeeperData, _) => Err(AuthError(
"SafekeeperData scope makes no sense for Pageserver".into(),
)),
(Scope::SafekeeperData, _) => {
bail!("SafekeeperData scope makes no sense for Pageserver")
}
}
}

View File

@@ -1314,6 +1314,12 @@ broker_endpoint = '{broker_endpoint}'
assert_eq!(
parsed_remote_storage_config,
RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS
)
.unwrap(),
max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
.unwrap(),
storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
},
"Remote storage config should correctly parse the local FS config and fill other storage defaults"
@@ -1374,6 +1380,8 @@ broker_endpoint = '{broker_endpoint}'
assert_eq!(
parsed_remote_storage_config,
RemoteStorageConfig {
max_concurrent_syncs,
max_sync_errors,
storage: RemoteStorageKind::AwsS3(S3Config {
bucket_name: bucket_name.clone(),
bucket_region: bucket_region.clone(),

View File

@@ -893,6 +893,14 @@ mod test {
std::fs::create_dir_all(remote_fs_dir)?;
let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
let storage_config = RemoteStorageConfig {
max_concurrent_syncs: std::num::NonZeroUsize::new(
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
)
.unwrap(),
max_sync_errors: std::num::NonZeroU32::new(
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
)
.unwrap(),
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
};
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();

View File

@@ -352,8 +352,7 @@ paths:
in: query
required: true
schema:
type: string
format: hex
type: integer
description: A LSN to get the timestamp
responses:
"200":

View File

@@ -303,7 +303,11 @@ async fn build_timeline_info(
// we're executing this function, we will outlive the timeline on-disk state.
info.current_logical_size_non_incremental = Some(
timeline
.get_current_logical_size_non_incremental(info.last_record_lsn, ctx)
.get_current_logical_size_non_incremental(
info.last_record_lsn,
CancellationToken::new(),
ctx,
)
.await?,
);
}
@@ -1667,8 +1671,6 @@ where
);
match handle.await {
// TODO: never actually return Err from here, always Ok(...) so that we can log
// spanned errors. Call api_error_handler instead and return appropriate Body.
Ok(result) => result,
Err(e) => {
// The handler task panicked. We have a global panic handler that logs the

View File

@@ -1225,6 +1225,15 @@ pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_wait_seconds",
"Time spent waiting for access to the Postgres WAL redo process",
redo_histogram_time_buckets!(),
)
.expect("failed to define a metric")
});
pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_records_histogram",
@@ -1919,6 +1928,7 @@ pub fn preinitialize_metrics() {
&READ_NUM_FS_LAYERS,
&WAIT_LSN_TIME,
&WAL_REDO_TIME,
&WAL_REDO_WAIT_TIME,
&WAL_REDO_RECORDS_HISTOGRAM,
&WAL_REDO_BYTES_HISTOGRAM,
]

View File

@@ -218,27 +218,9 @@ async fn page_service_conn_main(
// no write timeout is used, because the kernel is assumed to error writes after some time.
let mut socket = tokio_io_timeout::TimeoutReader::new(socket);
let default_timeout_ms = 10 * 60 * 1000; // 10 minutes by default
let socket_timeout_ms = (|| {
fail::fail_point!("simulated-bad-compute-connection", |avg_timeout_ms| {
// Exponential distribution for simulating
// poor network conditions, expect about avg_timeout_ms to be around 15
// in tests
if let Some(avg_timeout_ms) = avg_timeout_ms {
let avg = avg_timeout_ms.parse::<i64>().unwrap() as f32;
let u = rand::random::<f32>();
((1.0 - u).ln() / (-avg)) as u64
} else {
default_timeout_ms
}
});
default_timeout_ms
})();
// A timeout here does not mean the client died, it can happen if it's just idle for
// a while: we will tear down this PageServerHandler and instantiate a new one if/when
// they reconnect.
socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms)));
// timeout should be lower, but trying out multiple days for
// <https://github.com/neondatabase/neon/issues/4205>
socket.set_timeout(Some(std::time::Duration::from_secs(60 * 60 * 24 * 3)));
let socket = std::pin::pin!(socket);
// XXX: pgbackend.run() should take the connection_ctx,
@@ -512,11 +494,7 @@ impl PageServerHandler {
};
if let Err(e) = &response {
// Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
// because wait_lsn etc will drop out
// is_stopping(): [`Timeline::flush_and_shutdown`] has entered
// is_canceled(): [`Timeline::shutdown`]` has entered
if timeline.cancel.is_cancelled() || timeline.is_stopping() {
if timeline.cancel.is_cancelled() {
// If we fail to fulfil a request during shutdown, which may be _because_ of
// shutdown, then do not send the error to the client. Instead just drop the
// connection.
@@ -920,7 +898,7 @@ impl PageServerHandler {
// when accessing management api supply None as an argument
// when using to authorize tenant pass corresponding tenant id
fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<(), QueryError> {
fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
if self.auth.is_none() {
// auth is set to Trust, nothing to check so just return ok
return Ok(());
@@ -932,7 +910,7 @@ impl PageServerHandler {
.claims
.as_ref()
.expect("claims presence already checked");
check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0))
check_permission(claims, tenant_id)
}
/// Shorthand for getting a reference to a Timeline of an Active tenant.
@@ -971,17 +949,16 @@ where
.auth
.as_ref()
.unwrap()
.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)
.map_err(|e| QueryError::Unauthorized(e.0))?;
.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
return Err(QueryError::Unauthorized(
"jwt token scope is Tenant, but tenant id is missing".into(),
));
return Err(QueryError::Other(anyhow::anyhow!(
"jwt token scope is Tenant, but tenant id is missing"
)));
}
debug!(
"jwt scope check succeeded for scope: {:#?} by tenant id: {:?}",
info!(
"jwt auth succeeded for scope: {:#?} by tenant id: {:?}",
data.claims.scope, data.claims.tenant_id,
);
@@ -1003,13 +980,9 @@ where
pgb: &mut PostgresBackend<IO>,
query_string: &str,
) -> Result<(), QueryError> {
fail::fail_point!("simulated-bad-compute-connection", |_| {
info!("Hit failpoint for bad connection");
Err(QueryError::SimulatedConnectionError)
});
let ctx = self.connection_ctx.attached_child();
debug!("process query {query_string:?}");
if query_string.starts_with("pagestream ") {
let (_, params_raw) = query_string.split_at("pagestream ".len());
let params = params_raw.split(' ').collect::<Vec<_>>();

View File

@@ -21,6 +21,7 @@ use serde::{Deserialize, Serialize};
use std::collections::{hash_map, HashMap, HashSet};
use std::ops::ControlFlow;
use std::ops::Range;
use tokio_util::sync::CancellationToken;
use tracing::{debug, trace, warn};
use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -577,6 +578,7 @@ impl Timeline {
pub async fn get_current_logical_size_non_incremental(
&self,
lsn: Lsn,
cancel: CancellationToken,
ctx: &RequestContext,
) -> Result<u64, CalculateLogicalSizeError> {
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
@@ -588,7 +590,7 @@ impl Timeline {
let mut total_size: u64 = 0;
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
if self.cancel.is_cancelled() {
if cancel.is_cancelled() {
return Err(CalculateLogicalSizeError::Cancelled);
}
let relsize_key = rel_size_to_key(rel);

View File

@@ -1841,13 +1841,7 @@ impl Tenant {
timelines.values().for_each(|timeline| {
let timeline = Arc::clone(timeline);
let span = Span::current();
js.spawn(async move {
if freeze_and_flush {
timeline.flush_and_shutdown().instrument(span).await
} else {
timeline.shutdown().instrument(span).await
}
});
js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await });
})
};
tracing::info!("Waiting for timelines...");
@@ -3534,6 +3528,10 @@ pub(crate) mod harness {
let remote_fs_dir = conf.workdir.join("localfs");
std::fs::create_dir_all(&remote_fs_dir).unwrap();
let config = RemoteStorageConfig {
// TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
// TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
};
let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
@@ -4653,6 +4651,74 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn test_write_at_initdb_lsn_takes_optimization_code_path() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable")?
.load()
.await;
let initdb_lsn = Lsn(0x20);
let utline = tenant
.create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx)
.await?;
let tline = utline.raw_timeline().unwrap();
// Spawn flush loop now so that we can set the `expect_initdb_optimization`
tline.maybe_spawn_flush_loop();
// Make sure the timeline has the minimum set of required keys for operation.
// The only operation you can always do on an empty timeline is to `put` new data.
// Except if you `put` at `initdb_lsn`.
// In that case, there's an optimization to directly create image layers instead of delta layers.
// It uses `repartition()`, which assumes some keys to be present.
// Let's make sure the test timeline can handle that case.
{
let mut state = tline.flush_loop_state.lock().unwrap();
assert_eq!(
timeline::FlushLoopState::Running {
expect_initdb_optimization: false,
initdb_optimization_count: 0,
},
*state
);
*state = timeline::FlushLoopState::Running {
expect_initdb_optimization: true,
initdb_optimization_count: 0,
};
}
// Make writes at the initdb_lsn. When we flush it below, it should be handled by the optimization.
// As explained above, the optimization requires some keys to be present.
// As per `create_empty_timeline` documentation, use init_empty to set them.
// This is what `create_test_timeline` does, by the way.
let mut modification = tline.begin_modification(initdb_lsn);
modification
.init_empty_test_timeline()
.context("init_empty_test_timeline")?;
modification
.commit(&ctx)
.await
.context("commit init_empty_test_timeline modification")?;
// Do the flush. The flush code will check the expectations that we set above.
tline.freeze_and_flush().await?;
// assert freeze_and_flush exercised the initdb optimization
{
let state = tline.flush_loop_state.lock().unwrap();
let timeline::FlushLoopState::Running {
expect_initdb_optimization,
initdb_optimization_count,
} = *state
else {
panic!("unexpected state: {:?}", *state);
};
assert!(expect_initdb_optimization);
assert!(initdb_optimization_count > 0);
}
Ok(())
}
#[tokio::test]
async fn test_uninit_mark_crash() -> anyhow::Result<()> {
let name = "test_uninit_mark_crash";
@@ -4665,7 +4731,7 @@ mod tests {
// Keeps uninit mark in place
let raw_tline = tline.raw_timeline().unwrap();
raw_tline
.shutdown()
.shutdown(false)
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_id))
.await;
std::mem::forget(tline);

View File

@@ -327,7 +327,7 @@ mod tests {
let mut sz: u16 = rng.gen();
// Make 50% of the arrays small
if rng.gen() {
sz &= 63;
sz |= 63;
}
random_array(sz.into())
})

View File

@@ -1552,7 +1552,14 @@ impl SlotGuard {
/// is responsible for protecting
fn old_value_is_shutdown(&self) -> bool {
match self.old_value.as_ref() {
Some(TenantSlot::Attached(tenant)) => tenant.gate.close_complete(),
Some(TenantSlot::Attached(tenant)) => {
// TODO: PR #5711 will add a gate that enables properly checking that
// shutdown completed.
matches!(
tenant.current_state(),
TenantState::Stopping { .. } | TenantState::Broken { .. }
)
}
Some(TenantSlot::Secondary) => {
// TODO: when adding secondary mode tenants, this will check for shutdown
// in the same way that we do for `Tenant` above

View File

@@ -6,6 +6,7 @@ use std::sync::Arc;
use anyhow::{bail, Context};
use tokio::sync::oneshot::error::RecvError;
use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;
use crate::context::RequestContext;
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
@@ -349,6 +350,10 @@ async fn fill_logical_sizes(
// our advantage with `?` error handling.
let mut joinset = tokio::task::JoinSet::new();
let cancel = tokio_util::sync::CancellationToken::new();
// be sure to cancel all spawned tasks if we are dropped
let _dg = cancel.clone().drop_guard();
// For each point that would benefit from having a logical size available,
// spawn a Task to fetch it, unless we have it cached already.
for seg in segments.iter() {
@@ -366,8 +371,15 @@ async fn fill_logical_sizes(
let parallel_size_calcs = Arc::clone(limit);
let ctx = ctx.attached_child();
joinset.spawn(
calculate_logical_size(parallel_size_calcs, timeline, lsn, cause, ctx)
.in_current_span(),
calculate_logical_size(
parallel_size_calcs,
timeline,
lsn,
cause,
ctx,
cancel.child_token(),
)
.in_current_span(),
);
}
e.insert(cached_size);
@@ -475,13 +487,14 @@ async fn calculate_logical_size(
lsn: utils::lsn::Lsn,
cause: LogicalSizeCalculationCause,
ctx: RequestContext,
cancel: CancellationToken,
) -> Result<TimelineAtLsnSizeResult, RecvError> {
let _permit = tokio::sync::Semaphore::acquire_owned(limit)
.await
.expect("global semaphore should not had been closed");
let size_res = timeline
.spawn_ondemand_logical_size_calculation(lsn, cause, ctx)
.spawn_ondemand_logical_size_calculation(lsn, cause, ctx, cancel)
.instrument(info_span!("spawn_ondemand_logical_size_calculation"))
.await?;
Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))

View File

@@ -36,6 +36,7 @@ use std::time::{Duration, Instant, SystemTime};
use crate::context::{
AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
};
use crate::deletion_queue::DeletionQueueClient;
use crate::tenant::storage_layer::delta_layer::DeltaEntry;
use crate::tenant::storage_layer::{
AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer,
@@ -49,7 +50,6 @@ use crate::tenant::{
metadata::{save_metadata, TimelineMetadata},
par_fsync,
};
use crate::{deletion_queue::DeletionQueueClient, tenant::remote_timeline_client::StopError};
use crate::config::PageServerConf;
use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum};
@@ -95,7 +95,12 @@ use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenant
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub(super) enum FlushLoopState {
NotStarted,
Running,
Running {
#[cfg(test)]
expect_initdb_optimization: bool,
#[cfg(test)]
initdb_optimization_count: usize,
},
Exited,
}
@@ -242,7 +247,7 @@ pub struct Timeline {
/// the flush finishes. You can use that to wait for the flush to finish.
layer_flush_start_tx: tokio::sync::watch::Sender<u64>,
/// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>,
layer_flush_done_tx: tokio::sync::watch::Sender<(u64, anyhow::Result<()>)>,
/// Layer removal lock.
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
@@ -369,19 +374,6 @@ pub enum PageReconstructError {
WalRedo(anyhow::Error),
}
#[derive(thiserror::Error, Debug)]
enum FlushLayerError {
/// Timeline cancellation token was cancelled
#[error("timeline shutting down")]
Cancelled,
#[error(transparent)]
PageReconstructError(#[from] PageReconstructError),
#[error(transparent)]
Other(#[from] anyhow::Error),
}
impl std::fmt::Debug for PageReconstructError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
match self {
@@ -899,16 +891,15 @@ impl Timeline {
self.launch_eviction_task(background_jobs_can_start);
}
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
/// also to remote storage. This method can easily take multiple seconds for a busy timeline.
///
/// While we are flushing, we continue to accept read I/O.
#[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
pub(crate) async fn flush_and_shutdown(&self) {
pub async fn shutdown(self: &Arc<Self>, freeze_and_flush: bool) {
debug_assert_current_span_has_tenant_and_timeline_id();
// Stop ingesting data, so that we are not still writing to an InMemoryLayer while
// trying to flush
// Signal any subscribers to our cancellation token to drop out
tracing::debug!("Cancelling CancellationToken");
self.cancel.cancel();
// prevent writes to the InMemoryLayer
tracing::debug!("Waiting for WalReceiverManager...");
task_mgr::shutdown_tasks(
Some(TaskKind::WalReceiverManager),
@@ -917,70 +908,40 @@ impl Timeline {
)
.await;
// Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance
self.last_record_lsn.shutdown();
// now all writers to InMemory layer are gone, do the final flush if requested
match self.freeze_and_flush().await {
Ok(_) => {
// drain the upload queue
if let Some(client) = self.remote_client.as_ref() {
// if we did not wait for completion here, it might be our shutdown process
// didn't wait for remote uploads to complete at all, as new tasks can forever
// be spawned.
//
// what is problematic is the shutting down of RemoteTimelineClient, because
// obviously it does not make sense to stop while we wait for it, but what
// about corner cases like s3 suddenly hanging up?
if let Err(e) = client.wait_completion().await {
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
// we have some extra WAL replay to do next time the timeline starts.
warn!("failed to flush to remote storage: {e:#}");
}
if freeze_and_flush {
match self.freeze_and_flush().await {
Ok(()) => {}
Err(e) => {
warn!("failed to freeze and flush: {e:#}");
return; // TODO: should probably drain remote timeline client anyways?
}
}
Err(e) => {
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
// we have some extra WAL replay to do next time the timeline starts.
warn!("failed to freeze and flush: {e:#}");
// drain the upload queue
let res = if let Some(client) = self.remote_client.as_ref() {
// if we did not wait for completion here, it might be our shutdown process
// didn't wait for remote uploads to complete at all, as new tasks can forever
// be spawned.
//
// what is problematic is the shutting down of RemoteTimelineClient, because
// obviously it does not make sense to stop while we wait for it, but what
// about corner cases like s3 suddenly hanging up?
client.wait_completion().await
} else {
Ok(())
};
if let Err(e) = res {
warn!("failed to await for frozen and flushed uploads: {e:#}");
}
}
self.shutdown().await;
}
/// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of
/// the graceful [`Timeline::flush_and_shutdown`] function.
pub(crate) async fn shutdown(&self) {
// Signal any subscribers to our cancellation token to drop out
tracing::debug!("Cancelling CancellationToken");
self.cancel.cancel();
// Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel
// while doing so.
self.last_record_lsn.shutdown();
// Shut down the layer flush task before the remote client, as one depends on the other
task_mgr::shutdown_tasks(
Some(TaskKind::LayerFlushTask),
Some(self.tenant_id),
Some(self.timeline_id),
)
.await;
// Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in
// case our caller wants to use that for a deletion
if let Some(remote_client) = self.remote_client.as_ref() {
match remote_client.stop() {
Ok(()) => {}
Err(StopError::QueueUninitialized) => {
// Shutting down during initialization is legal
}
}
}
tracing::debug!("Waiting for tasks...");
task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(self.timeline_id)).await;
// Finally wait until any gate-holders are complete
@@ -1024,12 +985,7 @@ impl Timeline {
reason,
backtrace: backtrace_str,
};
self.set_state(broken_state);
// Although the Broken state is not equivalent to shutdown() (shutdown will be called
// later when this tenant is detach or the process shuts down), firing the cancellation token
// here avoids the need for other tasks to watch for the Broken state explicitly.
self.cancel.cancel();
self.set_state(broken_state)
}
pub fn current_state(&self) -> TimelineState {
@@ -1461,7 +1417,7 @@ impl Timeline {
let mut flush_loop_state = self.flush_loop_state.lock().unwrap();
match *flush_loop_state {
FlushLoopState::NotStarted => (),
FlushLoopState::Running => {
FlushLoopState::Running { .. } => {
info!(
"skipping attempt to start flush_loop twice {}/{}",
self.tenant_id, self.timeline_id
@@ -1481,7 +1437,12 @@ impl Timeline {
let self_clone = Arc::clone(self);
debug!("spawning flush loop");
*flush_loop_state = FlushLoopState::Running;
*flush_loop_state = FlushLoopState::Running {
#[cfg(test)]
expect_initdb_optimization: false,
#[cfg(test)]
initdb_optimization_count: 0,
};
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::LayerFlushTask,
@@ -1493,7 +1454,7 @@ impl Timeline {
let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error);
self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await;
let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap();
assert!(matches!(*flush_loop_state, FlushLoopState::Running));
assert!(matches!(*flush_loop_state, FlushLoopState::Running{ ..}));
*flush_loop_state = FlushLoopState::Exited;
Ok(())
}
@@ -1780,8 +1741,12 @@ impl Timeline {
// delay will be terminated by a timeout regardless.
let _completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() };
// no extra cancellation here, because nothing really waits for this to complete compared
// to spawn_ondemand_logical_size_calculation.
let cancel = CancellationToken::new();
let calculated_size = match self_clone
.logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx)
.logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel)
.await
{
Ok(s) => s,
@@ -1850,6 +1815,7 @@ impl Timeline {
lsn: Lsn,
cause: LogicalSizeCalculationCause,
ctx: RequestContext,
cancel: CancellationToken,
) -> oneshot::Receiver<Result<u64, CalculateLogicalSizeError>> {
let (sender, receiver) = oneshot::channel();
let self_clone = Arc::clone(self);
@@ -1870,7 +1836,7 @@ impl Timeline {
false,
async move {
let res = self_clone
.logical_size_calculation_task(lsn, cause, &ctx)
.logical_size_calculation_task(lsn, cause, &ctx, cancel)
.await;
let _ = sender.send(res).ok();
Ok(()) // Receiver is responsible for handling errors
@@ -1886,28 +1852,58 @@ impl Timeline {
lsn: Lsn,
cause: LogicalSizeCalculationCause,
ctx: &RequestContext,
cancel: CancellationToken,
) -> Result<u64, CalculateLogicalSizeError> {
span::debug_assert_current_span_has_tenant_and_timeline_id();
let _guard = self.gate.enter();
let mut timeline_state_updates = self.subscribe_for_state_updates();
let self_calculation = Arc::clone(self);
let mut calculation = pin!(async {
let cancel = cancel.child_token();
let ctx = ctx.attached_child();
self_calculation
.calculate_logical_size(lsn, cause, &ctx)
.calculate_logical_size(lsn, cause, cancel, &ctx)
.await
});
let timeline_state_cancellation = async {
loop {
match timeline_state_updates.changed().await {
Ok(()) => {
let new_state = timeline_state_updates.borrow().clone();
match new_state {
// we're running this job for active timelines only
TimelineState::Active => continue,
TimelineState::Broken { .. }
| TimelineState::Stopping
| TimelineState::Loading => {
break format!("aborted because timeline became inactive (new state: {new_state:?})")
}
}
}
Err(_sender_dropped_error) => {
// can't happen, the sender is not dropped as long as the Timeline exists
break "aborted because state watch was dropped".to_string();
}
}
}
};
let taskmgr_shutdown_cancellation = async {
task_mgr::shutdown_watcher().await;
"aborted because task_mgr shutdown requested".to_string()
};
tokio::select! {
res = &mut calculation => { res }
_ = self.cancel.cancelled() => {
debug!("cancelling logical size calculation for timeline shutdown");
reason = timeline_state_cancellation => {
debug!(reason = reason, "cancelling calculation");
cancel.cancel();
calculation.await
}
_ = task_mgr::shutdown_watcher() => {
debug!("cancelling logical size calculation for task shutdown");
reason = taskmgr_shutdown_cancellation => {
debug!(reason = reason, "cancelling calculation");
cancel.cancel();
calculation.await
}
}
@@ -1921,6 +1917,7 @@ impl Timeline {
&self,
up_to_lsn: Lsn,
cause: LogicalSizeCalculationCause,
cancel: CancellationToken,
ctx: &RequestContext,
) -> Result<u64, CalculateLogicalSizeError> {
info!(
@@ -1963,7 +1960,7 @@ impl Timeline {
};
let timer = storage_time_metrics.start_timer();
let logical_size = self
.get_current_logical_size_non_incremental(up_to_lsn, ctx)
.get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx)
.await?;
debug!("calculated logical size: {logical_size}");
timer.stop_and_record();
@@ -2376,10 +2373,6 @@ impl Timeline {
info!("started flush loop");
loop {
tokio::select! {
_ = self.cancel.cancelled() => {
info!("shutting down layer flush task");
break;
},
_ = task_mgr::shutdown_watcher() => {
info!("shutting down layer flush task");
break;
@@ -2391,14 +2384,6 @@ impl Timeline {
let timer = self.metrics.flush_time_histo.start_timer();
let flush_counter = *layer_flush_start_rx.borrow();
let result = loop {
if self.cancel.is_cancelled() {
info!("dropping out of flush loop for timeline shutdown");
// Note: we do not bother transmitting into [`layer_flush_done_tx`], because
// anyone waiting on that will respect self.cancel as well: they will stop
// waiting at the same time we as drop out of this loop.
return;
}
let layer_to_flush = {
let guard = self.layers.read().await;
guard.layer_map().frozen_layers.front().cloned()
@@ -2407,18 +2392,9 @@ impl Timeline {
let Some(layer_to_flush) = layer_to_flush else {
break Ok(());
};
match self.flush_frozen_layer(layer_to_flush, ctx).await {
Ok(()) => {}
Err(FlushLayerError::Cancelled) => {
info!("dropping out of flush loop for timeline shutdown");
return;
}
err @ Err(
FlushLayerError::Other(_) | FlushLayerError::PageReconstructError(_),
) => {
error!("could not flush frozen layer: {err:?}");
break err;
}
if let Err(err) = self.flush_frozen_layer(layer_to_flush, ctx).await {
error!("could not flush frozen layer: {err:?}");
break Err(err);
}
};
// Notify any listeners that we're done
@@ -2440,7 +2416,7 @@ impl Timeline {
let mut my_flush_request = 0;
let flush_loop_state = { *self.flush_loop_state.lock().unwrap() };
if !matches!(flush_loop_state, FlushLoopState::Running) {
if !matches!(flush_loop_state, FlushLoopState::Running { .. }) {
anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
}
@@ -2467,17 +2443,7 @@ impl Timeline {
}
}
trace!("waiting for flush to complete");
tokio::select! {
rx_e = rx.changed() => {
rx_e?;
},
// Cancellation safety: we are not leaving an I/O in-flight for the flush, we're just ignoring
// the notification from [`flush_loop`] that it completed.
_ = self.cancel.cancelled() => {
tracing::info!("Cancelled layer flush due on timeline shutdown");
return Ok(())
}
};
rx.changed().await?;
trace!("done")
}
}
@@ -2492,13 +2458,61 @@ impl Timeline {
self: &Arc<Self>,
frozen_layer: Arc<InMemoryLayer>,
ctx: &RequestContext,
) -> Result<(), FlushLayerError> {
) -> anyhow::Result<()> {
// As a special case, when we have just imported an image into the repository,
// instead of writing out a L0 delta layer, we directly write out image layer
// files instead. This is possible as long as *all* the data imported into the
// repository have the same LSN.
let lsn_range = frozen_layer.get_lsn_range();
let layer = self.create_delta_layer(&frozen_layer, ctx).await?;
if self.cancel.is_cancelled() {
return Err(FlushLayerError::Cancelled);
}
let (layers_to_upload, delta_layer_to_add) =
if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) {
#[cfg(test)]
match &mut *self.flush_loop_state.lock().unwrap() {
FlushLoopState::NotStarted | FlushLoopState::Exited => {
panic!("flush loop not running")
}
FlushLoopState::Running {
initdb_optimization_count,
..
} => {
*initdb_optimization_count += 1;
}
}
// Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
// require downloading anything during initial import.
let (partitioning, _lsn) = self
.repartition(self.initdb_lsn, self.get_compaction_target_size(), ctx)
.await?;
// For image layers, we add them immediately into the layer map.
(
self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx)
.await?,
None,
)
} else {
#[cfg(test)]
match &mut *self.flush_loop_state.lock().unwrap() {
FlushLoopState::NotStarted | FlushLoopState::Exited => {
panic!("flush loop not running")
}
FlushLoopState::Running {
expect_initdb_optimization,
..
} => {
assert!(!*expect_initdb_optimization, "expected initdb optimization");
}
}
// Normal case, write out a L0 delta layer file.
// `create_delta_layer` will not modify the layer map.
// We will remove frozen layer and add delta layer in one atomic operation later.
let layer = self.create_delta_layer(&frozen_layer, ctx).await?;
(
// FIXME: even though we have a single image and single delta layer assumption
// we push them to vec
vec![layer.clone()],
Some(layer),
)
};
let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
@@ -2509,21 +2523,18 @@ impl Timeline {
let metadata = {
let mut guard = self.layers.write().await;
if self.cancel.is_cancelled() {
return Err(FlushLayerError::Cancelled);
}
guard.finish_flush_l0_layer(&layer, &frozen_layer, &self.metrics);
guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics);
if disk_consistent_lsn != old_disk_consistent_lsn {
assert!(disk_consistent_lsn > old_disk_consistent_lsn);
self.disk_consistent_lsn.store(disk_consistent_lsn);
// Schedule remote uploads that will reflect our new disk_consistent_lsn
Some(self.schedule_uploads(disk_consistent_lsn, [layer])?)
Some(self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?)
} else {
None
}
// release lock on 'layers'
};
// FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,

View File

@@ -326,7 +326,8 @@ impl Timeline {
match state.last_layer_access_imitation {
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
_ => {
self.imitate_timeline_cached_layer_accesses(ctx).await;
self.imitate_timeline_cached_layer_accesses(cancel, ctx)
.await;
state.last_layer_access_imitation = Some(tokio::time::Instant::now())
}
}
@@ -366,12 +367,21 @@ impl Timeline {
/// Recompute the values which would cause on-demand downloads during restart.
#[instrument(skip_all)]
async fn imitate_timeline_cached_layer_accesses(&self, ctx: &RequestContext) {
async fn imitate_timeline_cached_layer_accesses(
&self,
cancel: &CancellationToken,
ctx: &RequestContext,
) {
let lsn = self.get_last_record_lsn();
// imitiate on-restart initial logical size
let size = self
.calculate_logical_size(lsn, LogicalSizeCalculationCause::EvictionTaskImitation, ctx)
.calculate_logical_size(
lsn,
LogicalSizeCalculationCause::EvictionTaskImitation,
cancel.clone(),
ctx,
)
.instrument(info_span!("calculate_logical_size"))
.await;

View File

@@ -164,7 +164,7 @@ impl LayerManager {
/// Flush a frozen layer and add the written delta layer to the layer map.
pub(crate) fn finish_flush_l0_layer(
&mut self,
delta_layer: &ResidentLayer,
delta_layer: Option<&ResidentLayer>,
frozen_layer_for_check: &Arc<InMemoryLayer>,
metrics: &TimelineMetrics,
) {
@@ -179,14 +179,12 @@ impl LayerManager {
// layer to disk at the same time, that would not work.
assert_eq!(Arc::as_ptr(&inmem), Arc::as_ptr(frozen_layer_for_check));
let mut updates = self.layer_map.batch_update();
Self::insert_historic_layer(
delta_layer.as_ref().clone(),
&mut updates,
&mut self.layer_fmgr,
);
metrics.record_new_file_metrics(delta_layer.layer_desc().file_size);
updates.flush();
if let Some(l) = delta_layer {
let mut updates = self.layer_map.batch_update();
Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
metrics.record_new_file_metrics(l.layer_desc().file_size);
updates.flush();
}
}
/// Called when compaction is completed.

View File

@@ -44,6 +44,7 @@ use std::sync::atomic::{AtomicUsize, Ordering};
use crate::config::PageServerConf;
use crate::metrics::{
WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
WAL_REDO_WAIT_TIME,
};
use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
use crate::repository::Key;
@@ -206,8 +207,11 @@ impl PostgresRedoManager {
) -> anyhow::Result<Bytes> {
let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
const MAX_RETRY_ATTEMPTS: u32 = 1;
let start_time = Instant::now();
let mut n_attempts = 0u32;
loop {
let lock_time = Instant::now();
// launch the WAL redo process on first use
let proc: Arc<WalRedoProcess> = {
let proc_guard = self.redo_process.read().unwrap();
@@ -232,7 +236,7 @@ impl PostgresRedoManager {
}
};
let started_at = std::time::Instant::now();
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
// Relational WAL records are applied using wal-redo-postgres
let buf_tag = BufferTag { rel, blknum };
@@ -240,7 +244,8 @@ impl PostgresRedoManager {
.apply_wal_records(buf_tag, &base_img, records, wal_redo_timeout)
.context("apply_wal_records");
let duration = started_at.elapsed();
let end_time = Instant::now();
let duration = end_time.duration_since(lock_time);
let len = records.len();
let nbytes = records.iter().fold(0, |acumulator, record| {

View File

@@ -19,10 +19,7 @@
#include "access/xlog.h"
#include "access/xlogutils.h"
#include "storage/buf_internals.h"
#include "storage/lwlock.h"
#include "storage/ipc.h"
#include "c.h"
#include "postmaster/interrupt.h"
#include "libpq-fe.h"
#include "libpq/pqformat.h"
@@ -64,63 +61,23 @@ int flush_every_n_requests = 8;
int n_reconnect_attempts = 0;
int max_reconnect_attempts = 60;
#define MAX_PAGESERVER_CONNSTRING_SIZE 256
typedef struct
{
LWLockId lock;
pg_atomic_uint64 update_counter;
char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
} PagestoreShmemState;
#if PG_VERSION_NUM >= 150000
static shmem_request_hook_type prev_shmem_request_hook = NULL;
static void walproposer_shmem_request(void);
#endif
static shmem_startup_hook_type prev_shmem_startup_hook;
static PagestoreShmemState *pagestore_shared;
static uint64 pagestore_local_counter = 0;
static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
static bool pageserver_flush(void);
static void pageserver_disconnect(void);
static bool
CheckPageserverConnstring(char **newval, void **extra, GucSource source)
{
return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
}
static pqsigfunc prev_signal_handler;
static void
AssignPageserverConnstring(const char *newval, void *extra)
pageserver_sighup_handler(SIGNAL_ARGS)
{
if(!pagestore_shared)
return;
LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
LWLockRelease(pagestore_shared->lock);
}
static bool
CheckConnstringUpdated()
{
if(!pagestore_shared)
return false;
return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
}
static void
ReloadConnstring()
{
if(!pagestore_shared)
return;
LWLockAcquire(pagestore_shared->lock, LW_SHARED);
strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
LWLockRelease(pagestore_shared->lock);
if (prev_signal_handler)
{
prev_signal_handler(postgres_signal_arg);
}
neon_log(LOG, "Received SIGHUP, disconnecting pageserver. New pageserver connstring is %s", page_server_connstring);
pageserver_disconnect();
}
static bool
@@ -134,11 +91,6 @@ pageserver_connect(int elevel)
Assert(!connected);
if(CheckConnstringUpdated())
{
ReloadConnstring();
}
/*
* Connect using the connection string we got from the
* neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
@@ -158,7 +110,7 @@ pageserver_connect(int elevel)
n++;
}
keywords[n] = "dbname";
values[n] = local_pageserver_connstring;
values[n] = page_server_connstring;
n++;
keywords[n] = NULL;
values[n] = NULL;
@@ -302,12 +254,6 @@ pageserver_send(NeonRequest * request)
{
StringInfoData req_buff;
if(CheckConnstringUpdated())
{
pageserver_disconnect();
ReloadConnstring();
}
/* If the connection was lost for some reason, reconnect */
if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
{
@@ -328,7 +274,6 @@ pageserver_send(NeonRequest * request)
{
while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
{
HandleMainLoopInterrupts();
n_reconnect_attempts += 1;
pg_usleep(RECONNECT_INTERVAL_USEC);
}
@@ -446,8 +391,7 @@ pageserver_flush(void)
return true;
}
page_server_api api =
{
page_server_api api = {
.send = pageserver_send,
.flush = pageserver_flush,
.receive = pageserver_receive
@@ -461,72 +405,12 @@ check_neon_id(char **newval, void **extra, GucSource source)
return **newval == '\0' || HexDecodeString(id, *newval, 16);
}
static Size
PagestoreShmemSize(void)
{
return sizeof(PagestoreShmemState);
}
static bool
PagestoreShmemInit(void)
{
bool found;
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
pagestore_shared = ShmemInitStruct("libpagestore shared state",
PagestoreShmemSize(),
&found);
if(!found)
{
pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
AssignPageserverConnstring(page_server_connstring, NULL);
}
LWLockRelease(AddinShmemInitLock);
return found;
}
static void
pagestore_shmem_startup_hook(void)
{
if(prev_shmem_startup_hook)
prev_shmem_startup_hook();
PagestoreShmemInit();
}
static void
pagestore_shmem_request(void)
{
#if PG_VERSION_NUM >= 150000
if(prev_shmem_request_hook)
prev_shmem_request_hook();
#endif
RequestAddinShmemSpace(PagestoreShmemSize());
RequestNamedLWLockTranche("neon_libpagestore", 1);
}
static void
pagestore_prepare_shmem(void)
{
#if PG_VERSION_NUM >= 150000
prev_shmem_request_hook = shmem_request_hook;
shmem_request_hook = pagestore_shmem_request;
#else
pagestore_shmem_request();
#endif
prev_shmem_startup_hook = shmem_startup_hook;
shmem_startup_hook = pagestore_shmem_startup_hook;
}
/*
* Module initialization function
*/
void
pg_init_libpagestore(void)
{
pagestore_prepare_shmem();
DefineCustomStringVariable("neon.pageserver_connstring",
"connection string to the page server",
NULL,
@@ -534,7 +418,7 @@ pg_init_libpagestore(void)
"",
PGC_SIGHUP,
0, /* no flags required */
CheckPageserverConnstring, AssignPageserverConnstring, NULL);
NULL, NULL, NULL);
DefineCustomStringVariable("neon.timeline_id",
"Neon timeline_id the server is running on",
@@ -615,5 +499,7 @@ pg_init_libpagestore(void)
redo_read_buffer_filter = neon_redo_read_buffer_filter;
}
prev_signal_handler = pqsignal(SIGHUP, pageserver_sighup_handler);
lfc_init();
}

View File

@@ -80,9 +80,6 @@ struct ProxyCliArgs {
/// cache for `wake_compute` api method (use `size=0` to disable)
#[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)]
wake_compute_cache: String,
/// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
#[clap(long, default_value = config::WakeComputeLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)]
wake_compute_lock: String,
/// Allow self-signed certificates for compute nodes (for testing)
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
allow_self_signed_compute: bool,
@@ -223,23 +220,10 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
node_info: console::caches::NodeInfoCache::new("node_info_cache", size, ttl),
}));
let config::WakeComputeLockOptions {
shards,
permits,
epoch,
timeout,
} = args.wake_compute_lock.parse()?;
info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)");
let locks = Box::leak(Box::new(
console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout)
.unwrap(),
));
tokio::spawn(locks.garbage_collect_worker(epoch));
let url = args.auth_endpoint.parse()?;
let endpoint = http::Endpoint::new(url, http::new_client());
let api = console::provider::neon::Api::new(endpoint, caches, locks);
let api = console::provider::neon::Api::new(endpoint, caches);
auth::BackendType::Console(Cow::Owned(api), ())
}
AuthBackend::Postgres => {

View File

@@ -264,79 +264,6 @@ impl FromStr for CacheOptions {
}
}
/// Helper for cmdline cache options parsing.
pub struct WakeComputeLockOptions {
/// The number of shards the lock map should have
pub shards: usize,
/// The number of allowed concurrent requests for each endpoitn
pub permits: usize,
/// Garbage collection epoch
pub epoch: Duration,
/// Lock timeout
pub timeout: Duration,
}
impl WakeComputeLockOptions {
/// Default options for [`crate::console::provider::ApiLocks`].
pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0";
// pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s";
/// Parse lock options passed via cmdline.
/// Example: [`Self::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK`].
fn parse(options: &str) -> anyhow::Result<Self> {
let mut shards = None;
let mut permits = None;
let mut epoch = None;
let mut timeout = None;
for option in options.split(',') {
let (key, value) = option
.split_once('=')
.with_context(|| format!("bad key-value pair: {option}"))?;
match key {
"shards" => shards = Some(value.parse()?),
"permits" => permits = Some(value.parse()?),
"epoch" => epoch = Some(humantime::parse_duration(value)?),
"timeout" => timeout = Some(humantime::parse_duration(value)?),
unknown => bail!("unknown key: {unknown}"),
}
}
// these dont matter if lock is disabled
if let Some(0) = permits {
timeout = Some(Duration::default());
epoch = Some(Duration::default());
shards = Some(2);
}
let out = Self {
shards: shards.context("missing `shards`")?,
permits: permits.context("missing `permits`")?,
epoch: epoch.context("missing `epoch`")?,
timeout: timeout.context("missing `timeout`")?,
};
ensure!(out.shards > 1, "shard count must be > 1");
ensure!(
out.shards.is_power_of_two(),
"shard count must be a power of two"
);
Ok(out)
}
}
impl FromStr for WakeComputeLockOptions {
type Err = anyhow::Error;
fn from_str(options: &str) -> Result<Self, Self::Err> {
let error = || format!("failed to parse cache lock options '{options}'");
Self::parse(options).with_context(error)
}
}
#[cfg(test)]
mod tests {
use super::*;
@@ -361,42 +288,4 @@ mod tests {
Ok(())
}
#[test]
fn test_parse_lock_options() -> anyhow::Result<()> {
let WakeComputeLockOptions {
epoch,
permits,
shards,
timeout,
} = "shards=32,permits=4,epoch=10m,timeout=1s".parse()?;
assert_eq!(epoch, Duration::from_secs(10 * 60));
assert_eq!(timeout, Duration::from_secs(1));
assert_eq!(shards, 32);
assert_eq!(permits, 4);
let WakeComputeLockOptions {
epoch,
permits,
shards,
timeout,
} = "epoch=60s,shards=16,timeout=100ms,permits=8".parse()?;
assert_eq!(epoch, Duration::from_secs(60));
assert_eq!(timeout, Duration::from_millis(100));
assert_eq!(shards, 16);
assert_eq!(permits, 8);
let WakeComputeLockOptions {
epoch,
permits,
shards,
timeout,
} = "permits=0".parse()?;
assert_eq!(epoch, Duration::ZERO);
assert_eq!(timeout, Duration::ZERO);
assert_eq!(shards, 2);
assert_eq!(permits, 0);
Ok(())
}
}

View File

@@ -13,10 +13,5 @@ pub mod caches {
pub use super::provider::{ApiCaches, NodeInfoCache};
}
/// Various cache-related types.
pub mod locks {
pub use super::provider::ApiLocks;
}
/// Console's management API.
pub mod mgmt;

View File

@@ -8,13 +8,7 @@ use crate::{
compute, scram,
};
use async_trait::async_trait;
use dashmap::DashMap;
use std::{sync::Arc, time::Duration};
use tokio::{
sync::{OwnedSemaphorePermit, Semaphore},
time::Instant,
};
use tracing::info;
use std::sync::Arc;
pub mod errors {
use crate::{
@@ -155,9 +149,6 @@ pub mod errors {
#[error(transparent)]
ApiError(ApiError),
#[error("Timeout waiting to acquire wake compute lock")]
TimeoutError,
}
// This allows more useful interactions than `#[from]`.
@@ -167,17 +158,6 @@ pub mod errors {
}
}
impl From<tokio::sync::AcquireError> for WakeComputeError {
fn from(_: tokio::sync::AcquireError) -> Self {
WakeComputeError::TimeoutError
}
}
impl From<tokio::time::error::Elapsed> for WakeComputeError {
fn from(_: tokio::time::error::Elapsed) -> Self {
WakeComputeError::TimeoutError
}
}
impl UserFacingError for WakeComputeError {
fn to_string_client(&self) -> String {
use WakeComputeError::*;
@@ -187,8 +167,6 @@ pub mod errors {
BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
// However, API might return a meaningful error.
ApiError(e) => e.to_string_client(),
TimeoutError => "timeout while acquiring the compute resource lock".to_owned(),
}
}
}
@@ -255,145 +233,3 @@ pub struct ApiCaches {
/// Cache for the `wake_compute` API method.
pub node_info: NodeInfoCache,
}
/// Various caches for [`console`](super).
pub struct ApiLocks {
name: &'static str,
node_locks: DashMap<Arc<str>, Arc<Semaphore>>,
permits: usize,
timeout: Duration,
registered: prometheus::IntCounter,
unregistered: prometheus::IntCounter,
reclamation_lag: prometheus::Histogram,
lock_acquire_lag: prometheus::Histogram,
}
impl ApiLocks {
pub fn new(
name: &'static str,
permits: usize,
shards: usize,
timeout: Duration,
) -> prometheus::Result<Self> {
let registered = prometheus::IntCounter::with_opts(
prometheus::Opts::new(
"semaphores_registered",
"Number of semaphores registered in this api lock",
)
.namespace(name),
)?;
prometheus::register(Box::new(registered.clone()))?;
let unregistered = prometheus::IntCounter::with_opts(
prometheus::Opts::new(
"semaphores_unregistered",
"Number of semaphores unregistered in this api lock",
)
.namespace(name),
)?;
prometheus::register(Box::new(unregistered.clone()))?;
let reclamation_lag = prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"reclamation_lag_seconds",
"Time it takes to reclaim unused semaphores in the api lock",
)
.namespace(name)
// 1us -> 65ms
// benchmarks on my mac indicate it's usually in the range of 256us and 512us
.buckets(prometheus::exponential_buckets(1e-6, 2.0, 16)?),
)?;
prometheus::register(Box::new(reclamation_lag.clone()))?;
let lock_acquire_lag = prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(
"semaphore_acquire_seconds",
"Time it takes to reclaim unused semaphores in the api lock",
)
.namespace(name)
// 0.1ms -> 6s
.buckets(prometheus::exponential_buckets(1e-4, 2.0, 16)?),
)?;
prometheus::register(Box::new(lock_acquire_lag.clone()))?;
Ok(Self {
name,
node_locks: DashMap::with_shard_amount(shards),
permits,
timeout,
lock_acquire_lag,
registered,
unregistered,
reclamation_lag,
})
}
pub async fn get_wake_compute_permit(
&self,
key: &Arc<str>,
) -> Result<WakeComputePermit, errors::WakeComputeError> {
if self.permits == 0 {
return Ok(WakeComputePermit { permit: None });
}
let now = Instant::now();
let semaphore = {
// get fast path
if let Some(semaphore) = self.node_locks.get(key) {
semaphore.clone()
} else {
self.node_locks
.entry(key.clone())
.or_insert_with(|| {
self.registered.inc();
Arc::new(Semaphore::new(self.permits))
})
.clone()
}
};
let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await;
self.lock_acquire_lag
.observe((Instant::now() - now).as_secs_f64());
Ok(WakeComputePermit {
permit: Some(permit??),
})
}
pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) {
if self.permits == 0 {
return;
}
let mut interval = tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32);
loop {
for (i, shard) in self.node_locks.shards().iter().enumerate() {
interval.tick().await;
// temporary lock a single shard and then clear any semaphores that aren't currently checked out
// race conditions: if strong_count == 1, there's no way that it can increase while the shard is locked
// therefore releasing it is safe from race conditions
info!(
name = self.name,
shard = i,
"performing epoch reclamation on api lock"
);
let mut lock = shard.write();
let timer = self.reclamation_lag.start_timer();
let count = lock
.extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1)
.count();
drop(lock);
self.unregistered.inc_by(count as u64);
timer.observe_duration()
}
}
}
}
pub struct WakeComputePermit {
// None if the lock is disabled
permit: Option<OwnedSemaphorePermit>,
}
impl WakeComputePermit {
pub fn should_check_cache(&self) -> bool {
self.permit.is_some()
}
}

View File

@@ -3,12 +3,12 @@
use super::{
super::messages::{ConsoleError, GetRoleSecret, WakeCompute},
errors::{ApiError, GetAuthInfoError, WakeComputeError},
ApiCaches, ApiLocks, AuthInfo, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
ApiCaches, AuthInfo, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
};
use crate::{auth::ClientCredentials, compute, http, scram};
use async_trait::async_trait;
use futures::TryFutureExt;
use std::{net::SocketAddr, sync::Arc};
use std::net::SocketAddr;
use tokio::time::Instant;
use tokio_postgres::config::SslMode;
use tracing::{error, info, info_span, warn, Instrument};
@@ -17,17 +17,12 @@ use tracing::{error, info, info_span, warn, Instrument};
pub struct Api {
endpoint: http::Endpoint,
caches: &'static ApiCaches,
locks: &'static ApiLocks,
jwt: String,
}
impl Api {
/// Construct an API object containing the auth parameters.
pub fn new(
endpoint: http::Endpoint,
caches: &'static ApiCaches,
locks: &'static ApiLocks,
) -> Self {
pub fn new(endpoint: http::Endpoint, caches: &'static ApiCaches) -> Self {
let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
Ok(v) => v,
Err(_) => "".to_string(),
@@ -35,7 +30,6 @@ impl Api {
Self {
endpoint,
caches,
locks,
jwt,
}
}
@@ -169,22 +163,9 @@ impl super::Api for Api {
return Ok(cached);
}
let key: Arc<str> = key.into();
let permit = self.locks.get_wake_compute_permit(&key).await?;
// after getting back a permit - it's possible the cache was filled
// double check
if permit.should_check_cache() {
if let Some(cached) = self.caches.node_info.get(&key) {
info!(key = &*key, "found cached compute node info");
return Ok(cached);
}
}
let node = self.do_wake_compute(extra, creds).await?;
let (_, cached) = self.caches.node_info.insert(key.clone(), node);
info!(key = &*key, "created a cache entry for compute node info");
let (_, cached) = self.caches.node_info.insert(key.into(), node);
info!(key = key, "created a cache entry for compute node info");
Ok(cached)
}

View File

@@ -570,7 +570,6 @@ fn report_error(e: &WakeComputeError, retry: bool) {
"api_console_other_server_error"
}
WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error",
WakeComputeError::TimeoutError => "timeout_error",
};
NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc();
}

View File

@@ -1,20 +1,19 @@
use utils::auth::{AuthError, Claims, Scope};
use anyhow::{bail, Result};
use utils::auth::{Claims, Scope};
use utils::id::TenantId;
pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<()> {
match (&claims.scope, tenant_id) {
(Scope::Tenant, None) => Err(AuthError(
"Attempt to access management api with tenant scope. Permission denied".into(),
)),
(Scope::Tenant, None) => {
bail!("Attempt to access management api with tenant scope. Permission denied")
}
(Scope::Tenant, Some(tenant_id)) => {
if claims.tenant_id.unwrap() != tenant_id {
return Err(AuthError("Tenant id mismatch. Permission denied".into()));
bail!("Tenant id mismatch. Permission denied")
}
Ok(())
}
(Scope::PageServerApi, _) => Err(AuthError(
"PageServerApi scope makes no sense for Safekeeper".into(),
)),
(Scope::PageServerApi, _) => bail!("PageServerApi scope makes no sense for Safekeeper"),
(Scope::SafekeeperData, _) => Ok(()),
}
}

View File

@@ -6,7 +6,7 @@ use std::str::FromStr;
use std::str::{self};
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{debug, info, info_span, Instrument};
use tracing::{info, info_span, Instrument};
use crate::auth::check_permission;
use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};
@@ -165,27 +165,26 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
.auth
.as_ref()
.expect("auth_type is configured but .auth of handler is missing");
let data = auth
.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)
.map_err(|e| QueryError::Unauthorized(e.0))?;
let data =
auth.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
// The handler might be configured to allow only tenant scope tokens.
if matches!(allowed_auth_scope, Scope::Tenant)
&& !matches!(data.claims.scope, Scope::Tenant)
{
return Err(QueryError::Unauthorized(
"passed JWT token is for full access, but only tenant scope is allowed".into(),
));
return Err(QueryError::Other(anyhow::anyhow!(
"passed JWT token is for full access, but only tenant scope is allowed"
)));
}
if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
return Err(QueryError::Unauthorized(
"jwt token scope is Tenant, but tenant id is missing".into(),
));
return Err(QueryError::Other(anyhow::anyhow!(
"jwt token scope is Tenant, but tenant id is missing"
)));
}
debug!(
"jwt scope check succeeded for scope: {:#?} by tenant id: {:?}",
info!(
"jwt auth succeeded for scope: {:#?} by tenant id: {:?}",
data.claims.scope, data.claims.tenant_id,
);
@@ -264,7 +263,7 @@ impl SafekeeperPostgresHandler {
// when accessing management api supply None as an argument
// when using to authorize tenant pass corresponding tenant id
fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<(), QueryError> {
fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
if self.auth.is_none() {
// auth is set to Trust, nothing to check so just return ok
return Ok(());
@@ -276,7 +275,7 @@ impl SafekeeperPostgresHandler {
.claims
.as_ref()
.expect("claims presence already checked");
check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0))
check_permission(claims, tenant_id)
}
async fn handle_timeline_status<IO: AsyncRead + AsyncWrite + Unpin>(

View File

@@ -25,7 +25,7 @@ def assert_client_authorized(env: NeonEnv, http_client: PageserverHttpClient):
def assert_client_not_authorized(env: NeonEnv, http_client: PageserverHttpClient):
with pytest.raises(
PageserverApiException,
match="Forbidden: JWT authentication error",
match="Unauthorized: malformed jwt token",
):
assert_client_authorized(env, http_client)
@@ -56,7 +56,9 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
assert_client_authorized(env, pageserver_http_client)
# fail to create branch using token with different tenant_id
with pytest.raises(PageserverApiException, match="Forbidden: JWT authentication error"):
with pytest.raises(
PageserverApiException, match="Forbidden: Tenant id mismatch. Permission denied"
):
assert_client_authorized(env, invalid_tenant_http_client)
# create tenant using management token
@@ -65,7 +67,7 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
# fail to create tenant using tenant token
with pytest.raises(
PageserverApiException,
match="Forbidden: JWT authentication error",
match="Forbidden: Attempt to access management api with tenant scope. Permission denied",
):
tenant_http_client.tenant_create(TenantId.generate())
@@ -92,7 +94,6 @@ def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder):
def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder):
neon_env_builder.auth_enabled = True
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.append(".*Authentication error: InvalidSignature.*")
env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*")
pageserver_token_old = env.auth_keys.generate_pageserver_token()
@@ -145,7 +146,6 @@ def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder):
def test_pageserver_key_reload(neon_env_builder: NeonEnvBuilder):
neon_env_builder.auth_enabled = True
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.append(".*Authentication error: InvalidSignature.*")
env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*")
pageserver_token_old = env.auth_keys.generate_pageserver_token()
@@ -162,7 +162,7 @@ def test_pageserver_key_reload(neon_env_builder: NeonEnvBuilder):
# Next attempt fails as we use the old auth token
with pytest.raises(
PageserverApiException,
match="Forbidden: JWT authentication error",
match="Unauthorized: malformed jwt token",
):
pageserver_http_client_old.reload_auth_validation_keys()

View File

@@ -1,60 +0,0 @@
import random
import time
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.append(".*simulated connection error.*")
pageserver_http = env.pageserver.http_client()
env.neon_cli.create_branch("test_compute_pageserver_connection_stress")
endpoint = env.endpoints.create_start("test_compute_pageserver_connection_stress")
# Enable failpoint after starting everything else up so that loading initial
# basebackup doesn't fail
pageserver_http.configure_failpoints(("simulated-bad-compute-connection", "50%return(15)"))
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
# Create table, and insert some rows. Make it big enough that it doesn't fit in
# shared_buffers, otherwise the SELECT after restart will just return answer
# from shared_buffers without hitting the page server, which defeats the point
# of this test.
cur.execute("CREATE TABLE foo (t text)")
cur.execute(
"""
INSERT INTO foo
SELECT 'long string to consume some space' || g
FROM generate_series(1, 100000) g
"""
)
# Verify that the table is larger than shared_buffers
cur.execute(
"""
select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
from pg_settings where name = 'shared_buffers'
"""
)
row = cur.fetchone()
assert row is not None
log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
assert int(row[0]) < int(row[1])
cur.execute("SELECT count(*) FROM foo")
assert cur.fetchone() == (100000,)
end_time = time.time() + 30
times_executed = 0
while time.time() < end_time:
if random.random() < 0.5:
cur.execute("INSERT INTO foo VALUES ('stas'), ('heikki')")
else:
cur.execute("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10")
cur.fetchall()
times_executed += 1
log.info(f"Workload executed {times_executed} times")

View File

@@ -1,13 +1,9 @@
import asyncio
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.remote_storage import RemoteStorageKind
def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
num_connections = 3
neon_env_builder.num_pageservers = 2
neon_env_builder.enable_pageserver_remote_storage(
remote_storage_kind=RemoteStorageKind.MOCK_S3,
@@ -20,24 +16,15 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
alt_pageserver_id = env.pageservers[1].id
env.pageservers[1].tenant_attach(env.initial_tenant)
pg_conns = [endpoint.connect() for i in range(num_connections)]
curs = [pg_conn.cursor() for pg_conn in pg_conns]
def execute(statement: str):
for cur in curs:
cur.execute(statement)
def fetchone():
results = [cur.fetchone() for cur in curs]
assert all(result == results[0] for result in results)
return results[0]
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
# Create table, and insert some rows. Make it big enough that it doesn't fit in
# shared_buffers, otherwise the SELECT after restart will just return answer
# from shared_buffers without hitting the page server, which defeats the point
# of this test.
curs[0].execute("CREATE TABLE foo (t text)")
curs[0].execute(
cur.execute("CREATE TABLE foo (t text)")
cur.execute(
"""
INSERT INTO foo
SELECT 'long string to consume some space' || g
@@ -46,25 +33,25 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
)
# Verify that the table is larger than shared_buffers
curs[0].execute(
cur.execute(
"""
select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
from pg_settings where name = 'shared_buffers'
"""
)
row = curs[0].fetchone()
row = cur.fetchone()
assert row is not None
log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
assert int(row[0]) < int(row[1])
execute("SELECT count(*) FROM foo")
assert fetchone() == (100000,)
cur.execute("SELECT count(*) FROM foo")
assert cur.fetchone() == (100000,)
endpoint.reconfigure(pageserver_id=alt_pageserver_id)
# Verify that the neon.pageserver_connstring GUC is set to the correct thing
execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'")
connstring = fetchone()
cur.execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'")
connstring = cur.fetchone()
assert connstring is not None
expected_connstring = f"postgresql://no_user:@localhost:{env.pageservers[1].service_port.pg}"
assert expected_connstring == expected_connstring
@@ -73,45 +60,5 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
0
].stop() # Stop the old pageserver just to make sure we're reading from the new one
execute("SELECT count(*) FROM foo")
assert fetchone() == (100000,)
# Try failing back, and this time we will stop the current pageserver before reconfiguring
# the endpoint. Whereas the previous reconfiguration was like a healthy migration, this
# is more like what happens in an unexpected pageserver failure.
env.pageservers[0].start()
env.pageservers[1].stop()
endpoint.reconfigure(pageserver_id=env.pageservers[0].id)
execute("SELECT count(*) FROM foo")
assert fetchone() == (100000,)
env.pageservers[0].stop()
env.pageservers[1].start()
# Test a (former) bug where a child process spins without updating its connection string
# by executing a query separately. This query will hang until we issue the reconfigure.
async def reconfigure_async():
await asyncio.sleep(
1
) # Sleep for 1 second just to make sure we actually started our count(*) query
endpoint.reconfigure(pageserver_id=env.pageservers[1].id)
def execute_count():
execute("SELECT count(*) FROM FOO")
async def execute_and_reconfigure():
task_exec = asyncio.to_thread(execute_count)
task_reconfig = asyncio.create_task(reconfigure_async())
await asyncio.gather(
task_exec,
task_reconfig,
)
asyncio.run(execute_and_reconfigure())
assert fetchone() == (100000,)
# One final check that nothing hangs
execute("SELECT count(*) FROM foo")
assert fetchone() == (100000,)
cur.execute("SELECT count(*) FROM foo")
assert cur.fetchone() == (100000,)

View File

@@ -366,17 +366,11 @@ def test_deletion_queue_recovery(
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
if validate_before == ValidateBefore.VALIDATE:
# At this point, one or more DeletionLists have been written. We have set a failpoint
# to prevent them successfully executing, but we want to see them get validated.
#
# We await _some_ validations instead of _all_ validations, because our execution failpoint
# will prevent validation proceeding for any but the first DeletionList. Usually the workload
# just generates one, but if it generates two due to timing, then we must not expect that the
# second one will be validated.
def assert_some_validations():
assert get_deletion_queue_validated(ps_http) > 0
wait_until(20, 1, assert_some_validations)
def assert_validation_complete():
assert get_deletion_queue_submitted(ps_http) == get_deletion_queue_validated(ps_http)
wait_until(20, 1, assert_validation_complete)
# The validatated keys statistic advances before the header is written, so we
# also wait to see the header hit the disk: this seems paranoid but the race
@@ -386,11 +380,6 @@ def test_deletion_queue_recovery(
wait_until(20, 1, assert_header_written)
# If we will lose attachment, then our expectation on restart is that only the ones
# we already validated will execute. Act like only those were present in the queue.
if keep_attachment == KeepAttachment.LOSE:
before_restart_depth = get_deletion_queue_validated(ps_http)
log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued")
env.pageserver.stop(immediate=True)
@@ -413,13 +402,11 @@ def test_deletion_queue_recovery(
ps_http.deletion_queue_flush(execute=True)
wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0))
if keep_attachment == KeepAttachment.KEEP:
if keep_attachment == KeepAttachment.KEEP or validate_before == ValidateBefore.VALIDATE:
# - If we kept the attachment, then our pre-restart deletions should execute
# because on re-attach they were from the immediately preceding generation
assert get_deletion_queue_executed(ps_http) == before_restart_depth
elif validate_before == ValidateBefore.VALIDATE:
# - If we validated before restart, then we should execute however many keys were
# validated before restart.
# - If we validated before restart, then the deletions should execute because the
# deletion queue header records a validated deletion list sequence number.
assert get_deletion_queue_executed(ps_http) == before_restart_depth
else:
env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])

View File

@@ -25,7 +25,6 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd
clap = { version = "4", features = ["derive", "string"] }
clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] }
crossbeam-utils = { version = "0.8" }
dashmap = { version = "5", default-features = false, features = ["raw-api"] }
either = { version = "1" }
fail = { version = "0.5", default-features = false, features = ["failpoints"] }
futures = { version = "0.3" }