mirror of
https://github.com/neondatabase/neon.git
synced 2026-02-04 03:00:37 +00:00
Compare commits
33 Commits
stepashka-
...
remove_ini
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bd235a5fe3 | ||
|
|
f95f001b8b | ||
|
|
e0821e1eab | ||
|
|
4469b1a62c | ||
|
|
842223b47f | ||
|
|
893616051d | ||
|
|
7cdde285a5 | ||
|
|
9c30883c4b | ||
|
|
0495798591 | ||
|
|
87389bc933 | ||
|
|
ea118a238a | ||
|
|
e9b227a11e | ||
|
|
40441f8ada | ||
|
|
a8a39cd464 | ||
|
|
b989ad1922 | ||
|
|
acef742a6e | ||
|
|
11d9d801b5 | ||
|
|
fc47af156f | ||
|
|
e310533ed3 | ||
|
|
1d68f52b57 | ||
|
|
4cd47b7d4b | ||
|
|
0141c95788 | ||
|
|
0ac4cf67a6 | ||
|
|
4be6bc7251 | ||
|
|
a394f49e0d | ||
|
|
c00651ff9b | ||
|
|
bea8efac24 | ||
|
|
ad5b02e175 | ||
|
|
b09a851705 | ||
|
|
85cd97af61 | ||
|
|
e6470ee92e | ||
|
|
dc72567288 | ||
|
|
6defa2b5d5 |
@@ -22,5 +22,11 @@ platforms = [
|
|||||||
# "x86_64-pc-windows-msvc",
|
# "x86_64-pc-windows-msvc",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[final-excludes]
|
||||||
|
# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
|
||||||
|
# it is built primarily in a separate repo neondatabase/autoscaling and thus is excluded
|
||||||
|
# from depending on workspace-hack because most of the dependencies are not used.
|
||||||
|
workspace-members = ["vm_monitor"]
|
||||||
|
|
||||||
# Write out exact versions rather than a semver range. (Defaults to false.)
|
# Write out exact versions rather than a semver range. (Defaults to false.)
|
||||||
# exact-versions = true
|
# exact-versions = true
|
||||||
|
|||||||
5
.github/ISSUE_TEMPLATE/epic-template.md
vendored
5
.github/ISSUE_TEMPLATE/epic-template.md
vendored
@@ -17,8 +17,9 @@ assignees: ''
|
|||||||
## Implementation ideas
|
## Implementation ideas
|
||||||
|
|
||||||
|
|
||||||
## Tasks
|
```[tasklist]
|
||||||
- [ ]
|
### Tasks
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
## Other related tasks and Epics
|
## Other related tasks and Epics
|
||||||
|
|||||||
1
.github/workflows/build_and_test.yml
vendored
1
.github/workflows/build_and_test.yml
vendored
@@ -723,6 +723,7 @@ jobs:
|
|||||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
||||||
--context .
|
--context .
|
||||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||||
|
--build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
|
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
|
||||||
--destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
|
--destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
|
||||||
|
|||||||
10
Cargo.lock
generated
10
Cargo.lock
generated
@@ -170,6 +170,12 @@ dependencies = [
|
|||||||
"backtrace",
|
"backtrace",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "arc-swap"
|
||||||
|
version = "1.6.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "bddcadddf5e9015d310179a59bb28c4d4b9920ad0f11e8e14dbadf654890c9a6"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "archery"
|
name = "archery"
|
||||||
version = "0.5.0"
|
version = "0.5.0"
|
||||||
@@ -4058,6 +4064,7 @@ dependencies = [
|
|||||||
"aws-config",
|
"aws-config",
|
||||||
"aws-credential-types",
|
"aws-credential-types",
|
||||||
"aws-sdk-s3",
|
"aws-sdk-s3",
|
||||||
|
"aws-smithy-async",
|
||||||
"aws-smithy-http",
|
"aws-smithy-http",
|
||||||
"aws-types",
|
"aws-types",
|
||||||
"azure_core",
|
"azure_core",
|
||||||
@@ -5951,6 +5958,7 @@ name = "utils"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
|
"arc-swap",
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"bincode",
|
"bincode",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
@@ -6048,7 +6056,6 @@ dependencies = [
|
|||||||
"tokio-util",
|
"tokio-util",
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
"workspace_hack",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -6476,6 +6483,7 @@ dependencies = [
|
|||||||
"clap",
|
"clap",
|
||||||
"clap_builder",
|
"clap_builder",
|
||||||
"crossbeam-utils",
|
"crossbeam-utils",
|
||||||
|
"dashmap",
|
||||||
"either",
|
"either",
|
||||||
"fail",
|
"fail",
|
||||||
"futures",
|
"futures",
|
||||||
|
|||||||
@@ -36,6 +36,7 @@ license = "Apache-2.0"
|
|||||||
## All dependency versions, used in the project
|
## All dependency versions, used in the project
|
||||||
[workspace.dependencies]
|
[workspace.dependencies]
|
||||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||||
|
arc-swap = "1.6"
|
||||||
async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
|
async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
|
||||||
azure_core = "0.16"
|
azure_core = "0.16"
|
||||||
azure_identity = "0.16"
|
azure_identity = "0.16"
|
||||||
@@ -47,6 +48,7 @@ async-trait = "0.1"
|
|||||||
aws-config = { version = "0.56", default-features = false, features=["rustls"] }
|
aws-config = { version = "0.56", default-features = false, features=["rustls"] }
|
||||||
aws-sdk-s3 = "0.29"
|
aws-sdk-s3 = "0.29"
|
||||||
aws-smithy-http = "0.56"
|
aws-smithy-http = "0.56"
|
||||||
|
aws-smithy-async = { version = "0.56", default-features = false, features=["rt-tokio"] }
|
||||||
aws-credential-types = "0.56"
|
aws-credential-types = "0.56"
|
||||||
aws-types = "0.56"
|
aws-types = "0.56"
|
||||||
axum = { version = "0.6.20", features = ["ws"] }
|
axum = { version = "0.6.20", features = ["ws"] }
|
||||||
@@ -65,7 +67,7 @@ comfy-table = "6.1"
|
|||||||
const_format = "0.2"
|
const_format = "0.2"
|
||||||
crc32c = "0.6"
|
crc32c = "0.6"
|
||||||
crossbeam-utils = "0.8.5"
|
crossbeam-utils = "0.8.5"
|
||||||
dashmap = "5.5.0"
|
dashmap = { version = "5.5.0", features = ["raw-api"] }
|
||||||
either = "1.8"
|
either = "1.8"
|
||||||
enum-map = "2.4.2"
|
enum-map = "2.4.2"
|
||||||
enumset = "1.0.12"
|
enumset = "1.0.12"
|
||||||
|
|||||||
@@ -27,6 +27,7 @@ RUN set -e \
|
|||||||
FROM $REPOSITORY/$IMAGE:$TAG AS build
|
FROM $REPOSITORY/$IMAGE:$TAG AS build
|
||||||
WORKDIR /home/nonroot
|
WORKDIR /home/nonroot
|
||||||
ARG GIT_VERSION=local
|
ARG GIT_VERSION=local
|
||||||
|
ARG BUILD_TAG
|
||||||
|
|
||||||
# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
|
# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
|
||||||
# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
|
# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
|
||||||
@@ -78,9 +79,9 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router
|
|||||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin
|
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin
|
||||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin
|
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin
|
||||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
|
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
|
||||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
|
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
|
||||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
|
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
|
||||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
|
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
|
||||||
|
|
||||||
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
|
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
|
||||||
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
|
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
|
||||||
|
|||||||
4
Makefile
4
Makefile
@@ -72,6 +72,10 @@ neon: postgres-headers walproposer-lib
|
|||||||
#
|
#
|
||||||
$(POSTGRES_INSTALL_DIR)/build/%/config.status:
|
$(POSTGRES_INSTALL_DIR)/build/%/config.status:
|
||||||
+@echo "Configuring Postgres $* build"
|
+@echo "Configuring Postgres $* build"
|
||||||
|
@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
|
||||||
|
echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
|
||||||
|
echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
|
||||||
|
exit 1; }
|
||||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
|
mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
|
||||||
(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
|
(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
|
||||||
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
|
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
|
||||||
|
|||||||
@@ -710,8 +710,12 @@ impl ComputeNode {
|
|||||||
// `pg_ctl` for start / stop, so this just seems much easier to do as we already
|
// `pg_ctl` for start / stop, so this just seems much easier to do as we already
|
||||||
// have opened connection to Postgres and superuser access.
|
// have opened connection to Postgres and superuser access.
|
||||||
#[instrument(skip_all)]
|
#[instrument(skip_all)]
|
||||||
fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
|
fn pg_reload_conf(&self) -> Result<()> {
|
||||||
client.simple_query("SELECT pg_reload_conf()")?;
|
let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl");
|
||||||
|
Command::new(pgctl_bin)
|
||||||
|
.args(["reload", "-D", &self.pgdata])
|
||||||
|
.output()
|
||||||
|
.expect("cannot run pg_ctl process");
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -724,9 +728,9 @@ impl ComputeNode {
|
|||||||
// Write new config
|
// Write new config
|
||||||
let pgdata_path = Path::new(&self.pgdata);
|
let pgdata_path = Path::new(&self.pgdata);
|
||||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
|
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
|
||||||
|
self.pg_reload_conf()?;
|
||||||
|
|
||||||
let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
|
let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
|
||||||
self.pg_reload_conf(&mut client)?;
|
|
||||||
|
|
||||||
// Proceed with post-startup configuration. Note, that order of operations is important.
|
// Proceed with post-startup configuration. Note, that order of operations is important.
|
||||||
// Disable DDL forwarding because control plane already knows about these roles/databases.
|
// Disable DDL forwarding because control plane already knows about these roles/databases.
|
||||||
|
|||||||
@@ -78,7 +78,7 @@ use regex::Regex;
|
|||||||
use remote_storage::*;
|
use remote_storage::*;
|
||||||
use serde_json;
|
use serde_json;
|
||||||
use std::io::Read;
|
use std::io::Read;
|
||||||
use std::num::{NonZeroU32, NonZeroUsize};
|
use std::num::NonZeroUsize;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::str;
|
use std::str;
|
||||||
use tar::Archive;
|
use tar::Archive;
|
||||||
@@ -281,8 +281,6 @@ pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRem
|
|||||||
max_keys_per_list_response: None,
|
max_keys_per_list_response: None,
|
||||||
};
|
};
|
||||||
let config = RemoteStorageConfig {
|
let config = RemoteStorageConfig {
|
||||||
max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
|
|
||||||
max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
|
|
||||||
storage: RemoteStorageKind::AwsS3(config),
|
storage: RemoteStorageKind::AwsS3(config),
|
||||||
};
|
};
|
||||||
GenericRemoteStorage::from_config(&config)
|
GenericRemoteStorage::from_config(&config)
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
//!
|
|
||||||
//! Various tools and helpers to handle cluster / compute node (Postgres)
|
//! Various tools and helpers to handle cluster / compute node (Postgres)
|
||||||
//! configuration.
|
//! configuration.
|
||||||
//!
|
#![deny(unsafe_code)]
|
||||||
|
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||||
pub mod checker;
|
pub mod checker;
|
||||||
pub mod config;
|
pub mod config;
|
||||||
pub mod configurator;
|
pub mod configurator;
|
||||||
|
|||||||
@@ -262,7 +262,7 @@ where
|
|||||||
P: Into<Utf8PathBuf>,
|
P: Into<Utf8PathBuf>,
|
||||||
{
|
{
|
||||||
let path: Utf8PathBuf = path.into();
|
let path: Utf8PathBuf = path.into();
|
||||||
// SAFETY
|
// SAFETY:
|
||||||
// pre_exec is marked unsafe because it runs between fork and exec.
|
// pre_exec is marked unsafe because it runs between fork and exec.
|
||||||
// Why is that dangerous in various ways?
|
// Why is that dangerous in various ways?
|
||||||
// Long answer: https://github.com/rust-lang/rust/issues/39575
|
// Long answer: https://github.com/rust-lang/rust/issues/39575
|
||||||
|
|||||||
@@ -1,11 +1,10 @@
|
|||||||
//
|
//! Local control plane.
|
||||||
// Local control plane.
|
//!
|
||||||
//
|
//! Can start, configure and stop postgres instances running as local processes.
|
||||||
// Can start, configure and stop postgres instances running as local processes.
|
//!
|
||||||
//
|
//! Intended to be used in integration tests and in CLI tools for
|
||||||
// Intended to be used in integration tests and in CLI tools for
|
//! local installations.
|
||||||
// local installations.
|
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||||
//
|
|
||||||
|
|
||||||
pub mod attachment_service;
|
pub mod attachment_service;
|
||||||
mod background_process;
|
mod background_process;
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
#![deny(unsafe_code)]
|
||||||
|
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||||
pub mod requests;
|
pub mod requests;
|
||||||
pub mod responses;
|
pub mod responses;
|
||||||
pub mod spec;
|
pub mod spec;
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
//!
|
|
||||||
//! Shared code for consumption metrics collection
|
//! Shared code for consumption metrics collection
|
||||||
//!
|
#![deny(unsafe_code)]
|
||||||
|
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||||
use chrono::{DateTime, Utc};
|
use chrono::{DateTime, Utc};
|
||||||
use rand::Rng;
|
use rand::Rng;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
//! make sure that we use the same dep version everywhere.
|
//! make sure that we use the same dep version everywhere.
|
||||||
//! Otherwise, we might not see all metrics registered via
|
//! Otherwise, we might not see all metrics registered via
|
||||||
//! a default registry.
|
//! a default registry.
|
||||||
|
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
|
use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
|
||||||
pub use prometheus::opts;
|
pub use prometheus::opts;
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
#![deny(unsafe_code)]
|
||||||
|
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||||
use const_format::formatcp;
|
use const_format::formatcp;
|
||||||
|
|
||||||
/// Public API types
|
/// Public API types
|
||||||
|
|||||||
@@ -2,6 +2,8 @@
|
|||||||
//! To use, create PostgresBackend and run() it, passing the Handler
|
//! To use, create PostgresBackend and run() it, passing the Handler
|
||||||
//! implementation determining how to process the queries. Currently its API
|
//! implementation determining how to process the queries. Currently its API
|
||||||
//! is rather narrow, but we can extend it once required.
|
//! is rather narrow, but we can extend it once required.
|
||||||
|
#![deny(unsafe_code)]
|
||||||
|
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use futures::pin_mut;
|
use futures::pin_mut;
|
||||||
@@ -15,7 +17,7 @@ use std::{fmt, io};
|
|||||||
use std::{future::Future, str::FromStr};
|
use std::{future::Future, str::FromStr};
|
||||||
use tokio::io::{AsyncRead, AsyncWrite};
|
use tokio::io::{AsyncRead, AsyncWrite};
|
||||||
use tokio_rustls::TlsAcceptor;
|
use tokio_rustls::TlsAcceptor;
|
||||||
use tracing::{debug, error, info, trace};
|
use tracing::{debug, error, info, trace, warn};
|
||||||
|
|
||||||
use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
|
use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
|
||||||
use pq_proto::{
|
use pq_proto::{
|
||||||
@@ -33,6 +35,11 @@ pub enum QueryError {
|
|||||||
/// We were instructed to shutdown while processing the query
|
/// We were instructed to shutdown while processing the query
|
||||||
#[error("Shutting down")]
|
#[error("Shutting down")]
|
||||||
Shutdown,
|
Shutdown,
|
||||||
|
/// Authentication failure
|
||||||
|
#[error("Unauthorized: {0}")]
|
||||||
|
Unauthorized(std::borrow::Cow<'static, str>),
|
||||||
|
#[error("Simulated Connection Error")]
|
||||||
|
SimulatedConnectionError,
|
||||||
/// Some other error
|
/// Some other error
|
||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
Other(#[from] anyhow::Error),
|
Other(#[from] anyhow::Error),
|
||||||
@@ -47,8 +54,9 @@ impl From<io::Error> for QueryError {
|
|||||||
impl QueryError {
|
impl QueryError {
|
||||||
pub fn pg_error_code(&self) -> &'static [u8; 5] {
|
pub fn pg_error_code(&self) -> &'static [u8; 5] {
|
||||||
match self {
|
match self {
|
||||||
Self::Disconnected(_) => b"08006", // connection failure
|
Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
|
||||||
Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
|
Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
|
||||||
|
Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
|
||||||
Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
|
Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -608,7 +616,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
|||||||
|
|
||||||
if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
|
if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
|
||||||
self.write_message_noflush(&BeMessage::ErrorResponse(
|
self.write_message_noflush(&BeMessage::ErrorResponse(
|
||||||
&e.to_string(),
|
&short_error(&e),
|
||||||
Some(e.pg_error_code()),
|
Some(e.pg_error_code()),
|
||||||
))?;
|
))?;
|
||||||
return Err(e);
|
return Err(e);
|
||||||
@@ -728,12 +736,20 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
|||||||
|
|
||||||
trace!("got query {query_string:?}");
|
trace!("got query {query_string:?}");
|
||||||
if let Err(e) = handler.process_query(self, query_string).await {
|
if let Err(e) = handler.process_query(self, query_string).await {
|
||||||
log_query_error(query_string, &e);
|
match e {
|
||||||
let short_error = short_error(&e);
|
QueryError::Shutdown => return Ok(ProcessMsgResult::Break),
|
||||||
self.write_message_noflush(&BeMessage::ErrorResponse(
|
QueryError::SimulatedConnectionError => {
|
||||||
&short_error,
|
return Err(QueryError::SimulatedConnectionError)
|
||||||
Some(e.pg_error_code()),
|
}
|
||||||
))?;
|
e => {
|
||||||
|
log_query_error(query_string, &e);
|
||||||
|
let short_error = short_error(&e);
|
||||||
|
self.write_message_noflush(&BeMessage::ErrorResponse(
|
||||||
|
&short_error,
|
||||||
|
Some(e.pg_error_code()),
|
||||||
|
))?;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
self.write_message_noflush(&BeMessage::ReadyForQuery)?;
|
self.write_message_noflush(&BeMessage::ReadyForQuery)?;
|
||||||
}
|
}
|
||||||
@@ -959,6 +975,8 @@ pub fn short_error(e: &QueryError) -> String {
|
|||||||
match e {
|
match e {
|
||||||
QueryError::Disconnected(connection_error) => connection_error.to_string(),
|
QueryError::Disconnected(connection_error) => connection_error.to_string(),
|
||||||
QueryError::Shutdown => "shutdown".to_string(),
|
QueryError::Shutdown => "shutdown".to_string(),
|
||||||
|
QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
|
||||||
|
QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
|
||||||
QueryError::Other(e) => format!("{e:#}"),
|
QueryError::Other(e) => format!("{e:#}"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -975,9 +993,15 @@ fn log_query_error(query: &str, e: &QueryError) {
|
|||||||
QueryError::Disconnected(other_connection_error) => {
|
QueryError::Disconnected(other_connection_error) => {
|
||||||
error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
|
error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
|
||||||
}
|
}
|
||||||
|
QueryError::SimulatedConnectionError => {
|
||||||
|
error!("query handler for query '{query}' failed due to a simulated connection error")
|
||||||
|
}
|
||||||
QueryError::Shutdown => {
|
QueryError::Shutdown => {
|
||||||
info!("query handler for '{query}' cancelled during tenant shutdown")
|
info!("query handler for '{query}' cancelled during tenant shutdown")
|
||||||
}
|
}
|
||||||
|
QueryError::Unauthorized(e) => {
|
||||||
|
warn!("query handler for '{query}' failed with authentication error: {e}");
|
||||||
|
}
|
||||||
QueryError::Other(e) => {
|
QueryError::Other(e) => {
|
||||||
error!("query handler for '{query}' failed: {e:?}");
|
error!("query handler for '{query}' failed: {e:?}");
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
#![deny(unsafe_code)]
|
||||||
|
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
|
|||||||
@@ -8,6 +8,7 @@
|
|||||||
// modules included with the postgres_ffi macro depend on the types of the specific version's
|
// modules included with the postgres_ffi macro depend on the types of the specific version's
|
||||||
// types, and trigger a too eager lint.
|
// types, and trigger a too eager lint.
|
||||||
#![allow(clippy::duplicate_mod)]
|
#![allow(clippy::duplicate_mod)]
|
||||||
|
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||||
|
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use utils::bin_ser::SerializeError;
|
use utils::bin_ser::SerializeError;
|
||||||
@@ -20,6 +21,7 @@ macro_rules! postgres_ffi {
|
|||||||
pub mod bindings {
|
pub mod bindings {
|
||||||
// bindgen generates bindings for a lot of stuff we don't need
|
// bindgen generates bindings for a lot of stuff we don't need
|
||||||
#![allow(dead_code)]
|
#![allow(dead_code)]
|
||||||
|
#![allow(clippy::undocumented_unsafe_blocks)]
|
||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
include!(concat!(
|
include!(concat!(
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
//! Postgres protocol messages serialization-deserialization. See
|
//! Postgres protocol messages serialization-deserialization. See
|
||||||
//! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
|
//! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
|
||||||
//! on message formats.
|
//! on message formats.
|
||||||
|
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||||
|
|
||||||
pub mod framed;
|
pub mod framed;
|
||||||
|
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ license.workspace = true
|
|||||||
anyhow.workspace = true
|
anyhow.workspace = true
|
||||||
async-trait.workspace = true
|
async-trait.workspace = true
|
||||||
once_cell.workspace = true
|
once_cell.workspace = true
|
||||||
|
aws-smithy-async.workspace = true
|
||||||
aws-smithy-http.workspace = true
|
aws-smithy-http.workspace = true
|
||||||
aws-types.workspace = true
|
aws-types.workspace = true
|
||||||
aws-config.workspace = true
|
aws-config.workspace = true
|
||||||
|
|||||||
@@ -1,21 +1,18 @@
|
|||||||
//! Azure Blob Storage wrapper
|
//! Azure Blob Storage wrapper
|
||||||
|
|
||||||
|
use std::collections::HashMap;
|
||||||
use std::env;
|
use std::env;
|
||||||
use std::num::NonZeroU32;
|
use std::num::NonZeroU32;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::{borrow::Cow, collections::HashMap, io::Cursor};
|
use std::{borrow::Cow, io::Cursor};
|
||||||
|
|
||||||
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
|
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use azure_core::request_options::{MaxResults, Metadata, Range};
|
use azure_core::request_options::{MaxResults, Metadata, Range};
|
||||||
use azure_core::Header;
|
|
||||||
use azure_identity::DefaultAzureCredential;
|
use azure_identity::DefaultAzureCredential;
|
||||||
use azure_storage::StorageCredentials;
|
use azure_storage::StorageCredentials;
|
||||||
use azure_storage_blobs::prelude::ClientBuilder;
|
use azure_storage_blobs::prelude::ClientBuilder;
|
||||||
use azure_storage_blobs::{
|
use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
|
||||||
blob::operations::GetBlobBuilder,
|
|
||||||
prelude::{BlobClient, ContainerClient},
|
|
||||||
};
|
|
||||||
use futures_util::StreamExt;
|
use futures_util::StreamExt;
|
||||||
use http_types::StatusCode;
|
use http_types::StatusCode;
|
||||||
use tokio::io::AsyncRead;
|
use tokio::io::AsyncRead;
|
||||||
@@ -112,16 +109,19 @@ impl AzureBlobStorage {
|
|||||||
|
|
||||||
async fn download_for_builder(
|
async fn download_for_builder(
|
||||||
&self,
|
&self,
|
||||||
metadata: StorageMetadata,
|
|
||||||
builder: GetBlobBuilder,
|
builder: GetBlobBuilder,
|
||||||
) -> Result<Download, DownloadError> {
|
) -> Result<Download, DownloadError> {
|
||||||
let mut response = builder.into_stream();
|
let mut response = builder.into_stream();
|
||||||
|
|
||||||
|
let mut metadata = HashMap::new();
|
||||||
// TODO give proper streaming response instead of buffering into RAM
|
// TODO give proper streaming response instead of buffering into RAM
|
||||||
// https://github.com/neondatabase/neon/issues/5563
|
// https://github.com/neondatabase/neon/issues/5563
|
||||||
let mut buf = Vec::new();
|
let mut buf = Vec::new();
|
||||||
while let Some(part) = response.next().await {
|
while let Some(part) = response.next().await {
|
||||||
let part = part.map_err(to_download_error)?;
|
let part = part.map_err(to_download_error)?;
|
||||||
|
if let Some(blob_meta) = part.blob.metadata {
|
||||||
|
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
|
||||||
|
}
|
||||||
let data = part
|
let data = part
|
||||||
.data
|
.data
|
||||||
.collect()
|
.collect()
|
||||||
@@ -131,28 +131,9 @@ impl AzureBlobStorage {
|
|||||||
}
|
}
|
||||||
Ok(Download {
|
Ok(Download {
|
||||||
download_stream: Box::pin(Cursor::new(buf)),
|
download_stream: Box::pin(Cursor::new(buf)),
|
||||||
metadata: Some(metadata),
|
metadata: Some(StorageMetadata(metadata)),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
// TODO get rid of this function once we have metadata included in the response
|
|
||||||
// https://github.com/Azure/azure-sdk-for-rust/issues/1439
|
|
||||||
async fn get_metadata(
|
|
||||||
&self,
|
|
||||||
blob_client: &BlobClient,
|
|
||||||
) -> Result<StorageMetadata, DownloadError> {
|
|
||||||
let builder = blob_client.get_metadata();
|
|
||||||
|
|
||||||
let response = builder.into_future().await.map_err(to_download_error)?;
|
|
||||||
let mut map = HashMap::new();
|
|
||||||
|
|
||||||
for md in response.metadata.iter() {
|
|
||||||
map.insert(
|
|
||||||
md.name().as_str().to_string(),
|
|
||||||
md.value().as_str().to_string(),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
Ok(StorageMetadata(map))
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
|
async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
|
||||||
self.concurrency_limiter
|
self.concurrency_limiter
|
||||||
@@ -269,11 +250,9 @@ impl RemoteStorage for AzureBlobStorage {
|
|||||||
let _permit = self.permit(RequestKind::Get).await;
|
let _permit = self.permit(RequestKind::Get).await;
|
||||||
let blob_client = self.client.blob_client(self.relative_path_to_name(from));
|
let blob_client = self.client.blob_client(self.relative_path_to_name(from));
|
||||||
|
|
||||||
let metadata = self.get_metadata(&blob_client).await?;
|
|
||||||
|
|
||||||
let builder = blob_client.get();
|
let builder = blob_client.get();
|
||||||
|
|
||||||
self.download_for_builder(metadata, builder).await
|
self.download_for_builder(builder).await
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn download_byte_range(
|
async fn download_byte_range(
|
||||||
@@ -285,8 +264,6 @@ impl RemoteStorage for AzureBlobStorage {
|
|||||||
let _permit = self.permit(RequestKind::Get).await;
|
let _permit = self.permit(RequestKind::Get).await;
|
||||||
let blob_client = self.client.blob_client(self.relative_path_to_name(from));
|
let blob_client = self.client.blob_client(self.relative_path_to_name(from));
|
||||||
|
|
||||||
let metadata = self.get_metadata(&blob_client).await?;
|
|
||||||
|
|
||||||
let mut builder = blob_client.get();
|
let mut builder = blob_client.get();
|
||||||
|
|
||||||
if let Some(end_exclusive) = end_exclusive {
|
if let Some(end_exclusive) = end_exclusive {
|
||||||
@@ -301,7 +278,7 @@ impl RemoteStorage for AzureBlobStorage {
|
|||||||
builder = builder.range(Range::new(start_inclusive, end_exclusive));
|
builder = builder.range(Range::new(start_inclusive, end_exclusive));
|
||||||
}
|
}
|
||||||
|
|
||||||
self.download_for_builder(metadata, builder).await
|
self.download_for_builder(builder).await
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||||
|
|||||||
@@ -6,19 +6,15 @@
|
|||||||
//! * [`s3_bucket`] uses AWS S3 bucket as an external storage
|
//! * [`s3_bucket`] uses AWS S3 bucket as an external storage
|
||||||
//! * [`azure_blob`] allows to use Azure Blob storage as an external storage
|
//! * [`azure_blob`] allows to use Azure Blob storage as an external storage
|
||||||
//!
|
//!
|
||||||
|
#![deny(unsafe_code)]
|
||||||
|
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||||
|
|
||||||
mod azure_blob;
|
mod azure_blob;
|
||||||
mod local_fs;
|
mod local_fs;
|
||||||
mod s3_bucket;
|
mod s3_bucket;
|
||||||
mod simulate_failures;
|
mod simulate_failures;
|
||||||
|
|
||||||
use std::{
|
use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc};
|
||||||
collections::HashMap,
|
|
||||||
fmt::Debug,
|
|
||||||
num::{NonZeroU32, NonZeroUsize},
|
|
||||||
pin::Pin,
|
|
||||||
sync::Arc,
|
|
||||||
};
|
|
||||||
|
|
||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
use camino::{Utf8Path, Utf8PathBuf};
|
use camino::{Utf8Path, Utf8PathBuf};
|
||||||
@@ -34,12 +30,6 @@ pub use self::{
|
|||||||
};
|
};
|
||||||
use s3_bucket::RequestKind;
|
use s3_bucket::RequestKind;
|
||||||
|
|
||||||
/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
|
|
||||||
/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
|
|
||||||
/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach.
|
|
||||||
/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed.
|
|
||||||
pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
|
|
||||||
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
|
|
||||||
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
||||||
/// ~200 RPS for IAM services
|
/// ~200 RPS for IAM services
|
||||||
/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
|
/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
|
||||||
@@ -441,10 +431,6 @@ pub struct StorageMetadata(HashMap<String, String>);
|
|||||||
/// External backup storage configuration, enough for creating a client for that storage.
|
/// External backup storage configuration, enough for creating a client for that storage.
|
||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||||
pub struct RemoteStorageConfig {
|
pub struct RemoteStorageConfig {
|
||||||
/// Max allowed number of concurrent sync operations between the API user and the remote storage.
|
|
||||||
pub max_concurrent_syncs: NonZeroUsize,
|
|
||||||
/// Max allowed errors before the sync task is considered failed and evicted.
|
|
||||||
pub max_sync_errors: NonZeroU32,
|
|
||||||
/// The storage connection configuration.
|
/// The storage connection configuration.
|
||||||
pub storage: RemoteStorageKind,
|
pub storage: RemoteStorageKind,
|
||||||
}
|
}
|
||||||
@@ -540,18 +526,6 @@ impl RemoteStorageConfig {
|
|||||||
|
|
||||||
let use_azure = container_name.is_some() && container_region.is_some();
|
let use_azure = container_name.is_some() && container_region.is_some();
|
||||||
|
|
||||||
let max_concurrent_syncs = NonZeroUsize::new(
|
|
||||||
parse_optional_integer("max_concurrent_syncs", toml)?
|
|
||||||
.unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS),
|
|
||||||
)
|
|
||||||
.context("Failed to parse 'max_concurrent_syncs' as a positive integer")?;
|
|
||||||
|
|
||||||
let max_sync_errors = NonZeroU32::new(
|
|
||||||
parse_optional_integer("max_sync_errors", toml)?
|
|
||||||
.unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
|
|
||||||
)
|
|
||||||
.context("Failed to parse 'max_sync_errors' as a positive integer")?;
|
|
||||||
|
|
||||||
let default_concurrency_limit = if use_azure {
|
let default_concurrency_limit = if use_azure {
|
||||||
DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
|
DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
|
||||||
} else {
|
} else {
|
||||||
@@ -633,11 +607,7 @@ impl RemoteStorageConfig {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(Some(RemoteStorageConfig {
|
Ok(Some(RemoteStorageConfig { storage }))
|
||||||
max_concurrent_syncs,
|
|
||||||
max_sync_errors,
|
|
||||||
storage,
|
|
||||||
}))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -4,23 +4,27 @@
|
|||||||
//! allowing multiple api users to independently work with the same S3 bucket, if
|
//! allowing multiple api users to independently work with the same S3 bucket, if
|
||||||
//! their bucket prefixes are both specified and different.
|
//! their bucket prefixes are both specified and different.
|
||||||
|
|
||||||
use std::borrow::Cow;
|
use std::{borrow::Cow, sync::Arc};
|
||||||
|
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use aws_config::{
|
use aws_config::{
|
||||||
environment::credentials::EnvironmentVariableCredentialsProvider,
|
environment::credentials::EnvironmentVariableCredentialsProvider,
|
||||||
imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
|
imds::credentials::ImdsCredentialsProvider,
|
||||||
provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider,
|
meta::credentials::CredentialsProviderChain,
|
||||||
|
provider_config::ProviderConfig,
|
||||||
|
retry::{RetryConfigBuilder, RetryMode},
|
||||||
|
web_identity_token::WebIdentityTokenCredentialsProvider,
|
||||||
};
|
};
|
||||||
use aws_credential_types::cache::CredentialsCache;
|
use aws_credential_types::cache::CredentialsCache;
|
||||||
use aws_sdk_s3::{
|
use aws_sdk_s3::{
|
||||||
config::{Config, Region},
|
config::{AsyncSleep, Config, Region, SharedAsyncSleep},
|
||||||
error::SdkError,
|
error::SdkError,
|
||||||
operation::get_object::GetObjectError,
|
operation::get_object::GetObjectError,
|
||||||
primitives::ByteStream,
|
primitives::ByteStream,
|
||||||
types::{Delete, ObjectIdentifier},
|
types::{Delete, ObjectIdentifier},
|
||||||
Client,
|
Client,
|
||||||
};
|
};
|
||||||
|
use aws_smithy_async::rt::sleep::TokioSleep;
|
||||||
use aws_smithy_http::body::SdkBody;
|
use aws_smithy_http::body::SdkBody;
|
||||||
use hyper::Body;
|
use hyper::Body;
|
||||||
use scopeguard::ScopeGuard;
|
use scopeguard::ScopeGuard;
|
||||||
@@ -83,10 +87,23 @@ impl S3Bucket {
|
|||||||
.or_else("imds", ImdsCredentialsProvider::builder().build())
|
.or_else("imds", ImdsCredentialsProvider::builder().build())
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
|
||||||
|
let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
|
||||||
|
|
||||||
|
// We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling
|
||||||
|
// responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config. We set it to use at most one
|
||||||
|
// attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
|
||||||
|
let mut retry_config = RetryConfigBuilder::new();
|
||||||
|
retry_config
|
||||||
|
.set_max_attempts(Some(1))
|
||||||
|
.set_mode(Some(RetryMode::Adaptive));
|
||||||
|
|
||||||
let mut config_builder = Config::builder()
|
let mut config_builder = Config::builder()
|
||||||
.region(region)
|
.region(region)
|
||||||
.credentials_cache(CredentialsCache::lazy())
|
.credentials_cache(CredentialsCache::lazy())
|
||||||
.credentials_provider(credentials_provider);
|
.credentials_provider(credentials_provider)
|
||||||
|
.sleep_impl(SharedAsyncSleep::from(sleep_impl))
|
||||||
|
.retry_config(retry_config.build());
|
||||||
|
|
||||||
if let Some(custom_endpoint) = aws_config.endpoint.clone() {
|
if let Some(custom_endpoint) = aws_config.endpoint.clone() {
|
||||||
config_builder = config_builder
|
config_builder = config_builder
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::env;
|
use std::env;
|
||||||
use std::num::{NonZeroU32, NonZeroUsize};
|
use std::num::NonZeroUsize;
|
||||||
use std::ops::ControlFlow;
|
use std::ops::ControlFlow;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
@@ -469,8 +469,6 @@ fn create_azure_client(
|
|||||||
let random = rand::thread_rng().gen::<u32>();
|
let random = rand::thread_rng().gen::<u32>();
|
||||||
|
|
||||||
let remote_storage_config = RemoteStorageConfig {
|
let remote_storage_config = RemoteStorageConfig {
|
||||||
max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
|
|
||||||
max_sync_errors: NonZeroU32::new(5).unwrap(),
|
|
||||||
storage: RemoteStorageKind::AzureContainer(AzureConfig {
|
storage: RemoteStorageKind::AzureContainer(AzureConfig {
|
||||||
container_name: remote_storage_azure_container,
|
container_name: remote_storage_azure_container,
|
||||||
container_region: remote_storage_azure_region,
|
container_region: remote_storage_azure_region,
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::env;
|
use std::env;
|
||||||
use std::num::{NonZeroU32, NonZeroUsize};
|
use std::num::NonZeroUsize;
|
||||||
use std::ops::ControlFlow;
|
use std::ops::ControlFlow;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
@@ -396,8 +396,6 @@ fn create_s3_client(
|
|||||||
let random = rand::thread_rng().gen::<u32>();
|
let random = rand::thread_rng().gen::<u32>();
|
||||||
|
|
||||||
let remote_storage_config = RemoteStorageConfig {
|
let remote_storage_config = RemoteStorageConfig {
|
||||||
max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
|
|
||||||
max_sync_errors: NonZeroU32::new(5).unwrap(),
|
|
||||||
storage: RemoteStorageKind::AwsS3(S3Config {
|
storage: RemoteStorageKind::AwsS3(S3Config {
|
||||||
bucket_name: remote_storage_s3_bucket,
|
bucket_name: remote_storage_s3_bucket,
|
||||||
bucket_region: remote_storage_s3_region,
|
bucket_region: remote_storage_s3_region,
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
#![deny(unsafe_code)]
|
||||||
|
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||||
use const_format::formatcp;
|
use const_format::formatcp;
|
||||||
|
|
||||||
/// Public API types
|
/// Public API types
|
||||||
|
|||||||
@@ -1,4 +1,6 @@
|
|||||||
//! Synthetic size calculation
|
//! Synthetic size calculation
|
||||||
|
#![deny(unsafe_code)]
|
||||||
|
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||||
|
|
||||||
mod calculation;
|
mod calculation;
|
||||||
pub mod svg;
|
pub mod svg;
|
||||||
|
|||||||
@@ -32,6 +32,8 @@
|
|||||||
//! .init();
|
//! .init();
|
||||||
//! }
|
//! }
|
||||||
//! ```
|
//! ```
|
||||||
|
#![deny(unsafe_code)]
|
||||||
|
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||||
|
|
||||||
use opentelemetry::sdk::Resource;
|
use opentelemetry::sdk::Resource;
|
||||||
use opentelemetry::KeyValue;
|
use opentelemetry::KeyValue;
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ edition.workspace = true
|
|||||||
license.workspace = true
|
license.workspace = true
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
arc-swap.workspace = true
|
||||||
sentry.workspace = true
|
sentry.workspace = true
|
||||||
async-trait.workspace = true
|
async-trait.workspace = true
|
||||||
anyhow.workspace = true
|
anyhow.workspace = true
|
||||||
|
|||||||
@@ -1,7 +1,8 @@
|
|||||||
// For details about authentication see docs/authentication.md
|
// For details about authentication see docs/authentication.md
|
||||||
|
|
||||||
|
use arc_swap::ArcSwap;
|
||||||
use serde;
|
use serde;
|
||||||
use std::fs;
|
use std::{borrow::Cow, fmt::Display, fs, sync::Arc};
|
||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use camino::Utf8Path;
|
use camino::Utf8Path;
|
||||||
@@ -10,7 +11,7 @@ use jsonwebtoken::{
|
|||||||
};
|
};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use crate::id::TenantId;
|
use crate::{http::error::ApiError, id::TenantId};
|
||||||
|
|
||||||
/// Algorithm to use. We require EdDSA.
|
/// Algorithm to use. We require EdDSA.
|
||||||
const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
|
const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
|
||||||
@@ -44,31 +45,106 @@ impl Claims {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub struct SwappableJwtAuth(ArcSwap<JwtAuth>);
|
||||||
|
|
||||||
|
impl SwappableJwtAuth {
|
||||||
|
pub fn new(jwt_auth: JwtAuth) -> Self {
|
||||||
|
SwappableJwtAuth(ArcSwap::new(Arc::new(jwt_auth)))
|
||||||
|
}
|
||||||
|
pub fn swap(&self, jwt_auth: JwtAuth) {
|
||||||
|
self.0.swap(Arc::new(jwt_auth));
|
||||||
|
}
|
||||||
|
pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
|
||||||
|
self.0.load().decode(token)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Debug for SwappableJwtAuth {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(f, "Swappable({:?})", self.0.load())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, PartialEq, Eq, Hash, Debug)]
|
||||||
|
pub struct AuthError(pub Cow<'static, str>);
|
||||||
|
|
||||||
|
impl Display for AuthError {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(f, "{}", self.0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<AuthError> for ApiError {
|
||||||
|
fn from(_value: AuthError) -> Self {
|
||||||
|
// Don't pass on the value of the AuthError as a precautionary measure.
|
||||||
|
// Being intentionally vague in public error communication hurts debugability
|
||||||
|
// but it is more secure.
|
||||||
|
ApiError::Forbidden("JWT authentication error".to_string())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub struct JwtAuth {
|
pub struct JwtAuth {
|
||||||
decoding_key: DecodingKey,
|
decoding_keys: Vec<DecodingKey>,
|
||||||
validation: Validation,
|
validation: Validation,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl JwtAuth {
|
impl JwtAuth {
|
||||||
pub fn new(decoding_key: DecodingKey) -> Self {
|
pub fn new(decoding_keys: Vec<DecodingKey>) -> Self {
|
||||||
let mut validation = Validation::default();
|
let mut validation = Validation::default();
|
||||||
validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM];
|
validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM];
|
||||||
// The default 'required_spec_claims' is 'exp'. But we don't want to require
|
// The default 'required_spec_claims' is 'exp'. But we don't want to require
|
||||||
// expiration.
|
// expiration.
|
||||||
validation.required_spec_claims = [].into();
|
validation.required_spec_claims = [].into();
|
||||||
Self {
|
Self {
|
||||||
decoding_key,
|
decoding_keys,
|
||||||
validation,
|
validation,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn from_key_path(key_path: &Utf8Path) -> Result<Self> {
|
pub fn from_key_path(key_path: &Utf8Path) -> Result<Self> {
|
||||||
let public_key = fs::read(key_path)?;
|
let metadata = key_path.metadata()?;
|
||||||
Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?))
|
let decoding_keys = if metadata.is_dir() {
|
||||||
|
let mut keys = Vec::new();
|
||||||
|
for entry in fs::read_dir(key_path)? {
|
||||||
|
let path = entry?.path();
|
||||||
|
if !path.is_file() {
|
||||||
|
// Ignore directories (don't recurse)
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let public_key = fs::read(path)?;
|
||||||
|
keys.push(DecodingKey::from_ed_pem(&public_key)?);
|
||||||
|
}
|
||||||
|
keys
|
||||||
|
} else if metadata.is_file() {
|
||||||
|
let public_key = fs::read(key_path)?;
|
||||||
|
vec![DecodingKey::from_ed_pem(&public_key)?]
|
||||||
|
} else {
|
||||||
|
anyhow::bail!("path is neither a directory or a file")
|
||||||
|
};
|
||||||
|
if decoding_keys.is_empty() {
|
||||||
|
anyhow::bail!("Configured for JWT auth with zero decoding keys. All JWT gated requests would be rejected.");
|
||||||
|
}
|
||||||
|
Ok(Self::new(decoding_keys))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn decode(&self, token: &str) -> Result<TokenData<Claims>> {
|
/// Attempt to decode the token with the internal decoding keys.
|
||||||
Ok(decode(token, &self.decoding_key, &self.validation)?)
|
///
|
||||||
|
/// The function tries the stored decoding keys in succession,
|
||||||
|
/// and returns the first yielding a successful result.
|
||||||
|
/// If there is no working decoding key, it returns the last error.
|
||||||
|
pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
|
||||||
|
let mut res = None;
|
||||||
|
for decoding_key in &self.decoding_keys {
|
||||||
|
res = Some(decode(token, decoding_key, &self.validation));
|
||||||
|
if let Some(Ok(res)) = res {
|
||||||
|
return Ok(res);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if let Some(res) = res {
|
||||||
|
res.map_err(|e| AuthError(Cow::Owned(e.to_string())))
|
||||||
|
} else {
|
||||||
|
Err(AuthError(Cow::Borrowed("no JWT decoding keys configured")))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -108,9 +184,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
|
|||||||
"#;
|
"#;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_decode() -> Result<(), anyhow::Error> {
|
fn test_decode() {
|
||||||
let expected_claims = Claims {
|
let expected_claims = Claims {
|
||||||
tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
|
tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
|
||||||
scope: Scope::Tenant,
|
scope: Scope::Tenant,
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -129,28 +205,24 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
|
|||||||
let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";
|
let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";
|
||||||
|
|
||||||
// Check it can be validated with the public key
|
// Check it can be validated with the public key
|
||||||
let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
|
let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
|
||||||
let claims_from_token = auth.decode(encoded_eddsa)?.claims;
|
let claims_from_token = auth.decode(encoded_eddsa).unwrap().claims;
|
||||||
assert_eq!(claims_from_token, expected_claims);
|
assert_eq!(claims_from_token, expected_claims);
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_encode() -> Result<(), anyhow::Error> {
|
fn test_encode() {
|
||||||
let claims = Claims {
|
let claims = Claims {
|
||||||
tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
|
tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
|
||||||
scope: Scope::Tenant,
|
scope: Scope::Tenant,
|
||||||
};
|
};
|
||||||
|
|
||||||
let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519)?;
|
let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519).unwrap();
|
||||||
|
|
||||||
// decode it back
|
// decode it back
|
||||||
let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
|
let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
|
||||||
let decoded = auth.decode(&encoded)?;
|
let decoded = auth.decode(&encoded).unwrap();
|
||||||
|
|
||||||
assert_eq!(decoded.claims, claims);
|
assert_eq!(decoded.claims, claims);
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
use crate::auth::{Claims, JwtAuth};
|
use crate::auth::{AuthError, Claims, SwappableJwtAuth};
|
||||||
use crate::http::error::{api_error_handler, route_error_handler, ApiError};
|
use crate::http::error::{api_error_handler, route_error_handler, ApiError};
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use hyper::header::{HeaderName, AUTHORIZATION};
|
use hyper::header::{HeaderName, AUTHORIZATION};
|
||||||
@@ -389,7 +389,7 @@ fn parse_token(header_value: &str) -> Result<&str, ApiError> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
|
pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
|
||||||
provide_auth: fn(&Request<Body>) -> Option<&JwtAuth>,
|
provide_auth: fn(&Request<Body>) -> Option<&SwappableJwtAuth>,
|
||||||
) -> Middleware<B, ApiError> {
|
) -> Middleware<B, ApiError> {
|
||||||
Middleware::pre(move |req| async move {
|
Middleware::pre(move |req| async move {
|
||||||
if let Some(auth) = provide_auth(&req) {
|
if let Some(auth) = provide_auth(&req) {
|
||||||
@@ -400,9 +400,11 @@ pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
|
|||||||
})?;
|
})?;
|
||||||
let token = parse_token(header_value)?;
|
let token = parse_token(header_value)?;
|
||||||
|
|
||||||
let data = auth
|
let data = auth.decode(token).map_err(|err| {
|
||||||
.decode(token)
|
warn!("Authentication error: {err}");
|
||||||
.map_err(|_| ApiError::Unauthorized("malformed jwt token".to_string()))?;
|
// Rely on From<AuthError> for ApiError impl
|
||||||
|
err
|
||||||
|
})?;
|
||||||
req.set_context(data.claims);
|
req.set_context(data.claims);
|
||||||
}
|
}
|
||||||
None => {
|
None => {
|
||||||
@@ -450,12 +452,11 @@ where
|
|||||||
|
|
||||||
pub fn check_permission_with(
|
pub fn check_permission_with(
|
||||||
req: &Request<Body>,
|
req: &Request<Body>,
|
||||||
check_permission: impl Fn(&Claims) -> Result<(), anyhow::Error>,
|
check_permission: impl Fn(&Claims) -> Result<(), AuthError>,
|
||||||
) -> Result<(), ApiError> {
|
) -> Result<(), ApiError> {
|
||||||
match req.context::<Claims>() {
|
match req.context::<Claims>() {
|
||||||
Some(claims) => {
|
Some(claims) => Ok(check_permission(&claims)
|
||||||
Ok(check_permission(&claims).map_err(|err| ApiError::Forbidden(err.to_string()))?)
|
.map_err(|_err| ApiError::Forbidden("JWT authentication error".to_string()))?),
|
||||||
}
|
|
||||||
None => Ok(()), // claims is None because auth is disabled
|
None => Ok(()), // claims is None because auth is disabled
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::error::Error as StdError;
|
use std::error::Error as StdError;
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
use tracing::{error, info};
|
use tracing::{error, info, warn};
|
||||||
|
|
||||||
#[derive(Debug, Error)]
|
#[derive(Debug, Error)]
|
||||||
pub enum ApiError {
|
pub enum ApiError {
|
||||||
@@ -118,6 +118,9 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
|
|||||||
// Print a stack trace for Internal Server errors
|
// Print a stack trace for Internal Server errors
|
||||||
|
|
||||||
match api_error {
|
match api_error {
|
||||||
|
ApiError::Forbidden(_) | ApiError::Unauthorized(_) => {
|
||||||
|
warn!("Error processing HTTP request: {api_error:#}")
|
||||||
|
}
|
||||||
ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
|
ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
|
||||||
ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
|
ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
|
||||||
ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
|
ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
|
||||||
|
|||||||
@@ -120,6 +120,8 @@ impl Id {
|
|||||||
chunk[0] = HEX[((b >> 4) & 0xf) as usize];
|
chunk[0] = HEX[((b >> 4) & 0xf) as usize];
|
||||||
chunk[1] = HEX[(b & 0xf) as usize];
|
chunk[1] = HEX[(b & 0xf) as usize];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SAFETY: vec constructed out of `HEX`, it can only be ascii
|
||||||
unsafe { String::from_utf8_unchecked(buf) }
|
unsafe { String::from_utf8_unchecked(buf) }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
//! `utils` is intended to be a place to put code that is shared
|
//! `utils` is intended to be a place to put code that is shared
|
||||||
//! between other crates in this repository.
|
//! between other crates in this repository.
|
||||||
|
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||||
|
|
||||||
pub mod backoff;
|
pub mod backoff;
|
||||||
|
|
||||||
@@ -77,6 +78,9 @@ pub mod completion;
|
|||||||
/// Reporting utilities
|
/// Reporting utilities
|
||||||
pub mod error;
|
pub mod error;
|
||||||
|
|
||||||
|
/// async timeout helper
|
||||||
|
pub mod timeout;
|
||||||
|
|
||||||
pub mod sync;
|
pub mod sync;
|
||||||
|
|
||||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||||
|
|||||||
@@ -125,6 +125,9 @@ where
|
|||||||
// Wake everyone with an error.
|
// Wake everyone with an error.
|
||||||
let mut internal = self.internal.lock().unwrap();
|
let mut internal = self.internal.lock().unwrap();
|
||||||
|
|
||||||
|
// Block any future waiters from starting
|
||||||
|
internal.shutdown = true;
|
||||||
|
|
||||||
// This will steal the entire waiters map.
|
// This will steal the entire waiters map.
|
||||||
// When we drop it all waiters will be woken.
|
// When we drop it all waiters will be woken.
|
||||||
mem::take(&mut internal.waiters)
|
mem::take(&mut internal.waiters)
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
/// Immediately terminate the calling process without calling
|
/// Immediately terminate the calling process without calling
|
||||||
/// atexit callbacks, C runtime destructors etc. We mainly use
|
/// atexit callbacks, C runtime destructors etc. We mainly use
|
||||||
/// this to protect coverage data from concurrent writes.
|
/// this to protect coverage data from concurrent writes.
|
||||||
pub fn exit_now(code: u8) {
|
pub fn exit_now(code: u8) -> ! {
|
||||||
|
// SAFETY: exiting is safe, the ffi is not safe
|
||||||
unsafe { nix::libc::_exit(code as _) };
|
unsafe { nix::libc::_exit(code as _) };
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1 +1,3 @@
|
|||||||
pub mod heavier_once_cell;
|
pub mod heavier_once_cell;
|
||||||
|
|
||||||
|
pub mod gate;
|
||||||
|
|||||||
158
libs/utils/src/sync/gate.rs
Normal file
158
libs/utils/src/sync/gate.rs
Normal file
@@ -0,0 +1,158 @@
|
|||||||
|
use std::{sync::Arc, time::Duration};
|
||||||
|
|
||||||
|
/// Gates are a concurrency helper, primarily used for implementing safe shutdown.
|
||||||
|
///
|
||||||
|
/// Users of a resource call `enter()` to acquire a GateGuard, and the owner of
|
||||||
|
/// the resource calls `close()` when they want to ensure that all holders of guards
|
||||||
|
/// have released them, and that no future guards will be issued.
|
||||||
|
pub struct Gate {
|
||||||
|
/// Each caller of enter() takes one unit from the semaphore. In close(), we
|
||||||
|
/// take all the units to ensure all GateGuards are destroyed.
|
||||||
|
sem: Arc<tokio::sync::Semaphore>,
|
||||||
|
|
||||||
|
/// For observability only: a name that will be used to log warnings if a particular
|
||||||
|
/// gate is holding up shutdown
|
||||||
|
name: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
|
||||||
|
/// not complete.
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct GateGuard(tokio::sync::OwnedSemaphorePermit);
|
||||||
|
|
||||||
|
/// Observability helper: every `warn_period`, emit a log warning that we're still waiting on this gate
|
||||||
|
async fn warn_if_stuck<Fut: std::future::Future>(
|
||||||
|
fut: Fut,
|
||||||
|
name: &str,
|
||||||
|
warn_period: std::time::Duration,
|
||||||
|
) -> <Fut as std::future::Future>::Output {
|
||||||
|
let started = std::time::Instant::now();
|
||||||
|
|
||||||
|
let mut fut = std::pin::pin!(fut);
|
||||||
|
|
||||||
|
loop {
|
||||||
|
match tokio::time::timeout(warn_period, &mut fut).await {
|
||||||
|
Ok(ret) => return ret,
|
||||||
|
Err(_) => {
|
||||||
|
tracing::warn!(
|
||||||
|
gate = name,
|
||||||
|
elapsed_ms = started.elapsed().as_millis(),
|
||||||
|
"still waiting, taking longer than expected..."
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub enum GateError {
|
||||||
|
GateClosed,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Gate {
|
||||||
|
const MAX_UNITS: u32 = u32::MAX;
|
||||||
|
|
||||||
|
pub fn new(name: String) -> Self {
|
||||||
|
Self {
|
||||||
|
sem: Arc::new(tokio::sync::Semaphore::new(Self::MAX_UNITS as usize)),
|
||||||
|
name,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Acquire a guard that will prevent close() calls from completing. If close()
|
||||||
|
/// was already called, this will return an error which should be interpreted
|
||||||
|
/// as "shutting down".
|
||||||
|
///
|
||||||
|
/// This function would typically be used from e.g. request handlers. While holding
|
||||||
|
/// the guard returned from this function, it is important to respect a CancellationToken
|
||||||
|
/// to avoid blocking close() indefinitely: typically types that contain a Gate will
|
||||||
|
/// also contain a CancellationToken.
|
||||||
|
pub fn enter(&self) -> Result<GateGuard, GateError> {
|
||||||
|
self.sem
|
||||||
|
.clone()
|
||||||
|
.try_acquire_owned()
|
||||||
|
.map(GateGuard)
|
||||||
|
.map_err(|_| GateError::GateClosed)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Types with a shutdown() method and a gate should call this method at the
|
||||||
|
/// end of shutdown, to ensure that all GateGuard holders are done.
|
||||||
|
///
|
||||||
|
/// This will wait for all guards to be destroyed. For this to complete promptly, it is
|
||||||
|
/// important that the holders of such guards are respecting a CancellationToken which has
|
||||||
|
/// been cancelled before entering this function.
|
||||||
|
pub async fn close(&self) {
|
||||||
|
warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish. This
|
||||||
|
/// is usually analoguous for "Did shutdown finish?" for types that include a Gate, whereas checking
|
||||||
|
/// the CancellationToken on such types is analogous to "Did shutdown start?"
|
||||||
|
pub fn close_complete(&self) -> bool {
|
||||||
|
self.sem.is_closed()
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn do_close(&self) {
|
||||||
|
tracing::debug!(gate = self.name, "Closing Gate...");
|
||||||
|
match self.sem.acquire_many(Self::MAX_UNITS).await {
|
||||||
|
Ok(_units) => {
|
||||||
|
// While holding all units, close the semaphore. All subsequent calls to enter() will fail.
|
||||||
|
self.sem.close();
|
||||||
|
}
|
||||||
|
Err(_) => {
|
||||||
|
// Semaphore closed: we are the only function that can do this, so it indicates a double-call.
|
||||||
|
// This is legal. Timeline::shutdown for example is not protected from being called more than
|
||||||
|
// once.
|
||||||
|
tracing::debug!(gate = self.name, "Double close")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tracing::debug!(gate = self.name, "Closed Gate.")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use futures::FutureExt;
|
||||||
|
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_idle_gate() {
|
||||||
|
// Having taken no gates, we should not be blocked in close
|
||||||
|
let gate = Gate::new("test".to_string());
|
||||||
|
gate.close().await;
|
||||||
|
|
||||||
|
// If a guard is dropped before entering, close should not be blocked
|
||||||
|
let gate = Gate::new("test".to_string());
|
||||||
|
let guard = gate.enter().unwrap();
|
||||||
|
drop(guard);
|
||||||
|
gate.close().await;
|
||||||
|
|
||||||
|
// Entering a closed guard fails
|
||||||
|
gate.enter().expect_err("enter should fail after close");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_busy_gate() {
|
||||||
|
let gate = Gate::new("test".to_string());
|
||||||
|
|
||||||
|
let guard = gate.enter().unwrap();
|
||||||
|
|
||||||
|
let mut close_fut = std::pin::pin!(gate.close());
|
||||||
|
|
||||||
|
// Close should be blocked
|
||||||
|
assert!(close_fut.as_mut().now_or_never().is_none());
|
||||||
|
|
||||||
|
// Attempting to enter() should fail, even though close isn't done yet.
|
||||||
|
gate.enter()
|
||||||
|
.expect_err("enter should fail after entering close");
|
||||||
|
|
||||||
|
drop(guard);
|
||||||
|
|
||||||
|
// Guard is gone, close should finish
|
||||||
|
assert!(close_fut.as_mut().now_or_never().is_some());
|
||||||
|
|
||||||
|
// Attempting to enter() is still forbidden
|
||||||
|
gate.enter().expect_err("enter should fail finishing close");
|
||||||
|
}
|
||||||
|
}
|
||||||
37
libs/utils/src/timeout.rs
Normal file
37
libs/utils/src/timeout.rs
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use tokio_util::sync::CancellationToken;
|
||||||
|
|
||||||
|
pub enum TimeoutCancellableError {
|
||||||
|
Timeout,
|
||||||
|
Cancelled,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Wrap [`tokio::time::timeout`] with a CancellationToken.
|
||||||
|
///
|
||||||
|
/// This wrapper is appropriate for any long running operation in a task
|
||||||
|
/// that ought to respect a CancellationToken (which means most tasks).
|
||||||
|
///
|
||||||
|
/// The only time you should use a bare [`tokio::time::timeout`] is when the future `F`
|
||||||
|
/// itself respects a CancellationToken: otherwise, always use this wrapper
|
||||||
|
/// with your CancellationToken to ensure that your task does not hold up
|
||||||
|
/// graceful shutdown.
|
||||||
|
pub async fn timeout_cancellable<F>(
|
||||||
|
duration: Duration,
|
||||||
|
cancel: &CancellationToken,
|
||||||
|
future: F,
|
||||||
|
) -> Result<F::Output, TimeoutCancellableError>
|
||||||
|
where
|
||||||
|
F: std::future::Future,
|
||||||
|
{
|
||||||
|
tokio::select!(
|
||||||
|
r = tokio::time::timeout(duration, future) => {
|
||||||
|
r.map_err(|_| TimeoutCancellableError::Timeout)
|
||||||
|
|
||||||
|
},
|
||||||
|
_ = cancel.cancelled() => {
|
||||||
|
Err(TimeoutCancellableError::Cancelled)
|
||||||
|
|
||||||
|
}
|
||||||
|
)
|
||||||
|
}
|
||||||
@@ -19,13 +19,12 @@ inotify.workspace = true
|
|||||||
serde.workspace = true
|
serde.workspace = true
|
||||||
serde_json.workspace = true
|
serde_json.workspace = true
|
||||||
sysinfo.workspace = true
|
sysinfo.workspace = true
|
||||||
tokio.workspace = true
|
tokio = { workspace = true, features = ["rt-multi-thread"] }
|
||||||
tokio-postgres.workspace = true
|
tokio-postgres.workspace = true
|
||||||
tokio-stream.workspace = true
|
tokio-stream.workspace = true
|
||||||
tokio-util.workspace = true
|
tokio-util.workspace = true
|
||||||
tracing.workspace = true
|
tracing.workspace = true
|
||||||
tracing-subscriber.workspace = true
|
tracing-subscriber.workspace = true
|
||||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
|
||||||
|
|
||||||
[target.'cfg(target_os = "linux")'.dependencies]
|
[target.'cfg(target_os = "linux")'.dependencies]
|
||||||
cgroups-rs = "0.3.3"
|
cgroups-rs = "0.3.3"
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
#![deny(unsafe_code)]
|
||||||
|
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||||
#![cfg(target_os = "linux")]
|
#![cfg(target_os = "linux")]
|
||||||
|
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
|
|||||||
@@ -188,6 +188,7 @@ extern "C" fn recovery_download(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[allow(clippy::unnecessary_cast)]
|
||||||
extern "C" fn wal_read(
|
extern "C" fn wal_read(
|
||||||
sk: *mut Safekeeper,
|
sk: *mut Safekeeper,
|
||||||
buf: *mut ::std::os::raw::c_char,
|
buf: *mut ::std::os::raw::c_char,
|
||||||
@@ -421,6 +422,7 @@ impl std::fmt::Display for Level {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Take ownership of `Vec<u8>` from StringInfoData.
|
/// Take ownership of `Vec<u8>` from StringInfoData.
|
||||||
|
#[allow(clippy::unnecessary_cast)]
|
||||||
pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> {
|
pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> {
|
||||||
if pg.data.is_null() {
|
if pg.data.is_null() {
|
||||||
return None;
|
return None;
|
||||||
|
|||||||
@@ -186,7 +186,7 @@ impl Wrapper {
|
|||||||
.unwrap()
|
.unwrap()
|
||||||
.into_bytes_with_nul();
|
.into_bytes_with_nul();
|
||||||
assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
|
assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
|
||||||
let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut i8;
|
let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut std::ffi::c_char;
|
||||||
|
|
||||||
let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;
|
let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;
|
||||||
|
|
||||||
|
|||||||
@@ -1,22 +1,21 @@
|
|||||||
use anyhow::{bail, Result};
|
use utils::auth::{AuthError, Claims, Scope};
|
||||||
use utils::auth::{Claims, Scope};
|
|
||||||
use utils::id::TenantId;
|
use utils::id::TenantId;
|
||||||
|
|
||||||
pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<()> {
|
pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
|
||||||
match (&claims.scope, tenant_id) {
|
match (&claims.scope, tenant_id) {
|
||||||
(Scope::Tenant, None) => {
|
(Scope::Tenant, None) => Err(AuthError(
|
||||||
bail!("Attempt to access management api with tenant scope. Permission denied")
|
"Attempt to access management api with tenant scope. Permission denied".into(),
|
||||||
}
|
)),
|
||||||
(Scope::Tenant, Some(tenant_id)) => {
|
(Scope::Tenant, Some(tenant_id)) => {
|
||||||
if claims.tenant_id.unwrap() != tenant_id {
|
if claims.tenant_id.unwrap() != tenant_id {
|
||||||
bail!("Tenant id mismatch. Permission denied")
|
return Err(AuthError("Tenant id mismatch. Permission denied".into()));
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
(Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
|
(Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
|
||||||
(Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
|
(Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
|
||||||
(Scope::SafekeeperData, _) => {
|
(Scope::SafekeeperData, _) => Err(AuthError(
|
||||||
bail!("SafekeeperData scope makes no sense for Pageserver")
|
"SafekeeperData scope makes no sense for Pageserver".into(),
|
||||||
}
|
)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -34,8 +34,11 @@ use postgres_backend::AuthType;
|
|||||||
use utils::logging::TracingErrorLayerEnablement;
|
use utils::logging::TracingErrorLayerEnablement;
|
||||||
use utils::signals::ShutdownSignals;
|
use utils::signals::ShutdownSignals;
|
||||||
use utils::{
|
use utils::{
|
||||||
auth::JwtAuth, logging, project_build_tag, project_git_version, sentry_init::init_sentry,
|
auth::{JwtAuth, SwappableJwtAuth},
|
||||||
signals::Signal, tcp_listener,
|
logging, project_build_tag, project_git_version,
|
||||||
|
sentry_init::init_sentry,
|
||||||
|
signals::Signal,
|
||||||
|
tcp_listener,
|
||||||
};
|
};
|
||||||
|
|
||||||
project_git_version!(GIT_VERSION);
|
project_git_version!(GIT_VERSION);
|
||||||
@@ -321,13 +324,12 @@ fn start_pageserver(
|
|||||||
let http_auth;
|
let http_auth;
|
||||||
let pg_auth;
|
let pg_auth;
|
||||||
if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
|
if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
|
||||||
// unwrap is ok because check is performed when creating config, so path is set and file exists
|
// unwrap is ok because check is performed when creating config, so path is set and exists
|
||||||
let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
|
let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
|
||||||
info!(
|
info!("Loading public key(s) for verifying JWT tokens from {key_path:?}");
|
||||||
"Loading public key for verifying JWT tokens from {:#?}",
|
|
||||||
key_path
|
let jwt_auth = JwtAuth::from_key_path(key_path)?;
|
||||||
);
|
let auth: Arc<SwappableJwtAuth> = Arc::new(SwappableJwtAuth::new(jwt_auth));
|
||||||
let auth: Arc<JwtAuth> = Arc::new(JwtAuth::from_key_path(key_path)?);
|
|
||||||
|
|
||||||
http_auth = match &conf.http_auth_type {
|
http_auth = match &conf.http_auth_type {
|
||||||
AuthType::Trust => None,
|
AuthType::Trust => None,
|
||||||
@@ -410,7 +412,7 @@ fn start_pageserver(
|
|||||||
|
|
||||||
// Scan the local 'tenants/' directory and start loading the tenants
|
// Scan the local 'tenants/' directory and start loading the tenants
|
||||||
let deletion_queue_client = deletion_queue.new_client();
|
let deletion_queue_client = deletion_queue.new_client();
|
||||||
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
|
let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
|
||||||
conf,
|
conf,
|
||||||
TenantSharedResources {
|
TenantSharedResources {
|
||||||
broker_client: broker_client.clone(),
|
broker_client: broker_client.clone(),
|
||||||
@@ -420,6 +422,7 @@ fn start_pageserver(
|
|||||||
order,
|
order,
|
||||||
shutdown_pageserver.clone(),
|
shutdown_pageserver.clone(),
|
||||||
))?;
|
))?;
|
||||||
|
let tenant_manager = Arc::new(tenant_manager);
|
||||||
|
|
||||||
BACKGROUND_RUNTIME.spawn({
|
BACKGROUND_RUNTIME.spawn({
|
||||||
let init_done_rx = init_done_rx;
|
let init_done_rx = init_done_rx;
|
||||||
@@ -548,6 +551,7 @@ fn start_pageserver(
|
|||||||
let router_state = Arc::new(
|
let router_state = Arc::new(
|
||||||
http::routes::State::new(
|
http::routes::State::new(
|
||||||
conf,
|
conf,
|
||||||
|
tenant_manager,
|
||||||
http_auth.clone(),
|
http_auth.clone(),
|
||||||
remote_storage.clone(),
|
remote_storage.clone(),
|
||||||
broker_client.clone(),
|
broker_client.clone(),
|
||||||
|
|||||||
@@ -161,7 +161,7 @@ pub struct PageServerConf {
|
|||||||
pub http_auth_type: AuthType,
|
pub http_auth_type: AuthType,
|
||||||
/// authentication method for libpq connections from compute
|
/// authentication method for libpq connections from compute
|
||||||
pub pg_auth_type: AuthType,
|
pub pg_auth_type: AuthType,
|
||||||
/// Path to a file containing public key for verifying JWT tokens.
|
/// Path to a file or directory containing public key(s) for verifying JWT tokens.
|
||||||
/// Used for both mgmt and compute auth, if enabled.
|
/// Used for both mgmt and compute auth, if enabled.
|
||||||
pub auth_validation_public_key_path: Option<Utf8PathBuf>,
|
pub auth_validation_public_key_path: Option<Utf8PathBuf>,
|
||||||
|
|
||||||
@@ -1314,12 +1314,6 @@ broker_endpoint = '{broker_endpoint}'
|
|||||||
assert_eq!(
|
assert_eq!(
|
||||||
parsed_remote_storage_config,
|
parsed_remote_storage_config,
|
||||||
RemoteStorageConfig {
|
RemoteStorageConfig {
|
||||||
max_concurrent_syncs: NonZeroUsize::new(
|
|
||||||
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS
|
|
||||||
)
|
|
||||||
.unwrap(),
|
|
||||||
max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
|
|
||||||
.unwrap(),
|
|
||||||
storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
|
storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
|
||||||
},
|
},
|
||||||
"Remote storage config should correctly parse the local FS config and fill other storage defaults"
|
"Remote storage config should correctly parse the local FS config and fill other storage defaults"
|
||||||
@@ -1380,8 +1374,6 @@ broker_endpoint = '{broker_endpoint}'
|
|||||||
assert_eq!(
|
assert_eq!(
|
||||||
parsed_remote_storage_config,
|
parsed_remote_storage_config,
|
||||||
RemoteStorageConfig {
|
RemoteStorageConfig {
|
||||||
max_concurrent_syncs,
|
|
||||||
max_sync_errors,
|
|
||||||
storage: RemoteStorageKind::AwsS3(S3Config {
|
storage: RemoteStorageKind::AwsS3(S3Config {
|
||||||
bucket_name: bucket_name.clone(),
|
bucket_name: bucket_name.clone(),
|
||||||
bucket_region: bucket_region.clone(),
|
bucket_region: bucket_region.clone(),
|
||||||
|
|||||||
@@ -266,7 +266,7 @@ async fn calculate_synthetic_size_worker(
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
|
if let Ok(tenant) = mgr::get_tenant(tenant_id, true) {
|
||||||
// TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
|
// TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
|
||||||
// We can put in some prioritization for consumption metrics.
|
// We can put in some prioritization for consumption metrics.
|
||||||
// Same for the loop that fetches computed metrics.
|
// Same for the loop that fetches computed metrics.
|
||||||
|
|||||||
@@ -202,7 +202,6 @@ pub(super) async fn collect_all_metrics(
|
|||||||
None
|
None
|
||||||
} else {
|
} else {
|
||||||
crate::tenant::mgr::get_tenant(id, true)
|
crate::tenant::mgr::get_tenant(id, true)
|
||||||
.await
|
|
||||||
.ok()
|
.ok()
|
||||||
.map(|tenant| (id, tenant))
|
.map(|tenant| (id, tenant))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -893,14 +893,6 @@ mod test {
|
|||||||
std::fs::create_dir_all(remote_fs_dir)?;
|
std::fs::create_dir_all(remote_fs_dir)?;
|
||||||
let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
|
let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
|
||||||
let storage_config = RemoteStorageConfig {
|
let storage_config = RemoteStorageConfig {
|
||||||
max_concurrent_syncs: std::num::NonZeroUsize::new(
|
|
||||||
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
|
|
||||||
)
|
|
||||||
.unwrap(),
|
|
||||||
max_sync_errors: std::num::NonZeroU32::new(
|
|
||||||
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
|
|
||||||
)
|
|
||||||
.unwrap(),
|
|
||||||
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
|
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
|
||||||
};
|
};
|
||||||
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
|
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
|
||||||
|
|||||||
@@ -55,21 +55,24 @@ impl Deleter {
|
|||||||
|
|
||||||
/// Wrap the remote `delete_objects` with a failpoint
|
/// Wrap the remote `delete_objects` with a failpoint
|
||||||
async fn remote_delete(&self) -> Result<(), anyhow::Error> {
|
async fn remote_delete(&self) -> Result<(), anyhow::Error> {
|
||||||
fail::fail_point!("deletion-queue-before-execute", |_| {
|
|
||||||
info!("Skipping execution, failpoint set");
|
|
||||||
metrics::DELETION_QUEUE
|
|
||||||
.remote_errors
|
|
||||||
.with_label_values(&["failpoint"])
|
|
||||||
.inc();
|
|
||||||
Err(anyhow::anyhow!("failpoint hit"))
|
|
||||||
});
|
|
||||||
|
|
||||||
// A backoff::retry is used here for two reasons:
|
// A backoff::retry is used here for two reasons:
|
||||||
// - To provide a backoff rather than busy-polling the API on errors
|
// - To provide a backoff rather than busy-polling the API on errors
|
||||||
// - To absorb transient 429/503 conditions without hitting our error
|
// - To absorb transient 429/503 conditions without hitting our error
|
||||||
// logging path for issues deleting objects.
|
// logging path for issues deleting objects.
|
||||||
backoff::retry(
|
backoff::retry(
|
||||||
|| async { self.remote_storage.delete_objects(&self.accumulator).await },
|
|| async {
|
||||||
|
fail::fail_point!("deletion-queue-before-execute", |_| {
|
||||||
|
info!("Skipping execution, failpoint set");
|
||||||
|
|
||||||
|
metrics::DELETION_QUEUE
|
||||||
|
.remote_errors
|
||||||
|
.with_label_values(&["failpoint"])
|
||||||
|
.inc();
|
||||||
|
Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
|
||||||
|
});
|
||||||
|
|
||||||
|
self.remote_storage.delete_objects(&self.accumulator).await
|
||||||
|
},
|
||||||
|_| false,
|
|_| false,
|
||||||
3,
|
3,
|
||||||
10,
|
10,
|
||||||
|
|||||||
@@ -403,7 +403,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
|||||||
return (evicted_bytes, evictions_failed);
|
return (evicted_bytes, evictions_failed);
|
||||||
};
|
};
|
||||||
|
|
||||||
let results = timeline.evict_layers(&batch, &cancel).await;
|
let results = timeline.evict_layers(&batch).await;
|
||||||
|
|
||||||
match results {
|
match results {
|
||||||
Ok(results) => {
|
Ok(results) => {
|
||||||
@@ -545,7 +545,7 @@ async fn collect_eviction_candidates(
|
|||||||
if cancel.is_cancelled() {
|
if cancel.is_cancelled() {
|
||||||
return Ok(EvictionCandidates::Cancelled);
|
return Ok(EvictionCandidates::Cancelled);
|
||||||
}
|
}
|
||||||
let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
|
let tenant = match tenant::mgr::get_tenant(*tenant_id, true) {
|
||||||
Ok(tenant) => tenant,
|
Ok(tenant) => tenant,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
// this can happen if tenant has lifecycle transition after we fetched it
|
// this can happen if tenant has lifecycle transition after we fetched it
|
||||||
@@ -554,6 +554,11 @@ async fn collect_eviction_candidates(
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if tenant.cancel.is_cancelled() {
|
||||||
|
info!(%tenant_id, "Skipping tenant for eviction, it is shutting down");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// collect layers from all timelines in this tenant
|
// collect layers from all timelines in this tenant
|
||||||
//
|
//
|
||||||
// If one of the timelines becomes `!is_active()` during the iteration,
|
// If one of the timelines becomes `!is_active()` during the iteration,
|
||||||
|
|||||||
@@ -52,6 +52,31 @@ paths:
|
|||||||
schema:
|
schema:
|
||||||
type: object
|
type: object
|
||||||
|
|
||||||
|
/v1/reload_auth_validation_keys:
|
||||||
|
post:
|
||||||
|
description: Reloads the JWT public keys from their pre-configured location on disk.
|
||||||
|
responses:
|
||||||
|
"200":
|
||||||
|
description: The reload completed successfully.
|
||||||
|
"401":
|
||||||
|
description: Unauthorized Error
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/UnauthorizedError"
|
||||||
|
"403":
|
||||||
|
description: Forbidden Error
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/ForbiddenError"
|
||||||
|
"500":
|
||||||
|
description: Generic operation error (also hits if no keys were found)
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/Error"
|
||||||
|
|
||||||
/v1/tenant/{tenant_id}:
|
/v1/tenant/{tenant_id}:
|
||||||
parameters:
|
parameters:
|
||||||
- name: tenant_id
|
- name: tenant_id
|
||||||
@@ -327,7 +352,8 @@ paths:
|
|||||||
in: query
|
in: query
|
||||||
required: true
|
required: true
|
||||||
schema:
|
schema:
|
||||||
type: integer
|
type: string
|
||||||
|
format: hex
|
||||||
description: A LSN to get the timestamp
|
description: A LSN to get the timestamp
|
||||||
responses:
|
responses:
|
||||||
"200":
|
"200":
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ use remote_storage::GenericRemoteStorage;
|
|||||||
use tenant_size_model::{SizeResult, StorageModel};
|
use tenant_size_model::{SizeResult, StorageModel};
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
|
use utils::auth::JwtAuth;
|
||||||
use utils::http::endpoint::request_span;
|
use utils::http::endpoint::request_span;
|
||||||
use utils::http::json::json_request_or_empty_body;
|
use utils::http::json::json_request_or_empty_body;
|
||||||
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||||
@@ -35,7 +36,8 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
|
|||||||
use crate::task_mgr::TaskKind;
|
use crate::task_mgr::TaskKind;
|
||||||
use crate::tenant::config::{LocationConf, TenantConfOpt};
|
use crate::tenant::config::{LocationConf, TenantConfOpt};
|
||||||
use crate::tenant::mgr::{
|
use crate::tenant::mgr::{
|
||||||
GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
|
GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
|
||||||
|
TenantSlotError, TenantSlotUpsertError, TenantStateError,
|
||||||
};
|
};
|
||||||
use crate::tenant::size::ModelInputs;
|
use crate::tenant::size::ModelInputs;
|
||||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||||
@@ -44,7 +46,7 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSha
|
|||||||
use crate::{config::PageServerConf, tenant::mgr};
|
use crate::{config::PageServerConf, tenant::mgr};
|
||||||
use crate::{disk_usage_eviction_task, tenant};
|
use crate::{disk_usage_eviction_task, tenant};
|
||||||
use utils::{
|
use utils::{
|
||||||
auth::JwtAuth,
|
auth::SwappableJwtAuth,
|
||||||
generation::Generation,
|
generation::Generation,
|
||||||
http::{
|
http::{
|
||||||
endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
|
endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
|
||||||
@@ -62,7 +64,8 @@ use super::models::ConfigureFailpointsRequest;
|
|||||||
|
|
||||||
pub struct State {
|
pub struct State {
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
auth: Option<Arc<JwtAuth>>,
|
tenant_manager: Arc<TenantManager>,
|
||||||
|
auth: Option<Arc<SwappableJwtAuth>>,
|
||||||
allowlist_routes: Vec<Uri>,
|
allowlist_routes: Vec<Uri>,
|
||||||
remote_storage: Option<GenericRemoteStorage>,
|
remote_storage: Option<GenericRemoteStorage>,
|
||||||
broker_client: storage_broker::BrokerClientChannel,
|
broker_client: storage_broker::BrokerClientChannel,
|
||||||
@@ -73,7 +76,8 @@ pub struct State {
|
|||||||
impl State {
|
impl State {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
auth: Option<Arc<JwtAuth>>,
|
tenant_manager: Arc<TenantManager>,
|
||||||
|
auth: Option<Arc<SwappableJwtAuth>>,
|
||||||
remote_storage: Option<GenericRemoteStorage>,
|
remote_storage: Option<GenericRemoteStorage>,
|
||||||
broker_client: storage_broker::BrokerClientChannel,
|
broker_client: storage_broker::BrokerClientChannel,
|
||||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||||
@@ -85,6 +89,7 @@ impl State {
|
|||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
conf,
|
conf,
|
||||||
|
tenant_manager,
|
||||||
auth,
|
auth,
|
||||||
allowlist_routes,
|
allowlist_routes,
|
||||||
remote_storage,
|
remote_storage,
|
||||||
@@ -146,28 +151,59 @@ impl From<PageReconstructError> for ApiError {
|
|||||||
impl From<TenantMapInsertError> for ApiError {
|
impl From<TenantMapInsertError> for ApiError {
|
||||||
fn from(tmie: TenantMapInsertError) -> ApiError {
|
fn from(tmie: TenantMapInsertError) -> ApiError {
|
||||||
match tmie {
|
match tmie {
|
||||||
TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
|
TenantMapInsertError::SlotError(e) => e.into(),
|
||||||
ApiError::ResourceUnavailable(format!("{tmie}").into())
|
TenantMapInsertError::SlotUpsertError(e) => e.into(),
|
||||||
}
|
|
||||||
TenantMapInsertError::TenantAlreadyExists(id, state) => {
|
|
||||||
ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
|
|
||||||
}
|
|
||||||
TenantMapInsertError::TenantExistsSecondary(id) => {
|
|
||||||
ApiError::Conflict(format!("tenant {id} already exists as secondary"))
|
|
||||||
}
|
|
||||||
TenantMapInsertError::Other(e) => ApiError::InternalServerError(e),
|
TenantMapInsertError::Other(e) => ApiError::InternalServerError(e),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl From<TenantSlotError> for ApiError {
|
||||||
|
fn from(e: TenantSlotError) -> ApiError {
|
||||||
|
use TenantSlotError::*;
|
||||||
|
match e {
|
||||||
|
NotFound(tenant_id) => {
|
||||||
|
ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
|
||||||
|
}
|
||||||
|
e @ (AlreadyExists(_, _) | Conflict(_)) => ApiError::Conflict(format!("{e}")),
|
||||||
|
InProgress => {
|
||||||
|
ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
|
||||||
|
}
|
||||||
|
MapState(e) => e.into(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<TenantSlotUpsertError> for ApiError {
|
||||||
|
fn from(e: TenantSlotUpsertError) -> ApiError {
|
||||||
|
use TenantSlotUpsertError::*;
|
||||||
|
match e {
|
||||||
|
InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")),
|
||||||
|
MapState(e) => e.into(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<TenantMapError> for ApiError {
|
||||||
|
fn from(e: TenantMapError) -> ApiError {
|
||||||
|
use TenantMapError::*;
|
||||||
|
match e {
|
||||||
|
StillInitializing | ShuttingDown => {
|
||||||
|
ApiError::ResourceUnavailable(format!("{e}").into())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl From<TenantStateError> for ApiError {
|
impl From<TenantStateError> for ApiError {
|
||||||
fn from(tse: TenantStateError) -> ApiError {
|
fn from(tse: TenantStateError) -> ApiError {
|
||||||
match tse {
|
match tse {
|
||||||
TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
|
|
||||||
TenantStateError::IsStopping(_) => {
|
TenantStateError::IsStopping(_) => {
|
||||||
ApiError::ResourceUnavailable("Tenant is stopping".into())
|
ApiError::ResourceUnavailable("Tenant is stopping".into())
|
||||||
}
|
}
|
||||||
_ => ApiError::InternalServerError(anyhow::Error::new(tse)),
|
TenantStateError::SlotError(e) => e.into(),
|
||||||
|
TenantStateError::SlotUpsertError(e) => e.into(),
|
||||||
|
TenantStateError::Other(e) => ApiError::InternalServerError(anyhow!(e)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -188,6 +224,7 @@ impl From<GetTenantError> for ApiError {
|
|||||||
// (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
|
// (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
|
||||||
ApiError::ResourceUnavailable("Tenant not yet active".into())
|
ApiError::ResourceUnavailable("Tenant not yet active".into())
|
||||||
}
|
}
|
||||||
|
GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -242,6 +279,9 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
|
|||||||
Get(g) => ApiError::from(g),
|
Get(g) => ApiError::from(g),
|
||||||
e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
|
e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
|
||||||
Timeline(t) => ApiError::from(t),
|
Timeline(t) => ApiError::from(t),
|
||||||
|
NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()),
|
||||||
|
SlotError(e) => e.into(),
|
||||||
|
SlotUpsertError(e) => e.into(),
|
||||||
Other(o) => ApiError::InternalServerError(o),
|
Other(o) => ApiError::InternalServerError(o),
|
||||||
e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
|
e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
|
||||||
}
|
}
|
||||||
@@ -263,11 +303,7 @@ async fn build_timeline_info(
|
|||||||
// we're executing this function, we will outlive the timeline on-disk state.
|
// we're executing this function, we will outlive the timeline on-disk state.
|
||||||
info.current_logical_size_non_incremental = Some(
|
info.current_logical_size_non_incremental = Some(
|
||||||
timeline
|
timeline
|
||||||
.get_current_logical_size_non_incremental(
|
.get_current_logical_size_non_incremental(info.last_record_lsn, ctx)
|
||||||
info.last_record_lsn,
|
|
||||||
CancellationToken::new(),
|
|
||||||
ctx,
|
|
||||||
)
|
|
||||||
.await?,
|
.await?,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -353,6 +389,32 @@ async fn status_handler(
|
|||||||
json_response(StatusCode::OK, StatusResponse { id: config.id })
|
json_response(StatusCode::OK, StatusResponse { id: config.id })
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn reload_auth_validation_keys_handler(
|
||||||
|
request: Request<Body>,
|
||||||
|
_cancel: CancellationToken,
|
||||||
|
) -> Result<Response<Body>, ApiError> {
|
||||||
|
check_permission(&request, None)?;
|
||||||
|
let config = get_config(&request);
|
||||||
|
let state = get_state(&request);
|
||||||
|
let Some(shared_auth) = &state.auth else {
|
||||||
|
return json_response(StatusCode::BAD_REQUEST, ());
|
||||||
|
};
|
||||||
|
// unwrap is ok because check is performed when creating config, so path is set and exists
|
||||||
|
let key_path = config.auth_validation_public_key_path.as_ref().unwrap();
|
||||||
|
info!("Reloading public key(s) for verifying JWT tokens from {key_path:?}");
|
||||||
|
|
||||||
|
match JwtAuth::from_key_path(key_path) {
|
||||||
|
Ok(new_auth) => {
|
||||||
|
shared_auth.swap(new_auth);
|
||||||
|
json_response(StatusCode::OK, ())
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
warn!("Error reloading public keys from {key_path:?}: {e:}");
|
||||||
|
json_response(StatusCode::INTERNAL_SERVER_ERROR, ())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async fn timeline_create_handler(
|
async fn timeline_create_handler(
|
||||||
mut request: Request<Body>,
|
mut request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
@@ -368,7 +430,7 @@ async fn timeline_create_handler(
|
|||||||
let state = get_state(&request);
|
let state = get_state(&request);
|
||||||
|
|
||||||
async {
|
async {
|
||||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
let tenant = mgr::get_tenant(tenant_id, true)?;
|
||||||
match tenant.create_timeline(
|
match tenant.create_timeline(
|
||||||
new_timeline_id,
|
new_timeline_id,
|
||||||
request_data.ancestor_timeline_id.map(TimelineId::from),
|
request_data.ancestor_timeline_id.map(TimelineId::from),
|
||||||
@@ -396,6 +458,9 @@ async fn timeline_create_handler(
|
|||||||
Err(e @ tenant::CreateTimelineError::AncestorNotActive) => {
|
Err(e @ tenant::CreateTimelineError::AncestorNotActive) => {
|
||||||
json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string()))
|
json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string()))
|
||||||
}
|
}
|
||||||
|
Err(tenant::CreateTimelineError::ShuttingDown) => {
|
||||||
|
json_response(StatusCode::SERVICE_UNAVAILABLE,HttpErrorBody::from_msg("tenant shutting down".to_string()))
|
||||||
|
}
|
||||||
Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
|
Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -415,7 +480,7 @@ async fn timeline_list_handler(
|
|||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
|
|
||||||
let response_data = async {
|
let response_data = async {
|
||||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
let tenant = mgr::get_tenant(tenant_id, true)?;
|
||||||
let timelines = tenant.list_timelines();
|
let timelines = tenant.list_timelines();
|
||||||
|
|
||||||
let mut response_data = Vec::with_capacity(timelines.len());
|
let mut response_data = Vec::with_capacity(timelines.len());
|
||||||
@@ -454,7 +519,7 @@ async fn timeline_detail_handler(
|
|||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
|
|
||||||
let timeline_info = async {
|
let timeline_info = async {
|
||||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
let tenant = mgr::get_tenant(tenant_id, true)?;
|
||||||
|
|
||||||
let timeline = tenant
|
let timeline = tenant
|
||||||
.get_timeline(timeline_id, false)
|
.get_timeline(timeline_id, false)
|
||||||
@@ -710,7 +775,7 @@ async fn tenant_status(
|
|||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
|
|
||||||
let tenant_info = async {
|
let tenant_info = async {
|
||||||
let tenant = mgr::get_tenant(tenant_id, false).await?;
|
let tenant = mgr::get_tenant(tenant_id, false)?;
|
||||||
|
|
||||||
// Calculate total physical size of all timelines
|
// Calculate total physical size of all timelines
|
||||||
let mut current_physical_size = 0;
|
let mut current_physical_size = 0;
|
||||||
@@ -773,7 +838,7 @@ async fn tenant_size_handler(
|
|||||||
let headers = request.headers();
|
let headers = request.headers();
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
let tenant = mgr::get_tenant(tenant_id, true)?;
|
||||||
|
|
||||||
// this can be a long operation
|
// this can be a long operation
|
||||||
let inputs = tenant
|
let inputs = tenant
|
||||||
@@ -1030,7 +1095,7 @@ async fn get_tenant_config_handler(
|
|||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_id))?;
|
||||||
|
|
||||||
let tenant = mgr::get_tenant(tenant_id, false).await?;
|
let tenant = mgr::get_tenant(tenant_id, false)?;
|
||||||
|
|
||||||
let response = HashMap::from([
|
let response = HashMap::from([
|
||||||
(
|
(
|
||||||
@@ -1089,7 +1154,7 @@ async fn put_tenant_location_config_handler(
|
|||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
match e {
|
match e {
|
||||||
TenantStateError::NotFound(_) => {
|
TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
|
||||||
// This API is idempotent: a NotFound on a detach is fine.
|
// This API is idempotent: a NotFound on a detach is fine.
|
||||||
}
|
}
|
||||||
_ => return Err(e.into()),
|
_ => return Err(e.into()),
|
||||||
@@ -1101,20 +1166,14 @@ async fn put_tenant_location_config_handler(
|
|||||||
let location_conf =
|
let location_conf =
|
||||||
LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
|
LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
|
||||||
|
|
||||||
mgr::upsert_location(
|
state
|
||||||
state.conf,
|
.tenant_manager
|
||||||
tenant_id,
|
.upsert_location(tenant_id, location_conf, &ctx)
|
||||||
location_conf,
|
.await
|
||||||
state.broker_client.clone(),
|
// TODO: badrequest assumes the caller was asking for something unreasonable, but in
|
||||||
state.remote_storage.clone(),
|
// principle we might have hit something like concurrent API calls to the same tenant,
|
||||||
state.deletion_queue_client.clone(),
|
// which is not a 400 but a 409.
|
||||||
&ctx,
|
.map_err(ApiError::BadRequest)?;
|
||||||
)
|
|
||||||
.await
|
|
||||||
// TODO: badrequest assumes the caller was asking for something unreasonable, but in
|
|
||||||
// principle we might have hit something like concurrent API calls to the same tenant,
|
|
||||||
// which is not a 400 but a 409.
|
|
||||||
.map_err(ApiError::BadRequest)?;
|
|
||||||
|
|
||||||
json_response(StatusCode::OK, ())
|
json_response(StatusCode::OK, ())
|
||||||
}
|
}
|
||||||
@@ -1127,7 +1186,6 @@ async fn handle_tenant_break(
|
|||||||
let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;
|
let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;
|
||||||
|
|
||||||
let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
|
let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
|
||||||
.await
|
|
||||||
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
|
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
|
||||||
|
|
||||||
tenant.set_broken("broken from test".to_owned()).await;
|
tenant.set_broken("broken from test".to_owned()).await;
|
||||||
@@ -1432,7 +1490,7 @@ async fn active_timeline_of_active_tenant(
|
|||||||
tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
) -> Result<Arc<Timeline>, ApiError> {
|
) -> Result<Arc<Timeline>, ApiError> {
|
||||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
let tenant = mgr::get_tenant(tenant_id, true)?;
|
||||||
tenant
|
tenant
|
||||||
.get_timeline(timeline_id, true)
|
.get_timeline(timeline_id, true)
|
||||||
.map_err(|e| ApiError::NotFound(e.into()))
|
.map_err(|e| ApiError::NotFound(e.into()))
|
||||||
@@ -1609,6 +1667,8 @@ where
|
|||||||
);
|
);
|
||||||
|
|
||||||
match handle.await {
|
match handle.await {
|
||||||
|
// TODO: never actually return Err from here, always Ok(...) so that we can log
|
||||||
|
// spanned errors. Call api_error_handler instead and return appropriate Body.
|
||||||
Ok(result) => result,
|
Ok(result) => result,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
// The handler task panicked. We have a global panic handler that logs the
|
// The handler task panicked. We have a global panic handler that logs the
|
||||||
@@ -1657,7 +1717,7 @@ where
|
|||||||
pub fn make_router(
|
pub fn make_router(
|
||||||
state: Arc<State>,
|
state: Arc<State>,
|
||||||
launch_ts: &'static LaunchTimestamp,
|
launch_ts: &'static LaunchTimestamp,
|
||||||
auth: Option<Arc<JwtAuth>>,
|
auth: Option<Arc<SwappableJwtAuth>>,
|
||||||
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
|
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
|
||||||
let spec = include_bytes!("openapi_spec.yml");
|
let spec = include_bytes!("openapi_spec.yml");
|
||||||
let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
|
let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
|
||||||
@@ -1686,6 +1746,9 @@ pub fn make_router(
|
|||||||
.put("/v1/failpoints", |r| {
|
.put("/v1/failpoints", |r| {
|
||||||
testing_api_handler("manage failpoints", r, failpoints_handler)
|
testing_api_handler("manage failpoints", r, failpoints_handler)
|
||||||
})
|
})
|
||||||
|
.post("/v1/reload_auth_validation_keys", |r| {
|
||||||
|
api_handler(r, reload_auth_validation_keys_handler)
|
||||||
|
})
|
||||||
.get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
|
.get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
|
||||||
.post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
|
.post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
|
||||||
.get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
|
.get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||||
|
|
||||||
mod auth;
|
mod auth;
|
||||||
pub mod basebackup;
|
pub mod basebackup;
|
||||||
pub mod config;
|
pub mod config;
|
||||||
@@ -61,14 +63,6 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
|
|||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
// Shut down any page service tasks.
|
|
||||||
timed(
|
|
||||||
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
|
|
||||||
"shutdown PageRequestHandlers",
|
|
||||||
Duration::from_secs(1),
|
|
||||||
)
|
|
||||||
.await;
|
|
||||||
|
|
||||||
// Shut down all the tenants. This flushes everything to disk and kills
|
// Shut down all the tenants. This flushes everything to disk and kills
|
||||||
// the checkpoint and GC tasks.
|
// the checkpoint and GC tasks.
|
||||||
timed(
|
timed(
|
||||||
@@ -78,6 +72,15 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
|
|||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
|
// Shut down any page service tasks: any in-progress work for particular timelines or tenants
|
||||||
|
// should already have been cancelled via mgr::shutdown_all_tenants
|
||||||
|
timed(
|
||||||
|
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
|
||||||
|
"shutdown PageRequestHandlers",
|
||||||
|
Duration::from_secs(1),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
|
||||||
// Best effort to persist any outstanding deletions, to avoid leaking objects
|
// Best effort to persist any outstanding deletions, to avoid leaking objects
|
||||||
if let Some(mut deletion_queue) = deletion_queue {
|
if let Some(mut deletion_queue) = deletion_queue {
|
||||||
deletion_queue.shutdown(Duration::from_secs(5)).await;
|
deletion_queue.shutdown(Duration::from_secs(5)).await;
|
||||||
|
|||||||
@@ -962,6 +962,32 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy
|
|||||||
.expect("failed to define a metric")
|
.expect("failed to define a metric")
|
||||||
});
|
});
|
||||||
|
|
||||||
|
pub(crate) struct TenantManagerMetrics {
|
||||||
|
pub(crate) tenant_slots: UIntGauge,
|
||||||
|
pub(crate) tenant_slot_writes: IntCounter,
|
||||||
|
pub(crate) unexpected_errors: IntCounter,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) static TENANT_MANAGER: Lazy<TenantManagerMetrics> = Lazy::new(|| {
|
||||||
|
TenantManagerMetrics {
|
||||||
|
tenant_slots: register_uint_gauge!(
|
||||||
|
"pageserver_tenant_manager_slots",
|
||||||
|
"How many slots currently exist, including all attached, secondary and in-progress operations",
|
||||||
|
)
|
||||||
|
.expect("failed to define a metric"),
|
||||||
|
tenant_slot_writes: register_int_counter!(
|
||||||
|
"pageserver_tenant_manager_slot_writes",
|
||||||
|
"Writes to a tenant slot, including all of create/attach/detach/delete"
|
||||||
|
)
|
||||||
|
.expect("failed to define a metric"),
|
||||||
|
unexpected_errors: register_int_counter!(
|
||||||
|
"pageserver_tenant_manager_unexpected_errors_total",
|
||||||
|
"Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug."
|
||||||
|
)
|
||||||
|
.expect("failed to define a metric"),
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
pub(crate) struct DeletionQueueMetrics {
|
pub(crate) struct DeletionQueueMetrics {
|
||||||
pub(crate) keys_submitted: IntCounter,
|
pub(crate) keys_submitted: IntCounter,
|
||||||
pub(crate) keys_dropped: IntCounter,
|
pub(crate) keys_dropped: IntCounter,
|
||||||
@@ -1199,15 +1225,6 @@ pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
|||||||
.expect("failed to define a metric")
|
.expect("failed to define a metric")
|
||||||
});
|
});
|
||||||
|
|
||||||
pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
|
||||||
register_histogram!(
|
|
||||||
"pageserver_wal_redo_wait_seconds",
|
|
||||||
"Time spent waiting for access to the Postgres WAL redo process",
|
|
||||||
redo_histogram_time_buckets!(),
|
|
||||||
)
|
|
||||||
.expect("failed to define a metric")
|
|
||||||
});
|
|
||||||
|
|
||||||
pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||||
register_histogram!(
|
register_histogram!(
|
||||||
"pageserver_wal_redo_records_histogram",
|
"pageserver_wal_redo_records_histogram",
|
||||||
@@ -1884,6 +1901,9 @@ pub fn preinitialize_metrics() {
|
|||||||
// Deletion queue stats
|
// Deletion queue stats
|
||||||
Lazy::force(&DELETION_QUEUE);
|
Lazy::force(&DELETION_QUEUE);
|
||||||
|
|
||||||
|
// Tenant manager stats
|
||||||
|
Lazy::force(&TENANT_MANAGER);
|
||||||
|
|
||||||
// countervecs
|
// countervecs
|
||||||
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
|
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
|
||||||
.into_iter()
|
.into_iter()
|
||||||
@@ -1899,7 +1919,6 @@ pub fn preinitialize_metrics() {
|
|||||||
&READ_NUM_FS_LAYERS,
|
&READ_NUM_FS_LAYERS,
|
||||||
&WAIT_LSN_TIME,
|
&WAIT_LSN_TIME,
|
||||||
&WAL_REDO_TIME,
|
&WAL_REDO_TIME,
|
||||||
&WAL_REDO_WAIT_TIME,
|
|
||||||
&WAL_REDO_RECORDS_HISTOGRAM,
|
&WAL_REDO_RECORDS_HISTOGRAM,
|
||||||
&WAL_REDO_BYTES_HISTOGRAM,
|
&WAL_REDO_BYTES_HISTOGRAM,
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ use tracing::field;
|
|||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::id::ConnectionId;
|
use utils::id::ConnectionId;
|
||||||
use utils::{
|
use utils::{
|
||||||
auth::{Claims, JwtAuth, Scope},
|
auth::{Claims, Scope, SwappableJwtAuth},
|
||||||
id::{TenantId, TimelineId},
|
id::{TenantId, TimelineId},
|
||||||
lsn::Lsn,
|
lsn::Lsn,
|
||||||
simple_rcu::RcuReadGuard,
|
simple_rcu::RcuReadGuard,
|
||||||
@@ -55,16 +55,20 @@ use crate::metrics;
|
|||||||
use crate::metrics::LIVE_CONNECTIONS_COUNT;
|
use crate::metrics::LIVE_CONNECTIONS_COUNT;
|
||||||
use crate::task_mgr;
|
use crate::task_mgr;
|
||||||
use crate::task_mgr::TaskKind;
|
use crate::task_mgr::TaskKind;
|
||||||
use crate::tenant;
|
|
||||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||||
use crate::tenant::mgr;
|
use crate::tenant::mgr;
|
||||||
use crate::tenant::mgr::GetTenantError;
|
use crate::tenant::mgr::get_active_tenant_with_timeout;
|
||||||
use crate::tenant::{Tenant, Timeline};
|
use crate::tenant::mgr::GetActiveTenantError;
|
||||||
|
use crate::tenant::Timeline;
|
||||||
use crate::trace::Tracer;
|
use crate::trace::Tracer;
|
||||||
|
|
||||||
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
||||||
use postgres_ffi::BLCKSZ;
|
use postgres_ffi::BLCKSZ;
|
||||||
|
|
||||||
|
// How long we may block waiting for a [`TenantSlot::InProgress`] and/or a [`Tenant`] which
|
||||||
|
// is not yet in state [`TenantState::Active`].
|
||||||
|
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
|
||||||
|
|
||||||
/// Read the end of a tar archive.
|
/// Read the end of a tar archive.
|
||||||
///
|
///
|
||||||
/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
|
/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
|
||||||
@@ -118,7 +122,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
|
|||||||
pub async fn libpq_listener_main(
|
pub async fn libpq_listener_main(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
broker_client: storage_broker::BrokerClientChannel,
|
broker_client: storage_broker::BrokerClientChannel,
|
||||||
auth: Option<Arc<JwtAuth>>,
|
auth: Option<Arc<SwappableJwtAuth>>,
|
||||||
listener: TcpListener,
|
listener: TcpListener,
|
||||||
auth_type: AuthType,
|
auth_type: AuthType,
|
||||||
listener_ctx: RequestContext,
|
listener_ctx: RequestContext,
|
||||||
@@ -186,7 +190,7 @@ pub async fn libpq_listener_main(
|
|||||||
async fn page_service_conn_main(
|
async fn page_service_conn_main(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
broker_client: storage_broker::BrokerClientChannel,
|
broker_client: storage_broker::BrokerClientChannel,
|
||||||
auth: Option<Arc<JwtAuth>>,
|
auth: Option<Arc<SwappableJwtAuth>>,
|
||||||
socket: tokio::net::TcpStream,
|
socket: tokio::net::TcpStream,
|
||||||
auth_type: AuthType,
|
auth_type: AuthType,
|
||||||
connection_ctx: RequestContext,
|
connection_ctx: RequestContext,
|
||||||
@@ -214,22 +218,34 @@ async fn page_service_conn_main(
|
|||||||
// no write timeout is used, because the kernel is assumed to error writes after some time.
|
// no write timeout is used, because the kernel is assumed to error writes after some time.
|
||||||
let mut socket = tokio_io_timeout::TimeoutReader::new(socket);
|
let mut socket = tokio_io_timeout::TimeoutReader::new(socket);
|
||||||
|
|
||||||
// timeout should be lower, but trying out multiple days for
|
let default_timeout_ms = 10 * 60 * 1000; // 10 minutes by default
|
||||||
// <https://github.com/neondatabase/neon/issues/4205>
|
let socket_timeout_ms = (|| {
|
||||||
socket.set_timeout(Some(std::time::Duration::from_secs(60 * 60 * 24 * 3)));
|
fail::fail_point!("simulated-bad-compute-connection", |avg_timeout_ms| {
|
||||||
|
// Exponential distribution for simulating
|
||||||
|
// poor network conditions, expect about avg_timeout_ms to be around 15
|
||||||
|
// in tests
|
||||||
|
if let Some(avg_timeout_ms) = avg_timeout_ms {
|
||||||
|
let avg = avg_timeout_ms.parse::<i64>().unwrap() as f32;
|
||||||
|
let u = rand::random::<f32>();
|
||||||
|
((1.0 - u).ln() / (-avg)) as u64
|
||||||
|
} else {
|
||||||
|
default_timeout_ms
|
||||||
|
}
|
||||||
|
});
|
||||||
|
default_timeout_ms
|
||||||
|
})();
|
||||||
|
|
||||||
|
// A timeout here does not mean the client died, it can happen if it's just idle for
|
||||||
|
// a while: we will tear down this PageServerHandler and instantiate a new one if/when
|
||||||
|
// they reconnect.
|
||||||
|
socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms)));
|
||||||
let socket = std::pin::pin!(socket);
|
let socket = std::pin::pin!(socket);
|
||||||
|
|
||||||
// XXX: pgbackend.run() should take the connection_ctx,
|
// XXX: pgbackend.run() should take the connection_ctx,
|
||||||
// and create a child per-query context when it invokes process_query.
|
// and create a child per-query context when it invokes process_query.
|
||||||
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
|
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
|
||||||
// and create the per-query context in process_query ourselves.
|
// and create the per-query context in process_query ourselves.
|
||||||
let mut conn_handler = PageServerHandler::new(
|
let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx);
|
||||||
conf,
|
|
||||||
broker_client,
|
|
||||||
auth,
|
|
||||||
connection_ctx,
|
|
||||||
task_mgr::shutdown_token(),
|
|
||||||
);
|
|
||||||
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
|
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
|
||||||
|
|
||||||
match pgbackend
|
match pgbackend
|
||||||
@@ -255,7 +271,7 @@ async fn page_service_conn_main(
|
|||||||
struct PageServerHandler {
|
struct PageServerHandler {
|
||||||
_conf: &'static PageServerConf,
|
_conf: &'static PageServerConf,
|
||||||
broker_client: storage_broker::BrokerClientChannel,
|
broker_client: storage_broker::BrokerClientChannel,
|
||||||
auth: Option<Arc<JwtAuth>>,
|
auth: Option<Arc<SwappableJwtAuth>>,
|
||||||
claims: Option<Claims>,
|
claims: Option<Claims>,
|
||||||
|
|
||||||
/// The context created for the lifetime of the connection
|
/// The context created for the lifetime of the connection
|
||||||
@@ -263,19 +279,14 @@ struct PageServerHandler {
|
|||||||
/// For each query received over the connection,
|
/// For each query received over the connection,
|
||||||
/// `process_query` creates a child context from this one.
|
/// `process_query` creates a child context from this one.
|
||||||
connection_ctx: RequestContext,
|
connection_ctx: RequestContext,
|
||||||
|
|
||||||
/// A token that should fire when the tenant transitions from
|
|
||||||
/// attached state, or when the pageserver is shutting down.
|
|
||||||
cancel: CancellationToken,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PageServerHandler {
|
impl PageServerHandler {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
broker_client: storage_broker::BrokerClientChannel,
|
broker_client: storage_broker::BrokerClientChannel,
|
||||||
auth: Option<Arc<JwtAuth>>,
|
auth: Option<Arc<SwappableJwtAuth>>,
|
||||||
connection_ctx: RequestContext,
|
connection_ctx: RequestContext,
|
||||||
cancel: CancellationToken,
|
|
||||||
) -> Self {
|
) -> Self {
|
||||||
PageServerHandler {
|
PageServerHandler {
|
||||||
_conf: conf,
|
_conf: conf,
|
||||||
@@ -283,7 +294,6 @@ impl PageServerHandler {
|
|||||||
auth,
|
auth,
|
||||||
claims: None,
|
claims: None,
|
||||||
connection_ctx,
|
connection_ctx,
|
||||||
cancel,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -291,7 +301,11 @@ impl PageServerHandler {
|
|||||||
/// this rather than naked flush() in order to shut down promptly. Without this, we would
|
/// this rather than naked flush() in order to shut down promptly. Without this, we would
|
||||||
/// block shutdown of a tenant if a postgres client was failing to consume bytes we send
|
/// block shutdown of a tenant if a postgres client was failing to consume bytes we send
|
||||||
/// in the flush.
|
/// in the flush.
|
||||||
async fn flush_cancellable<IO>(&self, pgb: &mut PostgresBackend<IO>) -> Result<(), QueryError>
|
async fn flush_cancellable<IO>(
|
||||||
|
&self,
|
||||||
|
pgb: &mut PostgresBackend<IO>,
|
||||||
|
cancel: &CancellationToken,
|
||||||
|
) -> Result<(), QueryError>
|
||||||
where
|
where
|
||||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||||
{
|
{
|
||||||
@@ -299,7 +313,7 @@ impl PageServerHandler {
|
|||||||
flush_r = pgb.flush() => {
|
flush_r = pgb.flush() => {
|
||||||
Ok(flush_r?)
|
Ok(flush_r?)
|
||||||
},
|
},
|
||||||
_ = self.cancel.cancelled() => {
|
_ = cancel.cancelled() => {
|
||||||
Err(QueryError::Shutdown)
|
Err(QueryError::Shutdown)
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
@@ -308,6 +322,7 @@ impl PageServerHandler {
|
|||||||
fn copyin_stream<'a, IO>(
|
fn copyin_stream<'a, IO>(
|
||||||
&'a self,
|
&'a self,
|
||||||
pgb: &'a mut PostgresBackend<IO>,
|
pgb: &'a mut PostgresBackend<IO>,
|
||||||
|
cancel: &'a CancellationToken,
|
||||||
) -> impl Stream<Item = io::Result<Bytes>> + 'a
|
) -> impl Stream<Item = io::Result<Bytes>> + 'a
|
||||||
where
|
where
|
||||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||||
@@ -317,7 +332,7 @@ impl PageServerHandler {
|
|||||||
let msg = tokio::select! {
|
let msg = tokio::select! {
|
||||||
biased;
|
biased;
|
||||||
|
|
||||||
_ = self.cancel.cancelled() => {
|
_ = cancel.cancelled() => {
|
||||||
// We were requested to shut down.
|
// We were requested to shut down.
|
||||||
let msg = "pageserver is shutting down";
|
let msg = "pageserver is shutting down";
|
||||||
let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
|
let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
|
||||||
@@ -357,7 +372,7 @@ impl PageServerHandler {
|
|||||||
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
|
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
|
||||||
// error can't happen here, ErrorResponse serialization should be always ok
|
// error can't happen here, ErrorResponse serialization should be always ok
|
||||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
|
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
|
||||||
self.flush_cancellable(pgb).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
||||||
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
|
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
|
||||||
}
|
}
|
||||||
Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
|
Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
|
||||||
@@ -384,12 +399,13 @@ impl PageServerHandler {
|
|||||||
{
|
{
|
||||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
|
||||||
// NOTE: pagerequests handler exits when connection is closed,
|
|
||||||
// so there is no need to reset the association
|
|
||||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
|
||||||
|
|
||||||
// Make request tracer if needed
|
// Make request tracer if needed
|
||||||
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
|
let tenant = mgr::get_active_tenant_with_timeout(
|
||||||
|
tenant_id,
|
||||||
|
ACTIVE_TENANT_TIMEOUT,
|
||||||
|
&task_mgr::shutdown_token(),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
let mut tracer = if tenant.get_trace_read_requests() {
|
let mut tracer = if tenant.get_trace_read_requests() {
|
||||||
let connection_id = ConnectionId::generate();
|
let connection_id = ConnectionId::generate();
|
||||||
let path = tenant
|
let path = tenant
|
||||||
@@ -405,9 +421,14 @@ impl PageServerHandler {
|
|||||||
.get_timeline(timeline_id, true)
|
.get_timeline(timeline_id, true)
|
||||||
.map_err(|e| anyhow::anyhow!(e))?;
|
.map_err(|e| anyhow::anyhow!(e))?;
|
||||||
|
|
||||||
|
// Avoid starting new requests if the timeline has already started shutting down,
|
||||||
|
// and block timeline shutdown until this request is complete, or drops out due
|
||||||
|
// to cancellation.
|
||||||
|
let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
|
||||||
|
|
||||||
// switch client to COPYBOTH
|
// switch client to COPYBOTH
|
||||||
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
|
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
|
||||||
self.flush_cancellable(pgb).await?;
|
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||||
|
|
||||||
let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
|
let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
|
||||||
|
|
||||||
@@ -415,7 +436,7 @@ impl PageServerHandler {
|
|||||||
let msg = tokio::select! {
|
let msg = tokio::select! {
|
||||||
biased;
|
biased;
|
||||||
|
|
||||||
_ = self.cancel.cancelled() => {
|
_ = timeline.cancel.cancelled() => {
|
||||||
// We were requested to shut down.
|
// We were requested to shut down.
|
||||||
info!("shutdown request received in page handler");
|
info!("shutdown request received in page handler");
|
||||||
return Err(QueryError::Shutdown)
|
return Err(QueryError::Shutdown)
|
||||||
@@ -490,9 +511,24 @@ impl PageServerHandler {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if let Err(e) = &response {
|
||||||
|
// Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
|
||||||
|
// because wait_lsn etc will drop out
|
||||||
|
// is_stopping(): [`Timeline::flush_and_shutdown`] has entered
|
||||||
|
// is_cancelled(): [`Timeline::shutdown`] has entered
|
||||||
|
if timeline.cancel.is_cancelled() || timeline.is_stopping() {
|
||||||
|
// If we fail to fulfil a request during shutdown, which may be _because_ of
|
||||||
|
// shutdown, then do not send the error to the client. Instead just drop the
|
||||||
|
// connection.
|
||||||
|
span.in_scope(|| info!("dropped response during shutdown: {e:#}"));
|
||||||
|
return Err(QueryError::Shutdown);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let response = response.unwrap_or_else(|e| {
|
let response = response.unwrap_or_else(|e| {
|
||||||
// print all the details to the log with {:#}, but for the client the
|
// print all the details to the log with {:#}, but for the client the
|
||||||
// error message is enough
|
// error message is enough. Do not log if shutting down, as the anyhow::Error
|
||||||
|
// here includes cancellation which is not an error.
|
||||||
span.in_scope(|| error!("error reading relation or page version: {:#}", e));
|
span.in_scope(|| error!("error reading relation or page version: {:#}", e));
|
||||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||||
message: e.to_string(),
|
message: e.to_string(),
|
||||||
@@ -500,7 +536,7 @@ impl PageServerHandler {
|
|||||||
});
|
});
|
||||||
|
|
||||||
pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
|
pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
|
||||||
self.flush_cancellable(pgb).await?;
|
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -522,10 +558,14 @@ impl PageServerHandler {
|
|||||||
{
|
{
|
||||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
|
||||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
|
||||||
// Create empty timeline
|
// Create empty timeline
|
||||||
info!("creating new timeline");
|
info!("creating new timeline");
|
||||||
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
|
let tenant = get_active_tenant_with_timeout(
|
||||||
|
tenant_id,
|
||||||
|
ACTIVE_TENANT_TIMEOUT,
|
||||||
|
&task_mgr::shutdown_token(),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
let timeline = tenant
|
let timeline = tenant
|
||||||
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
|
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
|
||||||
.await?;
|
.await?;
|
||||||
@@ -543,9 +583,9 @@ impl PageServerHandler {
|
|||||||
// Import basebackup provided via CopyData
|
// Import basebackup provided via CopyData
|
||||||
info!("importing basebackup");
|
info!("importing basebackup");
|
||||||
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
|
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
|
||||||
self.flush_cancellable(pgb).await?;
|
self.flush_cancellable(pgb, &tenant.cancel).await?;
|
||||||
|
|
||||||
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
|
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
|
||||||
timeline
|
timeline
|
||||||
.import_basebackup_from_tar(
|
.import_basebackup_from_tar(
|
||||||
&mut copyin_reader,
|
&mut copyin_reader,
|
||||||
@@ -582,9 +622,10 @@ impl PageServerHandler {
|
|||||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||||
{
|
{
|
||||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
|
||||||
|
|
||||||
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
|
let timeline = self
|
||||||
|
.get_active_tenant_timeline(tenant_id, timeline_id)
|
||||||
|
.await?;
|
||||||
let last_record_lsn = timeline.get_last_record_lsn();
|
let last_record_lsn = timeline.get_last_record_lsn();
|
||||||
if last_record_lsn != start_lsn {
|
if last_record_lsn != start_lsn {
|
||||||
return Err(QueryError::Other(
|
return Err(QueryError::Other(
|
||||||
@@ -598,8 +639,8 @@ impl PageServerHandler {
|
|||||||
// Import wal provided via CopyData
|
// Import wal provided via CopyData
|
||||||
info!("importing wal");
|
info!("importing wal");
|
||||||
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
|
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
|
||||||
self.flush_cancellable(pgb).await?;
|
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||||
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
|
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel)));
|
||||||
import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
|
import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
|
||||||
info!("wal import complete");
|
info!("wal import complete");
|
||||||
|
|
||||||
@@ -792,7 +833,9 @@ impl PageServerHandler {
|
|||||||
let started = std::time::Instant::now();
|
let started = std::time::Instant::now();
|
||||||
|
|
||||||
// check that the timeline exists
|
// check that the timeline exists
|
||||||
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
|
let timeline = self
|
||||||
|
.get_active_tenant_timeline(tenant_id, timeline_id)
|
||||||
|
.await?;
|
||||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||||
if let Some(lsn) = lsn {
|
if let Some(lsn) = lsn {
|
||||||
// Backup was requested at a particular LSN. Wait for it to arrive.
|
// Backup was requested at a particular LSN. Wait for it to arrive.
|
||||||
@@ -807,7 +850,7 @@ impl PageServerHandler {
|
|||||||
|
|
||||||
// switch client to COPYOUT
|
// switch client to COPYOUT
|
||||||
pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
|
pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
|
||||||
self.flush_cancellable(pgb).await?;
|
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||||
|
|
||||||
// Send a tarball of the latest layer on the timeline. Compress if not
|
// Send a tarball of the latest layer on the timeline. Compress if not
|
||||||
// fullbackup. TODO Compress in that case too (tests need to be updated)
|
// fullbackup. TODO Compress in that case too (tests need to be updated)
|
||||||
@@ -859,7 +902,7 @@ impl PageServerHandler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pgb.write_message_noflush(&BeMessage::CopyDone)?;
|
pgb.write_message_noflush(&BeMessage::CopyDone)?;
|
||||||
self.flush_cancellable(pgb).await?;
|
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||||
|
|
||||||
let basebackup_after = started
|
let basebackup_after = started
|
||||||
.elapsed()
|
.elapsed()
|
||||||
@@ -877,7 +920,7 @@ impl PageServerHandler {
|
|||||||
|
|
||||||
// when accessing management api supply None as an argument
|
// when accessing management api supply None as an argument
|
||||||
// when using to authorize tenant pass corresponding tenant id
|
// when using to authorize tenant pass corresponding tenant id
|
||||||
fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
|
fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<(), QueryError> {
|
||||||
if self.auth.is_none() {
|
if self.auth.is_none() {
|
||||||
// auth is set to Trust, nothing to check so just return ok
|
// auth is set to Trust, nothing to check so just return ok
|
||||||
return Ok(());
|
return Ok(());
|
||||||
@@ -889,7 +932,26 @@ impl PageServerHandler {
|
|||||||
.claims
|
.claims
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.expect("claims presence already checked");
|
.expect("claims presence already checked");
|
||||||
check_permission(claims, tenant_id)
|
check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Shorthand for getting a reference to a Timeline of an Active tenant.
|
||||||
|
async fn get_active_tenant_timeline(
|
||||||
|
&self,
|
||||||
|
tenant_id: TenantId,
|
||||||
|
timeline_id: TimelineId,
|
||||||
|
) -> Result<Arc<Timeline>, GetActiveTimelineError> {
|
||||||
|
let tenant = get_active_tenant_with_timeout(
|
||||||
|
tenant_id,
|
||||||
|
ACTIVE_TENANT_TIMEOUT,
|
||||||
|
&task_mgr::shutdown_token(),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.map_err(GetActiveTimelineError::Tenant)?;
|
||||||
|
let timeline = tenant
|
||||||
|
.get_timeline(timeline_id, true)
|
||||||
|
.map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
|
||||||
|
Ok(timeline)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -909,16 +971,17 @@ where
|
|||||||
.auth
|
.auth
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
|
.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)
|
||||||
|
.map_err(|e| QueryError::Unauthorized(e.0))?;
|
||||||
|
|
||||||
if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
|
if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
|
||||||
return Err(QueryError::Other(anyhow::anyhow!(
|
return Err(QueryError::Unauthorized(
|
||||||
"jwt token scope is Tenant, but tenant id is missing"
|
"jwt token scope is Tenant, but tenant id is missing".into(),
|
||||||
)));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
info!(
|
debug!(
|
||||||
"jwt auth succeeded for scope: {:#?} by tenant id: {:?}",
|
"jwt scope check succeeded for scope: {:#?} by tenant id: {:?}",
|
||||||
data.claims.scope, data.claims.tenant_id,
|
data.claims.scope, data.claims.tenant_id,
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -940,9 +1003,13 @@ where
|
|||||||
pgb: &mut PostgresBackend<IO>,
|
pgb: &mut PostgresBackend<IO>,
|
||||||
query_string: &str,
|
query_string: &str,
|
||||||
) -> Result<(), QueryError> {
|
) -> Result<(), QueryError> {
|
||||||
|
fail::fail_point!("simulated-bad-compute-connection", |_| {
|
||||||
|
info!("Hit failpoint for bad connection");
|
||||||
|
Err(QueryError::SimulatedConnectionError)
|
||||||
|
});
|
||||||
|
|
||||||
let ctx = self.connection_ctx.attached_child();
|
let ctx = self.connection_ctx.attached_child();
|
||||||
debug!("process query {query_string:?}");
|
debug!("process query {query_string:?}");
|
||||||
|
|
||||||
if query_string.starts_with("pagestream ") {
|
if query_string.starts_with("pagestream ") {
|
||||||
let (_, params_raw) = query_string.split_at("pagestream ".len());
|
let (_, params_raw) = query_string.split_at("pagestream ".len());
|
||||||
let params = params_raw.split(' ').collect::<Vec<_>>();
|
let params = params_raw.split(' ').collect::<Vec<_>>();
|
||||||
@@ -1048,7 +1115,9 @@ where
|
|||||||
.record("timeline_id", field::display(timeline_id));
|
.record("timeline_id", field::display(timeline_id));
|
||||||
|
|
||||||
self.check_permission(Some(tenant_id))?;
|
self.check_permission(Some(tenant_id))?;
|
||||||
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
|
let timeline = self
|
||||||
|
.get_active_tenant_timeline(tenant_id, timeline_id)
|
||||||
|
.await?;
|
||||||
|
|
||||||
let end_of_timeline = timeline.get_last_record_rlsn();
|
let end_of_timeline = timeline.get_last_record_rlsn();
|
||||||
|
|
||||||
@@ -1232,7 +1301,12 @@ where
|
|||||||
|
|
||||||
self.check_permission(Some(tenant_id))?;
|
self.check_permission(Some(tenant_id))?;
|
||||||
|
|
||||||
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
|
let tenant = get_active_tenant_with_timeout(
|
||||||
|
tenant_id,
|
||||||
|
ACTIVE_TENANT_TIMEOUT,
|
||||||
|
&task_mgr::shutdown_token(),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||||
RowDescriptor::int8_col(b"checkpoint_distance"),
|
RowDescriptor::int8_col(b"checkpoint_distance"),
|
||||||
RowDescriptor::int8_col(b"checkpoint_timeout"),
|
RowDescriptor::int8_col(b"checkpoint_timeout"),
|
||||||
@@ -1278,67 +1352,16 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(thiserror::Error, Debug)]
|
|
||||||
enum GetActiveTenantError {
|
|
||||||
#[error(
|
|
||||||
"Timed out waiting {wait_time:?} for tenant active state. Latest state: {latest_state:?}"
|
|
||||||
)]
|
|
||||||
WaitForActiveTimeout {
|
|
||||||
latest_state: TenantState,
|
|
||||||
wait_time: Duration,
|
|
||||||
},
|
|
||||||
#[error(transparent)]
|
|
||||||
NotFound(GetTenantError),
|
|
||||||
#[error(transparent)]
|
|
||||||
WaitTenantActive(tenant::WaitToBecomeActiveError),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<GetActiveTenantError> for QueryError {
|
impl From<GetActiveTenantError> for QueryError {
|
||||||
fn from(e: GetActiveTenantError) -> Self {
|
fn from(e: GetActiveTenantError) -> Self {
|
||||||
match e {
|
match e {
|
||||||
GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
|
GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
|
||||||
ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
|
ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
|
||||||
),
|
),
|
||||||
GetActiveTenantError::WaitTenantActive(e) => QueryError::Other(anyhow::Error::new(e)),
|
GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
|
||||||
GetActiveTenantError::NotFound(e) => QueryError::Other(anyhow::Error::new(e)),
|
QueryError::Shutdown
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Get active tenant.
|
|
||||||
///
|
|
||||||
/// If the tenant is Loading, waits for it to become Active, for up to 30 s. That
|
|
||||||
/// ensures that queries don't fail immediately after pageserver startup, because
|
|
||||||
/// all tenants are still loading.
|
|
||||||
async fn get_active_tenant_with_timeout(
|
|
||||||
tenant_id: TenantId,
|
|
||||||
_ctx: &RequestContext, /* require get a context to support cancellation in the future */
|
|
||||||
) -> Result<Arc<Tenant>, GetActiveTenantError> {
|
|
||||||
let tenant = match mgr::get_tenant(tenant_id, false).await {
|
|
||||||
Ok(tenant) => tenant,
|
|
||||||
Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
|
|
||||||
Err(GetTenantError::NotActive(_)) => {
|
|
||||||
unreachable!("we're calling get_tenant with active_only=false")
|
|
||||||
}
|
|
||||||
Err(GetTenantError::Broken(_)) => {
|
|
||||||
unreachable!("we're calling get_tenant with active_only=false")
|
|
||||||
}
|
|
||||||
};
|
|
||||||
let wait_time = Duration::from_secs(30);
|
|
||||||
match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {
|
|
||||||
Ok(Ok(())) => Ok(tenant),
|
|
||||||
// no .context(), the error message is good enough and some tests depend on it
|
|
||||||
Ok(Err(e)) => Err(GetActiveTenantError::WaitTenantActive(e)),
|
|
||||||
Err(_) => {
|
|
||||||
let latest_state = tenant.current_state();
|
|
||||||
if latest_state == TenantState::Active {
|
|
||||||
Ok(tenant)
|
|
||||||
} else {
|
|
||||||
Err(GetActiveTenantError::WaitForActiveTimeout {
|
|
||||||
latest_state,
|
|
||||||
wait_time,
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
e => QueryError::Other(anyhow::anyhow!(e)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1359,18 +1382,3 @@ impl From<GetActiveTimelineError> for QueryError {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Shorthand for getting a reference to a Timeline of an Active tenant.
|
|
||||||
async fn get_active_tenant_timeline(
|
|
||||||
tenant_id: TenantId,
|
|
||||||
timeline_id: TimelineId,
|
|
||||||
ctx: &RequestContext,
|
|
||||||
) -> Result<Arc<Timeline>, GetActiveTimelineError> {
|
|
||||||
let tenant = get_active_tenant_with_timeout(tenant_id, ctx)
|
|
||||||
.await
|
|
||||||
.map_err(GetActiveTimelineError::Tenant)?;
|
|
||||||
let timeline = tenant
|
|
||||||
.get_timeline(timeline_id, true)
|
|
||||||
.map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
|
|
||||||
Ok(timeline)
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -21,7 +21,6 @@ use serde::{Deserialize, Serialize};
|
|||||||
use std::collections::{hash_map, HashMap, HashSet};
|
use std::collections::{hash_map, HashMap, HashSet};
|
||||||
use std::ops::ControlFlow;
|
use std::ops::ControlFlow;
|
||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
use tokio_util::sync::CancellationToken;
|
|
||||||
use tracing::{debug, trace, warn};
|
use tracing::{debug, trace, warn};
|
||||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||||
|
|
||||||
@@ -44,6 +43,17 @@ pub enum CalculateLogicalSizeError {
|
|||||||
Other(#[from] anyhow::Error),
|
Other(#[from] anyhow::Error),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl From<PageReconstructError> for CalculateLogicalSizeError {
|
||||||
|
fn from(pre: PageReconstructError) -> Self {
|
||||||
|
match pre {
|
||||||
|
PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => {
|
||||||
|
Self::Cancelled
|
||||||
|
}
|
||||||
|
_ => Self::Other(pre.into()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, thiserror::Error)]
|
#[derive(Debug, thiserror::Error)]
|
||||||
pub enum RelationError {
|
pub enum RelationError {
|
||||||
#[error("Relation Already Exists")]
|
#[error("Relation Already Exists")]
|
||||||
@@ -567,30 +577,22 @@ impl Timeline {
|
|||||||
pub async fn get_current_logical_size_non_incremental(
|
pub async fn get_current_logical_size_non_incremental(
|
||||||
&self,
|
&self,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
cancel: CancellationToken,
|
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<u64, CalculateLogicalSizeError> {
|
) -> Result<u64, CalculateLogicalSizeError> {
|
||||||
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
|
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
|
||||||
// Fetch list of database dirs and iterate them
|
// Fetch list of database dirs and iterate them
|
||||||
let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?;
|
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
|
||||||
let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
|
let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
|
||||||
|
|
||||||
let mut total_size: u64 = 0;
|
let mut total_size: u64 = 0;
|
||||||
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
|
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
|
||||||
for rel in self
|
for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
|
||||||
.list_rels(*spcnode, *dbnode, lsn, ctx)
|
if self.cancel.is_cancelled() {
|
||||||
.await
|
|
||||||
.context("list rels")?
|
|
||||||
{
|
|
||||||
if cancel.is_cancelled() {
|
|
||||||
return Err(CalculateLogicalSizeError::Cancelled);
|
return Err(CalculateLogicalSizeError::Cancelled);
|
||||||
}
|
}
|
||||||
let relsize_key = rel_size_to_key(rel);
|
let relsize_key = rel_size_to_key(rel);
|
||||||
let mut buf = self
|
let mut buf = self.get(relsize_key, lsn, ctx).await?;
|
||||||
.get(relsize_key, lsn, ctx)
|
|
||||||
.await
|
|
||||||
.with_context(|| format!("read relation size of {rel:?}"))?;
|
|
||||||
let relsize = buf.get_u32_le();
|
let relsize = buf.get_u32_le();
|
||||||
|
|
||||||
total_size += relsize as u64;
|
total_size += relsize as u64;
|
||||||
|
|||||||
@@ -299,10 +299,6 @@ pub enum TaskKind {
|
|||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
struct MutableTaskState {
|
struct MutableTaskState {
|
||||||
/// Tenant and timeline that this task is associated with.
|
|
||||||
tenant_id: Option<TenantId>,
|
|
||||||
timeline_id: Option<TimelineId>,
|
|
||||||
|
|
||||||
/// Handle for waiting for the task to exit. It can be None, if the
|
/// Handle for waiting for the task to exit. It can be None, if the
|
||||||
/// the task has already exited.
|
/// the task has already exited.
|
||||||
join_handle: Option<JoinHandle<()>>,
|
join_handle: Option<JoinHandle<()>>,
|
||||||
@@ -319,6 +315,11 @@ struct PageServerTask {
|
|||||||
// To request task shutdown, just cancel this token.
|
// To request task shutdown, just cancel this token.
|
||||||
cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
|
|
||||||
|
/// Tasks may optionally be launched for a particular tenant/timeline, enabling
|
||||||
|
/// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
|
||||||
|
tenant_id: Option<TenantId>,
|
||||||
|
timeline_id: Option<TimelineId>,
|
||||||
|
|
||||||
mutable: Mutex<MutableTaskState>,
|
mutable: Mutex<MutableTaskState>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -344,11 +345,9 @@ where
|
|||||||
kind,
|
kind,
|
||||||
name: name.to_string(),
|
name: name.to_string(),
|
||||||
cancel: cancel.clone(),
|
cancel: cancel.clone(),
|
||||||
mutable: Mutex::new(MutableTaskState {
|
tenant_id,
|
||||||
tenant_id,
|
timeline_id,
|
||||||
timeline_id,
|
mutable: Mutex::new(MutableTaskState { join_handle: None }),
|
||||||
join_handle: None,
|
|
||||||
}),
|
|
||||||
});
|
});
|
||||||
|
|
||||||
TASKS.lock().unwrap().insert(task_id, Arc::clone(&task));
|
TASKS.lock().unwrap().insert(task_id, Arc::clone(&task));
|
||||||
@@ -418,8 +417,6 @@ async fn task_finish(
|
|||||||
|
|
||||||
let mut shutdown_process = false;
|
let mut shutdown_process = false;
|
||||||
{
|
{
|
||||||
let task_mut = task.mutable.lock().unwrap();
|
|
||||||
|
|
||||||
match result {
|
match result {
|
||||||
Ok(Ok(())) => {
|
Ok(Ok(())) => {
|
||||||
debug!("Task '{}' exited normally", task_name);
|
debug!("Task '{}' exited normally", task_name);
|
||||||
@@ -428,13 +425,13 @@ async fn task_finish(
|
|||||||
if shutdown_process_on_error {
|
if shutdown_process_on_error {
|
||||||
error!(
|
error!(
|
||||||
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
||||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
task_name, task.tenant_id, task.timeline_id, err
|
||||||
);
|
);
|
||||||
shutdown_process = true;
|
shutdown_process = true;
|
||||||
} else {
|
} else {
|
||||||
error!(
|
error!(
|
||||||
"Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
"Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
||||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
task_name, task.tenant_id, task.timeline_id, err
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -442,13 +439,13 @@ async fn task_finish(
|
|||||||
if shutdown_process_on_error {
|
if shutdown_process_on_error {
|
||||||
error!(
|
error!(
|
||||||
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
||||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
task_name, task.tenant_id, task.timeline_id, err
|
||||||
);
|
);
|
||||||
shutdown_process = true;
|
shutdown_process = true;
|
||||||
} else {
|
} else {
|
||||||
error!(
|
error!(
|
||||||
"Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
"Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
||||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
task_name, task.tenant_id, task.timeline_id, err
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -460,17 +457,6 @@ async fn task_finish(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// expected to be called from the task of the given id.
|
|
||||||
pub fn associate_with(tenant_id: Option<TenantId>, timeline_id: Option<TimelineId>) {
|
|
||||||
CURRENT_TASK.with(|ct| {
|
|
||||||
let mut task_mut = ct.mutable.lock().unwrap();
|
|
||||||
task_mut.tenant_id = tenant_id;
|
|
||||||
task_mut.timeline_id = timeline_id;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Is there a task running that matches the criteria
|
|
||||||
|
|
||||||
/// Signal and wait for tasks to shut down.
|
/// Signal and wait for tasks to shut down.
|
||||||
///
|
///
|
||||||
///
|
///
|
||||||
@@ -493,17 +479,16 @@ pub async fn shutdown_tasks(
|
|||||||
{
|
{
|
||||||
let tasks = TASKS.lock().unwrap();
|
let tasks = TASKS.lock().unwrap();
|
||||||
for task in tasks.values() {
|
for task in tasks.values() {
|
||||||
let task_mut = task.mutable.lock().unwrap();
|
|
||||||
if (kind.is_none() || Some(task.kind) == kind)
|
if (kind.is_none() || Some(task.kind) == kind)
|
||||||
&& (tenant_id.is_none() || task_mut.tenant_id == tenant_id)
|
&& (tenant_id.is_none() || task.tenant_id == tenant_id)
|
||||||
&& (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
|
&& (timeline_id.is_none() || task.timeline_id == timeline_id)
|
||||||
{
|
{
|
||||||
task.cancel.cancel();
|
task.cancel.cancel();
|
||||||
victim_tasks.push((
|
victim_tasks.push((
|
||||||
Arc::clone(task),
|
Arc::clone(task),
|
||||||
task.kind,
|
task.kind,
|
||||||
task_mut.tenant_id,
|
task.tenant_id,
|
||||||
task_mut.timeline_id,
|
task.timeline_id,
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ use tracing::*;
|
|||||||
use utils::completion;
|
use utils::completion;
|
||||||
use utils::crashsafe::path_with_suffix_extension;
|
use utils::crashsafe::path_with_suffix_extension;
|
||||||
use utils::fs_ext;
|
use utils::fs_ext;
|
||||||
|
use utils::sync::gate::Gate;
|
||||||
|
|
||||||
use std::cmp::min;
|
use std::cmp::min;
|
||||||
use std::collections::hash_map::Entry;
|
use std::collections::hash_map::Entry;
|
||||||
@@ -54,6 +55,8 @@ use self::config::TenantConf;
|
|||||||
use self::delete::DeleteTenantFlow;
|
use self::delete::DeleteTenantFlow;
|
||||||
use self::metadata::LoadMetadataError;
|
use self::metadata::LoadMetadataError;
|
||||||
use self::metadata::TimelineMetadata;
|
use self::metadata::TimelineMetadata;
|
||||||
|
use self::mgr::GetActiveTenantError;
|
||||||
|
use self::mgr::GetTenantError;
|
||||||
use self::mgr::TenantsMap;
|
use self::mgr::TenantsMap;
|
||||||
use self::remote_timeline_client::RemoteTimelineClient;
|
use self::remote_timeline_client::RemoteTimelineClient;
|
||||||
use self::timeline::uninit::TimelineUninitMark;
|
use self::timeline::uninit::TimelineUninitMark;
|
||||||
@@ -252,6 +255,20 @@ pub struct Tenant {
|
|||||||
eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
|
eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
|
||||||
|
|
||||||
pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
|
pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
|
||||||
|
|
||||||
|
// Cancellation token fires when we have entered shutdown(). This is a parent of
|
||||||
|
// Timelines' cancellation token.
|
||||||
|
pub(crate) cancel: CancellationToken,
|
||||||
|
|
||||||
|
// Users of the Tenant such as the page service must take this Gate to avoid
|
||||||
|
// trying to use a Tenant which is shutting down.
|
||||||
|
pub(crate) gate: Gate,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Debug for Tenant {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(f, "{} ({})", self.tenant_id, self.current_state())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) enum WalRedoManager {
|
pub(crate) enum WalRedoManager {
|
||||||
@@ -359,34 +376,6 @@ impl Debug for SetStoppingError {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, thiserror::Error)]
|
|
||||||
pub(crate) enum WaitToBecomeActiveError {
|
|
||||||
WillNotBecomeActive {
|
|
||||||
tenant_id: TenantId,
|
|
||||||
state: TenantState,
|
|
||||||
},
|
|
||||||
TenantDropped {
|
|
||||||
tenant_id: TenantId,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
impl std::fmt::Display for WaitToBecomeActiveError {
|
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
||||||
match self {
|
|
||||||
WaitToBecomeActiveError::WillNotBecomeActive { tenant_id, state } => {
|
|
||||||
write!(
|
|
||||||
f,
|
|
||||||
"Tenant {} will not become active. Current state: {:?}",
|
|
||||||
tenant_id, state
|
|
||||||
)
|
|
||||||
}
|
|
||||||
WaitToBecomeActiveError::TenantDropped { tenant_id } => {
|
|
||||||
write!(f, "Tenant {tenant_id} will not become active (dropped)")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(thiserror::Error, Debug)]
|
#[derive(thiserror::Error, Debug)]
|
||||||
pub enum CreateTimelineError {
|
pub enum CreateTimelineError {
|
||||||
#[error("a timeline with the given ID already exists")]
|
#[error("a timeline with the given ID already exists")]
|
||||||
@@ -395,6 +384,8 @@ pub enum CreateTimelineError {
|
|||||||
AncestorLsn(anyhow::Error),
|
AncestorLsn(anyhow::Error),
|
||||||
#[error("ancestor timeline is not active")]
|
#[error("ancestor timeline is not active")]
|
||||||
AncestorNotActive,
|
AncestorNotActive,
|
||||||
|
#[error("tenant shutting down")]
|
||||||
|
ShuttingDown,
|
||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
Other(#[from] anyhow::Error),
|
Other(#[from] anyhow::Error),
|
||||||
}
|
}
|
||||||
@@ -526,7 +517,7 @@ impl Tenant {
|
|||||||
resources: TenantSharedResources,
|
resources: TenantSharedResources,
|
||||||
attached_conf: AttachedTenantConf,
|
attached_conf: AttachedTenantConf,
|
||||||
init_order: Option<InitializationOrder>,
|
init_order: Option<InitializationOrder>,
|
||||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||||
mode: SpawnMode,
|
mode: SpawnMode,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> anyhow::Result<Arc<Tenant>> {
|
) -> anyhow::Result<Arc<Tenant>> {
|
||||||
@@ -1524,6 +1515,11 @@ impl Tenant {
|
|||||||
)));
|
)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let _gate = self
|
||||||
|
.gate
|
||||||
|
.enter()
|
||||||
|
.map_err(|_| CreateTimelineError::ShuttingDown)?;
|
||||||
|
|
||||||
if let Ok(existing) = self.get_timeline(new_timeline_id, false) {
|
if let Ok(existing) = self.get_timeline(new_timeline_id, false) {
|
||||||
debug!("timeline {new_timeline_id} already exists");
|
debug!("timeline {new_timeline_id} already exists");
|
||||||
|
|
||||||
@@ -1808,6 +1804,7 @@ impl Tenant {
|
|||||||
freeze_and_flush: bool,
|
freeze_and_flush: bool,
|
||||||
) -> Result<(), completion::Barrier> {
|
) -> Result<(), completion::Barrier> {
|
||||||
span::debug_assert_current_span_has_tenant_id();
|
span::debug_assert_current_span_has_tenant_id();
|
||||||
|
|
||||||
// Set tenant (and its timlines) to Stoppping state.
|
// Set tenant (and its timlines) to Stoppping state.
|
||||||
//
|
//
|
||||||
// Since we can only transition into Stopping state after activation is complete,
|
// Since we can only transition into Stopping state after activation is complete,
|
||||||
@@ -1833,6 +1830,7 @@ impl Tenant {
|
|||||||
}
|
}
|
||||||
Err(SetStoppingError::AlreadyStopping(other)) => {
|
Err(SetStoppingError::AlreadyStopping(other)) => {
|
||||||
// give caller the option to wait for this this shutdown
|
// give caller the option to wait for this this shutdown
|
||||||
|
info!("Tenant::shutdown: AlreadyStopping");
|
||||||
return Err(other);
|
return Err(other);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -1843,9 +1841,16 @@ impl Tenant {
|
|||||||
timelines.values().for_each(|timeline| {
|
timelines.values().for_each(|timeline| {
|
||||||
let timeline = Arc::clone(timeline);
|
let timeline = Arc::clone(timeline);
|
||||||
let span = Span::current();
|
let span = Span::current();
|
||||||
js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await });
|
js.spawn(async move {
|
||||||
|
if freeze_and_flush {
|
||||||
|
timeline.flush_and_shutdown().instrument(span).await
|
||||||
|
} else {
|
||||||
|
timeline.shutdown().instrument(span).await
|
||||||
|
}
|
||||||
|
});
|
||||||
})
|
})
|
||||||
};
|
};
|
||||||
|
tracing::info!("Waiting for timelines...");
|
||||||
while let Some(res) = js.join_next().await {
|
while let Some(res) = js.join_next().await {
|
||||||
match res {
|
match res {
|
||||||
Ok(()) => {}
|
Ok(()) => {}
|
||||||
@@ -1855,12 +1860,21 @@ impl Tenant {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// We cancel the Tenant's cancellation token _after_ the timelines have all shut down. This permits
|
||||||
|
// them to continue to do work during their shutdown methods, e.g. flushing data.
|
||||||
|
tracing::debug!("Cancelling CancellationToken");
|
||||||
|
self.cancel.cancel();
|
||||||
|
|
||||||
// shutdown all tenant and timeline tasks: gc, compaction, page service
|
// shutdown all tenant and timeline tasks: gc, compaction, page service
|
||||||
// No new tasks will be started for this tenant because it's in `Stopping` state.
|
// No new tasks will be started for this tenant because it's in `Stopping` state.
|
||||||
//
|
//
|
||||||
// this will additionally shutdown and await all timeline tasks.
|
// this will additionally shutdown and await all timeline tasks.
|
||||||
|
tracing::debug!("Waiting for tasks...");
|
||||||
task_mgr::shutdown_tasks(None, Some(self.tenant_id), None).await;
|
task_mgr::shutdown_tasks(None, Some(self.tenant_id), None).await;
|
||||||
|
|
||||||
|
// Wait for any in-flight operations to complete
|
||||||
|
self.gate.close().await;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2021,7 +2035,7 @@ impl Tenant {
|
|||||||
self.state.subscribe()
|
self.state.subscribe()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) async fn wait_to_become_active(&self) -> Result<(), WaitToBecomeActiveError> {
|
pub(crate) async fn wait_to_become_active(&self) -> Result<(), GetActiveTenantError> {
|
||||||
let mut receiver = self.state.subscribe();
|
let mut receiver = self.state.subscribe();
|
||||||
loop {
|
loop {
|
||||||
let current_state = receiver.borrow_and_update().clone();
|
let current_state = receiver.borrow_and_update().clone();
|
||||||
@@ -2029,11 +2043,9 @@ impl Tenant {
|
|||||||
TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => {
|
TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => {
|
||||||
// in these states, there's a chance that we can reach ::Active
|
// in these states, there's a chance that we can reach ::Active
|
||||||
receiver.changed().await.map_err(
|
receiver.changed().await.map_err(
|
||||||
|_e: tokio::sync::watch::error::RecvError| {
|
|_e: tokio::sync::watch::error::RecvError|
|
||||||
WaitToBecomeActiveError::TenantDropped {
|
// Tenant existed but was dropped: report it as non-existent
|
||||||
tenant_id: self.tenant_id,
|
GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_id))
|
||||||
}
|
|
||||||
},
|
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
TenantState::Active { .. } => {
|
TenantState::Active { .. } => {
|
||||||
@@ -2041,10 +2053,7 @@ impl Tenant {
|
|||||||
}
|
}
|
||||||
TenantState::Broken { .. } | TenantState::Stopping { .. } => {
|
TenantState::Broken { .. } | TenantState::Stopping { .. } => {
|
||||||
// There's no chance the tenant can transition back into ::Active
|
// There's no chance the tenant can transition back into ::Active
|
||||||
return Err(WaitToBecomeActiveError::WillNotBecomeActive {
|
return Err(GetActiveTenantError::WillNotBecomeActive(current_state));
|
||||||
tenant_id: self.tenant_id,
|
|
||||||
state: current_state,
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -2110,6 +2119,9 @@ where
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Tenant {
|
impl Tenant {
|
||||||
|
pub fn get_tenant_id(&self) -> TenantId {
|
||||||
|
self.tenant_id
|
||||||
|
}
|
||||||
pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
|
pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
|
||||||
self.tenant_conf.read().unwrap().tenant_conf
|
self.tenant_conf.read().unwrap().tenant_conf
|
||||||
}
|
}
|
||||||
@@ -2267,6 +2279,7 @@ impl Tenant {
|
|||||||
initial_logical_size_can_start.cloned(),
|
initial_logical_size_can_start.cloned(),
|
||||||
initial_logical_size_attempt.cloned().flatten(),
|
initial_logical_size_attempt.cloned().flatten(),
|
||||||
state,
|
state,
|
||||||
|
self.cancel.child_token(),
|
||||||
);
|
);
|
||||||
|
|
||||||
Ok(timeline)
|
Ok(timeline)
|
||||||
@@ -2356,6 +2369,8 @@ impl Tenant {
|
|||||||
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
|
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
|
||||||
eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
|
eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
|
||||||
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
|
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
|
||||||
|
cancel: CancellationToken::default(),
|
||||||
|
gate: Gate::new(format!("Tenant<{tenant_id}>")),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3519,10 +3534,6 @@ pub(crate) mod harness {
|
|||||||
let remote_fs_dir = conf.workdir.join("localfs");
|
let remote_fs_dir = conf.workdir.join("localfs");
|
||||||
std::fs::create_dir_all(&remote_fs_dir).unwrap();
|
std::fs::create_dir_all(&remote_fs_dir).unwrap();
|
||||||
let config = RemoteStorageConfig {
|
let config = RemoteStorageConfig {
|
||||||
// TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
|
|
||||||
max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
|
|
||||||
// TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
|
|
||||||
max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
|
|
||||||
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
|
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
|
||||||
};
|
};
|
||||||
let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
|
let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
|
||||||
@@ -3692,7 +3703,7 @@ mod tests {
|
|||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
|
|
||||||
static TEST_KEY: Lazy<Key> =
|
static TEST_KEY: Lazy<Key> =
|
||||||
Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001")));
|
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_basic() -> anyhow::Result<()> {
|
async fn test_basic() -> anyhow::Result<()> {
|
||||||
@@ -3788,9 +3799,9 @@ mod tests {
|
|||||||
let writer = tline.writer().await;
|
let writer = tline.writer().await;
|
||||||
|
|
||||||
#[allow(non_snake_case)]
|
#[allow(non_snake_case)]
|
||||||
let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap();
|
let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap();
|
||||||
#[allow(non_snake_case)]
|
#[allow(non_snake_case)]
|
||||||
let TEST_KEY_B: Key = Key::from_hex("112222222233333333444444445500000002").unwrap();
|
let TEST_KEY_B: Key = Key::from_hex("110000000033333333444444445500000002").unwrap();
|
||||||
|
|
||||||
// Insert a value on the timeline
|
// Insert a value on the timeline
|
||||||
writer
|
writer
|
||||||
@@ -4236,11 +4247,7 @@ mod tests {
|
|||||||
metadata_bytes[8] ^= 1;
|
metadata_bytes[8] ^= 1;
|
||||||
std::fs::write(metadata_path, metadata_bytes)?;
|
std::fs::write(metadata_path, metadata_bytes)?;
|
||||||
|
|
||||||
let err = harness
|
let err = harness.try_load_local(&ctx).await.expect_err("should fail");
|
||||||
.try_load_local(&ctx)
|
|
||||||
.await
|
|
||||||
.err()
|
|
||||||
.expect("should fail");
|
|
||||||
// get all the stack with all .context, not only the last one
|
// get all the stack with all .context, not only the last one
|
||||||
let message = format!("{err:#}");
|
let message = format!("{err:#}");
|
||||||
let expected = "failed to load metadata";
|
let expected = "failed to load metadata";
|
||||||
@@ -4374,7 +4381,7 @@ mod tests {
|
|||||||
|
|
||||||
let mut keyspace = KeySpaceAccum::new();
|
let mut keyspace = KeySpaceAccum::new();
|
||||||
|
|
||||||
let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap();
|
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||||
let mut blknum = 0;
|
let mut blknum = 0;
|
||||||
for _ in 0..50 {
|
for _ in 0..50 {
|
||||||
for _ in 0..10000 {
|
for _ in 0..10000 {
|
||||||
@@ -4420,7 +4427,7 @@ mod tests {
|
|||||||
|
|
||||||
const NUM_KEYS: usize = 1000;
|
const NUM_KEYS: usize = 1000;
|
||||||
|
|
||||||
let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap();
|
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||||
|
|
||||||
let mut keyspace = KeySpaceAccum::new();
|
let mut keyspace = KeySpaceAccum::new();
|
||||||
|
|
||||||
@@ -4501,7 +4508,7 @@ mod tests {
|
|||||||
|
|
||||||
const NUM_KEYS: usize = 1000;
|
const NUM_KEYS: usize = 1000;
|
||||||
|
|
||||||
let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap();
|
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||||
|
|
||||||
let mut keyspace = KeySpaceAccum::new();
|
let mut keyspace = KeySpaceAccum::new();
|
||||||
|
|
||||||
@@ -4592,7 +4599,7 @@ mod tests {
|
|||||||
const NUM_KEYS: usize = 100;
|
const NUM_KEYS: usize = 100;
|
||||||
const NUM_TLINES: usize = 50;
|
const NUM_TLINES: usize = 50;
|
||||||
|
|
||||||
let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap();
|
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||||
// Track page mutation lsns across different timelines.
|
// Track page mutation lsns across different timelines.
|
||||||
let mut updated = [[Lsn(0); NUM_KEYS]; NUM_TLINES];
|
let mut updated = [[Lsn(0); NUM_KEYS]; NUM_TLINES];
|
||||||
|
|
||||||
@@ -4646,74 +4653,6 @@ mod tests {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
|
||||||
async fn test_write_at_initdb_lsn_takes_optimization_code_path() -> anyhow::Result<()> {
|
|
||||||
let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable")?
|
|
||||||
.load()
|
|
||||||
.await;
|
|
||||||
|
|
||||||
let initdb_lsn = Lsn(0x20);
|
|
||||||
let utline = tenant
|
|
||||||
.create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx)
|
|
||||||
.await?;
|
|
||||||
let tline = utline.raw_timeline().unwrap();
|
|
||||||
|
|
||||||
// Spawn flush loop now so that we can set the `expect_initdb_optimization`
|
|
||||||
tline.maybe_spawn_flush_loop();
|
|
||||||
|
|
||||||
// Make sure the timeline has the minimum set of required keys for operation.
|
|
||||||
// The only operation you can always do on an empty timeline is to `put` new data.
|
|
||||||
// Except if you `put` at `initdb_lsn`.
|
|
||||||
// In that case, there's an optimization to directly create image layers instead of delta layers.
|
|
||||||
// It uses `repartition()`, which assumes some keys to be present.
|
|
||||||
// Let's make sure the test timeline can handle that case.
|
|
||||||
{
|
|
||||||
let mut state = tline.flush_loop_state.lock().unwrap();
|
|
||||||
assert_eq!(
|
|
||||||
timeline::FlushLoopState::Running {
|
|
||||||
expect_initdb_optimization: false,
|
|
||||||
initdb_optimization_count: 0,
|
|
||||||
},
|
|
||||||
*state
|
|
||||||
);
|
|
||||||
*state = timeline::FlushLoopState::Running {
|
|
||||||
expect_initdb_optimization: true,
|
|
||||||
initdb_optimization_count: 0,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
// Make writes at the initdb_lsn. When we flush it below, it should be handled by the optimization.
|
|
||||||
// As explained above, the optimization requires some keys to be present.
|
|
||||||
// As per `create_empty_timeline` documentation, use init_empty to set them.
|
|
||||||
// This is what `create_test_timeline` does, by the way.
|
|
||||||
let mut modification = tline.begin_modification(initdb_lsn);
|
|
||||||
modification
|
|
||||||
.init_empty_test_timeline()
|
|
||||||
.context("init_empty_test_timeline")?;
|
|
||||||
modification
|
|
||||||
.commit(&ctx)
|
|
||||||
.await
|
|
||||||
.context("commit init_empty_test_timeline modification")?;
|
|
||||||
|
|
||||||
// Do the flush. The flush code will check the expectations that we set above.
|
|
||||||
tline.freeze_and_flush().await?;
|
|
||||||
|
|
||||||
// assert freeze_and_flush exercised the initdb optimization
|
|
||||||
{
|
|
||||||
let state = tline.flush_loop_state.lock().unwrap();
|
|
||||||
let timeline::FlushLoopState::Running {
|
|
||||||
expect_initdb_optimization,
|
|
||||||
initdb_optimization_count,
|
|
||||||
} = *state
|
|
||||||
else {
|
|
||||||
panic!("unexpected state: {:?}", *state);
|
|
||||||
};
|
|
||||||
assert!(expect_initdb_optimization);
|
|
||||||
assert!(initdb_optimization_count > 0);
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_uninit_mark_crash() -> anyhow::Result<()> {
|
async fn test_uninit_mark_crash() -> anyhow::Result<()> {
|
||||||
let name = "test_uninit_mark_crash";
|
let name = "test_uninit_mark_crash";
|
||||||
@@ -4726,7 +4665,7 @@ mod tests {
|
|||||||
// Keeps uninit mark in place
|
// Keeps uninit mark in place
|
||||||
let raw_tline = tline.raw_timeline().unwrap();
|
let raw_tline = tline.raw_timeline().unwrap();
|
||||||
raw_tline
|
raw_tline
|
||||||
.shutdown(false)
|
.shutdown()
|
||||||
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_id))
|
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_id))
|
||||||
.await;
|
.await;
|
||||||
std::mem::forget(tline);
|
std::mem::forget(tline);
|
||||||
|
|||||||
@@ -327,7 +327,7 @@ mod tests {
|
|||||||
let mut sz: u16 = rng.gen();
|
let mut sz: u16 = rng.gen();
|
||||||
// Make 50% of the arrays small
|
// Make 50% of the arrays small
|
||||||
if rng.gen() {
|
if rng.gen() {
|
||||||
sz |= 63;
|
sz &= 63;
|
||||||
}
|
}
|
||||||
random_array(sz.into())
|
random_array(sz.into())
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ use crate::{
|
|||||||
};
|
};
|
||||||
|
|
||||||
use super::{
|
use super::{
|
||||||
mgr::{GetTenantError, TenantsMap},
|
mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap},
|
||||||
remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
|
remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
|
||||||
span,
|
span,
|
||||||
timeline::delete::DeleteTimelineFlow,
|
timeline::delete::DeleteTimelineFlow,
|
||||||
@@ -33,12 +33,21 @@ pub(crate) enum DeleteTenantError {
|
|||||||
#[error("GetTenant {0}")]
|
#[error("GetTenant {0}")]
|
||||||
Get(#[from] GetTenantError),
|
Get(#[from] GetTenantError),
|
||||||
|
|
||||||
|
#[error("Tenant not attached")]
|
||||||
|
NotAttached,
|
||||||
|
|
||||||
#[error("Invalid state {0}. Expected Active or Broken")]
|
#[error("Invalid state {0}. Expected Active or Broken")]
|
||||||
InvalidState(TenantState),
|
InvalidState(TenantState),
|
||||||
|
|
||||||
#[error("Tenant deletion is already in progress")]
|
#[error("Tenant deletion is already in progress")]
|
||||||
AlreadyInProgress,
|
AlreadyInProgress,
|
||||||
|
|
||||||
|
#[error("Tenant map slot error {0}")]
|
||||||
|
SlotError(#[from] TenantSlotError),
|
||||||
|
|
||||||
|
#[error("Tenant map slot upsert error {0}")]
|
||||||
|
SlotUpsertError(#[from] TenantSlotUpsertError),
|
||||||
|
|
||||||
#[error("Timeline {0}")]
|
#[error("Timeline {0}")]
|
||||||
Timeline(#[from] DeleteTimelineError),
|
Timeline(#[from] DeleteTimelineError),
|
||||||
|
|
||||||
@@ -273,12 +282,12 @@ impl DeleteTenantFlow {
|
|||||||
pub(crate) async fn run(
|
pub(crate) async fn run(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
remote_storage: Option<GenericRemoteStorage>,
|
remote_storage: Option<GenericRemoteStorage>,
|
||||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||||
tenant_id: TenantId,
|
tenant: Arc<Tenant>,
|
||||||
) -> Result<(), DeleteTenantError> {
|
) -> Result<(), DeleteTenantError> {
|
||||||
span::debug_assert_current_span_has_tenant_id();
|
span::debug_assert_current_span_has_tenant_id();
|
||||||
|
|
||||||
let (tenant, mut guard) = Self::prepare(tenants, tenant_id).await?;
|
let mut guard = Self::prepare(&tenant).await?;
|
||||||
|
|
||||||
if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
|
if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
|
||||||
tenant.set_broken(format!("{e:#}")).await;
|
tenant.set_broken(format!("{e:#}")).await;
|
||||||
@@ -378,7 +387,7 @@ impl DeleteTenantFlow {
|
|||||||
guard: DeletionGuard,
|
guard: DeletionGuard,
|
||||||
tenant: &Arc<Tenant>,
|
tenant: &Arc<Tenant>,
|
||||||
preload: Option<TenantPreload>,
|
preload: Option<TenantPreload>,
|
||||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||||
init_order: Option<InitializationOrder>,
|
init_order: Option<InitializationOrder>,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<(), DeleteTenantError> {
|
) -> Result<(), DeleteTenantError> {
|
||||||
@@ -405,15 +414,8 @@ impl DeleteTenantFlow {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async fn prepare(
|
async fn prepare(
|
||||||
tenants: &tokio::sync::RwLock<TenantsMap>,
|
tenant: &Arc<Tenant>,
|
||||||
tenant_id: TenantId,
|
) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
|
||||||
) -> Result<(Arc<Tenant>, tokio::sync::OwnedMutexGuard<Self>), DeleteTenantError> {
|
|
||||||
let m = tenants.read().await;
|
|
||||||
|
|
||||||
let tenant = m
|
|
||||||
.get(&tenant_id)
|
|
||||||
.ok_or(GetTenantError::NotFound(tenant_id))?;
|
|
||||||
|
|
||||||
// FIXME: unsure about active only. Our init jobs may not be cancellable properly,
|
// FIXME: unsure about active only. Our init jobs may not be cancellable properly,
|
||||||
// so at least for now allow deletions only for active tenants. TODO recheck
|
// so at least for now allow deletions only for active tenants. TODO recheck
|
||||||
// Broken and Stopping is needed for retries.
|
// Broken and Stopping is needed for retries.
|
||||||
@@ -447,14 +449,14 @@ impl DeleteTenantFlow {
|
|||||||
)));
|
)));
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok((Arc::clone(tenant), guard))
|
Ok(guard)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn schedule_background(
|
fn schedule_background(
|
||||||
guard: OwnedMutexGuard<Self>,
|
guard: OwnedMutexGuard<Self>,
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
remote_storage: Option<GenericRemoteStorage>,
|
remote_storage: Option<GenericRemoteStorage>,
|
||||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||||
tenant: Arc<Tenant>,
|
tenant: Arc<Tenant>,
|
||||||
) {
|
) {
|
||||||
let tenant_id = tenant.tenant_id;
|
let tenant_id = tenant.tenant_id;
|
||||||
@@ -487,7 +489,7 @@ impl DeleteTenantFlow {
|
|||||||
mut guard: OwnedMutexGuard<Self>,
|
mut guard: OwnedMutexGuard<Self>,
|
||||||
conf: &PageServerConf,
|
conf: &PageServerConf,
|
||||||
remote_storage: Option<GenericRemoteStorage>,
|
remote_storage: Option<GenericRemoteStorage>,
|
||||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||||
tenant: &Arc<Tenant>,
|
tenant: &Arc<Tenant>,
|
||||||
) -> Result<(), DeleteTenantError> {
|
) -> Result<(), DeleteTenantError> {
|
||||||
// Tree sort timelines, schedule delete for them. Mention retries from the console side.
|
// Tree sort timelines, schedule delete for them. Mention retries from the console side.
|
||||||
@@ -535,10 +537,18 @@ impl DeleteTenantFlow {
|
|||||||
.await
|
.await
|
||||||
.context("cleanup_remaining_fs_traces")?;
|
.context("cleanup_remaining_fs_traces")?;
|
||||||
|
|
||||||
let mut locked = tenants.write().await;
|
{
|
||||||
if locked.remove(&tenant.tenant_id).is_none() {
|
let mut locked = tenants.write().unwrap();
|
||||||
warn!("Tenant got removed from tenants map during deletion");
|
if locked.remove(&tenant.tenant_id).is_none() {
|
||||||
};
|
warn!("Tenant got removed from tenants map during deletion");
|
||||||
|
};
|
||||||
|
|
||||||
|
// FIXME: we should not be modifying this from outside of mgr.rs.
|
||||||
|
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
|
||||||
|
crate::metrics::TENANT_MANAGER
|
||||||
|
.tenant_slots
|
||||||
|
.set(locked.len() as u64);
|
||||||
|
}
|
||||||
|
|
||||||
*guard = Self::Finished;
|
*guard = Self::Finished;
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -6,7 +6,6 @@ use std::sync::Arc;
|
|||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
use tokio::sync::oneshot::error::RecvError;
|
use tokio::sync::oneshot::error::RecvError;
|
||||||
use tokio::sync::Semaphore;
|
use tokio::sync::Semaphore;
|
||||||
use tokio_util::sync::CancellationToken;
|
|
||||||
|
|
||||||
use crate::context::RequestContext;
|
use crate::context::RequestContext;
|
||||||
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
|
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
|
||||||
@@ -350,10 +349,6 @@ async fn fill_logical_sizes(
|
|||||||
// our advantage with `?` error handling.
|
// our advantage with `?` error handling.
|
||||||
let mut joinset = tokio::task::JoinSet::new();
|
let mut joinset = tokio::task::JoinSet::new();
|
||||||
|
|
||||||
let cancel = tokio_util::sync::CancellationToken::new();
|
|
||||||
// be sure to cancel all spawned tasks if we are dropped
|
|
||||||
let _dg = cancel.clone().drop_guard();
|
|
||||||
|
|
||||||
// For each point that would benefit from having a logical size available,
|
// For each point that would benefit from having a logical size available,
|
||||||
// spawn a Task to fetch it, unless we have it cached already.
|
// spawn a Task to fetch it, unless we have it cached already.
|
||||||
for seg in segments.iter() {
|
for seg in segments.iter() {
|
||||||
@@ -371,15 +366,8 @@ async fn fill_logical_sizes(
|
|||||||
let parallel_size_calcs = Arc::clone(limit);
|
let parallel_size_calcs = Arc::clone(limit);
|
||||||
let ctx = ctx.attached_child();
|
let ctx = ctx.attached_child();
|
||||||
joinset.spawn(
|
joinset.spawn(
|
||||||
calculate_logical_size(
|
calculate_logical_size(parallel_size_calcs, timeline, lsn, cause, ctx)
|
||||||
parallel_size_calcs,
|
.in_current_span(),
|
||||||
timeline,
|
|
||||||
lsn,
|
|
||||||
cause,
|
|
||||||
ctx,
|
|
||||||
cancel.child_token(),
|
|
||||||
)
|
|
||||||
.in_current_span(),
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
e.insert(cached_size);
|
e.insert(cached_size);
|
||||||
@@ -406,10 +394,12 @@ async fn fill_logical_sizes(
|
|||||||
have_any_error = true;
|
have_any_error = true;
|
||||||
}
|
}
|
||||||
Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => {
|
Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => {
|
||||||
warn!(
|
if !matches!(error, CalculateLogicalSizeError::Cancelled) {
|
||||||
timeline_id=%timeline.timeline_id,
|
warn!(
|
||||||
"failed to calculate logical size at {lsn}: {error:#}"
|
timeline_id=%timeline.timeline_id,
|
||||||
);
|
"failed to calculate logical size at {lsn}: {error:#}"
|
||||||
|
);
|
||||||
|
}
|
||||||
have_any_error = true;
|
have_any_error = true;
|
||||||
}
|
}
|
||||||
Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => {
|
Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => {
|
||||||
@@ -485,14 +475,13 @@ async fn calculate_logical_size(
|
|||||||
lsn: utils::lsn::Lsn,
|
lsn: utils::lsn::Lsn,
|
||||||
cause: LogicalSizeCalculationCause,
|
cause: LogicalSizeCalculationCause,
|
||||||
ctx: RequestContext,
|
ctx: RequestContext,
|
||||||
cancel: CancellationToken,
|
|
||||||
) -> Result<TimelineAtLsnSizeResult, RecvError> {
|
) -> Result<TimelineAtLsnSizeResult, RecvError> {
|
||||||
let _permit = tokio::sync::Semaphore::acquire_owned(limit)
|
let _permit = tokio::sync::Semaphore::acquire_owned(limit)
|
||||||
.await
|
.await
|
||||||
.expect("global semaphore should not had been closed");
|
.expect("global semaphore should not had been closed");
|
||||||
|
|
||||||
let size_res = timeline
|
let size_res = timeline
|
||||||
.spawn_ondemand_logical_size_calculation(lsn, cause, ctx, cancel)
|
.spawn_ondemand_logical_size_calculation(lsn, cause, ctx)
|
||||||
.instrument(info_span!("spawn_ondemand_logical_size_calculation"))
|
.instrument(info_span!("spawn_ondemand_logical_size_calculation"))
|
||||||
.await?;
|
.await?;
|
||||||
Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
|
Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
|
||||||
|
|||||||
@@ -345,14 +345,19 @@ impl InMemoryLayer {
|
|||||||
|
|
||||||
let cursor = inner.file.block_cursor();
|
let cursor = inner.file.block_cursor();
|
||||||
|
|
||||||
let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
|
// Sort the keys because delta layer writer expects them sorted.
|
||||||
keys.sort_by_key(|k| k.0);
|
//
|
||||||
|
// NOTE: this sort can take up significant time if the layer has millions of
|
||||||
|
// keys. To speed up all the comparisons we convert the key to i128 and
|
||||||
|
// keep the value as a reference.
|
||||||
|
let mut keys: Vec<_> = inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect();
|
||||||
|
keys.sort_unstable_by_key(|k| k.0);
|
||||||
|
|
||||||
let ctx = RequestContextBuilder::extend(ctx)
|
let ctx = RequestContextBuilder::extend(ctx)
|
||||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||||
.build();
|
.build();
|
||||||
for (key, vec_map) in keys.iter() {
|
for (key, vec_map) in keys.iter() {
|
||||||
let key = **key;
|
let key = Key::from_i128(*key);
|
||||||
// Write all page versions
|
// Write all page versions
|
||||||
for (lsn, pos) in vec_map.as_slice() {
|
for (lsn, pos) in vec_map.as_slice() {
|
||||||
cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
|
cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ use tokio::{
|
|||||||
};
|
};
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::id::TenantTimelineId;
|
use utils::{id::TenantTimelineId, sync::gate::Gate};
|
||||||
|
|
||||||
use std::cmp::{max, min, Ordering};
|
use std::cmp::{max, min, Ordering};
|
||||||
use std::collections::{BinaryHeap, HashMap, HashSet};
|
use std::collections::{BinaryHeap, HashMap, HashSet};
|
||||||
@@ -36,7 +36,6 @@ use std::time::{Duration, Instant, SystemTime};
|
|||||||
use crate::context::{
|
use crate::context::{
|
||||||
AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
|
AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
|
||||||
};
|
};
|
||||||
use crate::deletion_queue::DeletionQueueClient;
|
|
||||||
use crate::tenant::storage_layer::delta_layer::DeltaEntry;
|
use crate::tenant::storage_layer::delta_layer::DeltaEntry;
|
||||||
use crate::tenant::storage_layer::{
|
use crate::tenant::storage_layer::{
|
||||||
AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer,
|
AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer,
|
||||||
@@ -50,6 +49,7 @@ use crate::tenant::{
|
|||||||
metadata::{save_metadata, TimelineMetadata},
|
metadata::{save_metadata, TimelineMetadata},
|
||||||
par_fsync,
|
par_fsync,
|
||||||
};
|
};
|
||||||
|
use crate::{deletion_queue::DeletionQueueClient, tenant::remote_timeline_client::StopError};
|
||||||
|
|
||||||
use crate::config::PageServerConf;
|
use crate::config::PageServerConf;
|
||||||
use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum};
|
use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum};
|
||||||
@@ -95,12 +95,7 @@ use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenant
|
|||||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||||
pub(super) enum FlushLoopState {
|
pub(super) enum FlushLoopState {
|
||||||
NotStarted,
|
NotStarted,
|
||||||
Running {
|
Running,
|
||||||
#[cfg(test)]
|
|
||||||
expect_initdb_optimization: bool,
|
|
||||||
#[cfg(test)]
|
|
||||||
initdb_optimization_count: usize,
|
|
||||||
},
|
|
||||||
Exited,
|
Exited,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -247,7 +242,7 @@ pub struct Timeline {
|
|||||||
/// the flush finishes. You can use that to wait for the flush to finish.
|
/// the flush finishes. You can use that to wait for the flush to finish.
|
||||||
layer_flush_start_tx: tokio::sync::watch::Sender<u64>,
|
layer_flush_start_tx: tokio::sync::watch::Sender<u64>,
|
||||||
/// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
|
/// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
|
||||||
layer_flush_done_tx: tokio::sync::watch::Sender<(u64, anyhow::Result<()>)>,
|
layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>,
|
||||||
|
|
||||||
/// Layer removal lock.
|
/// Layer removal lock.
|
||||||
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
|
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
|
||||||
@@ -310,6 +305,13 @@ pub struct Timeline {
|
|||||||
/// Load or creation time information about the disk_consistent_lsn and when the loading
|
/// Load or creation time information about the disk_consistent_lsn and when the loading
|
||||||
/// happened. Used for consumption metrics.
|
/// happened. Used for consumption metrics.
|
||||||
pub(crate) loaded_at: (Lsn, SystemTime),
|
pub(crate) loaded_at: (Lsn, SystemTime),
|
||||||
|
|
||||||
|
/// Gate to prevent shutdown completing while I/O is still happening to this timeline's data
|
||||||
|
pub(crate) gate: Gate,
|
||||||
|
|
||||||
|
/// Cancellation token scoped to this timeline: anything doing long-running work relating
|
||||||
|
/// to the timeline should drop out when this token fires.
|
||||||
|
pub(crate) cancel: CancellationToken,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct WalReceiverInfo {
|
pub struct WalReceiverInfo {
|
||||||
@@ -367,6 +369,19 @@ pub enum PageReconstructError {
|
|||||||
WalRedo(anyhow::Error),
|
WalRedo(anyhow::Error),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(thiserror::Error, Debug)]
|
||||||
|
enum FlushLayerError {
|
||||||
|
/// Timeline cancellation token was cancelled
|
||||||
|
#[error("timeline shutting down")]
|
||||||
|
Cancelled,
|
||||||
|
|
||||||
|
#[error(transparent)]
|
||||||
|
PageReconstructError(#[from] PageReconstructError),
|
||||||
|
|
||||||
|
#[error(transparent)]
|
||||||
|
Other(#[from] anyhow::Error),
|
||||||
|
}
|
||||||
|
|
||||||
impl std::fmt::Debug for PageReconstructError {
|
impl std::fmt::Debug for PageReconstructError {
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||||
match self {
|
match self {
|
||||||
@@ -786,7 +801,11 @@ impl Timeline {
|
|||||||
// as an empty timeline. Also in unit tests, when we use the timeline
|
// as an empty timeline. Also in unit tests, when we use the timeline
|
||||||
// as a simple key-value store, ignoring the datadir layout. Log the
|
// as a simple key-value store, ignoring the datadir layout. Log the
|
||||||
// error but continue.
|
// error but continue.
|
||||||
error!("could not compact, repartitioning keyspace failed: {err:?}");
|
//
|
||||||
|
// Suppress error when it's due to cancellation
|
||||||
|
if !self.cancel.is_cancelled() {
|
||||||
|
error!("could not compact, repartitioning keyspace failed: {err:?}");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -880,11 +899,17 @@ impl Timeline {
|
|||||||
self.launch_eviction_task(background_jobs_can_start);
|
self.launch_eviction_task(background_jobs_can_start);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
|
||||||
|
/// also to remote storage. This method can easily take multiple seconds for a busy timeline.
|
||||||
|
///
|
||||||
|
/// While we are flushing, we continue to accept read I/O.
|
||||||
#[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
|
#[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
|
||||||
pub async fn shutdown(self: &Arc<Self>, freeze_and_flush: bool) {
|
pub(crate) async fn flush_and_shutdown(&self) {
|
||||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
|
||||||
// prevent writes to the InMemoryLayer
|
// Stop ingesting data, so that we are not still writing to an InMemoryLayer while
|
||||||
|
// trying to flush
|
||||||
|
tracing::debug!("Waiting for WalReceiverManager...");
|
||||||
task_mgr::shutdown_tasks(
|
task_mgr::shutdown_tasks(
|
||||||
Some(TaskKind::WalReceiverManager),
|
Some(TaskKind::WalReceiverManager),
|
||||||
Some(self.tenant_id),
|
Some(self.tenant_id),
|
||||||
@@ -892,34 +917,74 @@ impl Timeline {
|
|||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
|
// Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance
|
||||||
|
self.last_record_lsn.shutdown();
|
||||||
|
|
||||||
// now all writers to InMemory layer are gone, do the final flush if requested
|
// now all writers to InMemory layer are gone, do the final flush if requested
|
||||||
if freeze_and_flush {
|
match self.freeze_and_flush().await {
|
||||||
match self.freeze_and_flush().await {
|
Ok(_) => {
|
||||||
Ok(()) => {}
|
// drain the upload queue
|
||||||
Err(e) => {
|
if let Some(client) = self.remote_client.as_ref() {
|
||||||
warn!("failed to freeze and flush: {e:#}");
|
// if we did not wait for completion here, it might be our shutdown process
|
||||||
return; // TODO: should probably drain remote timeline client anyways?
|
// didn't wait for remote uploads to complete at all, as new tasks can forever
|
||||||
|
// be spawned.
|
||||||
|
//
|
||||||
|
// what is problematic is the shutting down of RemoteTimelineClient, because
|
||||||
|
// obviously it does not make sense to stop while we wait for it, but what
|
||||||
|
// about corner cases like s3 suddenly hanging up?
|
||||||
|
if let Err(e) = client.wait_completion().await {
|
||||||
|
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
|
||||||
|
// we have some extra WAL replay to do next time the timeline starts.
|
||||||
|
warn!("failed to flush to remote storage: {e:#}");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Err(e) => {
|
||||||
// drain the upload queue
|
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
|
||||||
let res = if let Some(client) = self.remote_client.as_ref() {
|
// we have some extra WAL replay to do next time the timeline starts.
|
||||||
// if we did not wait for completion here, it might be our shutdown process
|
warn!("failed to freeze and flush: {e:#}");
|
||||||
// didn't wait for remote uploads to complete at all, as new tasks can forever
|
|
||||||
// be spawned.
|
|
||||||
//
|
|
||||||
// what is problematic is the shutting down of RemoteTimelineClient, because
|
|
||||||
// obviously it does not make sense to stop while we wait for it, but what
|
|
||||||
// about corner cases like s3 suddenly hanging up?
|
|
||||||
client.wait_completion().await
|
|
||||||
} else {
|
|
||||||
Ok(())
|
|
||||||
};
|
|
||||||
|
|
||||||
if let Err(e) = res {
|
|
||||||
warn!("failed to await for frozen and flushed uploads: {e:#}");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
self.shutdown().await;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of
|
||||||
|
/// the graceful [`Timeline::flush_and_shutdown`] function.
|
||||||
|
pub(crate) async fn shutdown(&self) {
|
||||||
|
// Signal any subscribers to our cancellation token to drop out
|
||||||
|
tracing::debug!("Cancelling CancellationToken");
|
||||||
|
self.cancel.cancel();
|
||||||
|
|
||||||
|
// Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel
|
||||||
|
// while doing so.
|
||||||
|
self.last_record_lsn.shutdown();
|
||||||
|
|
||||||
|
// Shut down the layer flush task before the remote client, as one depends on the other
|
||||||
|
task_mgr::shutdown_tasks(
|
||||||
|
Some(TaskKind::LayerFlushTask),
|
||||||
|
Some(self.tenant_id),
|
||||||
|
Some(self.timeline_id),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
// Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in
|
||||||
|
// case our caller wants to use that for a deletion
|
||||||
|
if let Some(remote_client) = self.remote_client.as_ref() {
|
||||||
|
match remote_client.stop() {
|
||||||
|
Ok(()) => {}
|
||||||
|
Err(StopError::QueueUninitialized) => {
|
||||||
|
// Shutting down during initialization is legal
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tracing::debug!("Waiting for tasks...");
|
||||||
|
|
||||||
|
task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(self.timeline_id)).await;
|
||||||
|
|
||||||
|
// Finally wait until any gate-holders are complete
|
||||||
|
self.gate.close().await;
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn set_state(&self, new_state: TimelineState) {
|
pub fn set_state(&self, new_state: TimelineState) {
|
||||||
@@ -959,7 +1024,12 @@ impl Timeline {
|
|||||||
reason,
|
reason,
|
||||||
backtrace: backtrace_str,
|
backtrace: backtrace_str,
|
||||||
};
|
};
|
||||||
self.set_state(broken_state)
|
self.set_state(broken_state);
|
||||||
|
|
||||||
|
// Although the Broken state is not equivalent to shutdown() (shutdown will be called
|
||||||
|
// later when this tenant is detach or the process shuts down), firing the cancellation token
|
||||||
|
// here avoids the need for other tasks to watch for the Broken state explicitly.
|
||||||
|
self.cancel.cancel();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn current_state(&self) -> TimelineState {
|
pub fn current_state(&self) -> TimelineState {
|
||||||
@@ -1048,6 +1118,11 @@ impl Timeline {
|
|||||||
/// Like [`evict_layer_batch`](Self::evict_layer_batch), but for just one layer.
|
/// Like [`evict_layer_batch`](Self::evict_layer_batch), but for just one layer.
|
||||||
/// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`.
|
/// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`.
|
||||||
pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
|
pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
|
||||||
|
let _gate = self
|
||||||
|
.gate
|
||||||
|
.enter()
|
||||||
|
.map_err(|_| anyhow::anyhow!("Shutting down"))?;
|
||||||
|
|
||||||
let Some(local_layer) = self.find_layer(layer_file_name).await else {
|
let Some(local_layer) = self.find_layer(layer_file_name).await else {
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
};
|
};
|
||||||
@@ -1063,9 +1138,8 @@ impl Timeline {
|
|||||||
.as_ref()
|
.as_ref()
|
||||||
.ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))?;
|
.ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))?;
|
||||||
|
|
||||||
let cancel = CancellationToken::new();
|
|
||||||
let results = self
|
let results = self
|
||||||
.evict_layer_batch(remote_client, &[local_layer], &cancel)
|
.evict_layer_batch(remote_client, &[local_layer])
|
||||||
.await?;
|
.await?;
|
||||||
assert_eq!(results.len(), 1);
|
assert_eq!(results.len(), 1);
|
||||||
let result: Option<Result<(), EvictionError>> = results.into_iter().next().unwrap();
|
let result: Option<Result<(), EvictionError>> = results.into_iter().next().unwrap();
|
||||||
@@ -1080,15 +1154,18 @@ impl Timeline {
|
|||||||
pub(crate) async fn evict_layers(
|
pub(crate) async fn evict_layers(
|
||||||
&self,
|
&self,
|
||||||
layers_to_evict: &[Layer],
|
layers_to_evict: &[Layer],
|
||||||
cancel: &CancellationToken,
|
|
||||||
) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
|
) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
|
||||||
|
let _gate = self
|
||||||
|
.gate
|
||||||
|
.enter()
|
||||||
|
.map_err(|_| anyhow::anyhow!("Shutting down"))?;
|
||||||
|
|
||||||
let remote_client = self
|
let remote_client = self
|
||||||
.remote_client
|
.remote_client
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.context("timeline must have RemoteTimelineClient")?;
|
.context("timeline must have RemoteTimelineClient")?;
|
||||||
|
|
||||||
self.evict_layer_batch(remote_client, layers_to_evict, cancel)
|
self.evict_layer_batch(remote_client, layers_to_evict).await
|
||||||
.await
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Evict multiple layers at once, continuing through errors.
|
/// Evict multiple layers at once, continuing through errors.
|
||||||
@@ -1109,7 +1186,6 @@ impl Timeline {
|
|||||||
&self,
|
&self,
|
||||||
remote_client: &Arc<RemoteTimelineClient>,
|
remote_client: &Arc<RemoteTimelineClient>,
|
||||||
layers_to_evict: &[Layer],
|
layers_to_evict: &[Layer],
|
||||||
cancel: &CancellationToken,
|
|
||||||
) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
|
) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
|
||||||
// ensure that the layers have finished uploading
|
// ensure that the layers have finished uploading
|
||||||
// (don't hold the layer_removal_cs while we do it, we're not removing anything yet)
|
// (don't hold the layer_removal_cs while we do it, we're not removing anything yet)
|
||||||
@@ -1157,7 +1233,7 @@ impl Timeline {
|
|||||||
};
|
};
|
||||||
|
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
_ = cancel.cancelled() => {},
|
_ = self.cancel.cancelled() => {},
|
||||||
_ = join => {}
|
_ = join => {}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1267,6 +1343,7 @@ impl Timeline {
|
|||||||
initial_logical_size_can_start: Option<completion::Barrier>,
|
initial_logical_size_can_start: Option<completion::Barrier>,
|
||||||
initial_logical_size_attempt: Option<completion::Completion>,
|
initial_logical_size_attempt: Option<completion::Completion>,
|
||||||
state: TimelineState,
|
state: TimelineState,
|
||||||
|
cancel: CancellationToken,
|
||||||
) -> Arc<Self> {
|
) -> Arc<Self> {
|
||||||
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
||||||
let (state, _) = watch::channel(state);
|
let (state, _) = watch::channel(state);
|
||||||
@@ -1367,6 +1444,8 @@ impl Timeline {
|
|||||||
|
|
||||||
initial_logical_size_can_start,
|
initial_logical_size_can_start,
|
||||||
initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
|
initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
|
||||||
|
cancel,
|
||||||
|
gate: Gate::new(format!("Timeline<{tenant_id}/{timeline_id}>")),
|
||||||
};
|
};
|
||||||
result.repartition_threshold =
|
result.repartition_threshold =
|
||||||
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
||||||
@@ -1382,7 +1461,7 @@ impl Timeline {
|
|||||||
let mut flush_loop_state = self.flush_loop_state.lock().unwrap();
|
let mut flush_loop_state = self.flush_loop_state.lock().unwrap();
|
||||||
match *flush_loop_state {
|
match *flush_loop_state {
|
||||||
FlushLoopState::NotStarted => (),
|
FlushLoopState::NotStarted => (),
|
||||||
FlushLoopState::Running { .. } => {
|
FlushLoopState::Running => {
|
||||||
info!(
|
info!(
|
||||||
"skipping attempt to start flush_loop twice {}/{}",
|
"skipping attempt to start flush_loop twice {}/{}",
|
||||||
self.tenant_id, self.timeline_id
|
self.tenant_id, self.timeline_id
|
||||||
@@ -1402,12 +1481,7 @@ impl Timeline {
|
|||||||
let self_clone = Arc::clone(self);
|
let self_clone = Arc::clone(self);
|
||||||
|
|
||||||
debug!("spawning flush loop");
|
debug!("spawning flush loop");
|
||||||
*flush_loop_state = FlushLoopState::Running {
|
*flush_loop_state = FlushLoopState::Running;
|
||||||
#[cfg(test)]
|
|
||||||
expect_initdb_optimization: false,
|
|
||||||
#[cfg(test)]
|
|
||||||
initdb_optimization_count: 0,
|
|
||||||
};
|
|
||||||
task_mgr::spawn(
|
task_mgr::spawn(
|
||||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||||
task_mgr::TaskKind::LayerFlushTask,
|
task_mgr::TaskKind::LayerFlushTask,
|
||||||
@@ -1419,7 +1493,7 @@ impl Timeline {
|
|||||||
let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error);
|
let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error);
|
||||||
self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await;
|
self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await;
|
||||||
let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap();
|
let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap();
|
||||||
assert!(matches!(*flush_loop_state, FlushLoopState::Running{ ..}));
|
assert!(matches!(*flush_loop_state, FlushLoopState::Running));
|
||||||
*flush_loop_state = FlushLoopState::Exited;
|
*flush_loop_state = FlushLoopState::Exited;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -1706,12 +1780,8 @@ impl Timeline {
|
|||||||
// delay will be terminated by a timeout regardless.
|
// delay will be terminated by a timeout regardless.
|
||||||
let _completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() };
|
let _completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() };
|
||||||
|
|
||||||
// no extra cancellation here, because nothing really waits for this to complete compared
|
|
||||||
// to spawn_ondemand_logical_size_calculation.
|
|
||||||
let cancel = CancellationToken::new();
|
|
||||||
|
|
||||||
let calculated_size = match self_clone
|
let calculated_size = match self_clone
|
||||||
.logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel)
|
.logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx)
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
Ok(s) => s,
|
Ok(s) => s,
|
||||||
@@ -1780,7 +1850,6 @@ impl Timeline {
|
|||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
cause: LogicalSizeCalculationCause,
|
cause: LogicalSizeCalculationCause,
|
||||||
ctx: RequestContext,
|
ctx: RequestContext,
|
||||||
cancel: CancellationToken,
|
|
||||||
) -> oneshot::Receiver<Result<u64, CalculateLogicalSizeError>> {
|
) -> oneshot::Receiver<Result<u64, CalculateLogicalSizeError>> {
|
||||||
let (sender, receiver) = oneshot::channel();
|
let (sender, receiver) = oneshot::channel();
|
||||||
let self_clone = Arc::clone(self);
|
let self_clone = Arc::clone(self);
|
||||||
@@ -1801,7 +1870,7 @@ impl Timeline {
|
|||||||
false,
|
false,
|
||||||
async move {
|
async move {
|
||||||
let res = self_clone
|
let res = self_clone
|
||||||
.logical_size_calculation_task(lsn, cause, &ctx, cancel)
|
.logical_size_calculation_task(lsn, cause, &ctx)
|
||||||
.await;
|
.await;
|
||||||
let _ = sender.send(res).ok();
|
let _ = sender.send(res).ok();
|
||||||
Ok(()) // Receiver is responsible for handling errors
|
Ok(()) // Receiver is responsible for handling errors
|
||||||
@@ -1817,58 +1886,28 @@ impl Timeline {
|
|||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
cause: LogicalSizeCalculationCause,
|
cause: LogicalSizeCalculationCause,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
cancel: CancellationToken,
|
|
||||||
) -> Result<u64, CalculateLogicalSizeError> {
|
) -> Result<u64, CalculateLogicalSizeError> {
|
||||||
span::debug_assert_current_span_has_tenant_and_timeline_id();
|
span::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
|
||||||
let mut timeline_state_updates = self.subscribe_for_state_updates();
|
let _guard = self.gate.enter();
|
||||||
|
|
||||||
let self_calculation = Arc::clone(self);
|
let self_calculation = Arc::clone(self);
|
||||||
|
|
||||||
let mut calculation = pin!(async {
|
let mut calculation = pin!(async {
|
||||||
let cancel = cancel.child_token();
|
|
||||||
let ctx = ctx.attached_child();
|
let ctx = ctx.attached_child();
|
||||||
self_calculation
|
self_calculation
|
||||||
.calculate_logical_size(lsn, cause, cancel, &ctx)
|
.calculate_logical_size(lsn, cause, &ctx)
|
||||||
.await
|
.await
|
||||||
});
|
});
|
||||||
let timeline_state_cancellation = async {
|
|
||||||
loop {
|
|
||||||
match timeline_state_updates.changed().await {
|
|
||||||
Ok(()) => {
|
|
||||||
let new_state = timeline_state_updates.borrow().clone();
|
|
||||||
match new_state {
|
|
||||||
// we're running this job for active timelines only
|
|
||||||
TimelineState::Active => continue,
|
|
||||||
TimelineState::Broken { .. }
|
|
||||||
| TimelineState::Stopping
|
|
||||||
| TimelineState::Loading => {
|
|
||||||
break format!("aborted because timeline became inactive (new state: {new_state:?})")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Err(_sender_dropped_error) => {
|
|
||||||
// can't happen, the sender is not dropped as long as the Timeline exists
|
|
||||||
break "aborted because state watch was dropped".to_string();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let taskmgr_shutdown_cancellation = async {
|
|
||||||
task_mgr::shutdown_watcher().await;
|
|
||||||
"aborted because task_mgr shutdown requested".to_string()
|
|
||||||
};
|
|
||||||
|
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
res = &mut calculation => { res }
|
res = &mut calculation => { res }
|
||||||
reason = timeline_state_cancellation => {
|
_ = self.cancel.cancelled() => {
|
||||||
debug!(reason = reason, "cancelling calculation");
|
debug!("cancelling logical size calculation for timeline shutdown");
|
||||||
cancel.cancel();
|
|
||||||
calculation.await
|
calculation.await
|
||||||
}
|
}
|
||||||
reason = taskmgr_shutdown_cancellation => {
|
_ = task_mgr::shutdown_watcher() => {
|
||||||
debug!(reason = reason, "cancelling calculation");
|
debug!("cancelling logical size calculation for task shutdown");
|
||||||
cancel.cancel();
|
|
||||||
calculation.await
|
calculation.await
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1882,7 +1921,6 @@ impl Timeline {
|
|||||||
&self,
|
&self,
|
||||||
up_to_lsn: Lsn,
|
up_to_lsn: Lsn,
|
||||||
cause: LogicalSizeCalculationCause,
|
cause: LogicalSizeCalculationCause,
|
||||||
cancel: CancellationToken,
|
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<u64, CalculateLogicalSizeError> {
|
) -> Result<u64, CalculateLogicalSizeError> {
|
||||||
info!(
|
info!(
|
||||||
@@ -1925,7 +1963,7 @@ impl Timeline {
|
|||||||
};
|
};
|
||||||
let timer = storage_time_metrics.start_timer();
|
let timer = storage_time_metrics.start_timer();
|
||||||
let logical_size = self
|
let logical_size = self
|
||||||
.get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx)
|
.get_current_logical_size_non_incremental(up_to_lsn, ctx)
|
||||||
.await?;
|
.await?;
|
||||||
debug!("calculated logical size: {logical_size}");
|
debug!("calculated logical size: {logical_size}");
|
||||||
timer.stop_and_record();
|
timer.stop_and_record();
|
||||||
@@ -2030,6 +2068,10 @@ impl Timeline {
|
|||||||
let mut cont_lsn = Lsn(request_lsn.0 + 1);
|
let mut cont_lsn = Lsn(request_lsn.0 + 1);
|
||||||
|
|
||||||
'outer: loop {
|
'outer: loop {
|
||||||
|
if self.cancel.is_cancelled() {
|
||||||
|
return Err(PageReconstructError::Cancelled);
|
||||||
|
}
|
||||||
|
|
||||||
// The function should have updated 'state'
|
// The function should have updated 'state'
|
||||||
//info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
|
//info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
|
||||||
match result {
|
match result {
|
||||||
@@ -2334,6 +2376,10 @@ impl Timeline {
|
|||||||
info!("started flush loop");
|
info!("started flush loop");
|
||||||
loop {
|
loop {
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
|
_ = self.cancel.cancelled() => {
|
||||||
|
info!("shutting down layer flush task");
|
||||||
|
break;
|
||||||
|
},
|
||||||
_ = task_mgr::shutdown_watcher() => {
|
_ = task_mgr::shutdown_watcher() => {
|
||||||
info!("shutting down layer flush task");
|
info!("shutting down layer flush task");
|
||||||
break;
|
break;
|
||||||
@@ -2345,6 +2391,14 @@ impl Timeline {
|
|||||||
let timer = self.metrics.flush_time_histo.start_timer();
|
let timer = self.metrics.flush_time_histo.start_timer();
|
||||||
let flush_counter = *layer_flush_start_rx.borrow();
|
let flush_counter = *layer_flush_start_rx.borrow();
|
||||||
let result = loop {
|
let result = loop {
|
||||||
|
if self.cancel.is_cancelled() {
|
||||||
|
info!("dropping out of flush loop for timeline shutdown");
|
||||||
|
// Note: we do not bother transmitting into [`layer_flush_done_tx`], because
|
||||||
|
// anyone waiting on that will respect self.cancel as well: they will stop
|
||||||
|
// waiting at the same time we as drop out of this loop.
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
let layer_to_flush = {
|
let layer_to_flush = {
|
||||||
let guard = self.layers.read().await;
|
let guard = self.layers.read().await;
|
||||||
guard.layer_map().frozen_layers.front().cloned()
|
guard.layer_map().frozen_layers.front().cloned()
|
||||||
@@ -2353,9 +2407,18 @@ impl Timeline {
|
|||||||
let Some(layer_to_flush) = layer_to_flush else {
|
let Some(layer_to_flush) = layer_to_flush else {
|
||||||
break Ok(());
|
break Ok(());
|
||||||
};
|
};
|
||||||
if let Err(err) = self.flush_frozen_layer(layer_to_flush, ctx).await {
|
match self.flush_frozen_layer(layer_to_flush, ctx).await {
|
||||||
error!("could not flush frozen layer: {err:?}");
|
Ok(()) => {}
|
||||||
break Err(err);
|
Err(FlushLayerError::Cancelled) => {
|
||||||
|
info!("dropping out of flush loop for timeline shutdown");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
err @ Err(
|
||||||
|
FlushLayerError::Other(_) | FlushLayerError::PageReconstructError(_),
|
||||||
|
) => {
|
||||||
|
error!("could not flush frozen layer: {err:?}");
|
||||||
|
break err;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
// Notify any listeners that we're done
|
// Notify any listeners that we're done
|
||||||
@@ -2377,7 +2440,7 @@ impl Timeline {
|
|||||||
let mut my_flush_request = 0;
|
let mut my_flush_request = 0;
|
||||||
|
|
||||||
let flush_loop_state = { *self.flush_loop_state.lock().unwrap() };
|
let flush_loop_state = { *self.flush_loop_state.lock().unwrap() };
|
||||||
if !matches!(flush_loop_state, FlushLoopState::Running { .. }) {
|
if !matches!(flush_loop_state, FlushLoopState::Running) {
|
||||||
anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
|
anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2404,7 +2467,17 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
trace!("waiting for flush to complete");
|
trace!("waiting for flush to complete");
|
||||||
rx.changed().await?;
|
tokio::select! {
|
||||||
|
rx_e = rx.changed() => {
|
||||||
|
rx_e?;
|
||||||
|
},
|
||||||
|
// Cancellation safety: we are not leaving an I/O in-flight for the flush, we're just ignoring
|
||||||
|
// the notification from [`flush_loop`] that it completed.
|
||||||
|
_ = self.cancel.cancelled() => {
|
||||||
|
tracing::info!("Cancelled layer flush due on timeline shutdown");
|
||||||
|
return Ok(())
|
||||||
|
}
|
||||||
|
};
|
||||||
trace!("done")
|
trace!("done")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -2419,61 +2492,13 @@ impl Timeline {
|
|||||||
self: &Arc<Self>,
|
self: &Arc<Self>,
|
||||||
frozen_layer: Arc<InMemoryLayer>,
|
frozen_layer: Arc<InMemoryLayer>,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> Result<(), FlushLayerError> {
|
||||||
// As a special case, when we have just imported an image into the repository,
|
|
||||||
// instead of writing out a L0 delta layer, we directly write out image layer
|
|
||||||
// files instead. This is possible as long as *all* the data imported into the
|
|
||||||
// repository have the same LSN.
|
|
||||||
let lsn_range = frozen_layer.get_lsn_range();
|
let lsn_range = frozen_layer.get_lsn_range();
|
||||||
let (layers_to_upload, delta_layer_to_add) =
|
let layer = self.create_delta_layer(&frozen_layer, ctx).await?;
|
||||||
if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) {
|
|
||||||
#[cfg(test)]
|
if self.cancel.is_cancelled() {
|
||||||
match &mut *self.flush_loop_state.lock().unwrap() {
|
return Err(FlushLayerError::Cancelled);
|
||||||
FlushLoopState::NotStarted | FlushLoopState::Exited => {
|
}
|
||||||
panic!("flush loop not running")
|
|
||||||
}
|
|
||||||
FlushLoopState::Running {
|
|
||||||
initdb_optimization_count,
|
|
||||||
..
|
|
||||||
} => {
|
|
||||||
*initdb_optimization_count += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Note: The 'ctx' in use here has DownloadBehavior::Error. We should not
|
|
||||||
// require downloading anything during initial import.
|
|
||||||
let (partitioning, _lsn) = self
|
|
||||||
.repartition(self.initdb_lsn, self.get_compaction_target_size(), ctx)
|
|
||||||
.await?;
|
|
||||||
// For image layers, we add them immediately into the layer map.
|
|
||||||
(
|
|
||||||
self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx)
|
|
||||||
.await?,
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
} else {
|
|
||||||
#[cfg(test)]
|
|
||||||
match &mut *self.flush_loop_state.lock().unwrap() {
|
|
||||||
FlushLoopState::NotStarted | FlushLoopState::Exited => {
|
|
||||||
panic!("flush loop not running")
|
|
||||||
}
|
|
||||||
FlushLoopState::Running {
|
|
||||||
expect_initdb_optimization,
|
|
||||||
..
|
|
||||||
} => {
|
|
||||||
assert!(!*expect_initdb_optimization, "expected initdb optimization");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Normal case, write out a L0 delta layer file.
|
|
||||||
// `create_delta_layer` will not modify the layer map.
|
|
||||||
// We will remove frozen layer and add delta layer in one atomic operation later.
|
|
||||||
let layer = self.create_delta_layer(&frozen_layer, ctx).await?;
|
|
||||||
(
|
|
||||||
// FIXME: even though we have a single image and single delta layer assumption
|
|
||||||
// we push them to vec
|
|
||||||
vec![layer.clone()],
|
|
||||||
Some(layer),
|
|
||||||
)
|
|
||||||
};
|
|
||||||
|
|
||||||
let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
|
let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
|
||||||
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
|
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
|
||||||
@@ -2484,18 +2509,21 @@ impl Timeline {
|
|||||||
let metadata = {
|
let metadata = {
|
||||||
let mut guard = self.layers.write().await;
|
let mut guard = self.layers.write().await;
|
||||||
|
|
||||||
guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics);
|
if self.cancel.is_cancelled() {
|
||||||
|
return Err(FlushLayerError::Cancelled);
|
||||||
|
}
|
||||||
|
|
||||||
|
guard.finish_flush_l0_layer(&layer, &frozen_layer, &self.metrics);
|
||||||
|
|
||||||
if disk_consistent_lsn != old_disk_consistent_lsn {
|
if disk_consistent_lsn != old_disk_consistent_lsn {
|
||||||
assert!(disk_consistent_lsn > old_disk_consistent_lsn);
|
assert!(disk_consistent_lsn > old_disk_consistent_lsn);
|
||||||
self.disk_consistent_lsn.store(disk_consistent_lsn);
|
self.disk_consistent_lsn.store(disk_consistent_lsn);
|
||||||
|
|
||||||
// Schedule remote uploads that will reflect our new disk_consistent_lsn
|
// Schedule remote uploads that will reflect our new disk_consistent_lsn
|
||||||
Some(self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?)
|
Some(self.schedule_uploads(disk_consistent_lsn, [layer])?)
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
// release lock on 'layers'
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
|
// FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
|
||||||
@@ -4366,25 +4394,10 @@ mod tests {
|
|||||||
.expect("should had been resident")
|
.expect("should had been resident")
|
||||||
.drop_eviction_guard();
|
.drop_eviction_guard();
|
||||||
|
|
||||||
let cancel = tokio_util::sync::CancellationToken::new();
|
|
||||||
let batch = [layer];
|
let batch = [layer];
|
||||||
|
|
||||||
let first = {
|
let first = async { timeline.evict_layer_batch(&rc, &batch).await.unwrap() };
|
||||||
let cancel = cancel.child_token();
|
let second = async { timeline.evict_layer_batch(&rc, &batch).await.unwrap() };
|
||||||
async {
|
|
||||||
let cancel = cancel;
|
|
||||||
timeline
|
|
||||||
.evict_layer_batch(&rc, &batch, &cancel)
|
|
||||||
.await
|
|
||||||
.unwrap()
|
|
||||||
}
|
|
||||||
};
|
|
||||||
let second = async {
|
|
||||||
timeline
|
|
||||||
.evict_layer_batch(&rc, &batch, &cancel)
|
|
||||||
.await
|
|
||||||
.unwrap()
|
|
||||||
};
|
|
||||||
|
|
||||||
let (first, second) = tokio::join!(first, second);
|
let (first, second) = tokio::join!(first, second);
|
||||||
|
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ use crate::{
|
|||||||
deletion_queue::DeletionQueueClient,
|
deletion_queue::DeletionQueueClient,
|
||||||
task_mgr::{self, TaskKind},
|
task_mgr::{self, TaskKind},
|
||||||
tenant::{
|
tenant::{
|
||||||
|
debug_assert_current_span_has_tenant_and_timeline_id,
|
||||||
metadata::TimelineMetadata,
|
metadata::TimelineMetadata,
|
||||||
remote_timeline_client::{
|
remote_timeline_client::{
|
||||||
self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
|
self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
|
||||||
@@ -30,6 +31,11 @@ use super::{Timeline, TimelineResources};
|
|||||||
|
|
||||||
/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
|
/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
|
||||||
async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
|
async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
|
||||||
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
// Notify any timeline work to drop out of loops/requests
|
||||||
|
tracing::debug!("Cancelling CancellationToken");
|
||||||
|
timeline.cancel.cancel();
|
||||||
|
|
||||||
// Stop the walreceiver first.
|
// Stop the walreceiver first.
|
||||||
debug!("waiting for wal receiver to shutdown");
|
debug!("waiting for wal receiver to shutdown");
|
||||||
let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
|
let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
|
||||||
@@ -74,6 +80,11 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
|
|||||||
"failpoint: timeline-delete-before-index-deleted-at"
|
"failpoint: timeline-delete-before-index-deleted-at"
|
||||||
))?
|
))?
|
||||||
});
|
});
|
||||||
|
|
||||||
|
tracing::debug!("Waiting for gate...");
|
||||||
|
timeline.gate.close().await;
|
||||||
|
tracing::debug!("Shutdown complete");
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -277,10 +277,7 @@ impl Timeline {
|
|||||||
Some(c) => c,
|
Some(c) => c,
|
||||||
};
|
};
|
||||||
|
|
||||||
let results = match self
|
let results = match self.evict_layer_batch(remote_client, &candidates).await {
|
||||||
.evict_layer_batch(remote_client, &candidates, cancel)
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
Err(pre_err) => {
|
Err(pre_err) => {
|
||||||
stats.errors += candidates.len();
|
stats.errors += candidates.len();
|
||||||
error!("could not do any evictions: {pre_err:#}");
|
error!("could not do any evictions: {pre_err:#}");
|
||||||
@@ -329,8 +326,7 @@ impl Timeline {
|
|||||||
match state.last_layer_access_imitation {
|
match state.last_layer_access_imitation {
|
||||||
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
|
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
|
||||||
_ => {
|
_ => {
|
||||||
self.imitate_timeline_cached_layer_accesses(cancel, ctx)
|
self.imitate_timeline_cached_layer_accesses(ctx).await;
|
||||||
.await;
|
|
||||||
state.last_layer_access_imitation = Some(tokio::time::Instant::now())
|
state.last_layer_access_imitation = Some(tokio::time::Instant::now())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -344,20 +340,7 @@ impl Timeline {
|
|||||||
// Make one of the tenant's timelines draw the short straw and run the calculation.
|
// Make one of the tenant's timelines draw the short straw and run the calculation.
|
||||||
// The others wait until the calculation is done so that they take into account the
|
// The others wait until the calculation is done so that they take into account the
|
||||||
// imitated accesses that the winner made.
|
// imitated accesses that the winner made.
|
||||||
//
|
let tenant = match crate::tenant::mgr::get_tenant(self.tenant_id, true) {
|
||||||
// It is critical we are responsive to cancellation here. Otherwise, we deadlock with
|
|
||||||
// tenant deletion (holds TENANTS in read mode) any other task that attempts to
|
|
||||||
// acquire TENANTS in write mode before we here call get_tenant.
|
|
||||||
// See https://github.com/neondatabase/neon/issues/5284.
|
|
||||||
let res = tokio::select! {
|
|
||||||
_ = cancel.cancelled() => {
|
|
||||||
return ControlFlow::Break(());
|
|
||||||
}
|
|
||||||
res = crate::tenant::mgr::get_tenant(self.tenant_id, true) => {
|
|
||||||
res
|
|
||||||
}
|
|
||||||
};
|
|
||||||
let tenant = match res {
|
|
||||||
Ok(t) => t,
|
Ok(t) => t,
|
||||||
Err(_) => {
|
Err(_) => {
|
||||||
return ControlFlow::Break(());
|
return ControlFlow::Break(());
|
||||||
@@ -383,21 +366,12 @@ impl Timeline {
|
|||||||
|
|
||||||
/// Recompute the values which would cause on-demand downloads during restart.
|
/// Recompute the values which would cause on-demand downloads during restart.
|
||||||
#[instrument(skip_all)]
|
#[instrument(skip_all)]
|
||||||
async fn imitate_timeline_cached_layer_accesses(
|
async fn imitate_timeline_cached_layer_accesses(&self, ctx: &RequestContext) {
|
||||||
&self,
|
|
||||||
cancel: &CancellationToken,
|
|
||||||
ctx: &RequestContext,
|
|
||||||
) {
|
|
||||||
let lsn = self.get_last_record_lsn();
|
let lsn = self.get_last_record_lsn();
|
||||||
|
|
||||||
// imitiate on-restart initial logical size
|
// imitiate on-restart initial logical size
|
||||||
let size = self
|
let size = self
|
||||||
.calculate_logical_size(
|
.calculate_logical_size(lsn, LogicalSizeCalculationCause::EvictionTaskImitation, ctx)
|
||||||
lsn,
|
|
||||||
LogicalSizeCalculationCause::EvictionTaskImitation,
|
|
||||||
cancel.clone(),
|
|
||||||
ctx,
|
|
||||||
)
|
|
||||||
.instrument(info_span!("calculate_logical_size"))
|
.instrument(info_span!("calculate_logical_size"))
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
|
|||||||
@@ -164,7 +164,7 @@ impl LayerManager {
|
|||||||
/// Flush a frozen layer and add the written delta layer to the layer map.
|
/// Flush a frozen layer and add the written delta layer to the layer map.
|
||||||
pub(crate) fn finish_flush_l0_layer(
|
pub(crate) fn finish_flush_l0_layer(
|
||||||
&mut self,
|
&mut self,
|
||||||
delta_layer: Option<&ResidentLayer>,
|
delta_layer: &ResidentLayer,
|
||||||
frozen_layer_for_check: &Arc<InMemoryLayer>,
|
frozen_layer_for_check: &Arc<InMemoryLayer>,
|
||||||
metrics: &TimelineMetrics,
|
metrics: &TimelineMetrics,
|
||||||
) {
|
) {
|
||||||
@@ -179,12 +179,14 @@ impl LayerManager {
|
|||||||
// layer to disk at the same time, that would not work.
|
// layer to disk at the same time, that would not work.
|
||||||
assert_eq!(Arc::as_ptr(&inmem), Arc::as_ptr(frozen_layer_for_check));
|
assert_eq!(Arc::as_ptr(&inmem), Arc::as_ptr(frozen_layer_for_check));
|
||||||
|
|
||||||
if let Some(l) = delta_layer {
|
let mut updates = self.layer_map.batch_update();
|
||||||
let mut updates = self.layer_map.batch_update();
|
Self::insert_historic_layer(
|
||||||
Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
|
delta_layer.as_ref().clone(),
|
||||||
metrics.record_new_file_metrics(l.layer_desc().file_size);
|
&mut updates,
|
||||||
updates.flush();
|
&mut self.layer_fmgr,
|
||||||
}
|
);
|
||||||
|
metrics.record_new_file_metrics(delta_layer.layer_desc().file_size);
|
||||||
|
updates.flush();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Called when compaction is completed.
|
/// Called when compaction is completed.
|
||||||
|
|||||||
@@ -426,7 +426,7 @@ impl ConnectionManagerState {
|
|||||||
timeline,
|
timeline,
|
||||||
new_sk.wal_source_connconf,
|
new_sk.wal_source_connconf,
|
||||||
events_sender,
|
events_sender,
|
||||||
cancellation,
|
cancellation.clone(),
|
||||||
connect_timeout,
|
connect_timeout,
|
||||||
ctx,
|
ctx,
|
||||||
node_id,
|
node_id,
|
||||||
@@ -447,7 +447,14 @@ impl ConnectionManagerState {
|
|||||||
}
|
}
|
||||||
WalReceiverError::Other(e) => {
|
WalReceiverError::Other(e) => {
|
||||||
// give out an error to have task_mgr give it a really verbose logging
|
// give out an error to have task_mgr give it a really verbose logging
|
||||||
Err(e).context("walreceiver connection handling failure")
|
if cancellation.is_cancelled() {
|
||||||
|
// Ideally we would learn about this via some path other than Other, but
|
||||||
|
// that requires refactoring all the intermediate layers of ingest code
|
||||||
|
// that only emit anyhow::Error
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
Err(e).context("walreceiver connection handling failure")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -44,7 +44,6 @@ use std::sync::atomic::{AtomicUsize, Ordering};
|
|||||||
use crate::config::PageServerConf;
|
use crate::config::PageServerConf;
|
||||||
use crate::metrics::{
|
use crate::metrics::{
|
||||||
WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
|
WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
|
||||||
WAL_REDO_WAIT_TIME,
|
|
||||||
};
|
};
|
||||||
use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
|
use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
|
||||||
use crate::repository::Key;
|
use crate::repository::Key;
|
||||||
@@ -207,11 +206,8 @@ impl PostgresRedoManager {
|
|||||||
) -> anyhow::Result<Bytes> {
|
) -> anyhow::Result<Bytes> {
|
||||||
let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
|
let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
|
||||||
const MAX_RETRY_ATTEMPTS: u32 = 1;
|
const MAX_RETRY_ATTEMPTS: u32 = 1;
|
||||||
let start_time = Instant::now();
|
|
||||||
let mut n_attempts = 0u32;
|
let mut n_attempts = 0u32;
|
||||||
loop {
|
loop {
|
||||||
let lock_time = Instant::now();
|
|
||||||
|
|
||||||
// launch the WAL redo process on first use
|
// launch the WAL redo process on first use
|
||||||
let proc: Arc<WalRedoProcess> = {
|
let proc: Arc<WalRedoProcess> = {
|
||||||
let proc_guard = self.redo_process.read().unwrap();
|
let proc_guard = self.redo_process.read().unwrap();
|
||||||
@@ -236,7 +232,7 @@ impl PostgresRedoManager {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
|
let started_at = std::time::Instant::now();
|
||||||
|
|
||||||
// Relational WAL records are applied using wal-redo-postgres
|
// Relational WAL records are applied using wal-redo-postgres
|
||||||
let buf_tag = BufferTag { rel, blknum };
|
let buf_tag = BufferTag { rel, blknum };
|
||||||
@@ -244,8 +240,7 @@ impl PostgresRedoManager {
|
|||||||
.apply_wal_records(buf_tag, &base_img, records, wal_redo_timeout)
|
.apply_wal_records(buf_tag, &base_img, records, wal_redo_timeout)
|
||||||
.context("apply_wal_records");
|
.context("apply_wal_records");
|
||||||
|
|
||||||
let end_time = Instant::now();
|
let duration = started_at.elapsed();
|
||||||
let duration = end_time.duration_since(lock_time);
|
|
||||||
|
|
||||||
let len = records.len();
|
let len = records.len();
|
||||||
let nbytes = records.iter().fold(0, |acumulator, record| {
|
let nbytes = records.iter().fold(0, |acumulator, record| {
|
||||||
@@ -596,21 +591,21 @@ trait CloseFileDescriptors: CommandExt {
|
|||||||
|
|
||||||
impl<C: CommandExt> CloseFileDescriptors for C {
|
impl<C: CommandExt> CloseFileDescriptors for C {
|
||||||
fn close_fds(&mut self) -> &mut Command {
|
fn close_fds(&mut self) -> &mut Command {
|
||||||
|
// SAFETY: Code executed inside pre_exec should have async-signal-safety,
|
||||||
|
// which means it should be safe to execute inside a signal handler.
|
||||||
|
// The precise meaning depends on platform. See `man signal-safety`
|
||||||
|
// for the linux definition.
|
||||||
|
//
|
||||||
|
// The set_fds_cloexec_threadsafe function is documented to be
|
||||||
|
// async-signal-safe.
|
||||||
|
//
|
||||||
|
// Aside from this function, the rest of the code is re-entrant and
|
||||||
|
// doesn't make any syscalls. We're just passing constants.
|
||||||
|
//
|
||||||
|
// NOTE: It's easy to indirectly cause a malloc or lock a mutex,
|
||||||
|
// which is not async-signal-safe. Be careful.
|
||||||
unsafe {
|
unsafe {
|
||||||
self.pre_exec(move || {
|
self.pre_exec(move || {
|
||||||
// SAFETY: Code executed inside pre_exec should have async-signal-safety,
|
|
||||||
// which means it should be safe to execute inside a signal handler.
|
|
||||||
// The precise meaning depends on platform. See `man signal-safety`
|
|
||||||
// for the linux definition.
|
|
||||||
//
|
|
||||||
// The set_fds_cloexec_threadsafe function is documented to be
|
|
||||||
// async-signal-safe.
|
|
||||||
//
|
|
||||||
// Aside from this function, the rest of the code is re-entrant and
|
|
||||||
// doesn't make any syscalls. We're just passing constants.
|
|
||||||
//
|
|
||||||
// NOTE: It's easy to indirectly cause a malloc or lock a mutex,
|
|
||||||
// which is not async-signal-safe. Be careful.
|
|
||||||
close_fds::set_fds_cloexec_threadsafe(3, &[]);
|
close_fds::set_fds_cloexec_threadsafe(3, &[]);
|
||||||
Ok(())
|
Ok(())
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -19,7 +19,10 @@
|
|||||||
#include "access/xlog.h"
|
#include "access/xlog.h"
|
||||||
#include "access/xlogutils.h"
|
#include "access/xlogutils.h"
|
||||||
#include "storage/buf_internals.h"
|
#include "storage/buf_internals.h"
|
||||||
|
#include "storage/lwlock.h"
|
||||||
|
#include "storage/ipc.h"
|
||||||
#include "c.h"
|
#include "c.h"
|
||||||
|
#include "postmaster/interrupt.h"
|
||||||
|
|
||||||
#include "libpq-fe.h"
|
#include "libpq-fe.h"
|
||||||
#include "libpq/pqformat.h"
|
#include "libpq/pqformat.h"
|
||||||
@@ -61,23 +64,63 @@ int flush_every_n_requests = 8;
|
|||||||
int n_reconnect_attempts = 0;
|
int n_reconnect_attempts = 0;
|
||||||
int max_reconnect_attempts = 60;
|
int max_reconnect_attempts = 60;
|
||||||
|
|
||||||
|
#define MAX_PAGESERVER_CONNSTRING_SIZE 256
|
||||||
|
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
LWLockId lock;
|
||||||
|
pg_atomic_uint64 update_counter;
|
||||||
|
char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
|
||||||
|
} PagestoreShmemState;
|
||||||
|
|
||||||
|
#if PG_VERSION_NUM >= 150000
|
||||||
|
static shmem_request_hook_type prev_shmem_request_hook = NULL;
|
||||||
|
static void walproposer_shmem_request(void);
|
||||||
|
#endif
|
||||||
|
static shmem_startup_hook_type prev_shmem_startup_hook;
|
||||||
|
static PagestoreShmemState *pagestore_shared;
|
||||||
|
static uint64 pagestore_local_counter = 0;
|
||||||
|
static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
|
||||||
|
|
||||||
bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
|
bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
|
||||||
|
|
||||||
static bool pageserver_flush(void);
|
static bool pageserver_flush(void);
|
||||||
static void pageserver_disconnect(void);
|
static void pageserver_disconnect(void);
|
||||||
|
|
||||||
|
static bool
|
||||||
static pqsigfunc prev_signal_handler;
|
CheckPageserverConnstring(char **newval, void **extra, GucSource source)
|
||||||
|
{
|
||||||
|
return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
pageserver_sighup_handler(SIGNAL_ARGS)
|
AssignPageserverConnstring(const char *newval, void *extra)
|
||||||
{
|
{
|
||||||
if (prev_signal_handler)
|
if(!pagestore_shared)
|
||||||
{
|
return;
|
||||||
prev_signal_handler(postgres_signal_arg);
|
LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
|
||||||
}
|
strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
|
||||||
neon_log(LOG, "Received SIGHUP, disconnecting pageserver. New pageserver connstring is %s", page_server_connstring);
|
pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
|
||||||
pageserver_disconnect();
|
LWLockRelease(pagestore_shared->lock);
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
CheckConnstringUpdated()
|
||||||
|
{
|
||||||
|
if(!pagestore_shared)
|
||||||
|
return false;
|
||||||
|
return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
ReloadConnstring()
|
||||||
|
{
|
||||||
|
if(!pagestore_shared)
|
||||||
|
return;
|
||||||
|
LWLockAcquire(pagestore_shared->lock, LW_SHARED);
|
||||||
|
strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
|
||||||
|
pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
|
||||||
|
LWLockRelease(pagestore_shared->lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
@@ -91,6 +134,11 @@ pageserver_connect(int elevel)
|
|||||||
|
|
||||||
Assert(!connected);
|
Assert(!connected);
|
||||||
|
|
||||||
|
if(CheckConnstringUpdated())
|
||||||
|
{
|
||||||
|
ReloadConnstring();
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Connect using the connection string we got from the
|
* Connect using the connection string we got from the
|
||||||
* neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
|
* neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
|
||||||
@@ -110,7 +158,7 @@ pageserver_connect(int elevel)
|
|||||||
n++;
|
n++;
|
||||||
}
|
}
|
||||||
keywords[n] = "dbname";
|
keywords[n] = "dbname";
|
||||||
values[n] = page_server_connstring;
|
values[n] = local_pageserver_connstring;
|
||||||
n++;
|
n++;
|
||||||
keywords[n] = NULL;
|
keywords[n] = NULL;
|
||||||
values[n] = NULL;
|
values[n] = NULL;
|
||||||
@@ -254,6 +302,12 @@ pageserver_send(NeonRequest * request)
|
|||||||
{
|
{
|
||||||
StringInfoData req_buff;
|
StringInfoData req_buff;
|
||||||
|
|
||||||
|
if(CheckConnstringUpdated())
|
||||||
|
{
|
||||||
|
pageserver_disconnect();
|
||||||
|
ReloadConnstring();
|
||||||
|
}
|
||||||
|
|
||||||
/* If the connection was lost for some reason, reconnect */
|
/* If the connection was lost for some reason, reconnect */
|
||||||
if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
|
if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
|
||||||
{
|
{
|
||||||
@@ -274,6 +328,7 @@ pageserver_send(NeonRequest * request)
|
|||||||
{
|
{
|
||||||
while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
|
while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
|
||||||
{
|
{
|
||||||
|
HandleMainLoopInterrupts();
|
||||||
n_reconnect_attempts += 1;
|
n_reconnect_attempts += 1;
|
||||||
pg_usleep(RECONNECT_INTERVAL_USEC);
|
pg_usleep(RECONNECT_INTERVAL_USEC);
|
||||||
}
|
}
|
||||||
@@ -391,7 +446,8 @@ pageserver_flush(void)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
page_server_api api = {
|
page_server_api api =
|
||||||
|
{
|
||||||
.send = pageserver_send,
|
.send = pageserver_send,
|
||||||
.flush = pageserver_flush,
|
.flush = pageserver_flush,
|
||||||
.receive = pageserver_receive
|
.receive = pageserver_receive
|
||||||
@@ -405,12 +461,72 @@ check_neon_id(char **newval, void **extra, GucSource source)
|
|||||||
return **newval == '\0' || HexDecodeString(id, *newval, 16);
|
return **newval == '\0' || HexDecodeString(id, *newval, 16);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static Size
|
||||||
|
PagestoreShmemSize(void)
|
||||||
|
{
|
||||||
|
return sizeof(PagestoreShmemState);
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
PagestoreShmemInit(void)
|
||||||
|
{
|
||||||
|
bool found;
|
||||||
|
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
|
||||||
|
pagestore_shared = ShmemInitStruct("libpagestore shared state",
|
||||||
|
PagestoreShmemSize(),
|
||||||
|
&found);
|
||||||
|
if(!found)
|
||||||
|
{
|
||||||
|
pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
|
||||||
|
pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
|
||||||
|
AssignPageserverConnstring(page_server_connstring, NULL);
|
||||||
|
}
|
||||||
|
LWLockRelease(AddinShmemInitLock);
|
||||||
|
return found;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
pagestore_shmem_startup_hook(void)
|
||||||
|
{
|
||||||
|
if(prev_shmem_startup_hook)
|
||||||
|
prev_shmem_startup_hook();
|
||||||
|
|
||||||
|
PagestoreShmemInit();
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
pagestore_shmem_request(void)
|
||||||
|
{
|
||||||
|
#if PG_VERSION_NUM >= 150000
|
||||||
|
if(prev_shmem_request_hook)
|
||||||
|
prev_shmem_request_hook();
|
||||||
|
#endif
|
||||||
|
|
||||||
|
RequestAddinShmemSpace(PagestoreShmemSize());
|
||||||
|
RequestNamedLWLockTranche("neon_libpagestore", 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
pagestore_prepare_shmem(void)
|
||||||
|
{
|
||||||
|
#if PG_VERSION_NUM >= 150000
|
||||||
|
prev_shmem_request_hook = shmem_request_hook;
|
||||||
|
shmem_request_hook = pagestore_shmem_request;
|
||||||
|
#else
|
||||||
|
pagestore_shmem_request();
|
||||||
|
#endif
|
||||||
|
prev_shmem_startup_hook = shmem_startup_hook;
|
||||||
|
shmem_startup_hook = pagestore_shmem_startup_hook;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Module initialization function
|
* Module initialization function
|
||||||
*/
|
*/
|
||||||
void
|
void
|
||||||
pg_init_libpagestore(void)
|
pg_init_libpagestore(void)
|
||||||
{
|
{
|
||||||
|
pagestore_prepare_shmem();
|
||||||
|
|
||||||
DefineCustomStringVariable("neon.pageserver_connstring",
|
DefineCustomStringVariable("neon.pageserver_connstring",
|
||||||
"connection string to the page server",
|
"connection string to the page server",
|
||||||
NULL,
|
NULL,
|
||||||
@@ -418,7 +534,7 @@ pg_init_libpagestore(void)
|
|||||||
"",
|
"",
|
||||||
PGC_SIGHUP,
|
PGC_SIGHUP,
|
||||||
0, /* no flags required */
|
0, /* no flags required */
|
||||||
NULL, NULL, NULL);
|
CheckPageserverConnstring, AssignPageserverConnstring, NULL);
|
||||||
|
|
||||||
DefineCustomStringVariable("neon.timeline_id",
|
DefineCustomStringVariable("neon.timeline_id",
|
||||||
"Neon timeline_id the server is running on",
|
"Neon timeline_id the server is running on",
|
||||||
@@ -499,7 +615,5 @@ pg_init_libpagestore(void)
|
|||||||
redo_read_buffer_filter = neon_redo_read_buffer_filter;
|
redo_read_buffer_filter = neon_redo_read_buffer_filter;
|
||||||
}
|
}
|
||||||
|
|
||||||
prev_signal_handler = pqsignal(SIGHUP, pageserver_sighup_handler);
|
|
||||||
|
|
||||||
lfc_init();
|
lfc_init();
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
//! User credentials used in authentication.
|
//! User credentials used in authentication.
|
||||||
|
|
||||||
use crate::{auth::password_hack::parse_endpoint_param, error::UserFacingError};
|
use crate::{
|
||||||
|
auth::password_hack::parse_endpoint_param, error::UserFacingError, proxy::neon_options,
|
||||||
|
};
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use pq_proto::StartupMessageParams;
|
use pq_proto::StartupMessageParams;
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
@@ -38,6 +40,8 @@ pub struct ClientCredentials<'a> {
|
|||||||
pub user: &'a str,
|
pub user: &'a str,
|
||||||
// TODO: this is a severe misnomer! We should think of a new name ASAP.
|
// TODO: this is a severe misnomer! We should think of a new name ASAP.
|
||||||
pub project: Option<String>,
|
pub project: Option<String>,
|
||||||
|
|
||||||
|
pub cache_key: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ClientCredentials<'_> {
|
impl ClientCredentials<'_> {
|
||||||
@@ -53,6 +57,7 @@ impl<'a> ClientCredentials<'a> {
|
|||||||
ClientCredentials {
|
ClientCredentials {
|
||||||
user: "",
|
user: "",
|
||||||
project: None,
|
project: None,
|
||||||
|
cache_key: "".to_string(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -120,7 +125,17 @@ impl<'a> ClientCredentials<'a> {
|
|||||||
|
|
||||||
info!(user, project = project.as_deref(), "credentials");
|
info!(user, project = project.as_deref(), "credentials");
|
||||||
|
|
||||||
Ok(Self { user, project })
|
let cache_key = format!(
|
||||||
|
"{}{}",
|
||||||
|
project.as_deref().unwrap_or(""),
|
||||||
|
neon_options(params).unwrap_or("".to_string())
|
||||||
|
);
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
user,
|
||||||
|
project,
|
||||||
|
cache_key,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -176,6 +191,7 @@ mod tests {
|
|||||||
let creds = ClientCredentials::parse(&options, sni, common_names)?;
|
let creds = ClientCredentials::parse(&options, sni, common_names)?;
|
||||||
assert_eq!(creds.user, "john_doe");
|
assert_eq!(creds.user, "john_doe");
|
||||||
assert_eq!(creds.project.as_deref(), Some("foo"));
|
assert_eq!(creds.project.as_deref(), Some("foo"));
|
||||||
|
assert_eq!(creds.cache_key, "foo");
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -303,4 +319,23 @@ mod tests {
|
|||||||
_ => panic!("bad error: {err:?}"),
|
_ => panic!("bad error: {err:?}"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn parse_neon_options() -> anyhow::Result<()> {
|
||||||
|
let options = StartupMessageParams::new([
|
||||||
|
("user", "john_doe"),
|
||||||
|
("options", "neon_lsn:0/2 neon_endpoint_type:read_write"),
|
||||||
|
]);
|
||||||
|
|
||||||
|
let sni = Some("project.localhost");
|
||||||
|
let common_names = Some(["localhost".into()].into());
|
||||||
|
let creds = ClientCredentials::parse(&options, sni, common_names)?;
|
||||||
|
assert_eq!(creds.project.as_deref(), Some("project"));
|
||||||
|
assert_eq!(
|
||||||
|
creds.cache_key,
|
||||||
|
"projectneon_endpoint_type:read_write neon_lsn:0/2"
|
||||||
|
);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -80,6 +80,9 @@ struct ProxyCliArgs {
|
|||||||
/// cache for `wake_compute` api method (use `size=0` to disable)
|
/// cache for `wake_compute` api method (use `size=0` to disable)
|
||||||
#[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)]
|
#[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)]
|
||||||
wake_compute_cache: String,
|
wake_compute_cache: String,
|
||||||
|
/// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
|
||||||
|
#[clap(long, default_value = config::WakeComputeLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)]
|
||||||
|
wake_compute_lock: String,
|
||||||
/// Allow self-signed certificates for compute nodes (for testing)
|
/// Allow self-signed certificates for compute nodes (for testing)
|
||||||
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||||
allow_self_signed_compute: bool,
|
allow_self_signed_compute: bool,
|
||||||
@@ -220,10 +223,23 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
|||||||
node_info: console::caches::NodeInfoCache::new("node_info_cache", size, ttl),
|
node_info: console::caches::NodeInfoCache::new("node_info_cache", size, ttl),
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
let config::WakeComputeLockOptions {
|
||||||
|
shards,
|
||||||
|
permits,
|
||||||
|
epoch,
|
||||||
|
timeout,
|
||||||
|
} = args.wake_compute_lock.parse()?;
|
||||||
|
info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)");
|
||||||
|
let locks = Box::leak(Box::new(
|
||||||
|
console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout)
|
||||||
|
.unwrap(),
|
||||||
|
));
|
||||||
|
tokio::spawn(locks.garbage_collect_worker(epoch));
|
||||||
|
|
||||||
let url = args.auth_endpoint.parse()?;
|
let url = args.auth_endpoint.parse()?;
|
||||||
let endpoint = http::Endpoint::new(url, http::new_client());
|
let endpoint = http::Endpoint::new(url, http::new_client());
|
||||||
|
|
||||||
let api = console::provider::neon::Api::new(endpoint, caches);
|
let api = console::provider::neon::Api::new(endpoint, caches, locks);
|
||||||
auth::BackendType::Console(Cow::Owned(api), ())
|
auth::BackendType::Console(Cow::Owned(api), ())
|
||||||
}
|
}
|
||||||
AuthBackend::Postgres => {
|
AuthBackend::Postgres => {
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ use crate::{
|
|||||||
cancellation::CancelClosure,
|
cancellation::CancelClosure,
|
||||||
console::errors::WakeComputeError,
|
console::errors::WakeComputeError,
|
||||||
error::{io_error, UserFacingError},
|
error::{io_error, UserFacingError},
|
||||||
|
proxy::is_neon_param,
|
||||||
};
|
};
|
||||||
use futures::{FutureExt, TryFutureExt};
|
use futures::{FutureExt, TryFutureExt};
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
@@ -278,7 +279,7 @@ fn filtered_options(params: &StartupMessageParams) -> Option<String> {
|
|||||||
#[allow(unstable_name_collisions)]
|
#[allow(unstable_name_collisions)]
|
||||||
let options: String = params
|
let options: String = params
|
||||||
.options_raw()?
|
.options_raw()?
|
||||||
.filter(|opt| parse_endpoint_param(opt).is_none())
|
.filter(|opt| parse_endpoint_param(opt).is_none() && !is_neon_param(opt))
|
||||||
.intersperse(" ") // TODO: use impl from std once it's stabilized
|
.intersperse(" ") // TODO: use impl from std once it's stabilized
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
@@ -313,5 +314,11 @@ mod tests {
|
|||||||
|
|
||||||
let params = StartupMessageParams::new([("options", "project = foo")]);
|
let params = StartupMessageParams::new([("options", "project = foo")]);
|
||||||
assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo"));
|
assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo"));
|
||||||
|
|
||||||
|
let params = StartupMessageParams::new([(
|
||||||
|
"options",
|
||||||
|
"project = foo neon_endpoint_type:read_write neon_lsn:0/2",
|
||||||
|
)]);
|
||||||
|
assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -264,6 +264,79 @@ impl FromStr for CacheOptions {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Helper for cmdline lock options parsing.
|
||||||
|
pub struct WakeComputeLockOptions {
|
||||||
|
/// The number of shards the lock map should have
|
||||||
|
pub shards: usize,
|
||||||
|
/// The number of allowed concurrent requests for each endpoint
|
||||||
|
pub permits: usize,
|
||||||
|
/// Garbage collection epoch
|
||||||
|
pub epoch: Duration,
|
||||||
|
/// Lock timeout
|
||||||
|
pub timeout: Duration,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl WakeComputeLockOptions {
|
||||||
|
/// Default options for [`crate::console::provider::ApiLocks`].
|
||||||
|
pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0";
|
||||||
|
|
||||||
|
// pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s";
|
||||||
|
|
||||||
|
/// Parse lock options passed via cmdline.
|
||||||
|
/// Example: [`Self::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK`].
|
||||||
|
fn parse(options: &str) -> anyhow::Result<Self> {
|
||||||
|
let mut shards = None;
|
||||||
|
let mut permits = None;
|
||||||
|
let mut epoch = None;
|
||||||
|
let mut timeout = None;
|
||||||
|
|
||||||
|
for option in options.split(',') {
|
||||||
|
let (key, value) = option
|
||||||
|
.split_once('=')
|
||||||
|
.with_context(|| format!("bad key-value pair: {option}"))?;
|
||||||
|
|
||||||
|
match key {
|
||||||
|
"shards" => shards = Some(value.parse()?),
|
||||||
|
"permits" => permits = Some(value.parse()?),
|
||||||
|
"epoch" => epoch = Some(humantime::parse_duration(value)?),
|
||||||
|
"timeout" => timeout = Some(humantime::parse_duration(value)?),
|
||||||
|
unknown => bail!("unknown key: {unknown}"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// these don't matter if the lock is disabled
|
||||||
|
if let Some(0) = permits {
|
||||||
|
timeout = Some(Duration::default());
|
||||||
|
epoch = Some(Duration::default());
|
||||||
|
shards = Some(2);
|
||||||
|
}
|
||||||
|
|
||||||
|
let out = Self {
|
||||||
|
shards: shards.context("missing `shards`")?,
|
||||||
|
permits: permits.context("missing `permits`")?,
|
||||||
|
epoch: epoch.context("missing `epoch`")?,
|
||||||
|
timeout: timeout.context("missing `timeout`")?,
|
||||||
|
};
|
||||||
|
|
||||||
|
ensure!(out.shards > 1, "shard count must be > 1");
|
||||||
|
ensure!(
|
||||||
|
out.shards.is_power_of_two(),
|
||||||
|
"shard count must be a power of two"
|
||||||
|
);
|
||||||
|
|
||||||
|
Ok(out)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FromStr for WakeComputeLockOptions {
|
||||||
|
type Err = anyhow::Error;
|
||||||
|
|
||||||
|
fn from_str(options: &str) -> Result<Self, Self::Err> {
|
||||||
|
let error = || format!("failed to parse cache lock options '{options}'");
|
||||||
|
Self::parse(options).with_context(error)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
@@ -288,4 +361,42 @@ mod tests {
|
|||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_parse_lock_options() -> anyhow::Result<()> {
|
||||||
|
let WakeComputeLockOptions {
|
||||||
|
epoch,
|
||||||
|
permits,
|
||||||
|
shards,
|
||||||
|
timeout,
|
||||||
|
} = "shards=32,permits=4,epoch=10m,timeout=1s".parse()?;
|
||||||
|
assert_eq!(epoch, Duration::from_secs(10 * 60));
|
||||||
|
assert_eq!(timeout, Duration::from_secs(1));
|
||||||
|
assert_eq!(shards, 32);
|
||||||
|
assert_eq!(permits, 4);
|
||||||
|
|
||||||
|
let WakeComputeLockOptions {
|
||||||
|
epoch,
|
||||||
|
permits,
|
||||||
|
shards,
|
||||||
|
timeout,
|
||||||
|
} = "epoch=60s,shards=16,timeout=100ms,permits=8".parse()?;
|
||||||
|
assert_eq!(epoch, Duration::from_secs(60));
|
||||||
|
assert_eq!(timeout, Duration::from_millis(100));
|
||||||
|
assert_eq!(shards, 16);
|
||||||
|
assert_eq!(permits, 8);
|
||||||
|
|
||||||
|
let WakeComputeLockOptions {
|
||||||
|
epoch,
|
||||||
|
permits,
|
||||||
|
shards,
|
||||||
|
timeout,
|
||||||
|
} = "permits=0".parse()?;
|
||||||
|
assert_eq!(epoch, Duration::ZERO);
|
||||||
|
assert_eq!(timeout, Duration::ZERO);
|
||||||
|
assert_eq!(shards, 2);
|
||||||
|
assert_eq!(permits, 0);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -13,5 +13,10 @@ pub mod caches {
|
|||||||
pub use super::provider::{ApiCaches, NodeInfoCache};
|
pub use super::provider::{ApiCaches, NodeInfoCache};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Various lock-related types.
|
||||||
|
pub mod locks {
|
||||||
|
pub use super::provider::ApiLocks;
|
||||||
|
}
|
||||||
|
|
||||||
/// Console's management API.
|
/// Console's management API.
|
||||||
pub mod mgmt;
|
pub mod mgmt;
|
||||||
|
|||||||
@@ -8,7 +8,13 @@ use crate::{
|
|||||||
compute, scram,
|
compute, scram,
|
||||||
};
|
};
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use std::sync::Arc;
|
use dashmap::DashMap;
|
||||||
|
use std::{sync::Arc, time::Duration};
|
||||||
|
use tokio::{
|
||||||
|
sync::{OwnedSemaphorePermit, Semaphore},
|
||||||
|
time::Instant,
|
||||||
|
};
|
||||||
|
use tracing::info;
|
||||||
|
|
||||||
pub mod errors {
|
pub mod errors {
|
||||||
use crate::{
|
use crate::{
|
||||||
@@ -149,6 +155,9 @@ pub mod errors {
|
|||||||
|
|
||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
ApiError(ApiError),
|
ApiError(ApiError),
|
||||||
|
|
||||||
|
#[error("Timeout waiting to acquire wake compute lock")]
|
||||||
|
TimeoutError,
|
||||||
}
|
}
|
||||||
|
|
||||||
// This allows more useful interactions than `#[from]`.
|
// This allows more useful interactions than `#[from]`.
|
||||||
@@ -158,6 +167,17 @@ pub mod errors {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl From<tokio::sync::AcquireError> for WakeComputeError {
|
||||||
|
fn from(_: tokio::sync::AcquireError) -> Self {
|
||||||
|
WakeComputeError::TimeoutError
|
||||||
|
}
|
||||||
|
}
|
||||||
|
impl From<tokio::time::error::Elapsed> for WakeComputeError {
|
||||||
|
fn from(_: tokio::time::error::Elapsed) -> Self {
|
||||||
|
WakeComputeError::TimeoutError
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl UserFacingError for WakeComputeError {
|
impl UserFacingError for WakeComputeError {
|
||||||
fn to_string_client(&self) -> String {
|
fn to_string_client(&self) -> String {
|
||||||
use WakeComputeError::*;
|
use WakeComputeError::*;
|
||||||
@@ -167,6 +187,8 @@ pub mod errors {
|
|||||||
BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
|
BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
|
||||||
// However, API might return a meaningful error.
|
// However, API might return a meaningful error.
|
||||||
ApiError(e) => e.to_string_client(),
|
ApiError(e) => e.to_string_client(),
|
||||||
|
|
||||||
|
TimeoutError => "timeout while acquiring the compute resource lock".to_owned(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -178,6 +200,7 @@ pub struct ConsoleReqExtra<'a> {
|
|||||||
pub session_id: uuid::Uuid,
|
pub session_id: uuid::Uuid,
|
||||||
/// Name of client application, if set.
|
/// Name of client application, if set.
|
||||||
pub application_name: Option<&'a str>,
|
pub application_name: Option<&'a str>,
|
||||||
|
pub options: Option<&'a str>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Auth secret which is managed by the cloud.
|
/// Auth secret which is managed by the cloud.
|
||||||
@@ -232,3 +255,145 @@ pub struct ApiCaches {
|
|||||||
/// Cache for the `wake_compute` API method.
|
/// Cache for the `wake_compute` API method.
|
||||||
pub node_info: NodeInfoCache,
|
pub node_info: NodeInfoCache,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Various caches for [`console`](super).
|
||||||
|
pub struct ApiLocks {
|
||||||
|
name: &'static str,
|
||||||
|
node_locks: DashMap<Arc<str>, Arc<Semaphore>>,
|
||||||
|
permits: usize,
|
||||||
|
timeout: Duration,
|
||||||
|
registered: prometheus::IntCounter,
|
||||||
|
unregistered: prometheus::IntCounter,
|
||||||
|
reclamation_lag: prometheus::Histogram,
|
||||||
|
lock_acquire_lag: prometheus::Histogram,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ApiLocks {
|
||||||
|
pub fn new(
|
||||||
|
name: &'static str,
|
||||||
|
permits: usize,
|
||||||
|
shards: usize,
|
||||||
|
timeout: Duration,
|
||||||
|
) -> prometheus::Result<Self> {
|
||||||
|
let registered = prometheus::IntCounter::with_opts(
|
||||||
|
prometheus::Opts::new(
|
||||||
|
"semaphores_registered",
|
||||||
|
"Number of semaphores registered in this api lock",
|
||||||
|
)
|
||||||
|
.namespace(name),
|
||||||
|
)?;
|
||||||
|
prometheus::register(Box::new(registered.clone()))?;
|
||||||
|
let unregistered = prometheus::IntCounter::with_opts(
|
||||||
|
prometheus::Opts::new(
|
||||||
|
"semaphores_unregistered",
|
||||||
|
"Number of semaphores unregistered in this api lock",
|
||||||
|
)
|
||||||
|
.namespace(name),
|
||||||
|
)?;
|
||||||
|
prometheus::register(Box::new(unregistered.clone()))?;
|
||||||
|
let reclamation_lag = prometheus::Histogram::with_opts(
|
||||||
|
prometheus::HistogramOpts::new(
|
||||||
|
"reclamation_lag_seconds",
|
||||||
|
"Time it takes to reclaim unused semaphores in the api lock",
|
||||||
|
)
|
||||||
|
.namespace(name)
|
||||||
|
// 1us -> 65ms
|
||||||
|
// benchmarks on my mac indicate it's usually in the range of 256us and 512us
|
||||||
|
.buckets(prometheus::exponential_buckets(1e-6, 2.0, 16)?),
|
||||||
|
)?;
|
||||||
|
prometheus::register(Box::new(reclamation_lag.clone()))?;
|
||||||
|
let lock_acquire_lag = prometheus::Histogram::with_opts(
|
||||||
|
prometheus::HistogramOpts::new(
|
||||||
|
"semaphore_acquire_seconds",
|
||||||
|
"Time it takes to reclaim unused semaphores in the api lock",
|
||||||
|
)
|
||||||
|
.namespace(name)
|
||||||
|
// 0.1ms -> 6s
|
||||||
|
.buckets(prometheus::exponential_buckets(1e-4, 2.0, 16)?),
|
||||||
|
)?;
|
||||||
|
prometheus::register(Box::new(lock_acquire_lag.clone()))?;
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
name,
|
||||||
|
node_locks: DashMap::with_shard_amount(shards),
|
||||||
|
permits,
|
||||||
|
timeout,
|
||||||
|
lock_acquire_lag,
|
||||||
|
registered,
|
||||||
|
unregistered,
|
||||||
|
reclamation_lag,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn get_wake_compute_permit(
|
||||||
|
&self,
|
||||||
|
key: &Arc<str>,
|
||||||
|
) -> Result<WakeComputePermit, errors::WakeComputeError> {
|
||||||
|
if self.permits == 0 {
|
||||||
|
return Ok(WakeComputePermit { permit: None });
|
||||||
|
}
|
||||||
|
let now = Instant::now();
|
||||||
|
let semaphore = {
|
||||||
|
// get fast path
|
||||||
|
if let Some(semaphore) = self.node_locks.get(key) {
|
||||||
|
semaphore.clone()
|
||||||
|
} else {
|
||||||
|
self.node_locks
|
||||||
|
.entry(key.clone())
|
||||||
|
.or_insert_with(|| {
|
||||||
|
self.registered.inc();
|
||||||
|
Arc::new(Semaphore::new(self.permits))
|
||||||
|
})
|
||||||
|
.clone()
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await;
|
||||||
|
|
||||||
|
self.lock_acquire_lag
|
||||||
|
.observe((Instant::now() - now).as_secs_f64());
|
||||||
|
|
||||||
|
Ok(WakeComputePermit {
|
||||||
|
permit: Some(permit??),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) {
|
||||||
|
if self.permits == 0 {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut interval = tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32);
|
||||||
|
loop {
|
||||||
|
for (i, shard) in self.node_locks.shards().iter().enumerate() {
|
||||||
|
interval.tick().await;
|
||||||
|
// temporary lock a single shard and then clear any semaphores that aren't currently checked out
|
||||||
|
// race conditions: if strong_count == 1, there's no way that it can increase while the shard is locked
|
||||||
|
// therefore releasing it is safe from race conditions
|
||||||
|
info!(
|
||||||
|
name = self.name,
|
||||||
|
shard = i,
|
||||||
|
"performing epoch reclamation on api lock"
|
||||||
|
);
|
||||||
|
let mut lock = shard.write();
|
||||||
|
let timer = self.reclamation_lag.start_timer();
|
||||||
|
let count = lock
|
||||||
|
.extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1)
|
||||||
|
.count();
|
||||||
|
drop(lock);
|
||||||
|
self.unregistered.inc_by(count as u64);
|
||||||
|
timer.observe_duration()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct WakeComputePermit {
|
||||||
|
// None if the lock is disabled
|
||||||
|
permit: Option<OwnedSemaphorePermit>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl WakeComputePermit {
|
||||||
|
pub fn should_check_cache(&self) -> bool {
|
||||||
|
self.permit.is_some()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -3,12 +3,12 @@
|
|||||||
use super::{
|
use super::{
|
||||||
super::messages::{ConsoleError, GetRoleSecret, WakeCompute},
|
super::messages::{ConsoleError, GetRoleSecret, WakeCompute},
|
||||||
errors::{ApiError, GetAuthInfoError, WakeComputeError},
|
errors::{ApiError, GetAuthInfoError, WakeComputeError},
|
||||||
ApiCaches, AuthInfo, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
|
ApiCaches, ApiLocks, AuthInfo, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
|
||||||
};
|
};
|
||||||
use crate::{auth::ClientCredentials, compute, http, scram};
|
use crate::{auth::ClientCredentials, compute, http, scram};
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use futures::TryFutureExt;
|
use futures::TryFutureExt;
|
||||||
use std::net::SocketAddr;
|
use std::{net::SocketAddr, sync::Arc};
|
||||||
use tokio::time::Instant;
|
use tokio::time::Instant;
|
||||||
use tokio_postgres::config::SslMode;
|
use tokio_postgres::config::SslMode;
|
||||||
use tracing::{error, info, info_span, warn, Instrument};
|
use tracing::{error, info, info_span, warn, Instrument};
|
||||||
@@ -17,12 +17,17 @@ use tracing::{error, info, info_span, warn, Instrument};
|
|||||||
pub struct Api {
|
pub struct Api {
|
||||||
endpoint: http::Endpoint,
|
endpoint: http::Endpoint,
|
||||||
caches: &'static ApiCaches,
|
caches: &'static ApiCaches,
|
||||||
|
locks: &'static ApiLocks,
|
||||||
jwt: String,
|
jwt: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Api {
|
impl Api {
|
||||||
/// Construct an API object containing the auth parameters.
|
/// Construct an API object containing the auth parameters.
|
||||||
pub fn new(endpoint: http::Endpoint, caches: &'static ApiCaches) -> Self {
|
pub fn new(
|
||||||
|
endpoint: http::Endpoint,
|
||||||
|
caches: &'static ApiCaches,
|
||||||
|
locks: &'static ApiLocks,
|
||||||
|
) -> Self {
|
||||||
let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
|
let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
|
||||||
Ok(v) => v,
|
Ok(v) => v,
|
||||||
Err(_) => "".to_string(),
|
Err(_) => "".to_string(),
|
||||||
@@ -30,6 +35,7 @@ impl Api {
|
|||||||
Self {
|
Self {
|
||||||
endpoint,
|
endpoint,
|
||||||
caches,
|
caches,
|
||||||
|
locks,
|
||||||
jwt,
|
jwt,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -99,6 +105,7 @@ impl Api {
|
|||||||
.query(&[
|
.query(&[
|
||||||
("application_name", extra.application_name),
|
("application_name", extra.application_name),
|
||||||
("project", Some(project)),
|
("project", Some(project)),
|
||||||
|
("options", extra.options),
|
||||||
])
|
])
|
||||||
.build()?;
|
.build()?;
|
||||||
|
|
||||||
@@ -151,7 +158,7 @@ impl super::Api for Api {
|
|||||||
extra: &ConsoleReqExtra<'_>,
|
extra: &ConsoleReqExtra<'_>,
|
||||||
creds: &ClientCredentials,
|
creds: &ClientCredentials,
|
||||||
) -> Result<CachedNodeInfo, WakeComputeError> {
|
) -> Result<CachedNodeInfo, WakeComputeError> {
|
||||||
let key = creds.project().expect("impossible");
|
let key: &str = &creds.cache_key;
|
||||||
|
|
||||||
// Every time we do a wakeup http request, the compute node will stay up
|
// Every time we do a wakeup http request, the compute node will stay up
|
||||||
// for some time (highly depends on the console's scale-to-zero policy);
|
// for some time (highly depends on the console's scale-to-zero policy);
|
||||||
@@ -162,9 +169,22 @@ impl super::Api for Api {
|
|||||||
return Ok(cached);
|
return Ok(cached);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let key: Arc<str> = key.into();
|
||||||
|
|
||||||
|
let permit = self.locks.get_wake_compute_permit(&key).await?;
|
||||||
|
|
||||||
|
// after getting back a permit - it's possible the cache was filled
|
||||||
|
// double check
|
||||||
|
if permit.should_check_cache() {
|
||||||
|
if let Some(cached) = self.caches.node_info.get(&key) {
|
||||||
|
info!(key = &*key, "found cached compute node info");
|
||||||
|
return Ok(cached);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let node = self.do_wake_compute(extra, creds).await?;
|
let node = self.do_wake_compute(extra, creds).await?;
|
||||||
let (_, cached) = self.caches.node_info.insert(key.into(), node);
|
let (_, cached) = self.caches.node_info.insert(key.clone(), node);
|
||||||
info!(key = key, "created a cache entry for compute node info");
|
info!(key = &*key, "created a cache entry for compute node info");
|
||||||
|
|
||||||
Ok(cached)
|
Ok(cached)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||||
|
|
||||||
use std::convert::Infallible;
|
use std::convert::Infallible;
|
||||||
|
|
||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
|
|||||||
@@ -3,10 +3,9 @@
|
|||||||
use std::ffi::CStr;
|
use std::ffi::CStr;
|
||||||
|
|
||||||
pub fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> {
|
pub fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> {
|
||||||
let pos = bytes.iter().position(|&x| x == 0)?;
|
let cstr = CStr::from_bytes_until_nul(bytes).ok()?;
|
||||||
let (cstr, other) = bytes.split_at(pos + 1);
|
let (_, other) = bytes.split_at(cstr.to_bytes_with_nul().len());
|
||||||
// SAFETY: we've already checked that there's a terminator
|
Some((cstr, other))
|
||||||
Some((unsafe { CStr::from_bytes_with_nul_unchecked(cstr) }, other))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// See <https://doc.rust-lang.org/std/primitive.slice.html#method.split_array_ref>.
|
/// See <https://doc.rust-lang.org/std/primitive.slice.html#method.split_array_ref>.
|
||||||
|
|||||||
@@ -15,10 +15,12 @@ use crate::{
|
|||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use futures::TryFutureExt;
|
use futures::TryFutureExt;
|
||||||
|
use itertools::Itertools;
|
||||||
use metrics::{exponential_buckets, register_int_counter_vec, IntCounterVec};
|
use metrics::{exponential_buckets, register_int_counter_vec, IntCounterVec};
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::{Lazy, OnceCell};
|
||||||
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
|
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
|
||||||
use prometheus::{register_histogram_vec, HistogramVec};
|
use prometheus::{register_histogram_vec, HistogramVec};
|
||||||
|
use regex::Regex;
|
||||||
use std::{error::Error, io, ops::ControlFlow, sync::Arc, time::Instant};
|
use std::{error::Error, io, ops::ControlFlow, sync::Arc, time::Instant};
|
||||||
use tokio::{
|
use tokio::{
|
||||||
io::{AsyncRead, AsyncWrite, AsyncWriteExt},
|
io::{AsyncRead, AsyncWrite, AsyncWriteExt},
|
||||||
@@ -568,6 +570,7 @@ fn report_error(e: &WakeComputeError, retry: bool) {
|
|||||||
"api_console_other_server_error"
|
"api_console_other_server_error"
|
||||||
}
|
}
|
||||||
WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error",
|
WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error",
|
||||||
|
WakeComputeError::TimeoutError => "timeout_error",
|
||||||
};
|
};
|
||||||
NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc();
|
NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc();
|
||||||
}
|
}
|
||||||
@@ -881,9 +884,12 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
|
|||||||
allow_self_signed_compute,
|
allow_self_signed_compute,
|
||||||
} = self;
|
} = self;
|
||||||
|
|
||||||
|
let console_options = neon_options(params);
|
||||||
|
|
||||||
let extra = console::ConsoleReqExtra {
|
let extra = console::ConsoleReqExtra {
|
||||||
session_id, // aka this connection's id
|
session_id, // aka this connection's id
|
||||||
application_name: params.get("application_name"),
|
application_name: params.get("application_name"),
|
||||||
|
options: console_options.as_deref(),
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut latency_timer = LatencyTimer::new(mode.protocol_label());
|
let mut latency_timer = LatencyTimer::new(mode.protocol_label());
|
||||||
@@ -945,3 +951,27 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
|
|||||||
proxy_pass(stream, node.stream, &aux).await
|
proxy_pass(stream, node.stream, &aux).await
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn neon_options(params: &StartupMessageParams) -> Option<String> {
|
||||||
|
#[allow(unstable_name_collisions)]
|
||||||
|
let options: String = params
|
||||||
|
.options_raw()?
|
||||||
|
.filter(|opt| is_neon_param(opt))
|
||||||
|
.sorted() // we sort it to use as cache key
|
||||||
|
.intersperse(" ") // TODO: use impl from std once it's stabilized
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
// Don't even bother with empty options.
|
||||||
|
if options.is_empty() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
Some(options)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_neon_param(bytes: &str) -> bool {
|
||||||
|
static RE: OnceCell<Regex> = OnceCell::new();
|
||||||
|
RE.get_or_init(|| Regex::new(r"^neon_\w+:").unwrap());
|
||||||
|
|
||||||
|
RE.get().unwrap().is_match(bytes)
|
||||||
|
}
|
||||||
|
|||||||
@@ -440,6 +440,7 @@ fn helper_create_connect_info(
|
|||||||
let extra = console::ConsoleReqExtra {
|
let extra = console::ConsoleReqExtra {
|
||||||
session_id: uuid::Uuid::new_v4(),
|
session_id: uuid::Uuid::new_v4(),
|
||||||
application_name: Some("TEST"),
|
application_name: Some("TEST"),
|
||||||
|
options: None,
|
||||||
};
|
};
|
||||||
let creds = auth::BackendType::Test(mechanism);
|
let creds = auth::BackendType::Test(mechanism);
|
||||||
(cache, extra, creds)
|
(cache, extra, creds)
|
||||||
|
|||||||
@@ -22,7 +22,10 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
|
|||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
auth, console,
|
auth, console,
|
||||||
proxy::{LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER, NUM_DB_CONNECTIONS_OPENED_COUNTER},
|
proxy::{
|
||||||
|
neon_options, LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER,
|
||||||
|
NUM_DB_CONNECTIONS_OPENED_COUNTER,
|
||||||
|
},
|
||||||
usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
|
usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
|
||||||
};
|
};
|
||||||
use crate::{compute, config};
|
use crate::{compute, config};
|
||||||
@@ -41,6 +44,7 @@ pub struct ConnInfo {
|
|||||||
pub dbname: String,
|
pub dbname: String,
|
||||||
pub hostname: String,
|
pub hostname: String,
|
||||||
pub password: String,
|
pub password: String,
|
||||||
|
pub options: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ConnInfo {
|
impl ConnInfo {
|
||||||
@@ -401,26 +405,25 @@ async fn connect_to_compute(
|
|||||||
let tls = config.tls_config.as_ref();
|
let tls = config.tls_config.as_ref();
|
||||||
let common_names = tls.and_then(|tls| tls.common_names.clone());
|
let common_names = tls.and_then(|tls| tls.common_names.clone());
|
||||||
|
|
||||||
let credential_params = StartupMessageParams::new([
|
let params = StartupMessageParams::new([
|
||||||
("user", &conn_info.username),
|
("user", &conn_info.username),
|
||||||
("database", &conn_info.dbname),
|
("database", &conn_info.dbname),
|
||||||
("application_name", APP_NAME),
|
("application_name", APP_NAME),
|
||||||
|
("options", conn_info.options.as_deref().unwrap_or("")),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
let creds = config
|
let creds = config
|
||||||
.auth_backend
|
.auth_backend
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.map(|_| {
|
.map(|_| auth::ClientCredentials::parse(¶ms, Some(&conn_info.hostname), common_names))
|
||||||
auth::ClientCredentials::parse(
|
|
||||||
&credential_params,
|
|
||||||
Some(&conn_info.hostname),
|
|
||||||
common_names,
|
|
||||||
)
|
|
||||||
})
|
|
||||||
.transpose()?;
|
.transpose()?;
|
||||||
|
|
||||||
|
let console_options = neon_options(¶ms);
|
||||||
|
|
||||||
let extra = console::ConsoleReqExtra {
|
let extra = console::ConsoleReqExtra {
|
||||||
session_id: uuid::Uuid::new_v4(),
|
session_id: uuid::Uuid::new_v4(),
|
||||||
application_name: Some(APP_NAME),
|
application_name: Some(APP_NAME),
|
||||||
|
options: console_options.as_deref(),
|
||||||
};
|
};
|
||||||
|
|
||||||
let node_info = creds
|
let node_info = creds
|
||||||
|
|||||||
@@ -174,11 +174,23 @@ fn get_conn_info(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let pairs = connection_url.query_pairs();
|
||||||
|
|
||||||
|
let mut options = Option::None;
|
||||||
|
|
||||||
|
for (key, value) in pairs {
|
||||||
|
if key == "options" {
|
||||||
|
options = Some(value.to_string());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Ok(ConnInfo {
|
Ok(ConnInfo {
|
||||||
username: username.to_owned(),
|
username: username.to_owned(),
|
||||||
dbname: dbname.to_owned(),
|
dbname: dbname.to_owned(),
|
||||||
hostname: hostname.to_owned(),
|
hostname: hostname.to_owned(),
|
||||||
password: password.to_owned(),
|
password: password.to_owned(),
|
||||||
|
options,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
#![deny(unsafe_code)]
|
||||||
|
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||||
pub mod checks;
|
pub mod checks;
|
||||||
pub mod cloud_admin_api;
|
pub mod cloud_admin_api;
|
||||||
pub mod garbage;
|
pub mod garbage;
|
||||||
|
|||||||
@@ -1,19 +1,20 @@
|
|||||||
use anyhow::{bail, Result};
|
use utils::auth::{AuthError, Claims, Scope};
|
||||||
use utils::auth::{Claims, Scope};
|
|
||||||
use utils::id::TenantId;
|
use utils::id::TenantId;
|
||||||
|
|
||||||
pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<()> {
|
pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
|
||||||
match (&claims.scope, tenant_id) {
|
match (&claims.scope, tenant_id) {
|
||||||
(Scope::Tenant, None) => {
|
(Scope::Tenant, None) => Err(AuthError(
|
||||||
bail!("Attempt to access management api with tenant scope. Permission denied")
|
"Attempt to access management api with tenant scope. Permission denied".into(),
|
||||||
}
|
)),
|
||||||
(Scope::Tenant, Some(tenant_id)) => {
|
(Scope::Tenant, Some(tenant_id)) => {
|
||||||
if claims.tenant_id.unwrap() != tenant_id {
|
if claims.tenant_id.unwrap() != tenant_id {
|
||||||
bail!("Tenant id mismatch. Permission denied")
|
return Err(AuthError("Tenant id mismatch. Permission denied".into()));
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
(Scope::PageServerApi, _) => bail!("PageServerApi scope makes no sense for Safekeeper"),
|
(Scope::PageServerApi, _) => Err(AuthError(
|
||||||
|
"PageServerApi scope makes no sense for Safekeeper".into(),
|
||||||
|
)),
|
||||||
(Scope::SafekeeperData, _) => Ok(()),
|
(Scope::SafekeeperData, _) => Ok(()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ use safekeeper::{http, WAL_REMOVER_RUNTIME};
|
|||||||
use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME};
|
use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME};
|
||||||
use safekeeper::{wal_backup, HTTP_RUNTIME};
|
use safekeeper::{wal_backup, HTTP_RUNTIME};
|
||||||
use storage_broker::DEFAULT_ENDPOINT;
|
use storage_broker::DEFAULT_ENDPOINT;
|
||||||
use utils::auth::{JwtAuth, Scope};
|
use utils::auth::{JwtAuth, Scope, SwappableJwtAuth};
|
||||||
use utils::{
|
use utils::{
|
||||||
id::NodeId,
|
id::NodeId,
|
||||||
logging::{self, LogFormat},
|
logging::{self, LogFormat},
|
||||||
@@ -251,10 +251,9 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
None
|
None
|
||||||
}
|
}
|
||||||
Some(path) => {
|
Some(path) => {
|
||||||
info!("loading http auth JWT key from {path}");
|
info!("loading http auth JWT key(s) from {path}");
|
||||||
Some(Arc::new(
|
let jwt_auth = JwtAuth::from_key_path(path).context("failed to load the auth key")?;
|
||||||
JwtAuth::from_key_path(path).context("failed to load the auth key")?,
|
Some(Arc::new(SwappableJwtAuth::new(jwt_auth)))
|
||||||
))
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ use std::str::FromStr;
|
|||||||
use std::str::{self};
|
use std::str::{self};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use tokio::io::{AsyncRead, AsyncWrite};
|
use tokio::io::{AsyncRead, AsyncWrite};
|
||||||
use tracing::{info, info_span, Instrument};
|
use tracing::{debug, info, info_span, Instrument};
|
||||||
|
|
||||||
use crate::auth::check_permission;
|
use crate::auth::check_permission;
|
||||||
use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};
|
use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};
|
||||||
@@ -165,26 +165,27 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
|
|||||||
.auth
|
.auth
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.expect("auth_type is configured but .auth of handler is missing");
|
.expect("auth_type is configured but .auth of handler is missing");
|
||||||
let data =
|
let data = auth
|
||||||
auth.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
|
.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)
|
||||||
|
.map_err(|e| QueryError::Unauthorized(e.0))?;
|
||||||
|
|
||||||
// The handler might be configured to allow only tenant scope tokens.
|
// The handler might be configured to allow only tenant scope tokens.
|
||||||
if matches!(allowed_auth_scope, Scope::Tenant)
|
if matches!(allowed_auth_scope, Scope::Tenant)
|
||||||
&& !matches!(data.claims.scope, Scope::Tenant)
|
&& !matches!(data.claims.scope, Scope::Tenant)
|
||||||
{
|
{
|
||||||
return Err(QueryError::Other(anyhow::anyhow!(
|
return Err(QueryError::Unauthorized(
|
||||||
"passed JWT token is for full access, but only tenant scope is allowed"
|
"passed JWT token is for full access, but only tenant scope is allowed".into(),
|
||||||
)));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
|
if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
|
||||||
return Err(QueryError::Other(anyhow::anyhow!(
|
return Err(QueryError::Unauthorized(
|
||||||
"jwt token scope is Tenant, but tenant id is missing"
|
"jwt token scope is Tenant, but tenant id is missing".into(),
|
||||||
)));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
info!(
|
debug!(
|
||||||
"jwt auth succeeded for scope: {:#?} by tenant id: {:?}",
|
"jwt scope check succeeded for scope: {:#?} by tenant id: {:?}",
|
||||||
data.claims.scope, data.claims.tenant_id,
|
data.claims.scope, data.claims.tenant_id,
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -263,7 +264,7 @@ impl SafekeeperPostgresHandler {
|
|||||||
|
|
||||||
// when accessing management api supply None as an argument
|
// when accessing management api supply None as an argument
|
||||||
// when using to authorize tenant pass corresponding tenant id
|
// when using to authorize tenant pass corresponding tenant id
|
||||||
fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
|
fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<(), QueryError> {
|
||||||
if self.auth.is_none() {
|
if self.auth.is_none() {
|
||||||
// auth is set to Trust, nothing to check so just return ok
|
// auth is set to Trust, nothing to check so just return ok
|
||||||
return Ok(());
|
return Ok(());
|
||||||
@@ -275,7 +276,7 @@ impl SafekeeperPostgresHandler {
|
|||||||
.claims
|
.claims
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.expect("claims presence already checked");
|
.expect("claims presence already checked");
|
||||||
check_permission(claims, tenant_id)
|
check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0))
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn handle_timeline_status<IO: AsyncRead + AsyncWrite + Unpin>(
|
async fn handle_timeline_status<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||||
|
|||||||
@@ -86,6 +86,41 @@ paths:
|
|||||||
default:
|
default:
|
||||||
$ref: "#/components/responses/GenericError"
|
$ref: "#/components/responses/GenericError"
|
||||||
|
|
||||||
|
/v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy:
|
||||||
|
parameters:
|
||||||
|
- name: tenant_id
|
||||||
|
in: path
|
||||||
|
required: true
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
|
format: hex
|
||||||
|
- name: source_timeline_id
|
||||||
|
in: path
|
||||||
|
required: true
|
||||||
|
schema:
|
||||||
|
type: string
|
||||||
|
format: hex
|
||||||
|
|
||||||
|
post:
|
||||||
|
tags:
|
||||||
|
- "Timeline"
|
||||||
|
summary: Register new timeline as copy of existing timeline
|
||||||
|
description: ""
|
||||||
|
operationId: v1CopyTenantTimeline
|
||||||
|
requestBody:
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/TimelineCopyRequest"
|
||||||
|
responses:
|
||||||
|
"201":
|
||||||
|
description: Timeline created
|
||||||
|
# TODO: return timeline info?
|
||||||
|
"403":
|
||||||
|
$ref: "#/components/responses/ForbiddenError"
|
||||||
|
default:
|
||||||
|
$ref: "#/components/responses/GenericError"
|
||||||
|
|
||||||
|
|
||||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}:
|
/v1/tenant/{tenant_id}/timeline/{timeline_id}:
|
||||||
parameters:
|
parameters:
|
||||||
@@ -210,6 +245,18 @@ components:
|
|||||||
type: integer
|
type: integer
|
||||||
minimum: 0
|
minimum: 0
|
||||||
|
|
||||||
|
TimelineCopyRequest:
|
||||||
|
type: object
|
||||||
|
required:
|
||||||
|
- target_timeline_id
|
||||||
|
- until_lsn
|
||||||
|
properties:
|
||||||
|
target_timeline_id:
|
||||||
|
type: string
|
||||||
|
format: hex
|
||||||
|
until_lsn:
|
||||||
|
type: string
|
||||||
|
|
||||||
SkTimelineInfo:
|
SkTimelineInfo:
|
||||||
type: object
|
type: object
|
||||||
required:
|
required:
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ use crate::timelines_global_map::TimelineDeleteForceResult;
|
|||||||
use crate::GlobalTimelines;
|
use crate::GlobalTimelines;
|
||||||
use crate::SafeKeeperConf;
|
use crate::SafeKeeperConf;
|
||||||
use utils::{
|
use utils::{
|
||||||
auth::JwtAuth,
|
auth::SwappableJwtAuth,
|
||||||
http::{
|
http::{
|
||||||
endpoint::{self, auth_middleware, check_permission_with},
|
endpoint::{self, auth_middleware, check_permission_with},
|
||||||
error::ApiError,
|
error::ApiError,
|
||||||
@@ -428,8 +428,11 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
|
|||||||
if ALLOWLIST_ROUTES.contains(request.uri()) {
|
if ALLOWLIST_ROUTES.contains(request.uri()) {
|
||||||
None
|
None
|
||||||
} else {
|
} else {
|
||||||
// Option<Arc<JwtAuth>> is always provided as data below, hence unwrap().
|
// Option<Arc<SwappableJwtAuth>> is always provided as data below, hence unwrap().
|
||||||
request.data::<Option<Arc<JwtAuth>>>().unwrap().as_deref()
|
request
|
||||||
|
.data::<Option<Arc<SwappableJwtAuth>>>()
|
||||||
|
.unwrap()
|
||||||
|
.as_deref()
|
||||||
}
|
}
|
||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||||
use camino::Utf8PathBuf;
|
use camino::Utf8PathBuf;
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use remote_storage::RemoteStorageConfig;
|
use remote_storage::RemoteStorageConfig;
|
||||||
@@ -6,7 +7,10 @@ use tokio::runtime::Runtime;
|
|||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use storage_broker::Uri;
|
use storage_broker::Uri;
|
||||||
|
|
||||||
use utils::id::{NodeId, TenantId, TenantTimelineId};
|
use utils::{
|
||||||
|
auth::SwappableJwtAuth,
|
||||||
|
id::{NodeId, TenantId, TenantTimelineId},
|
||||||
|
};
|
||||||
|
|
||||||
mod auth;
|
mod auth;
|
||||||
pub mod broker;
|
pub mod broker;
|
||||||
@@ -69,7 +73,7 @@ pub struct SafeKeeperConf {
|
|||||||
pub wal_backup_enabled: bool,
|
pub wal_backup_enabled: bool,
|
||||||
pub pg_auth: Option<Arc<JwtAuth>>,
|
pub pg_auth: Option<Arc<JwtAuth>>,
|
||||||
pub pg_tenant_only_auth: Option<Arc<JwtAuth>>,
|
pub pg_tenant_only_auth: Option<Arc<JwtAuth>>,
|
||||||
pub http_auth: Option<Arc<JwtAuth>>,
|
pub http_auth: Option<Arc<SwappableJwtAuth>>,
|
||||||
pub current_thread_runtime: bool,
|
pub current_thread_runtime: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -111,7 +111,7 @@ impl WalReceivers {
|
|||||||
.count()
|
.count()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Unregister walsender.
|
/// Unregister walreceiver.
|
||||||
fn unregister(self: &Arc<WalReceivers>, id: WalReceiverId) {
|
fn unregister(self: &Arc<WalReceivers>, id: WalReceiverId) {
|
||||||
let mut shared = self.mutex.lock();
|
let mut shared = self.mutex.lock();
|
||||||
shared.slots[id] = None;
|
shared.slots[id] = None;
|
||||||
@@ -138,8 +138,8 @@ pub enum WalReceiverStatus {
|
|||||||
Streaming,
|
Streaming,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Scope guard to access slot in WalSenders registry and unregister from it in
|
/// Scope guard to access slot in WalReceivers registry and unregister from
|
||||||
/// Drop.
|
/// it in Drop.
|
||||||
pub struct WalReceiverGuard {
|
pub struct WalReceiverGuard {
|
||||||
id: WalReceiverId,
|
id: WalReceiverId,
|
||||||
walreceivers: Arc<WalReceivers>,
|
walreceivers: Arc<WalReceivers>,
|
||||||
|
|||||||
@@ -361,7 +361,6 @@ class PgProtocol:
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class AuthKeys:
|
class AuthKeys:
|
||||||
pub: str
|
|
||||||
priv: str
|
priv: str
|
||||||
|
|
||||||
def generate_token(self, *, scope: str, **token_data: str) -> str:
|
def generate_token(self, *, scope: str, **token_data: str) -> str:
|
||||||
@@ -626,6 +625,8 @@ class NeonEnvBuilder:
|
|||||||
sk.stop(immediate=True)
|
sk.stop(immediate=True)
|
||||||
|
|
||||||
for pageserver in self.env.pageservers:
|
for pageserver in self.env.pageservers:
|
||||||
|
pageserver.assert_no_metric_errors()
|
||||||
|
|
||||||
pageserver.stop(immediate=True)
|
pageserver.stop(immediate=True)
|
||||||
|
|
||||||
if self.env.attachment_service is not None:
|
if self.env.attachment_service is not None:
|
||||||
@@ -875,9 +876,31 @@ class NeonEnv:
|
|||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def auth_keys(self) -> AuthKeys:
|
def auth_keys(self) -> AuthKeys:
|
||||||
pub = (Path(self.repo_dir) / "auth_public_key.pem").read_text()
|
|
||||||
priv = (Path(self.repo_dir) / "auth_private_key.pem").read_text()
|
priv = (Path(self.repo_dir) / "auth_private_key.pem").read_text()
|
||||||
return AuthKeys(pub=pub, priv=priv)
|
return AuthKeys(priv=priv)
|
||||||
|
|
||||||
|
def regenerate_keys_at(self, privkey_path: Path, pubkey_path: Path) -> None:
    """
    Generate a fresh ed25519 key pair with openssl and forget the cached auth keys.

    The private key is written to `privkey_path` and the matching public key to
    `pubkey_path`. Both openssl commands run with `self.repo_dir` as their working
    directory, so relative paths are resolved against it.

    Raises subprocess.CalledProcessError if either openssl command fails.
    """
    # compare generate_auth_keys() in local_env.rs
    subprocess.run(
        ["openssl", "genpkey", "-algorithm", "ed25519", "-out", privkey_path],
        cwd=self.repo_dir,
        check=True,
    )

    subprocess.run(
        [
            "openssl",
            "pkey",
            "-in",
            privkey_path,
            "-pubout",
            "-out",
            pubkey_path,
        ],
        cwd=self.repo_dir,
        check=True,
    )

    # Invalidate the cached `auth_keys` value so the next access re-reads the key.
    # A plain `del self.auth_keys` raises AttributeError when the property has not
    # been read yet, because nothing is stored on the instance at that point.
    self.__dict__.pop("auth_keys", None)
|
||||||
|
|
||||||
def generate_endpoint_id(self) -> str:
|
def generate_endpoint_id(self) -> str:
|
||||||
"""
|
"""
|
||||||
@@ -1784,6 +1807,21 @@ class NeonPageserver(PgProtocol):
|
|||||||
|
|
||||||
assert not errors
|
assert not errors
|
||||||
|
|
||||||
|
def assert_no_metric_errors(self):
|
||||||
|
"""
|
||||||
|
Certain metrics should _always_ be zero: they track conditions that indicate a bug.
|
||||||
|
"""
|
||||||
|
if not self.running:
|
||||||
|
log.info(f"Skipping metrics check on pageserver {self.id}, it is not running")
|
||||||
|
return
|
||||||
|
|
||||||
|
for metric in [
|
||||||
|
"pageserver_tenant_manager_unexpected_errors_total",
|
||||||
|
"pageserver_deletion_queue_unexpected_errors_total",
|
||||||
|
]:
|
||||||
|
value = self.http_client().get_metric_value(metric)
|
||||||
|
assert value == 0, f"Nonzero {metric} == {value}"
|
||||||
|
|
||||||
def log_contains(self, pattern: str) -> Optional[str]:
|
def log_contains(self, pattern: str) -> Optional[str]:
|
||||||
"""Check that the pageserver log contains a line that matches the given regex"""
|
"""Check that the pageserver log contains a line that matches the given regex"""
|
||||||
logfile = open(os.path.join(self.workdir, "pageserver.log"), "r")
|
logfile = open(os.path.join(self.workdir, "pageserver.log"), "r")
|
||||||
|
|||||||
@@ -189,6 +189,10 @@ class PageserverHttpClient(requests.Session):
|
|||||||
assert res_json is None
|
assert res_json is None
|
||||||
return res_json
|
return res_json
|
||||||
|
|
||||||
|
def reload_auth_validation_keys(self):
|
||||||
|
res = self.post(f"http://localhost:{self.port}/v1/reload_auth_validation_keys")
|
||||||
|
self.verbose_error(res)
|
||||||
|
|
||||||
def tenant_list(self) -> List[Dict[Any, Any]]:
|
def tenant_list(self) -> List[Dict[Any, Any]]:
|
||||||
res = self.get(f"http://localhost:{self.port}/v1/tenant")
|
res = self.get(f"http://localhost:{self.port}/v1/tenant")
|
||||||
self.verbose_error(res)
|
self.verbose_error(res)
|
||||||
|
|||||||
@@ -1,12 +1,35 @@
|
|||||||
|
import os
|
||||||
from contextlib import closing
|
from contextlib import closing
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
import psycopg2
|
import psycopg2
|
||||||
import pytest
|
import pytest
|
||||||
from fixtures.neon_fixtures import NeonEnvBuilder, PgProtocol
|
from fixtures.neon_fixtures import (
|
||||||
from fixtures.pageserver.http import PageserverApiException
|
NeonEnv,
|
||||||
|
NeonEnvBuilder,
|
||||||
|
PgProtocol,
|
||||||
|
)
|
||||||
|
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
|
||||||
from fixtures.types import TenantId, TimelineId
|
from fixtures.types import TenantId, TimelineId
|
||||||
|
|
||||||
|
|
||||||
|
def assert_client_authorized(env: NeonEnv, http_client: PageserverHttpClient):
|
||||||
|
http_client.timeline_create(
|
||||||
|
pg_version=env.pg_version,
|
||||||
|
tenant_id=env.initial_tenant,
|
||||||
|
new_timeline_id=TimelineId.generate(),
|
||||||
|
ancestor_timeline_id=env.initial_timeline,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def assert_client_not_authorized(env: NeonEnv, http_client: PageserverHttpClient):
|
||||||
|
with pytest.raises(
|
||||||
|
PageserverApiException,
|
||||||
|
match="Forbidden: JWT authentication error",
|
||||||
|
):
|
||||||
|
assert_client_authorized(env, http_client)
|
||||||
|
|
||||||
|
|
||||||
def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
|
def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
|
||||||
neon_env_builder.auth_enabled = True
|
neon_env_builder.auth_enabled = True
|
||||||
env = neon_env_builder.init_start()
|
env = neon_env_builder.init_start()
|
||||||
@@ -27,30 +50,14 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
|
|||||||
ps.safe_psql("set FOO", password=pageserver_token)
|
ps.safe_psql("set FOO", password=pageserver_token)
|
||||||
|
|
||||||
# tenant can create branches
|
# tenant can create branches
|
||||||
tenant_http_client.timeline_create(
|
assert_client_authorized(env, tenant_http_client)
|
||||||
pg_version=env.pg_version,
|
|
||||||
tenant_id=env.initial_tenant,
|
|
||||||
new_timeline_id=TimelineId.generate(),
|
|
||||||
ancestor_timeline_id=env.initial_timeline,
|
|
||||||
)
|
|
||||||
# console can create branches for tenant
|
# console can create branches for tenant
|
||||||
pageserver_http_client.timeline_create(
|
assert_client_authorized(env, pageserver_http_client)
|
||||||
pg_version=env.pg_version,
|
|
||||||
tenant_id=env.initial_tenant,
|
|
||||||
new_timeline_id=TimelineId.generate(),
|
|
||||||
ancestor_timeline_id=env.initial_timeline,
|
|
||||||
)
|
|
||||||
|
|
||||||
# fail to create branch using token with different tenant_id
|
# fail to create branch using token with different tenant_id
|
||||||
with pytest.raises(
|
with pytest.raises(PageserverApiException, match="Forbidden: JWT authentication error"):
|
||||||
PageserverApiException, match="Forbidden: Tenant id mismatch. Permission denied"
|
assert_client_authorized(env, invalid_tenant_http_client)
|
||||||
):
|
|
||||||
invalid_tenant_http_client.timeline_create(
|
|
||||||
pg_version=env.pg_version,
|
|
||||||
tenant_id=env.initial_tenant,
|
|
||||||
new_timeline_id=TimelineId.generate(),
|
|
||||||
ancestor_timeline_id=env.initial_timeline,
|
|
||||||
)
|
|
||||||
|
|
||||||
# create tenant using management token
|
# create tenant using management token
|
||||||
pageserver_http_client.tenant_create(TenantId.generate())
|
pageserver_http_client.tenant_create(TenantId.generate())
|
||||||
@@ -58,7 +65,7 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
|
|||||||
# fail to create tenant using tenant token
|
# fail to create tenant using tenant token
|
||||||
with pytest.raises(
|
with pytest.raises(
|
||||||
PageserverApiException,
|
PageserverApiException,
|
||||||
match="Forbidden: Attempt to access management api with tenant scope. Permission denied",
|
match="Forbidden: JWT authentication error",
|
||||||
):
|
):
|
||||||
tenant_http_client.tenant_create(TenantId.generate())
|
tenant_http_client.tenant_create(TenantId.generate())
|
||||||
|
|
||||||
@@ -82,6 +89,96 @@ def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder):
|
|||||||
assert cur.fetchone() == (5000050000,)
|
assert cur.fetchone() == (5000050000,)
|
||||||
|
|
||||||
|
|
||||||
|
def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder):
|
||||||
|
neon_env_builder.auth_enabled = True
|
||||||
|
env = neon_env_builder.init_start()
|
||||||
|
env.pageserver.allowed_errors.append(".*Authentication error: InvalidSignature.*")
|
||||||
|
env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*")
|
||||||
|
|
||||||
|
pageserver_token_old = env.auth_keys.generate_pageserver_token()
|
||||||
|
pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old)
|
||||||
|
|
||||||
|
pageserver_http_client_old.reload_auth_validation_keys()
|
||||||
|
|
||||||
|
# This test is to ensure that the pageserver supports multiple keys.
|
||||||
|
# The neon_local tool generates one key pair at a hardcoded path by default.
|
||||||
|
# As a preparation for our test, move the public key of the key pair into a
|
||||||
|
# directory at the same location as the hardcoded path by:
|
||||||
|
# 1. moving the the file at `configured_pub_key_path` to a temporary location
|
||||||
|
# 2. creating a new directory at `configured_pub_key_path`
|
||||||
|
# 3. moving the file from the temporary location into the newly created directory
|
||||||
|
configured_pub_key_path = Path(env.repo_dir) / "auth_public_key.pem"
|
||||||
|
os.rename(configured_pub_key_path, Path(env.repo_dir) / "auth_public_key.pem.file")
|
||||||
|
os.mkdir(configured_pub_key_path)
|
||||||
|
os.rename(
|
||||||
|
Path(env.repo_dir) / "auth_public_key.pem.file",
|
||||||
|
configured_pub_key_path / "auth_public_key_old.pem",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add a new key pair
|
||||||
|
# This invalidates env.auth_keys and makes them be regenerated
|
||||||
|
env.regenerate_keys_at(
|
||||||
|
Path("auth_private_key.pem"), Path("auth_public_key.pem/auth_public_key_new.pem")
|
||||||
|
)
|
||||||
|
|
||||||
|
# Reload the keys on the pageserver side
|
||||||
|
pageserver_http_client_old.reload_auth_validation_keys()
|
||||||
|
|
||||||
|
# We can continue doing things using the old token
|
||||||
|
assert_client_authorized(env, pageserver_http_client_old)
|
||||||
|
|
||||||
|
pageserver_token_new = env.auth_keys.generate_pageserver_token()
|
||||||
|
pageserver_http_client_new = env.pageserver.http_client(pageserver_token_new)
|
||||||
|
|
||||||
|
# The new token also works
|
||||||
|
assert_client_authorized(env, pageserver_http_client_new)
|
||||||
|
|
||||||
|
# Remove the old token and reload
|
||||||
|
os.remove(Path(env.repo_dir) / "auth_public_key.pem" / "auth_public_key_old.pem")
|
||||||
|
pageserver_http_client_old.reload_auth_validation_keys()
|
||||||
|
|
||||||
|
# Reloading fails now with the old token, but the new token still works
|
||||||
|
assert_client_not_authorized(env, pageserver_http_client_old)
|
||||||
|
assert_client_authorized(env, pageserver_http_client_new)
|
||||||
|
|
||||||
|
|
||||||
|
def test_pageserver_key_reload(neon_env_builder: NeonEnvBuilder):
|
||||||
|
neon_env_builder.auth_enabled = True
|
||||||
|
env = neon_env_builder.init_start()
|
||||||
|
env.pageserver.allowed_errors.append(".*Authentication error: InvalidSignature.*")
|
||||||
|
env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*")
|
||||||
|
|
||||||
|
pageserver_token_old = env.auth_keys.generate_pageserver_token()
|
||||||
|
pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old)
|
||||||
|
|
||||||
|
pageserver_http_client_old.reload_auth_validation_keys()
|
||||||
|
|
||||||
|
# Regenerate the keys
|
||||||
|
env.regenerate_keys_at(Path("auth_private_key.pem"), Path("auth_public_key.pem"))
|
||||||
|
|
||||||
|
# Reload the keys on the pageserver side
|
||||||
|
pageserver_http_client_old.reload_auth_validation_keys()
|
||||||
|
|
||||||
|
# Next attempt fails as we use the old auth token
|
||||||
|
with pytest.raises(
|
||||||
|
PageserverApiException,
|
||||||
|
match="Forbidden: JWT authentication error",
|
||||||
|
):
|
||||||
|
pageserver_http_client_old.reload_auth_validation_keys()
|
||||||
|
|
||||||
|
# same goes for attempts trying to create a timeline
|
||||||
|
assert_client_not_authorized(env, pageserver_http_client_old)
|
||||||
|
|
||||||
|
pageserver_token_new = env.auth_keys.generate_pageserver_token()
|
||||||
|
pageserver_http_client_new = env.pageserver.http_client(pageserver_token_new)
|
||||||
|
|
||||||
|
# timeline creation works with the new token
|
||||||
|
assert_client_authorized(env, pageserver_http_client_new)
|
||||||
|
|
||||||
|
# reloading also works with the new token
|
||||||
|
pageserver_http_client_new.reload_auth_validation_keys()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("auth_enabled", [False, True])
|
@pytest.mark.parametrize("auth_enabled", [False, True])
|
||||||
def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
|
def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
|
||||||
neon_env_builder.auth_enabled = auth_enabled
|
neon_env_builder.auth_enabled = auth_enabled
|
||||||
|
|||||||
60
test_runner/regress/test_bad_connection.py
Normal file
60
test_runner/regress/test_bad_connection.py
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
import random
|
||||||
|
import time
|
||||||
|
|
||||||
|
from fixtures.log_helper import log
|
||||||
|
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||||
|
|
||||||
|
|
||||||
|
def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder):
|
||||||
|
env = neon_env_builder.init_start()
|
||||||
|
env.pageserver.allowed_errors.append(".*simulated connection error.*")
|
||||||
|
|
||||||
|
pageserver_http = env.pageserver.http_client()
|
||||||
|
env.neon_cli.create_branch("test_compute_pageserver_connection_stress")
|
||||||
|
endpoint = env.endpoints.create_start("test_compute_pageserver_connection_stress")
|
||||||
|
|
||||||
|
# Enable failpoint after starting everything else up so that loading initial
|
||||||
|
# basebackup doesn't fail
|
||||||
|
pageserver_http.configure_failpoints(("simulated-bad-compute-connection", "50%return(15)"))
|
||||||
|
|
||||||
|
pg_conn = endpoint.connect()
|
||||||
|
cur = pg_conn.cursor()
|
||||||
|
|
||||||
|
# Create table, and insert some rows. Make it big enough that it doesn't fit in
|
||||||
|
# shared_buffers, otherwise the SELECT after restart will just return answer
|
||||||
|
# from shared_buffers without hitting the page server, which defeats the point
|
||||||
|
# of this test.
|
||||||
|
cur.execute("CREATE TABLE foo (t text)")
|
||||||
|
cur.execute(
|
||||||
|
"""
|
||||||
|
INSERT INTO foo
|
||||||
|
SELECT 'long string to consume some space' || g
|
||||||
|
FROM generate_series(1, 100000) g
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
|
||||||
|
# Verify that the table is larger than shared_buffers
|
||||||
|
cur.execute(
|
||||||
|
"""
|
||||||
|
select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
|
||||||
|
from pg_settings where name = 'shared_buffers'
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
row = cur.fetchone()
|
||||||
|
assert row is not None
|
||||||
|
log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
|
||||||
|
assert int(row[0]) < int(row[1])
|
||||||
|
|
||||||
|
cur.execute("SELECT count(*) FROM foo")
|
||||||
|
assert cur.fetchone() == (100000,)
|
||||||
|
|
||||||
|
end_time = time.time() + 30
|
||||||
|
times_executed = 0
|
||||||
|
while time.time() < end_time:
|
||||||
|
if random.random() < 0.5:
|
||||||
|
cur.execute("INSERT INTO foo VALUES ('stas'), ('heikki')")
|
||||||
|
else:
|
||||||
|
cur.execute("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10")
|
||||||
|
cur.fetchall()
|
||||||
|
times_executed += 1
|
||||||
|
log.info(f"Workload executed {times_executed} times")
|
||||||
@@ -1,9 +1,13 @@
|
|||||||
|
import asyncio
|
||||||
|
|
||||||
from fixtures.log_helper import log
|
from fixtures.log_helper import log
|
||||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||||
from fixtures.remote_storage import RemoteStorageKind
|
from fixtures.remote_storage import RemoteStorageKind
|
||||||
|
|
||||||
|
|
||||||
def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
|
def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
|
||||||
|
num_connections = 3
|
||||||
|
|
||||||
neon_env_builder.num_pageservers = 2
|
neon_env_builder.num_pageservers = 2
|
||||||
neon_env_builder.enable_pageserver_remote_storage(
|
neon_env_builder.enable_pageserver_remote_storage(
|
||||||
remote_storage_kind=RemoteStorageKind.MOCK_S3,
|
remote_storage_kind=RemoteStorageKind.MOCK_S3,
|
||||||
@@ -16,15 +20,24 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
|
|||||||
alt_pageserver_id = env.pageservers[1].id
|
alt_pageserver_id = env.pageservers[1].id
|
||||||
env.pageservers[1].tenant_attach(env.initial_tenant)
|
env.pageservers[1].tenant_attach(env.initial_tenant)
|
||||||
|
|
||||||
pg_conn = endpoint.connect()
|
pg_conns = [endpoint.connect() for i in range(num_connections)]
|
||||||
cur = pg_conn.cursor()
|
curs = [pg_conn.cursor() for pg_conn in pg_conns]
|
||||||
|
|
||||||
|
def execute(statement: str):
|
||||||
|
for cur in curs:
|
||||||
|
cur.execute(statement)
|
||||||
|
|
||||||
|
def fetchone():
|
||||||
|
results = [cur.fetchone() for cur in curs]
|
||||||
|
assert all(result == results[0] for result in results)
|
||||||
|
return results[0]
|
||||||
|
|
||||||
# Create table, and insert some rows. Make it big enough that it doesn't fit in
|
# Create table, and insert some rows. Make it big enough that it doesn't fit in
|
||||||
# shared_buffers, otherwise the SELECT after restart will just return answer
|
# shared_buffers, otherwise the SELECT after restart will just return answer
|
||||||
# from shared_buffers without hitting the page server, which defeats the point
|
# from shared_buffers without hitting the page server, which defeats the point
|
||||||
# of this test.
|
# of this test.
|
||||||
cur.execute("CREATE TABLE foo (t text)")
|
curs[0].execute("CREATE TABLE foo (t text)")
|
||||||
cur.execute(
|
curs[0].execute(
|
||||||
"""
|
"""
|
||||||
INSERT INTO foo
|
INSERT INTO foo
|
||||||
SELECT 'long string to consume some space' || g
|
SELECT 'long string to consume some space' || g
|
||||||
@@ -33,25 +46,25 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Verify that the table is larger than shared_buffers
|
# Verify that the table is larger than shared_buffers
|
||||||
cur.execute(
|
curs[0].execute(
|
||||||
"""
|
"""
|
||||||
select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
|
select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
|
||||||
from pg_settings where name = 'shared_buffers'
|
from pg_settings where name = 'shared_buffers'
|
||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
row = cur.fetchone()
|
row = curs[0].fetchone()
|
||||||
assert row is not None
|
assert row is not None
|
||||||
log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
|
log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
|
||||||
assert int(row[0]) < int(row[1])
|
assert int(row[0]) < int(row[1])
|
||||||
|
|
||||||
cur.execute("SELECT count(*) FROM foo")
|
execute("SELECT count(*) FROM foo")
|
||||||
assert cur.fetchone() == (100000,)
|
assert fetchone() == (100000,)
|
||||||
|
|
||||||
endpoint.reconfigure(pageserver_id=alt_pageserver_id)
|
endpoint.reconfigure(pageserver_id=alt_pageserver_id)
|
||||||
|
|
||||||
# Verify that the neon.pageserver_connstring GUC is set to the correct thing
|
# Verify that the neon.pageserver_connstring GUC is set to the correct thing
|
||||||
cur.execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'")
|
execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'")
|
||||||
connstring = cur.fetchone()
|
connstring = fetchone()
|
||||||
assert connstring is not None
|
assert connstring is not None
|
||||||
expected_connstring = f"postgresql://no_user:@localhost:{env.pageservers[1].service_port.pg}"
|
expected_connstring = f"postgresql://no_user:@localhost:{env.pageservers[1].service_port.pg}"
|
||||||
assert expected_connstring == expected_connstring
|
assert expected_connstring == expected_connstring
|
||||||
@@ -60,5 +73,45 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
|
|||||||
0
|
0
|
||||||
].stop() # Stop the old pageserver just to make sure we're reading from the new one
|
].stop() # Stop the old pageserver just to make sure we're reading from the new one
|
||||||
|
|
||||||
cur.execute("SELECT count(*) FROM foo")
|
execute("SELECT count(*) FROM foo")
|
||||||
assert cur.fetchone() == (100000,)
|
assert fetchone() == (100000,)
|
||||||
|
|
||||||
|
# Try failing back, and this time we will stop the current pageserver before reconfiguring
|
||||||
|
# the endpoint. Whereas the previous reconfiguration was like a healthy migration, this
|
||||||
|
# is more like what happens in an unexpected pageserver failure.
|
||||||
|
env.pageservers[0].start()
|
||||||
|
env.pageservers[1].stop()
|
||||||
|
|
||||||
|
endpoint.reconfigure(pageserver_id=env.pageservers[0].id)
|
||||||
|
|
||||||
|
execute("SELECT count(*) FROM foo")
|
||||||
|
assert fetchone() == (100000,)
|
||||||
|
|
||||||
|
env.pageservers[0].stop()
|
||||||
|
env.pageservers[1].start()
|
||||||
|
|
||||||
|
# Test a (former) bug where a child process spins without updating its connection string
|
||||||
|
# by executing a query separately. This query will hang until we issue the reconfigure.
|
||||||
|
async def reconfigure_async():
|
||||||
|
await asyncio.sleep(
|
||||||
|
1
|
||||||
|
) # Sleep for 1 second just to make sure we actually started our count(*) query
|
||||||
|
endpoint.reconfigure(pageserver_id=env.pageservers[1].id)
|
||||||
|
|
||||||
|
def execute_count():
|
||||||
|
execute("SELECT count(*) FROM FOO")
|
||||||
|
|
||||||
|
async def execute_and_reconfigure():
|
||||||
|
task_exec = asyncio.to_thread(execute_count)
|
||||||
|
task_reconfig = asyncio.create_task(reconfigure_async())
|
||||||
|
await asyncio.gather(
|
||||||
|
task_exec,
|
||||||
|
task_reconfig,
|
||||||
|
)
|
||||||
|
|
||||||
|
asyncio.run(execute_and_reconfigure())
|
||||||
|
assert fetchone() == (100000,)
|
||||||
|
|
||||||
|
# One final check that nothing hangs
|
||||||
|
execute("SELECT count(*) FROM foo")
|
||||||
|
assert fetchone() == (100000,)
|
||||||
|
|||||||
@@ -134,6 +134,9 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder):
|
|||||||
env.neon_cli.pageserver_stop(env.pageserver.id)
|
env.neon_cli.pageserver_stop(env.pageserver.id)
|
||||||
env.neon_cli.safekeeper_stop()
|
env.neon_cli.safekeeper_stop()
|
||||||
|
|
||||||
|
# Keep NeonEnv state up to date, it usually owns starting/stopping services
|
||||||
|
env.pageserver.running = False
|
||||||
|
|
||||||
# Default start
|
# Default start
|
||||||
res = env.neon_cli.raw_cli(["start"])
|
res = env.neon_cli.raw_cli(["start"])
|
||||||
res.check_returncode()
|
res.check_returncode()
|
||||||
@@ -155,6 +158,10 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder):
|
|||||||
env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID)
|
env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID)
|
||||||
env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID + 1)
|
env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID + 1)
|
||||||
|
|
||||||
|
# Keep NeonEnv state up to date, it usually owns starting/stopping services
|
||||||
|
env.pageservers[0].running = False
|
||||||
|
env.pageservers[1].running = False
|
||||||
|
|
||||||
# Addressing a nonexistent ID throws
|
# Addressing a nonexistent ID throws
|
||||||
with pytest.raises(RuntimeError):
|
with pytest.raises(RuntimeError):
|
||||||
env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID + 100)
|
env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID + 100)
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user